Browse Source

[infiniband] Handle duplicate Communication Management REPs

We will terminate our transaction as soon as we receive the first CM
REP, since that provides all the state that we need.  However, the
peer may resend the REP if it didn't see our RTU, and if we don't
respond with another RTU we risk being disconnected.  (This protocol
appears not to handle retries gracefully.)

Fix by adding a management agent that will listen for these duplicate
REPs and send back an RTU.
tags/v0.9.8
Michael Brown 14 years ago
parent
commit
46073f1239
2 changed files with 58 additions and 0 deletions
  1. 3
    0
      src/include/gpxe/ib_cm.h
  2. 55
    0
      src/net/infiniband/ib_cm.c

+ 3
- 0
src/include/gpxe/ib_cm.h View File

@@ -46,6 +46,9 @@ struct ib_connection {
46 46
 	/** Connection operations */
47 47
 	struct ib_connection_operations *op;
48 48
 
49
+	/** List of connections */
50
+	struct list_head list;
51
+
49 52
 	/** Path to target */
50 53
 	struct ib_path *path;
51 54
 	/** Connection request management transaction */

+ 55
- 0
src/net/infiniband/ib_cm.c View File

@@ -36,6 +36,9 @@ FILE_LICENCE ( GPL2_OR_LATER );
36 36
  *
37 37
  */
38 38
 
39
+/** List of connections (searched when handling duplicate connection replies) */
40
+static LIST_HEAD ( ib_cm_conns );
41
+
39 42
 /**
40 43
  * Send "ready to use" response
41 44
  *
@@ -71,6 +74,54 @@ static int ib_cm_send_rtu ( struct ib_device *ibdev,
71 74
 	return 0;
72 75
 }
73 76
 
77
+/**
78
+ * Handle duplicate connection replies by resending "ready to use"
79
+ *
80
+ * @v ibdev		Infiniband device
81
+ * @v mi		Management interface
82
+ * @v mad		Received MAD (read as a CM connection reply)
83
+ * @v av		Source address vector
84
+ * No value is returned: send failures and unmatched replies are dropped
85
+ *
86
+ * If a "ready to use" MAD is lost, the peer may resend the connection
87
+ * reply.  We have to respond to these with duplicate "ready to use"
88
+ * MADs, otherwise the peer may time out and drop the connection.
89
+ */
90
+static void ib_cm_connect_rep ( struct ib_device *ibdev,
91
+				struct ib_mad_interface *mi,
92
+				union ib_mad *mad,
93
+				struct ib_address_vector *av ) {
94
+	struct ib_cm_connect_reply *connect_rep =
95
+		&mad->cm.cm_data.connect_reply;
96
+	struct ib_connection *conn;
97
+	int rc;
98
+
99
+	/* Find connection whose local ID matches reply's remote ID */
100
+	list_for_each_entry ( conn, &ib_cm_conns, list ) {
101
+		if ( ntohl ( connect_rep->remote_id ) != conn->local_id )
102
+			continue;
103
+		/* Try to send "ready to use" reply to the first match */
104
+		if ( ( rc = ib_cm_send_rtu ( ibdev, mi, conn, av ) ) != 0 ) {
105
+			/* Ignore errors; this handler cannot report them */
106
+			return;
107
+		}
108
+		return;
109
+	}
110
+
111
+	DBG ( "CM unidentified connection %08x\n",
112
+	      ntohl ( connect_rep->remote_id ) );
113
+}
114
+
115
+/** Communication management agents (connection replies go to ib_cm_connect_rep) */
116
+struct ib_mad_agent ib_cm_agent[] __ib_mad_agent = {
117
+	{
118
+		.mgmt_class = IB_MGMT_CLASS_CM,
119
+		.class_version = IB_CM_CLASS_VERSION,
120
+		.attr_id = htons ( IB_CM_ATTR_CONNECT_REPLY ),
121
+		.handle = ib_cm_connect_rep,
122
+	},
123
+};
124
+
74 125
 /**
75 126
  * Handle connection request transaction completion
76 127
  *
@@ -296,6 +347,9 @@ ib_create_conn ( struct ib_device *ibdev, struct ib_queue_pair *qp,
296 347
 		goto err_create_path;
297 348
 	ib_path_set_ownerdata ( conn->path, conn );
298 349
 
350
+	/* Add to list of connections */
351
+	list_add ( &conn->list, &ib_cm_conns );
352
+
299 353
 	DBGC ( conn, "CM %p created for IBDEV %p QPN %lx\n",
300 354
 	       conn, ibdev, qp->qpn );
301 355
 	DBGC ( conn, "CM %p connecting to %08x:%08x:%08x:%08x %08x:%08x\n",
@@ -324,6 +378,7 @@ void ib_destroy_conn ( struct ib_device *ibdev,
324 378
 		       struct ib_queue_pair *qp __unused,
325 379
 		       struct ib_connection *conn ) {
326 380
 
381
+	list_del ( &conn->list );
327 382
 	if ( conn->madx )
328 383
 		ib_destroy_madx ( ibdev, ibdev->gsi, conn->madx );
329 384
 	if ( conn->path )

Loading…
Cancel
Save