Parcourir la source

[ipoib] Kill off the IPoIB pseudo-header

Some InfiniBand cards will not be as accommodating as the Arbel and
Hermon cards in providing enough space for us to push a fake extra
header at the start of the received packet.  We must therefore make do
with squeezing enough information to identify source and destination
addresses into the two bytes of padding within a genuine IPoIB
link-layer header.
tags/v0.9.6
Michael Brown il y a 16 ans
Parent
révision
9a35830d1f
3 fichiers modifiés avec 218 ajouts et 133 suppressions
  1. 197
    109
      src/drivers/net/ipoib.c
  2. 20
    23
      src/include/gpxe/ipoib.h
  3. 1
    1
      src/include/gpxe/netdevice.h

+ 197
- 109
src/drivers/net/ipoib.c Voir le fichier

@@ -85,49 +85,146 @@ struct ipoib_device {
85 85
 	int broadcast_attached;
86 86
 };
87 87
 
88
+/** TID half used to identify get path record replies */
89
+#define IPOIB_TID_GET_PATH_REC 0x11111111UL
90
+
91
+/** TID half used to identify multicast member record replies */
92
+#define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
93
+
94
+/** IPoIB metadata TID */
95
+static uint32_t ipoib_meta_tid = 0;
96
+
97
+/** Broadcast QPN used in IPoIB MAC addresses
98
+ *
99
+ * This is a guaranteed invalid real QPN
100
+ */
101
+#define IPOIB_BROADCAST_QPN 0xffffffffUL
102
+
103
+/** Broadcast IPoIB address */
104
+static struct ipoib_mac ipoib_broadcast = {
105
+	.qpn = ntohl ( IPOIB_BROADCAST_QPN ),
106
+	.gid.u.bytes = 	{ 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
107
+			  0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
108
+};
109
+
110
+/****************************************************************************
111
+ *
112
+ * IPoIB peer cache
113
+ *
114
+ ****************************************************************************
115
+ */
116
+
88 117
 /**
89
- * IPoIB path cache entry
118
+ * IPoIB peer address
90 119
  *
91 120
  * This serves a similar role to the ARP cache for Ethernet.  (ARP
92 121
  * *is* used on IPoIB; we have two caches to maintain.)
93 122
  */
94
-struct ipoib_cached_path {
95
-	/** Destination GID */
96
-	struct ib_gid gid;
97
-	/** Destination LID */
98
-	unsigned int dlid;
123
+struct ipoib_peer {
124
+	/** Key */
125
+	uint8_t key;
126
+	/** MAC address */
127
+	struct ipoib_mac mac;
128
+	/** LID */
129
+	unsigned int lid;
99 130
 	/** Service level */
100 131
 	unsigned int sl;
101 132
 	/** Rate */
102 133
 	unsigned int rate;
103 134
 };
104 135
 
105
-/** Number of IPoIB path cache entries */
106
-#define IPOIB_NUM_CACHED_PATHS 2
136
+/** Number of IPoIB peer cache entries
137
+ *
138
+ * Must be a power of two.
139
+ */
140
+#define IPOIB_NUM_CACHED_PEERS 4
107 141
 
108
-/** IPoIB path cache */
109
-static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
142
+/** IPoIB peer address cache */
143
+static struct ipoib_peer ipoib_peer_cache[IPOIB_NUM_CACHED_PEERS];
110 144
 
111
-/** Oldest IPoIB path cache entry index */
112
-static unsigned int ipoib_path_cache_idx = 0;
145
+/** Oldest IPoIB peer cache entry index */
146
+static unsigned int ipoib_peer_cache_idx = 1;
113 147
 
114
-/** TID half used to identify get path record replies */
115
-#define IPOIB_TID_GET_PATH_REC 0x11111111UL
148
+/**
149
+ * Look up cached peer by key
150
+ *
151
+ * @v key		Peer cache key
152
+ * @ret peer		Peer cache entry, or NULL
153
+ */
154
+static struct ipoib_peer * ipoib_lookup_peer_by_key ( unsigned int key ) {
155
+	struct ipoib_peer *peer;
156
+	unsigned int i;
116 157
 
117
-/** TID half used to identify multicast member record replies */
118
-#define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
158
+	for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
159
+		peer = &ipoib_peer_cache[i];
160
+		if ( peer->key == key )
161
+			return peer;
162
+	}
119 163
 
120
-/** IPoIB metadata TID */
121
-static uint32_t ipoib_meta_tid = 0;
164
+	if ( key != 0 ) {
165
+		DBG ( "IPoIB warning: peer cache lost track of key %x while "
166
+		      "still in use\n", key );
167
+	}
168
+	return NULL;
169
+}
122 170
 
123
-/** IPv4 broadcast GID */
124
-static const struct ib_gid ipv4_broadcast_gid = {
125
-	{ { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
126
-	    0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
127
-};
171
+/**
172
+ * Look up cached peer by GID
173
+ *
174
+ * @v gid		Peer GID
175
+ * @ret peer		Peer cache entry, or NULL
176
+ */
177
+static struct ipoib_peer *
178
+ipoib_lookup_peer_by_gid ( const struct ib_gid *gid ) {
179
+	struct ipoib_peer *peer;
180
+	unsigned int i;
128 181
 
129
-/** Maximum time we will wait for the broadcast join to succeed */
130
-#define IPOIB_JOIN_MAX_DELAY_MS 1000
182
+	for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
183
+		peer = &ipoib_peer_cache[i];
184
+		if ( memcmp ( &peer->mac.gid, gid,
185
+			      sizeof ( peer->mac.gid) ) == 0 ) {
186
+			return peer;
187
+		}
188
+	}
189
+
190
+	return NULL;
191
+}
192
+
193
+/**
194
+ * Store GID and QPN in peer cache
195
+ *
196
+ * @v gid		Peer GID
197
+ * @v qpn		Peer QPN
198
+ * @ret peer		Peer cache entry
199
+ */
200
+static struct ipoib_peer *
201
+ipoib_cache_peer ( const struct ib_gid *gid, unsigned long qpn ) {
202
+	struct ipoib_peer *peer;
203
+	unsigned int key;
204
+
205
+	/* Look for existing cache entry */
206
+	peer = ipoib_lookup_peer_by_gid ( gid );
207
+	if ( peer ) {
208
+		assert ( peer->mac.qpn = ntohl ( qpn ) );
209
+		return peer;
210
+	}
211
+
212
+	/* No entry found: create a new one */
213
+	key = ipoib_peer_cache_idx++;
214
+	peer = &ipoib_peer_cache[ key % IPOIB_NUM_CACHED_PEERS ];
215
+	if ( peer->key )
216
+		DBG ( "IPoIB peer %x evicted from cache\n", peer->key );
217
+
218
+	memset ( peer, 0, sizeof ( *peer ) );
219
+	peer->key = key;
220
+	peer->mac.qpn = htonl ( qpn );
221
+	memcpy ( &peer->mac.gid, gid, sizeof ( peer->mac.gid ) );
222
+	DBG ( "IPoIB peer %x has GID %08lx:%08lx:%08lx:%08lx and QPN %lx\n",
223
+	      peer->key, htonl ( gid->u.dwords[0] ),
224
+	      htonl ( gid->u.dwords[1] ), htonl ( gid->u.dwords[2] ),
225
+	      htonl ( gid->u.dwords[3] ), qpn );
226
+	return peer;
227
+}
131 228
 
132 229
 /****************************************************************************
133 230
  *
@@ -136,17 +233,6 @@ static const struct ib_gid ipv4_broadcast_gid = {
136 233
  ****************************************************************************
137 234
  */
138 235
 
139
-/** Broadcast QPN used in IPoIB MAC addresses
140
- *
141
- * This is a guaranteed invalid real QPN
142
- */
143
-#define IPOIB_BROADCAST_QPN 0xffffffffUL
144
-
145
-/** Broadcast IPoIB address */
146
-static struct ipoib_mac ipoib_broadcast = {
147
-	.qpn = ntohl ( IPOIB_BROADCAST_QPN ),
148
-};
149
-
150 236
 /**
151 237
  * Add IPoIB link-layer header
152 238
  *
@@ -160,12 +246,19 @@ static int ipoib_push ( struct io_buffer *iobuf, const void *ll_dest,
160 246
 			const void *ll_source __unused, uint16_t net_proto ) {
161 247
 	struct ipoib_hdr *ipoib_hdr =
162 248
 		iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
249
+	const struct ipoib_mac *dest_mac = ll_dest;
250
+	const struct ipoib_mac *src_mac = ll_source;
251
+	struct ipoib_peer *dest;
252
+	struct ipoib_peer *src;
253
+
254
+	/* Add link-layer addresses to cache */
255
+	dest = ipoib_cache_peer ( &dest_mac->gid, ntohl ( dest_mac->qpn ) );
256
+	src = ipoib_cache_peer ( &src_mac->gid, ntohl ( src_mac->qpn ) );
163 257
 
164 258
 	/* Build IPoIB header */
165
-	memcpy ( &ipoib_hdr->pseudo.peer, ll_dest,
166
-		 sizeof ( ipoib_hdr->pseudo.peer ) );
167
-	ipoib_hdr->real.proto = net_proto;
168
-	ipoib_hdr->real.reserved = 0;
259
+	ipoib_hdr->proto = net_proto;
260
+	ipoib_hdr->u.peer.dest = dest->key;
261
+	ipoib_hdr->u.peer.src = src->key;
169 262
 
170 263
 	return 0;
171 264
 }
@@ -182,6 +275,8 @@ static int ipoib_push ( struct io_buffer *iobuf, const void *ll_dest,
182 275
 static int ipoib_pull ( struct io_buffer *iobuf, const void **ll_dest,
183 276
 			const void **ll_source, uint16_t *net_proto ) {
184 277
 	struct ipoib_hdr *ipoib_hdr = iobuf->data;
278
+	struct ipoib_peer *dest;
279
+	struct ipoib_peer *source;
185 280
 
186 281
 	/* Sanity check */
187 282
 	if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
@@ -193,10 +288,17 @@ static int ipoib_pull ( struct io_buffer *iobuf, const void **ll_dest,
193 288
 	/* Strip off IPoIB header */
194 289
 	iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
195 290
 
291
+	/* Identify source and destination addresses, and clear
292
+	 * reserved word in IPoIB header
293
+	 */
294
+	dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
295
+	source = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.src );
296
+	ipoib_hdr->u.reserved = 0;
297
+
196 298
 	/* Fill in required fields */
197
-	*ll_dest = &ipoib_broadcast; /* Doesn't really exist in packet */
198
-	*ll_source = &ipoib_hdr->pseudo.peer;
199
-	*net_proto = ipoib_hdr->real.proto;
299
+	*ll_dest = ( dest ? &dest->mac : &ipoib_broadcast );
300
+	*ll_source = ( source ? &source->mac : &ipoib_broadcast );
301
+	*net_proto = ipoib_hdr->proto;
200 302
 
201 303
 	return 0;
202 304
 }
@@ -327,28 +429,6 @@ static int ipoib_create_qset ( struct ipoib_device *ipoib,
327 429
 	return rc;
328 430
 }
329 431
 
330
-/**
331
- * Find path cache entry by GID
332
- *
333
- * @v gid		GID
334
- * @ret entry		Path cache entry, or NULL
335
- */
336
-static struct ipoib_cached_path *
337
-ipoib_find_cached_path ( struct ib_gid *gid ) {
338
-	struct ipoib_cached_path *path;
339
-	unsigned int i;
340
-
341
-	for ( i = 0 ; i < IPOIB_NUM_CACHED_PATHS ; i++ ) {
342
-		path = &ipoib_path_cache[i];
343
-		if ( memcmp ( &path->gid, gid, sizeof ( *gid ) ) == 0 )
344
-			return path;
345
-	}
346
-	DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
347
-	      htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
348
-	      htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
349
-	return NULL;
350
-}
351
-
352 432
 /**
353 433
  * Transmit path record request
354 434
  *
@@ -477,18 +557,17 @@ static int ipoib_transmit ( struct net_device *netdev,
477 557
 			    struct io_buffer *iobuf ) {
478 558
 	struct ipoib_device *ipoib = netdev->priv;
479 559
 	struct ib_device *ibdev = ipoib->ibdev;
480
-	struct ipoib_pseudo_hdr *ipoib_pshdr = iobuf->data;
560
+	struct ipoib_hdr *ipoib_hdr;
561
+	struct ipoib_peer *dest;
481 562
 	struct ib_address_vector av;
482 563
 	struct ib_gid *gid;
483
-	struct ipoib_cached_path *path;
484
-	int rc;
485 564
 
486 565
 	/* Sanity check */
487
-	if ( iob_len ( iobuf ) < sizeof ( *ipoib_pshdr ) ) {
566
+	if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
488 567
 		DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
489 568
 		return -EINVAL;
490 569
 	}
491
-	iob_pull ( iobuf, ( sizeof ( *ipoib_pshdr ) ) );
570
+	ipoib_hdr = iobuf->data;
492 571
 
493 572
 	/* Attempting transmission while link is down will put the
494 573
 	 * queue pair into an error state, so don't try it.
@@ -496,30 +575,33 @@ static int ipoib_transmit ( struct net_device *netdev,
496 575
 	if ( ! ib_link_ok ( ibdev ) )
497 576
 		return -ENETUNREACH;
498 577
 
578
+	/* Identify destination address */
579
+	dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
580
+	if ( ! dest )
581
+		return -ENXIO;
582
+	ipoib_hdr->u.reserved = 0;
583
+
499 584
 	/* Construct address vector */
500 585
 	memset ( &av, 0, sizeof ( av ) );
501
-	av.qkey = IB_GLOBAL_QKEY;
586
+	av.qkey = ipoib->data_qkey;
502 587
 	av.gid_present = 1;
503
-	if ( ipoib_pshdr->peer.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
504
-		/* Broadcast address */
588
+	if ( dest->mac.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
589
+		/* Broadcast */
505 590
 		av.qpn = IB_BROADCAST_QPN;
506 591
 		av.lid = ipoib->broadcast_lid;
507 592
 		gid = &ipoib->broadcast_gid;
508 593
 	} else {
509
-		/* Unicast - look in path cache */
510
-		path = ipoib_find_cached_path ( &ipoib_pshdr->peer.gid );
511
-		if ( ! path ) {
512
-			/* No path entry - get path record */
513
-			rc = ipoib_get_path_record ( ipoib,
514
-						     &ipoib_pshdr->peer.gid );
515
-			netdev_tx_complete ( netdev, iobuf );
516
-			return rc;
594
+		/* Unicast */
595
+		if ( ! dest->lid ) {
596
+			/* No LID yet - get path record to fetch LID */
597
+			ipoib_get_path_record ( ipoib, &dest->mac.gid );
598
+			return -ENOENT;
517 599
 		}
518
-		av.qpn = ntohl ( ipoib_pshdr->peer.qpn );
519
-		av.lid = path->dlid;
520
-		av.rate = path->rate;
521
-		av.sl = path->sl;
522
-		gid = &ipoib_pshdr->peer.gid;
600
+		av.qpn = ntohl ( dest->mac.qpn );
601
+		av.lid = dest->lid;
602
+		av.rate = dest->rate;
603
+		av.sl = dest->sl;
604
+		gid = &dest->mac.gid;
523 605
 	}
524 606
 	memcpy ( &av.gid, gid, sizeof ( av.gid ) );
525 607
 
@@ -553,28 +635,35 @@ static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
553 635
  */
554 636
 static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
555 637
 				       struct ib_queue_pair *qp,
556
-				       struct ib_address_vector *av __unused,
638
+				       struct ib_address_vector *av,
557 639
 				       struct io_buffer *iobuf, int rc ) {
558 640
 	struct net_device *netdev = ib_qp_get_ownerdata ( qp );
559 641
 	struct ipoib_device *ipoib = netdev->priv;
560
-	struct ipoib_pseudo_hdr *ipoib_pshdr;
642
+	struct ipoib_hdr *ipoib_hdr;
643
+	struct ipoib_peer *src;
561 644
 
562 645
 	if ( rc != 0 ) {
563 646
 		netdev_rx_err ( netdev, iobuf, rc );
564 647
 		return;
565 648
 	}
566 649
 
567
-	if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
650
+	/* Sanity check */
651
+	if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
568 652
 		DBGC ( ipoib, "IPoIB %p received data packet too short to "
569 653
 		       "contain IPoIB header\n", ipoib );
570 654
 		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
571 655
 		netdev_rx_err ( netdev, iobuf, -EIO );
572 656
 		return;
573 657
 	}
658
+	ipoib_hdr = iobuf->data;
574 659
 
575
-	ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
576
-	/* FIXME: fill in a MAC address for the sake of AoE! */
660
+	/* Parse source address */
661
+	if ( av->gid_present ) {
662
+		src = ipoib_cache_peer ( &av->gid, av->qpn );
663
+		ipoib_hdr->u.peer.src = src->key;
664
+	}
577 665
 
666
+	/* Hand off to network layer */
578 667
 	netdev_rx ( netdev, iobuf );
579 668
 }
580 669
 
@@ -611,26 +700,25 @@ static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
611 700
  * @v ipoib		IPoIB device
612 701
  * @v path_record	Path record
613 702
  */
614
-static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
703
+static void ipoib_recv_path_record ( struct ipoib_device *ipoib,
615 704
 				     struct ib_mad_path_record *path_record ) {
616
-	struct ipoib_cached_path *path;
705
+	struct ipoib_peer *peer;
706
+
707
+	/* Locate peer cache entry */
708
+	peer = ipoib_lookup_peer_by_gid ( &path_record->dgid );
709
+	if ( ! peer ) {
710
+		DBGC ( ipoib, "IPoIB %p received unsolicited path record\n",
711
+		       ipoib );
712
+		return;
713
+	}
617 714
 
618 715
 	/* Update path cache entry */
619
-	path = &ipoib_path_cache[ipoib_path_cache_idx];
620
-	memcpy ( &path->gid, &path_record->dgid, sizeof ( path->gid ) );
621
-	path->dlid = ntohs ( path_record->dlid );
622
-	path->sl = ( path_record->reserved__sl & 0x0f );
623
-	path->rate = ( path_record->rate_selector__rate & 0x3f );
624
-
625
-	DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
626
-	      htonl ( path->gid.u.dwords[0] ), htonl ( path->gid.u.dwords[1] ),
627
-	      htonl ( path->gid.u.dwords[2] ), htonl ( path->gid.u.dwords[3] ),
628
-	      path->dlid, path->sl, path->rate );
629
-	
630
-	/* Update path cache index */
631
-	ipoib_path_cache_idx++;
632
-	if ( ipoib_path_cache_idx == IPOIB_NUM_CACHED_PATHS )
633
-		ipoib_path_cache_idx = 0;
716
+	peer->lid = ntohs ( path_record->dlid );
717
+	peer->sl = ( path_record->reserved__sl & 0x0f );
718
+	peer->rate = ( path_record->rate_selector__rate & 0x3f );
719
+
720
+	DBG ( "IPoIB peer %x has dlid %x sl %x rate %x\n",
721
+	      peer->key, peer->lid, peer->sl, peer->rate );
634 722
 }
635 723
 
636 724
 /**
@@ -933,7 +1021,7 @@ static void ipoib_set_ib_params ( struct ipoib_device *ipoib ) {
933 1021
 	memcpy ( &mac->gid, &ibdev->gid, sizeof ( mac->gid ) );
934 1022
 
935 1023
 	/* Calculate broadcast GID based on partition key */
936
-	memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
1024
+	memcpy ( &ipoib->broadcast_gid, &ipoib_broadcast.gid,
937 1025
 		 sizeof ( ipoib->broadcast_gid ) );
938 1026
 	ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );
939 1027
 

+ 20
- 23
src/include/gpxe/ipoib.h Voir le fichier

@@ -26,33 +26,30 @@ struct ipoib_mac {
26 26
 } __attribute__ (( packed ));
27 27
 
28 28
 /** IPoIB link-layer header length */
29
-#define IPOIB_HLEN 24
29
+#define IPOIB_HLEN 4
30 30
 
31
-/**
32
- * IPoIB link-layer header pseudo portion
33
- *
34
- * This part doesn't actually exist on the wire, but it provides a
35
- * convenient way to fit into the typical network device model.
36
- */
37
-struct ipoib_pseudo_hdr {
38
-	/** Peer address */
39
-	struct ipoib_mac peer;
40
-} __attribute__ (( packed ));
41
-
42
-/** IPoIB link-layer header real portion */
43
-struct ipoib_real_hdr {
31
+/** IPoIB link-layer header */
32
+struct ipoib_hdr {
44 33
 	/** Network-layer protocol */
45 34
 	uint16_t proto;
46 35
 	/** Reserved, must be zero */
47
-	uint16_t reserved;
48
-} __attribute__ (( packed ));
49
-
50
-/** An IPoIB link-layer header */
51
-struct ipoib_hdr {
52
-	/** Pseudo portion */
53
-	struct ipoib_pseudo_hdr pseudo;
54
-	/** Real portion */
55
-	struct ipoib_real_hdr real;
36
+	union {
37
+		/** Reserved, must be zero */
38
+		uint16_t reserved;
39
+		/** Peer addresses
40
+		 *
41
+		 * We use these fields internally to represent the
42
+		 * peer addresses using a lookup key.  There simply
43
+		 * isn't enough room in the IPoIB header to store
44
+		 * literal source or destination MAC addresses.
45
+		 */
46
+		struct {
47
+			/** Destination address key */
48
+			uint8_t dest;
49
+			/** Source address key */
50
+			uint8_t src;
51
+		} __attribute__ (( packed )) peer;
52
+	} __attribute__ (( packed )) u;
56 53
 } __attribute__ (( packed ));
57 54
 
58 55
 extern struct ll_protocol ipoib_protocol;

+ 1
- 1
src/include/gpxe/netdevice.h Voir le fichier

@@ -23,7 +23,7 @@ struct device;
23 23
 #define MAX_LL_ADDR_LEN 20
24 24
 
25 25
 /** Maximum length of a link-layer header */
26
-#define MAX_LL_HEADER_LEN 32
26
+#define MAX_LL_HEADER_LEN 6
27 27
 
28 28
 /** Maximum length of a network-layer address */
29 29
 #define MAX_NET_ADDR_LEN 4

Chargement…
Annuler
Enregistrer