You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ipoib.c 27KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000
  1. /*
  2. * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public License as
  6. * published by the Free Software Foundation; either version 2 of the
  7. * License, or any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17. */
  18. #include <stdint.h>
  19. #include <stdio.h>
  20. #include <unistd.h>
  21. #include <string.h>
  22. #include <byteswap.h>
  23. #include <errno.h>
  24. #include <gpxe/if_arp.h>
  25. #include <gpxe/iobuf.h>
  26. #include <gpxe/netdevice.h>
  27. #include <gpxe/infiniband.h>
  28. #include <gpxe/ipoib.h>
  29. /** @file
  30. *
  31. * IP over Infiniband
  32. */
  33. /** IPoIB MTU; also the size of each receive buffer allocated by ipoib_refill_recv() */
  34. #define IPOIB_MTU 2048
  35. /** Number of IPoIB data send work queue entries */
  36. #define IPOIB_DATA_NUM_SEND_WQES 2
  37. /** Number of IPoIB data receive work queue entries */
  38. #define IPOIB_DATA_NUM_RECV_WQES 4
  39. /** Number of IPoIB data completion queue entries */
  40. #define IPOIB_DATA_NUM_CQES 8
  41. /** Number of IPoIB metadata send work queue entries */
  42. #define IPOIB_META_NUM_SEND_WQES 2
  43. /** Number of IPoIB metadata receive work queue entries */
  44. #define IPOIB_META_NUM_RECV_WQES 2
  45. /** Number of IPoIB metadata completion queue entries */
  46. #define IPOIB_META_NUM_CQES 8
  47. /** An IPoIB queue set: one completion queue plus the queue pair that reports both its send and receive completions to it */
  48. struct ipoib_queue_set {
  49. /** Completion queue */
  50. struct ib_completion_queue *cq;
  51. /** Queue pair */
  52. struct ib_queue_pair *qp;
  53. /** Receive work queue fill level (number of receive buffers currently posted) */
  54. unsigned int recv_fill;
  55. /** Receive work queue maximum fill level (set from the receive WQE count at creation) */
  56. unsigned int recv_max_fill;
  57. };
  58. /** An IPoIB device */
  59. struct ipoib_device {
  60. /** Network device */
  61. struct net_device *netdev;
  62. /** Underlying Infiniband device */
  63. struct ib_device *ibdev;
  64. /** Data queue set */
  65. struct ipoib_queue_set data;
  66. /** Metadata queue set (used for path record and multicast member record MADs) */
  67. struct ipoib_queue_set meta;
  68. /** Broadcast GID */
  69. struct ib_gid broadcast_gid;
  70. /** Broadcast LID (taken from the multicast member record reply) */
  71. unsigned int broadcast_lid;
  72. /** Data queue key (taken from the multicast member record reply) */
  73. unsigned long data_qkey;
  74. /** Attached to multicast group
  75. *
  76. * This flag indicates whether or not we have attached our
  77. * data queue pair to the broadcast multicast GID.
  78. */
  79. int broadcast_attached;
  80. };
  81. /**
  82. * IPoIB path cache entry
  83. *
  84. * This serves a similar role to the ARP cache for Ethernet. (ARP
  85. * *is* used on IPoIB; we have two caches to maintain.)
  86. */
  87. struct ipoib_cached_path {
  88. /** Destination GID */
  89. struct ib_gid gid;
  90. /** Destination LID (host byte order) */
  91. unsigned int dlid;
  92. /** Service level (low four bits of the path record SL field) */
  93. unsigned int sl;
  94. /** Rate (low six bits of the path record rate field) */
  95. unsigned int rate;
  96. };
  97. /** Number of IPoIB path cache entries */
  98. #define IPOIB_NUM_CACHED_PATHS 2
  99. /** IPoIB path cache (shared by all IPoIB devices in this file) */
  100. static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
  101. /** Oldest IPoIB path cache entry index (the next entry to be overwritten) */
  102. static unsigned int ipoib_path_cache_idx = 0;
  103. /** TID half used to identify get path record replies */
  104. #define IPOIB_TID_GET_PATH_REC 0x11111111UL
  105. /** TID half used to identify multicast member record replies */
  106. #define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
  107. /** IPoIB metadata TID (second TID half; incremented for every request sent) */
  108. static uint32_t ipoib_meta_tid = 0;
  109. /** IPv4 broadcast GID template; ipoib_set_ib_params() overwrites word 2 with the partition key */
  110. static const struct ib_gid ipv4_broadcast_gid = {
  111. { { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  112. 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
  113. };
  114. /** Maximum time we will wait for the broadcast join to succeed (NOTE(review): not referenced in the visible part of this file) */
  115. #define IPOIB_JOIN_MAX_DELAY_MS 1000
  116. /****************************************************************************
  117. *
  118. * IPoIB link layer
  119. *
  120. ****************************************************************************
  121. */
  122. /** Broadcast QPN used in IPoIB MAC addresses
  123. *
  124. * This is a guaranteed invalid real QPN; ipoib_transmit() treats a
  124. * destination MAC carrying this QPN as the broadcast address.
  125. */
  126. #define IPOIB_BROADCAST_QPN 0xffffffffUL
  127. /** Broadcast IPoIB address (only the QPN is set; the GID part is left zero) */
  128. static struct ipoib_mac ipoib_broadcast = {
  129. .qpn = ntohl ( IPOIB_BROADCAST_QPN ),
  130. };
  131. /**
  132. * Transmit IPoIB packet
  133. *
  134. * @v iobuf I/O buffer
  135. * @v netdev Network device
  136. * @v net_protocol Network-layer protocol
  137. * @v ll_dest Link-layer destination address
  138. *
  139. * Prepends the IPoIB link-layer header and transmits the packet.
  140. */
  141. static int ipoib_tx ( struct io_buffer *iobuf, struct net_device *netdev,
  142. struct net_protocol *net_protocol,
  143. const void *ll_dest ) {
  144. struct ipoib_hdr *ipoib_hdr =
  145. iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
  146. /* Build IPoIB header */
  147. memcpy ( &ipoib_hdr->pseudo.peer, ll_dest,
  148. sizeof ( ipoib_hdr->pseudo.peer ) );
  149. ipoib_hdr->real.proto = net_protocol->net_proto;
  150. ipoib_hdr->real.reserved = 0;
  151. /* Hand off to network device */
  152. return netdev_tx ( netdev, iobuf );
  153. }
  154. /**
  155. * Process received IPoIB packet
  156. *
  157. * @v iobuf I/O buffer
  158. * @v netdev Network device
  159. *
  160. * Strips off the IPoIB link-layer header and passes up to the
  161. * network-layer protocol.
  162. */
  163. static int ipoib_rx ( struct io_buffer *iobuf, struct net_device *netdev ) {
  164. struct ipoib_hdr *ipoib_hdr = iobuf->data;
  165. /* Sanity check */
  166. if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
  167. DBG ( "IPoIB packet too short for link-layer header\n" );
  168. DBG_HD ( iobuf->data, iob_len ( iobuf ) );
  169. free_iob ( iobuf );
  170. return -EINVAL;
  171. }
  172. /* Strip off IPoIB header */
  173. iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
  174. /* Hand off to network-layer protocol */
  175. return net_rx ( iobuf, netdev, ipoib_hdr->real.proto,
  176. &ipoib_hdr->pseudo.peer );
  177. }
  178. /**
  179. * Transcribe IPoIB address
  180. *
  181. * @v ll_addr Link-layer address
  182. * @ret string Link-layer address in human-readable format
  183. */
  184. const char * ipoib_ntoa ( const void *ll_addr ) {
  185. static char buf[45];
  186. const struct ipoib_mac *mac = ll_addr;
  187. snprintf ( buf, sizeof ( buf ), "%08lx:%08lx:%08lx:%08lx:%08lx",
  188. htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
  189. htonl ( mac->gid.u.dwords[1] ),
  190. htonl ( mac->gid.u.dwords[2] ),
  191. htonl ( mac->gid.u.dwords[3] ) );
  192. return buf;
  193. }
  194. /** IPoIB link-layer protocol */
  195. struct ll_protocol ipoib_protocol __ll_protocol = {
  196. .name = "IPoIB",
  197. .ll_proto = htons ( ARPHRD_INFINIBAND ),
  198. .ll_addr_len = IPOIB_ALEN,
  199. .ll_header_len = IPOIB_HLEN,
  200. .ll_broadcast = ( uint8_t * ) &ipoib_broadcast,
  201. .tx = ipoib_tx,
  202. .rx = ipoib_rx,
  203. .ntoa = ipoib_ntoa,
  204. };
  205. /****************************************************************************
  206. *
  207. * IPoIB network device
  208. *
  209. ****************************************************************************
  210. */
  211. /**
  212. * Destroy queue set
  213. *
  214. * @v ipoib IPoIB device
  215. * @v qset Queue set
  216. */
  217. static void ipoib_destroy_qset ( struct ipoib_device *ipoib,
  218. struct ipoib_queue_set *qset ) {
  219. struct ib_device *ibdev = ipoib->ibdev;
  220. if ( qset->qp )
  221. ib_destroy_qp ( ibdev, qset->qp );
  222. if ( qset->cq )
  223. ib_destroy_cq ( ibdev, qset->cq );
  224. memset ( qset, 0, sizeof ( *qset ) );
  225. }
  226. /**
  227. * Create queue set
  228. *
  229. * @v ipoib IPoIB device
  230. * @v qset Queue set
  231. * @ret rc Return status code
  232. */
  233. static int ipoib_create_qset ( struct ipoib_device *ipoib,
  234. struct ipoib_queue_set *qset,
  235. unsigned int num_cqes,
  236. unsigned int num_send_wqes,
  237. unsigned int num_recv_wqes,
  238. unsigned long qkey ) {
  239. struct ib_device *ibdev = ipoib->ibdev;
  240. int rc;
  241. /* Sanity check */
  242. assert ( qset->cq == NULL );
  243. assert ( qset->qp == NULL );
  244. /* Store queue parameters */
  245. qset->recv_max_fill = num_recv_wqes;
  246. /* Allocate completion queue */
  247. qset->cq = ib_create_cq ( ibdev, num_cqes );
  248. if ( ! qset->cq ) {
  249. DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
  250. ipoib );
  251. rc = -ENOMEM;
  252. goto err;
  253. }
  254. /* Allocate queue pair */
  255. qset->qp = ib_create_qp ( ibdev, num_send_wqes, qset->cq,
  256. num_recv_wqes, qset->cq, qkey );
  257. if ( ! qset->qp ) {
  258. DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
  259. ipoib );
  260. rc = -ENOMEM;
  261. goto err;
  262. }
  263. ib_qp_set_ownerdata ( qset->qp, ipoib->netdev );
  264. return 0;
  265. err:
  266. ipoib_destroy_qset ( ipoib, qset );
  267. return rc;
  268. }
  269. /**
  270. * Find path cache entry by GID
  271. *
  272. * @v gid GID
  273. * @ret entry Path cache entry, or NULL
  274. */
  275. static struct ipoib_cached_path *
  276. ipoib_find_cached_path ( struct ib_gid *gid ) {
  277. struct ipoib_cached_path *path;
  278. unsigned int i;
  279. for ( i = 0 ; i < IPOIB_NUM_CACHED_PATHS ; i++ ) {
  280. path = &ipoib_path_cache[i];
  281. if ( memcmp ( &path->gid, gid, sizeof ( *gid ) ) == 0 )
  282. return path;
  283. }
  284. DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
  285. htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
  286. htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
  287. return NULL;
  288. }
  289. /**
  290. * Transmit path record request
  291. *
  292. * @v ipoib IPoIB device
  293. * @v gid Destination GID
  294. * @ret rc Return status code
  295. */
  296. static int ipoib_get_path_record ( struct ipoib_device *ipoib,
  297. struct ib_gid *gid ) {
  298. struct ib_device *ibdev = ipoib->ibdev;
  299. struct io_buffer *iobuf;
  300. struct ib_mad_path_record *path_record;
  301. struct ib_address_vector av;
  302. int rc;
  303. /* Allocate I/O buffer */
  304. iobuf = alloc_iob ( sizeof ( *path_record ) );
  305. if ( ! iobuf )
  306. return -ENOMEM;
  307. iob_put ( iobuf, sizeof ( *path_record ) );
  308. path_record = iobuf->data;
  309. memset ( path_record, 0, sizeof ( *path_record ) );
  310. /* Construct path record request */
  311. path_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
  312. path_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
  313. path_record->mad_hdr.class_version = 2;
  314. path_record->mad_hdr.method = IB_MGMT_METHOD_GET;
  315. path_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_PATH_REC );
  316. path_record->mad_hdr.tid[0] = IPOIB_TID_GET_PATH_REC;
  317. path_record->mad_hdr.tid[1] = ipoib_meta_tid++;
  318. path_record->sa_hdr.comp_mask[1] =
  319. htonl ( IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID );
  320. memcpy ( &path_record->dgid, gid, sizeof ( path_record->dgid ) );
  321. memcpy ( &path_record->sgid, &ibdev->port_gid,
  322. sizeof ( path_record->sgid ) );
  323. /* Construct address vector */
  324. memset ( &av, 0, sizeof ( av ) );
  325. av.dlid = ibdev->sm_lid;
  326. av.dest_qp = IB_SA_QPN;
  327. av.qkey = IB_GLOBAL_QKEY;
  328. /* Post send request */
  329. if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
  330. iobuf ) ) != 0 ) {
  331. DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
  332. ipoib, strerror ( rc ) );
  333. free_iob ( iobuf );
  334. return rc;
  335. }
  336. return 0;
  337. }
  338. /**
  339. * Transmit multicast group membership request
  340. *
  341. * @v ipoib IPoIB device
  342. * @v gid Multicast GID
  343. * @v join Join (rather than leave) group
  344. * @ret rc Return status code
  345. */
  346. static int ipoib_mc_member_record ( struct ipoib_device *ipoib,
  347. struct ib_gid *gid, int join ) {
  348. struct ib_device *ibdev = ipoib->ibdev;
  349. struct io_buffer *iobuf;
  350. struct ib_mad_mc_member_record *mc_member_record;
  351. struct ib_address_vector av;
  352. int rc;
  353. /* Allocate I/O buffer */
  354. iobuf = alloc_iob ( sizeof ( *mc_member_record ) );
  355. if ( ! iobuf )
  356. return -ENOMEM;
  357. iob_put ( iobuf, sizeof ( *mc_member_record ) );
  358. mc_member_record = iobuf->data;
  359. memset ( mc_member_record, 0, sizeof ( *mc_member_record ) );
  360. /* Construct path record request */
  361. mc_member_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
  362. mc_member_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
  363. mc_member_record->mad_hdr.class_version = 2;
  364. mc_member_record->mad_hdr.method =
  365. ( join ? IB_MGMT_METHOD_SET : IB_MGMT_METHOD_DELETE );
  366. mc_member_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_MC_MEMBER_REC );
  367. mc_member_record->mad_hdr.tid[0] = IPOIB_TID_MC_MEMBER_REC;
  368. mc_member_record->mad_hdr.tid[1] = ipoib_meta_tid++;
  369. mc_member_record->sa_hdr.comp_mask[1] =
  370. htonl ( IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
  371. IB_SA_MCMEMBER_REC_JOIN_STATE );
  372. mc_member_record->scope__join_state = 1;
  373. memcpy ( &mc_member_record->mgid, gid,
  374. sizeof ( mc_member_record->mgid ) );
  375. memcpy ( &mc_member_record->port_gid, &ibdev->port_gid,
  376. sizeof ( mc_member_record->port_gid ) );
  377. /* Construct address vector */
  378. memset ( &av, 0, sizeof ( av ) );
  379. av.dlid = ibdev->sm_lid;
  380. av.dest_qp = IB_SA_QPN;
  381. av.qkey = IB_GLOBAL_QKEY;
  382. /* Post send request */
  383. if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
  384. iobuf ) ) != 0 ) {
  385. DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
  386. ipoib, strerror ( rc ) );
  387. free_iob ( iobuf );
  388. return rc;
  389. }
  390. return 0;
  391. }
  392. /**
  393. * Transmit packet via IPoIB network device
  394. *
  395. * @v netdev Network device
  396. * @v iobuf I/O buffer
  397. * @ret rc Return status code
  398. */
  399. static int ipoib_transmit ( struct net_device *netdev,
  400. struct io_buffer *iobuf ) {
  401. struct ipoib_device *ipoib = netdev->priv;
  402. struct ib_device *ibdev = ipoib->ibdev;
  403. struct ipoib_pseudo_hdr *ipoib_pshdr = iobuf->data;
  404. struct ib_address_vector av;
  405. struct ib_gid *gid;
  406. struct ipoib_cached_path *path;
  407. int rc;
  408. /* Sanity check */
  409. if ( iob_len ( iobuf ) < sizeof ( *ipoib_pshdr ) ) {
  410. DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
  411. return -EINVAL;
  412. }
  413. iob_pull ( iobuf, ( sizeof ( *ipoib_pshdr ) ) );
  414. /* Construct address vector */
  415. memset ( &av, 0, sizeof ( av ) );
  416. av.qkey = IB_GLOBAL_QKEY;
  417. av.gid_present = 1;
  418. if ( ipoib_pshdr->peer.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
  419. /* Broadcast address */
  420. av.dest_qp = IB_BROADCAST_QPN;
  421. av.dlid = ipoib->broadcast_lid;
  422. gid = &ipoib->broadcast_gid;
  423. } else {
  424. /* Unicast - look in path cache */
  425. path = ipoib_find_cached_path ( &ipoib_pshdr->peer.gid );
  426. if ( ! path ) {
  427. /* No path entry - get path record */
  428. rc = ipoib_get_path_record ( ipoib,
  429. &ipoib_pshdr->peer.gid );
  430. netdev_tx_complete ( netdev, iobuf );
  431. return rc;
  432. }
  433. av.dest_qp = ntohl ( ipoib_pshdr->peer.qpn );
  434. av.dlid = path->dlid;
  435. av.rate = path->rate;
  436. av.sl = path->sl;
  437. gid = &ipoib_pshdr->peer.gid;
  438. }
  439. memcpy ( &av.gid, gid, sizeof ( av.gid ) );
  440. return ib_post_send ( ibdev, ipoib->data.qp, &av, iobuf );
  441. }
  442. /**
  443. * Handle IPoIB data send completion
  444. *
  445. * @v ibdev Infiniband device
  446. * @v qp Queue pair
  447. * @v completion Completion
  448. * @v iobuf I/O buffer
  449. */
  450. static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
  451. struct ib_queue_pair *qp,
  452. struct ib_completion *completion,
  453. struct io_buffer *iobuf ) {
  454. struct net_device *netdev = ib_qp_get_ownerdata ( qp );
  455. netdev_tx_complete_err ( netdev, iobuf,
  456. ( completion->syndrome ? -EIO : 0 ) );
  457. }
  458. /**
  459. * Handle IPoIB data receive completion
  460. *
  461. * @v ibdev Infiniband device
  462. * @v qp Queue pair
  463. * @v completion Completion
  464. * @v iobuf I/O buffer
  465. */
  466. static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
  467. struct ib_queue_pair *qp,
  468. struct ib_completion *completion,
  469. struct io_buffer *iobuf ) {
  470. struct net_device *netdev = ib_qp_get_ownerdata ( qp );
  471. struct ipoib_device *ipoib = netdev->priv;
  472. struct ipoib_pseudo_hdr *ipoib_pshdr;
  473. if ( completion->syndrome ) {
  474. netdev_rx_err ( netdev, iobuf, -EIO );
  475. goto done;
  476. }
  477. iob_put ( iobuf, completion->len );
  478. if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
  479. DBGC ( ipoib, "IPoIB %p received data packet too short to "
  480. "contain GRH\n", ipoib );
  481. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  482. netdev_rx_err ( netdev, iobuf, -EIO );
  483. goto done;
  484. }
  485. iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
  486. if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
  487. DBGC ( ipoib, "IPoIB %p received data packet too short to "
  488. "contain IPoIB header\n", ipoib );
  489. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  490. netdev_rx_err ( netdev, iobuf, -EIO );
  491. goto done;
  492. }
  493. ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
  494. /* FIXME: fill in a MAC address for the sake of AoE! */
  495. netdev_rx ( netdev, iobuf );
  496. done:
  497. ipoib->data.recv_fill--;
  498. }
  499. /**
  500. * Handle IPoIB metadata send completion
  501. *
  502. * @v ibdev Infiniband device
  503. * @v qp Queue pair
  504. * @v completion Completion
  505. * @v iobuf I/O buffer
  506. */
  507. static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
  508. struct ib_queue_pair *qp,
  509. struct ib_completion *completion,
  510. struct io_buffer *iobuf ) {
  511. struct net_device *netdev = ib_qp_get_ownerdata ( qp );
  512. struct ipoib_device *ipoib = netdev->priv;
  513. if ( completion->syndrome ) {
  514. DBGC ( ipoib, "IPoIB %p metadata TX completion error %x\n",
  515. ipoib, completion->syndrome );
  516. }
  517. free_iob ( iobuf );
  518. }
  519. /**
  520. * Handle received IPoIB path record
  521. *
  522. * @v ipoib IPoIB device
  523. * @v path_record Path record
  524. */
  525. static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
  526. struct ib_mad_path_record *path_record ) {
  527. struct ipoib_cached_path *path;
  528. /* Update path cache entry */
  529. path = &ipoib_path_cache[ipoib_path_cache_idx];
  530. memcpy ( &path->gid, &path_record->dgid, sizeof ( path->gid ) );
  531. path->dlid = ntohs ( path_record->dlid );
  532. path->sl = ( path_record->reserved__sl & 0x0f );
  533. path->rate = ( path_record->rate_selector__rate & 0x3f );
  534. DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
  535. htonl ( path->gid.u.dwords[0] ), htonl ( path->gid.u.dwords[1] ),
  536. htonl ( path->gid.u.dwords[2] ), htonl ( path->gid.u.dwords[3] ),
  537. path->dlid, path->sl, path->rate );
  538. /* Update path cache index */
  539. ipoib_path_cache_idx++;
  540. if ( ipoib_path_cache_idx == IPOIB_NUM_CACHED_PATHS )
  541. ipoib_path_cache_idx = 0;
  542. }
  543. /**
  544. * Handle received IPoIB multicast membership record
  545. *
  546. * @v ipoib IPoIB device
  547. * @v mc_member_record Multicast membership record
  548. */
  549. static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib,
  550. struct ib_mad_mc_member_record *mc_member_record ) {
  551. int joined;
  552. int rc;
  553. /* Record parameters */
  554. joined = ( mc_member_record->scope__join_state & 0x0f );
  555. ipoib->data_qkey = ntohl ( mc_member_record->qkey );
  556. ipoib->broadcast_lid = ntohs ( mc_member_record->mlid );
  557. DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n",
  558. ipoib, ( joined ? "joined" : "left" ), ipoib->data_qkey,
  559. ipoib->broadcast_lid );
  560. /* Update data queue pair qkey */
  561. if ( ( rc = ib_modify_qp ( ipoib->ibdev, ipoib->data.qp,
  562. IB_MODIFY_QKEY, ipoib->data_qkey ) ) != 0 ){
  563. DBGC ( ipoib, "IPoIB %p could not update data qkey: %s\n",
  564. ipoib, strerror ( rc ) );
  565. return;
  566. }
  567. }
  568. /**
  569. * Handle IPoIB metadata receive completion
  570. *
  571. * @v ibdev Infiniband device
  572. * @v qp Queue pair
  573. * @v completion Completion
  574. * @v iobuf I/O buffer
  575. */
  576. static void ipoib_meta_complete_recv ( struct ib_device *ibdev __unused,
  577. struct ib_queue_pair *qp,
  578. struct ib_completion *completion,
  579. struct io_buffer *iobuf ) {
  580. struct net_device *netdev = ib_qp_get_ownerdata ( qp );
  581. struct ipoib_device *ipoib = netdev->priv;
  582. union ib_mad *mad;
  583. if ( completion->syndrome ) {
  584. DBGC ( ipoib, "IPoIB %p metadata RX completion error %x\n",
  585. ipoib, completion->syndrome );
  586. goto done;
  587. }
  588. iob_put ( iobuf, completion->len );
  589. if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
  590. DBGC ( ipoib, "IPoIB %p received metadata packet too short "
  591. "to contain GRH\n", ipoib );
  592. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  593. goto done;
  594. }
  595. iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
  596. if ( iob_len ( iobuf ) < sizeof ( *mad ) ) {
  597. DBGC ( ipoib, "IPoIB %p received metadata packet too short "
  598. "to contain reply\n", ipoib );
  599. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  600. goto done;
  601. }
  602. mad = iobuf->data;
  603. if ( mad->mad_hdr.status != 0 ) {
  604. DBGC ( ipoib, "IPoIB %p metadata RX err status %04x\n",
  605. ipoib, ntohs ( mad->mad_hdr.status ) );
  606. goto done;
  607. }
  608. switch ( mad->mad_hdr.tid[0] ) {
  609. case IPOIB_TID_GET_PATH_REC:
  610. ipoib_recv_path_record ( ipoib, &mad->path_record );
  611. break;
  612. case IPOIB_TID_MC_MEMBER_REC:
  613. ipoib_recv_mc_member_record ( ipoib, &mad->mc_member_record );
  614. break;
  615. default:
  616. DBGC ( ipoib, "IPoIB %p unwanted response:\n",
  617. ipoib );
  618. DBGC_HD ( ipoib, mad, sizeof ( *mad ) );
  619. break;
  620. }
  621. done:
  622. ipoib->meta.recv_fill--;
  623. free_iob ( iobuf );
  624. }
  625. /**
  626. * Refill IPoIB receive ring
  627. *
  628. * @v ipoib IPoIB device
  629. */
  630. static void ipoib_refill_recv ( struct ipoib_device *ipoib,
  631. struct ipoib_queue_set *qset ) {
  632. struct ib_device *ibdev = ipoib->ibdev;
  633. struct io_buffer *iobuf;
  634. int rc;
  635. while ( qset->recv_fill < qset->recv_max_fill ) {
  636. iobuf = alloc_iob ( IPOIB_MTU );
  637. if ( ! iobuf )
  638. break;
  639. if ( ( rc = ib_post_recv ( ibdev, qset->qp, iobuf ) ) != 0 ) {
  640. free_iob ( iobuf );
  641. break;
  642. }
  643. qset->recv_fill++;
  644. }
  645. }
  646. /**
  647. * Poll IPoIB network device
  648. *
  649. * @v netdev Network device
  650. */
  651. static void ipoib_poll ( struct net_device *netdev ) {
  652. struct ipoib_device *ipoib = netdev->priv;
  653. struct ib_device *ibdev = ipoib->ibdev;
  654. ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
  655. ipoib_meta_complete_recv );
  656. ib_poll_cq ( ibdev, ipoib->data.cq, ipoib_data_complete_send,
  657. ipoib_data_complete_recv );
  658. ipoib_refill_recv ( ipoib, &ipoib->meta );
  659. ipoib_refill_recv ( ipoib, &ipoib->data );
  660. }
  661. /**
  662. * Enable/disable interrupts on IPoIB network device
  663. *
  664. * @v netdev Network device
  665. * @v enable Interrupts should be enabled
  666. */
  667. static void ipoib_irq ( struct net_device *netdev __unused,
  668. int enable __unused ) {
  669. /* No implementation */
  670. }
  671. /**
  672. * Join IPv4 broadcast multicast group
  673. *
  674. * @v ipoib IPoIB device
  675. * @ret rc Return status code
  676. */
  677. static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
  678. int rc;
  679. /* Sanity check */
  680. if ( ! ipoib->data.qp )
  681. return 0;
  682. /* Attach data queue to broadcast multicast GID */
  683. assert ( ipoib->broadcast_attached == 0 );
  684. if ( ( rc = ib_mcast_attach ( ipoib->ibdev, ipoib->data.qp,
  685. &ipoib->broadcast_gid ) ) != 0 ){
  686. DBGC ( ipoib, "IPoIB %p could not attach to broadcast GID: "
  687. "%s\n", ipoib, strerror ( rc ) );
  688. return rc;
  689. }
  690. ipoib->broadcast_attached = 1;
  691. /* Initiate broadcast group join */
  692. if ( ( rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid,
  693. 1 ) ) != 0 ) {
  694. DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n",
  695. ipoib, strerror ( rc ) );
  696. return rc;
  697. }
  698. return 0;
  699. }
  700. /**
  701. * Leave IPv4 broadcast multicast group
  702. *
  703. * @v ipoib IPoIB device
  704. */
  705. static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
  706. /* Detach data queue from broadcast multicast GID */
  707. if ( ipoib->broadcast_attached ) {
  708. assert ( ipoib->data.qp != NULL );
  709. ib_mcast_detach ( ipoib->ibdev, ipoib->data.qp,
  710. &ipoib->broadcast_gid );
  711. ipoib->broadcast_attached = 0;
  712. }
  713. }
  714. /**
  715. * Open IPoIB network device
  716. *
  717. * @v netdev Network device
  718. * @ret rc Return status code
  719. */
  720. static int ipoib_open ( struct net_device *netdev ) {
  721. struct ipoib_device *ipoib = netdev->priv;
  722. struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
  723. int rc;
  724. /* Allocate metadata queue set */
  725. if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->meta,
  726. IPOIB_META_NUM_CQES,
  727. IPOIB_META_NUM_SEND_WQES,
  728. IPOIB_META_NUM_RECV_WQES,
  729. IB_GLOBAL_QKEY ) ) != 0 ) {
  730. DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n",
  731. ipoib, strerror ( rc ) );
  732. goto err_create_meta_qset;
  733. }
  734. /* Allocate data queue set */
  735. if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->data,
  736. IPOIB_DATA_NUM_CQES,
  737. IPOIB_DATA_NUM_SEND_WQES,
  738. IPOIB_DATA_NUM_RECV_WQES,
  739. IB_GLOBAL_QKEY ) ) != 0 ) {
  740. DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n",
  741. ipoib, strerror ( rc ) );
  742. goto err_create_data_qset;
  743. }
  744. /* Update MAC address with data QPN */
  745. mac->qpn = htonl ( ipoib->data.qp->qpn );
  746. /* Fill receive rings */
  747. ipoib_refill_recv ( ipoib, &ipoib->meta );
  748. ipoib_refill_recv ( ipoib, &ipoib->data );
  749. /* Join broadcast group */
  750. if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
  751. DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
  752. ipoib, strerror ( rc ) );
  753. goto err_join_broadcast;
  754. }
  755. return 0;
  756. err_join_broadcast:
  757. ipoib_destroy_qset ( ipoib, &ipoib->data );
  758. err_create_data_qset:
  759. ipoib_destroy_qset ( ipoib, &ipoib->meta );
  760. err_create_meta_qset:
  761. return rc;
  762. }
  763. /**
  764. * Close IPoIB network device
  765. *
  766. * @v netdev Network device
  767. */
  768. static void ipoib_close ( struct net_device *netdev ) {
  769. struct ipoib_device *ipoib = netdev->priv;
  770. struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
  771. /* Leave broadcast group */
  772. ipoib_leave_broadcast_group ( ipoib );
  773. /* Remove data QPN from MAC address */
  774. mac->qpn = 0;
  775. /* Tear down the queues */
  776. ipoib_destroy_qset ( ipoib, &ipoib->data );
  777. ipoib_destroy_qset ( ipoib, &ipoib->meta );
  778. }
  779. /** IPoIB network device operations (installed by ipoib_probe() via netdev_init()) */
  780. static struct net_device_operations ipoib_operations = {
  781. .open = ipoib_open,
  782. .close = ipoib_close,
  783. .transmit = ipoib_transmit,
  784. .poll = ipoib_poll,
  785. .irq = ipoib_irq,
  786. };
  787. /**
  788. * Update IPoIB dynamic Infiniband parameters
  789. *
  790. * @v ipoib IPoIB device
  791. *
  792. * The Infiniband port GID and partition key will change at runtime,
  793. * when the link is established (or lost). The MAC address is based
  794. * on the port GID, and the broadcast GID is based on the partition
  795. * key. This function recalculates these IPoIB device parameters.
  796. */
  797. static void ipoib_set_ib_params ( struct ipoib_device *ipoib ) {
  798. struct ib_device *ibdev = ipoib->ibdev;
  799. struct ipoib_mac *mac;
  800. /* Calculate GID portion of MAC address based on port GID */
  801. mac = ( ( struct ipoib_mac * ) ipoib->netdev->ll_addr );
  802. memcpy ( &mac->gid, &ibdev->port_gid, sizeof ( mac->gid ) );
  803. /* Calculate broadcast GID based on partition key */
  804. memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
  805. sizeof ( ipoib->broadcast_gid ) );
  806. ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );
  807. }
  808. /**
  809. * Handle link status change
  810. *
  811. * @v ibdev Infiniband device
  812. */
  813. void ipoib_link_state_changed ( struct ib_device *ibdev ) {
  814. struct net_device *netdev = ib_get_ownerdata ( ibdev );
  815. struct ipoib_device *ipoib = netdev->priv;
  816. int rc;
  817. /* Leave existing broadcast group */
  818. ipoib_leave_broadcast_group ( ipoib );
  819. /* Update MAC address and broadcast GID based on new port GID
  820. * and partition key.
  821. */
  822. ipoib_set_ib_params ( ipoib );
  823. /* Join new broadcast group */
  824. if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
  825. DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
  826. "%s\n", ipoib, strerror ( rc ) );
  827. return;
  828. }
  829. }
  830. /**
  831. * Probe IPoIB device
  832. *
  833. * @v ibdev Infiniband device
  834. * @ret rc Return status code
  835. */
  836. int ipoib_probe ( struct ib_device *ibdev ) {
  837. struct net_device *netdev;
  838. struct ipoib_device *ipoib;
  839. int rc;
  840. /* Allocate network device */
  841. netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
  842. if ( ! netdev )
  843. return -ENOMEM;
  844. netdev_init ( netdev, &ipoib_operations );
  845. ipoib = netdev->priv;
  846. ib_set_ownerdata ( ibdev, netdev );
  847. netdev->dev = ibdev->dev;
  848. memset ( ipoib, 0, sizeof ( *ipoib ) );
  849. ipoib->netdev = netdev;
  850. ipoib->ibdev = ibdev;
  851. /* Calculate as much of the broadcast GID and the MAC address
  852. * as we can. We won't know either of these in full until we
  853. * have link-up.
  854. */
  855. ipoib_set_ib_params ( ipoib );
  856. /* Register network device */
  857. if ( ( rc = register_netdev ( netdev ) ) != 0 )
  858. goto err_register_netdev;
  859. return 0;
  860. err_register_netdev:
  861. netdev_nullify ( netdev );
  862. netdev_put ( netdev );
  863. return rc;
  864. }
  /**
   * Remove IPoIB device
   *
   * @v ibdev           Infiniband device
   *
   * Unregisters and releases the network device that ipoib_probe()
   * stored as the Infiniband device's owner data.
   */
  void ipoib_remove ( struct ib_device *ibdev ) {
      struct net_device *owner = ib_get_ownerdata ( ibdev );

      unregister_netdev ( owner );
      netdev_nullify ( owner );
      netdev_put ( owner );
  }