You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ipoib.c 24KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918
  1. /*
  2. * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public License as
  6. * published by the Free Software Foundation; either version 2 of the
  7. * License, or any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17. * 02110-1301, USA.
  18. */
  19. FILE_LICENCE ( GPL2_OR_LATER );
  20. #include <stdint.h>
  21. #include <stdlib.h>
  22. #include <stdio.h>
  23. #include <unistd.h>
  24. #include <string.h>
  25. #include <byteswap.h>
  26. #include <errno.h>
  27. #include <ipxe/errortab.h>
  28. #include <ipxe/malloc.h>
  29. #include <ipxe/if_arp.h>
  30. #include <ipxe/if_ether.h>
  31. #include <ipxe/ethernet.h>
  32. #include <ipxe/iobuf.h>
  33. #include <ipxe/netdevice.h>
  34. #include <ipxe/infiniband.h>
  35. #include <ipxe/ib_pathrec.h>
  36. #include <ipxe/ib_mcast.h>
  37. #include <ipxe/retry.h>
  38. #include <ipxe/ipoib.h>
  39. /** @file
  40. *
  41. * IP over Infiniband
  42. */
  /** Depth of the send work queue requested for the IPoIB queue pair */
  #define IPOIB_NUM_SEND_WQES 2

  /** Depth of the receive work queue requested for the IPoIB queue pair */
  #define IPOIB_NUM_RECV_WQES 4

  /** Number of entries requested for the IPoIB completion queue */
  #define IPOIB_NUM_CQES 8
  49. /** An IPoIB device */
  50. struct ipoib_device {
  51. /** Network device */
  52. struct net_device *netdev;
  53. /** Underlying Infiniband device */
  54. struct ib_device *ibdev;
  55. /** Completion queue */
  56. struct ib_completion_queue *cq;
  57. /** Queue pair */
  58. struct ib_queue_pair *qp;
  59. /** Local MAC */
  60. struct ipoib_mac mac;
  61. /** Broadcast MAC */
  62. struct ipoib_mac broadcast;
  63. /** Joined to IPv4 broadcast multicast group
  64. *
  65. * This flag indicates whether or not we have initiated the
  66. * join to the IPv4 broadcast multicast group.
  67. */
  68. int broadcast_joined;
  69. /** IPv4 broadcast multicast group membership */
  70. struct ib_mc_membership broadcast_membership;
  71. /** REMAC cache */
  72. struct list_head peers;
  73. };
  74. /** Broadcast IPoIB address */
  75. static struct ipoib_mac ipoib_broadcast = {
  76. .flags__qpn = htonl ( IB_QPN_BROADCAST ),
  77. .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  78. 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
  79. };
  80. /** Link status for "broadcast join in progress" */
  81. #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
  82. #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
  83. ( EINFO_EINPROGRESS, 0x01, "Joining" )
  84. /** Human-readable message for the link status */
  85. struct errortab ipoib_errors[] __errortab = {
  86. __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
  87. };
  88. /****************************************************************************
  89. *
  90. * IPoIB REMAC cache
  91. *
  92. ****************************************************************************
  93. */
  94. /** An IPoIB REMAC cache entry */
  95. struct ipoib_peer {
  96. /** List of REMAC cache entries */
  97. struct list_head list;
  98. /** Remote Ethermet MAC */
  99. struct ipoib_remac remac;
  100. /** MAC address */
  101. struct ipoib_mac mac;
  102. };
  103. /**
  104. * Find IPoIB MAC from REMAC
  105. *
  106. * @v ipoib IPoIB device
  107. * @v remac Remote Ethernet MAC
  108. * @ret mac IPoIB MAC (or NULL if not found)
  109. */
  110. static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
  111. const struct ipoib_remac *remac ) {
  112. struct ipoib_peer *peer;
  113. /* Check for broadcast REMAC */
  114. if ( is_broadcast_ether_addr ( remac ) )
  115. return &ipoib->broadcast;
  116. /* Try to find via REMAC cache */
  117. list_for_each_entry ( peer, &ipoib->peers, list ) {
  118. if ( memcmp ( remac, &peer->remac,
  119. sizeof ( peer->remac ) ) == 0 ) {
  120. /* Move peer to start of list */
  121. list_del ( &peer->list );
  122. list_add ( &peer->list, &ipoib->peers );
  123. return &peer->mac;
  124. }
  125. }
  126. DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
  127. ipoib, eth_ntoa ( remac ) );
  128. return NULL;
  129. }
  130. /**
  131. * Add IPoIB MAC to REMAC cache
  132. *
  133. * @v ipoib IPoIB device
  134. * @v remac Remote Ethernet MAC
  135. * @v mac IPoIB MAC
  136. * @ret rc Return status code
  137. */
  138. static int ipoib_map_remac ( struct ipoib_device *ipoib,
  139. const struct ipoib_remac *remac,
  140. const struct ipoib_mac *mac ) {
  141. struct ipoib_peer *peer;
  142. /* Check for existing entry in REMAC cache */
  143. list_for_each_entry ( peer, &ipoib->peers, list ) {
  144. if ( memcmp ( remac, &peer->remac,
  145. sizeof ( peer->remac ) ) == 0 ) {
  146. /* Move peer to start of list */
  147. list_del ( &peer->list );
  148. list_add ( &peer->list, &ipoib->peers );
  149. /* Update MAC */
  150. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  151. return 0;
  152. }
  153. }
  154. /* Create new entry */
  155. peer = malloc ( sizeof ( *peer ) );
  156. if ( ! peer )
  157. return -ENOMEM;
  158. memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
  159. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  160. list_add ( &peer->list, &ipoib->peers );
  161. return 0;
  162. }
  163. /**
  164. * Flush REMAC cache
  165. *
  166. * @v ipoib IPoIB device
  167. */
  168. static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
  169. struct ipoib_peer *peer;
  170. struct ipoib_peer *tmp;
  171. list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
  172. list_del ( &peer->list );
  173. free ( peer );
  174. }
  175. }
  176. /**
  177. * Discard some entries from the REMAC cache
  178. *
  179. * @ret discarded Number of cached items discarded
  180. */
  181. static unsigned int ipoib_discard_remac ( void ) {
  182. struct ib_device *ibdev;
  183. struct ipoib_device *ipoib;
  184. struct ipoib_peer *peer;
  185. unsigned int discarded = 0;
  186. /* Try to discard one cache entry for each IPoIB device */
  187. for_each_ibdev ( ibdev ) {
  188. ipoib = ib_get_ownerdata ( ibdev );
  189. list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
  190. list_del ( &peer->list );
  191. free ( peer );
  192. discarded++;
  193. break;
  194. }
  195. }
  196. return discarded;
  197. }
  198. /** IPoIB cache discarder */
  199. struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_NORMAL ) = {
  200. .discard = ipoib_discard_remac,
  201. };
  202. /****************************************************************************
  203. *
  204. * IPoIB link layer
  205. *
  206. ****************************************************************************
  207. */
  208. /**
  209. * Initialise IPoIB link-layer address
  210. *
  211. * @v hw_addr Hardware address
  212. * @v ll_addr Link-layer address
  213. */
  214. static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
  215. const uint8_t *guid = hw_addr;
  216. uint8_t *eth_addr = ll_addr;
  217. uint8_t guid_mask = IPOIB_GUID_MASK;
  218. unsigned int i;
  219. /* Extract bytes from GUID according to mask */
  220. for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
  221. if ( guid_mask & 0x80 )
  222. *(eth_addr++) = *guid;
  223. }
  224. }
  225. /** IPoIB protocol */
  226. struct ll_protocol ipoib_protocol __ll_protocol = {
  227. .name = "IPoIB",
  228. .ll_proto = htons ( ARPHRD_ETHER ),
  229. .hw_addr_len = sizeof ( union ib_guid ),
  230. .ll_addr_len = ETH_ALEN,
  231. .ll_header_len = ETH_HLEN,
  232. .push = eth_push,
  233. .pull = eth_pull,
  234. .init_addr = ipoib_init_addr,
  235. .ntoa = eth_ntoa,
  236. .mc_hash = eth_mc_hash,
  237. .eth_addr = eth_eth_addr,
  238. .flags = LL_NAME_ONLY,
  239. };
  240. /**
  241. * Allocate IPoIB device
  242. *
  243. * @v priv_size Size of driver private data
  244. * @ret netdev Network device, or NULL
  245. */
  246. struct net_device * alloc_ipoibdev ( size_t priv_size ) {
  247. struct net_device *netdev;
  248. netdev = alloc_netdev ( priv_size );
  249. if ( netdev ) {
  250. netdev->ll_protocol = &ipoib_protocol;
  251. netdev->ll_broadcast = eth_broadcast;
  252. netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
  253. }
  254. return netdev;
  255. }
  256. /****************************************************************************
  257. *
  258. * IPoIB translation layer
  259. *
  260. ****************************************************************************
  261. */
  262. /**
  263. * Translate transmitted ARP packet
  264. *
  265. * @v netdev Network device
  266. * @v iobuf Packet to be transmitted (with no link-layer headers)
  267. * @ret rc Return status code
  268. */
  269. static int ipoib_translate_tx_arp ( struct net_device *netdev,
  270. struct io_buffer *iobuf ) {
  271. struct ipoib_device *ipoib = netdev->priv;
  272. struct arphdr *arphdr = iobuf->data;
  273. struct ipoib_mac *target_ha = NULL;
  274. void *sender_pa;
  275. void *target_pa;
  276. /* Do nothing unless ARP contains eIPoIB link-layer addresses */
  277. if ( arphdr->ar_hln != ETH_ALEN )
  278. return 0;
  279. /* Fail unless we have room to expand packet */
  280. if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
  281. ETH_ALEN ) ) ) {
  282. DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
  283. ipoib );
  284. return -ENOBUFS;
  285. }
  286. /* Look up REMAC, if applicable */
  287. if ( arphdr->ar_op == ARPOP_REPLY ) {
  288. target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
  289. if ( ! target_ha )
  290. return -ENXIO;
  291. }
  292. /* Construct new packet */
  293. iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  294. sender_pa = arp_sender_pa ( arphdr );
  295. target_pa = arp_target_pa ( arphdr );
  296. arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
  297. arphdr->ar_hln = sizeof ( ipoib->mac );
  298. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  299. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  300. memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
  301. memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
  302. if ( target_ha ) {
  303. memcpy ( arp_target_ha ( arphdr ), target_ha,
  304. sizeof ( *target_ha ) );
  305. }
  306. return 0;
  307. }
  308. /**
  309. * Translate transmitted packet
  310. *
  311. * @v netdev Network device
  312. * @v iobuf Packet to be transmitted (with no link-layer headers)
  313. * @v net_proto Network-layer protocol (in network byte order)
  314. * @ret rc Return status code
  315. */
  316. static int ipoib_translate_tx ( struct net_device *netdev,
  317. struct io_buffer *iobuf, uint16_t net_proto ) {
  318. switch ( net_proto ) {
  319. case htons ( ETH_P_ARP ) :
  320. return ipoib_translate_tx_arp ( netdev, iobuf );
  321. case htons ( ETH_P_IP ) :
  322. /* No translation needed */
  323. return 0;
  324. default:
  325. /* Cannot handle other traffic via eIPoIB */
  326. return -ENOTSUP;
  327. }
  328. }
  329. /**
  330. * Translate received ARP packet
  331. *
  332. * @v netdev Network device
  333. * @v iobuf Received packet (with no link-layer headers)
  334. * @v remac Constructed Remote Ethernet MAC
  335. * @ret rc Return status code
  336. */
  337. static int ipoib_translate_rx_arp ( struct net_device *netdev,
  338. struct io_buffer *iobuf,
  339. struct ipoib_remac *remac ) {
  340. struct ipoib_device *ipoib = netdev->priv;
  341. struct arphdr *arphdr = iobuf->data;
  342. void *sender_pa;
  343. void *target_pa;
  344. int rc;
  345. /* Do nothing unless ARP contains IPoIB link-layer addresses */
  346. if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
  347. return 0;
  348. /* Create REMAC cache entry */
  349. if ( ( rc = ipoib_map_remac ( ipoib, remac,
  350. arp_sender_ha ( arphdr ) ) ) != 0 ) {
  351. DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
  352. ipoib, strerror ( rc ) );
  353. return rc;
  354. }
  355. /* Construct new packet */
  356. sender_pa = arp_sender_pa ( arphdr );
  357. target_pa = arp_target_pa ( arphdr );
  358. arphdr->ar_hrd = htons ( ARPHRD_ETHER );
  359. arphdr->ar_hln = ETH_ALEN;
  360. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  361. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  362. memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
  363. memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
  364. if ( arphdr->ar_op == ARPOP_REPLY ) {
  365. /* Assume received replies were directed to us */
  366. memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
  367. }
  368. iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  369. return 0;
  370. }
  371. /**
  372. * Translate received packet
  373. *
  374. * @v netdev Network device
  375. * @v iobuf Received packet (with no link-layer headers)
  376. * @v remac Constructed Remote Ethernet MAC
  377. * @v net_proto Network-layer protocol (in network byte order)
  378. * @ret rc Return status code
  379. */
  380. static int ipoib_translate_rx ( struct net_device *netdev,
  381. struct io_buffer *iobuf,
  382. struct ipoib_remac *remac,
  383. uint16_t net_proto ) {
  384. switch ( net_proto ) {
  385. case htons ( ETH_P_ARP ) :
  386. return ipoib_translate_rx_arp ( netdev, iobuf, remac );
  387. case htons ( ETH_P_IP ) :
  388. /* No translation needed */
  389. return 0;
  390. default:
  391. /* Cannot handle other traffic via eIPoIB */
  392. return -ENOTSUP;
  393. }
  394. }
  395. /****************************************************************************
  396. *
  397. * IPoIB network device
  398. *
  399. ****************************************************************************
  400. */
  401. /**
  402. * Transmit packet via IPoIB network device
  403. *
  404. * @v netdev Network device
  405. * @v iobuf I/O buffer
  406. * @ret rc Return status code
  407. */
  408. static int ipoib_transmit ( struct net_device *netdev,
  409. struct io_buffer *iobuf ) {
  410. struct ipoib_device *ipoib = netdev->priv;
  411. struct ib_device *ibdev = ipoib->ibdev;
  412. struct ethhdr *ethhdr;
  413. struct ipoib_hdr *ipoib_hdr;
  414. struct ipoib_mac *mac;
  415. struct ib_address_vector dest;
  416. uint16_t net_proto;
  417. int rc;
  418. /* Sanity check */
  419. if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
  420. DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
  421. return -EINVAL;
  422. }
  423. /* Attempting transmission while link is down will put the
  424. * queue pair into an error state, so don't try it.
  425. */
  426. if ( ! ib_link_ok ( ibdev ) )
  427. return -ENETUNREACH;
  428. /* Strip eIPoIB header */
  429. ethhdr = iobuf->data;
  430. net_proto = ethhdr->h_protocol;
  431. iob_pull ( iobuf, sizeof ( *ethhdr ) );
  432. /* Identify destination address */
  433. mac = ipoib_find_remac ( ipoib, ( ( void *) ethhdr->h_dest ) );
  434. if ( ! mac )
  435. return -ENXIO;
  436. /* Translate packet if applicable */
  437. if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
  438. return rc;
  439. /* Prepend real IPoIB header */
  440. ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
  441. ipoib_hdr->proto = net_proto;
  442. ipoib_hdr->reserved = 0;
  443. /* Construct address vector */
  444. memset ( &dest, 0, sizeof ( dest ) );
  445. dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
  446. dest.gid_present = 1;
  447. memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) );
  448. if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) {
  449. /* Path not resolved yet */
  450. return rc;
  451. }
  452. return ib_post_send ( ibdev, ipoib->qp, &dest, iobuf );
  453. }
  454. /**
  455. * Handle IPoIB send completion
  456. *
  457. * @v ibdev Infiniband device
  458. * @v qp Queue pair
  459. * @v iobuf I/O buffer
  460. * @v rc Completion status code
  461. */
  462. static void ipoib_complete_send ( struct ib_device *ibdev __unused,
  463. struct ib_queue_pair *qp,
  464. struct io_buffer *iobuf, int rc ) {
  465. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  466. netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
  467. }
  468. /**
  469. * Handle IPoIB receive completion
  470. *
  471. * @v ibdev Infiniband device
  472. * @v qp Queue pair
  473. * @v dest Destination address vector, or NULL
  474. * @v source Source address vector, or NULL
  475. * @v iobuf I/O buffer
  476. * @v rc Completion status code
  477. */
  478. static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
  479. struct ib_queue_pair *qp,
  480. struct ib_address_vector *dest,
  481. struct ib_address_vector *source,
  482. struct io_buffer *iobuf, int rc ) {
  483. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  484. struct net_device *netdev = ipoib->netdev;
  485. struct ipoib_hdr *ipoib_hdr;
  486. struct ethhdr *ethhdr;
  487. struct ipoib_remac remac;
  488. uint16_t net_proto;
  489. /* Record errors */
  490. if ( rc != 0 ) {
  491. netdev_rx_err ( netdev, iobuf, rc );
  492. return;
  493. }
  494. /* Sanity check */
  495. if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
  496. DBGC ( ipoib, "IPoIB %p received packet too short to "
  497. "contain IPoIB header\n", ipoib );
  498. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  499. netdev_rx_err ( netdev, iobuf, -EIO );
  500. return;
  501. }
  502. if ( ! source ) {
  503. DBGC ( ipoib, "IPoIB %p received packet without address "
  504. "vector\n", ipoib );
  505. netdev_rx_err ( netdev, iobuf, -ENOTTY );
  506. return;
  507. }
  508. /* Strip real IPoIB header */
  509. ipoib_hdr = iobuf->data;
  510. net_proto = ipoib_hdr->proto;
  511. iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
  512. /* Construct source address from remote QPN and LID */
  513. remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
  514. remac.lid = htons ( source->lid );
  515. /* Translate packet if applicable */
  516. if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
  517. net_proto ) ) != 0 ) {
  518. netdev_rx_err ( netdev, iobuf, rc );
  519. return;
  520. }
  521. /* Prepend eIPoIB header */
  522. ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
  523. memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
  524. ethhdr->h_protocol = net_proto;
  525. /* Construct destination address */
  526. if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid,
  527. sizeof ( dest->gid ) ) == 0 ) ) {
  528. /* Broadcast GID; use the Ethernet broadcast address */
  529. memcpy ( &ethhdr->h_dest, eth_broadcast,
  530. sizeof ( ethhdr->h_dest ) );
  531. } else {
  532. /* Assume destination address is local Ethernet MAC */
  533. memcpy ( &ethhdr->h_dest, netdev->ll_addr,
  534. sizeof ( ethhdr->h_dest ) );
  535. }
  536. /* Hand off to network layer */
  537. netdev_rx ( netdev, iobuf );
  538. }
  539. /** IPoIB completion operations */
  540. static struct ib_completion_queue_operations ipoib_cq_op = {
  541. .complete_send = ipoib_complete_send,
  542. .complete_recv = ipoib_complete_recv,
  543. };
  544. /**
  545. * Allocate IPoIB receive I/O buffer
  546. *
  547. * @v len Length of buffer
  548. * @ret iobuf I/O buffer, or NULL
  549. *
  550. * Some Infiniband hardware requires 2kB alignment of receive buffers
  551. * and provides no way to disable header separation. The result is
  552. * that there are only four bytes of link-layer header (the real IPoIB
  553. * header) before the payload. This is not sufficient space to insert
  554. * an eIPoIB link-layer pseudo-header.
  555. *
  556. * We therefore allocate I/O buffers offset to start slightly before
  557. * the natural alignment boundary, in order to allow sufficient space.
  558. */
  559. static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
  560. struct io_buffer *iobuf;
  561. size_t reserve_len;
  562. /* Calculate additional length required at start of buffer */
  563. reserve_len = ( sizeof ( struct ethhdr ) -
  564. sizeof ( struct ipoib_hdr ) );
  565. /* Allocate buffer */
  566. iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
  567. if ( iobuf ) {
  568. iob_reserve ( iobuf, reserve_len );
  569. }
  570. return iobuf;
  571. }
  572. /** IPoIB queue pair operations */
  573. static struct ib_queue_pair_operations ipoib_qp_op = {
  574. .alloc_iob = ipoib_alloc_iob,
  575. };
  576. /**
  577. * Poll IPoIB network device
  578. *
  579. * @v netdev Network device
  580. */
  581. static void ipoib_poll ( struct net_device *netdev ) {
  582. struct ipoib_device *ipoib = netdev->priv;
  583. struct ib_device *ibdev = ipoib->ibdev;
  584. /* Poll Infiniband device */
  585. ib_poll_eq ( ibdev );
  586. /* Poll the retry timers (required for IPoIB multicast join) */
  587. retry_poll();
  588. }
  589. /**
  590. * Handle IPv4 broadcast multicast group join completion
  591. *
  592. * @v ibdev Infiniband device
  593. * @v qp Queue pair
  594. * @v membership Multicast group membership
  595. * @v rc Status code
  596. * @v mad Response MAD (or NULL on error)
  597. */
  598. void ipoib_join_complete ( struct ib_device *ibdev __unused,
  599. struct ib_queue_pair *qp __unused,
  600. struct ib_mc_membership *membership, int rc,
  601. union ib_mad *mad __unused ) {
  602. struct ipoib_device *ipoib = container_of ( membership,
  603. struct ipoib_device, broadcast_membership );
  604. /* Record join status as link status */
  605. netdev_link_err ( ipoib->netdev, rc );
  606. }
  607. /**
  608. * Join IPv4 broadcast multicast group
  609. *
  610. * @v ipoib IPoIB device
  611. * @ret rc Return status code
  612. */
  613. static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
  614. int rc;
  615. if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
  616. &ipoib->broadcast_membership,
  617. &ipoib->broadcast.gid,
  618. ipoib_join_complete ) ) != 0 ) {
  619. DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
  620. ipoib, strerror ( rc ) );
  621. return rc;
  622. }
  623. ipoib->broadcast_joined = 1;
  624. return 0;
  625. }
  626. /**
  627. * Leave IPv4 broadcast multicast group
  628. *
  629. * @v ipoib IPoIB device
  630. */
  631. static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
  632. if ( ipoib->broadcast_joined ) {
  633. ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
  634. &ipoib->broadcast_membership );
  635. ipoib->broadcast_joined = 0;
  636. }
  637. }
  638. /**
  639. * Handle link status change
  640. *
  641. * @v ibdev Infiniband device
  642. */
  643. static void ipoib_link_state_changed ( struct ib_device *ibdev ) {
  644. struct net_device *netdev = ib_get_ownerdata ( ibdev );
  645. struct ipoib_device *ipoib = netdev->priv;
  646. int rc;
  647. /* Leave existing broadcast group */
  648. ipoib_leave_broadcast_group ( ipoib );
  649. /* Update MAC address based on potentially-new GID prefix */
  650. memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
  651. sizeof ( ipoib->mac.gid.s.prefix ) );
  652. /* Update broadcast GID based on potentially-new partition key */
  653. ipoib->broadcast.gid.words[2] =
  654. htons ( ibdev->pkey | IB_PKEY_FULL );
  655. /* Set net device link state to reflect Infiniband link state */
  656. rc = ib_link_rc ( ibdev );
  657. netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
  658. /* Join new broadcast group */
  659. if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) &&
  660. ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
  661. DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
  662. "%s\n", ipoib, strerror ( rc ) );
  663. netdev_link_err ( netdev, rc );
  664. return;
  665. }
  666. }
  667. /**
  668. * Open IPoIB network device
  669. *
  670. * @v netdev Network device
  671. * @ret rc Return status code
  672. */
  673. static int ipoib_open ( struct net_device *netdev ) {
  674. struct ipoib_device *ipoib = netdev->priv;
  675. struct ib_device *ibdev = ipoib->ibdev;
  676. int rc;
  677. /* Open IB device */
  678. if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
  679. DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
  680. ipoib, strerror ( rc ) );
  681. goto err_ib_open;
  682. }
  683. /* Allocate completion queue */
  684. ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
  685. if ( ! ipoib->cq ) {
  686. DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
  687. ipoib );
  688. rc = -ENOMEM;
  689. goto err_create_cq;
  690. }
  691. /* Allocate queue pair */
  692. ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
  693. ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
  694. &ipoib_qp_op );
  695. if ( ! ipoib->qp ) {
  696. DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
  697. ipoib );
  698. rc = -ENOMEM;
  699. goto err_create_qp;
  700. }
  701. ib_qp_set_ownerdata ( ipoib->qp, ipoib );
  702. /* Update MAC address with QPN */
  703. ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
  704. /* Fill receive rings */
  705. ib_refill_recv ( ibdev, ipoib->qp );
  706. /* Fake a link status change to join the broadcast group */
  707. ipoib_link_state_changed ( ibdev );
  708. return 0;
  709. ib_destroy_qp ( ibdev, ipoib->qp );
  710. err_create_qp:
  711. ib_destroy_cq ( ibdev, ipoib->cq );
  712. err_create_cq:
  713. ib_close ( ibdev );
  714. err_ib_open:
  715. return rc;
  716. }
  717. /**
  718. * Close IPoIB network device
  719. *
  720. * @v netdev Network device
  721. */
  722. static void ipoib_close ( struct net_device *netdev ) {
  723. struct ipoib_device *ipoib = netdev->priv;
  724. struct ib_device *ibdev = ipoib->ibdev;
  725. /* Flush REMAC cache */
  726. ipoib_flush_remac ( ipoib );
  727. /* Leave broadcast group */
  728. ipoib_leave_broadcast_group ( ipoib );
  729. /* Remove QPN from MAC address */
  730. ipoib->mac.flags__qpn = 0;
  731. /* Tear down the queues */
  732. ib_destroy_qp ( ibdev, ipoib->qp );
  733. ib_destroy_cq ( ibdev, ipoib->cq );
  734. /* Close IB device */
  735. ib_close ( ibdev );
  736. }
  737. /** IPoIB network device operations */
  738. static struct net_device_operations ipoib_operations = {
  739. .open = ipoib_open,
  740. .close = ipoib_close,
  741. .transmit = ipoib_transmit,
  742. .poll = ipoib_poll,
  743. };
  744. /**
  745. * Probe IPoIB device
  746. *
  747. * @v ibdev Infiniband device
  748. * @ret rc Return status code
  749. */
  750. static int ipoib_probe ( struct ib_device *ibdev ) {
  751. struct net_device *netdev;
  752. struct ipoib_device *ipoib;
  753. int rc;
  754. /* Allocate network device */
  755. netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
  756. if ( ! netdev )
  757. return -ENOMEM;
  758. netdev_init ( netdev, &ipoib_operations );
  759. ipoib = netdev->priv;
  760. ib_set_ownerdata ( ibdev, netdev );
  761. netdev->dev = ibdev->dev;
  762. memset ( ipoib, 0, sizeof ( *ipoib ) );
  763. ipoib->netdev = netdev;
  764. ipoib->ibdev = ibdev;
  765. INIT_LIST_HEAD ( &ipoib->peers );
  766. /* Extract hardware address */
  767. memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
  768. sizeof ( ibdev->gid.s.guid ) );
  769. /* Set local MAC address */
  770. memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
  771. sizeof ( ipoib->mac.gid.s.guid ) );
  772. /* Set default broadcast MAC address */
  773. memcpy ( &ipoib->broadcast, &ipoib_broadcast,
  774. sizeof ( ipoib->broadcast ) );
  775. /* Register network device */
  776. if ( ( rc = register_netdev ( netdev ) ) != 0 )
  777. goto err_register_netdev;
  778. return 0;
  779. err_register_netdev:
  780. netdev_nullify ( netdev );
  781. netdev_put ( netdev );
  782. return rc;
  783. }
  /**
   * Remove IPoIB device
   *
   * @v ibdev		Infiniband device
   *
   * Unregisters and releases the network device stored as the
   * Infiniband device's owner data.
   */
  static void ipoib_remove ( struct ib_device *ibdev ) {
      struct net_device *netdev;

      netdev = ib_get_ownerdata ( ibdev );

      unregister_netdev ( netdev );
      netdev_nullify ( netdev );
      netdev_put ( netdev );
  }
  795. /** IPoIB driver */
  796. struct ib_driver ipoib_driver __ib_driver = {
  797. .name = "IPoIB",
  798. .probe = ipoib_probe,
  799. .notify = ipoib_link_state_changed,
  800. .remove = ipoib_remove,
  801. };