You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ipoib.c 24KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923
  1. /*
  2. * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public License as
  6. * published by the Free Software Foundation; either version 2 of the
  7. * License, or any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17. * 02110-1301, USA.
  18. *
  19. * You can also choose to distribute this program under the terms of
  20. * the Unmodified Binary Distribution Licence (as given in the file
  21. * COPYING.UBDL), provided that you have satisfied its requirements.
  22. */
  23. FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
  24. #include <stdint.h>
  25. #include <stdlib.h>
  26. #include <stdio.h>
  27. #include <unistd.h>
  28. #include <string.h>
  29. #include <byteswap.h>
  30. #include <errno.h>
  31. #include <ipxe/errortab.h>
  32. #include <ipxe/malloc.h>
  33. #include <ipxe/if_arp.h>
  34. #include <ipxe/if_ether.h>
  35. #include <ipxe/ethernet.h>
  36. #include <ipxe/iobuf.h>
  37. #include <ipxe/netdevice.h>
  38. #include <ipxe/infiniband.h>
  39. #include <ipxe/ib_pathrec.h>
  40. #include <ipxe/ib_mcast.h>
  41. #include <ipxe/retry.h>
  42. #include <ipxe/ipoib.h>
  43. /** @file
  44. *
  45. * IP over Infiniband
  46. */
  47. /** Number of IPoIB send work queue entries (passed to ib_create_qp() when the device is opened) */
  48. #define IPOIB_NUM_SEND_WQES 2
  49. /** Number of IPoIB receive work queue entries (passed to ib_create_qp() when the device is opened) */
  50. #define IPOIB_NUM_RECV_WQES 4
  51. /** Number of IPoIB completion entries (passed to ib_create_cq(); the one queue serves both send and receive) */
  52. #define IPOIB_NUM_CQES 8
  53. /** An IPoIB device (held in the private data area of its network device) */
  54. struct ipoib_device {
  55. /** Network device */
  56. struct net_device *netdev;
  57. /** Underlying Infiniband device */
  58. struct ib_device *ibdev;
  59. /** Completion queue (created on open, destroyed on close; used for both send and receive) */
  60. struct ib_completion_queue *cq;
  61. /** Queue pair (created on open, destroyed on close) */
  62. struct ib_queue_pair *qp;
  63. /** Local MAC (QPN is filled in on open and cleared on close) */
  64. struct ipoib_mac mac;
  65. /** Broadcast MAC (GID is updated with the partition key on link state change) */
  66. struct ipoib_mac broadcast;
  67. /** Joined to IPv4 broadcast multicast group
  68. *
  69. * This flag indicates whether or not we have initiated the
  70. * join to the IPv4 broadcast multicast group.
  71. */
  72. int broadcast_joined;
  73. /** IPv4 broadcast multicast group membership */
  74. struct ib_mc_membership broadcast_membership;
  75. /** REMAC cache (list of struct ipoib_peer, most recently used first) */
  76. struct list_head peers;
  77. };
  78. /** Default broadcast IPoIB address (copied into each device at probe time; the partition key word of the GID is set later, on link state change) */
  79. static struct ipoib_mac ipoib_broadcast = {
  80. .flags__qpn = htonl ( IB_QPN_BROADCAST ),
  81. .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  82. 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
  83. };
  84. /** Link status for "broadcast join in progress" (reported as the link state while the Infiniband link is up but the join has not completed) */
  85. #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
  86. #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
  87. ( EINFO_EINPROGRESS, 0x01, "Joining" )
  88. /** Human-readable message for the "broadcast join in progress" link status */
  89. struct errortab ipoib_errors[] __errortab = {
  90. __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
  91. };
  92. /****************************************************************************
  93. *
  94. * IPoIB REMAC cache
  95. *
  96. ****************************************************************************
  97. */
  98. /** An IPoIB REMAC cache entry (maps one Remote Ethernet MAC to an IPoIB MAC) */
  99. struct ipoib_peer {
  100. /** List of REMAC cache entries */
  101. struct list_head list;
  102. /** Remote Ethernet MAC (lookup key) */
  103. struct ipoib_remac remac;
  104. /** MAC address (IPoIB MAC recorded for this REMAC) */
  105. struct ipoib_mac mac;
  106. };
  107. /**
  108. * Find IPoIB MAC from REMAC
  109. *
  110. * @v ipoib IPoIB device
  111. * @v remac Remote Ethernet MAC
  112. * @ret mac IPoIB MAC (or NULL if not found)
  113. */
  114. static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
  115. const struct ipoib_remac *remac ) {
  116. struct ipoib_peer *peer;
  117. /* Check for broadcast REMAC */
  118. if ( is_broadcast_ether_addr ( remac ) )
  119. return &ipoib->broadcast;
  120. /* Try to find via REMAC cache */
  121. list_for_each_entry ( peer, &ipoib->peers, list ) {
  122. if ( memcmp ( remac, &peer->remac,
  123. sizeof ( peer->remac ) ) == 0 ) {
  124. /* Move peer to start of list */
  125. list_del ( &peer->list );
  126. list_add ( &peer->list, &ipoib->peers );
  127. return &peer->mac;
  128. }
  129. }
  130. DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
  131. ipoib, eth_ntoa ( remac ) );
  132. return NULL;
  133. }
  134. /**
  135. * Add IPoIB MAC to REMAC cache
  136. *
  137. * @v ipoib IPoIB device
  138. * @v remac Remote Ethernet MAC
  139. * @v mac IPoIB MAC
  140. * @ret rc Return status code
  141. */
  142. static int ipoib_map_remac ( struct ipoib_device *ipoib,
  143. const struct ipoib_remac *remac,
  144. const struct ipoib_mac *mac ) {
  145. struct ipoib_peer *peer;
  146. /* Check for existing entry in REMAC cache */
  147. list_for_each_entry ( peer, &ipoib->peers, list ) {
  148. if ( memcmp ( remac, &peer->remac,
  149. sizeof ( peer->remac ) ) == 0 ) {
  150. /* Move peer to start of list */
  151. list_del ( &peer->list );
  152. list_add ( &peer->list, &ipoib->peers );
  153. /* Update MAC */
  154. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  155. return 0;
  156. }
  157. }
  158. /* Create new entry */
  159. peer = malloc ( sizeof ( *peer ) );
  160. if ( ! peer )
  161. return -ENOMEM;
  162. memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
  163. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  164. list_add ( &peer->list, &ipoib->peers );
  165. return 0;
  166. }
  167. /**
  168. * Flush REMAC cache
  169. *
  170. * @v ipoib IPoIB device
  171. */
  172. static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
  173. struct ipoib_peer *peer;
  174. struct ipoib_peer *tmp;
  175. list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
  176. list_del ( &peer->list );
  177. free ( peer );
  178. }
  179. }
  180. /**
  181. * Discard some entries from the REMAC cache
  182. *
  183. * @ret discarded Number of cached items discarded
  184. */
  185. static unsigned int ipoib_discard_remac ( void ) {
  186. struct ib_device *ibdev;
  187. struct ipoib_device *ipoib;
  188. struct ipoib_peer *peer;
  189. unsigned int discarded = 0;
  190. /* Try to discard one cache entry for each IPoIB device */
  191. for_each_ibdev ( ibdev ) {
  192. ipoib = ib_get_ownerdata ( ibdev );
  193. list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
  194. list_del ( &peer->list );
  195. free ( peer );
  196. discarded++;
  197. break;
  198. }
  199. }
  200. return discarded;
  201. }
  202. /** IPoIB cache discarder (registers ipoib_discard_remac() at CACHE_NORMAL priority) */
  203. struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_NORMAL ) = {
  204. .discard = ipoib_discard_remac,
  205. };
  206. /****************************************************************************
  207. *
  208. * IPoIB link layer
  209. *
  210. ****************************************************************************
  211. */
  212. /**
  213. * Initialise IPoIB link-layer address
  214. *
  215. * @v hw_addr Hardware address
  216. * @v ll_addr Link-layer address
  217. */
  218. static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
  219. const uint8_t *guid = hw_addr;
  220. uint8_t *eth_addr = ll_addr;
  221. uint8_t guid_mask = IPOIB_GUID_MASK;
  222. unsigned int i;
  223. /* Extract bytes from GUID according to mask */
  224. for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
  225. if ( guid_mask & 0x80 )
  226. *(eth_addr++) = *guid;
  227. }
  228. }
  229. /** IPoIB protocol (link-layer protocol that reuses the Ethernet header and address helpers; the hardware address is an Infiniband GUID, converted by ipoib_init_addr()) */
  230. struct ll_protocol ipoib_protocol __ll_protocol = {
  231. .name = "IPoIB",
  232. .ll_proto = htons ( ARPHRD_ETHER ),
  233. .hw_addr_len = sizeof ( union ib_guid ),
  234. .ll_addr_len = ETH_ALEN,
  235. .ll_header_len = ETH_HLEN,
  236. .push = eth_push,
  237. .pull = eth_pull,
  238. .init_addr = ipoib_init_addr,
  239. .ntoa = eth_ntoa,
  240. .mc_hash = eth_mc_hash,
  241. .eth_addr = eth_eth_addr,
  242. .eui64 = eth_eui64,
  243. .flags = LL_NAME_ONLY,
  244. };
  245. /**
  246. * Allocate IPoIB device
  247. *
  248. * @v priv_size Size of driver private data
  249. * @ret netdev Network device, or NULL
  250. */
  251. struct net_device * alloc_ipoibdev ( size_t priv_size ) {
  252. struct net_device *netdev;
  253. netdev = alloc_netdev ( priv_size );
  254. if ( netdev ) {
  255. netdev->ll_protocol = &ipoib_protocol;
  256. netdev->ll_broadcast = eth_broadcast;
  257. netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
  258. }
  259. return netdev;
  260. }
  261. /****************************************************************************
  262. *
  263. * IPoIB translation layer
  264. *
  265. ****************************************************************************
  266. */
  267. /**
  268. * Translate transmitted ARP packet
  269. *
  270. * @v netdev Network device
  271. * @v iobuf Packet to be transmitted (with no link-layer headers)
  272. * @ret rc Return status code
  273. */
  274. static int ipoib_translate_tx_arp ( struct net_device *netdev,
  275. struct io_buffer *iobuf ) {
  276. struct ipoib_device *ipoib = netdev->priv;
  277. struct arphdr *arphdr = iobuf->data;
  278. struct ipoib_mac *target_ha = NULL;
  279. void *sender_pa;
  280. void *target_pa;
  281. /* Do nothing unless ARP contains eIPoIB link-layer addresses */
  282. if ( arphdr->ar_hln != ETH_ALEN )
  283. return 0;
  284. /* Fail unless we have room to expand packet */
  285. if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
  286. ETH_ALEN ) ) ) {
  287. DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
  288. ipoib );
  289. return -ENOBUFS;
  290. }
  291. /* Look up REMAC, if applicable */
  292. if ( arphdr->ar_op == ARPOP_REPLY ) {
  293. target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
  294. if ( ! target_ha )
  295. return -ENXIO;
  296. }
  297. /* Construct new packet */
  298. iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  299. sender_pa = arp_sender_pa ( arphdr );
  300. target_pa = arp_target_pa ( arphdr );
  301. arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
  302. arphdr->ar_hln = sizeof ( ipoib->mac );
  303. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  304. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  305. memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
  306. memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
  307. if ( target_ha ) {
  308. memcpy ( arp_target_ha ( arphdr ), target_ha,
  309. sizeof ( *target_ha ) );
  310. }
  311. return 0;
  312. }
  313. /**
  314. * Translate transmitted packet
  315. *
  316. * @v netdev Network device
  317. * @v iobuf Packet to be transmitted (with no link-layer headers)
  318. * @v net_proto Network-layer protocol (in network byte order)
  319. * @ret rc Return status code
  320. */
  321. static int ipoib_translate_tx ( struct net_device *netdev,
  322. struct io_buffer *iobuf, uint16_t net_proto ) {
  323. switch ( net_proto ) {
  324. case htons ( ETH_P_ARP ) :
  325. return ipoib_translate_tx_arp ( netdev, iobuf );
  326. case htons ( ETH_P_IP ) :
  327. /* No translation needed */
  328. return 0;
  329. default:
  330. /* Cannot handle other traffic via eIPoIB */
  331. return -ENOTSUP;
  332. }
  333. }
  334. /**
  335. * Translate received ARP packet
  336. *
  337. * @v netdev Network device
  338. * @v iobuf Received packet (with no link-layer headers)
  339. * @v remac Constructed Remote Ethernet MAC
  340. * @ret rc Return status code
  341. */
  342. static int ipoib_translate_rx_arp ( struct net_device *netdev,
  343. struct io_buffer *iobuf,
  344. struct ipoib_remac *remac ) {
  345. struct ipoib_device *ipoib = netdev->priv;
  346. struct arphdr *arphdr = iobuf->data;
  347. void *sender_pa;
  348. void *target_pa;
  349. int rc;
  350. /* Do nothing unless ARP contains IPoIB link-layer addresses */
  351. if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
  352. return 0;
  353. /* Create REMAC cache entry */
  354. if ( ( rc = ipoib_map_remac ( ipoib, remac,
  355. arp_sender_ha ( arphdr ) ) ) != 0 ) {
  356. DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
  357. ipoib, strerror ( rc ) );
  358. return rc;
  359. }
  360. /* Construct new packet */
  361. sender_pa = arp_sender_pa ( arphdr );
  362. target_pa = arp_target_pa ( arphdr );
  363. arphdr->ar_hrd = htons ( ARPHRD_ETHER );
  364. arphdr->ar_hln = ETH_ALEN;
  365. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  366. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  367. memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
  368. memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
  369. if ( arphdr->ar_op == ARPOP_REPLY ) {
  370. /* Assume received replies were directed to us */
  371. memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
  372. }
  373. iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  374. return 0;
  375. }
  376. /**
  377. * Translate received packet
  378. *
  379. * @v netdev Network device
  380. * @v iobuf Received packet (with no link-layer headers)
  381. * @v remac Constructed Remote Ethernet MAC
  382. * @v net_proto Network-layer protocol (in network byte order)
  383. * @ret rc Return status code
  384. */
  385. static int ipoib_translate_rx ( struct net_device *netdev,
  386. struct io_buffer *iobuf,
  387. struct ipoib_remac *remac,
  388. uint16_t net_proto ) {
  389. switch ( net_proto ) {
  390. case htons ( ETH_P_ARP ) :
  391. return ipoib_translate_rx_arp ( netdev, iobuf, remac );
  392. case htons ( ETH_P_IP ) :
  393. /* No translation needed */
  394. return 0;
  395. default:
  396. /* Cannot handle other traffic via eIPoIB */
  397. return -ENOTSUP;
  398. }
  399. }
  400. /****************************************************************************
  401. *
  402. * IPoIB network device
  403. *
  404. ****************************************************************************
  405. */
  406. /**
  407. * Transmit packet via IPoIB network device
  408. *
  409. * @v netdev Network device
  410. * @v iobuf I/O buffer
  411. * @ret rc Return status code
  412. */
  413. static int ipoib_transmit ( struct net_device *netdev,
  414. struct io_buffer *iobuf ) {
  415. struct ipoib_device *ipoib = netdev->priv;
  416. struct ib_device *ibdev = ipoib->ibdev;
  417. struct ethhdr *ethhdr;
  418. struct ipoib_hdr *ipoib_hdr;
  419. struct ipoib_mac *mac;
  420. struct ib_address_vector dest;
  421. uint16_t net_proto;
  422. int rc;
  423. /* Sanity check */
  424. if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
  425. DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
  426. return -EINVAL;
  427. }
  428. /* Attempting transmission while link is down will put the
  429. * queue pair into an error state, so don't try it.
  430. */
  431. if ( ! ib_link_ok ( ibdev ) )
  432. return -ENETUNREACH;
  433. /* Strip eIPoIB header */
  434. ethhdr = iobuf->data;
  435. net_proto = ethhdr->h_protocol;
  436. iob_pull ( iobuf, sizeof ( *ethhdr ) );
  437. /* Identify destination address */
  438. mac = ipoib_find_remac ( ipoib, ( ( void *) ethhdr->h_dest ) );
  439. if ( ! mac )
  440. return -ENXIO;
  441. /* Translate packet if applicable */
  442. if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
  443. return rc;
  444. /* Prepend real IPoIB header */
  445. ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
  446. ipoib_hdr->proto = net_proto;
  447. ipoib_hdr->reserved = 0;
  448. /* Construct address vector */
  449. memset ( &dest, 0, sizeof ( dest ) );
  450. dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
  451. dest.gid_present = 1;
  452. memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) );
  453. if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) {
  454. /* Path not resolved yet */
  455. return rc;
  456. }
  457. return ib_post_send ( ibdev, ipoib->qp, &dest, iobuf );
  458. }
  459. /**
  460. * Handle IPoIB send completion
  461. *
  462. * @v ibdev Infiniband device
  463. * @v qp Queue pair
  464. * @v iobuf I/O buffer
  465. * @v rc Completion status code
  466. */
  467. static void ipoib_complete_send ( struct ib_device *ibdev __unused,
  468. struct ib_queue_pair *qp,
  469. struct io_buffer *iobuf, int rc ) {
  470. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  471. netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
  472. }
  473. /**
  474. * Handle IPoIB receive completion
  475. *
  476. * @v ibdev Infiniband device
  477. * @v qp Queue pair
  478. * @v dest Destination address vector, or NULL
  479. * @v source Source address vector, or NULL
  480. * @v iobuf I/O buffer
  481. * @v rc Completion status code
  482. */
  483. static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
  484. struct ib_queue_pair *qp,
  485. struct ib_address_vector *dest,
  486. struct ib_address_vector *source,
  487. struct io_buffer *iobuf, int rc ) {
  488. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  489. struct net_device *netdev = ipoib->netdev;
  490. struct ipoib_hdr *ipoib_hdr;
  491. struct ethhdr *ethhdr;
  492. struct ipoib_remac remac;
  493. uint16_t net_proto;
  494. /* Record errors */
  495. if ( rc != 0 ) {
  496. netdev_rx_err ( netdev, iobuf, rc );
  497. return;
  498. }
  499. /* Sanity check */
  500. if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
  501. DBGC ( ipoib, "IPoIB %p received packet too short to "
  502. "contain IPoIB header\n", ipoib );
  503. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  504. netdev_rx_err ( netdev, iobuf, -EIO );
  505. return;
  506. }
  507. if ( ! source ) {
  508. DBGC ( ipoib, "IPoIB %p received packet without address "
  509. "vector\n", ipoib );
  510. netdev_rx_err ( netdev, iobuf, -ENOTTY );
  511. return;
  512. }
  513. /* Strip real IPoIB header */
  514. ipoib_hdr = iobuf->data;
  515. net_proto = ipoib_hdr->proto;
  516. iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
  517. /* Construct source address from remote QPN and LID */
  518. remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
  519. remac.lid = htons ( source->lid );
  520. /* Translate packet if applicable */
  521. if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
  522. net_proto ) ) != 0 ) {
  523. netdev_rx_err ( netdev, iobuf, rc );
  524. return;
  525. }
  526. /* Prepend eIPoIB header */
  527. ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
  528. memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
  529. ethhdr->h_protocol = net_proto;
  530. /* Construct destination address */
  531. if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid,
  532. sizeof ( dest->gid ) ) == 0 ) ) {
  533. /* Broadcast GID; use the Ethernet broadcast address */
  534. memcpy ( &ethhdr->h_dest, eth_broadcast,
  535. sizeof ( ethhdr->h_dest ) );
  536. } else {
  537. /* Assume destination address is local Ethernet MAC */
  538. memcpy ( &ethhdr->h_dest, netdev->ll_addr,
  539. sizeof ( ethhdr->h_dest ) );
  540. }
  541. /* Hand off to network layer */
  542. netdev_rx ( netdev, iobuf );
  543. }
  544. /** IPoIB completion operations (handlers passed to ib_create_cq() when the device is opened) */
  545. static struct ib_completion_queue_operations ipoib_cq_op = {
  546. .complete_send = ipoib_complete_send,
  547. .complete_recv = ipoib_complete_recv,
  548. };
  549. /**
  550. * Allocate IPoIB receive I/O buffer
  551. *
  552. * @v len Length of buffer
  553. * @ret iobuf I/O buffer, or NULL
  554. *
  555. * Some Infiniband hardware requires 2kB alignment of receive buffers
  556. * and provides no way to disable header separation. The result is
  557. * that there are only four bytes of link-layer header (the real IPoIB
  558. * header) before the payload. This is not sufficient space to insert
  559. * an eIPoIB link-layer pseudo-header.
  560. *
  561. * We therefore allocate I/O buffers offset to start slightly before
  562. * the natural alignment boundary, in order to allow sufficient space.
  563. */
  564. static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
  565. struct io_buffer *iobuf;
  566. size_t reserve_len;
  567. /* Calculate additional length required at start of buffer */
  568. reserve_len = ( sizeof ( struct ethhdr ) -
  569. sizeof ( struct ipoib_hdr ) );
  570. /* Allocate buffer */
  571. iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
  572. if ( iobuf ) {
  573. iob_reserve ( iobuf, reserve_len );
  574. }
  575. return iobuf;
  576. }
  577. /** IPoIB queue pair operations (supplies the receive buffer allocator; passed to ib_create_qp() when the device is opened) */
  578. static struct ib_queue_pair_operations ipoib_qp_op = {
  579. .alloc_iob = ipoib_alloc_iob,
  580. };
  581. /**
  582. * Poll IPoIB network device
  583. *
  584. * @v netdev Network device
  585. */
  586. static void ipoib_poll ( struct net_device *netdev ) {
  587. struct ipoib_device *ipoib = netdev->priv;
  588. struct ib_device *ibdev = ipoib->ibdev;
  589. /* Poll Infiniband device */
  590. ib_poll_eq ( ibdev );
  591. /* Poll the retry timers (required for IPoIB multicast join) */
  592. retry_poll();
  593. }
  594. /**
  595. * Handle IPv4 broadcast multicast group join completion
  596. *
  597. * @v ibdev Infiniband device
  598. * @v qp Queue pair
  599. * @v membership Multicast group membership
  600. * @v rc Status code
  601. * @v mad Response MAD (or NULL on error)
  602. */
  603. void ipoib_join_complete ( struct ib_device *ibdev __unused,
  604. struct ib_queue_pair *qp __unused,
  605. struct ib_mc_membership *membership, int rc,
  606. union ib_mad *mad __unused ) {
  607. struct ipoib_device *ipoib = container_of ( membership,
  608. struct ipoib_device, broadcast_membership );
  609. /* Record join status as link status */
  610. netdev_link_err ( ipoib->netdev, rc );
  611. }
  612. /**
  613. * Join IPv4 broadcast multicast group
  614. *
  615. * @v ipoib IPoIB device
  616. * @ret rc Return status code
  617. */
  618. static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
  619. int rc;
  620. if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
  621. &ipoib->broadcast_membership,
  622. &ipoib->broadcast.gid,
  623. ipoib_join_complete ) ) != 0 ) {
  624. DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
  625. ipoib, strerror ( rc ) );
  626. return rc;
  627. }
  628. ipoib->broadcast_joined = 1;
  629. return 0;
  630. }
  631. /**
  632. * Leave IPv4 broadcast multicast group
  633. *
  634. * @v ipoib IPoIB device
  635. */
  636. static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
  637. if ( ipoib->broadcast_joined ) {
  638. ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
  639. &ipoib->broadcast_membership );
  640. ipoib->broadcast_joined = 0;
  641. }
  642. }
  643. /**
  644. * Handle link status change
  645. *
  646. * @v ibdev Infiniband device
  647. */
  648. static void ipoib_link_state_changed ( struct ib_device *ibdev ) {
  649. struct net_device *netdev = ib_get_ownerdata ( ibdev );
  650. struct ipoib_device *ipoib = netdev->priv;
  651. int rc;
  652. /* Leave existing broadcast group */
  653. ipoib_leave_broadcast_group ( ipoib );
  654. /* Update MAC address based on potentially-new GID prefix */
  655. memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
  656. sizeof ( ipoib->mac.gid.s.prefix ) );
  657. /* Update broadcast GID based on potentially-new partition key */
  658. ipoib->broadcast.gid.words[2] =
  659. htons ( ibdev->pkey | IB_PKEY_FULL );
  660. /* Set net device link state to reflect Infiniband link state */
  661. rc = ib_link_rc ( ibdev );
  662. netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
  663. /* Join new broadcast group */
  664. if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) &&
  665. ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
  666. DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
  667. "%s\n", ipoib, strerror ( rc ) );
  668. netdev_link_err ( netdev, rc );
  669. return;
  670. }
  671. }
  672. /**
  673. * Open IPoIB network device
  674. *
  675. * @v netdev Network device
  676. * @ret rc Return status code
  677. */
  678. static int ipoib_open ( struct net_device *netdev ) {
  679. struct ipoib_device *ipoib = netdev->priv;
  680. struct ib_device *ibdev = ipoib->ibdev;
  681. int rc;
  682. /* Open IB device */
  683. if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
  684. DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
  685. ipoib, strerror ( rc ) );
  686. goto err_ib_open;
  687. }
  688. /* Allocate completion queue */
  689. ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
  690. if ( ! ipoib->cq ) {
  691. DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
  692. ipoib );
  693. rc = -ENOMEM;
  694. goto err_create_cq;
  695. }
  696. /* Allocate queue pair */
  697. ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
  698. ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
  699. &ipoib_qp_op );
  700. if ( ! ipoib->qp ) {
  701. DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
  702. ipoib );
  703. rc = -ENOMEM;
  704. goto err_create_qp;
  705. }
  706. ib_qp_set_ownerdata ( ipoib->qp, ipoib );
  707. /* Update MAC address with QPN */
  708. ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
  709. /* Fill receive rings */
  710. ib_refill_recv ( ibdev, ipoib->qp );
  711. /* Fake a link status change to join the broadcast group */
  712. ipoib_link_state_changed ( ibdev );
  713. return 0;
  714. ib_destroy_qp ( ibdev, ipoib->qp );
  715. err_create_qp:
  716. ib_destroy_cq ( ibdev, ipoib->cq );
  717. err_create_cq:
  718. ib_close ( ibdev );
  719. err_ib_open:
  720. return rc;
  721. }
  722. /**
  723. * Close IPoIB network device
  724. *
  725. * @v netdev Network device
  726. */
  727. static void ipoib_close ( struct net_device *netdev ) {
  728. struct ipoib_device *ipoib = netdev->priv;
  729. struct ib_device *ibdev = ipoib->ibdev;
  730. /* Flush REMAC cache */
  731. ipoib_flush_remac ( ipoib );
  732. /* Leave broadcast group */
  733. ipoib_leave_broadcast_group ( ipoib );
  734. /* Remove QPN from MAC address */
  735. ipoib->mac.flags__qpn = 0;
  736. /* Tear down the queues */
  737. ib_destroy_qp ( ibdev, ipoib->qp );
  738. ib_destroy_cq ( ibdev, ipoib->cq );
  739. /* Close IB device */
  740. ib_close ( ibdev );
  741. }
  742. /** IPoIB network device operations (installed on each network device by ipoib_probe()) */
  743. static struct net_device_operations ipoib_operations = {
  744. .open = ipoib_open,
  745. .close = ipoib_close,
  746. .transmit = ipoib_transmit,
  747. .poll = ipoib_poll,
  748. };
  749. /**
  750. * Probe IPoIB device
  751. *
  752. * @v ibdev Infiniband device
  753. * @ret rc Return status code
  754. */
  755. static int ipoib_probe ( struct ib_device *ibdev ) {
  756. struct net_device *netdev;
  757. struct ipoib_device *ipoib;
  758. int rc;
  759. /* Allocate network device */
  760. netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
  761. if ( ! netdev )
  762. return -ENOMEM;
  763. netdev_init ( netdev, &ipoib_operations );
  764. ipoib = netdev->priv;
  765. ib_set_ownerdata ( ibdev, netdev );
  766. netdev->dev = ibdev->dev;
  767. memset ( ipoib, 0, sizeof ( *ipoib ) );
  768. ipoib->netdev = netdev;
  769. ipoib->ibdev = ibdev;
  770. INIT_LIST_HEAD ( &ipoib->peers );
  771. /* Extract hardware address */
  772. memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
  773. sizeof ( ibdev->gid.s.guid ) );
  774. /* Set local MAC address */
  775. memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
  776. sizeof ( ipoib->mac.gid.s.guid ) );
  777. /* Set default broadcast MAC address */
  778. memcpy ( &ipoib->broadcast, &ipoib_broadcast,
  779. sizeof ( ipoib->broadcast ) );
  780. /* Register network device */
  781. if ( ( rc = register_netdev ( netdev ) ) != 0 )
  782. goto err_register_netdev;
  783. return 0;
  784. err_register_netdev:
  785. netdev_nullify ( netdev );
  786. netdev_put ( netdev );
  787. return rc;
  788. }
  /**
   * Remove IPoIB device
   *
   * @v ibdev		Infiniband device
   *
   * Unregisters and releases the network device recorded as the
   * Infiniband device's owner data.
   */
  static void ipoib_remove ( struct ib_device *ibdev ) {
      struct net_device *netdev;

      /* Recover the network device stored as owner data */
      netdev = ib_get_ownerdata ( ibdev );

      unregister_netdev ( netdev );
      netdev_nullify ( netdev );
      netdev_put ( netdev );
  }
  800. /** IPoIB driver (Infiniband driver entry: probe, link state notification and removal handlers) */
  801. struct ib_driver ipoib_driver __ib_driver = {
  802. .name = "IPoIB",
  803. .probe = ipoib_probe,
  804. .notify = ipoib_link_state_changed,
  805. .remove = ipoib_remove,
  806. };