You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919
  1. /*
  2. * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public License as
  6. * published by the Free Software Foundation; either version 2 of the
  7. * License, or any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17. * 02110-1301, USA.
  18. */
  19. FILE_LICENCE ( GPL2_OR_LATER );
  20. #include <stdint.h>
  21. #include <stdlib.h>
  22. #include <stdio.h>
  23. #include <unistd.h>
  24. #include <string.h>
  25. #include <byteswap.h>
  26. #include <errno.h>
  27. #include <ipxe/errortab.h>
  28. #include <ipxe/malloc.h>
  29. #include <ipxe/if_arp.h>
  30. #include <ipxe/if_ether.h>
  31. #include <ipxe/ethernet.h>
  32. #include <ipxe/iobuf.h>
  33. #include <ipxe/netdevice.h>
  34. #include <ipxe/infiniband.h>
  35. #include <ipxe/ib_pathrec.h>
  36. #include <ipxe/ib_mcast.h>
  37. #include <ipxe/retry.h>
  38. #include <ipxe/ipoib.h>
  39. /** @file
  40. *
  41. * IP over Infiniband
  42. */
  /** Send work queue depth requested for the IPoIB queue pair */
  #define IPOIB_NUM_SEND_WQES 2

  /** Receive work queue depth requested for the IPoIB queue pair */
  #define IPOIB_NUM_RECV_WQES 4

  /** Number of entries requested for the IPoIB completion queue */
  #define IPOIB_NUM_CQES 8
  49. /** An IPoIB device */
  50. struct ipoib_device {
  51. /** Network device */
  52. struct net_device *netdev;
  53. /** Underlying Infiniband device */
  54. struct ib_device *ibdev;
  55. /** Completion queue */
  56. struct ib_completion_queue *cq;
  57. /** Queue pair */
  58. struct ib_queue_pair *qp;
  59. /** Local MAC */
  60. struct ipoib_mac mac;
  61. /** Broadcast MAC */
  62. struct ipoib_mac broadcast;
  63. /** Joined to IPv4 broadcast multicast group
  64. *
  65. * This flag indicates whether or not we have initiated the
  66. * join to the IPv4 broadcast multicast group.
  67. */
  68. int broadcast_joined;
  69. /** IPv4 broadcast multicast group membership */
  70. struct ib_mc_membership broadcast_membership;
  71. /** REMAC cache */
  72. struct list_head peers;
  73. };
  74. /** Broadcast IPoIB address */
  75. static struct ipoib_mac ipoib_broadcast = {
  76. .flags__qpn = htonl ( IB_QPN_BROADCAST ),
  77. .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  78. 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
  79. };
  80. /** Link status for "broadcast join in progress" */
  81. #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
  82. #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
  83. ( EINFO_EINPROGRESS, 0x01, "Joining" )
  84. /** Human-readable message for the link status */
  85. struct errortab ipoib_errors[] __errortab = {
  86. __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
  87. };
  88. /****************************************************************************
  89. *
  90. * IPoIB REMAC cache
  91. *
  92. ****************************************************************************
  93. */
  94. /** An IPoIB REMAC cache entry */
  95. struct ipoib_peer {
  96. /** List of REMAC cache entries */
  97. struct list_head list;
  98. /** Remote Ethermet MAC */
  99. struct ipoib_remac remac;
  100. /** MAC address */
  101. struct ipoib_mac mac;
  102. };
  103. /**
  104. * Find IPoIB MAC from REMAC
  105. *
  106. * @v ipoib IPoIB device
  107. * @v remac Remote Ethernet MAC
  108. * @ret mac IPoIB MAC (or NULL if not found)
  109. */
  110. static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
  111. const struct ipoib_remac *remac ) {
  112. struct ipoib_peer *peer;
  113. /* Check for broadcast REMAC */
  114. if ( is_broadcast_ether_addr ( remac ) )
  115. return &ipoib->broadcast;
  116. /* Try to find via REMAC cache */
  117. list_for_each_entry ( peer, &ipoib->peers, list ) {
  118. if ( memcmp ( remac, &peer->remac,
  119. sizeof ( peer->remac ) ) == 0 ) {
  120. /* Move peer to start of list */
  121. list_del ( &peer->list );
  122. list_add ( &peer->list, &ipoib->peers );
  123. return &peer->mac;
  124. }
  125. }
  126. DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
  127. ipoib, eth_ntoa ( remac ) );
  128. return NULL;
  129. }
  130. /**
  131. * Add IPoIB MAC to REMAC cache
  132. *
  133. * @v ipoib IPoIB device
  134. * @v remac Remote Ethernet MAC
  135. * @v mac IPoIB MAC
  136. * @ret rc Return status code
  137. */
  138. static int ipoib_map_remac ( struct ipoib_device *ipoib,
  139. const struct ipoib_remac *remac,
  140. const struct ipoib_mac *mac ) {
  141. struct ipoib_peer *peer;
  142. /* Check for existing entry in REMAC cache */
  143. list_for_each_entry ( peer, &ipoib->peers, list ) {
  144. if ( memcmp ( remac, &peer->remac,
  145. sizeof ( peer->remac ) ) == 0 ) {
  146. /* Move peer to start of list */
  147. list_del ( &peer->list );
  148. list_add ( &peer->list, &ipoib->peers );
  149. /* Update MAC */
  150. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  151. return 0;
  152. }
  153. }
  154. /* Create new entry */
  155. peer = malloc ( sizeof ( *peer ) );
  156. if ( ! peer )
  157. return -ENOMEM;
  158. memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
  159. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  160. list_add ( &peer->list, &ipoib->peers );
  161. return 0;
  162. }
  163. /**
  164. * Flush REMAC cache
  165. *
  166. * @v ipoib IPoIB device
  167. */
  168. static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
  169. struct ipoib_peer *peer;
  170. struct ipoib_peer *tmp;
  171. list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
  172. list_del ( &peer->list );
  173. free ( peer );
  174. }
  175. }
  176. /**
  177. * Discard some entries from the REMAC cache
  178. *
  179. * @ret discarded Number of cached items discarded
  180. */
  181. static unsigned int ipoib_discard_remac ( void ) {
  182. struct ib_device *ibdev;
  183. struct ipoib_device *ipoib;
  184. struct ipoib_peer *peer;
  185. unsigned int discarded = 0;
  186. /* Try to discard one cache entry for each IPoIB device */
  187. for_each_ibdev ( ibdev ) {
  188. ipoib = ib_get_ownerdata ( ibdev );
  189. list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
  190. list_del ( &peer->list );
  191. free ( peer );
  192. discarded++;
  193. break;
  194. }
  195. }
  196. return discarded;
  197. }
  198. /** IPoIB cache discarder */
  199. struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_NORMAL ) = {
  200. .discard = ipoib_discard_remac,
  201. };
  202. /****************************************************************************
  203. *
  204. * IPoIB link layer
  205. *
  206. ****************************************************************************
  207. */
  208. /**
  209. * Initialise IPoIB link-layer address
  210. *
  211. * @v hw_addr Hardware address
  212. * @v ll_addr Link-layer address
  213. */
  214. static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
  215. const uint8_t *guid = hw_addr;
  216. uint8_t *eth_addr = ll_addr;
  217. uint8_t guid_mask = IPOIB_GUID_MASK;
  218. unsigned int i;
  219. /* Extract bytes from GUID according to mask */
  220. for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
  221. if ( guid_mask & 0x80 )
  222. *(eth_addr++) = *guid;
  223. }
  224. }
  225. /** IPoIB protocol */
  226. struct ll_protocol ipoib_protocol __ll_protocol = {
  227. .name = "IPoIB",
  228. .ll_proto = htons ( ARPHRD_ETHER ),
  229. .hw_addr_len = sizeof ( union ib_guid ),
  230. .ll_addr_len = ETH_ALEN,
  231. .ll_header_len = ETH_HLEN,
  232. .push = eth_push,
  233. .pull = eth_pull,
  234. .init_addr = ipoib_init_addr,
  235. .ntoa = eth_ntoa,
  236. .mc_hash = eth_mc_hash,
  237. .eth_addr = eth_eth_addr,
  238. .eui64 = eth_eui64,
  239. .flags = LL_NAME_ONLY,
  240. };
  241. /**
  242. * Allocate IPoIB device
  243. *
  244. * @v priv_size Size of driver private data
  245. * @ret netdev Network device, or NULL
  246. */
  247. struct net_device * alloc_ipoibdev ( size_t priv_size ) {
  248. struct net_device *netdev;
  249. netdev = alloc_netdev ( priv_size );
  250. if ( netdev ) {
  251. netdev->ll_protocol = &ipoib_protocol;
  252. netdev->ll_broadcast = eth_broadcast;
  253. netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
  254. }
  255. return netdev;
  256. }
  257. /****************************************************************************
  258. *
  259. * IPoIB translation layer
  260. *
  261. ****************************************************************************
  262. */
  263. /**
  264. * Translate transmitted ARP packet
  265. *
  266. * @v netdev Network device
  267. * @v iobuf Packet to be transmitted (with no link-layer headers)
  268. * @ret rc Return status code
  269. */
  270. static int ipoib_translate_tx_arp ( struct net_device *netdev,
  271. struct io_buffer *iobuf ) {
  272. struct ipoib_device *ipoib = netdev->priv;
  273. struct arphdr *arphdr = iobuf->data;
  274. struct ipoib_mac *target_ha = NULL;
  275. void *sender_pa;
  276. void *target_pa;
  277. /* Do nothing unless ARP contains eIPoIB link-layer addresses */
  278. if ( arphdr->ar_hln != ETH_ALEN )
  279. return 0;
  280. /* Fail unless we have room to expand packet */
  281. if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
  282. ETH_ALEN ) ) ) {
  283. DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
  284. ipoib );
  285. return -ENOBUFS;
  286. }
  287. /* Look up REMAC, if applicable */
  288. if ( arphdr->ar_op == ARPOP_REPLY ) {
  289. target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
  290. if ( ! target_ha )
  291. return -ENXIO;
  292. }
  293. /* Construct new packet */
  294. iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  295. sender_pa = arp_sender_pa ( arphdr );
  296. target_pa = arp_target_pa ( arphdr );
  297. arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
  298. arphdr->ar_hln = sizeof ( ipoib->mac );
  299. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  300. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  301. memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
  302. memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
  303. if ( target_ha ) {
  304. memcpy ( arp_target_ha ( arphdr ), target_ha,
  305. sizeof ( *target_ha ) );
  306. }
  307. return 0;
  308. }
  309. /**
  310. * Translate transmitted packet
  311. *
  312. * @v netdev Network device
  313. * @v iobuf Packet to be transmitted (with no link-layer headers)
  314. * @v net_proto Network-layer protocol (in network byte order)
  315. * @ret rc Return status code
  316. */
  317. static int ipoib_translate_tx ( struct net_device *netdev,
  318. struct io_buffer *iobuf, uint16_t net_proto ) {
  319. switch ( net_proto ) {
  320. case htons ( ETH_P_ARP ) :
  321. return ipoib_translate_tx_arp ( netdev, iobuf );
  322. case htons ( ETH_P_IP ) :
  323. /* No translation needed */
  324. return 0;
  325. default:
  326. /* Cannot handle other traffic via eIPoIB */
  327. return -ENOTSUP;
  328. }
  329. }
  330. /**
  331. * Translate received ARP packet
  332. *
  333. * @v netdev Network device
  334. * @v iobuf Received packet (with no link-layer headers)
  335. * @v remac Constructed Remote Ethernet MAC
  336. * @ret rc Return status code
  337. */
  338. static int ipoib_translate_rx_arp ( struct net_device *netdev,
  339. struct io_buffer *iobuf,
  340. struct ipoib_remac *remac ) {
  341. struct ipoib_device *ipoib = netdev->priv;
  342. struct arphdr *arphdr = iobuf->data;
  343. void *sender_pa;
  344. void *target_pa;
  345. int rc;
  346. /* Do nothing unless ARP contains IPoIB link-layer addresses */
  347. if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
  348. return 0;
  349. /* Create REMAC cache entry */
  350. if ( ( rc = ipoib_map_remac ( ipoib, remac,
  351. arp_sender_ha ( arphdr ) ) ) != 0 ) {
  352. DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
  353. ipoib, strerror ( rc ) );
  354. return rc;
  355. }
  356. /* Construct new packet */
  357. sender_pa = arp_sender_pa ( arphdr );
  358. target_pa = arp_target_pa ( arphdr );
  359. arphdr->ar_hrd = htons ( ARPHRD_ETHER );
  360. arphdr->ar_hln = ETH_ALEN;
  361. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  362. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  363. memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
  364. memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
  365. if ( arphdr->ar_op == ARPOP_REPLY ) {
  366. /* Assume received replies were directed to us */
  367. memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
  368. }
  369. iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  370. return 0;
  371. }
  372. /**
  373. * Translate received packet
  374. *
  375. * @v netdev Network device
  376. * @v iobuf Received packet (with no link-layer headers)
  377. * @v remac Constructed Remote Ethernet MAC
  378. * @v net_proto Network-layer protocol (in network byte order)
  379. * @ret rc Return status code
  380. */
  381. static int ipoib_translate_rx ( struct net_device *netdev,
  382. struct io_buffer *iobuf,
  383. struct ipoib_remac *remac,
  384. uint16_t net_proto ) {
  385. switch ( net_proto ) {
  386. case htons ( ETH_P_ARP ) :
  387. return ipoib_translate_rx_arp ( netdev, iobuf, remac );
  388. case htons ( ETH_P_IP ) :
  389. /* No translation needed */
  390. return 0;
  391. default:
  392. /* Cannot handle other traffic via eIPoIB */
  393. return -ENOTSUP;
  394. }
  395. }
  396. /****************************************************************************
  397. *
  398. * IPoIB network device
  399. *
  400. ****************************************************************************
  401. */
  402. /**
  403. * Transmit packet via IPoIB network device
  404. *
  405. * @v netdev Network device
  406. * @v iobuf I/O buffer
  407. * @ret rc Return status code
  408. */
  409. static int ipoib_transmit ( struct net_device *netdev,
  410. struct io_buffer *iobuf ) {
  411. struct ipoib_device *ipoib = netdev->priv;
  412. struct ib_device *ibdev = ipoib->ibdev;
  413. struct ethhdr *ethhdr;
  414. struct ipoib_hdr *ipoib_hdr;
  415. struct ipoib_mac *mac;
  416. struct ib_address_vector dest;
  417. uint16_t net_proto;
  418. int rc;
  419. /* Sanity check */
  420. if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
  421. DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
  422. return -EINVAL;
  423. }
  424. /* Attempting transmission while link is down will put the
  425. * queue pair into an error state, so don't try it.
  426. */
  427. if ( ! ib_link_ok ( ibdev ) )
  428. return -ENETUNREACH;
  429. /* Strip eIPoIB header */
  430. ethhdr = iobuf->data;
  431. net_proto = ethhdr->h_protocol;
  432. iob_pull ( iobuf, sizeof ( *ethhdr ) );
  433. /* Identify destination address */
  434. mac = ipoib_find_remac ( ipoib, ( ( void *) ethhdr->h_dest ) );
  435. if ( ! mac )
  436. return -ENXIO;
  437. /* Translate packet if applicable */
  438. if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
  439. return rc;
  440. /* Prepend real IPoIB header */
  441. ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
  442. ipoib_hdr->proto = net_proto;
  443. ipoib_hdr->reserved = 0;
  444. /* Construct address vector */
  445. memset ( &dest, 0, sizeof ( dest ) );
  446. dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
  447. dest.gid_present = 1;
  448. memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) );
  449. if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) {
  450. /* Path not resolved yet */
  451. return rc;
  452. }
  453. return ib_post_send ( ibdev, ipoib->qp, &dest, iobuf );
  454. }
  455. /**
  456. * Handle IPoIB send completion
  457. *
  458. * @v ibdev Infiniband device
  459. * @v qp Queue pair
  460. * @v iobuf I/O buffer
  461. * @v rc Completion status code
  462. */
  463. static void ipoib_complete_send ( struct ib_device *ibdev __unused,
  464. struct ib_queue_pair *qp,
  465. struct io_buffer *iobuf, int rc ) {
  466. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  467. netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
  468. }
  469. /**
  470. * Handle IPoIB receive completion
  471. *
  472. * @v ibdev Infiniband device
  473. * @v qp Queue pair
  474. * @v dest Destination address vector, or NULL
  475. * @v source Source address vector, or NULL
  476. * @v iobuf I/O buffer
  477. * @v rc Completion status code
  478. */
  479. static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
  480. struct ib_queue_pair *qp,
  481. struct ib_address_vector *dest,
  482. struct ib_address_vector *source,
  483. struct io_buffer *iobuf, int rc ) {
  484. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  485. struct net_device *netdev = ipoib->netdev;
  486. struct ipoib_hdr *ipoib_hdr;
  487. struct ethhdr *ethhdr;
  488. struct ipoib_remac remac;
  489. uint16_t net_proto;
  490. /* Record errors */
  491. if ( rc != 0 ) {
  492. netdev_rx_err ( netdev, iobuf, rc );
  493. return;
  494. }
  495. /* Sanity check */
  496. if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
  497. DBGC ( ipoib, "IPoIB %p received packet too short to "
  498. "contain IPoIB header\n", ipoib );
  499. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  500. netdev_rx_err ( netdev, iobuf, -EIO );
  501. return;
  502. }
  503. if ( ! source ) {
  504. DBGC ( ipoib, "IPoIB %p received packet without address "
  505. "vector\n", ipoib );
  506. netdev_rx_err ( netdev, iobuf, -ENOTTY );
  507. return;
  508. }
  509. /* Strip real IPoIB header */
  510. ipoib_hdr = iobuf->data;
  511. net_proto = ipoib_hdr->proto;
  512. iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
  513. /* Construct source address from remote QPN and LID */
  514. remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
  515. remac.lid = htons ( source->lid );
  516. /* Translate packet if applicable */
  517. if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
  518. net_proto ) ) != 0 ) {
  519. netdev_rx_err ( netdev, iobuf, rc );
  520. return;
  521. }
  522. /* Prepend eIPoIB header */
  523. ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
  524. memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
  525. ethhdr->h_protocol = net_proto;
  526. /* Construct destination address */
  527. if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid,
  528. sizeof ( dest->gid ) ) == 0 ) ) {
  529. /* Broadcast GID; use the Ethernet broadcast address */
  530. memcpy ( &ethhdr->h_dest, eth_broadcast,
  531. sizeof ( ethhdr->h_dest ) );
  532. } else {
  533. /* Assume destination address is local Ethernet MAC */
  534. memcpy ( &ethhdr->h_dest, netdev->ll_addr,
  535. sizeof ( ethhdr->h_dest ) );
  536. }
  537. /* Hand off to network layer */
  538. netdev_rx ( netdev, iobuf );
  539. }
  540. /** IPoIB completion operations */
  541. static struct ib_completion_queue_operations ipoib_cq_op = {
  542. .complete_send = ipoib_complete_send,
  543. .complete_recv = ipoib_complete_recv,
  544. };
  545. /**
  546. * Allocate IPoIB receive I/O buffer
  547. *
  548. * @v len Length of buffer
  549. * @ret iobuf I/O buffer, or NULL
  550. *
  551. * Some Infiniband hardware requires 2kB alignment of receive buffers
  552. * and provides no way to disable header separation. The result is
  553. * that there are only four bytes of link-layer header (the real IPoIB
  554. * header) before the payload. This is not sufficient space to insert
  555. * an eIPoIB link-layer pseudo-header.
  556. *
  557. * We therefore allocate I/O buffers offset to start slightly before
  558. * the natural alignment boundary, in order to allow sufficient space.
  559. */
  560. static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
  561. struct io_buffer *iobuf;
  562. size_t reserve_len;
  563. /* Calculate additional length required at start of buffer */
  564. reserve_len = ( sizeof ( struct ethhdr ) -
  565. sizeof ( struct ipoib_hdr ) );
  566. /* Allocate buffer */
  567. iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
  568. if ( iobuf ) {
  569. iob_reserve ( iobuf, reserve_len );
  570. }
  571. return iobuf;
  572. }
  573. /** IPoIB queue pair operations */
  574. static struct ib_queue_pair_operations ipoib_qp_op = {
  575. .alloc_iob = ipoib_alloc_iob,
  576. };
  577. /**
  578. * Poll IPoIB network device
  579. *
  580. * @v netdev Network device
  581. */
  582. static void ipoib_poll ( struct net_device *netdev ) {
  583. struct ipoib_device *ipoib = netdev->priv;
  584. struct ib_device *ibdev = ipoib->ibdev;
  585. /* Poll Infiniband device */
  586. ib_poll_eq ( ibdev );
  587. /* Poll the retry timers (required for IPoIB multicast join) */
  588. retry_poll();
  589. }
  590. /**
  591. * Handle IPv4 broadcast multicast group join completion
  592. *
  593. * @v ibdev Infiniband device
  594. * @v qp Queue pair
  595. * @v membership Multicast group membership
  596. * @v rc Status code
  597. * @v mad Response MAD (or NULL on error)
  598. */
  599. void ipoib_join_complete ( struct ib_device *ibdev __unused,
  600. struct ib_queue_pair *qp __unused,
  601. struct ib_mc_membership *membership, int rc,
  602. union ib_mad *mad __unused ) {
  603. struct ipoib_device *ipoib = container_of ( membership,
  604. struct ipoib_device, broadcast_membership );
  605. /* Record join status as link status */
  606. netdev_link_err ( ipoib->netdev, rc );
  607. }
  608. /**
  609. * Join IPv4 broadcast multicast group
  610. *
  611. * @v ipoib IPoIB device
  612. * @ret rc Return status code
  613. */
  614. static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
  615. int rc;
  616. if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
  617. &ipoib->broadcast_membership,
  618. &ipoib->broadcast.gid,
  619. ipoib_join_complete ) ) != 0 ) {
  620. DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
  621. ipoib, strerror ( rc ) );
  622. return rc;
  623. }
  624. ipoib->broadcast_joined = 1;
  625. return 0;
  626. }
  627. /**
  628. * Leave IPv4 broadcast multicast group
  629. *
  630. * @v ipoib IPoIB device
  631. */
  632. static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
  633. if ( ipoib->broadcast_joined ) {
  634. ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
  635. &ipoib->broadcast_membership );
  636. ipoib->broadcast_joined = 0;
  637. }
  638. }
  639. /**
  640. * Handle link status change
  641. *
  642. * @v ibdev Infiniband device
  643. */
  644. static void ipoib_link_state_changed ( struct ib_device *ibdev ) {
  645. struct net_device *netdev = ib_get_ownerdata ( ibdev );
  646. struct ipoib_device *ipoib = netdev->priv;
  647. int rc;
  648. /* Leave existing broadcast group */
  649. ipoib_leave_broadcast_group ( ipoib );
  650. /* Update MAC address based on potentially-new GID prefix */
  651. memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
  652. sizeof ( ipoib->mac.gid.s.prefix ) );
  653. /* Update broadcast GID based on potentially-new partition key */
  654. ipoib->broadcast.gid.words[2] =
  655. htons ( ibdev->pkey | IB_PKEY_FULL );
  656. /* Set net device link state to reflect Infiniband link state */
  657. rc = ib_link_rc ( ibdev );
  658. netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
  659. /* Join new broadcast group */
  660. if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) &&
  661. ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
  662. DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
  663. "%s\n", ipoib, strerror ( rc ) );
  664. netdev_link_err ( netdev, rc );
  665. return;
  666. }
  667. }
  668. /**
  669. * Open IPoIB network device
  670. *
  671. * @v netdev Network device
  672. * @ret rc Return status code
  673. */
  674. static int ipoib_open ( struct net_device *netdev ) {
  675. struct ipoib_device *ipoib = netdev->priv;
  676. struct ib_device *ibdev = ipoib->ibdev;
  677. int rc;
  678. /* Open IB device */
  679. if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
  680. DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
  681. ipoib, strerror ( rc ) );
  682. goto err_ib_open;
  683. }
  684. /* Allocate completion queue */
  685. ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
  686. if ( ! ipoib->cq ) {
  687. DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
  688. ipoib );
  689. rc = -ENOMEM;
  690. goto err_create_cq;
  691. }
  692. /* Allocate queue pair */
  693. ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
  694. ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
  695. &ipoib_qp_op );
  696. if ( ! ipoib->qp ) {
  697. DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
  698. ipoib );
  699. rc = -ENOMEM;
  700. goto err_create_qp;
  701. }
  702. ib_qp_set_ownerdata ( ipoib->qp, ipoib );
  703. /* Update MAC address with QPN */
  704. ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
  705. /* Fill receive rings */
  706. ib_refill_recv ( ibdev, ipoib->qp );
  707. /* Fake a link status change to join the broadcast group */
  708. ipoib_link_state_changed ( ibdev );
  709. return 0;
  710. ib_destroy_qp ( ibdev, ipoib->qp );
  711. err_create_qp:
  712. ib_destroy_cq ( ibdev, ipoib->cq );
  713. err_create_cq:
  714. ib_close ( ibdev );
  715. err_ib_open:
  716. return rc;
  717. }
  718. /**
  719. * Close IPoIB network device
  720. *
  721. * @v netdev Network device
  722. */
  723. static void ipoib_close ( struct net_device *netdev ) {
  724. struct ipoib_device *ipoib = netdev->priv;
  725. struct ib_device *ibdev = ipoib->ibdev;
  726. /* Flush REMAC cache */
  727. ipoib_flush_remac ( ipoib );
  728. /* Leave broadcast group */
  729. ipoib_leave_broadcast_group ( ipoib );
  730. /* Remove QPN from MAC address */
  731. ipoib->mac.flags__qpn = 0;
  732. /* Tear down the queues */
  733. ib_destroy_qp ( ibdev, ipoib->qp );
  734. ib_destroy_cq ( ibdev, ipoib->cq );
  735. /* Close IB device */
  736. ib_close ( ibdev );
  737. }
  738. /** IPoIB network device operations */
  739. static struct net_device_operations ipoib_operations = {
  740. .open = ipoib_open,
  741. .close = ipoib_close,
  742. .transmit = ipoib_transmit,
  743. .poll = ipoib_poll,
  744. };
  745. /**
  746. * Probe IPoIB device
  747. *
  748. * @v ibdev Infiniband device
  749. * @ret rc Return status code
  750. */
  751. static int ipoib_probe ( struct ib_device *ibdev ) {
  752. struct net_device *netdev;
  753. struct ipoib_device *ipoib;
  754. int rc;
  755. /* Allocate network device */
  756. netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
  757. if ( ! netdev )
  758. return -ENOMEM;
  759. netdev_init ( netdev, &ipoib_operations );
  760. ipoib = netdev->priv;
  761. ib_set_ownerdata ( ibdev, netdev );
  762. netdev->dev = ibdev->dev;
  763. memset ( ipoib, 0, sizeof ( *ipoib ) );
  764. ipoib->netdev = netdev;
  765. ipoib->ibdev = ibdev;
  766. INIT_LIST_HEAD ( &ipoib->peers );
  767. /* Extract hardware address */
  768. memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
  769. sizeof ( ibdev->gid.s.guid ) );
  770. /* Set local MAC address */
  771. memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
  772. sizeof ( ipoib->mac.gid.s.guid ) );
  773. /* Set default broadcast MAC address */
  774. memcpy ( &ipoib->broadcast, &ipoib_broadcast,
  775. sizeof ( ipoib->broadcast ) );
  776. /* Register network device */
  777. if ( ( rc = register_netdev ( netdev ) ) != 0 )
  778. goto err_register_netdev;
  779. return 0;
  780. err_register_netdev:
  781. netdev_nullify ( netdev );
  782. netdev_put ( netdev );
  783. return rc;
  784. }
  /**
   * Remove IPoIB device
   *
   * @v ibdev		Infiniband device
   *
   * Unregisters and releases the network device stored as the
   * Infiniband device's owner data.
   */
  static void ipoib_remove ( struct ib_device *ibdev ) {
      struct net_device *netdev;

      netdev = ib_get_ownerdata ( ibdev );
      unregister_netdev ( netdev );
      netdev_nullify ( netdev );
      netdev_put ( netdev );
  }
  796. /** IPoIB driver */
  797. struct ib_driver ipoib_driver __ib_driver = {
  798. .name = "IPoIB",
  799. .probe = ipoib_probe,
  800. .notify = ipoib_link_state_changed,
  801. .remove = ipoib_remove,
  802. };