選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

ipoib.c 27KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029
  1. /*
  2. * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public License as
  6. * published by the Free Software Foundation; either version 2 of the
  7. * License, or any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17. * 02110-1301, USA.
  18. *
  19. * You can also choose to distribute this program under the terms of
  20. * the Unmodified Binary Distribution Licence (as given in the file
  21. * COPYING.UBDL), provided that you have satisfied its requirements.
  22. */
  23. FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
  24. #include <stdint.h>
  25. #include <stdlib.h>
  26. #include <stdio.h>
  27. #include <unistd.h>
  28. #include <string.h>
  29. #include <byteswap.h>
  30. #include <errno.h>
  31. #include <ipxe/errortab.h>
  32. #include <ipxe/malloc.h>
  33. #include <ipxe/if_arp.h>
  34. #include <ipxe/arp.h>
  35. #include <ipxe/if_ether.h>
  36. #include <ipxe/ethernet.h>
  37. #include <ipxe/ip.h>
  38. #include <ipxe/iobuf.h>
  39. #include <ipxe/netdevice.h>
  40. #include <ipxe/infiniband.h>
  41. #include <ipxe/ib_pathrec.h>
  42. #include <ipxe/ib_mcast.h>
  43. #include <ipxe/retry.h>
  44. #include <ipxe/ipoib.h>
  45. /** @file
  46. *
  47. * IP over Infiniband
  48. */
  /*
   * Distinct ENXIO variants, so that each "missing REMAC" failure
   * reports its own error message.
   */
  #define EINFO_ENXIO_ARP_REPLY \
      __einfo_uniqify ( EINFO_ENXIO, 0x01, \
                        "Missing REMAC for ARP reply target address" )
  #define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )

  #define EINFO_ENXIO_NON_IPV4 \
      __einfo_uniqify ( EINFO_ENXIO, 0x02, \
                        "Missing REMAC for non-IPv4 packet" )
  #define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )

  #define EINFO_ENXIO_ARP_SENT \
      __einfo_uniqify ( EINFO_ENXIO, 0x03, \
                        "Missing REMAC for IPv4 packet (ARP sent)" )
  #define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
  /** Send work queue entries requested for the IPoIB queue pair */
  #define IPOIB_NUM_SEND_WQES 2

  /** Receive work queue entries requested for the IPoIB queue pair */
  #define IPOIB_NUM_RECV_WQES 4

  /** Entries requested for the IPoIB completion queue */
  #define IPOIB_NUM_CQES 8
  68. /** An IPoIB broadcast address */
  69. struct ipoib_broadcast {
  70. /** MAC address */
  71. struct ipoib_mac mac;
  72. /** Address vector */
  73. struct ib_address_vector av;
  74. /** Multicast group membership */
  75. struct ib_mc_membership membership;
  76. };
  77. /** An IPoIB device */
  78. struct ipoib_device {
  79. /** Network device */
  80. struct net_device *netdev;
  81. /** Underlying Infiniband device */
  82. struct ib_device *ibdev;
  83. /** List of IPoIB devices */
  84. struct list_head list;
  85. /** Completion queue */
  86. struct ib_completion_queue *cq;
  87. /** Queue pair */
  88. struct ib_queue_pair *qp;
  89. /** Local MAC */
  90. struct ipoib_mac mac;
  91. /** Broadcast address */
  92. struct ipoib_broadcast broadcast;
  93. /** REMAC cache */
  94. struct list_head peers;
  95. };
  96. /** Broadcast IPoIB address */
  97. static struct ipoib_mac ipoib_broadcast = {
  98. .flags__qpn = htonl ( IB_QPN_BROADCAST ),
  99. .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  100. 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
  101. };
  102. /** Link status for "broadcast join in progress" */
  103. #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
  104. #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
  105. ( EINFO_EINPROGRESS, 0x01, "Joining" )
  106. /** Human-readable message for the link status */
  107. struct errortab ipoib_errors[] __errortab = {
  108. __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
  109. };
  110. /** List of all IPoIB devices */
  111. static LIST_HEAD ( ipoib_devices );
  112. static struct net_device_operations ipoib_operations;
  113. /****************************************************************************
  114. *
  115. * IPoIB REMAC cache
  116. *
  117. ****************************************************************************
  118. */
  119. /** An IPoIB REMAC cache entry */
  120. struct ipoib_peer {
  121. /** List of REMAC cache entries */
  122. struct list_head list;
  123. /** Remote Ethermet MAC */
  124. struct ipoib_remac remac;
  125. /** MAC address */
  126. struct ipoib_mac mac;
  127. };
  128. /**
  129. * Find IPoIB MAC from REMAC
  130. *
  131. * @v ipoib IPoIB device
  132. * @v remac Remote Ethernet MAC
  133. * @ret mac IPoIB MAC (or NULL if not found)
  134. */
  135. static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
  136. const struct ipoib_remac *remac ) {
  137. struct ipoib_peer *peer;
  138. /* Check for broadcast or multicast REMAC. We transmit
  139. * multicasts as broadcasts for simplicity.
  140. */
  141. if ( is_multicast_ether_addr ( remac ) )
  142. return &ipoib->broadcast.mac;
  143. /* Try to find via REMAC cache */
  144. list_for_each_entry ( peer, &ipoib->peers, list ) {
  145. if ( memcmp ( remac, &peer->remac,
  146. sizeof ( peer->remac ) ) == 0 ) {
  147. /* Move peer to start of list */
  148. list_del ( &peer->list );
  149. list_add ( &peer->list, &ipoib->peers );
  150. return &peer->mac;
  151. }
  152. }
  153. DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
  154. ipoib, eth_ntoa ( remac ) );
  155. return NULL;
  156. }
  157. /**
  158. * Add IPoIB MAC to REMAC cache
  159. *
  160. * @v ipoib IPoIB device
  161. * @v remac Remote Ethernet MAC
  162. * @v mac IPoIB MAC
  163. * @ret rc Return status code
  164. */
  165. static int ipoib_map_remac ( struct ipoib_device *ipoib,
  166. const struct ipoib_remac *remac,
  167. const struct ipoib_mac *mac ) {
  168. struct ipoib_peer *peer;
  169. /* Check for existing entry in REMAC cache */
  170. list_for_each_entry ( peer, &ipoib->peers, list ) {
  171. if ( memcmp ( remac, &peer->remac,
  172. sizeof ( peer->remac ) ) == 0 ) {
  173. /* Move peer to start of list */
  174. list_del ( &peer->list );
  175. list_add ( &peer->list, &ipoib->peers );
  176. /* Update MAC */
  177. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  178. return 0;
  179. }
  180. }
  181. /* Create new entry */
  182. peer = malloc ( sizeof ( *peer ) );
  183. if ( ! peer )
  184. return -ENOMEM;
  185. memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
  186. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  187. list_add ( &peer->list, &ipoib->peers );
  188. return 0;
  189. }
  190. /**
  191. * Flush REMAC cache
  192. *
  193. * @v ipoib IPoIB device
  194. */
  195. static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
  196. struct ipoib_peer *peer;
  197. struct ipoib_peer *tmp;
  198. list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
  199. list_del ( &peer->list );
  200. free ( peer );
  201. }
  202. }
  203. /**
  204. * Discard some entries from the REMAC cache
  205. *
  206. * @ret discarded Number of cached items discarded
  207. */
  208. static unsigned int ipoib_discard_remac ( void ) {
  209. struct net_device *netdev;
  210. struct ipoib_device *ipoib;
  211. struct ipoib_peer *peer;
  212. unsigned int discarded = 0;
  213. /* Try to discard one cache entry for each IPoIB device */
  214. for_each_netdev ( netdev ) {
  215. /* Skip non-IPoIB devices */
  216. if ( netdev->op != &ipoib_operations )
  217. continue;
  218. ipoib = netdev->priv;
  219. /* Discard least recently used cache entry (if any) */
  220. list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
  221. list_del ( &peer->list );
  222. free ( peer );
  223. discarded++;
  224. break;
  225. }
  226. }
  227. return discarded;
  228. }
  229. /** IPoIB cache discarder */
  230. struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
  231. .discard = ipoib_discard_remac,
  232. };
  233. /****************************************************************************
  234. *
  235. * IPoIB link layer
  236. *
  237. ****************************************************************************
  238. */
  239. /**
  240. * Initialise IPoIB link-layer address
  241. *
  242. * @v hw_addr Hardware address
  243. * @v ll_addr Link-layer address
  244. */
  245. static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
  246. const uint8_t *guid = hw_addr;
  247. uint8_t *eth_addr = ll_addr;
  248. uint8_t guid_mask = IPOIB_GUID_MASK;
  249. unsigned int i;
  250. /* Extract bytes from GUID according to mask */
  251. for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
  252. if ( guid_mask & 0x80 )
  253. *(eth_addr++) = *guid;
  254. }
  255. }
  256. /** IPoIB protocol */
  257. struct ll_protocol ipoib_protocol __ll_protocol = {
  258. .name = "IPoIB",
  259. .ll_proto = htons ( ARPHRD_ETHER ),
  260. .hw_addr_len = sizeof ( union ib_guid ),
  261. .ll_addr_len = ETH_ALEN,
  262. .ll_header_len = ETH_HLEN,
  263. .push = eth_push,
  264. .pull = eth_pull,
  265. .init_addr = ipoib_init_addr,
  266. .ntoa = eth_ntoa,
  267. .mc_hash = eth_mc_hash,
  268. .eth_addr = eth_eth_addr,
  269. .eui64 = eth_eui64,
  270. .flags = LL_NAME_ONLY,
  271. };
  272. /**
  273. * Allocate IPoIB device
  274. *
  275. * @v priv_size Size of driver private data
  276. * @ret netdev Network device, or NULL
  277. */
  278. struct net_device * alloc_ipoibdev ( size_t priv_size ) {
  279. struct net_device *netdev;
  280. netdev = alloc_netdev ( priv_size );
  281. if ( netdev ) {
  282. netdev->ll_protocol = &ipoib_protocol;
  283. netdev->ll_broadcast = eth_broadcast;
  284. netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
  285. }
  286. return netdev;
  287. }
  288. /****************************************************************************
  289. *
  290. * IPoIB translation layer
  291. *
  292. ****************************************************************************
  293. */
  294. /**
  295. * Translate transmitted ARP packet
  296. *
  297. * @v netdev Network device
  298. * @v iobuf Packet to be transmitted (with no link-layer headers)
  299. * @ret rc Return status code
  300. */
  301. static int ipoib_translate_tx_arp ( struct net_device *netdev,
  302. struct io_buffer *iobuf ) {
  303. struct ipoib_device *ipoib = netdev->priv;
  304. struct arphdr *arphdr = iobuf->data;
  305. struct ipoib_mac *target_ha = NULL;
  306. void *sender_pa;
  307. void *target_pa;
  308. /* Do nothing unless ARP contains eIPoIB link-layer addresses */
  309. if ( arphdr->ar_hln != ETH_ALEN )
  310. return 0;
  311. /* Fail unless we have room to expand packet */
  312. if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
  313. ETH_ALEN ) ) ) {
  314. DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
  315. ipoib );
  316. return -ENOBUFS;
  317. }
  318. /* Look up REMAC, if applicable */
  319. if ( arphdr->ar_op == ARPOP_REPLY ) {
  320. target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
  321. if ( ! target_ha ) {
  322. DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
  323. ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
  324. return -ENXIO_ARP_REPLY;
  325. }
  326. }
  327. /* Construct new packet */
  328. iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  329. sender_pa = arp_sender_pa ( arphdr );
  330. target_pa = arp_target_pa ( arphdr );
  331. arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
  332. arphdr->ar_hln = sizeof ( ipoib->mac );
  333. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  334. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  335. memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
  336. memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
  337. if ( target_ha ) {
  338. memcpy ( arp_target_ha ( arphdr ), target_ha,
  339. sizeof ( *target_ha ) );
  340. }
  341. return 0;
  342. }
  343. /**
  344. * Translate transmitted packet
  345. *
  346. * @v netdev Network device
  347. * @v iobuf Packet to be transmitted (with no link-layer headers)
  348. * @v net_proto Network-layer protocol (in network byte order)
  349. * @ret rc Return status code
  350. */
  351. static int ipoib_translate_tx ( struct net_device *netdev,
  352. struct io_buffer *iobuf, uint16_t net_proto ) {
  353. switch ( net_proto ) {
  354. case htons ( ETH_P_ARP ) :
  355. return ipoib_translate_tx_arp ( netdev, iobuf );
  356. case htons ( ETH_P_IP ) :
  357. /* No translation needed */
  358. return 0;
  359. default:
  360. /* Cannot handle other traffic via eIPoIB */
  361. return -ENOTSUP;
  362. }
  363. }
  364. /**
  365. * Translate received ARP packet
  366. *
  367. * @v netdev Network device
  368. * @v iobuf Received packet (with no link-layer headers)
  369. * @v remac Constructed Remote Ethernet MAC
  370. * @ret rc Return status code
  371. */
  372. static int ipoib_translate_rx_arp ( struct net_device *netdev,
  373. struct io_buffer *iobuf,
  374. struct ipoib_remac *remac ) {
  375. struct ipoib_device *ipoib = netdev->priv;
  376. struct arphdr *arphdr = iobuf->data;
  377. void *sender_pa;
  378. void *target_pa;
  379. int rc;
  380. /* Do nothing unless ARP contains IPoIB link-layer addresses */
  381. if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
  382. return 0;
  383. /* Create REMAC cache entry */
  384. if ( ( rc = ipoib_map_remac ( ipoib, remac,
  385. arp_sender_ha ( arphdr ) ) ) != 0 ) {
  386. DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
  387. ipoib, strerror ( rc ) );
  388. return rc;
  389. }
  390. /* Construct new packet */
  391. sender_pa = arp_sender_pa ( arphdr );
  392. target_pa = arp_target_pa ( arphdr );
  393. arphdr->ar_hrd = htons ( ARPHRD_ETHER );
  394. arphdr->ar_hln = ETH_ALEN;
  395. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  396. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  397. memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
  398. memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
  399. if ( arphdr->ar_op == ARPOP_REPLY ) {
  400. /* Assume received replies were directed to us */
  401. memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
  402. }
  403. iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  404. return 0;
  405. }
  406. /**
  407. * Translate received packet
  408. *
  409. * @v netdev Network device
  410. * @v iobuf Received packet (with no link-layer headers)
  411. * @v remac Constructed Remote Ethernet MAC
  412. * @v net_proto Network-layer protocol (in network byte order)
  413. * @ret rc Return status code
  414. */
  415. static int ipoib_translate_rx ( struct net_device *netdev,
  416. struct io_buffer *iobuf,
  417. struct ipoib_remac *remac,
  418. uint16_t net_proto ) {
  419. switch ( net_proto ) {
  420. case htons ( ETH_P_ARP ) :
  421. return ipoib_translate_rx_arp ( netdev, iobuf, remac );
  422. case htons ( ETH_P_IP ) :
  423. /* No translation needed */
  424. return 0;
  425. default:
  426. /* Cannot handle other traffic via eIPoIB */
  427. return -ENOTSUP;
  428. }
  429. }
  430. /****************************************************************************
  431. *
  432. * IPoIB network device
  433. *
  434. ****************************************************************************
  435. */
  436. /**
  437. * Transmit packet via IPoIB network device
  438. *
  439. * @v netdev Network device
  440. * @v iobuf I/O buffer
  441. * @ret rc Return status code
  442. */
  443. static int ipoib_transmit ( struct net_device *netdev,
  444. struct io_buffer *iobuf ) {
  445. struct ipoib_device *ipoib = netdev->priv;
  446. struct ib_device *ibdev = ipoib->ibdev;
  447. struct ethhdr *ethhdr;
  448. struct iphdr *iphdr;
  449. struct ipoib_hdr *ipoib_hdr;
  450. struct ipoib_remac *remac;
  451. struct ipoib_mac *mac;
  452. struct ib_address_vector *dest;
  453. struct ib_address_vector av;
  454. uint16_t net_proto;
  455. int rc;
  456. /* Sanity check */
  457. if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
  458. DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
  459. return -EINVAL;
  460. }
  461. /* Attempting transmission while link is down will put the
  462. * queue pair into an error state, so don't try it.
  463. */
  464. if ( ! ib_link_ok ( ibdev ) )
  465. return -ENETUNREACH;
  466. /* Strip eIPoIB header */
  467. ethhdr = iobuf->data;
  468. remac = ( ( struct ipoib_remac * ) ethhdr->h_dest );
  469. net_proto = ethhdr->h_protocol;
  470. iob_pull ( iobuf, sizeof ( *ethhdr ) );
  471. /* Identify destination address */
  472. if ( is_multicast_ether_addr ( remac ) ) {
  473. /* Transmit multicasts as broadcasts, for simplicity */
  474. dest = &ipoib->broadcast.av;
  475. } else if ( ( mac = ipoib_find_remac ( ipoib, remac ) ) ) {
  476. /* Construct address vector from IPoIB MAC */
  477. dest = &av;
  478. memset ( dest, 0, sizeof ( *dest ) );
  479. dest->qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
  480. dest->qkey = ipoib->broadcast.av.qkey;
  481. dest->gid_present = 1;
  482. memcpy ( &dest->gid, &mac->gid, sizeof ( dest->gid ) );
  483. if ( ( rc = ib_resolve_path ( ibdev, dest ) ) != 0 ) {
  484. /* Path not resolved yet */
  485. return rc;
  486. }
  487. } else {
  488. /* Generate a new ARP request (if possible) to trigger
  489. * population of the REMAC cache entry.
  490. */
  491. if ( ( net_proto != htons ( ETH_P_IP ) ) ||
  492. ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
  493. DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
  494. "packet type %04x\n", ipoib,
  495. eth_ntoa ( ethhdr->h_dest ),
  496. ntohs ( net_proto ) );
  497. return -ENXIO_NON_IPV4;
  498. }
  499. iphdr = iobuf->data;
  500. if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
  501. &iphdr->dest, &iphdr->src ) ) !=0){
  502. DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
  503. ipoib, eth_ntoa ( ethhdr->h_dest ),
  504. inet_ntoa ( iphdr->dest ) );
  505. DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
  506. strerror ( rc ) );
  507. return rc;
  508. }
  509. DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
  510. eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
  511. DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
  512. return -ENXIO_ARP_SENT;
  513. }
  514. /* Translate packet if applicable */
  515. if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
  516. return rc;
  517. /* Prepend real IPoIB header */
  518. ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
  519. ipoib_hdr->proto = net_proto;
  520. ipoib_hdr->reserved = 0;
  521. /* Transmit packet */
  522. return ib_post_send ( ibdev, ipoib->qp, dest, iobuf );
  523. }
  524. /**
  525. * Handle IPoIB send completion
  526. *
  527. * @v ibdev Infiniband device
  528. * @v qp Queue pair
  529. * @v iobuf I/O buffer
  530. * @v rc Completion status code
  531. */
  532. static void ipoib_complete_send ( struct ib_device *ibdev __unused,
  533. struct ib_queue_pair *qp,
  534. struct io_buffer *iobuf, int rc ) {
  535. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  536. netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
  537. }
  538. /**
  539. * Handle IPoIB receive completion
  540. *
  541. * @v ibdev Infiniband device
  542. * @v qp Queue pair
  543. * @v dest Destination address vector, or NULL
  544. * @v source Source address vector, or NULL
  545. * @v iobuf I/O buffer
  546. * @v rc Completion status code
  547. */
  548. static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
  549. struct ib_queue_pair *qp,
  550. struct ib_address_vector *dest,
  551. struct ib_address_vector *source,
  552. struct io_buffer *iobuf, int rc ) {
  553. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  554. struct net_device *netdev = ipoib->netdev;
  555. struct ipoib_hdr *ipoib_hdr;
  556. struct ethhdr *ethhdr;
  557. struct ipoib_remac remac;
  558. uint16_t net_proto;
  559. /* Record errors */
  560. if ( rc != 0 ) {
  561. netdev_rx_err ( netdev, iobuf, rc );
  562. return;
  563. }
  564. /* Sanity check */
  565. if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
  566. DBGC ( ipoib, "IPoIB %p received packet too short to "
  567. "contain IPoIB header\n", ipoib );
  568. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  569. netdev_rx_err ( netdev, iobuf, -EIO );
  570. return;
  571. }
  572. if ( ! source ) {
  573. DBGC ( ipoib, "IPoIB %p received packet without address "
  574. "vector\n", ipoib );
  575. netdev_rx_err ( netdev, iobuf, -ENOTTY );
  576. return;
  577. }
  578. /* Strip real IPoIB header */
  579. ipoib_hdr = iobuf->data;
  580. net_proto = ipoib_hdr->proto;
  581. iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
  582. /* Construct source address from remote QPN and LID */
  583. remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
  584. remac.lid = htons ( source->lid );
  585. /* Translate packet if applicable */
  586. if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
  587. net_proto ) ) != 0 ) {
  588. netdev_rx_err ( netdev, iobuf, rc );
  589. return;
  590. }
  591. /* Prepend eIPoIB header */
  592. ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
  593. memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
  594. ethhdr->h_protocol = net_proto;
  595. /* Construct destination address */
  596. if ( IB_LID_MULTICAST ( dest->lid ) ) {
  597. /* Multicast LID; use the Ethernet broadcast address */
  598. memcpy ( &ethhdr->h_dest, eth_broadcast,
  599. sizeof ( ethhdr->h_dest ) );
  600. } else {
  601. /* Assume destination address is local Ethernet MAC */
  602. memcpy ( &ethhdr->h_dest, netdev->ll_addr,
  603. sizeof ( ethhdr->h_dest ) );
  604. }
  605. /* Hand off to network layer */
  606. netdev_rx ( netdev, iobuf );
  607. }
  608. /** IPoIB completion operations */
  609. static struct ib_completion_queue_operations ipoib_cq_op = {
  610. .complete_send = ipoib_complete_send,
  611. .complete_recv = ipoib_complete_recv,
  612. };
  613. /**
  614. * Allocate IPoIB receive I/O buffer
  615. *
  616. * @v len Length of buffer
  617. * @ret iobuf I/O buffer, or NULL
  618. *
  619. * Some Infiniband hardware requires 2kB alignment of receive buffers
  620. * and provides no way to disable header separation. The result is
  621. * that there are only four bytes of link-layer header (the real IPoIB
  622. * header) before the payload. This is not sufficient space to insert
  623. * an eIPoIB link-layer pseudo-header.
  624. *
  625. * We therefore allocate I/O buffers offset to start slightly before
  626. * the natural alignment boundary, in order to allow sufficient space.
  627. */
  628. static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
  629. struct io_buffer *iobuf;
  630. size_t reserve_len;
  631. /* Calculate additional length required at start of buffer */
  632. reserve_len = ( sizeof ( struct ethhdr ) -
  633. sizeof ( struct ipoib_hdr ) );
  634. /* Allocate buffer */
  635. iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
  636. if ( iobuf ) {
  637. iob_reserve ( iobuf, reserve_len );
  638. }
  639. return iobuf;
  640. }
  641. /** IPoIB queue pair operations */
  642. static struct ib_queue_pair_operations ipoib_qp_op = {
  643. .alloc_iob = ipoib_alloc_iob,
  644. };
  645. /**
  646. * Poll IPoIB network device
  647. *
  648. * @v netdev Network device
  649. */
  650. static void ipoib_poll ( struct net_device *netdev ) {
  651. struct ipoib_device *ipoib = netdev->priv;
  652. struct ib_device *ibdev = ipoib->ibdev;
  653. /* Poll Infiniband device */
  654. ib_poll_eq ( ibdev );
  655. /* Poll the retry timers (required for IPoIB multicast join) */
  656. retry_poll();
  657. }
  658. /**
  659. * Handle IPv4 broadcast multicast group join completion
  660. *
  661. * @v membership Multicast group membership
  662. * @v rc Status code
  663. */
  664. void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
  665. struct ipoib_device *ipoib = container_of ( membership,
  666. struct ipoib_device,
  667. broadcast.membership );
  668. /* Record join status as link status */
  669. netdev_link_err ( ipoib->netdev, rc );
  670. }
  671. /**
  672. * Join IPv4 broadcast multicast group
  673. *
  674. * @v ipoib IPoIB device
  675. * @ret rc Return status code
  676. */
  677. static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
  678. int rc;
  679. /* Join multicast group */
  680. if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
  681. &ipoib->broadcast.membership,
  682. &ipoib->broadcast.av, 0,
  683. ipoib_join_complete ) ) != 0 ) {
  684. DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
  685. ipoib, strerror ( rc ) );
  686. return rc;
  687. }
  688. return 0;
  689. }
  690. /**
  691. * Leave IPv4 broadcast multicast group
  692. *
  693. * @v ipoib IPoIB device
  694. */
  695. static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
  696. /* Leave multicast group */
  697. ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
  698. &ipoib->broadcast.membership );
  699. }
  700. /**
  701. * Handle link status change
  702. *
  703. * @v ipoib IPoIB device
  704. */
  705. static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
  706. struct ib_device *ibdev = ipoib->ibdev;
  707. struct net_device *netdev = ipoib->netdev;
  708. int rc;
  709. /* Leave existing broadcast group */
  710. if ( ipoib->qp )
  711. ipoib_leave_broadcast_group ( ipoib );
  712. /* Update MAC address based on potentially-new GID prefix */
  713. memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
  714. sizeof ( ipoib->mac.gid.s.prefix ) );
  715. /* Update broadcast MAC GID based on potentially-new partition key */
  716. ipoib->broadcast.mac.gid.words[2] =
  717. htons ( ibdev->pkey | IB_PKEY_FULL );
  718. /* Construct broadcast address vector from broadcast MAC address */
  719. memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) );
  720. ipoib->broadcast.av.qpn = IB_QPN_BROADCAST;
  721. ipoib->broadcast.av.gid_present = 1;
  722. memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid,
  723. sizeof ( ipoib->broadcast.av.gid ) );
  724. /* Set net device link state to reflect Infiniband link state */
  725. rc = ib_link_rc ( ibdev );
  726. netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
  727. /* Join new broadcast group */
  728. if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
  729. ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
  730. DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
  731. "%s\n", ipoib, strerror ( rc ) );
  732. netdev_link_err ( netdev, rc );
  733. return;
  734. }
  735. }
  736. /**
  737. * Open IPoIB network device
  738. *
  739. * @v netdev Network device
  740. * @ret rc Return status code
  741. */
  742. static int ipoib_open ( struct net_device *netdev ) {
  743. struct ipoib_device *ipoib = netdev->priv;
  744. struct ib_device *ibdev = ipoib->ibdev;
  745. int rc;
  746. /* Open IB device */
  747. if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
  748. DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
  749. ipoib, strerror ( rc ) );
  750. goto err_ib_open;
  751. }
  752. /* Allocate completion queue */
  753. ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
  754. if ( ! ipoib->cq ) {
  755. DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
  756. ipoib );
  757. rc = -ENOMEM;
  758. goto err_create_cq;
  759. }
  760. /* Allocate queue pair */
  761. ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
  762. ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
  763. &ipoib_qp_op );
  764. if ( ! ipoib->qp ) {
  765. DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
  766. ipoib );
  767. rc = -ENOMEM;
  768. goto err_create_qp;
  769. }
  770. ib_qp_set_ownerdata ( ipoib->qp, ipoib );
  771. /* Update MAC address with QPN */
  772. ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
  773. /* Fill receive rings */
  774. ib_refill_recv ( ibdev, ipoib->qp );
  775. /* Fake a link status change to join the broadcast group */
  776. ipoib_link_state_changed ( ipoib );
  777. return 0;
  778. ib_destroy_qp ( ibdev, ipoib->qp );
  779. err_create_qp:
  780. ib_destroy_cq ( ibdev, ipoib->cq );
  781. err_create_cq:
  782. ib_close ( ibdev );
  783. err_ib_open:
  784. return rc;
  785. }
  786. /**
  787. * Close IPoIB network device
  788. *
  789. * @v netdev Network device
  790. */
  791. static void ipoib_close ( struct net_device *netdev ) {
  792. struct ipoib_device *ipoib = netdev->priv;
  793. struct ib_device *ibdev = ipoib->ibdev;
  794. /* Flush REMAC cache */
  795. ipoib_flush_remac ( ipoib );
  796. /* Leave broadcast group */
  797. ipoib_leave_broadcast_group ( ipoib );
  798. /* Remove QPN from MAC address */
  799. ipoib->mac.flags__qpn = 0;
  800. /* Tear down the queues */
  801. ib_destroy_qp ( ibdev, ipoib->qp );
  802. ipoib->qp = NULL;
  803. ib_destroy_cq ( ibdev, ipoib->cq );
  804. ipoib->cq = NULL;
  805. /* Close IB device */
  806. ib_close ( ibdev );
  807. }
  808. /** IPoIB network device operations */
  809. static struct net_device_operations ipoib_operations = {
  810. .open = ipoib_open,
  811. .close = ipoib_close,
  812. .transmit = ipoib_transmit,
  813. .poll = ipoib_poll,
  814. };
  815. /**
  816. * Probe IPoIB device
  817. *
  818. * @v ibdev Infiniband device
  819. * @ret rc Return status code
  820. */
  821. static int ipoib_probe ( struct ib_device *ibdev ) {
  822. struct net_device *netdev;
  823. struct ipoib_device *ipoib;
  824. int rc;
  825. /* Allocate network device */
  826. netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
  827. if ( ! netdev )
  828. return -ENOMEM;
  829. netdev_init ( netdev, &ipoib_operations );
  830. ipoib = netdev->priv;
  831. netdev->dev = ibdev->dev;
  832. memset ( ipoib, 0, sizeof ( *ipoib ) );
  833. ipoib->netdev = netdev;
  834. ipoib->ibdev = ibdev;
  835. INIT_LIST_HEAD ( &ipoib->peers );
  836. /* Extract hardware address */
  837. memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
  838. sizeof ( ibdev->gid.s.guid ) );
  839. /* Set local MAC address */
  840. memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
  841. sizeof ( ipoib->mac.gid.s.guid ) );
  842. /* Set default broadcast MAC address */
  843. memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast,
  844. sizeof ( ipoib->broadcast.mac ) );
  845. /* Add to list of IPoIB devices */
  846. list_add_tail ( &ipoib->list, &ipoib_devices );
  847. /* Register network device */
  848. if ( ( rc = register_netdev ( netdev ) ) != 0 )
  849. goto err_register_netdev;
  850. return 0;
  851. unregister_netdev ( netdev );
  852. err_register_netdev:
  853. list_del ( &ipoib->list );
  854. netdev_nullify ( netdev );
  855. netdev_put ( netdev );
  856. return rc;
  857. }
  858. /**
  859. * Handle device or link status change
  860. *
  861. * @v ibdev Infiniband device
  862. */
  863. static void ipoib_notify ( struct ib_device *ibdev ) {
  864. struct ipoib_device *ipoib;
  865. /* Handle link status change for any attached IPoIB devices */
  866. list_for_each_entry ( ipoib, &ipoib_devices, list ) {
  867. if ( ipoib->ibdev != ibdev )
  868. continue;
  869. ipoib_link_state_changed ( ipoib );
  870. }
  871. }
  872. /**
  873. * Remove IPoIB device
  874. *
  875. * @v ibdev Infiniband device
  876. */
  877. static void ipoib_remove ( struct ib_device *ibdev ) {
  878. struct ipoib_device *ipoib;
  879. struct ipoib_device *tmp;
  880. struct net_device *netdev;
  881. /* Remove any attached IPoIB devices */
  882. list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) {
  883. if ( ipoib->ibdev != ibdev )
  884. continue;
  885. netdev = ipoib->netdev;
  886. unregister_netdev ( netdev );
  887. list_del ( &ipoib->list );
  888. netdev_nullify ( netdev );
  889. netdev_put ( netdev );
  890. }
  891. }
  892. /** IPoIB driver */
  893. struct ib_driver ipoib_driver __ib_driver = {
  894. .name = "IPoIB",
  895. .probe = ipoib_probe,
  896. .notify = ipoib_notify,
  897. .remove = ipoib_remove,
  898. };