You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ipoib.c 28KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046
  1. /*
  2. * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public License as
  6. * published by the Free Software Foundation; either version 2 of the
  7. * License, or any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17. * 02110-1301, USA.
  18. *
  19. * You can also choose to distribute this program under the terms of
  20. * the Unmodified Binary Distribution Licence (as given in the file
  21. * COPYING.UBDL), provided that you have satisfied its requirements.
  22. */
  23. FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
  24. #include <stdint.h>
  25. #include <stdlib.h>
  26. #include <stdio.h>
  27. #include <unistd.h>
  28. #include <string.h>
  29. #include <byteswap.h>
  30. #include <errno.h>
  31. #include <ipxe/errortab.h>
  32. #include <ipxe/malloc.h>
  33. #include <ipxe/if_arp.h>
  34. #include <ipxe/arp.h>
  35. #include <ipxe/if_ether.h>
  36. #include <ipxe/ethernet.h>
  37. #include <ipxe/ip.h>
  38. #include <ipxe/iobuf.h>
  39. #include <ipxe/netdevice.h>
  40. #include <ipxe/infiniband.h>
  41. #include <ipxe/ib_pathrec.h>
  42. #include <ipxe/ib_mcast.h>
  43. #include <ipxe/retry.h>
  44. #include <ipxe/ipoib.h>
  45. /** @file
  46. *
  47. * IP over Infiniband
  48. */
  /*
   * Disambiguated ENXIO variants
   *
   * Each distinct "missing REMAC" failure gets its own error number
   * and description so that the cause can be told apart.
   */

  /* No REMAC known for the target of an outgoing ARP reply */
  #define EINFO_ENXIO_ARP_REPLY                                           \
      __einfo_uniqify ( EINFO_ENXIO, 0x01,                                \
                        "Missing REMAC for ARP reply target address" )
  #define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )

  /* No REMAC known, and the packet is not IPv4 so no ARP can be sent */
  #define EINFO_ENXIO_NON_IPV4                                            \
      __einfo_uniqify ( EINFO_ENXIO, 0x02,                                \
                        "Missing REMAC for non-IPv4 packet" )
  #define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )

  /* No REMAC known for an IPv4 packet; an ARP request was sent instead */
  #define EINFO_ENXIO_ARP_SENT                                            \
      __einfo_uniqify ( EINFO_ENXIO, 0x03,                                \
                        "Missing REMAC for IPv4 packet (ARP sent)" )
  #define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
  /*
   * Queue sizing
   *
   * ipoib_open() attaches both the send and the receive work queue to
   * one completion queue.
   */

  /** Number of IPoIB send work queue entries */
  #define IPOIB_NUM_SEND_WQES     8

  /** Number of IPoIB receive work queue entries */
  #define IPOIB_NUM_RECV_WQES     4

  /** Number of IPoIB completion entries */
  #define IPOIB_NUM_CQES          16
  68. /** An IPoIB broadcast address */
  69. struct ipoib_broadcast {
  70. /** MAC address */
  71. struct ipoib_mac mac;
  72. /** Address vector */
  73. struct ib_address_vector av;
  74. /** Multicast group membership */
  75. struct ib_mc_membership membership;
  76. };
  77. /** An IPoIB device */
  78. struct ipoib_device {
  79. /** Network device */
  80. struct net_device *netdev;
  81. /** Underlying Infiniband device */
  82. struct ib_device *ibdev;
  83. /** List of IPoIB devices */
  84. struct list_head list;
  85. /** Completion queue */
  86. struct ib_completion_queue *cq;
  87. /** Queue pair */
  88. struct ib_queue_pair *qp;
  89. /** Local MAC */
  90. struct ipoib_mac mac;
  91. /** Broadcast address */
  92. struct ipoib_broadcast broadcast;
  93. /** REMAC cache */
  94. struct list_head peers;
  95. };
  96. /** Broadcast IPoIB address */
  97. static struct ipoib_mac ipoib_broadcast = {
  98. .flags__qpn = htonl ( IB_QPN_BROADCAST ),
  99. .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  100. 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
  101. };
  102. /** Link status for "broadcast join in progress" */
  103. #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
  104. #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
  105. ( EINFO_EINPROGRESS, 0x01, "Joining" )
  106. /** Human-readable message for the link status */
  107. struct errortab ipoib_errors[] __errortab = {
  108. __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
  109. };
  110. /** List of all IPoIB devices */
  111. static LIST_HEAD ( ipoib_devices );
  112. static struct net_device_operations ipoib_operations;
  113. /****************************************************************************
  114. *
  115. * IPoIB REMAC cache
  116. *
  117. ****************************************************************************
  118. */
  119. /** An IPoIB REMAC cache entry */
  120. struct ipoib_peer {
  121. /** List of REMAC cache entries */
  122. struct list_head list;
  123. /** Remote Ethermet MAC */
  124. struct ipoib_remac remac;
  125. /** MAC address */
  126. struct ipoib_mac mac;
  127. };
  128. /**
  129. * Find IPoIB MAC from REMAC
  130. *
  131. * @v ipoib IPoIB device
  132. * @v remac Remote Ethernet MAC
  133. * @ret mac IPoIB MAC (or NULL if not found)
  134. */
  135. static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
  136. const struct ipoib_remac *remac ) {
  137. struct ipoib_peer *peer;
  138. /* Check for broadcast or multicast REMAC. We transmit
  139. * multicasts as broadcasts for simplicity.
  140. */
  141. if ( is_multicast_ether_addr ( remac ) )
  142. return &ipoib->broadcast.mac;
  143. /* Try to find via REMAC cache */
  144. list_for_each_entry ( peer, &ipoib->peers, list ) {
  145. if ( memcmp ( remac, &peer->remac,
  146. sizeof ( peer->remac ) ) == 0 ) {
  147. /* Move peer to start of list */
  148. list_del ( &peer->list );
  149. list_add ( &peer->list, &ipoib->peers );
  150. return &peer->mac;
  151. }
  152. }
  153. DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
  154. ipoib, eth_ntoa ( remac ) );
  155. return NULL;
  156. }
  157. /**
  158. * Add IPoIB MAC to REMAC cache
  159. *
  160. * @v ipoib IPoIB device
  161. * @v remac Remote Ethernet MAC
  162. * @v mac IPoIB MAC
  163. * @ret rc Return status code
  164. */
  165. static int ipoib_map_remac ( struct ipoib_device *ipoib,
  166. const struct ipoib_remac *remac,
  167. const struct ipoib_mac *mac ) {
  168. struct ipoib_peer *peer;
  169. /* Check for existing entry in REMAC cache */
  170. list_for_each_entry ( peer, &ipoib->peers, list ) {
  171. if ( memcmp ( remac, &peer->remac,
  172. sizeof ( peer->remac ) ) == 0 ) {
  173. /* Move peer to start of list */
  174. list_del ( &peer->list );
  175. list_add ( &peer->list, &ipoib->peers );
  176. /* Update MAC */
  177. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  178. return 0;
  179. }
  180. }
  181. /* Create new entry */
  182. peer = malloc ( sizeof ( *peer ) );
  183. if ( ! peer )
  184. return -ENOMEM;
  185. memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
  186. memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
  187. list_add ( &peer->list, &ipoib->peers );
  188. return 0;
  189. }
  190. /**
  191. * Flush REMAC cache
  192. *
  193. * @v ipoib IPoIB device
  194. */
  195. static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
  196. struct ipoib_peer *peer;
  197. struct ipoib_peer *tmp;
  198. list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
  199. list_del ( &peer->list );
  200. free ( peer );
  201. }
  202. }
  203. /**
  204. * Discard some entries from the REMAC cache
  205. *
  206. * @ret discarded Number of cached items discarded
  207. */
  208. static unsigned int ipoib_discard_remac ( void ) {
  209. struct net_device *netdev;
  210. struct ipoib_device *ipoib;
  211. struct ipoib_peer *peer;
  212. unsigned int discarded = 0;
  213. /* Try to discard one cache entry for each IPoIB device */
  214. for_each_netdev ( netdev ) {
  215. /* Skip non-IPoIB devices */
  216. if ( netdev->op != &ipoib_operations )
  217. continue;
  218. ipoib = netdev->priv;
  219. /* Discard least recently used cache entry (if any) */
  220. list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
  221. list_del ( &peer->list );
  222. free ( peer );
  223. discarded++;
  224. break;
  225. }
  226. }
  227. return discarded;
  228. }
  229. /** IPoIB cache discarder */
  230. struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
  231. .discard = ipoib_discard_remac,
  232. };
  233. /****************************************************************************
  234. *
  235. * IPoIB link layer
  236. *
  237. ****************************************************************************
  238. */
  239. /**
  240. * Initialise IPoIB link-layer address
  241. *
  242. * @v hw_addr Hardware address
  243. * @v ll_addr Link-layer address
  244. */
  245. static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
  246. const uint8_t *guid = hw_addr;
  247. uint8_t *eth_addr = ll_addr;
  248. uint8_t guid_mask = IPOIB_GUID_MASK;
  249. unsigned int i;
  250. /* Extract bytes from GUID according to mask */
  251. for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
  252. if ( guid_mask & 0x80 )
  253. *(eth_addr++) = *guid;
  254. }
  255. }
  256. /** IPoIB protocol */
  257. struct ll_protocol ipoib_protocol __ll_protocol = {
  258. .name = "IPoIB",
  259. .ll_proto = htons ( ARPHRD_ETHER ),
  260. .hw_addr_len = sizeof ( union ib_guid ),
  261. .ll_addr_len = ETH_ALEN,
  262. .ll_header_len = ETH_HLEN,
  263. .push = eth_push,
  264. .pull = eth_pull,
  265. .init_addr = ipoib_init_addr,
  266. .ntoa = eth_ntoa,
  267. .mc_hash = eth_mc_hash,
  268. .eth_addr = eth_eth_addr,
  269. .eui64 = eth_eui64,
  270. .flags = LL_NAME_ONLY,
  271. };
  272. /**
  273. * Allocate IPoIB device
  274. *
  275. * @v priv_size Size of driver private data
  276. * @ret netdev Network device, or NULL
  277. */
  278. struct net_device * alloc_ipoibdev ( size_t priv_size ) {
  279. struct net_device *netdev;
  280. netdev = alloc_netdev ( priv_size );
  281. if ( netdev ) {
  282. netdev->ll_protocol = &ipoib_protocol;
  283. netdev->ll_broadcast = eth_broadcast;
  284. netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
  285. }
  286. return netdev;
  287. }
  288. /****************************************************************************
  289. *
  290. * IPoIB translation layer
  291. *
  292. ****************************************************************************
  293. */
  294. /**
  295. * Translate transmitted ARP packet
  296. *
  297. * @v netdev Network device
  298. * @v iobuf Packet to be transmitted (with no link-layer headers)
  299. * @ret rc Return status code
  300. */
  301. static int ipoib_translate_tx_arp ( struct net_device *netdev,
  302. struct io_buffer *iobuf ) {
  303. struct ipoib_device *ipoib = netdev->priv;
  304. struct arphdr *arphdr = iobuf->data;
  305. struct ipoib_mac *target_ha = NULL;
  306. void *sender_pa;
  307. void *target_pa;
  308. /* Do nothing unless ARP contains eIPoIB link-layer addresses */
  309. if ( arphdr->ar_hln != ETH_ALEN )
  310. return 0;
  311. /* Fail unless we have room to expand packet */
  312. if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
  313. ETH_ALEN ) ) ) {
  314. DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
  315. ipoib );
  316. return -ENOBUFS;
  317. }
  318. /* Look up REMAC, if applicable */
  319. if ( arphdr->ar_op == ARPOP_REPLY ) {
  320. target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
  321. if ( ! target_ha ) {
  322. DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
  323. ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
  324. return -ENXIO_ARP_REPLY;
  325. }
  326. }
  327. /* Construct new packet */
  328. iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  329. sender_pa = arp_sender_pa ( arphdr );
  330. target_pa = arp_target_pa ( arphdr );
  331. arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
  332. arphdr->ar_hln = sizeof ( ipoib->mac );
  333. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  334. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  335. memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
  336. memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
  337. if ( target_ha ) {
  338. memcpy ( arp_target_ha ( arphdr ), target_ha,
  339. sizeof ( *target_ha ) );
  340. }
  341. return 0;
  342. }
  343. /**
  344. * Translate transmitted packet
  345. *
  346. * @v netdev Network device
  347. * @v iobuf Packet to be transmitted (with no link-layer headers)
  348. * @v net_proto Network-layer protocol (in network byte order)
  349. * @ret rc Return status code
  350. */
  351. static int ipoib_translate_tx ( struct net_device *netdev,
  352. struct io_buffer *iobuf, uint16_t net_proto ) {
  353. switch ( net_proto ) {
  354. case htons ( ETH_P_ARP ) :
  355. return ipoib_translate_tx_arp ( netdev, iobuf );
  356. case htons ( ETH_P_IP ) :
  357. /* No translation needed */
  358. return 0;
  359. default:
  360. /* Cannot handle other traffic via eIPoIB */
  361. return -ENOTSUP;
  362. }
  363. }
  364. /**
  365. * Translate received ARP packet
  366. *
  367. * @v netdev Network device
  368. * @v iobuf Received packet (with no link-layer headers)
  369. * @v remac Constructed Remote Ethernet MAC
  370. * @ret rc Return status code
  371. */
  372. static int ipoib_translate_rx_arp ( struct net_device *netdev,
  373. struct io_buffer *iobuf,
  374. struct ipoib_remac *remac ) {
  375. struct ipoib_device *ipoib = netdev->priv;
  376. struct arphdr *arphdr = iobuf->data;
  377. void *sender_pa;
  378. void *target_pa;
  379. int rc;
  380. /* Do nothing unless ARP contains IPoIB link-layer addresses */
  381. if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
  382. return 0;
  383. /* Create REMAC cache entry */
  384. if ( ( rc = ipoib_map_remac ( ipoib, remac,
  385. arp_sender_ha ( arphdr ) ) ) != 0 ) {
  386. DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
  387. ipoib, strerror ( rc ) );
  388. return rc;
  389. }
  390. /* Construct new packet */
  391. sender_pa = arp_sender_pa ( arphdr );
  392. target_pa = arp_target_pa ( arphdr );
  393. arphdr->ar_hrd = htons ( ARPHRD_ETHER );
  394. arphdr->ar_hln = ETH_ALEN;
  395. memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
  396. memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
  397. memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
  398. memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
  399. if ( arphdr->ar_op == ARPOP_REPLY ) {
  400. /* Assume received replies were directed to us */
  401. memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
  402. }
  403. iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
  404. return 0;
  405. }
  406. /**
  407. * Translate received packet
  408. *
  409. * @v netdev Network device
  410. * @v iobuf Received packet (with no link-layer headers)
  411. * @v remac Constructed Remote Ethernet MAC
  412. * @v net_proto Network-layer protocol (in network byte order)
  413. * @ret rc Return status code
  414. */
  415. static int ipoib_translate_rx ( struct net_device *netdev,
  416. struct io_buffer *iobuf,
  417. struct ipoib_remac *remac,
  418. uint16_t net_proto ) {
  419. switch ( net_proto ) {
  420. case htons ( ETH_P_ARP ) :
  421. return ipoib_translate_rx_arp ( netdev, iobuf, remac );
  422. case htons ( ETH_P_IP ) :
  423. /* No translation needed */
  424. return 0;
  425. default:
  426. /* Cannot handle other traffic via eIPoIB */
  427. return -ENOTSUP;
  428. }
  429. }
  430. /****************************************************************************
  431. *
  432. * IPoIB network device
  433. *
  434. ****************************************************************************
  435. */
  436. /**
  437. * Transmit packet via IPoIB network device
  438. *
  439. * @v netdev Network device
  440. * @v iobuf I/O buffer
  441. * @ret rc Return status code
  442. */
  443. static int ipoib_transmit ( struct net_device *netdev,
  444. struct io_buffer *iobuf ) {
  445. struct ipoib_device *ipoib = netdev->priv;
  446. struct ib_device *ibdev = ipoib->ibdev;
  447. struct ethhdr *ethhdr;
  448. struct iphdr *iphdr;
  449. struct ipoib_hdr *ipoib_hdr;
  450. struct ipoib_remac *remac;
  451. struct ipoib_mac *mac;
  452. struct ib_address_vector *dest;
  453. struct ib_address_vector av;
  454. uint16_t net_proto;
  455. int rc;
  456. /* Sanity check */
  457. if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
  458. DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
  459. return -EINVAL;
  460. }
  461. /* Attempting transmission while link is down will put the
  462. * queue pair into an error state, so don't try it.
  463. */
  464. if ( ! ib_link_ok ( ibdev ) )
  465. return -ENETUNREACH;
  466. /* Strip eIPoIB header */
  467. ethhdr = iobuf->data;
  468. remac = ( ( struct ipoib_remac * ) ethhdr->h_dest );
  469. net_proto = ethhdr->h_protocol;
  470. iob_pull ( iobuf, sizeof ( *ethhdr ) );
  471. /* Identify destination address */
  472. if ( is_multicast_ether_addr ( remac ) ) {
  473. /* Transmit multicasts as broadcasts, for simplicity */
  474. dest = &ipoib->broadcast.av;
  475. } else if ( ( mac = ipoib_find_remac ( ipoib, remac ) ) ) {
  476. /* Construct address vector from IPoIB MAC */
  477. dest = &av;
  478. memset ( dest, 0, sizeof ( *dest ) );
  479. dest->qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
  480. dest->qkey = ipoib->broadcast.av.qkey;
  481. dest->gid_present = 1;
  482. memcpy ( &dest->gid, &mac->gid, sizeof ( dest->gid ) );
  483. if ( ( rc = ib_resolve_path ( ibdev, dest ) ) != 0 ) {
  484. /* Path not resolved yet */
  485. return rc;
  486. }
  487. } else {
  488. /* Generate a new ARP request (if possible) to trigger
  489. * population of the REMAC cache entry.
  490. */
  491. if ( ( net_proto != htons ( ETH_P_IP ) ) ||
  492. ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
  493. DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
  494. "packet type %04x\n", ipoib,
  495. eth_ntoa ( ethhdr->h_dest ),
  496. ntohs ( net_proto ) );
  497. return -ENXIO_NON_IPV4;
  498. }
  499. iphdr = iobuf->data;
  500. if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
  501. &iphdr->dest, &iphdr->src ) ) !=0){
  502. DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
  503. ipoib, eth_ntoa ( ethhdr->h_dest ),
  504. inet_ntoa ( iphdr->dest ) );
  505. DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
  506. strerror ( rc ) );
  507. return rc;
  508. }
  509. DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
  510. eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
  511. DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
  512. return -ENXIO_ARP_SENT;
  513. }
  514. /* Translate packet if applicable */
  515. if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
  516. return rc;
  517. /* Prepend real IPoIB header */
  518. ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
  519. ipoib_hdr->proto = net_proto;
  520. ipoib_hdr->reserved = 0;
  521. /* Transmit packet */
  522. return ib_post_send ( ibdev, ipoib->qp, dest, iobuf );
  523. }
  524. /**
  525. * Handle IPoIB send completion
  526. *
  527. * @v ibdev Infiniband device
  528. * @v qp Queue pair
  529. * @v iobuf I/O buffer
  530. * @v rc Completion status code
  531. */
  532. static void ipoib_complete_send ( struct ib_device *ibdev __unused,
  533. struct ib_queue_pair *qp,
  534. struct io_buffer *iobuf, int rc ) {
  535. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  536. netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
  537. }
  538. /**
  539. * Handle IPoIB receive completion
  540. *
  541. * @v ibdev Infiniband device
  542. * @v qp Queue pair
  543. * @v dest Destination address vector, or NULL
  544. * @v source Source address vector, or NULL
  545. * @v iobuf I/O buffer
  546. * @v rc Completion status code
  547. */
  548. static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
  549. struct ib_queue_pair *qp,
  550. struct ib_address_vector *dest,
  551. struct ib_address_vector *source,
  552. struct io_buffer *iobuf, int rc ) {
  553. struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
  554. struct net_device *netdev = ipoib->netdev;
  555. struct ipoib_hdr *ipoib_hdr;
  556. struct ethhdr *ethhdr;
  557. struct ipoib_remac remac;
  558. uint16_t net_proto;
  559. /* Record errors */
  560. if ( rc != 0 ) {
  561. netdev_rx_err ( netdev, iobuf, rc );
  562. return;
  563. }
  564. /* Sanity check */
  565. if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
  566. DBGC ( ipoib, "IPoIB %p received packet too short to "
  567. "contain IPoIB header\n", ipoib );
  568. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  569. netdev_rx_err ( netdev, iobuf, -EIO );
  570. return;
  571. }
  572. if ( ! source ) {
  573. DBGC ( ipoib, "IPoIB %p received packet without address "
  574. "vector\n", ipoib );
  575. netdev_rx_err ( netdev, iobuf, -ENOTTY );
  576. return;
  577. }
  578. /* Strip real IPoIB header */
  579. ipoib_hdr = iobuf->data;
  580. net_proto = ipoib_hdr->proto;
  581. iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
  582. /* Construct source address from remote QPN and LID */
  583. remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
  584. remac.lid = htons ( source->lid );
  585. /* Translate packet if applicable */
  586. if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
  587. net_proto ) ) != 0 ) {
  588. netdev_rx_err ( netdev, iobuf, rc );
  589. return;
  590. }
  591. /* Prepend eIPoIB header */
  592. ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
  593. memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
  594. ethhdr->h_protocol = net_proto;
  595. /* Construct destination address */
  596. if ( dest->gid_present && IB_GID_MULTICAST ( &dest->gid ) ) {
  597. /* Multicast GID: use the Ethernet broadcast address */
  598. memcpy ( &ethhdr->h_dest, eth_broadcast,
  599. sizeof ( ethhdr->h_dest ) );
  600. } else {
  601. /* Assume destination address is local Ethernet MAC */
  602. memcpy ( &ethhdr->h_dest, netdev->ll_addr,
  603. sizeof ( ethhdr->h_dest ) );
  604. }
  605. /* Hand off to network layer */
  606. netdev_rx ( netdev, iobuf );
  607. }
  608. /** IPoIB completion operations */
  609. static struct ib_completion_queue_operations ipoib_cq_op = {
  610. .complete_send = ipoib_complete_send,
  611. .complete_recv = ipoib_complete_recv,
  612. };
  613. /**
  614. * Allocate IPoIB receive I/O buffer
  615. *
  616. * @v len Length of buffer
  617. * @ret iobuf I/O buffer, or NULL
  618. *
  619. * Some Infiniband hardware requires 2kB alignment of receive buffers
  620. * and provides no way to disable header separation. The result is
  621. * that there are only four bytes of link-layer header (the real IPoIB
  622. * header) before the payload. This is not sufficient space to insert
  623. * an eIPoIB link-layer pseudo-header.
  624. *
  625. * We therefore allocate I/O buffers offset to start slightly before
  626. * the natural alignment boundary, in order to allow sufficient space.
  627. */
  628. static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
  629. struct io_buffer *iobuf;
  630. size_t reserve_len;
  631. /* Calculate additional length required at start of buffer */
  632. reserve_len = ( sizeof ( struct ethhdr ) -
  633. sizeof ( struct ipoib_hdr ) );
  634. /* Allocate buffer */
  635. iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
  636. if ( iobuf ) {
  637. iob_reserve ( iobuf, reserve_len );
  638. }
  639. return iobuf;
  640. }
  641. /** IPoIB queue pair operations */
  642. static struct ib_queue_pair_operations ipoib_qp_op = {
  643. .alloc_iob = ipoib_alloc_iob,
  644. };
  645. /**
  646. * Poll IPoIB network device
  647. *
  648. * @v netdev Network device
  649. */
  650. static void ipoib_poll ( struct net_device *netdev ) {
  651. struct ipoib_device *ipoib = netdev->priv;
  652. struct ib_device *ibdev = ipoib->ibdev;
  653. /* Poll Infiniband device */
  654. ib_poll_eq ( ibdev );
  655. /* Poll the retry timers (required for IPoIB multicast join) */
  656. retry_poll();
  657. }
  658. /**
  659. * Handle IPv4 broadcast multicast group join completion
  660. *
  661. * @v membership Multicast group membership
  662. * @v rc Status code
  663. */
  664. void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
  665. struct ipoib_device *ipoib = container_of ( membership,
  666. struct ipoib_device,
  667. broadcast.membership );
  668. /* Record join status as link status */
  669. netdev_link_err ( ipoib->netdev, rc );
  670. }
  671. /**
  672. * Join IPv4 broadcast multicast group
  673. *
  674. * @v ipoib IPoIB device
  675. * @ret rc Return status code
  676. */
  677. static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
  678. int rc;
  679. /* Join multicast group */
  680. if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
  681. &ipoib->broadcast.membership,
  682. &ipoib->broadcast.av, 0,
  683. ipoib_join_complete ) ) != 0 ) {
  684. DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
  685. ipoib, strerror ( rc ) );
  686. return rc;
  687. }
  688. return 0;
  689. }
  690. /**
  691. * Leave IPv4 broadcast multicast group
  692. *
  693. * @v ipoib IPoIB device
  694. */
  695. static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
  696. /* Leave multicast group */
  697. ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
  698. &ipoib->broadcast.membership );
  699. }
  700. /**
  701. * Handle link status change
  702. *
  703. * @v ipoib IPoIB device
  704. */
  705. static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
  706. struct ib_device *ibdev = ipoib->ibdev;
  707. struct net_device *netdev = ipoib->netdev;
  708. int rc;
  709. /* Leave existing broadcast group */
  710. if ( ipoib->qp )
  711. ipoib_leave_broadcast_group ( ipoib );
  712. /* Update MAC address based on potentially-new GID prefix */
  713. memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
  714. sizeof ( ipoib->mac.gid.s.prefix ) );
  715. /* Update broadcast MAC GID based on potentially-new partition key */
  716. ipoib->broadcast.mac.gid.words[2] =
  717. htons ( ibdev->pkey | IB_PKEY_FULL );
  718. /* Construct broadcast address vector from broadcast MAC address */
  719. memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) );
  720. ipoib->broadcast.av.qpn = IB_QPN_BROADCAST;
  721. ipoib->broadcast.av.gid_present = 1;
  722. memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid,
  723. sizeof ( ipoib->broadcast.av.gid ) );
  724. /* Set net device link state to reflect Infiniband link state */
  725. rc = ib_link_rc ( ibdev );
  726. netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
  727. /* Join new broadcast group */
  728. if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
  729. ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
  730. DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
  731. "%s\n", ipoib, strerror ( rc ) );
  732. netdev_link_err ( netdev, rc );
  733. return;
  734. }
  735. }
  736. /**
  737. * Open IPoIB network device
  738. *
  739. * @v netdev Network device
  740. * @ret rc Return status code
  741. */
  742. static int ipoib_open ( struct net_device *netdev ) {
  743. struct ipoib_device *ipoib = netdev->priv;
  744. struct ib_device *ibdev = ipoib->ibdev;
  745. int rc;
  746. /* Open IB device */
  747. if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
  748. DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
  749. ipoib, strerror ( rc ) );
  750. goto err_ib_open;
  751. }
  752. /* Allocate completion queue */
  753. if ( ( rc = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op,
  754. &ipoib->cq ) ) != 0 ) {
  755. DBGC ( ipoib, "IPoIB %p could not create completion queue: "
  756. "%s\n", ipoib, strerror ( rc ) );
  757. goto err_create_cq;
  758. }
  759. /* Allocate queue pair */
  760. if ( ( rc = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
  761. ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
  762. &ipoib_qp_op, netdev->name,
  763. &ipoib->qp ) ) != 0 ) {
  764. DBGC ( ipoib, "IPoIB %p could not create queue pair: %s\n",
  765. ipoib, strerror ( rc ) );
  766. goto err_create_qp;
  767. }
  768. ib_qp_set_ownerdata ( ipoib->qp, ipoib );
  769. /* Update MAC address with QPN */
  770. ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
  771. /* Fill receive rings */
  772. ib_refill_recv ( ibdev, ipoib->qp );
  773. /* Fake a link status change to join the broadcast group */
  774. ipoib_link_state_changed ( ipoib );
  775. return 0;
  776. ib_destroy_qp ( ibdev, ipoib->qp );
  777. err_create_qp:
  778. ib_destroy_cq ( ibdev, ipoib->cq );
  779. err_create_cq:
  780. ib_close ( ibdev );
  781. err_ib_open:
  782. return rc;
  783. }
  784. /**
  785. * Close IPoIB network device
  786. *
  787. * @v netdev Network device
  788. */
  789. static void ipoib_close ( struct net_device *netdev ) {
  790. struct ipoib_device *ipoib = netdev->priv;
  791. struct ib_device *ibdev = ipoib->ibdev;
  792. /* Flush REMAC cache */
  793. ipoib_flush_remac ( ipoib );
  794. /* Leave broadcast group */
  795. ipoib_leave_broadcast_group ( ipoib );
  796. /* Remove QPN from MAC address */
  797. ipoib->mac.flags__qpn = 0;
  798. /* Tear down the queues */
  799. ib_destroy_qp ( ibdev, ipoib->qp );
  800. ipoib->qp = NULL;
  801. ib_destroy_cq ( ibdev, ipoib->cq );
  802. ipoib->cq = NULL;
  803. /* Close IB device */
  804. ib_close ( ibdev );
  805. }
  806. /** IPoIB network device operations */
  807. static struct net_device_operations ipoib_operations = {
  808. .open = ipoib_open,
  809. .close = ipoib_close,
  810. .transmit = ipoib_transmit,
  811. .poll = ipoib_poll,
  812. };
  813. /**
  814. * Probe IPoIB device
  815. *
  816. * @v ibdev Infiniband device
  817. * @ret rc Return status code
  818. */
  819. static int ipoib_probe ( struct ib_device *ibdev ) {
  820. struct net_device *netdev;
  821. struct ipoib_device *ipoib;
  822. int rc;
  823. /* Allocate network device */
  824. netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
  825. if ( ! netdev )
  826. return -ENOMEM;
  827. netdev_init ( netdev, &ipoib_operations );
  828. ipoib = netdev->priv;
  829. netdev->dev = ibdev->dev;
  830. memset ( ipoib, 0, sizeof ( *ipoib ) );
  831. ipoib->netdev = netdev;
  832. ipoib->ibdev = ibdev;
  833. INIT_LIST_HEAD ( &ipoib->peers );
  834. /* Extract hardware address */
  835. memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
  836. sizeof ( ibdev->gid.s.guid ) );
  837. memcpy ( netdev->ll_addr, ibdev->lemac, ETH_ALEN );
  838. /* Set local MAC address */
  839. memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
  840. sizeof ( ipoib->mac.gid.s.guid ) );
  841. /* Set default broadcast MAC address */
  842. memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast,
  843. sizeof ( ipoib->broadcast.mac ) );
  844. /* Add to list of IPoIB devices */
  845. list_add_tail ( &ipoib->list, &ipoib_devices );
  846. /* Register network device */
  847. if ( ( rc = register_netdev ( netdev ) ) != 0 )
  848. goto err_register_netdev;
  849. return 0;
  850. unregister_netdev ( netdev );
  851. err_register_netdev:
  852. list_del ( &ipoib->list );
  853. netdev_nullify ( netdev );
  854. netdev_put ( netdev );
  855. return rc;
  856. }
  857. /**
  858. * Handle device or link status change
  859. *
  860. * @v ibdev Infiniband device
  861. */
  862. static void ipoib_notify ( struct ib_device *ibdev ) {
  863. struct ipoib_device *ipoib;
  864. /* Handle link status change for any attached IPoIB devices */
  865. list_for_each_entry ( ipoib, &ipoib_devices, list ) {
  866. if ( ipoib->ibdev != ibdev )
  867. continue;
  868. ipoib_link_state_changed ( ipoib );
  869. }
  870. }
  871. /**
  872. * Remove IPoIB device
  873. *
  874. * @v ibdev Infiniband device
  875. */
  876. static void ipoib_remove ( struct ib_device *ibdev ) {
  877. struct ipoib_device *ipoib;
  878. struct ipoib_device *tmp;
  879. struct net_device *netdev;
  880. /* Remove any attached IPoIB devices */
  881. list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) {
  882. if ( ipoib->ibdev != ibdev )
  883. continue;
  884. netdev = ipoib->netdev;
  885. unregister_netdev ( netdev );
  886. list_del ( &ipoib->list );
  887. netdev_nullify ( netdev );
  888. netdev_put ( netdev );
  889. }
  890. }
  891. /** IPoIB driver */
  892. struct ib_driver ipoib_driver __ib_driver = {
  893. .name = "IPoIB",
  894. .probe = ipoib_probe,
  895. .notify = ipoib_notify,
  896. .remove = ipoib_remove,
  897. };
  898. /**
  899. * Find IPoIB network device
  900. *
  901. * @v ibdev Infiniband device
  902. * @ret netdev IPoIB network device, or NULL if not found
  903. */
  904. struct net_device * ipoib_netdev ( struct ib_device *ibdev ) {
  905. struct ipoib_device *ipoib;
  906. /* Find matching IPoIB device */
  907. list_for_each_entry ( ipoib, &ipoib_devices, list ) {
  908. if ( ipoib->ibdev != ibdev )
  909. continue;
  910. return ipoib->netdev;
  911. }
  912. return NULL;
  913. }