You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019
  1. /*
  2. * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public License as
  6. * published by the Free Software Foundation; either version 2 of the
  7. * License, or any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17. */
  18. #include <stdint.h>
  19. #include <stdio.h>
  20. #include <unistd.h>
  21. #include <string.h>
  22. #include <byteswap.h>
  23. #include <errno.h>
  24. #include <gpxe/if_arp.h>
  25. #include <gpxe/iobuf.h>
  26. #include <gpxe/netdevice.h>
  27. #include <gpxe/infiniband.h>
  28. #include <gpxe/ipoib.h>
  29. /** @file
  30. *
  31. * IP over Infiniband
  32. */
  33. /** IPoIB MTU; also the size of each receive buffer allocated by ipoib_refill_recv() */
  34. #define IPOIB_MTU 2048
  35. /** Number of IPoIB data send work queue entries (used by ipoib_open() for the data queue set) */
  36. #define IPOIB_DATA_NUM_SEND_WQES 2
  37. /** Number of IPoIB data receive work queue entries; becomes the data queue set's maximum receive fill level */
  38. #define IPOIB_DATA_NUM_RECV_WQES 4
  39. /** Number of IPoIB data completion entries (one completion queue serves both send and receive) */
  40. #define IPOIB_DATA_NUM_CQES 8
  41. /** Number of IPoIB metadata send work queue entries (used by ipoib_open() for the metadata queue set) */
  42. #define IPOIB_META_NUM_SEND_WQES 2
  43. /** Number of IPoIB metadata receive work queue entries; becomes the metadata queue set's maximum receive fill level */
  44. #define IPOIB_META_NUM_RECV_WQES 2
  45. /** Number of IPoIB metadata completion entries (one completion queue serves both send and receive) */
  46. #define IPOIB_META_NUM_CQES 8
  47. /** An IPoIB queue set: one completion queue plus the queue pair that uses it for both send and receive */
  48. struct ipoib_queue_set {
  49. /** Completion queue (NULL while the queue set is not created) */
  50. struct ib_completion_queue *cq;
  51. /** Queue pair (NULL while the queue set is not created) */
  52. struct ib_queue_pair *qp;
  53. /** Receive work queue fill level: buffers currently posted and not yet completed */
  54. unsigned int recv_fill;
  55. /** Receive work queue maximum fill level: ipoib_refill_recv() posts buffers until recv_fill reaches this */
  56. unsigned int recv_max_fill;
  57. };
  58. /** An IPoIB device (stored as the private data of its network device) */
  59. struct ipoib_device {
  60. /** Network device */
  61. struct net_device *netdev;
  62. /** Underlying Infiniband device */
  63. struct ib_device *ibdev;
  64. /** Data queue set (carries network packets) */
  65. struct ipoib_queue_set data;
  66. /** Metadata queue set (carries path record and multicast member record MADs) */
  67. struct ipoib_queue_set meta;
  68. /** Broadcast GID (IPv4 broadcast GID with the partition key filled in) */
  69. struct ib_gid broadcast_gid;
  70. /** Broadcast LID (taken from the multicast member record reply) */
  71. unsigned int broadcast_lid;
  72. /** Data queue key (taken from the multicast member record reply) */
  73. unsigned long data_qkey;
  74. /** Attached to multicast group
  75. *
  76. * This flag indicates whether or not we have attached our
  77. * data queue pair to the broadcast multicast GID.
  78. */
  79. int broadcast_attached;
  80. };
  81. /**
  82. * IPoIB path cache entry
  83. *
  84. * This serves a similar role to the ARP cache for Ethernet. (ARP
  85. * *is* used on IPoIB; we have two caches to maintain.)
   *
   * Entries are filled in by ipoib_recv_path_record() and looked up
   * by GID in ipoib_find_cached_path().
  86. */
  87. struct ipoib_cached_path {
  88. /** Destination GID (lookup key) */
  89. struct ib_gid gid;
  90. /** Destination LID (host byte order) */
  91. unsigned int dlid;
  92. /** Service level (low 4 bits of the path record's reserved__sl field) */
  93. unsigned int sl;
  94. /** Rate (low 6 bits of the path record's rate_selector__rate field) */
  95. unsigned int rate;
  96. };
  97. /** Number of IPoIB path cache entries */
  98. #define IPOIB_NUM_CACHED_PATHS 2
  99. /** IPoIB path cache (shared by all IPoIB devices; zero-initialised) */
  100. static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
  101. /** Oldest IPoIB path cache entry index; this is the entry overwritten by the next path record reply */
  102. static unsigned int ipoib_path_cache_idx = 0;
  103. /** TID half used to identify get path record replies (stored in tid[0] of the request) */
  104. #define IPOIB_TID_GET_PATH_REC 0x11111111UL
  105. /** TID half used to identify multicast member record replies (stored in tid[0] of the request) */
  106. #define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
  107. /** IPoIB metadata TID: counter stored in tid[1] and incremented for every metadata request sent */
  108. static uint32_t ipoib_meta_tid = 0;
  109. /** IPv4 broadcast GID template; ipoib_set_ib_params() copies it and overwrites 16-bit word 2 with the partition key */
  110. static const struct ib_gid ipv4_broadcast_gid = {
  111. { { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  112. 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
  113. };
  114. /** Maximum time we will wait for the broadcast join to succeed (NOTE(review): not referenced in the visible part of this file) */
  115. #define IPOIB_JOIN_MAX_DELAY_MS 1000
  116. /****************************************************************************
  117. *
  118. * IPoIB link layer
  119. *
  120. ****************************************************************************
  121. */
  122. /** Broadcast QPN used in IPoIB MAC addresses
  123. *
  124. * This is a guaranteed invalid real QPN
   *
   * ipoib_transmit() compares the destination QPN against this value
   * to recognise broadcast packets.
  125. */
  126. #define IPOIB_BROADCAST_QPN 0xffffffffUL
  127. /** Broadcast IPoIB address: only the QPN is set; the GID portion is left zero-initialised */
  128. static struct ipoib_mac ipoib_broadcast = {
  129. .qpn = ntohl ( IPOIB_BROADCAST_QPN ),
  130. };
  131. /**
  132. * Add IPoIB link-layer header
  133. *
  134. * @v iobuf I/O buffer
  135. * @v netdev Network device
  136. * @v net_protocol Network-layer protocol
  137. * @v ll_dest Link-layer destination address
  138. */
  139. static int ipoib_push ( struct io_buffer *iobuf,
  140. struct net_device *netdev __unused,
  141. struct net_protocol *net_protocol,
  142. const void *ll_dest ) {
  143. struct ipoib_hdr *ipoib_hdr =
  144. iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
  145. /* Build IPoIB header */
  146. memcpy ( &ipoib_hdr->pseudo.peer, ll_dest,
  147. sizeof ( ipoib_hdr->pseudo.peer ) );
  148. ipoib_hdr->real.proto = net_protocol->net_proto;
  149. ipoib_hdr->real.reserved = 0;
  150. return 0;
  151. }
  152. /**
  153. * Remove IPoIB link-layer header
  154. *
  155. * @v iobuf I/O buffer
  156. * @v netdev Network device
  157. * @v net_proto Network-layer protocol, in network-byte order
  158. * @v ll_source Source link-layer address
  159. * @ret rc Return status code
  160. */
  161. static int ipoib_pull ( struct io_buffer *iobuf,
  162. struct net_device *netdev __unused,
  163. uint16_t *net_proto, const void **ll_source ) {
  164. struct ipoib_hdr *ipoib_hdr = iobuf->data;
  165. /* Sanity check */
  166. if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
  167. DBG ( "IPoIB packet too short for link-layer header\n" );
  168. DBG_HD ( iobuf->data, iob_len ( iobuf ) );
  169. return -EINVAL;
  170. }
  171. /* Strip off IPoIB header */
  172. iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
  173. /* Fill in required fields */
  174. *net_proto = ipoib_hdr->real.proto;
  175. *ll_source = &ipoib_hdr->pseudo.peer;
  176. return 0;
  177. }
  178. /**
  179. * Transcribe IPoIB address
  180. *
  181. * @v ll_addr Link-layer address
  182. * @ret string Link-layer address in human-readable format
  183. */
  184. const char * ipoib_ntoa ( const void *ll_addr ) {
  185. static char buf[45];
  186. const struct ipoib_mac *mac = ll_addr;
  187. snprintf ( buf, sizeof ( buf ), "%08lx:%08lx:%08lx:%08lx:%08lx",
  188. htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
  189. htonl ( mac->gid.u.dwords[1] ),
  190. htonl ( mac->gid.u.dwords[2] ),
  191. htonl ( mac->gid.u.dwords[3] ) );
  192. return buf;
  193. }
  194. /** IPoIB protocol */
  195. struct ll_protocol ipoib_protocol __ll_protocol = {
  196. .name = "IPoIB",
  197. .ll_proto = htons ( ARPHRD_INFINIBAND ),
  198. .ll_addr_len = IPOIB_ALEN,
  199. .ll_header_len = IPOIB_HLEN,
  200. .ll_broadcast = ( uint8_t * ) &ipoib_broadcast,
  201. .push = ipoib_push,
  202. .pull = ipoib_pull,
  203. .ntoa = ipoib_ntoa,
  204. };
  205. /****************************************************************************
  206. *
  207. * IPoIB network device
  208. *
  209. ****************************************************************************
  210. */
  211. /**
  212. * Destroy queue set
  213. *
  214. * @v ipoib IPoIB device
  215. * @v qset Queue set
  216. */
  217. static void ipoib_destroy_qset ( struct ipoib_device *ipoib,
  218. struct ipoib_queue_set *qset ) {
  219. struct ib_device *ibdev = ipoib->ibdev;
  220. if ( qset->qp )
  221. ib_destroy_qp ( ibdev, qset->qp );
  222. if ( qset->cq )
  223. ib_destroy_cq ( ibdev, qset->cq );
  224. memset ( qset, 0, sizeof ( *qset ) );
  225. }
  226. /**
  227. * Create queue set
  228. *
  229. * @v ipoib IPoIB device
  230. * @v qset Queue set
  231. * @ret rc Return status code
  232. */
  233. static int ipoib_create_qset ( struct ipoib_device *ipoib,
  234. struct ipoib_queue_set *qset,
  235. unsigned int num_cqes,
  236. unsigned int num_send_wqes,
  237. unsigned int num_recv_wqes,
  238. unsigned long qkey ) {
  239. struct ib_device *ibdev = ipoib->ibdev;
  240. int rc;
  241. /* Sanity check */
  242. assert ( qset->cq == NULL );
  243. assert ( qset->qp == NULL );
  244. /* Store queue parameters */
  245. qset->recv_max_fill = num_recv_wqes;
  246. /* Allocate completion queue */
  247. qset->cq = ib_create_cq ( ibdev, num_cqes );
  248. if ( ! qset->cq ) {
  249. DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
  250. ipoib );
  251. rc = -ENOMEM;
  252. goto err;
  253. }
  254. /* Allocate queue pair */
  255. qset->qp = ib_create_qp ( ibdev, num_send_wqes, qset->cq,
  256. num_recv_wqes, qset->cq, qkey );
  257. if ( ! qset->qp ) {
  258. DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
  259. ipoib );
  260. rc = -ENOMEM;
  261. goto err;
  262. }
  263. ib_qp_set_ownerdata ( qset->qp, ipoib->netdev );
  264. return 0;
  265. err:
  266. ipoib_destroy_qset ( ipoib, qset );
  267. return rc;
  268. }
  269. /**
  270. * Find path cache entry by GID
  271. *
  272. * @v gid GID
  273. * @ret entry Path cache entry, or NULL
  274. */
  275. static struct ipoib_cached_path *
  276. ipoib_find_cached_path ( struct ib_gid *gid ) {
  277. struct ipoib_cached_path *path;
  278. unsigned int i;
  279. for ( i = 0 ; i < IPOIB_NUM_CACHED_PATHS ; i++ ) {
  280. path = &ipoib_path_cache[i];
  281. if ( memcmp ( &path->gid, gid, sizeof ( *gid ) ) == 0 )
  282. return path;
  283. }
  284. DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
  285. htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
  286. htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
  287. return NULL;
  288. }
  289. /**
  290. * Transmit path record request
  291. *
  292. * @v ipoib IPoIB device
  293. * @v gid Destination GID
  294. * @ret rc Return status code
  295. */
  296. static int ipoib_get_path_record ( struct ipoib_device *ipoib,
  297. struct ib_gid *gid ) {
  298. struct ib_device *ibdev = ipoib->ibdev;
  299. struct io_buffer *iobuf;
  300. struct ib_mad_path_record *path_record;
  301. struct ib_address_vector av;
  302. int rc;
  303. /* Allocate I/O buffer */
  304. iobuf = alloc_iob ( sizeof ( *path_record ) );
  305. if ( ! iobuf )
  306. return -ENOMEM;
  307. iob_put ( iobuf, sizeof ( *path_record ) );
  308. path_record = iobuf->data;
  309. memset ( path_record, 0, sizeof ( *path_record ) );
  310. /* Construct path record request */
  311. path_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
  312. path_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
  313. path_record->mad_hdr.class_version = 2;
  314. path_record->mad_hdr.method = IB_MGMT_METHOD_GET;
  315. path_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_PATH_REC );
  316. path_record->mad_hdr.tid[0] = IPOIB_TID_GET_PATH_REC;
  317. path_record->mad_hdr.tid[1] = ipoib_meta_tid++;
  318. path_record->sa_hdr.comp_mask[1] =
  319. htonl ( IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID );
  320. memcpy ( &path_record->dgid, gid, sizeof ( path_record->dgid ) );
  321. memcpy ( &path_record->sgid, &ibdev->port_gid,
  322. sizeof ( path_record->sgid ) );
  323. /* Construct address vector */
  324. memset ( &av, 0, sizeof ( av ) );
  325. av.dlid = ibdev->sm_lid;
  326. av.dest_qp = IB_SA_QPN;
  327. av.qkey = IB_GLOBAL_QKEY;
  328. /* Post send request */
  329. if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
  330. iobuf ) ) != 0 ) {
  331. DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
  332. ipoib, strerror ( rc ) );
  333. free_iob ( iobuf );
  334. return rc;
  335. }
  336. return 0;
  337. }
  338. /**
  339. * Transmit multicast group membership request
  340. *
  341. * @v ipoib IPoIB device
  342. * @v gid Multicast GID
  343. * @v join Join (rather than leave) group
  344. * @ret rc Return status code
  345. */
  346. static int ipoib_mc_member_record ( struct ipoib_device *ipoib,
  347. struct ib_gid *gid, int join ) {
  348. struct ib_device *ibdev = ipoib->ibdev;
  349. struct io_buffer *iobuf;
  350. struct ib_mad_mc_member_record *mc_member_record;
  351. struct ib_address_vector av;
  352. int rc;
  353. /* Allocate I/O buffer */
  354. iobuf = alloc_iob ( sizeof ( *mc_member_record ) );
  355. if ( ! iobuf )
  356. return -ENOMEM;
  357. iob_put ( iobuf, sizeof ( *mc_member_record ) );
  358. mc_member_record = iobuf->data;
  359. memset ( mc_member_record, 0, sizeof ( *mc_member_record ) );
  360. /* Construct path record request */
  361. mc_member_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
  362. mc_member_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
  363. mc_member_record->mad_hdr.class_version = 2;
  364. mc_member_record->mad_hdr.method =
  365. ( join ? IB_MGMT_METHOD_SET : IB_MGMT_METHOD_DELETE );
  366. mc_member_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_MC_MEMBER_REC );
  367. mc_member_record->mad_hdr.tid[0] = IPOIB_TID_MC_MEMBER_REC;
  368. mc_member_record->mad_hdr.tid[1] = ipoib_meta_tid++;
  369. mc_member_record->sa_hdr.comp_mask[1] =
  370. htonl ( IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
  371. IB_SA_MCMEMBER_REC_JOIN_STATE );
  372. mc_member_record->scope__join_state = 1;
  373. memcpy ( &mc_member_record->mgid, gid,
  374. sizeof ( mc_member_record->mgid ) );
  375. memcpy ( &mc_member_record->port_gid, &ibdev->port_gid,
  376. sizeof ( mc_member_record->port_gid ) );
  377. /* Construct address vector */
  378. memset ( &av, 0, sizeof ( av ) );
  379. av.dlid = ibdev->sm_lid;
  380. av.dest_qp = IB_SA_QPN;
  381. av.qkey = IB_GLOBAL_QKEY;
  382. /* Post send request */
  383. if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
  384. iobuf ) ) != 0 ) {
  385. DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
  386. ipoib, strerror ( rc ) );
  387. free_iob ( iobuf );
  388. return rc;
  389. }
  390. return 0;
  391. }
  392. /**
  393. * Transmit packet via IPoIB network device
  394. *
  395. * @v netdev Network device
  396. * @v iobuf I/O buffer
  397. * @ret rc Return status code
  398. */
  399. static int ipoib_transmit ( struct net_device *netdev,
  400. struct io_buffer *iobuf ) {
  401. struct ipoib_device *ipoib = netdev->priv;
  402. struct ib_device *ibdev = ipoib->ibdev;
  403. struct ipoib_pseudo_hdr *ipoib_pshdr = iobuf->data;
  404. struct ib_address_vector av;
  405. struct ib_gid *gid;
  406. struct ipoib_cached_path *path;
  407. int rc;
  408. /* Sanity check */
  409. if ( iob_len ( iobuf ) < sizeof ( *ipoib_pshdr ) ) {
  410. DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
  411. return -EINVAL;
  412. }
  413. iob_pull ( iobuf, ( sizeof ( *ipoib_pshdr ) ) );
  414. /* Attempting transmission while link is down will put the
  415. * queue pair into an error state, so don't try it.
  416. */
  417. if ( ! ibdev->link_up )
  418. return -ENETUNREACH;
  419. /* Construct address vector */
  420. memset ( &av, 0, sizeof ( av ) );
  421. av.qkey = IB_GLOBAL_QKEY;
  422. av.gid_present = 1;
  423. if ( ipoib_pshdr->peer.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
  424. /* Broadcast address */
  425. av.dest_qp = IB_BROADCAST_QPN;
  426. av.dlid = ipoib->broadcast_lid;
  427. gid = &ipoib->broadcast_gid;
  428. } else {
  429. /* Unicast - look in path cache */
  430. path = ipoib_find_cached_path ( &ipoib_pshdr->peer.gid );
  431. if ( ! path ) {
  432. /* No path entry - get path record */
  433. rc = ipoib_get_path_record ( ipoib,
  434. &ipoib_pshdr->peer.gid );
  435. netdev_tx_complete ( netdev, iobuf );
  436. return rc;
  437. }
  438. av.dest_qp = ntohl ( ipoib_pshdr->peer.qpn );
  439. av.dlid = path->dlid;
  440. av.rate = path->rate;
  441. av.sl = path->sl;
  442. gid = &ipoib_pshdr->peer.gid;
  443. }
  444. memcpy ( &av.gid, gid, sizeof ( av.gid ) );
  445. return ib_post_send ( ibdev, ipoib->data.qp, &av, iobuf );
  446. }
  447. /**
  448. * Handle IPoIB data send completion
  449. *
  450. * @v ibdev Infiniband device
  451. * @v qp Queue pair
  452. * @v completion Completion
  453. * @v iobuf I/O buffer
  454. */
  455. static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
  456. struct ib_queue_pair *qp,
  457. struct ib_completion *completion,
  458. struct io_buffer *iobuf ) {
  459. struct net_device *netdev = ib_qp_get_ownerdata ( qp );
  460. netdev_tx_complete_err ( netdev, iobuf,
  461. ( completion->syndrome ? -EIO : 0 ) );
  462. }
  463. /**
  464. * Handle IPoIB data receive completion
  465. *
  466. * @v ibdev Infiniband device
  467. * @v qp Queue pair
  468. * @v completion Completion
  469. * @v iobuf I/O buffer
  470. */
  471. static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
  472. struct ib_queue_pair *qp,
  473. struct ib_completion *completion,
  474. struct io_buffer *iobuf ) {
  475. struct net_device *netdev = ib_qp_get_ownerdata ( qp );
  476. struct ipoib_device *ipoib = netdev->priv;
  477. struct ipoib_pseudo_hdr *ipoib_pshdr;
  478. if ( completion->syndrome ) {
  479. netdev_rx_err ( netdev, iobuf, -EIO );
  480. goto done;
  481. }
  482. iob_put ( iobuf, completion->len );
  483. if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
  484. DBGC ( ipoib, "IPoIB %p received data packet too short to "
  485. "contain GRH\n", ipoib );
  486. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  487. netdev_rx_err ( netdev, iobuf, -EIO );
  488. goto done;
  489. }
  490. iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
  491. if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
  492. DBGC ( ipoib, "IPoIB %p received data packet too short to "
  493. "contain IPoIB header\n", ipoib );
  494. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  495. netdev_rx_err ( netdev, iobuf, -EIO );
  496. goto done;
  497. }
  498. ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
  499. /* FIXME: fill in a MAC address for the sake of AoE! */
  500. netdev_rx ( netdev, iobuf );
  501. done:
  502. ipoib->data.recv_fill--;
  503. }
  504. /**
  505. * Handle IPoIB metadata send completion
  506. *
  507. * @v ibdev Infiniband device
  508. * @v qp Queue pair
  509. * @v completion Completion
  510. * @v iobuf I/O buffer
  511. */
  512. static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
  513. struct ib_queue_pair *qp,
  514. struct ib_completion *completion,
  515. struct io_buffer *iobuf ) {
  516. struct net_device *netdev = ib_qp_get_ownerdata ( qp );
  517. struct ipoib_device *ipoib = netdev->priv;
  518. if ( completion->syndrome ) {
  519. DBGC ( ipoib, "IPoIB %p metadata TX completion error %x\n",
  520. ipoib, completion->syndrome );
  521. }
  522. free_iob ( iobuf );
  523. }
  524. /**
  525. * Handle received IPoIB path record
  526. *
  527. * @v ipoib IPoIB device
  528. * @v path_record Path record
  529. */
  530. static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
  531. struct ib_mad_path_record *path_record ) {
  532. struct ipoib_cached_path *path;
  533. /* Update path cache entry */
  534. path = &ipoib_path_cache[ipoib_path_cache_idx];
  535. memcpy ( &path->gid, &path_record->dgid, sizeof ( path->gid ) );
  536. path->dlid = ntohs ( path_record->dlid );
  537. path->sl = ( path_record->reserved__sl & 0x0f );
  538. path->rate = ( path_record->rate_selector__rate & 0x3f );
  539. DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
  540. htonl ( path->gid.u.dwords[0] ), htonl ( path->gid.u.dwords[1] ),
  541. htonl ( path->gid.u.dwords[2] ), htonl ( path->gid.u.dwords[3] ),
  542. path->dlid, path->sl, path->rate );
  543. /* Update path cache index */
  544. ipoib_path_cache_idx++;
  545. if ( ipoib_path_cache_idx == IPOIB_NUM_CACHED_PATHS )
  546. ipoib_path_cache_idx = 0;
  547. }
  548. /**
  549. * Handle received IPoIB multicast membership record
  550. *
  551. * @v ipoib IPoIB device
  552. * @v mc_member_record Multicast membership record
  553. */
  554. static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib,
  555. struct ib_mad_mc_member_record *mc_member_record ) {
  556. int joined;
  557. int rc;
  558. /* Record parameters */
  559. joined = ( mc_member_record->scope__join_state & 0x0f );
  560. ipoib->data_qkey = ntohl ( mc_member_record->qkey );
  561. ipoib->broadcast_lid = ntohs ( mc_member_record->mlid );
  562. DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n",
  563. ipoib, ( joined ? "joined" : "left" ), ipoib->data_qkey,
  564. ipoib->broadcast_lid );
  565. /* Update data queue pair qkey */
  566. if ( ( rc = ib_modify_qp ( ipoib->ibdev, ipoib->data.qp,
  567. IB_MODIFY_QKEY, ipoib->data_qkey ) ) != 0 ){
  568. DBGC ( ipoib, "IPoIB %p could not update data qkey: %s\n",
  569. ipoib, strerror ( rc ) );
  570. return;
  571. }
  572. }
  573. /**
  574. * Handle IPoIB metadata receive completion
  575. *
  576. * @v ibdev Infiniband device
  577. * @v qp Queue pair
  578. * @v completion Completion
  579. * @v iobuf I/O buffer
  580. */
  581. static void ipoib_meta_complete_recv ( struct ib_device *ibdev __unused,
  582. struct ib_queue_pair *qp,
  583. struct ib_completion *completion,
  584. struct io_buffer *iobuf ) {
  585. struct net_device *netdev = ib_qp_get_ownerdata ( qp );
  586. struct ipoib_device *ipoib = netdev->priv;
  587. union ib_mad *mad;
  588. if ( completion->syndrome ) {
  589. DBGC ( ipoib, "IPoIB %p metadata RX completion error %x\n",
  590. ipoib, completion->syndrome );
  591. goto done;
  592. }
  593. iob_put ( iobuf, completion->len );
  594. if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
  595. DBGC ( ipoib, "IPoIB %p received metadata packet too short "
  596. "to contain GRH\n", ipoib );
  597. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  598. goto done;
  599. }
  600. iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
  601. if ( iob_len ( iobuf ) < sizeof ( *mad ) ) {
  602. DBGC ( ipoib, "IPoIB %p received metadata packet too short "
  603. "to contain reply\n", ipoib );
  604. DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
  605. goto done;
  606. }
  607. mad = iobuf->data;
  608. if ( mad->mad_hdr.status != 0 ) {
  609. DBGC ( ipoib, "IPoIB %p metadata RX err status %04x\n",
  610. ipoib, ntohs ( mad->mad_hdr.status ) );
  611. goto done;
  612. }
  613. switch ( mad->mad_hdr.tid[0] ) {
  614. case IPOIB_TID_GET_PATH_REC:
  615. ipoib_recv_path_record ( ipoib, &mad->path_record );
  616. break;
  617. case IPOIB_TID_MC_MEMBER_REC:
  618. ipoib_recv_mc_member_record ( ipoib, &mad->mc_member_record );
  619. break;
  620. default:
  621. DBGC ( ipoib, "IPoIB %p unwanted response:\n",
  622. ipoib );
  623. DBGC_HD ( ipoib, mad, sizeof ( *mad ) );
  624. break;
  625. }
  626. done:
  627. ipoib->meta.recv_fill--;
  628. free_iob ( iobuf );
  629. }
  630. /**
  631. * Refill IPoIB receive ring
  632. *
  633. * @v ipoib IPoIB device
  634. */
  635. static void ipoib_refill_recv ( struct ipoib_device *ipoib,
  636. struct ipoib_queue_set *qset ) {
  637. struct ib_device *ibdev = ipoib->ibdev;
  638. struct io_buffer *iobuf;
  639. int rc;
  640. while ( qset->recv_fill < qset->recv_max_fill ) {
  641. iobuf = alloc_iob ( IPOIB_MTU );
  642. if ( ! iobuf )
  643. break;
  644. if ( ( rc = ib_post_recv ( ibdev, qset->qp, iobuf ) ) != 0 ) {
  645. free_iob ( iobuf );
  646. break;
  647. }
  648. qset->recv_fill++;
  649. }
  650. }
  651. /**
  652. * Poll IPoIB network device
  653. *
  654. * @v netdev Network device
  655. */
  656. static void ipoib_poll ( struct net_device *netdev ) {
  657. struct ipoib_device *ipoib = netdev->priv;
  658. struct ib_device *ibdev = ipoib->ibdev;
  659. ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
  660. ipoib_meta_complete_recv );
  661. ib_poll_cq ( ibdev, ipoib->data.cq, ipoib_data_complete_send,
  662. ipoib_data_complete_recv );
  663. ipoib_refill_recv ( ipoib, &ipoib->meta );
  664. ipoib_refill_recv ( ipoib, &ipoib->data );
  665. }
  666. /**
  667. * Enable/disable interrupts on IPoIB network device
  668. *
  669. * @v netdev Network device
  670. * @v enable Interrupts should be enabled
  671. */
  672. static void ipoib_irq ( struct net_device *netdev __unused,
  673. int enable __unused ) {
  674. /* No implementation */
  675. }
  676. /**
  677. * Join IPv4 broadcast multicast group
  678. *
  679. * @v ipoib IPoIB device
  680. * @ret rc Return status code
  681. */
  682. static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
  683. int rc;
  684. /* Sanity check */
  685. if ( ! ipoib->data.qp )
  686. return 0;
  687. /* Attach data queue to broadcast multicast GID */
  688. assert ( ipoib->broadcast_attached == 0 );
  689. if ( ( rc = ib_mcast_attach ( ipoib->ibdev, ipoib->data.qp,
  690. &ipoib->broadcast_gid ) ) != 0 ){
  691. DBGC ( ipoib, "IPoIB %p could not attach to broadcast GID: "
  692. "%s\n", ipoib, strerror ( rc ) );
  693. return rc;
  694. }
  695. ipoib->broadcast_attached = 1;
  696. /* Initiate broadcast group join */
  697. if ( ( rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid,
  698. 1 ) ) != 0 ) {
  699. DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n",
  700. ipoib, strerror ( rc ) );
  701. return rc;
  702. }
  703. /* We will set link up on the network device when we receive
  704. * the broadcast join response.
  705. */
  706. return 0;
  707. }
  708. /**
  709. * Leave IPv4 broadcast multicast group
  710. *
  711. * @v ipoib IPoIB device
  712. */
  713. static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
  714. /* Detach data queue from broadcast multicast GID */
  715. if ( ipoib->broadcast_attached ) {
  716. assert ( ipoib->data.qp != NULL );
  717. ib_mcast_detach ( ipoib->ibdev, ipoib->data.qp,
  718. &ipoib->broadcast_gid );
  719. ipoib->broadcast_attached = 0;
  720. }
  721. }
  722. /**
  723. * Open IPoIB network device
  724. *
  725. * @v netdev Network device
  726. * @ret rc Return status code
  727. */
  728. static int ipoib_open ( struct net_device *netdev ) {
  729. struct ipoib_device *ipoib = netdev->priv;
  730. struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
  731. int rc;
  732. /* Allocate metadata queue set */
  733. if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->meta,
  734. IPOIB_META_NUM_CQES,
  735. IPOIB_META_NUM_SEND_WQES,
  736. IPOIB_META_NUM_RECV_WQES,
  737. IB_GLOBAL_QKEY ) ) != 0 ) {
  738. DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n",
  739. ipoib, strerror ( rc ) );
  740. goto err_create_meta_qset;
  741. }
  742. /* Allocate data queue set */
  743. if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->data,
  744. IPOIB_DATA_NUM_CQES,
  745. IPOIB_DATA_NUM_SEND_WQES,
  746. IPOIB_DATA_NUM_RECV_WQES,
  747. IB_GLOBAL_QKEY ) ) != 0 ) {
  748. DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n",
  749. ipoib, strerror ( rc ) );
  750. goto err_create_data_qset;
  751. }
  752. /* Update MAC address with data QPN */
  753. mac->qpn = htonl ( ipoib->data.qp->qpn );
  754. /* Fill receive rings */
  755. ipoib_refill_recv ( ipoib, &ipoib->meta );
  756. ipoib_refill_recv ( ipoib, &ipoib->data );
  757. /* Join broadcast group */
  758. if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
  759. DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
  760. ipoib, strerror ( rc ) );
  761. goto err_join_broadcast;
  762. }
  763. return 0;
  764. err_join_broadcast:
  765. ipoib_destroy_qset ( ipoib, &ipoib->data );
  766. err_create_data_qset:
  767. ipoib_destroy_qset ( ipoib, &ipoib->meta );
  768. err_create_meta_qset:
  769. return rc;
  770. }
  771. /**
  772. * Close IPoIB network device
  773. *
  774. * @v netdev Network device
  775. */
  776. static void ipoib_close ( struct net_device *netdev ) {
  777. struct ipoib_device *ipoib = netdev->priv;
  778. struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
  779. /* Leave broadcast group */
  780. ipoib_leave_broadcast_group ( ipoib );
  781. /* Remove data QPN from MAC address */
  782. mac->qpn = 0;
  783. /* Tear down the queues */
  784. ipoib_destroy_qset ( ipoib, &ipoib->data );
  785. ipoib_destroy_qset ( ipoib, &ipoib->meta );
  786. }
  787. /** IPoIB network device operations */
  788. static struct net_device_operations ipoib_operations = {
  789. .open = ipoib_open,
  790. .close = ipoib_close,
  791. .transmit = ipoib_transmit,
  792. .poll = ipoib_poll,
  793. .irq = ipoib_irq,
  794. };
  795. /**
  796. * Update IPoIB dynamic Infiniband parameters
  797. *
  798. * @v ipoib IPoIB device
  799. *
  800. * The Infiniband port GID and partition key will change at runtime,
  801. * when the link is established (or lost). The MAC address is based
  802. * on the port GID, and the broadcast GID is based on the partition
  803. * key. This function recalculates these IPoIB device parameters.
  804. */
  805. static void ipoib_set_ib_params ( struct ipoib_device *ipoib ) {
  806. struct ib_device *ibdev = ipoib->ibdev;
  807. struct net_device *netdev = ipoib->netdev;
  808. struct ipoib_mac *mac;
  809. /* Calculate GID portion of MAC address based on port GID */
  810. mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
  811. memcpy ( &mac->gid, &ibdev->port_gid, sizeof ( mac->gid ) );
  812. /* Calculate broadcast GID based on partition key */
  813. memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
  814. sizeof ( ipoib->broadcast_gid ) );
  815. ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );
  816. /* Set net device link state to reflect Infiniband link state */
  817. if ( ibdev->link_up ) {
  818. netdev_link_up ( netdev );
  819. } else {
  820. netdev_link_down ( netdev );
  821. }
  822. }
  823. /**
  824. * Handle link status change
  825. *
  826. * @v ibdev Infiniband device
  827. */
  828. void ipoib_link_state_changed ( struct ib_device *ibdev ) {
  829. struct net_device *netdev = ib_get_ownerdata ( ibdev );
  830. struct ipoib_device *ipoib = netdev->priv;
  831. int rc;
  832. /* Leave existing broadcast group */
  833. ipoib_leave_broadcast_group ( ipoib );
  834. /* Update MAC address and broadcast GID based on new port GID
  835. * and partition key.
  836. */
  837. ipoib_set_ib_params ( ipoib );
  838. /* Join new broadcast group */
  839. if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
  840. DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
  841. "%s\n", ipoib, strerror ( rc ) );
  842. return;
  843. }
  844. }
  845. /**
  846. * Probe IPoIB device
  847. *
  848. * @v ibdev Infiniband device
  849. * @ret rc Return status code
  850. */
  851. int ipoib_probe ( struct ib_device *ibdev ) {
  852. struct net_device *netdev;
  853. struct ipoib_device *ipoib;
  854. int rc;
  855. /* Allocate network device */
  856. netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
  857. if ( ! netdev )
  858. return -ENOMEM;
  859. netdev_init ( netdev, &ipoib_operations );
  860. ipoib = netdev->priv;
  861. ib_set_ownerdata ( ibdev, netdev );
  862. netdev->dev = ibdev->dev;
  863. memset ( ipoib, 0, sizeof ( *ipoib ) );
  864. ipoib->netdev = netdev;
  865. ipoib->ibdev = ibdev;
  866. /* Calculate as much of the broadcast GID and the MAC address
  867. * as we can. We won't know either of these in full until we
  868. * have link-up.
  869. */
  870. ipoib_set_ib_params ( ipoib );
  871. /* Register network device */
  872. if ( ( rc = register_netdev ( netdev ) ) != 0 )
  873. goto err_register_netdev;
  874. return 0;
  875. err_register_netdev:
  876. netdev_nullify ( netdev );
  877. netdev_put ( netdev );
  878. return rc;
  879. }
  880. /**
  881. * Remove IPoIB device
  882. *
  883. * @v ibdev Infiniband device
  884. */
  885. void ipoib_remove ( struct ib_device *ibdev ) {
  886. struct net_device *netdev = ib_get_ownerdata ( ibdev );
  887. unregister_netdev ( netdev );
  888. netdev_nullify ( netdev );
  889. netdev_put ( netdev );
  890. }