12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046 |
- /*
- * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * You can also choose to distribute this program under the terms of
- * the Unmodified Binary Distribution Licence (as given in the file
- * COPYING.UBDL), provided that you have satisfied its requirements.
- */
-
- FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
-
- #include <stdint.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <unistd.h>
- #include <string.h>
- #include <byteswap.h>
- #include <errno.h>
- #include <ipxe/errortab.h>
- #include <ipxe/malloc.h>
- #include <ipxe/if_arp.h>
- #include <ipxe/arp.h>
- #include <ipxe/if_ether.h>
- #include <ipxe/ethernet.h>
- #include <ipxe/ip.h>
- #include <ipxe/iobuf.h>
- #include <ipxe/netdevice.h>
- #include <ipxe/infiniband.h>
- #include <ipxe/ib_pathrec.h>
- #include <ipxe/ib_mcast.h>
- #include <ipxe/retry.h>
- #include <ipxe/ipoib.h>
-
- /** @file
- *
- * IP over Infiniband
- */
-
/*
 * Distinct variants of ENXIO, one per "missing REMAC" failure, so
 * that the individual causes can be told apart when reported.
 */

/* No REMAC known for the target of an outgoing ARP reply */
#define EINFO_ENXIO_ARP_REPLY						\
	__einfo_uniqify ( EINFO_ENXIO, 0x01,				\
			  "Missing REMAC for ARP reply target address" )
#define ENXIO_ARP_REPLY							\
	__einfo_error ( EINFO_ENXIO_ARP_REPLY )

/* No REMAC known, and the packet is not IPv4 so no ARP can be sent */
#define EINFO_ENXIO_NON_IPV4						\
	__einfo_uniqify ( EINFO_ENXIO, 0x02,				\
			  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_NON_IPV4							\
	__einfo_error ( EINFO_ENXIO_NON_IPV4 )

/* No REMAC known for an IPv4 packet; an ARP request has been sent */
#define EINFO_ENXIO_ARP_SENT						\
	__einfo_uniqify ( EINFO_ENXIO, 0x03,				\
			  "Missing REMAC for IPv4 packet (ARP sent)" )
#define ENXIO_ARP_SENT							\
	__einfo_error ( EINFO_ENXIO_ARP_SENT )
-
/** Send work queue size (in entries) requested for the IPoIB queue pair */
#define IPOIB_NUM_SEND_WQES 8

/** Receive work queue size (in entries) requested for the IPoIB queue pair */
#define IPOIB_NUM_RECV_WQES 4

/** Completion queue size (in entries) requested for the IPoIB device */
#define IPOIB_NUM_CQES 16
-
- /** An IPoIB broadcast address */
- struct ipoib_broadcast {
- /** MAC address */
- struct ipoib_mac mac;
- /** Address vector */
- struct ib_address_vector av;
- /** Multicast group membership */
- struct ib_mc_membership membership;
- };
-
- /** An IPoIB device */
- struct ipoib_device {
- /** Network device */
- struct net_device *netdev;
- /** Underlying Infiniband device */
- struct ib_device *ibdev;
- /** List of IPoIB devices */
- struct list_head list;
- /** Completion queue */
- struct ib_completion_queue *cq;
- /** Queue pair */
- struct ib_queue_pair *qp;
- /** Local MAC */
- struct ipoib_mac mac;
- /** Broadcast address */
- struct ipoib_broadcast broadcast;
- /** REMAC cache */
- struct list_head peers;
- };
-
- /** Broadcast IPoIB address */
- static struct ipoib_mac ipoib_broadcast = {
- .flags__qpn = htonl ( IB_QPN_BROADCAST ),
- .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
- };
-
/** Link status reported while the broadcast group join is outstanding */
#define EINFO_EINPROGRESS_JOINING					\
	__einfo_uniqify ( EINFO_EINPROGRESS, 0x01, "Joining" )
#define EINPROGRESS_JOINING						\
	__einfo_error ( EINFO_EINPROGRESS_JOINING )
-
- /** Human-readable message for the link status */
- struct errortab ipoib_errors[] __errortab = {
- __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
- };
-
- /** List of all IPoIB devices */
- static LIST_HEAD ( ipoib_devices );
-
- static struct net_device_operations ipoib_operations;
-
- /****************************************************************************
- *
- * IPoIB REMAC cache
- *
- ****************************************************************************
- */
-
- /** An IPoIB REMAC cache entry */
- struct ipoib_peer {
- /** List of REMAC cache entries */
- struct list_head list;
- /** Remote Ethermet MAC */
- struct ipoib_remac remac;
- /** MAC address */
- struct ipoib_mac mac;
- };
-
- /**
- * Find IPoIB MAC from REMAC
- *
- * @v ipoib IPoIB device
- * @v remac Remote Ethernet MAC
- * @ret mac IPoIB MAC (or NULL if not found)
- */
- static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
- const struct ipoib_remac *remac ) {
- struct ipoib_peer *peer;
-
- /* Check for broadcast or multicast REMAC. We transmit
- * multicasts as broadcasts for simplicity.
- */
- if ( is_multicast_ether_addr ( remac ) )
- return &ipoib->broadcast.mac;
-
- /* Try to find via REMAC cache */
- list_for_each_entry ( peer, &ipoib->peers, list ) {
- if ( memcmp ( remac, &peer->remac,
- sizeof ( peer->remac ) ) == 0 ) {
- /* Move peer to start of list */
- list_del ( &peer->list );
- list_add ( &peer->list, &ipoib->peers );
- return &peer->mac;
- }
- }
-
- DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
- ipoib, eth_ntoa ( remac ) );
- return NULL;
- }
-
- /**
- * Add IPoIB MAC to REMAC cache
- *
- * @v ipoib IPoIB device
- * @v remac Remote Ethernet MAC
- * @v mac IPoIB MAC
- * @ret rc Return status code
- */
- static int ipoib_map_remac ( struct ipoib_device *ipoib,
- const struct ipoib_remac *remac,
- const struct ipoib_mac *mac ) {
- struct ipoib_peer *peer;
-
- /* Check for existing entry in REMAC cache */
- list_for_each_entry ( peer, &ipoib->peers, list ) {
- if ( memcmp ( remac, &peer->remac,
- sizeof ( peer->remac ) ) == 0 ) {
- /* Move peer to start of list */
- list_del ( &peer->list );
- list_add ( &peer->list, &ipoib->peers );
- /* Update MAC */
- memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
- return 0;
- }
- }
-
- /* Create new entry */
- peer = malloc ( sizeof ( *peer ) );
- if ( ! peer )
- return -ENOMEM;
- memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
- memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
- list_add ( &peer->list, &ipoib->peers );
-
- return 0;
- }
-
- /**
- * Flush REMAC cache
- *
- * @v ipoib IPoIB device
- */
- static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
- struct ipoib_peer *peer;
- struct ipoib_peer *tmp;
-
- list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
- list_del ( &peer->list );
- free ( peer );
- }
- }
-
- /**
- * Discard some entries from the REMAC cache
- *
- * @ret discarded Number of cached items discarded
- */
- static unsigned int ipoib_discard_remac ( void ) {
- struct net_device *netdev;
- struct ipoib_device *ipoib;
- struct ipoib_peer *peer;
- unsigned int discarded = 0;
-
- /* Try to discard one cache entry for each IPoIB device */
- for_each_netdev ( netdev ) {
-
- /* Skip non-IPoIB devices */
- if ( netdev->op != &ipoib_operations )
- continue;
- ipoib = netdev->priv;
-
- /* Discard least recently used cache entry (if any) */
- list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
- list_del ( &peer->list );
- free ( peer );
- discarded++;
- break;
- }
- }
-
- return discarded;
- }
-
- /** IPoIB cache discarder */
- struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
- .discard = ipoib_discard_remac,
- };
-
- /****************************************************************************
- *
- * IPoIB link layer
- *
- ****************************************************************************
- */
-
- /**
- * Initialise IPoIB link-layer address
- *
- * @v hw_addr Hardware address
- * @v ll_addr Link-layer address
- */
- static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
- const uint8_t *guid = hw_addr;
- uint8_t *eth_addr = ll_addr;
- uint8_t guid_mask = IPOIB_GUID_MASK;
- unsigned int i;
-
- /* Extract bytes from GUID according to mask */
- for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
- if ( guid_mask & 0x80 )
- *(eth_addr++) = *guid;
- }
- }
-
- /** IPoIB protocol */
- struct ll_protocol ipoib_protocol __ll_protocol = {
- .name = "IPoIB",
- .ll_proto = htons ( ARPHRD_ETHER ),
- .hw_addr_len = sizeof ( union ib_guid ),
- .ll_addr_len = ETH_ALEN,
- .ll_header_len = ETH_HLEN,
- .push = eth_push,
- .pull = eth_pull,
- .init_addr = ipoib_init_addr,
- .ntoa = eth_ntoa,
- .mc_hash = eth_mc_hash,
- .eth_addr = eth_eth_addr,
- .eui64 = eth_eui64,
- .flags = LL_NAME_ONLY,
- };
-
- /**
- * Allocate IPoIB device
- *
- * @v priv_size Size of driver private data
- * @ret netdev Network device, or NULL
- */
- struct net_device * alloc_ipoibdev ( size_t priv_size ) {
- struct net_device *netdev;
-
- netdev = alloc_netdev ( priv_size );
- if ( netdev ) {
- netdev->ll_protocol = &ipoib_protocol;
- netdev->ll_broadcast = eth_broadcast;
- netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
- }
- return netdev;
- }
-
- /****************************************************************************
- *
- * IPoIB translation layer
- *
- ****************************************************************************
- */
-
- /**
- * Translate transmitted ARP packet
- *
- * @v netdev Network device
- * @v iobuf Packet to be transmitted (with no link-layer headers)
- * @ret rc Return status code
- */
- static int ipoib_translate_tx_arp ( struct net_device *netdev,
- struct io_buffer *iobuf ) {
- struct ipoib_device *ipoib = netdev->priv;
- struct arphdr *arphdr = iobuf->data;
- struct ipoib_mac *target_ha = NULL;
- void *sender_pa;
- void *target_pa;
-
- /* Do nothing unless ARP contains eIPoIB link-layer addresses */
- if ( arphdr->ar_hln != ETH_ALEN )
- return 0;
-
- /* Fail unless we have room to expand packet */
- if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
- ETH_ALEN ) ) ) {
- DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
- ipoib );
- return -ENOBUFS;
- }
-
- /* Look up REMAC, if applicable */
- if ( arphdr->ar_op == ARPOP_REPLY ) {
- target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
- if ( ! target_ha ) {
- DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
- ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
- return -ENXIO_ARP_REPLY;
- }
- }
-
- /* Construct new packet */
- iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
- sender_pa = arp_sender_pa ( arphdr );
- target_pa = arp_target_pa ( arphdr );
- arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
- arphdr->ar_hln = sizeof ( ipoib->mac );
- memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
- memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
- memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
- memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
- if ( target_ha ) {
- memcpy ( arp_target_ha ( arphdr ), target_ha,
- sizeof ( *target_ha ) );
- }
-
- return 0;
- }
-
- /**
- * Translate transmitted packet
- *
- * @v netdev Network device
- * @v iobuf Packet to be transmitted (with no link-layer headers)
- * @v net_proto Network-layer protocol (in network byte order)
- * @ret rc Return status code
- */
- static int ipoib_translate_tx ( struct net_device *netdev,
- struct io_buffer *iobuf, uint16_t net_proto ) {
-
- switch ( net_proto ) {
- case htons ( ETH_P_ARP ) :
- return ipoib_translate_tx_arp ( netdev, iobuf );
- case htons ( ETH_P_IP ) :
- /* No translation needed */
- return 0;
- default:
- /* Cannot handle other traffic via eIPoIB */
- return -ENOTSUP;
- }
- }
-
- /**
- * Translate received ARP packet
- *
- * @v netdev Network device
- * @v iobuf Received packet (with no link-layer headers)
- * @v remac Constructed Remote Ethernet MAC
- * @ret rc Return status code
- */
- static int ipoib_translate_rx_arp ( struct net_device *netdev,
- struct io_buffer *iobuf,
- struct ipoib_remac *remac ) {
- struct ipoib_device *ipoib = netdev->priv;
- struct arphdr *arphdr = iobuf->data;
- void *sender_pa;
- void *target_pa;
- int rc;
-
- /* Do nothing unless ARP contains IPoIB link-layer addresses */
- if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
- return 0;
-
- /* Create REMAC cache entry */
- if ( ( rc = ipoib_map_remac ( ipoib, remac,
- arp_sender_ha ( arphdr ) ) ) != 0 ) {
- DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
- ipoib, strerror ( rc ) );
- return rc;
- }
-
- /* Construct new packet */
- sender_pa = arp_sender_pa ( arphdr );
- target_pa = arp_target_pa ( arphdr );
- arphdr->ar_hrd = htons ( ARPHRD_ETHER );
- arphdr->ar_hln = ETH_ALEN;
- memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
- memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
- memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
- memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
- if ( arphdr->ar_op == ARPOP_REPLY ) {
- /* Assume received replies were directed to us */
- memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
- }
- iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
-
- return 0;
- }
-
- /**
- * Translate received packet
- *
- * @v netdev Network device
- * @v iobuf Received packet (with no link-layer headers)
- * @v remac Constructed Remote Ethernet MAC
- * @v net_proto Network-layer protocol (in network byte order)
- * @ret rc Return status code
- */
- static int ipoib_translate_rx ( struct net_device *netdev,
- struct io_buffer *iobuf,
- struct ipoib_remac *remac,
- uint16_t net_proto ) {
-
- switch ( net_proto ) {
- case htons ( ETH_P_ARP ) :
- return ipoib_translate_rx_arp ( netdev, iobuf, remac );
- case htons ( ETH_P_IP ) :
- /* No translation needed */
- return 0;
- default:
- /* Cannot handle other traffic via eIPoIB */
- return -ENOTSUP;
- }
- }
-
- /****************************************************************************
- *
- * IPoIB network device
- *
- ****************************************************************************
- */
-
- /**
- * Transmit packet via IPoIB network device
- *
- * @v netdev Network device
- * @v iobuf I/O buffer
- * @ret rc Return status code
- */
- static int ipoib_transmit ( struct net_device *netdev,
- struct io_buffer *iobuf ) {
- struct ipoib_device *ipoib = netdev->priv;
- struct ib_device *ibdev = ipoib->ibdev;
- struct ethhdr *ethhdr;
- struct iphdr *iphdr;
- struct ipoib_hdr *ipoib_hdr;
- struct ipoib_remac *remac;
- struct ipoib_mac *mac;
- struct ib_address_vector *dest;
- struct ib_address_vector av;
- uint16_t net_proto;
- int rc;
-
- /* Sanity check */
- if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
- DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
- return -EINVAL;
- }
-
- /* Attempting transmission while link is down will put the
- * queue pair into an error state, so don't try it.
- */
- if ( ! ib_link_ok ( ibdev ) )
- return -ENETUNREACH;
-
- /* Strip eIPoIB header */
- ethhdr = iobuf->data;
- remac = ( ( struct ipoib_remac * ) ethhdr->h_dest );
- net_proto = ethhdr->h_protocol;
- iob_pull ( iobuf, sizeof ( *ethhdr ) );
-
- /* Identify destination address */
- if ( is_multicast_ether_addr ( remac ) ) {
-
- /* Transmit multicasts as broadcasts, for simplicity */
- dest = &ipoib->broadcast.av;
-
- } else if ( ( mac = ipoib_find_remac ( ipoib, remac ) ) ) {
-
- /* Construct address vector from IPoIB MAC */
- dest = &av;
- memset ( dest, 0, sizeof ( *dest ) );
- dest->qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
- dest->qkey = ipoib->broadcast.av.qkey;
- dest->gid_present = 1;
- memcpy ( &dest->gid, &mac->gid, sizeof ( dest->gid ) );
- if ( ( rc = ib_resolve_path ( ibdev, dest ) ) != 0 ) {
- /* Path not resolved yet */
- return rc;
- }
-
- } else {
-
- /* Generate a new ARP request (if possible) to trigger
- * population of the REMAC cache entry.
- */
- if ( ( net_proto != htons ( ETH_P_IP ) ) ||
- ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
- DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
- "packet type %04x\n", ipoib,
- eth_ntoa ( ethhdr->h_dest ),
- ntohs ( net_proto ) );
- return -ENXIO_NON_IPV4;
- }
- iphdr = iobuf->data;
- if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
- &iphdr->dest, &iphdr->src ) ) !=0){
- DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
- ipoib, eth_ntoa ( ethhdr->h_dest ),
- inet_ntoa ( iphdr->dest ) );
- DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
- strerror ( rc ) );
- return rc;
- }
- DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
- eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
- DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
- return -ENXIO_ARP_SENT;
- }
-
- /* Translate packet if applicable */
- if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
- return rc;
-
- /* Prepend real IPoIB header */
- ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
- ipoib_hdr->proto = net_proto;
- ipoib_hdr->reserved = 0;
-
- /* Transmit packet */
- return ib_post_send ( ibdev, ipoib->qp, dest, iobuf );
- }
-
- /**
- * Handle IPoIB send completion
- *
- * @v ibdev Infiniband device
- * @v qp Queue pair
- * @v iobuf I/O buffer
- * @v rc Completion status code
- */
- static void ipoib_complete_send ( struct ib_device *ibdev __unused,
- struct ib_queue_pair *qp,
- struct io_buffer *iobuf, int rc ) {
- struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
-
- netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
- }
-
- /**
- * Handle IPoIB receive completion
- *
- * @v ibdev Infiniband device
- * @v qp Queue pair
- * @v dest Destination address vector, or NULL
- * @v source Source address vector, or NULL
- * @v iobuf I/O buffer
- * @v rc Completion status code
- */
- static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
- struct ib_queue_pair *qp,
- struct ib_address_vector *dest,
- struct ib_address_vector *source,
- struct io_buffer *iobuf, int rc ) {
- struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
- struct net_device *netdev = ipoib->netdev;
- struct ipoib_hdr *ipoib_hdr;
- struct ethhdr *ethhdr;
- struct ipoib_remac remac;
- uint16_t net_proto;
-
- /* Record errors */
- if ( rc != 0 ) {
- netdev_rx_err ( netdev, iobuf, rc );
- return;
- }
-
- /* Sanity check */
- if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
- DBGC ( ipoib, "IPoIB %p received packet too short to "
- "contain IPoIB header\n", ipoib );
- DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
- netdev_rx_err ( netdev, iobuf, -EIO );
- return;
- }
- if ( ! source ) {
- DBGC ( ipoib, "IPoIB %p received packet without address "
- "vector\n", ipoib );
- netdev_rx_err ( netdev, iobuf, -ENOTTY );
- return;
- }
-
- /* Strip real IPoIB header */
- ipoib_hdr = iobuf->data;
- net_proto = ipoib_hdr->proto;
- iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
-
- /* Construct source address from remote QPN and LID */
- remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
- remac.lid = htons ( source->lid );
-
- /* Translate packet if applicable */
- if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
- net_proto ) ) != 0 ) {
- netdev_rx_err ( netdev, iobuf, rc );
- return;
- }
-
- /* Prepend eIPoIB header */
- ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
- memcpy ( ðhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
- ethhdr->h_protocol = net_proto;
-
- /* Construct destination address */
- if ( dest->gid_present && IB_GID_MULTICAST ( &dest->gid ) ) {
- /* Multicast GID: use the Ethernet broadcast address */
- memcpy ( ðhdr->h_dest, eth_broadcast,
- sizeof ( ethhdr->h_dest ) );
- } else {
- /* Assume destination address is local Ethernet MAC */
- memcpy ( ðhdr->h_dest, netdev->ll_addr,
- sizeof ( ethhdr->h_dest ) );
- }
-
- /* Hand off to network layer */
- netdev_rx ( netdev, iobuf );
- }
-
- /** IPoIB completion operations */
- static struct ib_completion_queue_operations ipoib_cq_op = {
- .complete_send = ipoib_complete_send,
- .complete_recv = ipoib_complete_recv,
- };
-
- /**
- * Allocate IPoIB receive I/O buffer
- *
- * @v len Length of buffer
- * @ret iobuf I/O buffer, or NULL
- *
- * Some Infiniband hardware requires 2kB alignment of receive buffers
- * and provides no way to disable header separation. The result is
- * that there are only four bytes of link-layer header (the real IPoIB
- * header) before the payload. This is not sufficient space to insert
- * an eIPoIB link-layer pseudo-header.
- *
- * We therefore allocate I/O buffers offset to start slightly before
- * the natural alignment boundary, in order to allow sufficient space.
- */
- static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
- struct io_buffer *iobuf;
- size_t reserve_len;
-
- /* Calculate additional length required at start of buffer */
- reserve_len = ( sizeof ( struct ethhdr ) -
- sizeof ( struct ipoib_hdr ) );
-
- /* Allocate buffer */
- iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
- if ( iobuf ) {
- iob_reserve ( iobuf, reserve_len );
- }
- return iobuf;
- }
-
- /** IPoIB queue pair operations */
- static struct ib_queue_pair_operations ipoib_qp_op = {
- .alloc_iob = ipoib_alloc_iob,
- };
-
- /**
- * Poll IPoIB network device
- *
- * @v netdev Network device
- */
- static void ipoib_poll ( struct net_device *netdev ) {
- struct ipoib_device *ipoib = netdev->priv;
- struct ib_device *ibdev = ipoib->ibdev;
-
- /* Poll Infiniband device */
- ib_poll_eq ( ibdev );
-
- /* Poll the retry timers (required for IPoIB multicast join) */
- retry_poll();
- }
-
- /**
- * Handle IPv4 broadcast multicast group join completion
- *
- * @v membership Multicast group membership
- * @v rc Status code
- */
- void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
- struct ipoib_device *ipoib = container_of ( membership,
- struct ipoib_device,
- broadcast.membership );
-
- /* Record join status as link status */
- netdev_link_err ( ipoib->netdev, rc );
- }
-
- /**
- * Join IPv4 broadcast multicast group
- *
- * @v ipoib IPoIB device
- * @ret rc Return status code
- */
- static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
- int rc;
-
- /* Join multicast group */
- if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
- &ipoib->broadcast.membership,
- &ipoib->broadcast.av, 0,
- ipoib_join_complete ) ) != 0 ) {
- DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
- ipoib, strerror ( rc ) );
- return rc;
- }
-
- return 0;
- }
-
- /**
- * Leave IPv4 broadcast multicast group
- *
- * @v ipoib IPoIB device
- */
- static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
-
- /* Leave multicast group */
- ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
- &ipoib->broadcast.membership );
- }
-
- /**
- * Handle link status change
- *
- * @v ipoib IPoIB device
- */
- static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
- struct ib_device *ibdev = ipoib->ibdev;
- struct net_device *netdev = ipoib->netdev;
- int rc;
-
- /* Leave existing broadcast group */
- if ( ipoib->qp )
- ipoib_leave_broadcast_group ( ipoib );
-
- /* Update MAC address based on potentially-new GID prefix */
- memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
- sizeof ( ipoib->mac.gid.s.prefix ) );
-
- /* Update broadcast MAC GID based on potentially-new partition key */
- ipoib->broadcast.mac.gid.words[2] =
- htons ( ibdev->pkey | IB_PKEY_FULL );
-
- /* Construct broadcast address vector from broadcast MAC address */
- memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) );
- ipoib->broadcast.av.qpn = IB_QPN_BROADCAST;
- ipoib->broadcast.av.gid_present = 1;
- memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid,
- sizeof ( ipoib->broadcast.av.gid ) );
-
- /* Set net device link state to reflect Infiniband link state */
- rc = ib_link_rc ( ibdev );
- netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
-
- /* Join new broadcast group */
- if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
- ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
- DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
- "%s\n", ipoib, strerror ( rc ) );
- netdev_link_err ( netdev, rc );
- return;
- }
- }
-
- /**
- * Open IPoIB network device
- *
- * @v netdev Network device
- * @ret rc Return status code
- */
- static int ipoib_open ( struct net_device *netdev ) {
- struct ipoib_device *ipoib = netdev->priv;
- struct ib_device *ibdev = ipoib->ibdev;
- int rc;
-
- /* Open IB device */
- if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
- DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
- ipoib, strerror ( rc ) );
- goto err_ib_open;
- }
-
- /* Allocate completion queue */
- if ( ( rc = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op,
- &ipoib->cq ) ) != 0 ) {
- DBGC ( ipoib, "IPoIB %p could not create completion queue: "
- "%s\n", ipoib, strerror ( rc ) );
- goto err_create_cq;
- }
-
- /* Allocate queue pair */
- if ( ( rc = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
- ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
- &ipoib_qp_op, netdev->name,
- &ipoib->qp ) ) != 0 ) {
- DBGC ( ipoib, "IPoIB %p could not create queue pair: %s\n",
- ipoib, strerror ( rc ) );
- goto err_create_qp;
- }
- ib_qp_set_ownerdata ( ipoib->qp, ipoib );
-
- /* Update MAC address with QPN */
- ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
-
- /* Fill receive rings */
- ib_refill_recv ( ibdev, ipoib->qp );
-
- /* Fake a link status change to join the broadcast group */
- ipoib_link_state_changed ( ipoib );
-
- return 0;
-
- ib_destroy_qp ( ibdev, ipoib->qp );
- err_create_qp:
- ib_destroy_cq ( ibdev, ipoib->cq );
- err_create_cq:
- ib_close ( ibdev );
- err_ib_open:
- return rc;
- }
-
- /**
- * Close IPoIB network device
- *
- * @v netdev Network device
- */
- static void ipoib_close ( struct net_device *netdev ) {
- struct ipoib_device *ipoib = netdev->priv;
- struct ib_device *ibdev = ipoib->ibdev;
-
- /* Flush REMAC cache */
- ipoib_flush_remac ( ipoib );
-
- /* Leave broadcast group */
- ipoib_leave_broadcast_group ( ipoib );
-
- /* Remove QPN from MAC address */
- ipoib->mac.flags__qpn = 0;
-
- /* Tear down the queues */
- ib_destroy_qp ( ibdev, ipoib->qp );
- ipoib->qp = NULL;
- ib_destroy_cq ( ibdev, ipoib->cq );
- ipoib->cq = NULL;
-
- /* Close IB device */
- ib_close ( ibdev );
- }
-
- /** IPoIB network device operations */
- static struct net_device_operations ipoib_operations = {
- .open = ipoib_open,
- .close = ipoib_close,
- .transmit = ipoib_transmit,
- .poll = ipoib_poll,
- };
-
- /**
- * Probe IPoIB device
- *
- * @v ibdev Infiniband device
- * @ret rc Return status code
- */
- static int ipoib_probe ( struct ib_device *ibdev ) {
- struct net_device *netdev;
- struct ipoib_device *ipoib;
- int rc;
-
- /* Allocate network device */
- netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
- if ( ! netdev )
- return -ENOMEM;
- netdev_init ( netdev, &ipoib_operations );
- ipoib = netdev->priv;
- netdev->dev = ibdev->dev;
- memset ( ipoib, 0, sizeof ( *ipoib ) );
- ipoib->netdev = netdev;
- ipoib->ibdev = ibdev;
- INIT_LIST_HEAD ( &ipoib->peers );
-
- /* Extract hardware address */
- memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
- sizeof ( ibdev->gid.s.guid ) );
- memcpy ( netdev->ll_addr, ibdev->lemac, ETH_ALEN );
-
- /* Set local MAC address */
- memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
- sizeof ( ipoib->mac.gid.s.guid ) );
-
- /* Set default broadcast MAC address */
- memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast,
- sizeof ( ipoib->broadcast.mac ) );
-
- /* Add to list of IPoIB devices */
- list_add_tail ( &ipoib->list, &ipoib_devices );
-
- /* Register network device */
- if ( ( rc = register_netdev ( netdev ) ) != 0 )
- goto err_register_netdev;
-
- return 0;
-
- unregister_netdev ( netdev );
- err_register_netdev:
- list_del ( &ipoib->list );
- netdev_nullify ( netdev );
- netdev_put ( netdev );
- return rc;
- }
-
- /**
- * Handle device or link status change
- *
- * @v ibdev Infiniband device
- */
- static void ipoib_notify ( struct ib_device *ibdev ) {
- struct ipoib_device *ipoib;
-
- /* Handle link status change for any attached IPoIB devices */
- list_for_each_entry ( ipoib, &ipoib_devices, list ) {
- if ( ipoib->ibdev != ibdev )
- continue;
- ipoib_link_state_changed ( ipoib );
- }
- }
-
- /**
- * Remove IPoIB device
- *
- * @v ibdev Infiniband device
- */
- static void ipoib_remove ( struct ib_device *ibdev ) {
- struct ipoib_device *ipoib;
- struct ipoib_device *tmp;
- struct net_device *netdev;
-
- /* Remove any attached IPoIB devices */
- list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) {
- if ( ipoib->ibdev != ibdev )
- continue;
- netdev = ipoib->netdev;
- unregister_netdev ( netdev );
- list_del ( &ipoib->list );
- netdev_nullify ( netdev );
- netdev_put ( netdev );
- }
- }
-
- /** IPoIB driver */
- struct ib_driver ipoib_driver __ib_driver = {
- .name = "IPoIB",
- .probe = ipoib_probe,
- .notify = ipoib_notify,
- .remove = ipoib_remove,
- };
-
- /**
- * Find IPoIB network device
- *
- * @v ibdev Infiniband device
- * @ret netdev IPoIB network device, or NULL if not found
- */
- struct net_device * ipoib_netdev ( struct ib_device *ibdev ) {
- struct ipoib_device *ipoib;
-
- /* Find matching IPoIB device */
- list_for_each_entry ( ipoib, &ipoib_devices, list ) {
- if ( ipoib->ibdev != ibdev )
- continue;
- return ipoib->netdev;
- }
- return NULL;
- }
|