Browse Source

Merge branch '3leaf'

tags/v0.9.3
Michael Brown 17 years ago
parent
commit
1620b3512c

+ 1
- 0
src/Makefile View File

@@ -152,6 +152,7 @@ SRCDIRS		+= drivers/scsi
152 152
 SRCDIRS		+= drivers/ata
153 153
 SRCDIRS		+= drivers/nvs
154 154
 SRCDIRS		+= drivers/bitbash
155
+SRCDIRS		+= drivers/infiniband
155 156
 SRCDIRS		+= interface/pxe
156 157
 SRCDIRS		+= tests
157 158
 SRCDIRS		+= crypto crypto/axtls crypto/matrixssl

+ 3460
- 0
src/drivers/infiniband/MT25218_PRM.h
File diff suppressed because it is too large
View File


+ 2129
- 0
src/drivers/infiniband/arbel.c
File diff suppressed because it is too large
View File


+ 461
- 0
src/drivers/infiniband/arbel.h View File

@@ -0,0 +1,461 @@
1
+#ifndef _ARBEL_H
2
+#define _ARBEL_H
3
+
4
+/** @file
5
+ *
6
+ * Mellanox Arbel Infiniband HCA driver
7
+ *
8
+ */
9
+
10
+#include <stdint.h>
11
+#include <gpxe/uaccess.h>
12
+#include "mlx_bitops.h"
13
+#include "MT25218_PRM.h"
14
+
15
+/*
16
+ * Hardware constants
17
+ *
18
+ */
19
+
20
+/* PCI BARs */
21
+#define ARBEL_PCI_CONFIG_BAR		PCI_BASE_ADDRESS_0
22
+#define ARBEL_PCI_CONFIG_BAR_SIZE	0x100000
23
+#define ARBEL_PCI_UAR_BAR		PCI_BASE_ADDRESS_2
24
+#define ARBEL_PCI_UAR_IDX		1
25
+#define ARBEL_PCI_UAR_SIZE		0x1000
26
+
27
+/* UAR context table (UCE) resource types */
28
+#define ARBEL_UAR_RES_NONE		0x00
29
+#define ARBEL_UAR_RES_CQ_CI		0x01
30
+#define ARBEL_UAR_RES_CQ_ARM		0x02
31
+#define ARBEL_UAR_RES_SQ		0x03
32
+#define ARBEL_UAR_RES_RQ		0x04
33
+#define ARBEL_UAR_RES_GROUP_SEP		0x07
34
+
35
+/* Work queue entry and completion queue entry opcodes */
36
+#define ARBEL_OPCODE_SEND		0x0a
37
+#define ARBEL_OPCODE_RECV_ERROR		0xfe
38
+#define ARBEL_OPCODE_SEND_ERROR		0xff
39
+
40
+/* HCA command register opcodes */
41
+#define ARBEL_HCR_QUERY_DEV_LIM		0x0003
42
+#define ARBEL_HCR_QUERY_FW		0x0004
43
+#define ARBEL_HCR_INIT_HCA		0x0007
44
+#define ARBEL_HCR_CLOSE_HCA		0x0008
45
+#define ARBEL_HCR_INIT_IB		0x0009
46
+#define ARBEL_HCR_CLOSE_IB		0x000a
47
+#define ARBEL_HCR_SW2HW_MPT		0x000d
48
+#define ARBEL_HCR_MAP_EQ		0x0012
49
+#define ARBEL_HCR_SW2HW_EQ		0x0013
50
+#define ARBEL_HCR_HW2SW_EQ		0x0014
51
+#define ARBEL_HCR_SW2HW_CQ		0x0016
52
+#define ARBEL_HCR_HW2SW_CQ		0x0017
53
+#define ARBEL_HCR_RST2INIT_QPEE		0x0019
54
+#define ARBEL_HCR_INIT2RTR_QPEE		0x001a
55
+#define ARBEL_HCR_RTR2RTS_QPEE		0x001b
56
+#define ARBEL_HCR_2RST_QPEE		0x0021
57
+#define ARBEL_HCR_MAD_IFC		0x0024
58
+#define ARBEL_HCR_READ_MGM		0x0025
59
+#define ARBEL_HCR_WRITE_MGM		0x0026
60
+#define ARBEL_HCR_MGID_HASH		0x0027
61
+#define ARBEL_HCR_RUN_FW		0x0ff6
62
+#define ARBEL_HCR_DISABLE_LAM		0x0ff7
63
+#define ARBEL_HCR_ENABLE_LAM		0x0ff8
64
+#define ARBEL_HCR_UNMAP_ICM		0x0ff9
65
+#define ARBEL_HCR_MAP_ICM		0x0ffa
66
+#define ARBEL_HCR_UNMAP_ICM_AUX		0x0ffb
67
+#define ARBEL_HCR_MAP_ICM_AUX		0x0ffc
68
+#define ARBEL_HCR_SET_ICM_SIZE		0x0ffd
69
+#define ARBEL_HCR_UNMAP_FA		0x0ffe
70
+#define ARBEL_HCR_MAP_FA		0x0fff
71
+
72
+/* Service types */
73
+#define ARBEL_ST_UD			0x03
74
+
75
+/* MTUs */
76
+#define ARBEL_MTU_2048			0x04
77
+
78
+#define ARBEL_NO_EQ			64
79
+
80
+#define ARBEL_INVALID_LKEY		0x00000100UL
81
+
82
+#define ARBEL_PAGE_SIZE			4096
83
+
84
+#define ARBEL_DB_POST_SND_OFFSET	0x10
85
+
86
+/*
87
+ * Datatypes that seem to be missing from the autogenerated documentation
88
+ *
89
+ */
90
+struct arbelprm_mgm_hash_st {
91
+	pseudo_bit_t reserved0[0x00020];
92
+/* -------------- */
93
+	pseudo_bit_t hash[0x00010];
94
+	pseudo_bit_t reserved1[0x00010];
95
+} __attribute__ (( packed ));
96
+
97
+struct arbelprm_scalar_parameter_st {
98
+	pseudo_bit_t reserved0[0x00020];
99
+/* -------------- */
100
+	pseudo_bit_t value[0x00020];
101
+} __attribute__ (( packed ));
102
+
103
+/*
104
+ * Wrapper structures for hardware datatypes
105
+ *
106
+ */
107
+
108
+struct MLX_DECLARE_STRUCT ( arbelprm_access_lam );
109
+struct MLX_DECLARE_STRUCT ( arbelprm_completion_queue_context );
110
+struct MLX_DECLARE_STRUCT ( arbelprm_completion_queue_entry );
111
+struct MLX_DECLARE_STRUCT ( arbelprm_completion_with_error );
112
+struct MLX_DECLARE_STRUCT ( arbelprm_cq_arm_db_record );
113
+struct MLX_DECLARE_STRUCT ( arbelprm_cq_ci_db_record );
114
+struct MLX_DECLARE_STRUCT ( arbelprm_eqc );
115
+struct MLX_DECLARE_STRUCT ( arbelprm_hca_command_register );
116
+struct MLX_DECLARE_STRUCT ( arbelprm_init_hca );
117
+struct MLX_DECLARE_STRUCT ( arbelprm_init_ib );
118
+struct MLX_DECLARE_STRUCT ( arbelprm_mad_ifc );
119
+struct MLX_DECLARE_STRUCT ( arbelprm_mgm_entry );
120
+struct MLX_DECLARE_STRUCT ( arbelprm_mgm_hash );
121
+struct MLX_DECLARE_STRUCT ( arbelprm_mpt );
122
+struct MLX_DECLARE_STRUCT ( arbelprm_qp_db_record );
123
+struct MLX_DECLARE_STRUCT ( arbelprm_qp_ee_state_transitions );
124
+struct MLX_DECLARE_STRUCT ( arbelprm_query_dev_lim );
125
+struct MLX_DECLARE_STRUCT ( arbelprm_query_fw );
126
+struct MLX_DECLARE_STRUCT ( arbelprm_queue_pair_ee_context_entry );
127
+struct MLX_DECLARE_STRUCT ( arbelprm_recv_wqe_segment_next );
128
+struct MLX_DECLARE_STRUCT ( arbelprm_scalar_parameter );
129
+struct MLX_DECLARE_STRUCT ( arbelprm_send_doorbell );
130
+struct MLX_DECLARE_STRUCT ( arbelprm_ud_address_vector );
131
+struct MLX_DECLARE_STRUCT ( arbelprm_virtual_physical_mapping );
132
+struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_ctrl_send );
133
+struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_data_ptr );
134
+struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_next );
135
+struct MLX_DECLARE_STRUCT ( arbelprm_wqe_segment_ud );
136
+
137
+/*
138
+ * Composite hardware datatypes
139
+ *
140
+ */
141
+
142
+#define ARBEL_MAX_GATHER 1
143
+
144
+struct arbelprm_ud_send_wqe {
145
+	struct arbelprm_wqe_segment_next next;
146
+	struct arbelprm_wqe_segment_ctrl_send ctrl;
147
+	struct arbelprm_wqe_segment_ud ud;
148
+	struct arbelprm_wqe_segment_data_ptr data[ARBEL_MAX_GATHER];
149
+} __attribute__ (( packed ));
150
+
151
+#define ARBEL_MAX_SCATTER 1
152
+
153
+struct arbelprm_recv_wqe {
154
+	/* The autogenerated header is inconsistent between send and
155
+	 * receive WQEs.  The "ctrl" structure for receive WQEs is
156
+	 * defined to include the "next" structure.  Since the "ctrl"
157
+	 * part of the "ctrl" structure contains only "reserved, must
158
+	 * be zero" bits, we ignore its definition and provide
159
+	 * something more usable.
160
+	 */
161
+	struct arbelprm_recv_wqe_segment_next next;
162
+	uint32_t ctrl[2]; /* All "reserved, must be zero" */
163
+	struct arbelprm_wqe_segment_data_ptr data[ARBEL_MAX_SCATTER];
164
+} __attribute__ (( packed ));
165
+
166
+union arbelprm_completion_entry {
167
+	struct arbelprm_completion_queue_entry normal;
168
+	struct arbelprm_completion_with_error error;
169
+} __attribute__ (( packed ));
170
+
171
+union arbelprm_doorbell_record {
172
+	struct arbelprm_cq_arm_db_record cq_arm;
173
+	struct arbelprm_cq_ci_db_record cq_ci;
174
+	struct arbelprm_qp_db_record qp;
175
+} __attribute__ (( packed ));
176
+
177
+union arbelprm_doorbell_register {
178
+	struct arbelprm_send_doorbell send;
179
+	uint32_t dword[2];
180
+} __attribute__ (( packed ));
181
+
182
+union arbelprm_mad {
183
+	struct arbelprm_mad_ifc ifc;
184
+	union ib_mad mad;
185
+} __attribute__ (( packed ));
186
+
187
+/*
188
+ * gPXE-specific definitions
189
+ *
190
+ */
191
+
192
+/** Arbel device limits */
193
+struct arbel_dev_limits {
194
+	/** Number of reserved QPs */
195
+	unsigned int reserved_qps;
196
+	/** QP context entry size */
197
+	size_t qpc_entry_size;
198
+	/** Extended QP context entry size */
199
+	size_t eqpc_entry_size;
200
+	/** Number of reserved SRQs */
201
+	unsigned int reserved_srqs;
202
+	/** SRQ context entry size */
203
+	size_t srqc_entry_size;
204
+	/** Number of reserved EEs */
205
+	unsigned int reserved_ees;
206
+	/** EE context entry size */
207
+	size_t eec_entry_size;
208
+	/** Extended EE context entry size */
209
+	size_t eeec_entry_size;
210
+	/** Number of reserved CQs */
211
+	unsigned int reserved_cqs;
212
+	/** CQ context entry size */
213
+	size_t cqc_entry_size;
214
+	/** Number of reserved MTTs */
215
+	unsigned int reserved_mtts;
216
+	/** MTT entry size */
217
+	size_t mtt_entry_size;
218
+	/** Number of reserved MRWs */
219
+	unsigned int reserved_mrws;
220
+	/** MPT entry size */
221
+	size_t mpt_entry_size;
222
+	/** Number of reserved RDBs */
223
+	unsigned int reserved_rdbs;
224
+	/** EQ context entry size */
225
+	size_t eqc_entry_size;
226
+	/** Number of reserved UARs */
227
+	unsigned int reserved_uars;
228
+};
229
+
230
+/** Alignment of Arbel send work queue entries */
231
+#define ARBEL_SEND_WQE_ALIGN 128
232
+
233
+/** An Arbel send work queue entry */
234
+union arbel_send_wqe {
235
+	struct arbelprm_ud_send_wqe ud;
236
+	uint8_t force_align[ARBEL_SEND_WQE_ALIGN];
237
+} __attribute__ (( packed ));
238
+
239
+/** An Arbel send work queue */
240
+struct arbel_send_work_queue {
241
+	/** Doorbell record number */
242
+	unsigned int doorbell_idx;
243
+	/** Work queue entries */
244
+	union arbel_send_wqe *wqe;
245
+	/** Size of work queue */
246
+	size_t wqe_size;
247
+};
248
+
249
+/** Alignment of Arbel receive work queue entries */
250
+#define ARBEL_RECV_WQE_ALIGN 64
251
+
252
+/** An Arbel receive work queue entry */
253
+union arbel_recv_wqe {
254
+	struct arbelprm_recv_wqe recv;
255
+	uint8_t force_align[ARBEL_RECV_WQE_ALIGN];
256
+} __attribute__ (( packed ));
257
+
258
+/** An Arbel receive work queue */
259
+struct arbel_recv_work_queue {
260
+	/** Doorbell record number */
261
+	unsigned int doorbell_idx;
262
+	/** Work queue entries */
263
+	union arbel_recv_wqe *wqe;
264
+	/** Size of work queue */
265
+	size_t wqe_size;
266
+};
267
+
268
+/** Maximum number of allocatable queue pairs
269
+ *
270
+ * This is a policy decision, not a device limit.
271
+ */
272
+#define ARBEL_MAX_QPS		8
273
+
274
+/** Base queue pair number */
275
+#define ARBEL_QPN_BASE 0x550000
276
+
277
+/** An Arbel queue pair */
278
+struct arbel_queue_pair {
279
+	/** Send work queue */
280
+	struct arbel_send_work_queue send;
281
+	/** Receive work queue */
282
+	struct arbel_recv_work_queue recv;
283
+};
284
+
285
+/** Maximum number of allocatable completion queues
286
+ *
287
+ * This is a policy decision, not a device limit.
288
+ */
289
+#define ARBEL_MAX_CQS		8
290
+
291
+/** An Arbel completion queue */
292
+struct arbel_completion_queue {
293
+	/** Consumer counter doorbell record number */
294
+	unsigned int ci_doorbell_idx;
295
+	/** Arm queue doorbell record number */
296
+	unsigned int arm_doorbell_idx;
297
+	/** Completion queue entries */
298
+	union arbelprm_completion_entry *cqe;
299
+	/** Size of completion queue */
300
+	size_t cqe_size;
301
+};
302
+
303
/** Storage element of an Arbel resource-allocation bitmask */
typedef uint32_t arbel_bitmask_t;

/** Number of resource bits held by a single bitmask element */
#define ARBEL_BITMASK_BITS	( 8 * sizeof ( arbel_bitmask_t ) )

/**
 * Number of bitmask elements needed to track a set of resources
 *
 * @v max_entries	Number of resources to be tracked
 * @ret num_elements	Number of arbel_bitmask_t elements, rounded up
 */
#define ARBEL_BITMASK_SIZE( max_entries )				     \
	( ( (max_entries) + ARBEL_BITMASK_BITS - 1 ) / ARBEL_BITMASK_BITS )
310
+
311
+/** An Arbel device */
312
+struct arbel {
313
+	/** PCI configuration registers */
314
+	void *config;
315
+	/** PCI user Access Region */
316
+	void *uar;
317
+
318
+	/** Command input mailbox */
319
+	void *mailbox_in;
320
+	/** Command output mailbox */
321
+	void *mailbox_out;
322
+
323
+	/** Firmware area in external memory */
324
+	userptr_t firmware_area;
325
+	/** ICM size */
326
+	size_t icm_len;
327
+	/** ICM AUX size */
328
+	size_t icm_aux_len;
329
+	/** ICM area */
330
+	userptr_t icm;
331
+
332
+	/** Doorbell records */
333
+	union arbelprm_doorbell_record *db_rec;
334
+	/** Reserved LKey
335
+	 *
336
+	 * Used to get unrestricted memory access.
337
+	 */
338
+	unsigned long reserved_lkey;
339
+
340
+	/** Completion queue in-use bitmask */
341
+	arbel_bitmask_t cq_inuse[ ARBEL_BITMASK_SIZE ( ARBEL_MAX_CQS ) ];
342
+	/** Queue pair in-use bitmask */
343
+	arbel_bitmask_t qp_inuse[ ARBEL_BITMASK_SIZE ( ARBEL_MAX_QPS ) ];
344
+	
345
+	/** Device limits */
346
+	struct arbel_dev_limits limits;
347
+};
348
+
349
+/** Global protection domain */
350
+#define ARBEL_GLOBAL_PD			0x123456
351
+
352
+/** Memory key prefix */
353
+#define ARBEL_MKEY_PREFIX		0x77000000UL
354
+
355
+/*
356
+ * HCA commands
357
+ *
358
+ */
359
+
360
+#define ARBEL_HCR_BASE			0x80680
361
+#define ARBEL_HCR_REG(x)		( ARBEL_HCR_BASE + 4 * (x) )
362
+#define ARBEL_HCR_MAX_WAIT_MS		2000
363
+#define ARBEL_MBOX_ALIGN		4096
364
+#define ARBEL_MBOX_SIZE			512
365
+
366
/* HCA command is split into
 *
 * bits  11:0	Opcode
 * bit     12	Input uses mailbox
 * bit     13	Output uses mailbox
 * bits 22:14	Input parameter length (in dwords)
 * bits 31:23	Output parameter length (in dwords)
 *
 * Encoding the information in this way allows us to cut out several
 * parameters to the arbel_command() call.
 */

/** Flag: input parameter is passed via mailbox */
#define ARBEL_HCR_IN_MBOX		0x00001000UL
/** Flag: output parameter is passed via mailbox */
#define ARBEL_HCR_OUT_MBOX		0x00002000UL

/** Extract opcode from an encoded HCR command */
#define ARBEL_HCR_OPCODE( _command )	( (_command) & 0xfff )

/** Extract input parameter length (in bytes) from an encoded HCR command
 *
 * The dword count in bits 22:14 is shifted down to bits 10:2, which
 * yields the length already multiplied by four.
 */
#define ARBEL_HCR_IN_LEN( _command )	( ( (_command) >> 12 ) & 0x7fc )

/** Extract output parameter length (in bytes) from an encoded HCR command
 *
 * The dword count in bits 31:23 is shifted down to bits 10:2, which
 * yields the length already multiplied by four.
 */
#define ARBEL_HCR_OUT_LEN( _command )	( ( (_command) >> 21 ) & 0x7fc )

/** Build HCR command from component parts
 *
 * @v _opcode		Command opcode (one of the ARBEL_HCR_xxx opcodes)
 * @v _in_mbox		Non-zero if the input parameter uses a mailbox
 * @v _in_len		Input parameter length, in bytes
 * @v _out_mbox		Non-zero if the output parameter uses a mailbox
 * @v _out_len		Output parameter length, in bytes
 * @ret command		Encoded command
 *
 * The dword counts are converted to unsigned long before shifting:
 * shifting a plain int length into bits 31:23 can overflow a signed
 * int, which is undefined behaviour and sign-extends into the upper
 * bits of the command word where unsigned long is 64 bits wide.
 */
#define ARBEL_HCR_INOUT_CMD( _opcode, _in_mbox, _in_len,		     \
			     _out_mbox, _out_len )			     \
	( (_opcode) |							     \
	  ( (_in_mbox) ? ARBEL_HCR_IN_MBOX : 0 ) |			     \
	  ( ( ( unsigned long ) ( (_in_len) / 4 ) ) << 14 ) |		     \
	  ( (_out_mbox) ? ARBEL_HCR_OUT_MBOX : 0 ) |			     \
	  ( ( ( unsigned long ) ( (_out_len) / 4 ) ) << 23 ) )

/** Build HCR command that has only an input parameter */
#define ARBEL_HCR_IN_CMD( _opcode, _in_mbox, _in_len )			     \
	ARBEL_HCR_INOUT_CMD ( _opcode, _in_mbox, _in_len, 0, 0 )

/** Build HCR command that has only an output parameter */
#define ARBEL_HCR_OUT_CMD( _opcode, _out_mbox, _out_len )		     \
	ARBEL_HCR_INOUT_CMD ( _opcode, 0, 0, _out_mbox, _out_len )

/** Build HCR command that has neither input nor output parameter */
#define ARBEL_HCR_VOID_CMD( _opcode )					     \
	ARBEL_HCR_INOUT_CMD ( _opcode, 0, 0, 0, 0 )
400
+
401
+/*
402
+ * Doorbell record allocation
403
+ *
404
+ * The doorbell record map looks like:
405
+ *
406
+ *    ARBEL_MAX_CQS * Arm completion queue doorbell
407
+ *    ARBEL_MAX_QPS * Send work request doorbell
408
+ *    Group separator
409
+ *    ...(empty space)...
410
+ *    ARBEL_MAX_QPS * Receive work request doorbell
411
+ *    ARBEL_MAX_CQS * Completion queue consumer counter update doorbell
412
+ */
413
+
414
+#define ARBEL_MAX_DOORBELL_RECORDS 512
415
+#define ARBEL_GROUP_SEPARATOR_DOORBELL ( ARBEL_MAX_CQS + ARBEL_MAX_QPS )
416
+
417
/**
 * Map a completion queue to its "arm" doorbell record
 *
 * @v cqn_offset	Completion queue number offset
 * @ret doorbell_idx	Doorbell index
 *
 * The index returned is the completion queue number offset itself.
 */
static inline unsigned int
arbel_cq_arm_doorbell_idx ( unsigned int cqn_offset ) {
	unsigned int doorbell_idx = cqn_offset;

	return doorbell_idx;
}
427
+
428
+/**
429
+ * Get send work request doorbell index
430
+ *
431
+ * @v qpn_offset	Queue pair number offset
432
+ * @ret doorbell_idx	Doorbell index
433
+ */
434
+static inline unsigned int
435
+arbel_send_doorbell_idx ( unsigned int qpn_offset ) {
436
+	return ( ARBEL_MAX_CQS + qpn_offset );
437
+}
438
+
439
+/**
440
+ * Get receive work request doorbell index
441
+ *
442
+ * @v qpn_offset	Queue pair number offset
443
+ * @ret doorbell_idx	Doorbell index
444
+ */
445
+static inline unsigned int
446
+arbel_recv_doorbell_idx ( unsigned int qpn_offset ) {
447
+	return ( ARBEL_MAX_DOORBELL_RECORDS - ARBEL_MAX_CQS - qpn_offset - 1 );
448
+}
449
+
450
+/**
451
+ * Get completion queue consumer counter doorbell index
452
+ *
453
+ * @v cqn_offset	Completion queue number offset
454
+ * @ret doorbell_idx	Doorbell index
455
+ */
456
+static inline unsigned int
457
+arbel_cq_ci_doorbell_idx ( unsigned int cqn_offset ) {
458
+	return ( ARBEL_MAX_DOORBELL_RECORDS - cqn_offset - 1 );
459
+}
460
+
461
+#endif /* _ARBEL_H */

+ 209
- 0
src/drivers/infiniband/mlx_bitops.h View File

@@ -0,0 +1,209 @@
1
+#ifndef _MLX_BITOPS_H
2
+#define _MLX_BITOPS_H
3
+
4
+/*
5
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
6
+ *
7
+ * This program is free software; you can redistribute it and/or
8
+ * modify it under the terms of the GNU General Public License as
9
+ * published by the Free Software Foundation; either version 2 of the
10
+ * License, or any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful, but
13
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+ * General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20
+ */
21
+
22
+/**
23
+ * @file
24
+ *
25
+ * Mellanox bit operations
26
+ *
27
+ */
28
+
29
+/* Datatype used to represent a bit in the Mellanox autogenerated headers */
30
+typedef unsigned char pseudo_bit_t;
31
+
32
+/**
33
+ * Wrapper structure for pseudo_bit_t structures
34
+ *
35
+ * This structure provides a wrapper around the autogenerated
36
+ * pseudo_bit_t structures.  It has the correct size, and also
37
+ * encapsulates type information about the underlying pseudo_bit_t
38
+ * structure, which allows the MLX_FILL etc. macros to work without
39
+ * requiring explicit type information.
40
+ */
41
+#define MLX_DECLARE_STRUCT( _structure )				     \
42
+	_structure {							     \
43
+	    union {							     \
44
+		uint8_t bytes[ sizeof ( struct _structure ## _st ) / 8 ];    \
45
+		uint32_t dwords[ sizeof ( struct _structure ## _st ) / 32 ]; \
46
+		struct _structure ## _st *dummy[0];			     \
47
+	    } u;							     \
48
+	}
49
+
50
+/** Get pseudo_bit_t structure type from wrapper structure pointer */
51
+#define MLX_PSEUDO_STRUCT( _ptr )					     \
52
+	typeof ( *((_ptr)->u.dummy[0]) )
53
+
54
/**
 * Bit offset of a field within a pseudo_bit_t structure
 *
 * Each bit is represented by one byte of the pseudo_bit_t structure,
 * so the byte offset of the field is its bit offset.
 */
#define MLX_BIT_OFFSET( _structure_st, _field )				     \
	offsetof ( _structure_st, _field )

/** Index of the dword containing a field of a pseudo_bit_t structure */
#define MLX_DWORD_OFFSET( _structure_st, _field )			     \
	( MLX_BIT_OFFSET ( _structure_st, _field ) / 32 )

/**
 * Bit offset of a field relative to the start of dword @c _index
 *
 * Yes, using mod-32 would work, but would lose the check for the
 * error of specifying a mismatched field name and dword index.
 */
#define MLX_DWORD_BIT_OFFSET( _structure_st, _index, _field )		     \
	( MLX_BIT_OFFSET ( _structure_st, _field ) - ( 32 * (_index) ) )

/**
 * Bit width of a field within a pseudo_bit_t structure
 *
 * The field is an array with one byte-sized element per bit, so its
 * size in bytes is its width in bits.
 */
#define MLX_BIT_WIDTH( _structure_st, _field )				     \
	sizeof ( ( ( _structure_st * ) 0 )->_field )

/**
 * Right-aligned bit mask for a field within a pseudo_bit_t structure
 *
 * An all-ones dword is shifted right until only as many low-order
 * bits as the field is wide remain set.
 */
#define MLX_BIT_MASK( _structure_st, _field )				     \
	( ( ( uint32_t ) -1 ) >>					     \
	  ( 32 - MLX_BIT_WIDTH ( _structure_st, _field ) ) )
78
+
79
+/*
80
+ * Assemble native-endian dword from named fields and values
81
+ *
82
+ */
83
+
84
+#define MLX_ASSEMBLE_1( _structure_st, _index, _field, _value )		     \
85
+	( (_value) << MLX_DWORD_BIT_OFFSET ( _structure_st, _index, _field ) )
86
+
87
+#define MLX_ASSEMBLE_2( _structure_st, _index, _field, _value, ... )	     \
88
+	( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) |	     \
89
+	  MLX_ASSEMBLE_1 ( _structure_st, _index, __VA_ARGS__ ) )
90
+
91
+#define MLX_ASSEMBLE_3( _structure_st, _index, _field, _value, ... )	     \
92
+	( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) |	     \
93
+	  MLX_ASSEMBLE_2 ( _structure_st, _index, __VA_ARGS__ ) )
94
+
95
+#define MLX_ASSEMBLE_4( _structure_st, _index, _field, _value, ... )	     \
96
+	( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) |	     \
97
+	  MLX_ASSEMBLE_3 ( _structure_st, _index, __VA_ARGS__ ) )
98
+
99
+#define MLX_ASSEMBLE_5( _structure_st, _index, _field, _value, ... )	     \
100
+	( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) |	     \
101
+	  MLX_ASSEMBLE_4 ( _structure_st, _index, __VA_ARGS__ ) )
102
+
103
+#define MLX_ASSEMBLE_6( _structure_st, _index, _field, _value, ... )	     \
104
+	( MLX_ASSEMBLE_1 ( _structure_st, _index, _field, _value ) |	     \
105
+	  MLX_ASSEMBLE_5 ( _structure_st, _index, __VA_ARGS__ ) )
106
+
107
+/*
108
+ * Build native-endian (positive) dword bitmasks from named fields
109
+ *
110
+ */
111
+
112
+#define MLX_MASK_1( _structure_st, _index, _field )			     \
113
+	( MLX_BIT_MASK ( _structure_st, _field ) <<			     \
114
+	  MLX_DWORD_BIT_OFFSET ( _structure_st, _index, _field ) )
115
+
116
+#define MLX_MASK_2( _structure_st, _index, _field, ... )		     \
117
+	( MLX_MASK_1 ( _structure_st, _index, _field ) |		     \
118
+	  MLX_MASK_1 ( _structure_st, _index, __VA_ARGS__ ) )
119
+
120
+#define MLX_MASK_3( _structure_st, _index, _field, ... )		     \
121
+	( MLX_MASK_1 ( _structure_st, _index, _field ) |		     \
122
+	  MLX_MASK_2 ( _structure_st, _index, __VA_ARGS__ ) )
123
+
124
+#define MLX_MASK_4( _structure_st, _index, _field, ... )		     \
125
+	( MLX_MASK_1 ( _structure_st, _index, _field ) |		     \
126
+	  MLX_MASK_3 ( _structure_st, _index, __VA_ARGS__ ) )
127
+
128
+#define MLX_MASK_5( _structure_st, _index, _field, ... )		     \
129
+	( MLX_MASK_1 ( _structure_st, _index, _field ) |		     \
130
+	  MLX_MASK_4 ( _structure_st, _index, __VA_ARGS__ ) )
131
+
132
+#define MLX_MASK_6( _structure_st, _index, _field, ... )		     \
133
+	( MLX_MASK_1 ( _structure_st, _index, _field ) |		     \
134
+	  MLX_MASK_5 ( _structure_st, _index, __VA_ARGS__ ) )
135
+
136
+/*
137
+ * Populate big-endian dwords from named fields and values
138
+ *
139
+ */
140
+
141
+#define MLX_FILL( _ptr, _index, _assembled )				     \
142
+	do {								     \
143
+		uint32_t *__ptr = &(_ptr)->u.dwords[(_index)];		     \
144
+		uint32_t __assembled = (_assembled);			     \
145
+		*__ptr = cpu_to_be32 ( __assembled );			     \
146
+	} while ( 0 )
147
+
148
+#define MLX_FILL_1( _ptr, _index, ... )					     \
149
+	MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_1 ( MLX_PSEUDO_STRUCT ( _ptr ),\
150
+						  _index, __VA_ARGS__ ) )
151
+
152
+#define MLX_FILL_2( _ptr, _index, ... )					     \
153
+	MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_2 ( MLX_PSEUDO_STRUCT ( _ptr ),\
154
+						  _index, __VA_ARGS__ ) )
155
+
156
+#define MLX_FILL_3( _ptr, _index, ... )					     \
157
+	MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_3 ( MLX_PSEUDO_STRUCT ( _ptr ),\
158
+						  _index, __VA_ARGS__ ) )
159
+
160
+#define MLX_FILL_4( _ptr, _index, ... )					     \
161
+	MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_4 ( MLX_PSEUDO_STRUCT ( _ptr ),\
162
+						  _index, __VA_ARGS__ ) )
163
+
164
+#define MLX_FILL_5( _ptr, _index, ... )					     \
165
+	MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_5 ( MLX_PSEUDO_STRUCT ( _ptr ),\
166
+						  _index, __VA_ARGS__ ) )
167
+
168
+#define MLX_FILL_6( _ptr, _index, ... )					     \
169
+	MLX_FILL ( _ptr, _index, MLX_ASSEMBLE_6 ( MLX_PSEUDO_STRUCT ( _ptr ),\
170
+						  _index, __VA_ARGS__ ) )
171
+
172
+/*
173
+ * Modify big-endian dword using named field and value
174
+ *
175
+ */
176
+
177
+#define MLX_SET( _ptr, _field, _value )					     \
178
+	do {								     \
179
+		unsigned int __index = 					     \
180
+		    MLX_DWORD_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ), _field ); \
181
+		uint32_t *__ptr = &(_ptr)->u.dwords[__index];		     \
182
+		uint32_t __value = be32_to_cpu ( *__ptr );		     \
183
+		__value &= ~( MLX_MASK_1 ( MLX_PSEUDO_STRUCT ( _ptr ),	     \
184
+					   __index, _field ) );		     \
185
+		__value |= MLX_ASSEMBLE_1 ( MLX_PSEUDO_STRUCT ( _ptr ),	     \
186
+					    __index, _field, _value );	     \
187
+		*__ptr = cpu_to_be32 ( __value );			     \
188
+	} while ( 0 )
189
+
190
+/*
191
+ * Extract value of named field
192
+ *
193
+ */
194
+
195
+#define MLX_GET( _ptr, _field )						     \
196
+	( {								     \
197
+		unsigned int __index = 					     \
198
+		    MLX_DWORD_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ), _field ); \
199
+		uint32_t *__ptr = &(_ptr)->u.dwords[__index];		     \
200
+		uint32_t __value = be32_to_cpu ( *__ptr );		     \
201
+		__value >>=						     \
202
+		    MLX_DWORD_BIT_OFFSET ( MLX_PSEUDO_STRUCT ( _ptr ),	     \
203
+					    __index, _field );		     \
204
+		__value &=						     \
205
+		    MLX_BIT_MASK ( MLX_PSEUDO_STRUCT ( _ptr ), _field );     \
206
+		__value;						     \
207
+	} )
208
+
209
+#endif /* _MLX_BITOPS_H */

+ 930
- 0
src/drivers/net/ipoib.c View File

@@ -0,0 +1,930 @@
1
+/*
2
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
3
+ *
4
+ * This program is free software; you can redistribute it and/or
5
+ * modify it under the terms of the GNU General Public License as
6
+ * published by the Free Software Foundation; either version 2 of the
7
+ * License, or any later version.
8
+ *
9
+ * This program is distributed in the hope that it will be useful, but
10
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
+ * General Public License for more details.
13
+ *
14
+ * You should have received a copy of the GNU General Public License
15
+ * along with this program; if not, write to the Free Software
16
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
+ */
18
+
19
+#include <stdint.h>
20
+#include <stdio.h>
21
+#include <unistd.h>
22
+#include <string.h>
23
+#include <byteswap.h>
24
+#include <errno.h>
25
+#include "timer.h"
26
+#include <gpxe/if_arp.h>
27
+#include <gpxe/iobuf.h>
28
+#include <gpxe/netdevice.h>
29
+#include <gpxe/infiniband.h>
30
+#include <gpxe/ipoib.h>
31
+
32
+/** @file
33
+ *
34
+ * IP over Infiniband
35
+ */
36
+
37
+/** IPoIB MTU */
38
+#define IPOIB_MTU 2048
39
+
40
+/** Number of IPoIB data send work queue entries */
41
+#define IPOIB_DATA_NUM_SEND_WQES 2
42
+
43
+/** Number of IPoIB data receive work queue entries */
44
+#define IPOIB_DATA_NUM_RECV_WQES 4
45
+
46
+/** Number of IPoIB data completion entries */
47
+#define IPOIB_DATA_NUM_CQES 8
48
+
49
+/** Number of IPoIB metadata send work queue entries */
50
+#define IPOIB_META_NUM_SEND_WQES 2
51
+
52
+/** Number of IPoIB metadata receive work queue entries */
53
+#define IPOIB_META_NUM_RECV_WQES 2
54
+
55
+/** Number of IPoIB metadata completion entries */
56
+#define IPOIB_META_NUM_CQES 8
57
+
58
+/** An IPoIB queue set */
59
+struct ipoib_queue_set {
60
+	/** Completion queue */
61
+	struct ib_completion_queue *cq;
62
+	/** Queue pair */
63
+	struct ib_queue_pair *qp;
64
+	/** Receive work queue fill level */
65
+	unsigned int recv_fill;
66
+	/** Receive work queue maximum fill level */
67
+	unsigned int recv_max_fill;
68
+};
69
+
70
+/** An IPoIB device */
71
+struct ipoib_device {
72
+	/** Network device */
73
+	struct net_device *netdev;
74
+	/** Underlying Infiniband device */
75
+	struct ib_device *ibdev;
76
+	/** Data queue set */
77
+	struct ipoib_queue_set data;
78
+	/** Metadata queue set */
79
+	struct ipoib_queue_set meta;
80
+	/** Broadcast GID */
81
+	struct ib_gid broadcast_gid;
82
+	/** Broadcast LID */
83
+	unsigned int broadcast_lid;
84
+	/** Joined to broadcast group */
85
+	int broadcast_joined;
86
+	/** Data queue key */
87
+	unsigned long data_qkey;
88
+};
89
+
90
+/**
91
+ * IPoIB path cache entry
92
+ *
93
+ * This serves a similar role to the ARP cache for Ethernet.  (ARP
94
+ * *is* used on IPoIB; we have two caches to maintain.)
95
+ */
96
+struct ipoib_cached_path {
97
+	/** Destination GID */
98
+	struct ib_gid gid;
99
+	/** Destination LID */
100
+	unsigned int dlid;
101
+	/** Service level */
102
+	unsigned int sl;
103
+	/** Rate */
104
+	unsigned int rate;
105
+};
106
+
107
+/** Number of IPoIB path cache entries */
108
+#define IPOIB_NUM_CACHED_PATHS 2
109
+
110
+/** IPoIB path cache */
111
+static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
112
+
113
+/** Oldest IPoIB path cache entry index */
114
+static unsigned int ipoib_path_cache_idx = 0;
115
+
116
+/** TID half used to identify get path record replies */
117
+#define IPOIB_TID_GET_PATH_REC 0x11111111UL
118
+
119
+/** TID half used to identify multicast member record replies */
120
+#define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
121
+
122
+/** IPoIB metadata TID */
123
+static uint32_t ipoib_meta_tid = 0;
124
+
125
+/** IPv4 broadcast GID */
126
+static const struct ib_gid ipv4_broadcast_gid = {
127
+	{ { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
128
+	    0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
129
+};
130
+
131
+/** Maximum time we will wait for the broadcast join to succeed */
132
+#define IPOIB_JOIN_MAX_DELAY_MS 1000
133
+
134
+/****************************************************************************
135
+ *
136
+ * IPoIB link layer
137
+ *
138
+ ****************************************************************************
139
+ */
140
+
141
+/** Broadcast QPN used in IPoIB MAC addresses
142
+ *
143
+ * This is a guaranteed invalid real QPN
144
+ */
145
+#define IPOIB_BROADCAST_QPN 0xffffffffUL
146
+
147
+/** Broadcast IPoIB address */
148
+static struct ipoib_mac ipoib_broadcast = {
149
+	.qpn = ntohl ( IPOIB_BROADCAST_QPN ),
150
+};
151
+
152
+/**
153
+ * Transmit IPoIB packet
154
+ *
155
+ * @v iobuf		I/O buffer
156
+ * @v netdev		Network device
157
+ * @v net_protocol	Network-layer protocol
158
+ * @v ll_dest		Link-layer destination address
159
+ *
160
+ * Prepends the IPoIB link-layer header and transmits the packet.
161
+ */
162
+static int ipoib_tx ( struct io_buffer *iobuf, struct net_device *netdev,
163
+		      struct net_protocol *net_protocol,
164
+		      const void *ll_dest ) {
165
+	struct ipoib_hdr *ipoib_hdr =
166
+		iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
167
+
168
+	/* Build IPoIB header */
169
+	memcpy ( &ipoib_hdr->pseudo.peer, ll_dest,
170
+		 sizeof ( ipoib_hdr->pseudo.peer ) );
171
+	ipoib_hdr->real.proto = net_protocol->net_proto;
172
+	ipoib_hdr->real.reserved = 0;
173
+
174
+	/* Hand off to network device */
175
+	return netdev_tx ( netdev, iobuf );
176
+}
177
+
178
+/**
179
+ * Process received IPoIB packet
180
+ *
181
+ * @v iobuf	I/O buffer
182
+ * @v netdev	Network device
183
+ *
184
+ * Strips off the IPoIB link-layer header and passes up to the
185
+ * network-layer protocol.
186
+ */
187
+static int ipoib_rx ( struct io_buffer *iobuf, struct net_device *netdev ) {
188
+	struct ipoib_hdr *ipoib_hdr = iobuf->data;
189
+
190
+	/* Sanity check */
191
+	if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
192
+		DBG ( "IPoIB packet too short for link-layer header\n" );
193
+		DBG_HD ( iobuf->data, iob_len ( iobuf ) );
194
+		free_iob ( iobuf );
195
+		return -EINVAL;
196
+	}
197
+
198
+	/* Strip off IPoIB header */
199
+	iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
200
+
201
+	/* Hand off to network-layer protocol */
202
+	return net_rx ( iobuf, netdev, ipoib_hdr->real.proto,
203
+			&ipoib_hdr->pseudo.peer );
204
+}
205
+
206
+/**
207
+ * Transcribe IPoIB address
208
+ *
209
+ * @v ll_addr	Link-layer address
210
+ * @ret string	Link-layer address in human-readable format
211
+ */
212
+const char * ipoib_ntoa ( const void *ll_addr ) {
213
+	static char buf[45];
214
+	const struct ipoib_mac *mac = ll_addr;
215
+
216
+	snprintf ( buf, sizeof ( buf ), "%08lx:%08lx:%08lx:%08lx:%08lx",
217
+		   htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
218
+		   htonl ( mac->gid.u.dwords[1] ),
219
+		   htonl ( mac->gid.u.dwords[2] ),
220
+		   htonl ( mac->gid.u.dwords[3] ) );
221
+	return buf;
222
+}
223
+
224
+/** IPoIB protocol */
225
+struct ll_protocol ipoib_protocol __ll_protocol = {
226
+	.name		= "IPoIB",
227
+	.ll_proto	= htons ( ARPHRD_INFINIBAND ),
228
+	.ll_addr_len	= IPOIB_ALEN,
229
+	.ll_header_len	= IPOIB_HLEN,
230
+	.ll_broadcast	= ( uint8_t * ) &ipoib_broadcast,
231
+	.tx		= ipoib_tx,
232
+	.rx		= ipoib_rx,
233
+	.ntoa		= ipoib_ntoa,
234
+};
235
+
236
+/****************************************************************************
237
+ *
238
+ * IPoIB network device
239
+ *
240
+ ****************************************************************************
241
+ */
242
+
243
+/**
244
+ * Destroy queue set
245
+ *
246
+ * @v ipoib		IPoIB device
247
+ * @v qset		Queue set
248
+ */
249
+static void ipoib_destroy_qset ( struct ipoib_device *ipoib,
250
+				 struct ipoib_queue_set *qset ) {
251
+	struct ib_device *ibdev = ipoib->ibdev;
252
+
253
+	if ( qset->qp )
254
+		ib_destroy_qp ( ibdev, qset->qp );
255
+	if ( qset->cq )
256
+		ib_destroy_cq ( ibdev, qset->cq );
257
+	memset ( qset, 0, sizeof ( *qset ) );
258
+}
259
+
260
+/**
261
+ * Create queue set
262
+ *
263
+ * @v ipoib		IPoIB device
264
+ * @v qset		Queue set
265
+ * @ret rc		Return status code
266
+ */
267
+static int ipoib_create_qset ( struct ipoib_device *ipoib,
268
+			       struct ipoib_queue_set *qset,
269
+			       unsigned int num_cqes,
270
+			       unsigned int num_send_wqes,
271
+			       unsigned int num_recv_wqes,
272
+			       unsigned long qkey ) {
273
+	struct ib_device *ibdev = ipoib->ibdev;
274
+	int rc;
275
+
276
+	/* Store queue parameters */
277
+	qset->recv_max_fill = num_recv_wqes;
278
+
279
+	/* Allocate completion queue */
280
+	qset->cq = ib_create_cq ( ibdev, num_cqes );
281
+	if ( ! qset->cq ) {
282
+		DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
283
+		       ipoib );
284
+		rc = -ENOMEM;
285
+		goto err;
286
+	}
287
+
288
+	/* Allocate queue pair */
289
+	qset->qp = ib_create_qp ( ibdev, num_send_wqes, qset->cq,
290
+				  num_recv_wqes, qset->cq, qkey );
291
+	if ( ! qset->qp ) {
292
+		DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
293
+		       ipoib );
294
+		rc = -ENOMEM;
295
+		goto err;
296
+	}
297
+	qset->qp->owner_priv = ipoib->netdev;
298
+
299
+	return 0;
300
+
301
+ err:
302
+	ipoib_destroy_qset ( ipoib, qset );
303
+	return rc;
304
+}
305
+
306
+/**
307
+ * Find path cache entry by GID
308
+ *
309
+ * @v gid		GID
310
+ * @ret entry		Path cache entry, or NULL
311
+ */
312
+static struct ipoib_cached_path *
313
+ipoib_find_cached_path ( struct ib_gid *gid ) {
314
+	struct ipoib_cached_path *path;
315
+	unsigned int i;
316
+
317
+	for ( i = 0 ; i < IPOIB_NUM_CACHED_PATHS ; i++ ) {
318
+		path = &ipoib_path_cache[i];
319
+		if ( memcmp ( &path->gid, gid, sizeof ( *gid ) ) == 0 )
320
+			return path;
321
+	}
322
+	DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
323
+	      htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
324
+	      htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
325
+	return NULL;
326
+}
327
+
328
+/**
329
+ * Transmit path record request
330
+ *
331
+ * @v ipoib		IPoIB device
332
+ * @v gid		Destination GID
333
+ * @ret rc		Return status code
334
+ */
335
+static int ipoib_get_path_record ( struct ipoib_device *ipoib,
336
+				   struct ib_gid *gid ) {
337
+	struct ib_device *ibdev = ipoib->ibdev;
338
+	struct io_buffer *iobuf;
339
+	struct ib_mad_path_record *path_record;
340
+	struct ib_address_vector av;
341
+	int rc;
342
+
343
+	/* Allocate I/O buffer */
344
+	iobuf = alloc_iob ( sizeof ( *path_record ) );
345
+	if ( ! iobuf )
346
+		return -ENOMEM;
347
+	iob_put ( iobuf, sizeof ( *path_record ) );
348
+	path_record = iobuf->data;
349
+	memset ( path_record, 0, sizeof ( *path_record ) );
350
+
351
+	/* Construct path record request */
352
+	path_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
353
+	path_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
354
+	path_record->mad_hdr.class_version = 2;
355
+	path_record->mad_hdr.method = IB_MGMT_METHOD_GET;
356
+	path_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_PATH_REC );
357
+	path_record->mad_hdr.tid[0] = IPOIB_TID_GET_PATH_REC;
358
+	path_record->mad_hdr.tid[1] = ipoib_meta_tid++;
359
+	path_record->sa_hdr.comp_mask[1] =
360
+		htonl ( IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID );
361
+	memcpy ( &path_record->dgid, gid, sizeof ( path_record->dgid ) );
362
+	memcpy ( &path_record->sgid, &ibdev->port_gid,
363
+		 sizeof ( path_record->sgid ) );
364
+
365
+	/* Construct address vector */
366
+	memset ( &av, 0, sizeof ( av ) );
367
+	av.dlid = ibdev->sm_lid;
368
+	av.dest_qp = IB_SA_QPN;
369
+	av.qkey = IB_GLOBAL_QKEY;
370
+
371
+	/* Post send request */
372
+	if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
373
+				   iobuf ) ) != 0 ) {
374
+		DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
375
+		       ipoib, strerror ( rc ) );
376
+		free_iob ( iobuf );
377
+		return rc;
378
+	}
379
+
380
+	return 0;
381
+}
382
+
383
+/**
384
+ * Transmit multicast group membership request
385
+ *
386
+ * @v ipoib		IPoIB device
387
+ * @v gid		Multicast GID
388
+ * @v join		Join (rather than leave) group
389
+ * @ret rc		Return status code
390
+ */
391
+static int ipoib_mc_member_record ( struct ipoib_device *ipoib,
392
+				    struct ib_gid *gid, int join ) {
393
+	struct ib_device *ibdev = ipoib->ibdev;
394
+	struct io_buffer *iobuf;
395
+	struct ib_mad_mc_member_record *mc_member_record;
396
+	struct ib_address_vector av;
397
+	int rc;
398
+
399
+	/* Allocate I/O buffer */
400
+	iobuf = alloc_iob ( sizeof ( *mc_member_record ) );
401
+	if ( ! iobuf )
402
+		return -ENOMEM;
403
+	iob_put ( iobuf, sizeof ( *mc_member_record ) );
404
+	mc_member_record = iobuf->data;
405
+	memset ( mc_member_record, 0, sizeof ( *mc_member_record ) );
406
+
407
+	/* Construct path record request */
408
+	mc_member_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
409
+	mc_member_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
410
+	mc_member_record->mad_hdr.class_version = 2;
411
+	mc_member_record->mad_hdr.method = 
412
+		( join ? IB_MGMT_METHOD_SET : IB_MGMT_METHOD_DELETE );
413
+	mc_member_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_MC_MEMBER_REC );
414
+	mc_member_record->mad_hdr.tid[0] = IPOIB_TID_MC_MEMBER_REC;
415
+	mc_member_record->mad_hdr.tid[1] = ipoib_meta_tid++;
416
+	mc_member_record->sa_hdr.comp_mask[1] =
417
+		htonl ( IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
418
+			IB_SA_MCMEMBER_REC_JOIN_STATE );
419
+	mc_member_record->scope__join_state = 1;
420
+	memcpy ( &mc_member_record->mgid, gid,
421
+		 sizeof ( mc_member_record->mgid ) );
422
+	memcpy ( &mc_member_record->port_gid, &ibdev->port_gid,
423
+		 sizeof ( mc_member_record->port_gid ) );
424
+
425
+	/* Construct address vector */
426
+	memset ( &av, 0, sizeof ( av ) );
427
+	av.dlid = ibdev->sm_lid;
428
+	av.dest_qp = IB_SA_QPN;
429
+	av.qkey = IB_GLOBAL_QKEY;
430
+
431
+	/* Post send request */
432
+	if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
433
+				   iobuf ) ) != 0 ) {
434
+		DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
435
+		       ipoib, strerror ( rc ) );
436
+		free_iob ( iobuf );
437
+		return rc;
438
+	}
439
+
440
+	return 0;
441
+}
442
+
443
+/**
444
+ * Transmit packet via IPoIB network device
445
+ *
446
+ * @v netdev		Network device
447
+ * @v iobuf		I/O buffer
448
+ * @ret rc		Return status code
449
+ */
450
+static int ipoib_transmit ( struct net_device *netdev,
451
+			    struct io_buffer *iobuf ) {
452
+	struct ipoib_device *ipoib = netdev->priv;
453
+	struct ib_device *ibdev = ipoib->ibdev;
454
+	struct ipoib_pseudo_hdr *ipoib_pshdr = iobuf->data;
455
+	struct ib_address_vector av;
456
+	struct ib_gid *gid;
457
+	struct ipoib_cached_path *path;
458
+	int rc;
459
+
460
+	/* Sanity check */
461
+	if ( iob_len ( iobuf ) < sizeof ( *ipoib_pshdr ) ) {
462
+		DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
463
+		return -EINVAL;
464
+	}
465
+	iob_pull ( iobuf, ( sizeof ( *ipoib_pshdr ) ) );
466
+
467
+	/* Construct address vector */
468
+	memset ( &av, 0, sizeof ( av ) );
469
+	av.qkey = IB_GLOBAL_QKEY;
470
+	av.gid_present = 1;
471
+	if ( ipoib_pshdr->peer.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
472
+		/* Broadcast address */
473
+		av.dest_qp = IB_BROADCAST_QPN;
474
+		av.dlid = ipoib->broadcast_lid;
475
+		gid = &ipoib->broadcast_gid;
476
+	} else {
477
+		/* Unicast - look in path cache */
478
+		path = ipoib_find_cached_path ( &ipoib_pshdr->peer.gid );
479
+		if ( ! path ) {
480
+			/* No path entry - get path record */
481
+			rc = ipoib_get_path_record ( ipoib,
482
+						     &ipoib_pshdr->peer.gid );
483
+			netdev_tx_complete ( netdev, iobuf );
484
+			return rc;
485
+		}
486
+		av.dest_qp = ntohl ( ipoib_pshdr->peer.qpn );
487
+		av.dlid = path->dlid;
488
+		av.rate = path->rate;
489
+		av.sl = path->sl;
490
+		gid = &ipoib_pshdr->peer.gid;
491
+	}
492
+	memcpy ( &av.gid, gid, sizeof ( av.gid ) );
493
+
494
+	return ib_post_send ( ibdev, ipoib->data.qp, &av, iobuf );
495
+}
496
+
497
+/**
498
+ * Handle IPoIB data send completion
499
+ *
500
+ * @v ibdev		Infiniband device
501
+ * @v qp		Queue pair
502
+ * @v completion	Completion
503
+ * @v iobuf		I/O buffer
504
+ */
505
+static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
506
+				       struct ib_queue_pair *qp,
507
+				       struct ib_completion *completion,
508
+				       struct io_buffer *iobuf ) {
509
+	struct net_device *netdev = qp->owner_priv;
510
+
511
+	netdev_tx_complete_err ( netdev, iobuf,
512
+				 ( completion->syndrome ? -EIO : 0 ) );
513
+}
514
+
515
+/**
516
+ * Handle IPoIB data receive completion
517
+ *
518
+ * @v ibdev		Infiniband device
519
+ * @v qp		Queue pair
520
+ * @v completion	Completion
521
+ * @v iobuf		I/O buffer
522
+ */
523
+static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
524
+				       struct ib_queue_pair *qp,
525
+				       struct ib_completion *completion,
526
+				       struct io_buffer *iobuf ) {
527
+	struct net_device *netdev = qp->owner_priv;
528
+	struct ipoib_device *ipoib = netdev->priv;
529
+	struct ipoib_pseudo_hdr *ipoib_pshdr;
530
+
531
+	if ( completion->syndrome ) {
532
+		netdev_rx_err ( netdev, iobuf, -EIO );
533
+		goto done;
534
+	}
535
+
536
+	iob_put ( iobuf, completion->len );
537
+	if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
538
+		DBGC ( ipoib, "IPoIB %p received data packet too short to "
539
+		       "contain GRH\n", ipoib );
540
+		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
541
+		netdev_rx_err ( netdev, iobuf, -EIO );
542
+		goto done;
543
+	}
544
+	iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
545
+
546
+	if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
547
+		DBGC ( ipoib, "IPoIB %p received data packet too short to "
548
+		       "contain IPoIB header\n", ipoib );
549
+		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
550
+		netdev_rx_err ( netdev, iobuf, -EIO );
551
+		goto done;
552
+	}
553
+
554
+	ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
555
+	/* FIXME: fill in a MAC address for the sake of AoE! */
556
+
557
+	netdev_rx ( netdev, iobuf );
558
+
559
+ done:
560
+	ipoib->data.recv_fill--;
561
+}
562
+
563
+/**
564
+ * Handle IPoIB metadata send completion
565
+ *
566
+ * @v ibdev		Infiniband device
567
+ * @v qp		Queue pair
568
+ * @v completion	Completion
569
+ * @v iobuf		I/O buffer
570
+ */
571
+static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
572
+				       struct ib_queue_pair *qp,
573
+				       struct ib_completion *completion,
574
+				       struct io_buffer *iobuf ) {
575
+	struct net_device *netdev = qp->owner_priv;
576
+	struct ipoib_device *ipoib = netdev->priv;
577
+
578
+	if ( completion->syndrome ) {
579
+		DBGC ( ipoib, "IPoIB %p metadata TX completion error %x\n",
580
+		       ipoib, completion->syndrome );
581
+	}
582
+	free_iob ( iobuf );
583
+}
584
+
585
+/**
586
+ * Handle received IPoIB path record
587
+ *
588
+ * @v ipoib		IPoIB device
589
+ * @v path_record	Path record
590
+ */
591
+static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
592
+				     struct ib_mad_path_record *path_record ) {
593
+	struct ipoib_cached_path *path;
594
+
595
+	/* Update path cache entry */
596
+	path = &ipoib_path_cache[ipoib_path_cache_idx];
597
+	memcpy ( &path->gid, &path_record->dgid, sizeof ( path->gid ) );
598
+	path->dlid = ntohs ( path_record->dlid );
599
+	path->sl = ( path_record->reserved__sl & 0x0f );
600
+	path->rate = ( path_record->rate_selector__rate & 0x3f );
601
+
602
+	DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
603
+	      htonl ( path->gid.u.dwords[0] ), htonl ( path->gid.u.dwords[1] ),
604
+	      htonl ( path->gid.u.dwords[2] ), htonl ( path->gid.u.dwords[3] ),
605
+	      path->dlid, path->sl, path->rate );
606
+	
607
+	/* Update path cache index */
608
+	ipoib_path_cache_idx++;
609
+	if ( ipoib_path_cache_idx == IPOIB_NUM_CACHED_PATHS )
610
+		ipoib_path_cache_idx = 0;
611
+}
612
+
613
+/**
614
+ * Handle received IPoIB multicast membership record
615
+ *
616
+ * @v ipoib		IPoIB device
617
+ * @v mc_member_record	Multicast membership record
618
+ */
619
+static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib,
620
+			  struct ib_mad_mc_member_record *mc_member_record ) {
621
+	/* Record parameters */
622
+	ipoib->broadcast_joined =
623
+		( mc_member_record->scope__join_state & 0x0f );
624
+	ipoib->data_qkey = ntohl ( mc_member_record->qkey );
625
+	ipoib->broadcast_lid = ntohs ( mc_member_record->mlid );
626
+	DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n",
627
+	       ipoib, ( ipoib->broadcast_joined ? "joined" : "left" ),
628
+	       ipoib->data_qkey, ipoib->broadcast_lid );
629
+}
630
+
631
+/**
632
+ * Handle IPoIB metadata receive completion
633
+ *
634
+ * @v ibdev		Infiniband device
635
+ * @v qp		Queue pair
636
+ * @v completion	Completion
637
+ * @v iobuf		I/O buffer
638
+ */
639
+static void ipoib_meta_complete_recv ( struct ib_device *ibdev __unused,
640
+				       struct ib_queue_pair *qp,
641
+				       struct ib_completion *completion,
642
+				       struct io_buffer *iobuf ) {
643
+	struct net_device *netdev = qp->owner_priv;
644
+	struct ipoib_device *ipoib = netdev->priv;
645
+	union ib_mad *mad;
646
+
647
+	if ( completion->syndrome ) {
648
+		DBGC ( ipoib, "IPoIB %p metadata RX completion error %x\n",
649
+		       ipoib, completion->syndrome );
650
+		goto done;
651
+	}
652
+
653
+	iob_put ( iobuf, completion->len );
654
+	if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
655
+		DBGC ( ipoib, "IPoIB %p received metadata packet too short "
656
+		       "to contain GRH\n", ipoib );
657
+		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
658
+		goto done;
659
+	}
660
+	iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
661
+	if ( iob_len ( iobuf ) < sizeof ( *mad ) ) {
662
+		DBGC ( ipoib, "IPoIB %p received metadata packet too short "
663
+		       "to contain reply\n", ipoib );
664
+		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
665
+		goto done;
666
+	}
667
+	mad = iobuf->data;
668
+
669
+	if ( mad->mad_hdr.status != 0 ) {
670
+		DBGC ( ipoib, "IPoIB %p metadata RX err status %04x\n",
671
+		       ipoib, ntohs ( mad->mad_hdr.status ) );
672
+		goto done;
673
+	}
674
+
675
+	switch ( mad->mad_hdr.tid[0] ) {
676
+	case IPOIB_TID_GET_PATH_REC:
677
+		ipoib_recv_path_record ( ipoib, &mad->path_record );
678
+		break;
679
+	case IPOIB_TID_MC_MEMBER_REC:
680
+		ipoib_recv_mc_member_record ( ipoib, &mad->mc_member_record );
681
+		break;
682
+	default:
683
+		DBGC ( ipoib, "IPoIB %p unwanted response:\n",
684
+		       ipoib );
685
+		DBGC_HD ( ipoib, mad, sizeof ( *mad ) );
686
+		break;
687
+	}
688
+
689
+ done:
690
+	ipoib->meta.recv_fill--;
691
+	free_iob ( iobuf );
692
+}
693
+
694
+/**
695
+ * Refill IPoIB receive ring
696
+ *
697
+ * @v ipoib		IPoIB device
698
+ */
699
+static void ipoib_refill_recv ( struct ipoib_device *ipoib,
700
+				struct ipoib_queue_set *qset ) {
701
+	struct ib_device *ibdev = ipoib->ibdev;
702
+	struct io_buffer *iobuf;
703
+	int rc;
704
+
705
+	while ( qset->recv_fill < qset->recv_max_fill ) {
706
+		iobuf = alloc_iob ( IPOIB_MTU );
707
+		if ( ! iobuf )
708
+			break;
709
+		if ( ( rc = ib_post_recv ( ibdev, qset->qp, iobuf ) ) != 0 ) {
710
+			free_iob ( iobuf );
711
+			break;
712
+		}
713
+		qset->recv_fill++;
714
+	}
715
+}
716
+
717
+/**
718
+ * Poll IPoIB network device
719
+ *
720
+ * @v netdev		Network device
721
+ */
722
+static void ipoib_poll ( struct net_device *netdev ) {
723
+	struct ipoib_device *ipoib = netdev->priv;
724
+	struct ib_device *ibdev = ipoib->ibdev;
725
+
726
+	ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
727
+		     ipoib_meta_complete_recv );
728
+	ib_poll_cq ( ibdev, ipoib->data.cq, ipoib_data_complete_send,
729
+		     ipoib_data_complete_recv );
730
+	ipoib_refill_recv ( ipoib, &ipoib->meta );
731
+	ipoib_refill_recv ( ipoib, &ipoib->data );
732
+}
733
+
734
+/**
735
+ * Enable/disable interrupts on IPoIB network device (no-op)
736
+ *
737
+ * @v netdev		Network device (unused)
738
+ * @v enable		Interrupts should be enabled (unused)
739
+ */
740
+static void ipoib_irq ( struct net_device *netdev __unused,
741
+			int enable __unused ) {
742
+	/* Deliberately empty: both arguments are ignored and nothing is changed */
743
+}
744
+
745
+/**
746
+ * Open IPoIB network device
747
+ *
748
+ * @v netdev		Network device
749
+ * @ret rc		Return status code
750
+ */
751
+static int ipoib_open ( struct net_device *netdev ) {
752
+	struct ipoib_device *ipoib = netdev->priv;
753
+	struct ib_device *ibdev = ipoib->ibdev;
754
+	int rc;
755
+
756
+	/* Attach to broadcast multicast GID */
757
+	if ( ( rc = ib_mcast_attach ( ibdev, ipoib->data.qp,
758
+				      &ipoib->broadcast_gid ) ) != 0 ) {
759
+		DBG ( "Could not attach to broadcast GID: %s\n",
760
+		      strerror ( rc ) );
761
+		return rc;
762
+	}
763
+
764
+	/* Fill receive rings */
765
+	ipoib_refill_recv ( ipoib, &ipoib->meta );
766
+	ipoib_refill_recv ( ipoib, &ipoib->data );
767
+
768
+	return 0;
769
+}
770
+
771
+/**
772
+ * Close IPoIB network device
773
+ *
774
+ * @v netdev		Network device
775
+ */
776
+static void ipoib_close ( struct net_device *netdev ) {
777
+	struct ipoib_device *ipoib = netdev->priv;
778
+	struct ib_device *ibdev = ipoib->ibdev;
779
+
780
+	/* Detach from broadcast multicast GID */
781
+	ib_mcast_detach ( ibdev, ipoib->data.qp, &ipoib->broadcast_gid );
782
+
783
+	/* FIXME: should probably flush the receive ring */
784
+}
785
+
786
+/** IPoIB network device operations (installed on the network device by ipoib_probe()) */
787
+static struct net_device_operations ipoib_operations = {
788
+	.open		= ipoib_open,
789
+	.close		= ipoib_close,
790
+	.transmit	= ipoib_transmit,
791
+	.poll		= ipoib_poll,
792
+	.irq		= ipoib_irq,
793
+};
794
+
795
+/**
796
+ * Join IPoIB broadcast group
797
+ *
798
+ * @v ipoib		IPoIB device
799
+ * @ret rc		Return status code
800
+ */
801
+static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
802
+	struct ib_device *ibdev = ipoib->ibdev;
803
+	unsigned int delay_ms;
804
+	int rc;
805
+
806
+	/* Make sure we have some receive descriptors */
807
+	ipoib_refill_recv ( ipoib, &ipoib->meta );
808
+
809
+	/* Send join request */
810
+	if ( ( rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid,
811
+					     1 ) ) != 0 ) {
812
+		DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n",
813
+		       ipoib, strerror ( rc ) );
814
+		return rc;
815
+	}
816
+
817
+	/* Wait for join to complete.  Ideally we wouldn't delay for
818
+	 * this long, but we need the queue key before we can set up
819
+	 * the data queue pair, which we need before we can know the
820
+	 * MAC address.
821
+	 */
822
+	for ( delay_ms = IPOIB_JOIN_MAX_DELAY_MS ; delay_ms ; delay_ms-- ) {
823
+		mdelay ( 1 );
824
+		ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
825
+			     ipoib_meta_complete_recv );
826
+		ipoib_refill_recv ( ipoib, &ipoib->meta );
827
+		if ( ipoib->broadcast_joined )
828
+			return 0;
829
+	}
830
+	DBGC ( ipoib, "IPoIB %p timed out waiting for broadcast join\n",
831
+	       ipoib );
832
+
833
+	return -ETIMEDOUT;
834
+}
835
+
836
+/**
837
+ * Probe IPoIB device
838
+ *
839
+ * @v ibdev		Infiniband device
840
+ * @ret rc		Return status code
841
+ */
842
+int ipoib_probe ( struct ib_device *ibdev ) {
843
+	struct net_device *netdev;
844
+	struct ipoib_device *ipoib;
845
+	struct ipoib_mac *mac;
846
+	int rc;
847
+
848
+	/* Allocate network device */
849
+	netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
850
+	if ( ! netdev )
851
+		return -ENOMEM;
852
+	netdev_init ( netdev, &ipoib_operations );
853
+	ipoib = netdev->priv;
854
+	ib_set_ownerdata ( ibdev, netdev );
855
+	netdev->dev = ibdev->dev;
856
+	memset ( ipoib, 0, sizeof ( *ipoib ) );
857
+	ipoib->netdev = netdev;
858
+	ipoib->ibdev = ibdev;
859
+
860
+	/* Calculate broadcast GID */
861
+	memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
862
+		 sizeof ( ipoib->broadcast_gid ) );
863
+	ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );
864
+
865
+	/* Allocate metadata queue set */
866
+	if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->meta,
867
+					IPOIB_META_NUM_CQES,
868
+					IPOIB_META_NUM_SEND_WQES,
869
+					IPOIB_META_NUM_RECV_WQES,
870
+					IB_GLOBAL_QKEY ) ) != 0 ) {
871
+		DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n",
872
+		       ipoib, strerror ( rc ) );
873
+		goto err_create_meta_qset;
874
+	}
875
+
876
+	/* Join broadcast group */
877
+	if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
878
+		DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
879
+		       ipoib, strerror ( rc ) );
880
+		goto err_join_broadcast_group;
881
+	}
882
+
883
+	/* Allocate data queue set */
884
+	if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->data,
885
+					IPOIB_DATA_NUM_CQES,
886
+					IPOIB_DATA_NUM_SEND_WQES,
887
+					IPOIB_DATA_NUM_RECV_WQES,
888
+					ipoib->data_qkey ) ) != 0 ) {
889
+		DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n",
890
+		       ipoib, strerror ( rc ) );
891
+		goto err_create_data_qset;
892
+	}
893
+
894
+	/* Construct MAC address */
895
+	mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
896
+	mac->qpn = htonl ( ipoib->data.qp->qpn );
897
+	memcpy ( &mac->gid, &ibdev->port_gid, sizeof ( mac->gid ) );
898
+
899
+	/* Register network device */
900
+	if ( ( rc = register_netdev ( netdev ) ) != 0 )
901
+		goto err_register_netdev;
902
+
903
+	return 0;
904
+
905
+ err_register_netdev:
906
+	ipoib_destroy_qset ( ipoib, &ipoib->data );
907
+ err_join_broadcast_group:
908
+ err_create_data_qset:
909
+	ipoib_destroy_qset ( ipoib, &ipoib->meta );
910
+ err_create_meta_qset:
911
+	netdev_nullify ( netdev );
912
+	netdev_put ( netdev );
913
+	return rc;
914
+}
915
+
916
+/**
917
+ * Remove IPoIB device
918
+ *
919
+ * @v ibdev		Infiniband device
920
+ */
921
+void ipoib_remove ( struct ib_device *ibdev ) {
922
+	struct net_device *netdev = ib_get_ownerdata ( ibdev );
923
+	struct ipoib_device *ipoib = netdev->priv;
924
+
925
+	unregister_netdev ( netdev );
926
+	ipoib_destroy_qset ( ipoib, &ipoib->data );
927
+	ipoib_destroy_qset ( ipoib, &ipoib->meta );
928
+	netdev_nullify ( netdev );
929
+	netdev_put ( netdev );
930
+}

+ 578
- 0
src/include/gpxe/infiniband.h View File

@@ -0,0 +1,578 @@
1
+#ifndef _GPXE_INFINIBAND_H
2
+#define _GPXE_INFINIBAND_H
3
+
4
+/** @file
5
+ *
6
+ * Infiniband protocol
7
+ *
8
+ */
9
+
10
+#include <stdint.h>
11
+#include <gpxe/device.h>
12
+
13
+/** Subnet administrator QPN */
14
+#define IB_SA_QPN 1
15
+
16
+/** Broadcast QPN */
17
+#define IB_BROADCAST_QPN 0xffffffUL
18
+
19
+/** Subnet administrator queue key */
20
+#define IB_GLOBAL_QKEY 0x80010000UL
21
+
22
+/** An Infiniband Global Identifier (16 bytes, viewable as bytes, 16-bit words or 32-bit dwords) */
23
+struct ib_gid {
24
+	union {
25
+		uint8_t bytes[16];
26
+		uint16_t words[8];
27
+		uint32_t dwords[4];
28
+	} u;
29
+};
30
+
31
+/** An Infiniband Global Route Header (packed, 40 bytes) */
32
+struct ib_global_route_header {
33
+	/** IP version, traffic class, and flow label
34
+	 *
35
+	 *  4 bits : Version of the GRH
36
+	 *  8 bits : Traffic class
37
+	 * 20 bits : Flow label
38
+	 */
39
+	uint32_t ipver_tclass_flowlabel;
40
+	/** Payload length */
41
+	uint16_t paylen;
42
+	/** Next header */
43
+	uint8_t nxthdr;
44
+	/** Hop limit */
45
+	uint8_t hoplmt;
46
+	/** Source GID */
47
+	struct ib_gid sgid;
48
+	/** Destination GID */
49
+	struct ib_gid dgid;
50
+} __attribute__ (( packed ));
51
+
52
+struct ib_device;
53
+struct ib_queue_pair;
54
+struct ib_completion_queue;
55
+
56
+/** An Infiniband Work Queue (one direction - send or receive - of a queue pair) */
57
+struct ib_work_queue {
58
+	/** Containing queue pair */
59
+	struct ib_queue_pair *qp;
60
+	/** "Is a send queue" flag */
61
+	int is_send;
62
+	/** Associated completion queue */
63
+	struct ib_completion_queue *cq;
64
+	/** List of work queues on this completion queue */
65
+	struct list_head list;
66
+	/** Number of work queue entries */
67
+	unsigned int num_wqes;
68
+	/** Next work queue entry index
69
+	 *
70
+	 * This is the index of the next entry to be filled (i.e. the
71
+	 * first empty entry).  This value is not bounded by num_wqes;
72
+	 * users must bitwise-AND with (num_wqes-1) to generate an
73
+	 * array index (this presumes num_wqes is a power of two).
74
+	 */
75
+	unsigned long next_idx;
76
+	/** I/O buffers assigned to work queue */
77
+	struct io_buffer **iobufs;
78
+	/** Device private data */
79
+	void *dev_priv;
80
+};
81
+
82
+/** An Infiniband Queue Pair (a send work queue plus a receive work queue) */
83
+struct ib_queue_pair {
84
+	/** Queue Pair Number */
85
+	unsigned long qpn;
86
+	/** Queue key */
87
+	unsigned long qkey;
88
+	/** Send queue */
89
+	struct ib_work_queue send;
90
+	/** Receive queue */
91
+	struct ib_work_queue recv;
92
+	/** Device private data */
93
+	void *dev_priv;
94
+	/** Queue owner private data (opaque to the Infiniband core; set by whoever created the queue pair) */
95
+	void *owner_priv;
96
+};
97
+
98
+/** An Infiniband Completion Queue */
99
+struct ib_completion_queue {
100
+	/** Completion queue number */
101
+	unsigned long cqn;
102
+	/** Number of completion queue entries */
103
+	unsigned int num_cqes;
104
+	/** Next completion queue entry index
105
+	 *
106
+	 * This is the index of the next entry to be filled (i.e. the
107
+	 * first empty entry).  This value is not bounded by num_cqes;
108
+	 * users must bitwise-AND with (num_cqes-1) to generate an
109
+	 * array index (this presumes num_cqes is a power of two).
110
+	 */
111
+	unsigned long next_idx;
112
+	/** List of work queues completing to this queue */
113
+	struct list_head work_queues;
114
+	/** Device private data */
115
+	void *dev_priv;
116
+};
117
+
118
+/** An Infiniband completion (passed to completion handlers) */
119
+struct ib_completion {
120
+	/** Syndrome
121
+	 *
122
+	 * If non-zero, then the completion is in error.
123
+	 */
124
+	unsigned int syndrome;
125
+	/** Length, in bytes */
126
+	size_t len;
127
+};
128
+
129
+/** An Infiniband completion handler
130
+ *
131
+ * @v ibdev		Infiniband device
132
+ * @v qp		Queue pair
133
+ * @v completion	Completion
134
+ * @v iobuf		I/O buffer (ownership passes to the handler)
135
+ */
136
+typedef void ( * ib_completer_t ) ( struct ib_device *ibdev,
137
+				    struct ib_queue_pair *qp,
138
+				    struct ib_completion *completion,
139
+				    struct io_buffer *iobuf );
140
+
141
+/** An Infiniband Address Vector (destination addressing for a posted send) */
142
+struct ib_address_vector {
143
+	/** Destination Queue Pair */
144
+	unsigned int dest_qp;
145
+	/** Queue key */
146
+	unsigned long qkey;
147
+	/** Destination Local ID */
148
+	unsigned int dlid;
149
+	/** Rate */
150
+	unsigned int rate;
151
+	/** Service level */
152
+	unsigned int sl;
153
+	/** GID is present (non-zero when the gid field below is filled in) */
154
+	unsigned int gid_present;
155
+	/** GID */
156
+	struct ib_gid gid;
157
+};
158
+
159
+/**
160
+ * Infiniband device operations
161
+ *
162
+ * These represent a subset of the Infiniband Verbs.
163
+ */
164
+struct ib_device_operations {
165
+	/** Create completion queue
166
+	 *
167
+	 * @v ibdev		Infiniband device
168
+	 * @v cq		Completion queue
169
+	 * @ret rc		Return status code
170
+	 */
171
+	int ( * create_cq ) ( struct ib_device *ibdev,
172
+			      struct ib_completion_queue *cq );
173
+	/** Destroy completion queue
174
+	 *
175
+	 * @v ibdev		Infiniband device
176
+	 * @v cq		Completion queue
177
+	 */
178
+	void ( * destroy_cq ) ( struct ib_device *ibdev,
179
+				struct ib_completion_queue *cq );
180
+	/** Create queue pair
181
+	 *
182
+	 * @v ibdev		Infiniband device
183
+	 * @v qp		Queue pair
184
+	 * @ret rc		Return status code
185
+	 */
186
+	int ( * create_qp ) ( struct ib_device *ibdev,
187
+			      struct ib_queue_pair *qp );
188
+	/** Destroy queue pair
189
+	 *
190
+	 * @v ibdev		Infiniband device
191
+	 * @v qp		Queue pair
192
+	 */
193
+	void ( * destroy_qp ) ( struct ib_device *ibdev,
194
+				struct ib_queue_pair *qp );
195
+	/** Post send work queue entry
196
+	 *
197
+	 * @v ibdev		Infiniband device
198
+	 * @v qp		Queue pair
199
+	 * @v av		Address vector
200
+	 * @v iobuf		I/O buffer
201
+	 * @ret rc		Return status code
202
+	 *
203
+	 * If this method returns success, the I/O buffer is owned by
204
+	 * the queue pair until its completion is reported.  If this
205
+	 * method returns failure, the buffer was not enqueued and
206
+	 * ownership remains with the caller, which must dispose of it.
207
+	 */
208
+	int ( * post_send ) ( struct ib_device *ibdev,
209
+			      struct ib_queue_pair *qp,
210
+			      struct ib_address_vector *av,
211
+			      struct io_buffer *iobuf );
212
+	/** Post receive work queue entry
213
+	 *
214
+	 * @v ibdev		Infiniband device
215
+	 * @v qp		Queue pair
216
+	 * @v iobuf		I/O buffer
217
+	 * @ret rc		Return status code
218
+	 *
219
+	 * If this method returns success, the I/O buffer is owned by
220
+	 * the queue pair until its completion is reported.  If this
221
+	 * method returns failure, the buffer was not enqueued and
222
+	 * ownership remains with the caller, which must dispose of it.
223
+	 */
224
+	int ( * post_recv ) ( struct ib_device *ibdev,
225
+			      struct ib_queue_pair *qp,
226
+			      struct io_buffer *iobuf );
227
+	/** Poll completion queue
228
+	 *
229
+	 * @v ibdev		Infiniband device
230
+	 * @v cq		Completion queue
231
+	 * @v complete_send	Send completion handler
232
+	 * @v complete_recv	Receive completion handler
233
+	 *
234
+	 * The completion handler takes ownership of the I/O buffer.
235
+	 */
236
+	void ( * poll_cq ) ( struct ib_device *ibdev,
237
+			     struct ib_completion_queue *cq,
238
+			     ib_completer_t complete_send,
239
+			     ib_completer_t complete_recv );
240
+	/** Attach to multicast group
241
+	 *
242
+	 * @v ibdev		Infiniband device
243
+	 * @v qp		Queue pair
244
+	 * @v gid		Multicast GID
245
+	 * @ret rc		Return status code
246
+	 */
247
+	int ( * mcast_attach ) ( struct ib_device *ibdev,
248
+				 struct ib_queue_pair *qp,
249
+				 struct ib_gid *gid );
250
+	/** Detach from multicast group
251
+	 *
252
+	 * @v ibdev		Infiniband device
253
+	 * @v qp		Queue pair
254
+	 * @v gid		Multicast GID
255
+	 */
256
+	void ( * mcast_detach ) ( struct ib_device *ibdev,
257
+				  struct ib_queue_pair *qp,
258
+				  struct ib_gid *gid );
259
+};
260
+
261
+/** An Infiniband device */
262
+struct ib_device {
263
+	/** Port GID */
264
+	struct ib_gid port_gid;
265
+	/** Subnet manager LID */
266
+	unsigned long sm_lid;
267
+	/** Partition key */
268
+	unsigned int pkey;
269
+	/** Underlying device */
270
+	struct device *dev;
271
+	/** Infiniband operations (invoked through the ib_*() inline wrappers in this header) */
272
+	struct ib_device_operations *op;
273
+	/** Device private data */
274
+	void *dev_priv;
275
+	/** Owner private data */
276
+	void *owner_priv;
277
+};
278
+
279
+extern struct ib_completion_queue * ib_create_cq ( struct ib_device *ibdev,
280
+						   unsigned int num_cqes );
281
+extern void ib_destroy_cq ( struct ib_device *ibdev,
282
+			    struct ib_completion_queue *cq );
283
+extern struct ib_queue_pair *
284
+ib_create_qp ( struct ib_device *ibdev, unsigned int num_send_wqes,
285
+	       struct ib_completion_queue *send_cq, unsigned int num_recv_wqes,
286
+	       struct ib_completion_queue *recv_cq, unsigned long qkey );
287
+extern void ib_destroy_qp ( struct ib_device *ibdev,
288
+			    struct ib_queue_pair *qp );
289
+extern struct ib_work_queue * ib_find_wq ( struct ib_completion_queue *cq,
290
+					   unsigned long qpn, int is_send );
291
+extern struct ib_device * alloc_ibdev ( size_t priv_size );
292
+extern void free_ibdev ( struct ib_device *ibdev );
293
+
294
+/**
295
+ * Post send work queue entry (dispatches to the device's post_send operation)
296
+ *
297
+ * @v ibdev		Infiniband device
298
+ * @v qp		Queue pair
299
+ * @v av		Address vector
300
+ * @v iobuf		I/O buffer
301
+ * @ret rc		Return status code, as returned by the device operation
302
+ */
303
+static inline __attribute__ (( always_inline )) int
304
+ib_post_send ( struct ib_device *ibdev, struct ib_queue_pair *qp,
305
+	       struct ib_address_vector *av, struct io_buffer *iobuf ) {
306
+	return ibdev->op->post_send ( ibdev, qp, av, iobuf );
307
+}
308
+
309
+/**
310
+ * Post receive work queue entry (dispatches to the device's post_recv operation)
311
+ *
312
+ * @v ibdev		Infiniband device
313
+ * @v qp		Queue pair
314
+ * @v iobuf		I/O buffer
315
+ * @ret rc		Return status code, as returned by the device operation
316
+ */
317
+static inline __attribute__ (( always_inline )) int
318
+ib_post_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp,
319
+	       struct io_buffer *iobuf ) {
320
+	return ibdev->op->post_recv ( ibdev, qp, iobuf );
321
+}
322
+
323
+/**
324
+ * Poll completion queue (dispatches to the device's poll_cq operation)
325
+ *
326
+ * @v ibdev		Infiniband device
327
+ * @v cq		Completion queue
328
+ * @v complete_send	Send completion handler (passed through to the device)
329
+ * @v complete_recv	Receive completion handler (passed through to the device)
330
+ */
331
+static inline __attribute__ (( always_inline )) void
332
+ib_poll_cq ( struct ib_device *ibdev, struct ib_completion_queue *cq,
333
+	     ib_completer_t complete_send, ib_completer_t complete_recv ) {
334
+	ibdev->op->poll_cq ( ibdev, cq, complete_send, complete_recv );
335
+}
336
+
337
+
338
+/**
339
+ * Attach to multicast group (dispatches to the device's mcast_attach operation)
340
+ *
341
+ * @v ibdev		Infiniband device
342
+ * @v qp		Queue pair
343
+ * @v gid		Multicast GID
344
+ * @ret rc		Return status code, as returned by the device operation
345
+ */
346
+static inline __attribute__ (( always_inline )) int
347
+ib_mcast_attach ( struct ib_device *ibdev, struct ib_queue_pair *qp,
348
+		  struct ib_gid *gid ) {
349
+	return ibdev->op->mcast_attach ( ibdev, qp, gid );
350
+}
351
+
352
+/**
353
+ * Detach from multicast group (dispatches to the device's mcast_detach operation)
354
+ *
355
+ * @v ibdev		Infiniband device
356
+ * @v qp		Queue pair
357
+ * @v gid		Multicast GID
358
+ */
359
+static inline __attribute__ (( always_inline )) void
360
+ib_mcast_detach ( struct ib_device *ibdev, struct ib_queue_pair *qp,
361
+		  struct ib_gid *gid ) {
362
+	ibdev->op->mcast_detach ( ibdev, qp, gid );
363
+}
364
+
365
+/**
366
+ * Set Infiniband owner-private data
367
+ *
368
+ * @v ibdev		Infiniband device
369
+ * @v owner_priv	Owner private data pointer to store in the device
370
+ */
371
+static inline void ib_set_ownerdata ( struct ib_device *ibdev,
372
+				      void *owner_priv ) {
373
+	ibdev->owner_priv = owner_priv;
374
+}
375
+
376
+/**
377
+ * Get Infiniband owner-private data
378
+ *
379
+ * @v ibdev		Infiniband device
380
+ * @ret owner_priv	Owner private data pointer currently stored in the device
381
+ */
382
+static inline void * ib_get_ownerdata ( struct ib_device *ibdev ) {
383
+	return ibdev->owner_priv;
384
+}
385
+
386
+/*****************************************************************************
387
+ *
388
+ * Management datagrams
389
+ *
390
+ * Portions Copyright (c) 2004 Mellanox Technologies Ltd.  All rights
391
+ * reserved.
392
+ *
393
+ */
394
+
395
+/* Management base version */
396
+#define IB_MGMT_BASE_VERSION			1
397
+
398
+/* Management classes */
399
+#define IB_MGMT_CLASS_SUBN_LID_ROUTED		0x01
400
+#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE	0x81
401
+#define IB_MGMT_CLASS_SUBN_ADM			0x03
402
+#define IB_MGMT_CLASS_PERF_MGMT			0x04
403
+#define IB_MGMT_CLASS_BM			0x05
404
+#define IB_MGMT_CLASS_DEVICE_MGMT		0x06
405
+#define IB_MGMT_CLASS_CM			0x07
406
+#define IB_MGMT_CLASS_SNMP			0x08
407
+#define IB_MGMT_CLASS_VENDOR_RANGE2_START	0x30
408
+#define IB_MGMT_CLASS_VENDOR_RANGE2_END		0x4F
409
+
410
+/* Management methods */
411
+#define IB_MGMT_METHOD_GET			0x01
412
+#define IB_MGMT_METHOD_SET			0x02
413
+#define IB_MGMT_METHOD_GET_RESP			0x81
414
+#define IB_MGMT_METHOD_SEND			0x03
415
+#define IB_MGMT_METHOD_TRAP			0x05
416
+#define IB_MGMT_METHOD_REPORT			0x06
417
+#define IB_MGMT_METHOD_REPORT_RESP		0x86
418
+#define IB_MGMT_METHOD_TRAP_REPRESS		0x07
419
+#define IB_MGMT_METHOD_DELETE			0x15
420
+#define IB_MGMT_METHOD_RESP			0x80
421
+
422
+/* Subnet management attributes */
423
+#define IB_SMP_ATTR_NOTICE			0x0002
424
+#define IB_SMP_ATTR_NODE_DESC			0x0010
425
+#define IB_SMP_ATTR_NODE_INFO			0x0011
426
+#define IB_SMP_ATTR_SWITCH_INFO			0x0012
427
+#define IB_SMP_ATTR_GUID_INFO			0x0014
428
+#define IB_SMP_ATTR_PORT_INFO			0x0015
429
+#define IB_SMP_ATTR_PKEY_TABLE			0x0016
430
+#define IB_SMP_ATTR_SL_TO_VL_TABLE		0x0017
431
+#define IB_SMP_ATTR_VL_ARB_TABLE		0x0018
432
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE	0x0019
433
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE	0x001A
434
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE		0x001B
435
+#define IB_SMP_ATTR_SM_INFO			0x0020
436
+#define IB_SMP_ATTR_VENDOR_DIAG			0x0030
437
+#define IB_SMP_ATTR_LED_INFO			0x0031
438
+#define IB_SMP_ATTR_VENDOR_MASK			0xFF00
439
+
440
+#define IB_SA_ATTR_MC_MEMBER_REC		0x38
441
+#define IB_SA_ATTR_PATH_REC			0x35
442
+
443
+#define IB_SA_MCMEMBER_REC_MGID			(1<<0)
444
+#define IB_SA_MCMEMBER_REC_PORT_GID		(1<<1)
445
+#define IB_SA_MCMEMBER_REC_QKEY			(1<<2)
446
+#define IB_SA_MCMEMBER_REC_MLID			(1<<3)
447
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR		(1<<4)
448
+#define IB_SA_MCMEMBER_REC_MTU			(1<<5)
449
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS	(1<<6)
450
+#define IB_SA_MCMEMBER_REC_PKEY			(1<<7)
451
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR	(1<<8)
452
+#define IB_SA_MCMEMBER_REC_RATE			(1<<9)
453
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR	(1<<10)
454
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME	(1<<11)
455
+#define IB_SA_MCMEMBER_REC_SL			(1<<12)
456
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL		(1<<13)
457
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT		(1<<14)
458
+#define IB_SA_MCMEMBER_REC_SCOPE		(1<<15)
459
+#define IB_SA_MCMEMBER_REC_JOIN_STATE		(1<<16)
460
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN		(1<<17)
461
+
462
+#define IB_SA_PATH_REC_DGID			(1<<2)
463
+#define IB_SA_PATH_REC_SGID			(1<<3)
464
+
465
+struct ib_mad_hdr {	/* Common header placed first in every ib_mad_* layout in this file; packed, 24 bytes */
466
+	uint8_t base_version;
467
+	uint8_t mgmt_class;	/* presumably an IB_MGMT_CLASS_* value - confirm against callers */
468
+	uint8_t class_version;
469
+	uint8_t method;	/* presumably an IB_MGMT_METHOD_* value - confirm against callers */
470
+	uint16_t status;
471
+	uint16_t class_specific;
472
+	uint32_t tid[2];	/* 64-bit quantity held as two 32-bit words */
473
+	uint16_t attr_id;	/* presumably an IB_SMP_ATTR_* or IB_SA_ATTR_* value - confirm */
474
+	uint16_t resv;
475
+	uint32_t attr_mod;
476
+} __attribute__ (( packed ));
477
+
478
+struct ib_sa_hdr {	/* Packed, 20 bytes; placed after the RMPP header in the path and multicast member records */
479
+	uint32_t sm_key[2];
480
+	uint16_t reserved;
481
+	uint16_t attrib_offset;
482
+	uint32_t comp_mask[2];	/* presumably built from IB_SA_*_REC_* bits; word order to be confirmed with callers */
483
+} __attribute__ (( packed ));
484
+
485
+struct ib_rmpp_hdr {	/* Packed, 12 bytes; contents are not decoded here */
486
+	uint32_t raw[3];	/* opaque words; no individual fields are defined */
487
+} __attribute__ (( packed ));
488
+
489
+struct ib_mad_data {	/* Generic view: 24-byte common header plus raw payload, 256 bytes in total */
490
+	struct ib_mad_hdr mad_hdr;
491
+	uint8_t data[232];	/* everything following the common header */
492
+} __attribute__ (( packed ));
493
+
494
+struct ib_mad_guid_info {	/* Packed; the fields declared here cover 72 bytes */
495
+	struct ib_mad_hdr mad_hdr;
496
+	uint32_t mkey[2];
497
+	uint32_t reserved[8];	/* 32 bytes */
498
+	uint8_t gid_local[8];
499
+} __attribute__ (( packed ));
500
+
501
+struct ib_mad_port_info {	/* Packed; the fields declared here cover 100 bytes */
502
+	struct ib_mad_hdr mad_hdr;
503
+	uint32_t mkey[2];
504
+	uint32_t reserved[8];	/* 32 bytes */
505
+	uint32_t mkey2[2];
506
+	uint8_t gid_prefix[8];
507
+	uint16_t lid;
508
+	uint16_t mastersm_lid;
509
+	uint32_t cap_mask;
510
+	uint16_t diag_code;
511
+	uint16_t mkey_lease_period;
512
+	uint8_t local_port_num;
513
+	uint8_t link_width_enabled;
514
+	uint8_t link_width_supported;
515
+	uint8_t link_width_active;
516
+	uint8_t port_state__link_speed_supported;	/* NOTE(review): double-underscore names appear to pack several sub-fields into one byte; bit positions are not defined here */
517
+	uint8_t link_down_def_state__port_phys_state;
518
+	uint8_t lmc__r1__mkey_prot_bits;
519
+	uint8_t link_speed_enabled__link_speed_active;
520
+} __attribute__ (( packed ));
521
+
522
+struct ib_mad_pkey_table {	/* Packed, 128 bytes */
523
+	struct ib_mad_hdr mad_hdr;
524
+	uint32_t mkey[2];
525
+	uint32_t reserved[8];	/* 32 bytes */
526
+	uint16_t pkey[16][2];	/* 32 16-bit entries arranged as 16 pairs (64 bytes) */
527
+} __attribute__ (( packed ));
528
+
529
+struct ib_mad_path_record {	/* Packed; totals 256 bytes if struct ib_gid is 16 bytes - TODO confirm */
530
+	struct ib_mad_hdr mad_hdr;
531
+	struct ib_rmpp_hdr rmpp_hdr;
532
+	struct ib_sa_hdr sa_hdr;
533
+	uint32_t reserved0[2];
534
+	struct ib_gid dgid;
535
+	struct ib_gid sgid;
536
+	uint16_t dlid;
537
+	uint16_t slid;
538
+	uint32_t hop_limit__flow_label__raw_traffic;	/* NOTE(review): name suggests several sub-fields share this word; bit layout is not defined here */
539
+	uint32_t pkey__numb_path__reversible__tclass;
540
+	uint8_t reserved1;
541
+	uint8_t reserved__sl;
542
+	uint8_t mtu_selector__mtu;
543
+	uint8_t rate_selector__rate;
544
+	uint32_t preference__packet_lifetime__packet_lifetime_selector;
545
+	uint32_t reserved2[35];	/* 140 bytes of trailing reserved space */
546
+} __attribute__ (( packed ));
547
+
548
+struct ib_mad_mc_member_record {	/* Packed; totals 256 bytes if struct ib_gid is 16 bytes - TODO confirm */
549
+	struct ib_mad_hdr mad_hdr;
550
+	struct ib_rmpp_hdr rmpp_hdr;
551
+	struct ib_sa_hdr sa_hdr;
552
+	struct ib_gid mgid;
553
+	struct ib_gid port_gid;
554
+	uint32_t qkey;
555
+	uint16_t mlid;
556
+	uint8_t mtu_selector__mtu;	/* NOTE(review): double-underscore names appear to pack several sub-fields; bit layout is not defined here */
557
+	uint8_t tclass;
558
+	uint16_t pkey;
559
+	uint8_t rate_selector__rate;
560
+	uint8_t packet_lifetime_selector__packet_lifetime;
561
+	uint32_t sl__flow_label__hop_limit;
562
+	uint8_t scope__join_state;
563
+	uint8_t proxy_join__reserved;
564
+	uint16_t reserved0;
565
+	uint32_t reserved1[37];	/* 148 bytes of trailing reserved space */
566
+} __attribute__ (( packed ));
567
+
568
+union ib_mad {	/* Alternative views of one MAD buffer; every member begins with struct ib_mad_hdr */
569
+	struct ib_mad_hdr mad_hdr;	/* header-only view */
570
+	struct ib_mad_data data;	/* generic header + raw payload view (256 bytes) */
571
+	struct ib_mad_guid_info guid_info;
572
+	struct ib_mad_port_info port_info;
573
+	struct ib_mad_pkey_table pkey_table;
574
+	struct ib_mad_path_record path_record;
575
+	struct ib_mad_mc_member_record mc_member_record;
576
+} __attribute__ (( packed ));
577
+
578
+#endif /* _GPXE_INFINIBAND_H */

+ 78
- 0
src/include/gpxe/ipoib.h View File

@@ -0,0 +1,78 @@
1
+#ifndef _GPXE_IPOIB_H
2
+#define _GPXE_IPOIB_H
3
+
4
+/** @file
5
+ *
6
+ * IP over Infiniband
7
+ */
8
+
9
+#include <gpxe/infiniband.h>
10
+
11
+/** IPoIB MAC address length */
12
+#define IPOIB_ALEN 20
13
+
14
+/** An IPoIB MAC address (packed: 4-byte QPN then the port GID; expected to total IPOIB_ALEN bytes - TODO confirm struct ib_gid is 16 bytes) */
15
+struct ipoib_mac {
16
+	/** Queue pair number
17
+	 *
18
+	 * MSB must be zero; QPNs are only 24-bit.
19
+	 */
20
+	uint32_t qpn;
21
+	/** Port GID */
22
+	struct ib_gid gid;
23
+} __attribute__ (( packed ));
24
+
25
+/** IPoIB link-layer header length */
26
+#define IPOIB_HLEN 24
27
+
28
+/**
29
+ * IPoIB link-layer header pseudo portion
30
+ *
31
+ * This part doesn't actually exist on the wire, but it provides a
32
+ * convenient way to fit into the typical network device model.
33
+ */
34
+struct ipoib_pseudo_hdr {
35
+	/** Peer address */
36
+	struct ipoib_mac peer;
37
+} __attribute__ (( packed ));
38
+
39
+/** IPoIB link-layer header real portion */
40
+struct ipoib_real_hdr {
41
+	/** Network-layer protocol */
42
+	uint16_t proto;
43
+	/** Reserved, must be zero */
44
+	uint16_t reserved;
45
+} __attribute__ (( packed ));
46
+
47
+/** An IPoIB link-layer header (pseudo portion followed by real portion; IPOIB_HLEN is the intended total length) */
48
+struct ipoib_hdr {
49
+	/** Pseudo portion (peer address; does not exist on the wire) */
50
+	struct ipoib_pseudo_hdr pseudo;
51
+	/** Real portion (protocol plus reserved field, 4 bytes) */
52
+	struct ipoib_real_hdr real;
53
+} __attribute__ (( packed ));
54
+
55
+extern struct ll_protocol ipoib_protocol;
56
+
57
+extern const char * ipoib_ntoa ( const void *ll_addr );
58
+
59
+/**
60
+ * Allocate a network device set up for IPoIB
61
+ *
62
+ * @v priv_size		Size of driver private data
63
+ * @ret netdev		Network device using ipoib_protocol, or NULL
64
+ */
65
+static inline struct net_device * alloc_ipoibdev ( size_t priv_size ) {
66
+	struct net_device *netdev = alloc_netdev ( priv_size );
67
+
68
+	/* Nothing to configure if the allocation failed */
69
+	if ( ! netdev )
70
+		return netdev;
71
+	netdev->ll_protocol = &ipoib_protocol;
72
+	return netdev;
73
+}
74
+
75
+extern int ipoib_probe ( struct ib_device *ibdev );
76
+extern void ipoib_remove ( struct ib_device *ibdev );
77
+
78
+#endif /* _GPXE_IPOIB_H */

+ 2
- 1
src/include/gpxe/tcp.h View File

@@ -275,7 +275,8 @@ struct tcp_options {
275 275
  * actually use 65536, we use a window size of (65536-4) to ensure
276 276
  * that payloads remain dword-aligned.
277 277
  */
278
-#define TCP_MAX_WINDOW_SIZE	( 65536 - 4 )
278
+//#define TCP_MAX_WINDOW_SIZE	( 65536 - 4 )
279
+#define TCP_MAX_WINDOW_SIZE	4096
279 280
 
280 281
 /**
281 282
  * Path MTU

+ 210
- 0
src/net/infiniband.c View File

@@ -0,0 +1,210 @@
1
+/*
2
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
3
+ *
4
+ * This program is free software; you can redistribute it and/or
5
+ * modify it under the terms of the GNU General Public License as
6
+ * published by the Free Software Foundation; either version 2 of the
7
+ * License, or any later version.
8
+ *
9
+ * This program is distributed in the hope that it will be useful, but
10
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
+ * General Public License for more details.
13
+ *
14
+ * You should have received a copy of the GNU General Public License
15
+ * along with this program; if not, write to the Free Software
16
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
+ */
18
+
19
+#include <stdint.h>
20
+#include <stdlib.h>
21
+#include <stdio.h>
22
+#include <string.h>
23
+#include <byteswap.h>
24
+#include <errno.h>
25
+#include <assert.h>
26
+#include <gpxe/list.h>
27
+#include <gpxe/if_arp.h>
28
+#include <gpxe/netdevice.h>
29
+#include <gpxe/iobuf.h>
30
+#include <gpxe/infiniband.h>
31
+
32
+/** @file
33
+ *
34
+ * Infiniband protocol
35
+ *
36
+ */
37
+
38
+/**
39
+ * Allocate a completion queue and have the device initialise it
40
+ *
41
+ * @v ibdev		Infiniband device
42
+ * @v num_cqes		Number of completion queue entries
43
+ * @ret cq		New completion queue, or NULL on failure
44
+ */
45
+struct ib_completion_queue * ib_create_cq ( struct ib_device *ibdev,
46
+					    unsigned int num_cqes ) {
47
+	struct ib_completion_queue *new_cq;
48
+	int rc;
49
+
50
+	DBGC ( ibdev, "IBDEV %p creating completion queue\n", ibdev );
51
+
52
+	/* Obtain the structure; give up if no memory is available */
53
+	new_cq = zalloc ( sizeof ( *new_cq ) );
54
+	if ( ! new_cq )
55
+		return NULL;
56
+	INIT_LIST_HEAD ( &new_cq->work_queues );
57
+	new_cq->num_cqes = num_cqes;
58
+
59
+	/* Hand over to the driver, which is expected to fill in the CQN */
60
+	rc = ibdev->op->create_cq ( ibdev, new_cq );
61
+	if ( rc != 0 ) {
62
+		DBGC ( ibdev, "IBDEV %p could not initialise completion "
63
+		       "queue: %s\n", ibdev, strerror ( rc ) );
64
+		free ( new_cq );
65
+		return NULL;
66
+	}
67
+
68
+	DBGC ( ibdev, "IBDEV %p created %d-entry completion queue %p (%p) "
69
+	       "with CQN %#lx\n", ibdev, num_cqes, new_cq,
70
+	       new_cq->dev_priv, new_cq->cqn );
71
+	return new_cq;
72
+}
71
+
72
+/**
73
+ * Destroy completion queue (device teardown first, then the structure is freed)
74
+ *
75
+ * @v ibdev		Infiniband device
76
+ * @v cq		Completion queue; must have no work queues still attached
77
+ */
78
+void ib_destroy_cq ( struct ib_device *ibdev,
79
+		     struct ib_completion_queue *cq ) {
80
+	DBGC ( ibdev, "IBDEV %p destroying completion queue %#lx\n",
81
+	       ibdev, cq->cqn );
82
+	assert ( list_empty ( &cq->work_queues ) );	/* ib_destroy_qp() unlinks work queues; all users must be gone by now */
83
+	ibdev->op->destroy_cq ( ibdev, cq );
84
+	free ( cq );
85
+}
86
+
87
+/**
88
+ * Create queue pair
89
+ *
90
+ * @v ibdev		Infiniband device
91
+ * @v num_send_wqes	Number of send work queue entries
92
+ * @v send_cq		Send completion queue
93
+ * @v num_recv_wqes	Number of receive work queue entries
94
+ * @v recv_cq		Receive completion queue
95
+ * @v qkey		Queue key
96
+ * @ret qp		Queue pair, or NULL on failure
97
+ *
98
+ * The queue pair structure and both I/O buffer pointer arrays share a
99
+ * single allocation: the send array starts immediately after the
100
+ * structure and the receive array immediately after the send array.
101
+ * Both work queues are linked into their completion queue's list
102
+ * before the device is asked to create the queue pair, and are
103
+ * unlinked again if the device reports failure.
104
+ */
105
+struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev,
106
+				      unsigned int num_send_wqes,
107
+				      struct ib_completion_queue *send_cq,
108
+				      unsigned int num_recv_wqes,
109
+				      struct ib_completion_queue *recv_cq,
110
+				      unsigned long qkey ) {
111
+	struct ib_queue_pair *qp;
112
+	size_t total_size;
113
+	int rc;
114
+
115
+	DBGC ( ibdev, "IBDEV %p creating queue pair\n", ibdev );
116
+
117
+	/* Allocate and initialise data structure */
118
+	total_size = ( sizeof ( *qp ) +
119
+		       ( num_send_wqes * sizeof ( qp->send.iobufs[0] ) ) +
120
+		       ( num_recv_wqes * sizeof ( qp->recv.iobufs[0] ) ) );
121
+	qp = zalloc ( total_size );
122
+	if ( ! qp )
123
+		return NULL;
124
+	qp->qkey = qkey;
125
+	qp->send.qp = qp;
126
+	qp->send.is_send = 1;
127
+	qp->send.cq = send_cq;
128
+	list_add ( &qp->send.list, &send_cq->work_queues );
129
+	qp->send.num_wqes = num_send_wqes;
130
+	qp->send.iobufs = ( ( ( void * ) qp ) + sizeof ( *qp ) );
131
+	/* recv.is_send is never assigned; presumably left zero by zalloc() */
132
+	qp->recv.qp = qp;
133
+	qp->recv.cq = recv_cq;
134
+	list_add ( &qp->recv.list, &recv_cq->work_queues );
135
+	qp->recv.num_wqes = num_recv_wqes;
136
+	qp->recv.iobufs = ( ( ( void * ) qp ) + sizeof ( *qp ) +
137
+			    ( num_send_wqes * sizeof ( qp->send.iobufs[0] ) ));
138
+
139
+	/* Perform device-specific initialisation and get QPN */
140
+	if ( ( rc = ibdev->op->create_qp ( ibdev, qp ) ) != 0 ) {
141
+		DBGC ( ibdev, "IBDEV %p could not initialise queue pair: "
142
+		       "%s\n", ibdev, strerror ( rc ) );
143
+		/* Unlink from both completion queues before freeing;
144
+		 * otherwise their lists would reference freed memory.
145
+		 */
146
+		list_del ( &qp->send.list );
147
+		list_del ( &qp->recv.list );
148
+		free ( qp );
149
+		return NULL;
150
+	}
151
+
152
+	DBGC ( ibdev, "IBDEV %p created queue pair %p (%p) with QPN %#lx\n",
153
+	       ibdev, qp, qp->dev_priv, qp->qpn );
154
+	DBGC ( ibdev, "IBDEV %p QPN %#lx has %d send entries at [%p,%p)\n",
155
+	       ibdev, qp->qpn, num_send_wqes, qp->send.iobufs,
156
+	       qp->recv.iobufs );
157
+	/* Report the receive count here (not the send count) */
158
+	DBGC ( ibdev, "IBDEV %p QPN %#lx has %d receive entries at [%p,%p)\n",
159
+	       ibdev, qp->qpn, num_recv_wqes, qp->recv.iobufs,
160
+	       ( ( ( void * ) qp ) + total_size ) );
161
+	return qp;
162
+}
149
+
150
+/**
151
+ * Destroy queue pair (device teardown, unlink from completion queues, free)
152
+ *
153
+ * @v ibdev		Infiniband device
154
+ * @v qp		Queue pair; its memory is released and must not be used afterwards
155
+ */
156
+void ib_destroy_qp ( struct ib_device *ibdev,
157
+		     struct ib_queue_pair *qp ) {
158
+	DBGC ( ibdev, "IBDEV %p destroying queue pair %#lx\n",
159
+	       ibdev, qp->qpn );
160
+	ibdev->op->destroy_qp ( ibdev, qp );
161
+	list_del ( &qp->send.list );	/* undo the list_add() calls made by ib_create_qp() */
162
+	list_del ( &qp->recv.list );
163
+	free ( qp );	/* one allocation holds the structure and both iobuf arrays */
164
+}
165
+
166
+/**
167
+ * Look up a work queue attached to a completion queue
168
+ *
169
+ * @v cq		Completion queue
170
+ * @v qpn		Queue pair number
171
+ * @v is_send		Find send work queue (rather than receive)
172
+ * @ret wq		Work queue, or NULL if not found
173
+ */
174
+struct ib_work_queue * ib_find_wq ( struct ib_completion_queue *cq,
175
+				    unsigned long qpn, int is_send ) {
176
+	struct ib_work_queue *candidate;
177
+
178
+	list_for_each_entry ( candidate, &cq->work_queues, list ) {
179
+		/* Skip entries for other queue pairs */
180
+		if ( candidate->qp->qpn != qpn )
181
+			continue;
182
+		/* Skip the work queue of the other direction */
183
+		if ( candidate->is_send != is_send )
184
+			continue;
185
+		return candidate;
186
+	}
187
+	return NULL;
188
+}
184
+
185
+/**
186
+ * Allocate an Infiniband device together with its private data area
187
+ *
188
+ * @v priv_size		Size of private data area
189
+ * @ret ibdev		Infiniband device, or NULL
190
+ */
191
+struct ib_device * alloc_ibdev ( size_t priv_size ) {
192
+	struct ib_device *ibdev;
193
+
194
+	/* A single allocation holds the device and its private area */
195
+	ibdev = zalloc ( sizeof ( *ibdev ) + priv_size );
196
+	if ( ! ibdev )
197
+		return NULL;
198
+
199
+	/* The private area starts immediately after the structure */
200
+	ibdev->dev_priv = ( ibdev + 1 );
201
+	return ibdev;
202
+}
202
+
203
+/**
204
+ * Free Infiniband device (releases the single allocation made by alloc_ibdev(), private area included)
205
+ *
206
+ * @v ibdev		Infiniband device
207
+ */
208
+void free_ibdev ( struct ib_device *ibdev ) {
209
+	free ( ibdev );
210
+}

Loading…
Cancel
Save