Преглед на файлове

[tcp] Implement support for TCP Selective Acknowledgements (SACK)

The TCP Selective Acknowledgement option (specified in RFC2018)
provides a mechanism for the receiver to indicate packets that have
been received out of order (e.g. due to earlier dropped packets).

iPXE often operates in environments in which there is a high
probability of packet loss.  For example, the legacy USB keyboard
emulation in some BIOSes involves polling the USB bus from within a
system management interrupt: this introduces an invisible delay of
around 500us, which is long enough for around 40 full-length packets to
be dropped.  Similarly, almost all 1Gbps USB2 devices will eventually
end up dropping packets because the USB2 bus does not provide enough
bandwidth to sustain a 1Gbps stream, and most devices will not provide
enough internal buffering to hold a full TCP window's worth of
received packets.

Add support for sending TCP Selective Acknowledgements.  This provides
the sender with more detailed information about which packets have
been lost, and so allows for a more efficient retransmission strategy.

We include a SACK-permitted option in our SYN packet, since
experimentation shows that at least Linux peers will not include a
SACK-permitted option in the SYN-ACK packet if one was not present in
the initial SYN.  (RFC2018 does not seem to mandate this behaviour,
but it is consistent with the approach taken in RFC1323.)  We ignore
any received SACK options; this is safe to do since SACK is only ever
advisory and we never have to send non-trivial amounts of data.

Since our TCP receive queue is a candidate for cache discarding under
low memory conditions, we may end up discarding data that has been
reported as received via a SACK option.  This is permitted by RFC2018.
We follow the stricture that SACK blocks must not report data which is
no longer held by the receiver: previously-reported blocks are
validated against the current receive queue before being included
within the current SACK block list.

Experiments in a qemu VM using forced packet drops (by setting
NETDEV_DISCARD_RATE to 32) show that implementing SACK improves
throughput by around 400%.

Experiments with a USB2 NIC (an SMSC7500) show that implementing SACK
improves throughput by around 700%, increasing the download rate from
35Mbps up to 250Mbps (which is approximately the usable bandwidth
limit for USB2).

Signed-off-by: Michael Brown <mcb30@ipxe.org>
tags/v1.20.1
Michael Brown преди 9 години
родител
ревизия
e0fc8fe781
променени са 2 файла, в които са добавени 202 реда и са изтрити 4 реда
  1. 44
    0
      src/include/ipxe/tcp.h
  2. 158
    4
      src/net/tcp.c

+ 44
- 0
src/include/ipxe/tcp.h Целия файл

@@ -79,6 +79,48 @@ struct tcp_window_scale_padded_option {
79 79
  */
80 80
 #define TCP_RX_WINDOW_SCALE 9
81 81
 
82
+/** TCP selective acknowledgement permitted option */
83
+struct tcp_sack_permitted_option {
84
+	uint8_t kind;
85
+	uint8_t length;
86
+} __attribute__ (( packed ));
87
+
88
+/** Padded TCP selective acknowledgement permitted option (used for sending) */
89
+struct tcp_sack_permitted_padded_option {
90
+	uint8_t nop[2];
91
+	struct tcp_sack_permitted_option spopt;
92
+} __attribute__ (( packed ));
93
+
94
+/** Code for the TCP selective acknowledgement permitted option */
95
+#define TCP_OPTION_SACK_PERMITTED 4
96
+
97
+/** TCP selective acknowledgement option */
98
+struct tcp_sack_option {
99
+	uint8_t kind;
100
+	uint8_t length;
101
+} __attribute__ (( packed ));
102
+
103
+/** TCP selective acknowledgement block */
104
+struct tcp_sack_block {
105
+	uint32_t left;
106
+	uint32_t right;
107
+} __attribute__ (( packed ));
108
+
109
+/** Maximum number of selective acknowledgement blocks
110
+ *
111
+ * This allows for the presence of the TCP timestamp option.
112
+ */
113
+#define TCP_SACK_MAX 3
114
+
115
+/** Padded TCP selective acknowledgement option (used for sending) */
116
+struct tcp_sack_padded_option {
117
+	uint8_t nop[2];
118
+	struct tcp_sack_option sackopt;
119
+} __attribute__ (( packed ));
120
+
121
+/** Code for the TCP selective acknowledgement option */
122
+#define TCP_OPTION_SACK 5
123
+
82 124
 /** TCP timestamp option */
83 125
 struct tcp_timestamp_option {
84 126
 	uint8_t kind;
@@ -102,6 +144,8 @@ struct tcp_options {
102 144
 	const struct tcp_mss_option *mssopt;
103 145
 	/** Window scale option, if present */
104 146
 	const struct tcp_window_scale_option *wsopt;
147
+	/** SACK permitted option, if present */
148
+	const struct tcp_sack_permitted_option *spopt;
105 149
 	/** Timestamp option, if present */
106 150
 	const struct tcp_timestamp_option *tsopt;
107 151
 };

+ 158
- 4
src/net/tcp.c Целия файл

@@ -104,6 +104,9 @@ struct tcp_connection {
104 104
 	/** Maximum receive window */
105 105
 	uint32_t max_rcv_win;
106 106
 
107
+	/** Selective acknowledgement list (in host-endian order) */
108
+	struct tcp_sack_block sack[TCP_SACK_MAX];
109
+
107 110
 	/** Transmit queue */
108 111
 	struct list_head tx_queue;
109 112
 	/** Receive queue */
@@ -129,6 +132,8 @@ enum tcp_flags {
129 132
 	TCP_TS_ENABLED = 0x0002,
130 133
 	/** TCP acknowledgement is pending */
131 134
 	TCP_ACK_PENDING = 0x0004,
135
+	/** TCP selective acknowledgement is enabled */
136
+	TCP_SACK_ENABLED = 0x0008,
132 137
 };
133 138
 
134 139
 /** TCP internal header
@@ -143,6 +148,8 @@ struct tcp_rx_queued_header {
143 148
 	 * enqueued, and so excludes the SYN, if present.
144 149
 	 */
145 150
 	uint32_t seq;
151
+	/** Next SEQ value, in host-endian order */
152
+	uint32_t nxt;
146 153
 	/** Flags
147 154
 	 *
148 155
 	 * Only FIN is valid within this flags byte; all other flags
@@ -449,6 +456,94 @@ static size_t tcp_xfer_window ( struct tcp_connection *tcp ) {
449 456
 	return tcp_xmit_win ( tcp );
450 457
 }
451 458
 
459
+/**
460
+ * Find selective acknowledgement block
461
+ *
462
+ * @v tcp		TCP connection
463
+ * @v seq		SEQ value in SACK block (in host-endian order)
464
+ * @v sack		SACK block to fill in (in host-endian order)
465
+ * @ret len		Length of SACK block
466
+ */
467
+static uint32_t tcp_sack_block ( struct tcp_connection *tcp, uint32_t seq,
468
+				 struct tcp_sack_block *sack ) {
469
+	struct io_buffer *iobuf;
470
+	struct tcp_rx_queued_header *tcpqhdr;
471
+	uint32_t left = tcp->rcv_ack;
472
+	uint32_t right = left;
473
+
474
+	/* Find highest block which does not start after SEQ */
475
+	list_for_each_entry ( iobuf, &tcp->rx_queue, list ) {
476
+		tcpqhdr = iobuf->data;
477
+		if ( tcp_cmp ( tcpqhdr->seq, right ) > 0 ) {
478
+			if ( tcp_cmp ( tcpqhdr->seq, seq ) > 0 )
479
+				break;
480
+			left = tcpqhdr->seq;
481
+		}
482
+		if ( tcp_cmp ( tcpqhdr->nxt, right ) > 0 )
483
+			right = tcpqhdr->nxt;
484
+	}
485
+
486
+	/* Fail if this block does not contain SEQ */
487
+	if ( tcp_cmp ( right, seq ) < 0 )
488
+		return 0;
489
+
490
+	/* Populate SACK block */
491
+	sack->left = left;
492
+	sack->right = right;
493
+	return ( right - left );
494
+}
495
+
496
+/**
497
+ * Update TCP selective acknowledgement list
498
+ *
499
+ * @v tcp		TCP connection
500
+ * @v seq		SEQ value in first SACK block (in host-endian order)
501
+ * @ret count		Number of SACK blocks
502
+ */
503
+static unsigned int tcp_sack ( struct tcp_connection *tcp, uint32_t seq ) {
504
+	struct tcp_sack_block sack[TCP_SACK_MAX];
505
+	unsigned int old = 0;
506
+	unsigned int new = 0;
507
+	unsigned int i;
508
+	uint32_t len;
509
+
510
+	/* Populate first new SACK block */
511
+	len = tcp_sack_block ( tcp, seq, &sack[0] );
512
+	if ( len )
513
+		new++;
514
+
515
+	/* Populate remaining new SACK blocks based on old SACK blocks */
516
+	for ( old = 0 ; old < TCP_SACK_MAX ; old++ ) {
517
+
518
+		/* Stop if we run out of space in the new list */
519
+		if ( new == TCP_SACK_MAX )
520
+			break;
521
+
522
+		/* Skip empty old SACK blocks */
523
+		if ( tcp->sack[old].left == tcp->sack[old].right )
524
+			continue;
525
+
526
+		/* Populate new SACK block */
527
+		len = tcp_sack_block ( tcp, tcp->sack[old].left, &sack[new] );
528
+		if ( len == 0 )
529
+			continue;
530
+
531
+		/* Eliminate duplicates */
532
+		for ( i = 0 ; i < new ; i++ ) {
533
+			if ( sack[i].left == sack[new].left ) {
534
+				new--;
535
+				break;
536
+			}
537
+		}
538
+		new++;
539
+	}
540
+
541
+	/* Update SACK list */
542
+	memset ( tcp->sack, 0, sizeof ( tcp->sack ) );
543
+	memcpy ( tcp->sack, sack, ( new * sizeof ( tcp->sack[0] ) ) );
544
+	return new;
545
+}
546
+
452 547
 /**
453 548
  * Process TCP transmit queue
454 549
  *
@@ -493,9 +588,10 @@ static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
493 588
 }
494 589
 
495 590
 /**
496
- * Transmit any outstanding data
591
+ * Transmit any outstanding data (with selective acknowledgement)
497 592
  *
498 593
  * @v tcp		TCP connection
594
+ * @v sack_seq		SEQ for first selective acknowledgement (if any)
499 595
  * 
500 596
  * Transmits any outstanding data on the connection.
501 597
  *
@@ -503,15 +599,21 @@ static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
503 599
  * will have been started if necessary, and so the stack will
504 600
  * eventually attempt to retransmit the failed packet.
505 601
  */
506
-static void tcp_xmit ( struct tcp_connection *tcp ) {
602
+static void tcp_xmit_sack ( struct tcp_connection *tcp, uint32_t sack_seq ) {
507 603
 	struct io_buffer *iobuf;
508 604
 	struct tcp_header *tcphdr;
509 605
 	struct tcp_mss_option *mssopt;
510 606
 	struct tcp_window_scale_padded_option *wsopt;
511 607
 	struct tcp_timestamp_padded_option *tsopt;
608
+	struct tcp_sack_permitted_padded_option *spopt;
609
+	struct tcp_sack_padded_option *sackopt;
610
+	struct tcp_sack_block *sack;
512 611
 	void *payload;
513 612
 	unsigned int flags;
613
+	unsigned int sack_count;
614
+	unsigned int i;
514 615
 	size_t len = 0;
616
+	size_t sack_len;
515 617
 	uint32_t seq_len;
516 618
 	uint32_t app_win;
517 619
 	uint32_t max_rcv_win;
@@ -590,6 +692,10 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
590 692
 		wsopt->wsopt.kind = TCP_OPTION_WS;
591 693
 		wsopt->wsopt.length = sizeof ( wsopt->wsopt );
592 694
 		wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
695
+		spopt = iob_push ( iobuf, sizeof ( *spopt ) );
696
+		memset ( spopt->nop, TCP_OPTION_NOP, sizeof ( spopt ) );
697
+		spopt->spopt.kind = TCP_OPTION_SACK_PERMITTED;
698
+		spopt->spopt.length = sizeof ( spopt->spopt );
593 699
 	}
594 700
 	if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
595 701
 		tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
@@ -599,6 +705,21 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
599 705
 		tsopt->tsopt.tsval = htonl ( currticks() );
600 706
 		tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
601 707
 	}
708
+	if ( ( tcp->flags & TCP_SACK_ENABLED ) &&
709
+	     ( ! list_empty ( &tcp->rx_queue ) ) &&
710
+	     ( ( sack_count = tcp_sack ( tcp, sack_seq ) ) != 0 ) ) {
711
+		sack_len = ( sack_count * sizeof ( *sack ) );
712
+		sackopt = iob_push ( iobuf, ( sizeof ( *sackopt ) + sack_len ));
713
+		memset ( sackopt->nop, TCP_OPTION_NOP, sizeof ( sackopt->nop ));
714
+		sackopt->sackopt.kind = TCP_OPTION_SACK;
715
+		sackopt->sackopt.length =
716
+			( sizeof ( sackopt->sackopt ) + sack_len );
717
+		sack = ( ( ( void * ) sackopt ) + sizeof ( *sackopt ) );
718
+		for ( i = 0 ; i < sack_count ; i++, sack++ ) {
719
+			sack->left = htonl ( tcp->sack[i].left );
720
+			sack->right = htonl ( tcp->sack[i].right );
721
+		}
722
+	}
602 723
 	if ( len != 0 )
603 724
 		flags |= TCP_PSH;
604 725
 	tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
@@ -635,6 +756,17 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
635 756
 	profile_stop ( &tcp_tx_profiler );
636 757
 }
637 758
 
759
+/**
760
+ * Transmit any outstanding data
761
+ *
762
+ * @v tcp		TCP connection
763
+ */
764
+static void tcp_xmit ( struct tcp_connection *tcp ) {
765
+
766
+	/* Transmit without an explicit first SACK */
767
+	tcp_xmit_sack ( tcp, tcp->rcv_ack );
768
+}
769
+
638 770
 /** TCP process descriptor */
639 771
 static struct process_descriptor tcp_process_desc =
640 772
 	PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
@@ -804,6 +936,12 @@ static void tcp_rx_opts ( struct tcp_connection *tcp, const void *data,
804 936
 		case TCP_OPTION_WS:
805 937
 			options->wsopt = data;
806 938
 			break;
939
+		case TCP_OPTION_SACK_PERMITTED:
940
+			options->spopt = data;
941
+			break;
942
+		case TCP_OPTION_SACK:
943
+			/* Ignore received SACKs */
944
+			break;
807 945
 		case TCP_OPTION_TS:
808 946
 			options->tsopt = data;
809 947
 			break;
@@ -823,6 +961,7 @@ static void tcp_rx_opts ( struct tcp_connection *tcp, const void *data,
823 961
  * @v seq_len		Sequence space length to consume
824 962
  */
825 963
 static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
964
+	unsigned int sack;
826 965
 
827 966
 	/* Sanity check */
828 967
 	assert ( seq_len > 0 );
@@ -840,6 +979,16 @@ static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
840 979
 	/* Update timestamp */
841 980
 	tcp->ts_recent = tcp->ts_val;
842 981
 
982
+	/* Update SACK list */
983
+	for ( sack = 0 ; sack < TCP_SACK_MAX ; sack++ ) {
984
+		if ( tcp->sack[sack].left == tcp->sack[sack].right )
985
+			continue;
986
+		if ( tcp_cmp ( tcp->sack[sack].left, tcp->rcv_ack ) < 0 )
987
+			tcp->sack[sack].left = tcp->rcv_ack;
988
+		if ( tcp_cmp ( tcp->sack[sack].right, tcp->rcv_ack ) < 0 )
989
+			tcp->sack[sack].right = tcp->rcv_ack;
990
+	}
991
+
843 992
 	/* Mark ACK as pending */
844 993
 	tcp->flags |= TCP_ACK_PENDING;
845 994
 }
@@ -860,6 +1009,8 @@ static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
860 1009
 		tcp->rcv_ack = seq;
861 1010
 		if ( options->tsopt )
862 1011
 			tcp->flags |= TCP_TS_ENABLED;
1012
+		if ( options->spopt )
1013
+			tcp->flags |= TCP_SACK_ENABLED;
863 1014
 		if ( options->wsopt ) {
864 1015
 			tcp->snd_win_scale = options->wsopt->scale;
865 1016
 			tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
@@ -1070,6 +1221,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
1070 1221
 	struct io_buffer *queued;
1071 1222
 	size_t len;
1072 1223
 	uint32_t seq_len;
1224
+	uint32_t nxt;
1073 1225
 
1074 1226
 	/* Calculate remaining flags and sequence length.  Note that
1075 1227
 	 * SYN, if present, has already been processed by this point.
@@ -1077,6 +1229,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
1077 1229
 	flags &= TCP_FIN;
1078 1230
 	len = iob_len ( iobuf );
1079 1231
 	seq_len = ( len + ( flags ? 1 : 0 ) );
1232
+	nxt = ( seq + seq_len );
1080 1233
 
1081 1234
 	/* Discard immediately (to save memory) if:
1082 1235
 	 *
@@ -1087,7 +1240,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
1087 1240
 	 */
1088 1241
 	if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
1089 1242
 	     ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
1090
-	     ( tcp_cmp ( seq + seq_len, tcp->rcv_ack ) < 0 ) ||
1243
+	     ( tcp_cmp ( nxt, tcp->rcv_ack ) < 0 ) ||
1091 1244
 	     ( seq_len == 0 ) ) {
1092 1245
 		free_iob ( iobuf );
1093 1246
 		return;
@@ -1096,6 +1249,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
1096 1249
 	/* Add internal header */
1097 1250
 	tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
1098 1251
 	tcpqhdr->seq = seq;
1252
+	tcpqhdr->nxt = nxt;
1099 1253
 	tcpqhdr->flags = flags;
1100 1254
 
1101 1255
 	/* Add to RX queue */
@@ -1289,7 +1443,7 @@ static int tcp_rx ( struct io_buffer *iobuf,
1289 1443
 	if ( list_empty ( &tcp->rx_queue ) ) {
1290 1444
 		process_add ( &tcp->process );
1291 1445
 	} else {
1292
-		tcp_xmit ( tcp );
1446
+		tcp_xmit_sack ( tcp, seq );
1293 1447
 	}
1294 1448
 
1295 1449
 	/* If this packet was the last we expect to receive, set up

Loading…
Отказ
Запис