Browse Source

[tcp] Implement support for TCP Selective Acknowledgements (SACK)

The TCP Selective Acknowledgement option (specified in RFC2018)
provides a mechanism for the receiver to indicate packets that have
been received out of order (e.g. due to earlier dropped packets).

iPXE often operates in environments in which there is a high
probability of packet loss.  For example, the legacy USB keyboard
emulation in some BIOSes involves polling the USB bus from within a
system management interrupt: this introduces an invisible delay of
around 500us which is long enough for around 40 full-length packets to
be dropped.  Similarly, almost all 1Gbps USB2 devices will eventually
end up dropping packets because the USB2 bus does not provide enough
bandwidth to sustain a 1Gbps stream, and most devices will not provide
enough internal buffering to hold a full TCP window's worth of
received packets.

Add support for sending TCP Selective Acknowledgements.  This provides
the sender with more detailed information about which packets have
been lost, and so allows for a more efficient retransmission strategy.

We include a SACK-permitted option in our SYN packet, since
experimentation shows that at least Linux peers will not include a
SACK-permitted option in the SYN-ACK packet if one was not present in
the initial SYN.  (RFC2018 does not seem to mandate this behaviour,
but it is consistent with the approach taken in RFC1323.)  We ignore
any received SACK options; this is safe to do since SACK is only ever
advisory and we never have to send non-trivial amounts of data.

Since our TCP receive queue is a candidate for cache discarding under
low memory conditions, we may end up discarding data that has been
reported as received via a SACK option.  This is permitted by RFC2018.
We follow the stricture that SACK blocks must not report data which is
no longer held by the receiver: previously-reported blocks are
validated against the current receive queue before being included
within the current SACK block list.

Experiments in a qemu VM using forced packet drops (by setting
NETDEV_DISCARD_RATE to 32) show that implementing SACK improves
throughput by around 400%.

Experiments with a USB2 NIC (an SMSC7500) show that implementing SACK
improves throughput by around 700%, increasing the download rate from
35Mbps up to 250Mbps (which is approximately the usable bandwidth
limit for USB2).

Signed-off-by: Michael Brown <mcb30@ipxe.org>
Tag: v1.20.1
Author: Michael Brown, 9 years ago
Commit: e0fc8fe781
2 changed files with 202 additions and 4 deletions:
  1. src/include/ipxe/tcp.h (+44, −0)
  2. src/net/tcp.c (+158, −4)

src/include/ipxe/tcp.h (+44, −0) — View File

79
  */
79
  */
80
 #define TCP_RX_WINDOW_SCALE 9
80
 #define TCP_RX_WINDOW_SCALE 9
81
 
81
 
82
+/** TCP selective acknowledgement permitted option */
83
+struct tcp_sack_permitted_option {
84
+	uint8_t kind;
85
+	uint8_t length;
86
+} __attribute__ (( packed ));
87
+
88
+/** Padded TCP selective acknowledgement permitted option (used for sending) */
89
+struct tcp_sack_permitted_padded_option {
90
+	uint8_t nop[2];
91
+	struct tcp_sack_permitted_option spopt;
92
+} __attribute__ (( packed ));
93
+
94
+/** Code for the TCP selective acknowledgement permitted option */
95
+#define TCP_OPTION_SACK_PERMITTED 4
96
+
97
+/** TCP selective acknowledgement option */
98
+struct tcp_sack_option {
99
+	uint8_t kind;
100
+	uint8_t length;
101
+} __attribute__ (( packed ));
102
+
103
+/** TCP selective acknowledgement block */
104
+struct tcp_sack_block {
105
+	uint32_t left;
106
+	uint32_t right;
107
+} __attribute__ (( packed ));
108
+
109
+/** Maximum number of selective acknowledgement blocks
110
+ *
111
+ * This allows for the presence of the TCP timestamp option.
112
+ */
113
+#define TCP_SACK_MAX 3
114
+
115
+/** Padded TCP selective acknowledgement option (used for sending) */
116
+struct tcp_sack_padded_option {
117
+	uint8_t nop[2];
118
+	struct tcp_sack_option sackopt;
119
+} __attribute__ (( packed ));
120
+
121
+/** Code for the TCP selective acknowledgement option */
122
+#define TCP_OPTION_SACK 5
123
+
82
 /** TCP timestamp option */
124
 /** TCP timestamp option */
83
 struct tcp_timestamp_option {
125
 struct tcp_timestamp_option {
84
 	uint8_t kind;
126
 	uint8_t kind;
102
 	const struct tcp_mss_option *mssopt;
144
 	const struct tcp_mss_option *mssopt;
103
 	/** Window scale option, if present */
145
 	/** Window scale option, if present */
104
 	const struct tcp_window_scale_option *wsopt;
146
 	const struct tcp_window_scale_option *wsopt;
147
+	/** SACK permitted option, if present */
148
+	const struct tcp_sack_permitted_option *spopt;
105
 	/** Timestamp option, if present */
149
 	/** Timestamp option, if present */
106
 	const struct tcp_timestamp_option *tsopt;
150
 	const struct tcp_timestamp_option *tsopt;
107
 };
151
 };

src/net/tcp.c (+158, −4) — View File

104
 	/** Maximum receive window */
104
 	/** Maximum receive window */
105
 	uint32_t max_rcv_win;
105
 	uint32_t max_rcv_win;
106
 
106
 
107
+	/** Selective acknowledgement list (in host-endian order) */
108
+	struct tcp_sack_block sack[TCP_SACK_MAX];
109
+
107
 	/** Transmit queue */
110
 	/** Transmit queue */
108
 	struct list_head tx_queue;
111
 	struct list_head tx_queue;
109
 	/** Receive queue */
112
 	/** Receive queue */
129
 	TCP_TS_ENABLED = 0x0002,
132
 	TCP_TS_ENABLED = 0x0002,
130
 	/** TCP acknowledgement is pending */
133
 	/** TCP acknowledgement is pending */
131
 	TCP_ACK_PENDING = 0x0004,
134
 	TCP_ACK_PENDING = 0x0004,
135
+	/** TCP selective acknowledgement is enabled */
136
+	TCP_SACK_ENABLED = 0x0008,
132
 };
137
 };
133
 
138
 
134
 /** TCP internal header
139
 /** TCP internal header
143
 	 * enqueued, and so excludes the SYN, if present.
148
 	 * enqueued, and so excludes the SYN, if present.
144
 	 */
149
 	 */
145
 	uint32_t seq;
150
 	uint32_t seq;
151
+	/** Next SEQ value, in host-endian order */
152
+	uint32_t nxt;
146
 	/** Flags
153
 	/** Flags
147
 	 *
154
 	 *
148
 	 * Only FIN is valid within this flags byte; all other flags
155
 	 * Only FIN is valid within this flags byte; all other flags
449
 	return tcp_xmit_win ( tcp );
456
 	return tcp_xmit_win ( tcp );
450
 }
457
 }
451
 
458
 
459
+/**
460
+ * Find selective acknowledgement block
461
+ *
462
+ * @v tcp		TCP connection
463
+ * @v seq		SEQ value in SACK block (in host-endian order)
464
+ * @v sack		SACK block to fill in (in host-endian order)
465
+ * @ret len		Length of SACK block
466
+ */
467
+static uint32_t tcp_sack_block ( struct tcp_connection *tcp, uint32_t seq,
468
+				 struct tcp_sack_block *sack ) {
469
+	struct io_buffer *iobuf;
470
+	struct tcp_rx_queued_header *tcpqhdr;
471
+	uint32_t left = tcp->rcv_ack;
472
+	uint32_t right = left;
473
+
474
+	/* Find highest block which does not start after SEQ */
475
+	list_for_each_entry ( iobuf, &tcp->rx_queue, list ) {
476
+		tcpqhdr = iobuf->data;
477
+		if ( tcp_cmp ( tcpqhdr->seq, right ) > 0 ) {
478
+			if ( tcp_cmp ( tcpqhdr->seq, seq ) > 0 )
479
+				break;
480
+			left = tcpqhdr->seq;
481
+		}
482
+		if ( tcp_cmp ( tcpqhdr->nxt, right ) > 0 )
483
+			right = tcpqhdr->nxt;
484
+	}
485
+
486
+	/* Fail if this block does not contain SEQ */
487
+	if ( tcp_cmp ( right, seq ) < 0 )
488
+		return 0;
489
+
490
+	/* Populate SACK block */
491
+	sack->left = left;
492
+	sack->right = right;
493
+	return ( right - left );
494
+}
495
+
496
+/**
497
+ * Update TCP selective acknowledgement list
498
+ *
499
+ * @v tcp		TCP connection
500
+ * @v seq		SEQ value in first SACK block (in host-endian order)
501
+ * @ret count		Number of SACK blocks
502
+ */
503
+static unsigned int tcp_sack ( struct tcp_connection *tcp, uint32_t seq ) {
504
+	struct tcp_sack_block sack[TCP_SACK_MAX];
505
+	unsigned int old = 0;
506
+	unsigned int new = 0;
507
+	unsigned int i;
508
+	uint32_t len;
509
+
510
+	/* Populate first new SACK block */
511
+	len = tcp_sack_block ( tcp, seq, &sack[0] );
512
+	if ( len )
513
+		new++;
514
+
515
+	/* Populate remaining new SACK blocks based on old SACK blocks */
516
+	for ( old = 0 ; old < TCP_SACK_MAX ; old++ ) {
517
+
518
+		/* Stop if we run out of space in the new list */
519
+		if ( new == TCP_SACK_MAX )
520
+			break;
521
+
522
+		/* Skip empty old SACK blocks */
523
+		if ( tcp->sack[old].left == tcp->sack[old].right )
524
+			continue;
525
+
526
+		/* Populate new SACK block */
527
+		len = tcp_sack_block ( tcp, tcp->sack[old].left, &sack[new] );
528
+		if ( len == 0 )
529
+			continue;
530
+
531
+		/* Eliminate duplicates */
532
+		for ( i = 0 ; i < new ; i++ ) {
533
+			if ( sack[i].left == sack[new].left ) {
534
+				new--;
535
+				break;
536
+			}
537
+		}
538
+		new++;
539
+	}
540
+
541
+	/* Update SACK list */
542
+	memset ( tcp->sack, 0, sizeof ( tcp->sack ) );
543
+	memcpy ( tcp->sack, sack, ( new * sizeof ( tcp->sack[0] ) ) );
544
+	return new;
545
+}
546
+
452
 /**
547
 /**
453
  * Process TCP transmit queue
548
  * Process TCP transmit queue
454
  *
549
  *
493
 }
588
 }
494
 
589
 
495
 /**
590
 /**
496
- * Transmit any outstanding data
591
+ * Transmit any outstanding data (with selective acknowledgement)
497
  *
592
  *
498
  * @v tcp		TCP connection
593
  * @v tcp		TCP connection
594
+ * @v sack_seq		SEQ for first selective acknowledgement (if any)
499
  * 
595
  * 
500
  * Transmits any outstanding data on the connection.
596
  * Transmits any outstanding data on the connection.
501
  *
597
  *
503
  * will have been started if necessary, and so the stack will
599
  * will have been started if necessary, and so the stack will
504
  * eventually attempt to retransmit the failed packet.
600
  * eventually attempt to retransmit the failed packet.
505
  */
601
  */
506
-static void tcp_xmit ( struct tcp_connection *tcp ) {
602
+static void tcp_xmit_sack ( struct tcp_connection *tcp, uint32_t sack_seq ) {
507
 	struct io_buffer *iobuf;
603
 	struct io_buffer *iobuf;
508
 	struct tcp_header *tcphdr;
604
 	struct tcp_header *tcphdr;
509
 	struct tcp_mss_option *mssopt;
605
 	struct tcp_mss_option *mssopt;
510
 	struct tcp_window_scale_padded_option *wsopt;
606
 	struct tcp_window_scale_padded_option *wsopt;
511
 	struct tcp_timestamp_padded_option *tsopt;
607
 	struct tcp_timestamp_padded_option *tsopt;
608
+	struct tcp_sack_permitted_padded_option *spopt;
609
+	struct tcp_sack_padded_option *sackopt;
610
+	struct tcp_sack_block *sack;
512
 	void *payload;
611
 	void *payload;
513
 	unsigned int flags;
612
 	unsigned int flags;
613
+	unsigned int sack_count;
614
+	unsigned int i;
514
 	size_t len = 0;
615
 	size_t len = 0;
616
+	size_t sack_len;
515
 	uint32_t seq_len;
617
 	uint32_t seq_len;
516
 	uint32_t app_win;
618
 	uint32_t app_win;
517
 	uint32_t max_rcv_win;
619
 	uint32_t max_rcv_win;
590
 		wsopt->wsopt.kind = TCP_OPTION_WS;
692
 		wsopt->wsopt.kind = TCP_OPTION_WS;
591
 		wsopt->wsopt.length = sizeof ( wsopt->wsopt );
693
 		wsopt->wsopt.length = sizeof ( wsopt->wsopt );
592
 		wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
694
 		wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
695
+		spopt = iob_push ( iobuf, sizeof ( *spopt ) );
696
+		memset ( spopt->nop, TCP_OPTION_NOP, sizeof ( spopt ) );
697
+		spopt->spopt.kind = TCP_OPTION_SACK_PERMITTED;
698
+		spopt->spopt.length = sizeof ( spopt->spopt );
593
 	}
699
 	}
594
 	if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
700
 	if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
595
 		tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
701
 		tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
599
 		tsopt->tsopt.tsval = htonl ( currticks() );
705
 		tsopt->tsopt.tsval = htonl ( currticks() );
600
 		tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
706
 		tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
601
 	}
707
 	}
708
+	if ( ( tcp->flags & TCP_SACK_ENABLED ) &&
709
+	     ( ! list_empty ( &tcp->rx_queue ) ) &&
710
+	     ( ( sack_count = tcp_sack ( tcp, sack_seq ) ) != 0 ) ) {
711
+		sack_len = ( sack_count * sizeof ( *sack ) );
712
+		sackopt = iob_push ( iobuf, ( sizeof ( *sackopt ) + sack_len ));
713
+		memset ( sackopt->nop, TCP_OPTION_NOP, sizeof ( sackopt->nop ));
714
+		sackopt->sackopt.kind = TCP_OPTION_SACK;
715
+		sackopt->sackopt.length =
716
+			( sizeof ( sackopt->sackopt ) + sack_len );
717
+		sack = ( ( ( void * ) sackopt ) + sizeof ( *sackopt ) );
718
+		for ( i = 0 ; i < sack_count ; i++, sack++ ) {
719
+			sack->left = htonl ( tcp->sack[i].left );
720
+			sack->right = htonl ( tcp->sack[i].right );
721
+		}
722
+	}
602
 	if ( len != 0 )
723
 	if ( len != 0 )
603
 		flags |= TCP_PSH;
724
 		flags |= TCP_PSH;
604
 	tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
725
 	tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
635
 	profile_stop ( &tcp_tx_profiler );
756
 	profile_stop ( &tcp_tx_profiler );
636
 }
757
 }
637
 
758
 
759
+/**
760
+ * Transmit any outstanding data
761
+ *
762
+ * @v tcp		TCP connection
763
+ */
764
+static void tcp_xmit ( struct tcp_connection *tcp ) {
765
+
766
+	/* Transmit without an explicit first SACK */
767
+	tcp_xmit_sack ( tcp, tcp->rcv_ack );
768
+}
769
+
638
 /** TCP process descriptor */
770
 /** TCP process descriptor */
639
 static struct process_descriptor tcp_process_desc =
771
 static struct process_descriptor tcp_process_desc =
640
 	PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
772
 	PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
804
 		case TCP_OPTION_WS:
936
 		case TCP_OPTION_WS:
805
 			options->wsopt = data;
937
 			options->wsopt = data;
806
 			break;
938
 			break;
939
+		case TCP_OPTION_SACK_PERMITTED:
940
+			options->spopt = data;
941
+			break;
942
+		case TCP_OPTION_SACK:
943
+			/* Ignore received SACKs */
944
+			break;
807
 		case TCP_OPTION_TS:
945
 		case TCP_OPTION_TS:
808
 			options->tsopt = data;
946
 			options->tsopt = data;
809
 			break;
947
 			break;
823
  * @v seq_len		Sequence space length to consume
961
  * @v seq_len		Sequence space length to consume
824
  */
962
  */
825
 static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
963
 static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
964
+	unsigned int sack;
826
 
965
 
827
 	/* Sanity check */
966
 	/* Sanity check */
828
 	assert ( seq_len > 0 );
967
 	assert ( seq_len > 0 );
840
 	/* Update timestamp */
979
 	/* Update timestamp */
841
 	tcp->ts_recent = tcp->ts_val;
980
 	tcp->ts_recent = tcp->ts_val;
842
 
981
 
982
+	/* Update SACK list */
983
+	for ( sack = 0 ; sack < TCP_SACK_MAX ; sack++ ) {
984
+		if ( tcp->sack[sack].left == tcp->sack[sack].right )
985
+			continue;
986
+		if ( tcp_cmp ( tcp->sack[sack].left, tcp->rcv_ack ) < 0 )
987
+			tcp->sack[sack].left = tcp->rcv_ack;
988
+		if ( tcp_cmp ( tcp->sack[sack].right, tcp->rcv_ack ) < 0 )
989
+			tcp->sack[sack].right = tcp->rcv_ack;
990
+	}
991
+
843
 	/* Mark ACK as pending */
992
 	/* Mark ACK as pending */
844
 	tcp->flags |= TCP_ACK_PENDING;
993
 	tcp->flags |= TCP_ACK_PENDING;
845
 }
994
 }
860
 		tcp->rcv_ack = seq;
1009
 		tcp->rcv_ack = seq;
861
 		if ( options->tsopt )
1010
 		if ( options->tsopt )
862
 			tcp->flags |= TCP_TS_ENABLED;
1011
 			tcp->flags |= TCP_TS_ENABLED;
1012
+		if ( options->spopt )
1013
+			tcp->flags |= TCP_SACK_ENABLED;
863
 		if ( options->wsopt ) {
1014
 		if ( options->wsopt ) {
864
 			tcp->snd_win_scale = options->wsopt->scale;
1015
 			tcp->snd_win_scale = options->wsopt->scale;
865
 			tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
1016
 			tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
1070
 	struct io_buffer *queued;
1221
 	struct io_buffer *queued;
1071
 	size_t len;
1222
 	size_t len;
1072
 	uint32_t seq_len;
1223
 	uint32_t seq_len;
1224
+	uint32_t nxt;
1073
 
1225
 
1074
 	/* Calculate remaining flags and sequence length.  Note that
1226
 	/* Calculate remaining flags and sequence length.  Note that
1075
 	 * SYN, if present, has already been processed by this point.
1227
 	 * SYN, if present, has already been processed by this point.
1077
 	flags &= TCP_FIN;
1229
 	flags &= TCP_FIN;
1078
 	len = iob_len ( iobuf );
1230
 	len = iob_len ( iobuf );
1079
 	seq_len = ( len + ( flags ? 1 : 0 ) );
1231
 	seq_len = ( len + ( flags ? 1 : 0 ) );
1232
+	nxt = ( seq + seq_len );
1080
 
1233
 
1081
 	/* Discard immediately (to save memory) if:
1234
 	/* Discard immediately (to save memory) if:
1082
 	 *
1235
 	 *
1087
 	 */
1240
 	 */
1088
 	if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
1241
 	if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
1089
 	     ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
1242
 	     ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
1090
-	     ( tcp_cmp ( seq + seq_len, tcp->rcv_ack ) < 0 ) ||
1243
+	     ( tcp_cmp ( nxt, tcp->rcv_ack ) < 0 ) ||
1091
 	     ( seq_len == 0 ) ) {
1244
 	     ( seq_len == 0 ) ) {
1092
 		free_iob ( iobuf );
1245
 		free_iob ( iobuf );
1093
 		return;
1246
 		return;
1096
 	/* Add internal header */
1249
 	/* Add internal header */
1097
 	tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
1250
 	tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
1098
 	tcpqhdr->seq = seq;
1251
 	tcpqhdr->seq = seq;
1252
+	tcpqhdr->nxt = nxt;
1099
 	tcpqhdr->flags = flags;
1253
 	tcpqhdr->flags = flags;
1100
 
1254
 
1101
 	/* Add to RX queue */
1255
 	/* Add to RX queue */
1289
 	if ( list_empty ( &tcp->rx_queue ) ) {
1443
 	if ( list_empty ( &tcp->rx_queue ) ) {
1290
 		process_add ( &tcp->process );
1444
 		process_add ( &tcp->process );
1291
 	} else {
1445
 	} else {
1292
-		tcp_xmit ( tcp );
1446
+		tcp_xmit_sack ( tcp, seq );
1293
 	}
1447
 	}
1294
 
1448
 
1295
 	/* If this packet was the last we expect to receive, set up
1449
 	/* If this packet was the last we expect to receive, set up

Loading…
Cancel
Save