Browse Source

[tcp] Handle out-of-order received packets

Maintain a queue of received packets, so that lost packets need not
result in retransmission of the entire TCP window.

Increase the TCP window to 8kB, in order that we can potentially
transmit enough duplicate ACKs to trigger Fast Retransmission at the
sender.

Using a 10MB HTTP download in qemu-kvm with an artificial drop rate of
1 in 64 packets, this reduces the download time from around 26s to
around 4s.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
tags/v1.20.1
Michael Brown 14 years ago
parent
commit
68613047f0
2 changed files with 180 additions and 35 deletions
  1. 30
    1
      src/include/ipxe/tcp.h
  2. 150
    34
      src/net/tcp.c

+ 30
- 1
src/include/ipxe/tcp.h View File

@@ -287,7 +287,7 @@ struct tcp_options {
287 287
  * that payloads remain dword-aligned.
288 288
  */
289 289
 //#define TCP_MAX_WINDOW_SIZE	( 65536 - 4 )
290
-#define TCP_MAX_WINDOW_SIZE	4096
290
+#define TCP_MAX_WINDOW_SIZE	8192
291 291
 
292 292
 /**
293 293
  * Path MTU
@@ -313,6 +313,35 @@ struct tcp_options {
313 313
  */
314 314
 #define TCP_MSL ( 2 * 60 * TICKS_PER_SEC )
315 315
 
316
+/**
317
+ * Compare TCP sequence numbers
318
+ *
319
+ * @v seq1		Sequence number 1
320
+ * @v seq2		Sequence number 2
321
+ * @ret diff		Sequence difference
322
+ *
323
+ * Analogous to memcmp(), returns an integer less than, equal to, or
324
+ * greater than zero if @c seq1 is found, respectively, to be before,
325
+ * equal to, or after @c seq2.
326
+ */
327
+static inline __attribute__ (( always_inline )) int32_t
328
+tcp_cmp ( uint32_t seq1, uint32_t seq2 ) {
329
+	return ( ( int32_t ) ( seq1 - seq2 ) );
330
+}
331
+
332
+/**
333
+ * Check if TCP sequence number lies within window
334
+ *
335
+ * @v seq		Sequence number
336
+ * @v start		Start of window
337
+ * @v len		Length of window
338
+ * @ret in_window	Sequence number is within window
339
+ */
340
+static inline int tcp_in_window ( uint32_t seq, uint32_t start,
341
+				  uint32_t len ) {
342
+	return ( ( seq - start ) < len );
343
+}
344
+
316 345
 extern struct tcpip_protocol tcp_protocol;
317 346
 
318 347
 #endif /* _IPXE_TCP_H */

+ 150
- 34
src/net/tcp.c View File

@@ -80,7 +80,9 @@ struct tcp_connection {
80 80
 	uint32_t ts_recent;
81 81
 
82 82
 	/** Transmit queue */
83
-	struct list_head queue;
83
+	struct list_head tx_queue;
84
+	/** Receive queue */
85
+	struct list_head rx_queue;
84 86
 	/** Retransmission timer */
85 87
 	struct retry_timer timer;
86 88
 	/** Shutdown (TIME_WAIT) timer */
@@ -97,6 +99,29 @@ enum tcp_flags {
97 99
 	TCP_ACK_PENDING = 0x0004,
98 100
 };
99 101
 
102
+/** TCP internal header
103
+ *
104
+ * This is the header that replaces the TCP header for packets
105
+ * enqueued on the receive queue.
106
+ */
107
+struct tcp_rx_queued_header {
108
+	/** SEQ value, in host-endian order
109
+	 *
110
+	 * This represents the SEQ value at the time the packet is
111
+	 * enqueued, and so excludes the SYN, if present.
112
+	 */
113
+	uint32_t seq;
114
+	/** Flags
115
+	 *
116
+	 * Only FIN is valid within this flags byte; all other flags
117
+	 * have already been processed by the time the packet is
118
+	 * enqueued.
119
+	 */
120
+	uint8_t flags;
121
+	/** Reserved */
122
+	uint8_t reserved[3];
123
+};
124
+
100 125
 /**
101 126
  * List of registered TCP connections
102 127
  */
@@ -246,7 +271,8 @@ static int tcp_open ( struct interface *xfer, struct sockaddr *peer,
246 271
 	tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
247 272
 	tcp_dump_state ( tcp );
248 273
 	tcp->snd_seq = random();
249
-	INIT_LIST_HEAD ( &tcp->queue );
274
+	INIT_LIST_HEAD ( &tcp->tx_queue );
275
+	INIT_LIST_HEAD ( &tcp->rx_queue );
250 276
 	memcpy ( &tcp->peer, st_peer, sizeof ( tcp->peer ) );
251 277
 
252 278
 	/* Bind to local port */
@@ -296,8 +322,14 @@ static void tcp_close ( struct tcp_connection *tcp, int rc ) {
296 322
 		tcp->tcp_state = TCP_CLOSED;
297 323
 		tcp_dump_state ( tcp );
298 324
 
325
+		/* Free any unprocessed I/O buffers */
326
+		list_for_each_entry_safe ( iobuf, tmp, &tcp->rx_queue, list ) {
327
+			list_del ( &iobuf->list );
328
+			free_iob ( iobuf );
329
+		}
330
+
299 331
 		/* Free any unsent I/O buffers */
300
-		list_for_each_entry_safe ( iobuf, tmp, &tcp->queue, list ) {
332
+		list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
301 333
 			list_del ( &iobuf->list );
302 334
 			free_iob ( iobuf );
303 335
 		}
@@ -318,7 +350,7 @@ static void tcp_close ( struct tcp_connection *tcp, int rc ) {
318 350
 		tcp_rx_ack ( tcp, ( tcp->snd_seq + 1 ), 0 );
319 351
 
320 352
 	/* If we have no data remaining to send, start sending FIN */
321
-	if ( list_empty ( &tcp->queue ) ) {
353
+	if ( list_empty ( &tcp->tx_queue ) ) {
322 354
 		tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
323 355
 		tcp_dump_state ( tcp );
324 356
 	}
@@ -366,14 +398,14 @@ static size_t tcp_xmit_win ( struct tcp_connection *tcp ) {
366 398
  * (if provided) and, if @c remove is true, removed from the transmit
367 399
  * queue.
368 400
  */
369
-static size_t tcp_process_queue ( struct tcp_connection *tcp, size_t max_len,
370
-				  struct io_buffer *dest, int remove ) {
401
+static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
402
+				     struct io_buffer *dest, int remove ) {
371 403
 	struct io_buffer *iobuf;
372 404
 	struct io_buffer *tmp;
373 405
 	size_t frag_len;
374 406
 	size_t len = 0;
375 407
 
376
-	list_for_each_entry_safe ( iobuf, tmp, &tcp->queue, list ) {
408
+	list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
377 409
 		frag_len = iob_len ( iobuf );
378 410
 		if ( frag_len > max_len )
379 411
 			frag_len = max_len;
@@ -426,8 +458,8 @@ static int tcp_xmit ( struct tcp_connection *tcp ) {
426 458
 	 * lengths that we wish to transmit.
427 459
 	 */
428 460
 	if ( TCP_CAN_SEND_DATA ( tcp->tcp_state ) ) {
429
-		len = tcp_process_queue ( tcp, tcp_xmit_win ( tcp ),
430
-					  NULL, 0 );
461
+		len = tcp_process_tx_queue ( tcp, tcp_xmit_win ( tcp ),
462
+					     NULL, 0 );
431 463
 	}
432 464
 	seq_len = len;
433 465
 	flags = TCP_FLAGS_SENDING ( tcp->tcp_state );
@@ -461,7 +493,7 @@ static int tcp_xmit ( struct tcp_connection *tcp ) {
461 493
 	iob_reserve ( iobuf, MAX_HDR_LEN );
462 494
 
463 495
 	/* Fill data payload from transmit queue */
464
-	tcp_process_queue ( tcp, len, iobuf, 0 );
496
+	tcp_process_tx_queue ( tcp, len, iobuf, 0 );
465 497
 
466 498
 	/* Expand receive window if possible */
467 499
 	max_rcv_win = ( ( freemem * 3 ) / 4 );
@@ -735,7 +767,7 @@ static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
735 767
 	}
736 768
 
737 769
 	/* Ignore duplicate or out-of-order SYN */
738
-	if ( ( tcp->rcv_ack - seq ) > 0 )
770
+	if ( seq != tcp->rcv_ack )
739 771
 		return 0;
740 772
 
741 773
 	/* Acknowledge SYN */
@@ -806,14 +838,14 @@ static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
806 838
 	tcp->snd_win = win;
807 839
 
808 840
 	/* Remove any acknowledged data from transmit queue */
809
-	tcp_process_queue ( tcp, len, NULL, 1 );
841
+	tcp_process_tx_queue ( tcp, len, NULL, 1 );
810 842
 		
811 843
 	/* Mark SYN/FIN as acknowledged if applicable. */
812 844
 	if ( acked_flags )
813 845
 		tcp->tcp_state |= TCP_STATE_ACKED ( acked_flags );
814 846
 
815 847
 	/* Start sending FIN if we've had all possible data ACKed */
816
-	if ( list_empty ( &tcp->queue ) && ( tcp->flags & TCP_XFER_CLOSED ) )
848
+	if ( list_empty ( &tcp->tx_queue ) && ( tcp->flags & TCP_XFER_CLOSED ) )
817 849
 		tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
818 850
 
819 851
 	return 0;
@@ -868,7 +900,7 @@ static int tcp_rx_data ( struct tcp_connection *tcp, uint32_t seq,
868 900
 static int tcp_rx_fin ( struct tcp_connection *tcp, uint32_t seq ) {
869 901
 
870 902
 	/* Ignore duplicate or out-of-order FIN */
871
-	if ( ( tcp->rcv_ack - seq ) > 0 )
903
+	if ( seq != tcp->rcv_ack )
872 904
 		return 0;
873 905
 
874 906
 	/* Acknowledge FIN */
@@ -898,7 +930,7 @@ static int tcp_rx_rst ( struct tcp_connection *tcp, uint32_t seq ) {
898 930
 	 * ACKed.
899 931
 	 */
900 932
 	if ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) {
901
-		if ( ( seq - tcp->rcv_ack ) >= tcp->rcv_win )
933
+		if ( ! tcp_in_window ( seq, tcp->rcv_ack, tcp->rcv_win ) )
902 934
 			return 0;
903 935
 	} else {
904 936
 		if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
@@ -914,6 +946,95 @@ static int tcp_rx_rst ( struct tcp_connection *tcp, uint32_t seq ) {
914 946
 	return -ECONNRESET;
915 947
 }
916 948
 
949
+/**
950
+ * Enqueue received TCP packet
951
+ *
952
+ * @v tcp		TCP connection
953
+ * @v seq		SEQ value (in host-endian order)
954
+ * @v flags		TCP flags
955
+ * @v iobuf		I/O buffer
956
+ */
957
+static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
958
+			     uint8_t flags, struct io_buffer *iobuf ) {
959
+	struct tcp_rx_queued_header *tcpqhdr;
960
+	struct io_buffer *queued;
961
+	size_t len;
962
+	uint32_t seq_len;
963
+
964
+	/* Calculate remaining flags and sequence length.  Note that
965
+	 * SYN, if present, has already been processed by this point.
966
+	 */
967
+	flags &= TCP_FIN;
968
+	len = iob_len ( iobuf );
969
+	seq_len = ( len + ( flags ? 1 : 0 ) );
970
+
971
+	/* Discard immediately (to save memory) if:
972
+	 *
973
+	 * a) we have not yet received a SYN (and so have no defined
974
+	 *    receive window), or
975
+	 * b) the packet lies entirely outside the receive window, or
976
+	 * c) there is no further content to process.
977
+	 */
978
+	if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
979
+	     ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
980
+	     ( tcp_cmp ( seq + seq_len, tcp->rcv_ack ) < 0 ) ||
981
+	     ( seq_len == 0 ) ) {
982
+		free_iob ( iobuf );
983
+		return;
984
+	}
985
+
986
+	/* Add internal header */
987
+	tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
988
+	tcpqhdr->seq = seq;
989
+	tcpqhdr->flags = flags;
990
+
991
+	/* Add to RX queue */
992
+	list_for_each_entry ( queued, &tcp->rx_queue, list ) {
993
+		tcpqhdr = queued->data;
994
+		if ( tcp_cmp ( seq, tcpqhdr->seq ) < 0 )
995
+			break;
996
+	}
997
+	list_add_tail ( &iobuf->list, &queued->list );
998
+}
999
+
1000
+/**
1001
+ * Process receive queue
1002
+ *
1003
+ * @v tcp		TCP connection
1004
+ */
1005
+static void tcp_process_rx_queue ( struct tcp_connection *tcp ) {
1006
+	struct io_buffer *iobuf;
1007
+	struct io_buffer *tmp;
1008
+	struct tcp_rx_queued_header *tcpqhdr;
1009
+	uint32_t seq;
1010
+	unsigned int flags;
1011
+	size_t len;
1012
+
1013
+	/* Process all applicable received buffers */
1014
+	list_for_each_entry_safe ( iobuf, tmp, &tcp->rx_queue, list ) {
1015
+		tcpqhdr = iobuf->data;
1016
+		if ( tcp_cmp ( tcpqhdr->seq, tcp->rcv_ack ) > 0 )
1017
+			break;
1018
+
1019
+		/* Strip internal header and remove from RX queue */
1020
+		list_del ( &iobuf->list );
1021
+		seq = tcpqhdr->seq;
1022
+		flags = tcpqhdr->flags;
1023
+		iob_pull ( iobuf, sizeof ( *tcpqhdr ) );
1024
+		len = iob_len ( iobuf );
1025
+
1026
+		/* Handle new data, if any */
1027
+		tcp_rx_data ( tcp, seq, iob_disown ( iobuf ) );
1028
+		seq += len;
1029
+
1030
+		/* Handle FIN, if present */
1031
+		if ( flags & TCP_FIN ) {
1032
+			tcp_rx_fin ( tcp, seq );
1033
+			seq++;
1034
+		}
1035
+	}
1036
+}
1037
+
917 1038
 /**
918 1039
  * Process received packet
919 1040
  *
@@ -935,9 +1056,9 @@ static int tcp_rx ( struct io_buffer *iobuf,
935 1056
 	uint32_t seq;
936 1057
 	uint32_t ack;
937 1058
 	uint32_t win;
938
-	uint32_t ts_recent;
939 1059
 	unsigned int flags;
940 1060
 	size_t len;
1061
+	uint32_t seq_len;
941 1062
 	int rc;
942 1063
 
943 1064
 	/* Sanity check packet */
@@ -977,17 +1098,16 @@ static int tcp_rx ( struct io_buffer *iobuf,
977 1098
 	flags = tcphdr->flags;
978 1099
 	tcp_rx_opts ( tcp, ( ( ( void * ) tcphdr ) + sizeof ( *tcphdr ) ),
979 1100
 		      ( hlen - sizeof ( *tcphdr ) ), &options );
980
-	ts_recent = ( options.tsopt ?
981
-		      ntohl ( options.tsopt->tsval ) : tcp->ts_recent );
982 1101
 	iob_pull ( iobuf, hlen );
983 1102
 	len = iob_len ( iobuf );
1103
+	seq_len = ( len + ( ( flags & TCP_SYN ) ? 1 : 0 ) +
1104
+		    ( ( flags & TCP_FIN ) ? 1 : 0 ) );
984 1105
 
985 1106
 	/* Dump header */
986 1107
 	DBGC2 ( tcp, "TCP %p RX %d<-%d           %08x %08x..%08zx %4zd",
987 1108
 		tcp, ntohs ( tcphdr->dest ), ntohs ( tcphdr->src ),
988 1109
 		ntohl ( tcphdr->ack ), ntohl ( tcphdr->seq ),
989
-		( ntohl ( tcphdr->seq ) + len +
990
-		  ( ( tcphdr->flags & ( TCP_SYN | TCP_FIN ) ) ? 1 : 0 )), len);
1110
+		( ntohl ( tcphdr->seq ) + seq_len ), len );
991 1111
 	tcp_dump_flags ( tcp, tcphdr->flags );
992 1112
 	DBGC2 ( tcp, "\n" );
993 1113
 
@@ -998,6 +1118,10 @@ static int tcp_rx ( struct io_buffer *iobuf,
998 1118
 		goto discard;
999 1119
 	}
1000 1120
 
1121
+	/* Update timestamp, if applicable */
1122
+	if ( options.tsopt && tcp_in_window ( tcp->rcv_ack, seq, seq_len ) )
1123
+		tcp->ts_recent = ntohl ( options.tsopt->tsval );
1124
+
1001 1125
 	/* Handle ACK, if present */
1002 1126
 	if ( flags & TCP_ACK ) {
1003 1127
 		if ( ( rc = tcp_rx_ack ( tcp, ack, win ) ) != 0 ) {
@@ -1024,19 +1148,11 @@ static int tcp_rx ( struct io_buffer *iobuf,
1024 1148
 			goto discard;
1025 1149
 	}
1026 1150
 
1027
-	/* Handle new data, if any */
1028
-	tcp_rx_data ( tcp, seq, iob_disown ( iobuf ) );
1029
-	seq += len;
1151
+	/* Enqueue received data */
1152
+	tcp_rx_enqueue ( tcp, seq, flags, iob_disown ( iobuf ) );
1030 1153
 
1031
-	/* Handle FIN, if present */
1032
-	if ( flags & TCP_FIN ) {
1033
-		tcp_rx_fin ( tcp, seq );
1034
-		seq++;
1035
-	}
1036
-
1037
-	/* Update timestamp, if applicable */
1038
-	if ( seq == tcp->rcv_ack )
1039
-		tcp->ts_recent = ts_recent;
1154
+	/* Process receive queue */
1155
+	tcp_process_rx_queue ( tcp );
1040 1156
 
1041 1157
 	/* Dump out any state change as a result of the received packet */
1042 1158
 	tcp_dump_state ( tcp );
@@ -1101,7 +1217,7 @@ static size_t tcp_xfer_window ( struct tcp_connection *tcp ) {
1101 1217
 	 * of only one unACKed packet in the TX queue at any time; we
1102 1218
 	 * do this to conserve memory usage.
1103 1219
 	 */
1104
-	if ( ! list_empty ( &tcp->queue ) )
1220
+	if ( ! list_empty ( &tcp->tx_queue ) )
1105 1221
 		return 0;
1106 1222
 
1107 1223
 	/* Return TCP window length */
@@ -1121,7 +1237,7 @@ static int tcp_xfer_deliver ( struct tcp_connection *tcp,
1121 1237
 			      struct xfer_metadata *meta __unused ) {
1122 1238
 
1123 1239
 	/* Enqueue packet */
1124
-	list_add_tail ( &iobuf->list, &tcp->queue );
1240
+	list_add_tail ( &iobuf->list, &tcp->tx_queue );
1125 1241
 
1126 1242
 	/* Transmit data, if possible */
1127 1243
 	tcp_xmit ( tcp );

Loading…
Cancel
Save