Browse Source

[tcp] Handle out-of-order received packets

Maintain a queue of received packets, so that lost packets need not
result in retransmission of the entire TCP window.

Increase the TCP window to 8kB, in order that we can potentially
transmit enough duplicate ACKs to trigger Fast Retransmission at the
sender.

Using a 10MB HTTP download in qemu-kvm with an artificial drop rate of
1 in 64 packets, this reduces the download time from around 26s to
around 4s.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
tags/v1.20.1
Michael Brown 14 years ago
parent
commit
68613047f0
2 changed files with 180 additions and 35 deletions
  1. 30
    1
      src/include/ipxe/tcp.h
  2. 150
    34
      src/net/tcp.c

+ 30
- 1
src/include/ipxe/tcp.h View File

287
  * that payloads remain dword-aligned.
287
  * that payloads remain dword-aligned.
288
  */
288
  */
289
 //#define TCP_MAX_WINDOW_SIZE	( 65536 - 4 )
289
 //#define TCP_MAX_WINDOW_SIZE	( 65536 - 4 )
290
-#define TCP_MAX_WINDOW_SIZE	4096
290
+#define TCP_MAX_WINDOW_SIZE	8192
291
 
291
 
292
 /**
292
 /**
293
  * Path MTU
293
  * Path MTU
313
  */
313
  */
314
 #define TCP_MSL ( 2 * 60 * TICKS_PER_SEC )
314
 #define TCP_MSL ( 2 * 60 * TICKS_PER_SEC )
315
 
315
 
316
+/**
317
+ * Compare TCP sequence numbers
318
+ *
319
+ * @v seq1		Sequence number 1
320
+ * @v seq2		Sequence number 2
321
+ * @ret diff		Sequence difference
322
+ *
323
+ * Analogous to memcmp(), returns an integer less than, equal to, or
324
+ * greater than zero if @c seq1 is found, respectively, to be before,
325
+ * equal to, or after @c seq2.
326
+ */
327
+static inline __attribute__ (( always_inline )) int32_t
328
+tcp_cmp ( uint32_t seq1, uint32_t seq2 ) {
329
+	return ( ( int32_t ) ( seq1 - seq2 ) );
330
+}
331
+
332
+/**
333
+ * Check if TCP sequence number lies within window
334
+ *
335
+ * @v seq		Sequence number
336
+ * @v start		Start of window
337
+ * @v len		Length of window
338
+ * @ret in_window	Sequence number is within window
339
+ */
340
+static inline int tcp_in_window ( uint32_t seq, uint32_t start,
341
+				  uint32_t len ) {
342
+	return ( ( seq - start ) < len );
343
+}
344
+
316
 extern struct tcpip_protocol tcp_protocol;
345
 extern struct tcpip_protocol tcp_protocol;
317
 
346
 
318
 #endif /* _IPXE_TCP_H */
347
 #endif /* _IPXE_TCP_H */

+ 150
- 34
src/net/tcp.c View File

80
 	uint32_t ts_recent;
80
 	uint32_t ts_recent;
81
 
81
 
82
 	/** Transmit queue */
82
 	/** Transmit queue */
83
-	struct list_head queue;
83
+	struct list_head tx_queue;
84
+	/** Receive queue */
85
+	struct list_head rx_queue;
84
 	/** Retransmission timer */
86
 	/** Retransmission timer */
85
 	struct retry_timer timer;
87
 	struct retry_timer timer;
86
 	/** Shutdown (TIME_WAIT) timer */
88
 	/** Shutdown (TIME_WAIT) timer */
97
 	TCP_ACK_PENDING = 0x0004,
99
 	TCP_ACK_PENDING = 0x0004,
98
 };
100
 };
99
 
101
 
102
+/** TCP internal header
103
+ *
104
+ * This is the header that replaces the TCP header for packets
105
+ * enqueued on the receive queue.
106
+ */
107
+struct tcp_rx_queued_header {
108
+	/** SEQ value, in host-endian order
109
+	 *
110
+	 * This represents the SEQ value at the time the packet is
111
+	 * enqueued, and so excludes the SYN, if present.
112
+	 */
113
+	uint32_t seq;
114
+	/** Flags
115
+	 *
116
+	 * Only FIN is valid within this flags byte; all other flags
117
+	 * have already been processed by the time the packet is
118
+	 * enqueued.
119
+	 */
120
+	uint8_t flags;
121
+	/** Reserved */
122
+	uint8_t reserved[3];
123
+};
124
+
100
 /**
125
 /**
101
  * List of registered TCP connections
126
  * List of registered TCP connections
102
  */
127
  */
246
 	tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
271
 	tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
247
 	tcp_dump_state ( tcp );
272
 	tcp_dump_state ( tcp );
248
 	tcp->snd_seq = random();
273
 	tcp->snd_seq = random();
249
-	INIT_LIST_HEAD ( &tcp->queue );
274
+	INIT_LIST_HEAD ( &tcp->tx_queue );
275
+	INIT_LIST_HEAD ( &tcp->rx_queue );
250
 	memcpy ( &tcp->peer, st_peer, sizeof ( tcp->peer ) );
276
 	memcpy ( &tcp->peer, st_peer, sizeof ( tcp->peer ) );
251
 
277
 
252
 	/* Bind to local port */
278
 	/* Bind to local port */
296
 		tcp->tcp_state = TCP_CLOSED;
322
 		tcp->tcp_state = TCP_CLOSED;
297
 		tcp_dump_state ( tcp );
323
 		tcp_dump_state ( tcp );
298
 
324
 
325
+		/* Free any unprocessed I/O buffers */
326
+		list_for_each_entry_safe ( iobuf, tmp, &tcp->rx_queue, list ) {
327
+			list_del ( &iobuf->list );
328
+			free_iob ( iobuf );
329
+		}
330
+
299
 		/* Free any unsent I/O buffers */
331
 		/* Free any unsent I/O buffers */
300
-		list_for_each_entry_safe ( iobuf, tmp, &tcp->queue, list ) {
332
+		list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
301
 			list_del ( &iobuf->list );
333
 			list_del ( &iobuf->list );
302
 			free_iob ( iobuf );
334
 			free_iob ( iobuf );
303
 		}
335
 		}
318
 		tcp_rx_ack ( tcp, ( tcp->snd_seq + 1 ), 0 );
350
 		tcp_rx_ack ( tcp, ( tcp->snd_seq + 1 ), 0 );
319
 
351
 
320
 	/* If we have no data remaining to send, start sending FIN */
352
 	/* If we have no data remaining to send, start sending FIN */
321
-	if ( list_empty ( &tcp->queue ) ) {
353
+	if ( list_empty ( &tcp->tx_queue ) ) {
322
 		tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
354
 		tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
323
 		tcp_dump_state ( tcp );
355
 		tcp_dump_state ( tcp );
324
 	}
356
 	}
366
  * (if provided) and, if @c remove is true, removed from the transmit
398
  * (if provided) and, if @c remove is true, removed from the transmit
367
  * queue.
399
  * queue.
368
  */
400
  */
369
-static size_t tcp_process_queue ( struct tcp_connection *tcp, size_t max_len,
370
-				  struct io_buffer *dest, int remove ) {
401
+static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
402
+				     struct io_buffer *dest, int remove ) {
371
 	struct io_buffer *iobuf;
403
 	struct io_buffer *iobuf;
372
 	struct io_buffer *tmp;
404
 	struct io_buffer *tmp;
373
 	size_t frag_len;
405
 	size_t frag_len;
374
 	size_t len = 0;
406
 	size_t len = 0;
375
 
407
 
376
-	list_for_each_entry_safe ( iobuf, tmp, &tcp->queue, list ) {
408
+	list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
377
 		frag_len = iob_len ( iobuf );
409
 		frag_len = iob_len ( iobuf );
378
 		if ( frag_len > max_len )
410
 		if ( frag_len > max_len )
379
 			frag_len = max_len;
411
 			frag_len = max_len;
426
 	 * lengths that we wish to transmit.
458
 	 * lengths that we wish to transmit.
427
 	 */
459
 	 */
428
 	if ( TCP_CAN_SEND_DATA ( tcp->tcp_state ) ) {
460
 	if ( TCP_CAN_SEND_DATA ( tcp->tcp_state ) ) {
429
-		len = tcp_process_queue ( tcp, tcp_xmit_win ( tcp ),
430
-					  NULL, 0 );
461
+		len = tcp_process_tx_queue ( tcp, tcp_xmit_win ( tcp ),
462
+					     NULL, 0 );
431
 	}
463
 	}
432
 	seq_len = len;
464
 	seq_len = len;
433
 	flags = TCP_FLAGS_SENDING ( tcp->tcp_state );
465
 	flags = TCP_FLAGS_SENDING ( tcp->tcp_state );
461
 	iob_reserve ( iobuf, MAX_HDR_LEN );
493
 	iob_reserve ( iobuf, MAX_HDR_LEN );
462
 
494
 
463
 	/* Fill data payload from transmit queue */
495
 	/* Fill data payload from transmit queue */
464
-	tcp_process_queue ( tcp, len, iobuf, 0 );
496
+	tcp_process_tx_queue ( tcp, len, iobuf, 0 );
465
 
497
 
466
 	/* Expand receive window if possible */
498
 	/* Expand receive window if possible */
467
 	max_rcv_win = ( ( freemem * 3 ) / 4 );
499
 	max_rcv_win = ( ( freemem * 3 ) / 4 );
735
 	}
767
 	}
736
 
768
 
737
 	/* Ignore duplicate SYN */
769
 	/* Ignore duplicate SYN */
738
-	if ( ( tcp->rcv_ack - seq ) > 0 )
770
+	if ( seq != tcp->rcv_ack )
739
 		return 0;
771
 		return 0;
740
 
772
 
741
 	/* Acknowledge SYN */
773
 	/* Acknowledge SYN */
806
 	tcp->snd_win = win;
838
 	tcp->snd_win = win;
807
 
839
 
808
 	/* Remove any acknowledged data from transmit queue */
840
 	/* Remove any acknowledged data from transmit queue */
809
-	tcp_process_queue ( tcp, len, NULL, 1 );
841
+	tcp_process_tx_queue ( tcp, len, NULL, 1 );
810
 		
842
 		
811
 	/* Mark SYN/FIN as acknowledged if applicable. */
843
 	/* Mark SYN/FIN as acknowledged if applicable. */
812
 	if ( acked_flags )
844
 	if ( acked_flags )
813
 		tcp->tcp_state |= TCP_STATE_ACKED ( acked_flags );
845
 		tcp->tcp_state |= TCP_STATE_ACKED ( acked_flags );
814
 
846
 
815
 	/* Start sending FIN if we've had all possible data ACKed */
847
 	/* Start sending FIN if we've had all possible data ACKed */
816
-	if ( list_empty ( &tcp->queue ) && ( tcp->flags & TCP_XFER_CLOSED ) )
848
+	if ( list_empty ( &tcp->tx_queue ) && ( tcp->flags & TCP_XFER_CLOSED ) )
817
 		tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
849
 		tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
818
 
850
 
819
 	return 0;
851
 	return 0;
868
 static int tcp_rx_fin ( struct tcp_connection *tcp, uint32_t seq ) {
900
 static int tcp_rx_fin ( struct tcp_connection *tcp, uint32_t seq ) {
869
 
901
 
870
 	/* Ignore duplicate or out-of-order FIN */
902
 	/* Ignore duplicate or out-of-order FIN */
871
-	if ( ( tcp->rcv_ack - seq ) > 0 )
903
+	if ( seq != tcp->rcv_ack )
872
 		return 0;
904
 		return 0;
873
 
905
 
874
 	/* Acknowledge FIN */
906
 	/* Acknowledge FIN */
898
 	 * ACKed.
930
 	 * ACKed.
899
 	 */
931
 	 */
900
 	if ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) {
932
 	if ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) {
901
-		if ( ( seq - tcp->rcv_ack ) >= tcp->rcv_win )
933
+		if ( ! tcp_in_window ( seq, tcp->rcv_ack, tcp->rcv_win ) )
902
 			return 0;
934
 			return 0;
903
 	} else {
935
 	} else {
904
 		if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
936
 		if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
914
 	return -ECONNRESET;
946
 	return -ECONNRESET;
915
 }
947
 }
916
 
948
 
949
+/**
950
+ * Enqueue received TCP packet
951
+ *
952
+ * @v tcp		TCP connection
953
+ * @v seq		SEQ value (in host-endian order)
954
+ * @v flags		TCP flags
955
+ * @v iobuf		I/O buffer
956
+ */
957
+static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
958
+			     uint8_t flags, struct io_buffer *iobuf ) {
959
+	struct tcp_rx_queued_header *tcpqhdr;
960
+	struct io_buffer *queued;
961
+	size_t len;
962
+	uint32_t seq_len;
963
+
964
+	/* Calculate remaining flags and sequence length.  Note that
965
+	 * SYN, if present, has already been processed by this point.
966
+	 */
967
+	flags &= TCP_FIN;
968
+	len = iob_len ( iobuf );
969
+	seq_len = ( len + ( flags ? 1 : 0 ) );
970
+
971
+	/* Discard immediately (to save memory) if:
972
+	 *
973
+	 * a) we have not yet received a SYN (and so have no defined
974
+	 *    receive window), or
975
+	 * b) the packet lies entirely outside the receive window, or
976
+	 * c) there is no further content to process.
977
+	 */
978
+	if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
979
+	     ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
980
+	     ( tcp_cmp ( seq + seq_len, tcp->rcv_ack ) < 0 ) ||
981
+	     ( seq_len == 0 ) ) {
982
+		free_iob ( iobuf );
983
+		return;
984
+	}
985
+
986
+	/* Add internal header */
987
+	tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
988
+	tcpqhdr->seq = seq;
989
+	tcpqhdr->flags = flags;
990
+
991
+	/* Add to RX queue */
992
+	list_for_each_entry ( queued, &tcp->rx_queue, list ) {
993
+		tcpqhdr = queued->data;
994
+		if ( tcp_cmp ( seq, tcpqhdr->seq ) < 0 )
995
+			break;
996
+	}
997
+	list_add_tail ( &iobuf->list, &queued->list );
998
+}
999
+
1000
+/**
1001
+ * Process receive queue
1002
+ *
1003
+ * @v tcp		TCP connection
1004
+ */
1005
+static void tcp_process_rx_queue ( struct tcp_connection *tcp ) {
1006
+	struct io_buffer *iobuf;
1007
+	struct io_buffer *tmp;
1008
+	struct tcp_rx_queued_header *tcpqhdr;
1009
+	uint32_t seq;
1010
+	unsigned int flags;
1011
+	size_t len;
1012
+
1013
+	/* Process all applicable received buffers */
1014
+	list_for_each_entry_safe ( iobuf, tmp, &tcp->rx_queue, list ) {
1015
+		tcpqhdr = iobuf->data;
1016
+		if ( tcp_cmp ( tcpqhdr->seq, tcp->rcv_ack ) > 0 )
1017
+			break;
1018
+
1019
+		/* Strip internal header and remove from RX queue */
1020
+		list_del ( &iobuf->list );
1021
+		seq = tcpqhdr->seq;
1022
+		flags = tcpqhdr->flags;
1023
+		iob_pull ( iobuf, sizeof ( *tcpqhdr ) );
1024
+		len = iob_len ( iobuf );
1025
+
1026
+		/* Handle new data, if any */
1027
+		tcp_rx_data ( tcp, seq, iob_disown ( iobuf ) );
1028
+		seq += len;
1029
+
1030
+		/* Handle FIN, if present */
1031
+		if ( flags & TCP_FIN ) {
1032
+			tcp_rx_fin ( tcp, seq );
1033
+			seq++;
1034
+		}
1035
+	}
1036
+}
1037
+
917
 /**
1038
 /**
918
  * Process received packet
1039
  * Process received packet
919
  *
1040
  *
935
 	uint32_t seq;
1056
 	uint32_t seq;
936
 	uint32_t ack;
1057
 	uint32_t ack;
937
 	uint32_t win;
1058
 	uint32_t win;
938
-	uint32_t ts_recent;
939
 	unsigned int flags;
1059
 	unsigned int flags;
940
 	size_t len;
1060
 	size_t len;
1061
+	uint32_t seq_len;
941
 	int rc;
1062
 	int rc;
942
 
1063
 
943
 	/* Sanity check packet */
1064
 	/* Sanity check packet */
977
 	flags = tcphdr->flags;
1098
 	flags = tcphdr->flags;
978
 	tcp_rx_opts ( tcp, ( ( ( void * ) tcphdr ) + sizeof ( *tcphdr ) ),
1099
 	tcp_rx_opts ( tcp, ( ( ( void * ) tcphdr ) + sizeof ( *tcphdr ) ),
979
 		      ( hlen - sizeof ( *tcphdr ) ), &options );
1100
 		      ( hlen - sizeof ( *tcphdr ) ), &options );
980
-	ts_recent = ( options.tsopt ?
981
-		      ntohl ( options.tsopt->tsval ) : tcp->ts_recent );
982
 	iob_pull ( iobuf, hlen );
1101
 	iob_pull ( iobuf, hlen );
983
 	len = iob_len ( iobuf );
1102
 	len = iob_len ( iobuf );
1103
+	seq_len = ( len + ( ( flags & TCP_SYN ) ? 1 : 0 ) +
1104
+		    ( ( flags & TCP_FIN ) ? 1 : 0 ) );
984
 
1105
 
985
 	/* Dump header */
1106
 	/* Dump header */
986
 	DBGC2 ( tcp, "TCP %p RX %d<-%d           %08x %08x..%08zx %4zd",
1107
 	DBGC2 ( tcp, "TCP %p RX %d<-%d           %08x %08x..%08zx %4zd",
987
 		tcp, ntohs ( tcphdr->dest ), ntohs ( tcphdr->src ),
1108
 		tcp, ntohs ( tcphdr->dest ), ntohs ( tcphdr->src ),
988
 		ntohl ( tcphdr->ack ), ntohl ( tcphdr->seq ),
1109
 		ntohl ( tcphdr->ack ), ntohl ( tcphdr->seq ),
989
-		( ntohl ( tcphdr->seq ) + len +
990
-		  ( ( tcphdr->flags & ( TCP_SYN | TCP_FIN ) ) ? 1 : 0 )), len);
1110
+		( ntohl ( tcphdr->seq ) + seq_len ), len );
991
 	tcp_dump_flags ( tcp, tcphdr->flags );
1111
 	tcp_dump_flags ( tcp, tcphdr->flags );
992
 	DBGC2 ( tcp, "\n" );
1112
 	DBGC2 ( tcp, "\n" );
993
 
1113
 
998
 		goto discard;
1118
 		goto discard;
999
 	}
1119
 	}
1000
 
1120
 
1121
+	/* Update timestamp, if applicable */
1122
+	if ( options.tsopt && tcp_in_window ( tcp->rcv_ack, seq, seq_len ) )
1123
+		tcp->ts_recent = ntohl ( options.tsopt->tsval );
1124
+
1001
 	/* Handle ACK, if present */
1125
 	/* Handle ACK, if present */
1002
 	if ( flags & TCP_ACK ) {
1126
 	if ( flags & TCP_ACK ) {
1003
 		if ( ( rc = tcp_rx_ack ( tcp, ack, win ) ) != 0 ) {
1127
 		if ( ( rc = tcp_rx_ack ( tcp, ack, win ) ) != 0 ) {
1024
 			goto discard;
1148
 			goto discard;
1025
 	}
1149
 	}
1026
 
1150
 
1027
-	/* Handle new data, if any */
1028
-	tcp_rx_data ( tcp, seq, iob_disown ( iobuf ) );
1029
-	seq += len;
1151
+	/* Enqueue received data */
1152
+	tcp_rx_enqueue ( tcp, seq, flags, iob_disown ( iobuf ) );
1030
 
1153
 
1031
-	/* Handle FIN, if present */
1032
-	if ( flags & TCP_FIN ) {
1033
-		tcp_rx_fin ( tcp, seq );
1034
-		seq++;
1035
-	}
1036
-
1037
-	/* Update timestamp, if applicable */
1038
-	if ( seq == tcp->rcv_ack )
1039
-		tcp->ts_recent = ts_recent;
1154
+	/* Process receive queue */
1155
+	tcp_process_rx_queue ( tcp );
1040
 
1156
 
1041
 	/* Dump out any state change as a result of the received packet */
1157
 	/* Dump out any state change as a result of the received packet */
1042
 	tcp_dump_state ( tcp );
1158
 	tcp_dump_state ( tcp );
1101
 	 * of only one unACKed packet in the TX queue at any time; we
1217
 	 * of only one unACKed packet in the TX queue at any time; we
1102
 	 * do this to conserve memory usage.
1218
 	 * do this to conserve memory usage.
1103
 	 */
1219
 	 */
1104
-	if ( ! list_empty ( &tcp->queue ) )
1220
+	if ( ! list_empty ( &tcp->tx_queue ) )
1105
 		return 0;
1221
 		return 0;
1106
 
1222
 
1107
 	/* Return TCP window length */
1223
 	/* Return TCP window length */
1121
 			      struct xfer_metadata *meta __unused ) {
1237
 			      struct xfer_metadata *meta __unused ) {
1122
 
1238
 
1123
 	/* Enqueue packet */
1239
 	/* Enqueue packet */
1124
-	list_add_tail ( &iobuf->list, &tcp->queue );
1240
+	list_add_tail ( &iobuf->list, &tcp->tx_queue );
1125
 
1241
 
1126
 	/* Transmit data, if possible */
1242
 	/* Transmit data, if possible */
1127
 	tcp_xmit ( tcp );
1243
 	tcp_xmit ( tcp );

Loading…
Cancel
Save