Selaa lähdekoodia

[tcpip] Add faster algorithm for calculating the TCP/IP checksum

The generic TCP/IP checksum implementation requires approximately 10
CPU clocks per byte (as measured using the TSC).  Improve this to
approximately 0.5 CPU clocks per byte by using "lodsl ; adcl" in an
unrolled loop.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
tags/v1.20.1
Michael Brown 12 vuotta sitten
vanhempi
commit
ec22e08db1
2 muutettua tiedostoa jossa 174 lisäystä ja 0 poistoa
  1. 169
    0
      src/arch/x86/core/x86_tcpip.c
  2. 5
    0
      src/arch/x86/include/bits/tcpip.h

+ 169
- 0
src/arch/x86/core/x86_tcpip.c Näytä tiedosto

@@ -0,0 +1,169 @@
1
+/*
2
+ * Copyright (C) 2012 Michael Brown <mbrown@fensystems.co.uk>.
3
+ *
4
+ * This program is free software; you can redistribute it and/or
5
+ * modify it under the terms of the GNU General Public License as
6
+ * published by the Free Software Foundation; either version 2 of the
7
+ * License, or (at your option) any later version.
8
+ *
9
+ * This program is distributed in the hope that it will be useful, but
10
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
+ * General Public License for more details.
13
+ *
14
+ * You should have received a copy of the GNU General Public License
15
+ * along with this program; if not, write to the Free Software
16
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17
+ * 02110-1301, USA.
18
+ */
19
+
20
+FILE_LICENCE ( GPL2_OR_LATER );
21
+
22
+/** @file
23
+ *
24
+ * TCP/IP checksum
25
+ *
26
+ */
27
+
28
+#include <limits.h>
29
+#include <ipxe/tcpip.h>
30
+
31
+extern char x86_tcpip_loop_end[];
32
+
33
+/**
34
+ * Calculate continued TCP/IP checksum
35
+ *
36
+ * @v partial		Checksum of already-summed data, in network byte order
37
+ * @v data		Data buffer
38
+ * @v len		Length of data buffer
39
+ * @ret cksum		Updated checksum, in network byte order
40
+ */
41
+uint16_t x86_tcpip_continue_chksum ( uint16_t partial,
42
+				     const void *data, size_t len ) {
43
+	unsigned long sum = ( ( ~partial ) & 0xffff );
44
+	unsigned long initial_word_count;
45
+	unsigned long loop_count;
46
+	unsigned long loop_partial_count;
47
+	unsigned long final_word_count;
48
+	unsigned long final_byte;
49
+	unsigned long discard_S;
50
+	unsigned long discard_c;
51
+	unsigned long discard_a;
52
+	unsigned long discard_r1;
53
+	unsigned long discard_r2;
54
+
55
+	/* Calculate number of initial 16-bit words required to bring
56
+	 * the main loop into alignment.  (We don't care about the
57
+	 * speed for data aligned to less than 16 bits, since this
58
+	 * situation won't occur in practice.)
59
+	 */
60
+	if ( len >= sizeof ( sum ) ) {
61
+		initial_word_count = ( ( -( ( intptr_t ) data ) &
62
+					 ( sizeof ( sum ) - 1 ) ) >> 1 );
63
+	} else {
64
+		initial_word_count = 0;
65
+	}
66
+	len -= ( initial_word_count * 2 );
67
+
68
+	/* Calculate number of iterations of the main loop.  This loop
69
+	 * processes native machine words (32-bit or 64-bit), and is
70
+	 * unrolled 16 times.  We calculate an overall iteration
71
+	 * count, and a starting point for the first iteration.
72
+	 */
73
+	loop_count = ( len / ( sizeof ( sum ) * 16 ) );
74
+	loop_partial_count =
75
+		( ( len % ( sizeof ( sum ) * 16 ) ) / sizeof ( sum ) );
76
+
77
+	/* Calculate number of 16-bit words remaining after the main
78
+	 * loop completes.
79
+	 */
80
+	final_word_count = ( ( len % sizeof ( sum ) ) / 2 );
81
+
82
+	/* Calculate whether or not a final byte remains at the end */
83
+	final_byte = ( len & 1 );
84
+
85
+	/* Calculate the checksum */
86
+	__asm__ ( /* Calculate position at which to jump into the
87
+		   * unrolled loop.
88
+		   */
89
+		  "imul $( -x86_tcpip_loop_step_size ), %4\n\t"
90
+		  "add %5, %4\n\t"
91
+
92
+		  /* Clear carry flag before starting checksumming */
93
+		  "clc\n\t"
94
+
95
+		  /* Checksum initial words */
96
+		  "jmp 2f\n\t"
97
+		  "\n1:\n\t"
98
+		  "lodsw\n\t"
99
+		  "adcw %w2, %w0\n\t"
100
+		  "\n2:\n\t"
101
+		  "loop 1b\n\t"
102
+
103
+		  /* Main "lods;adc" loop, unrolled x16 */
104
+		  "mov %12, %3\n\t"
105
+		  "jmp *%4\n\t"
106
+		  "\nx86_tcpip_loop_start:\n\t"
107
+		  "lods%z2\n\tadc %2, %0\n\t"
108
+		  "lods%z2\n\tadc %2, %0\n\t"
109
+		  "lods%z2\n\tadc %2, %0\n\t"
110
+		  "lods%z2\n\tadc %2, %0\n\t"
111
+		  "lods%z2\n\tadc %2, %0\n\t"
112
+		  "lods%z2\n\tadc %2, %0\n\t"
113
+		  "lods%z2\n\tadc %2, %0\n\t"
114
+		  "lods%z2\n\tadc %2, %0\n\t"
115
+		  "lods%z2\n\tadc %2, %0\n\t"
116
+		  "lods%z2\n\tadc %2, %0\n\t"
117
+		  "lods%z2\n\tadc %2, %0\n\t"
118
+		  "lods%z2\n\tadc %2, %0\n\t"
119
+		  "lods%z2\n\tadc %2, %0\n\t"
120
+		  "lods%z2\n\tadc %2, %0\n\t"
121
+		  "lods%z2\n\tadc %2, %0\n\t"
122
+		  "lods%z2\n\tadc %2, %0\n\t"
123
+		  "\nx86_tcpip_loop_end:\n\t"
124
+		  "loop x86_tcpip_loop_start\n\t"
125
+		  ".equ x86_tcpip_loop_step_size, "
126
+		  "  ( ( x86_tcpip_loop_end - x86_tcpip_loop_start ) / 16 )\n\t"
127
+
128
+		  /* Checksum remaining whole words */
129
+		  "mov %13, %3\n\t"
130
+		  "jmp 2f\n\t"
131
+		  "\n1:\n\t"
132
+		  "lodsw\n\t"
133
+		  "adcw %w2, %w0\n\t"
134
+		  "\n2:\n\t"
135
+		  "loop 1b\n\t"
136
+
137
+		  /* Checksum final byte if applicable */
138
+		  "mov %14, %3\n\t"
139
+		  "loop 1f\n\t"
140
+		  "adcb (%1), %b0\n\t"
141
+		  "adcb $0, %h0\n\t"
142
+		  "\n1:\n\t"
143
+
144
+		  /* Fold down to a uint16_t */
145
+		  "push %0\n\t"
146
+		  "popw %w0\n\t"
147
+		  "popw %w2\n\t"
148
+		  "adcw %w2, %w0\n\t"
149
+#if ULONG_MAX > 0xffffffffUL /* 64-bit only */
150
+		  "popw %w2\n\t"
151
+		  "adcw %w2, %w0\n\t"
152
+		  "popw %w2\n\t"
153
+		  "adcw %w2, %w0\n\t"
154
+#endif /* 64-bit only */
155
+
156
+		  /* Consume CF */
157
+		  "adcw $0, %w0\n\t"
158
+		  "adcw $0, %w0\n\t"
159
+
160
+		  : "=&Q" ( sum ), "=&S" ( discard_S ), "=&a" ( discard_a ),
161
+		    "=&c" ( discard_c ), "=&r" ( discard_r1 ),
162
+		    "=&r" ( discard_r2 )
163
+		  : "0" ( sum ), "1" ( data ), "2" ( 0 ),
164
+		    "3" ( initial_word_count + 1 ), "4" ( loop_partial_count ),
165
+		    "5" ( x86_tcpip_loop_end ), "g" ( loop_count + 1 ),
166
+		    "g" ( final_word_count + 1 ), "g" ( final_byte ) );
167
+
168
+	return ( ~sum & 0xffff );
169
+}

+ 5
- 0
src/arch/x86/include/bits/tcpip.h Näytä tiedosto

@@ -9,4 +9,9 @@
9 9
 
10 10
 FILE_LICENCE ( GPL2_OR_LATER );
11 11
 
12
+extern uint16_t x86_tcpip_continue_chksum ( uint16_t partial,
13
+					    const void *data, size_t len );
14
+
15
+#define tcpip_continue_chksum x86_tcpip_continue_chksum
16
+
12 17
 #endif /* _BITS_TCPIP_H */

Loading…
Peruuta
Tallenna