
[arm] Add optimised string functions for 64-bit ARM

Signed-off-by: Michael Brown <mcb30@ipxe.org>
Michael Brown · 8 years ago · tags/v1.20.1
parent commit 95716ece91

src/arch/arm/include/bits/string.h → src/arch/arm32/include/bits/string.h


src/arch/arm64/core/arm64_string.c  (+249 -0)

@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * You can also choose to distribute this program under the terms of
+ * the Unmodified Binary Distribution Licence (as given in the file
+ * COPYING.UBDL), provided that you have satisfied its requirements.
+ */
+
+/** @file
+ *
+ * Optimised string operations
+ *
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <string.h>
+
+/**
+ * Copy memory area
+ *
+ * @v dest		Destination address
+ * @v src		Source address
+ * @v len		Length
+ */
+void arm64_memcpy ( void *dest, const void *src, size_t len ) {
+	void *discard_dest;
+	void *discard_end;
+	const void *discard_src;
+	size_t discard_offset;
+	unsigned long discard_data;
+	unsigned long discard_low;
+	unsigned long discard_high;
+
+	/* If length is too short for an "ldp"/"stp" instruction pair,
+	 * then just copy individual bytes.
+	 */
+	if ( len < 16 ) {
+		__asm__ __volatile__ ( "cbz %0, 2f\n\t"
+				       "\n1:\n\t"
+				       "sub %0, %0, #1\n\t"
+				       "ldrb %w1, [%3, %0]\n\t"
+				       "strb %w1, [%2, %0]\n\t"
+				       "cbnz %0, 1b\n\t"
+				       "\n2:\n\t"
+				       : "=&r" ( discard_offset ),
+					 "=&r" ( discard_data )
+				       : "r" ( dest ), "r" ( src ), "0" ( len )
+				       : "memory" );
+		return;
+	}
+
+	/* Use "ldp"/"stp" to copy 16 bytes at a time: one initial
+	 * potentially unaligned access, multiple destination-aligned
+	 * accesses, one final potentially unaligned access.
+	 */
+	__asm__ __volatile__ ( "ldp %3, %4, [%1], #16\n\t"
+			       "stp %3, %4, [%0], #16\n\t"
+			       "and %3, %0, #15\n\t"
+			       "sub %0, %0, %3\n\t"
+			       "sub %1, %1, %3\n\t"
+			       "bic %2, %5, #15\n\t"
+			       "b 2f\n\t"
+			       "\n1:\n\t"
+			       "ldp %3, %4, [%1], #16\n\t"
+			       "stp %3, %4, [%0], #16\n\t"
+			       "\n2:\n\t"
+			       "cmp %0, %2\n\t"
+			       "bne 1b\n\t"
+			       "ldp %3, %4, [%6, #-16]\n\t"
+			       "stp %3, %4, [%5, #-16]\n\t"
+			       : "=&r" ( discard_dest ),
+				 "=&r" ( discard_src ),
+				 "=&r" ( discard_end ),
+				 "=&r" ( discard_low ),
+				 "=&r" ( discard_high )
+			       : "r" ( dest + len ), "r" ( src + len ),
+				 "0" ( dest ), "1" ( src )
+			       : "memory", "cc" );
+}
+
+/**
+ * Zero memory region
+ *
+ * @v dest		Destination region
+ * @v len		Length
+ */
+void arm64_bzero ( void *dest, size_t len ) {
+	size_t discard_offset;
+	void *discard_dest;
+	void *discard_end;
+
+	/* If length is too short for an "stp" instruction, then just
+	 * zero individual bytes.
+	 */
+	if ( len < 16 ) {
+		__asm__ __volatile__ ( "cbz %0, 2f\n\t"
+				       "\n1:\n\t"
+				       "sub %0, %0, #1\n\t"
+				       "strb wzr, [%1, %0]\n\t"
+				       "cbnz %0, 1b\n\t"
+				       "\n2:\n\t"
+				       : "=&r" ( discard_offset )
+				       : "r" ( dest ), "0" ( len )
+				       : "memory" );
+		return;
+	}
+
+	/* Use "stp" to zero 16 bytes at a time: one initial
+	 * potentially unaligned access, multiple aligned accesses,
+	 * one final potentially unaligned access.
+	 */
+	__asm__ __volatile__ ( "stp xzr, xzr, [%0], #16\n\t"
+			       "bic %0, %0, #15\n\t"
+			       "bic %1, %2, #15\n\t"
+			       "b 2f\n\t"
+			       "\n1:\n\t"
+			       "stp xzr, xzr, [%0], #16\n\t"
+			       "\n2:\n\t"
+			       "cmp %0, %1\n\t"
+			       "bne 1b\n\t"
+			       "stp xzr, xzr, [%2, #-16]\n\t"
+			       : "=&r" ( discard_dest ),
+				 "=&r" ( discard_end )
+			       : "r" ( dest + len ), "0" ( dest )
+			       : "memory", "cc" );
+}
+
+/**
+ * Fill memory region
+ *
+ * @v dest		Destination region
+ * @v len		Length
+ * @v character		Fill character
+ *
+ * The unusual parameter order is to allow for more efficient
+ * tail-calling to arm64_bzero() when zeroing a region.
+ */
+void arm64_memset ( void *dest, size_t len, int character ) {
+	size_t discard_offset;
+
+	/* Use optimised zeroing code if applicable */
+	if ( character == 0 ) {
+		arm64_bzero ( dest, len );
+		return;
+	}
+
+	/* Fill one byte at a time.  Calling memset() with a non-zero
+	 * value is relatively rare and unlikely to be
+	 * performance-critical.
+	 */
+	__asm__ __volatile__ ( "cbz %0, 2f\n\t"
+			       "\n1:\n\t"
+			       "sub %0, %0, #1\n\t"
+			       "strb %w2, [%1, %0]\n\t"
+			       "cbnz %0, 1b\n\t"
+			       "\n2:\n\t"
+			       : "=&r" ( discard_offset )
+			       : "r" ( dest ), "r" ( character ), "0" ( len )
+			       : "memory" );
+}
+
+/**
+ * Copy (possibly overlapping) memory region forwards
+ *
+ * @v dest		Destination region
+ * @v src		Source region
+ * @v len		Length
+ */
+void arm64_memmove_forwards ( void *dest, const void *src, size_t len ) {
+	void *discard_dest;
+	const void *discard_src;
+	unsigned long discard_data;
+
+	/* Assume memmove() is not performance-critical, and perform a
+	 * bytewise copy for simplicity.
+	 */
+	__asm__ __volatile__ ( "b 2f\n\t"
+			       "\n1:\n\t"
+			       "ldrb %w2, [%1], #1\n\t"
+			       "strb %w2, [%0], #1\n\t"
+			       "\n2:\n\t"
+			       "cmp %0, %3\n\t"
+			       "bne 1b\n\t"
+			       : "=&r" ( discard_dest ),
+				 "=&r" ( discard_src ),
+				 "=&r" ( discard_data )
+			       : "r" ( dest + len ), "0" ( dest ), "1" ( src )
+			       : "memory" );
+}
+
+/**
+ * Copy (possibly overlapping) memory region backwards
+ *
+ * @v dest		Destination region
+ * @v src		Source region
+ * @v len		Length
+ */
+void arm64_memmove_backwards ( void *dest, const void *src, size_t len ) {
+	size_t discard_offset;
+	unsigned long discard_data;
+
+	/* Assume memmove() is not performance-critical, and perform a
+	 * bytewise copy for simplicity.
+	 */
+	__asm__ __volatile__ ( "cbz %0, 2f\n\t"
+			       "\n1:\n\t"
+			       "sub %0, %0, #1\n\t"
+			       "ldrb %w1, [%3, %0]\n\t"
+			       "strb %w1, [%2, %0]\n\t"
+			       "cbnz %0, 1b\n\t"
+			       "\n2:\n\t"
+			       : "=&r" ( discard_offset ),
+				 "=&r" ( discard_data )
+			       : "r" ( dest ), "r" ( src ), "0" ( len )
+			       : "memory" );
+}
+
+/**
+ * Copy (possibly overlapping) memory region
+ *
+ * @v dest		Destination region
+ * @v src		Source region
+ * @v len		Length
+ */
+void arm64_memmove ( void *dest, const void *src, size_t len ) {
+
+	if ( dest <= src ) {
+		arm64_memmove_forwards ( dest, src, len );
+	} else {
+		arm64_memmove_backwards ( dest, src, len );
+	}
+}
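
The bulk-copy strategy above is easier to follow in plain C. The sketch below is purely illustrative (the function name is hypothetical, and each memcpy ( ..., 16 ) stands in for a single "ldp"/"stp" pair): one potentially unaligned leading block, a loop of destination-aligned blocks, and a final potentially unaligned block covering the last 16 bytes, which may harmlessly re-copy bytes already written since memcpy() source and destination do not overlap.

#include <stdint.h>
#include <string.h>

/* Plain-C sketch of arm64_memcpy()'s strategy for len >= 16 (the name
 * is hypothetical; each 16-byte memcpy() below stands in for one
 * "ldp"/"stp" pair in the real inline assembly).
 */
static void arm64_memcpy_sketch ( void *dest, const void *src, size_t len ) {
	uint8_t *d = dest;
	const uint8_t *s = src;
	uint8_t *end = ( uint8_t * )
		( ( ( uintptr_t ) ( d + len ) ) & ~( ( uintptr_t ) 15 ) );
	size_t skew;

	/* Leading block: copy the first 16 bytes regardless of alignment */
	memcpy ( d, s, 16 );
	d += 16;
	s += 16;

	/* Realign the destination downwards to a 16-byte boundary,
	 * moving the source back by the same amount (up to 15 bytes
	 * are simply copied a second time).
	 */
	skew = ( ( ( uintptr_t ) d ) & 15 );
	d -= skew;
	s -= skew;

	/* Destination-aligned middle blocks */
	while ( d != end ) {
		memcpy ( d, s, 16 );
		d += 16;
		s += 16;
	}

	/* Trailing block: the last 16 bytes, possibly unaligned and
	 * possibly overlapping the block just written.
	 */
	memcpy ( ( ( uint8_t * ) dest ) + len - 16,
		 ( ( const uint8_t * ) src ) + len - 16, 16 );
}

The same shape appears in arm64_bzero(), with "stp xzr, xzr" taking the place of the load/store pair. Note also why arm64_memset() takes its arguments as ( dest, len, character ): when the fill byte turns out to be zero, dest and len are already in the first two argument registers, so the call to arm64_bzero() can compile to a plain tail branch.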

src/arch/arm64/include/bits/string.h  (+106 -0)

@@ -0,0 +1,106 @@
+#ifndef BITS_STRING_H
+#define BITS_STRING_H
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+/** @file
+ *
+ * String functions
+ *
+ */
+
+extern void arm64_bzero ( void *dest, size_t len );
+extern void arm64_memset ( void *dest, size_t len, int character );
+extern void arm64_memcpy ( void *dest, const void *src, size_t len );
+extern void arm64_memmove_forwards ( void *dest, const void *src, size_t len );
+extern void arm64_memmove_backwards ( void *dest, const void *src, size_t len );
+extern void arm64_memmove ( void *dest, const void *src, size_t len );
+
+/**
+ * Fill memory region
+ *
+ * @v dest		Destination region
+ * @v character		Fill character
+ * @v len		Length
+ * @ret dest		Destination region
+ */
+static inline __attribute__ (( always_inline )) void *
+memset ( void *dest, int character, size_t len ) {
+
+	/* Allow gcc to generate inline "stX xzr" instructions for
+	 * small, constant lengths.
+	 */
+	if ( __builtin_constant_p ( character ) && ( character == 0 ) &&
+	     __builtin_constant_p ( len ) && ( len <= 64 ) ) {
+		__builtin_memset ( dest, 0, len );
+		return dest;
+	}
+
+	/* For zeroing larger or non-constant lengths, use the
+	 * optimised variable-length zeroing code.
+	 */
+	if ( __builtin_constant_p ( character ) && ( character == 0 ) ) {
+		arm64_bzero ( dest, len );
+		return dest;
+	}
+
+	/* Not necessarily zeroing: use basic variable-length code */
+	arm64_memset ( dest, len, character );
+	return dest;
+}
+
+/**
+ * Copy memory region
+ *
+ * @v dest		Destination region
+ * @v src		Source region
+ * @v len		Length
+ * @ret dest		Destination region
+ */
+static inline __attribute__ (( always_inline )) void *
+memcpy ( void *dest, const void *src, size_t len ) {
+
+	/* Allow gcc to generate inline "ldX"/"stX" instructions for
+	 * small, constant lengths.
+	 */
+	if ( __builtin_constant_p ( len ) && ( len <= 64 ) ) {
+		__builtin_memcpy ( dest, src, len );
+		return dest;
+	}
+
+	/* Otherwise, use variable-length code */
+	arm64_memcpy ( dest, src, len );
+	return dest;
+}
+
+/**
+ * Copy (possibly overlapping) memory region
+ *
+ * @v dest		Destination region
+ * @v src		Source region
+ * @v len		Length
+ * @ret dest		Destination region
+ */
+static inline __attribute__ (( always_inline )) void *
+memmove ( void *dest, const void *src, size_t len ) {
+	ssize_t offset = ( dest - src );
+
+	/* If required direction of copy is known at build time, then
+	 * use the appropriate forwards/backwards copy directly.
+	 */
+	if ( __builtin_constant_p ( offset ) ) {
+		if ( offset <= 0 ) {
+			arm64_memmove_forwards ( dest, src, len );
+			return dest;
+		} else {
+			arm64_memmove_backwards ( dest, src, len );
+			return dest;
+		}
+	}
+
+	/* Otherwise, use ambidirectional copy */
+	arm64_memmove ( dest, src, len );
+	return dest;
+}
+
+#endif /* BITS_STRING_H */
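
To make the dispatch logic in these wrappers concrete, here is an illustrative (and entirely hypothetical) caller showing which path each call is expected to take when gcc can see the lengths as compile-time constants:

#include <stdint.h>
#include <string.h>

/* Hypothetical structure and caller, purely for illustration */
struct example {
	uint8_t header[32];
	uint8_t payload[512];
};

static void example_usage ( struct example *ex, const void *data, size_t len ) {

	/* Constant zero fill, constant length of 64 bytes or less:
	 * expected to be inlined by gcc via __builtin_memset().
	 */
	memset ( ex->header, 0, sizeof ( ex->header ) );

	/* Constant zero fill, larger constant length: expected to call
	 * the out-of-line arm64_bzero().
	 */
	memset ( ex->payload, 0, sizeof ( ex->payload ) );

	/* Non-zero fill value: expected to call arm64_memset(), which
	 * takes ( dest, len, character ) rather than the usual order.
	 */
	memset ( ex->payload, 0xff, sizeof ( ex->payload ) );

	/* Constant length of 64 bytes or less: expected to be inlined
	 * via __builtin_memcpy().
	 */
	memcpy ( ex->header, data, sizeof ( ex->header ) );

	/* Variable length: expected to call arm64_memcpy() */
	memcpy ( ex->payload, data, len );
}

For memmove(), the offset ( dest - src ) is a build-time constant only in cases such as moves between fixed offsets within the same object; there the wrapper picks arm64_memmove_forwards() or arm64_memmove_backwards() directly, and otherwise it falls back to arm64_memmove(), which compares the pointers at run time.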
