[libc] Reduce size of memset()

As with memcpy(), we can reduce the code size (by an average of 0.2%)
by giving the compiler more visibility into what memset() is doing,
and by avoiding the "rep" prefix on short fixed-length sequences of
string operations.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
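As a rough illustration of where the saving comes from (a sketch of mine, not code from this commit), consider a call site with a constant-length zero fill that is partly overwritten straight away.  With memset() fully visible to the compiler, the 8-byte fill below becomes two 32-bit stores, and the assignment to "type" leaves the compiler free to discard the first store as dead:

#include <stdint.h>
#include <string.h>

struct example_header {			/* hypothetical example type */
	uint32_t type;
	uint32_t length;
};

static void init_header ( struct example_header *hdr ) {
	/* Constant 8-byte zero fill: two direct 32-bit stores, with
	 * no "rep" prefix and no ecx setup.
	 */
	memset ( hdr, 0, sizeof ( *hdr ) );
	/* Overwrites the first four zeroed bytes, so the compiler
	 * may drop the corresponding store entirely.
	 */
	hdr->type = 1;
}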
Michael Brown committed 9 years ago · commit 21d3d5c47c · tags/v1.20.1
1 changed file with 127 additions and 1 deletion

src/arch/x86/include/bits/string.h

@@ -200,7 +200,8 @@ memmove ( void *dest, const void *src, size_t len ) {
  * @v len		Length
  * @ret dest		Destination address
  */
-static inline void * memset ( void *dest, int fill, size_t len ) {
+static inline __attribute__ (( always_inline )) void *
+__memset ( void *dest, int fill, size_t len ) {
 	void *discard_D;
 	size_t discard_c;
 
@@ -211,4 +212,129 @@ static inline void * memset ( void *dest, int fill, size_t len ) {
 	return dest;
 }
 
+/**
+ * Fill memory region with zero (where length is a compile-time constant)
+ *
+ * @v dest		Destination address
+ * @v len		Length
+ * @ret dest		Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+__constant_memset_zero ( void *dest, size_t len ) {
+	union {
+		uint32_t u32[2];
+		uint16_t u16[4];
+		uint8_t  u8[8];
+	} __attribute__ (( __may_alias__ )) *dest_u = dest;
+	void *edi;
+	uint32_t eax;
+
+	switch ( len ) {
+	case 0 : /* 0 bytes */
+		return dest;
+
+	/* Single-register moves.  Almost certainly better than a
+	 * string operation.  We can avoid clobbering any registers,
+	 * we can reuse a zero that happens to already be in a
+	 * register, and we can optimise away the code entirely if the
+	 * memset() is used to clear a region which then gets
+	 * immediately overwritten.
+	 */
+	case 1 : /* 3 bytes */
+		dest_u->u8[0] = 0;
+		return dest;
+	case 2 : /* 5 bytes */
+		dest_u->u16[0] = 0;
+		return dest;
+	case 4 : /* 6 bytes */
+		dest_u->u32[0] = 0;
+		return dest;
+
+	/* Double-register moves.  Very probably better than a string
+	 * operation.
+	 */
+	case 3 : /* 9 bytes */
+		dest_u->u16[0] = 0;
+		dest_u->u8[2]  = 0;
+		return dest;
+	case 5 : /* 10 bytes */
+		dest_u->u32[0] = 0;
+		dest_u->u8[4]  = 0;
+		return dest;
+	case 6 : /* 12 bytes */
+		dest_u->u32[0] = 0;
+		dest_u->u16[2] = 0;
+		return dest;
+	case 8 : /* 13 bytes */
+		dest_u->u32[0] = 0;
+		dest_u->u32[1] = 0;
+		return dest;
+	}
+
+	/* As with memcpy(), we can potentially save space by using
+	 * multiple single-byte "stos" instructions instead of loading
+	 * up ecx and using "rep stosb".
+	 *
+	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
+	 * to allow for saving/restoring ecx 50% of the time.
+	 *
+	 * "stosl" and "stosb" are 1 byte each, "stosw" is 2 bytes.
+	 *
+	 * The calculations are therefore the same as for memcpy(),
+	 * giving a cutoff point of around 26 bytes.
+	 */
+
+	edi = dest;
+	eax = 0;
+
+	if ( len >= 26 )
+		return __memset ( dest, 0, len );
+
+	if ( len >= 6*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 5*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 4*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 3*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 2*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( len >= 1*4 )
+		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( ( len % 4 ) >= 2 )
+		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+	if ( ( len % 2 ) >= 1 )
+		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
+				       : "0" ( edi ), "1" ( eax ) : "memory" );
+
+	return dest;
+}
+
+/**
+ * Fill memory region
+ *
+ * @v dest		Destination address
+ * @v fill		Fill pattern
+ * @v len		Length
+ * @ret dest		Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+memset ( void *dest, int fill, size_t len ) {
+
+	if ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
+	     __builtin_constant_p ( len ) ) {
+		return __constant_memset_zero ( dest, len );
+	} else {
+		return __memset ( dest, fill, len );
+	}
+}
+
 #endif /* X86_BITS_STRING_H */
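
To make the cutoff arithmetic in the new comment concrete: zeroing 25 bytes takes six "stosl" plus one "stosb", i.e. 6×1 + 1 = 7 bytes of code, which still beats the roughly 8 bytes (7 plus the amortised ecx save/restore byte) of the "load ecx, rep stosb" form.  At 26 bytes the unrolled sequence needs six "stosl" plus one "stosw" (6 + 2 = 8 bytes), so the saving disappears and any length of 26 bytes or more falls through to __memset().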
