|
@@ -200,7 +200,8 @@ memmove ( void *dest, const void *src, size_t len ) {
|
200
|
200
|
* @v len Length
|
201
|
201
|
* @ret dest Destination address
|
202
|
202
|
*/
|
203
|
|
-static inline void * memset ( void *dest, int fill, size_t len ) {
|
|
203
|
+static inline __attribute__ (( always_inline )) void *
|
|
204
|
+__memset ( void *dest, int fill, size_t len ) {
|
204
|
205
|
void *discard_D;
|
205
|
206
|
size_t discard_c;
|
206
|
207
|
|
|
@@ -211,4 +212,129 @@ static inline void * memset ( void *dest, int fill, size_t len ) {
|
211
|
212
|
return dest;
|
212
|
213
|
}
|
213
|
214
|
|
|
215
|
/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v len		Length
 * @ret dest		Destination address
 *
 * The byte counts noted on each case below are the size of the
 * machine code generated for that case, and are used to choose the
 * cutoff point at which falling back to __memset() becomes smaller.
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	/* may_alias union overlay: lets us store via uint8_t/uint16_t/
	 * uint32_t lvalues without violating strict-aliasing rules.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;

	/* Single-register moves.  Almost certainly better than a
	 * string operation.  We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2: /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4: /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;

	/* Double-register moves.  Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2] = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4] = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	}

	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */

	edi = dest;
	eax = 0;

	/* Too long for the unrolled "stos" cascade: fall back to the
	 * generic implementation.
	 */
	if ( len >= 26 )
		return __memset ( dest, 0, len );

	/* len is a compile-time constant, so exactly the needed
	 * subset of these conditionals survives into the object code:
	 * one "stosl" per full 4-byte word (len/4 of them, len < 26),
	 * then at most one "stosw" and one "stosb" for the remainder.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	return dest;
}
|
|
320
|
+
|
|
321
|
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 *
 * Dispatches at compile time: a zero fill of constant length goes
 * through the size-optimised __constant_memset_zero(), anything else
 * through the generic __memset().
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {

	/* Anything other than a constant-length zero fill takes the
	 * generic path.
	 */
	if ( ! ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
		 __builtin_constant_p ( len ) ) )
		return __memset ( dest, fill, len );

	return __constant_memset_zero ( dest, len );
}
|
|
339
|
+
|
214
|
340
|
#endif /* X86_BITS_STRING_H */
|