#ifndef X86_BITS_STRING_H
#define X86_BITS_STRING_H

/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/** @file
 *
 * Optimised string operations
 *
 */

extern void * __memcpy ( void *dest, const void *src, size_t len );
extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );

/**
 * Copy memory area (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__constant_memcpy ( void *dest, const void *src, size_t len ) {
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	const union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *src_u = src;
	const void *esi;
	void *edi;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/*
	 * Single-register moves; these are always better than a
	 * string operation. We can clobber an arbitrary two
	 * registers (data, source, dest can re-use source register)
	 * instead of being restricted to esi and edi. There's also a
	 * much greater potential for optimising with nearby code.
	 *
	 */
	case 1 : /* 4 bytes */
		dest_u->u8[0] = src_u->u8[0];
		return dest;
	case 2 : /* 6 bytes */
		dest_u->u16[0] = src_u->u16[0];
		return dest;
	case 4 : /* 4 bytes */
		dest_u->u32[0] = src_u->u32[0];
		return dest;
	/*
	 * Double-register moves; these are probably still a win.
	 *
	 */
	case 3 : /* 12 bytes */
		dest_u->u16[0] = src_u->u16[0];
		dest_u->u8[2] = src_u->u8[2];
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u8[4] = src_u->u8[4];
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u16[2] = src_u->u16[2];
		return dest;
	case 8 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u32[1] = src_u->u32[1];
		return dest;
	}

	/* Even if we have to load up esi and edi ready for a string
	 * operation, we can sometimes save space by using multiple
	 * single-byte "movs" operations instead of loading up ecx and
	 * using "rep movsb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
	 * but "movsl" moves twice as much data, so it balances out).
	 *
	 * The cutoff point therefore occurs around 26 bytes; the byte
	 * requirements for each method are:
	 *
	 * len             16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	 * #bytes (ecx)     8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
	 * #bytes (no ecx)  4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
	 */
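	/* For example, at len == 26 the unrolled form first needs six
	 * "movsl" plus one "movsw" (6 + 2 = 8 bytes), matching the
	 * 8-byte "rep movsb" approach; hence the "len >= 26" cutoff
	 * used below.
	 */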
	esi = src;
	edi = dest;

	if ( len >= 26 )
		return __memcpy ( dest, src, len );

	if ( len >= 6*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );

	return dest;
}

/**
 * Copy memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {
	if ( __builtin_constant_p ( len ) ) {
		return __constant_memcpy ( dest, src, len );
	} else {
		return __memcpy ( dest, src, len );
	}
}
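/* Illustration (hypothetical names, not part of the original header): a
 * call such as memcpy ( hdr, tmpl, 8 ) with a compile-time constant
 * length inlines to two 32-bit moves via __constant_memcpy(), while a
 * call whose length is only known at runtime falls through to the
 * out-of-line __memcpy() implementation.
 */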
extern void * __memmove ( void *dest, const void *src, size_t len );

/**
 * Copy (possibly overlapping) memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t offset = ( dest - src );

	if ( __builtin_constant_p ( offset ) ) {
		if ( offset <= 0 ) {
			return memcpy ( dest, src, len );
		} else {
			return __memcpy_reverse ( dest, src, len );
		}
	} else {
		return __memmove ( dest, src, len );
	}
}
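/* The direction test above is the standard overlap-safety argument: when
 * the destination lies at or below the source ( offset <= 0 ), a forward
 * copy can never overwrite source bytes before they are read, so plain
 * memcpy() is safe; when the destination lies above the source, copying
 * backwards via __memcpy_reverse() is the safe order. This choice can
 * only be made at compile time when the pointer difference is itself a
 * compile-time constant; otherwise the generic __memmove() is used.
 */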
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__memset ( void *dest, int fill, size_t len ) {
	void *discard_D;
	size_t discard_c;

	__asm__ __volatile__ ( "rep stosb"
			       : "=&D" ( discard_D ), "=&c" ( discard_c )
			       : "0" ( dest ), "1" ( len ), "a" ( fill )
			       : "memory" );
	return dest;
}

/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/* Single-register moves. Almost certainly better than a
	 * string operation. We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2 : /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4 : /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;
	/* Double-register moves. Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2] = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4] = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	}
	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */
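	/* As in __constant_memcpy(), the "len >= 26" test below hands
	 * larger fills to the "rep stosb" implementation; for shorter
	 * fills, eax is zeroed once and then reused by every "stosl",
	 * "stosw" and "stosb" in the unrolled sequence.
	 */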
	edi = dest;
	eax = 0;

	if ( len >= 26 )
		return __memset ( dest, 0, len );

	if ( len >= 6*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	return dest;
}

/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {
	if ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
	     __builtin_constant_p ( len ) ) {
		return __constant_memset_zero ( dest, len );
	} else {
		return __memset ( dest, fill, len );
	}
}
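/* Illustration (hypothetical names, not part of the original header): a
 * call such as memset ( &hdr, 0, sizeof ( hdr ) ) with a constant zero
 * fill and constant length inlines via __constant_memset_zero(), while a
 * non-zero fill pattern or a runtime-variable length always uses the
 * "rep stosb" based __memset().
 */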

#endif /* X86_BITS_STRING_H */