94 #ifndef VSMC_UTILITY_CSTRING_HPP
95 #define VSMC_UTILITY_CSTRING_HPP
101 #include <emmintrin.h>
105 #include <immintrin.h>
110 #ifndef VSMC_CSTRING_RUNTIME_DISPATCH
111 #define VSMC_CSTRING_RUNTIME_DISPATCH 0
116 #ifndef VSMC_CSTRING_NON_TEMPORAL_THRESHOLD
117 #define VSMC_CSTRING_NON_TEMPORAL_THRESHOLD 0
/// \brief Runtime assertion that the buffers of an `n`-byte `memcpy` do not
/// overlap
///
/// \param dst Destination pointer
/// \param src Source pointer
/// \param n   Number of bytes to be copied
///
/// \details
/// The ranges `[dst, dst + n)` and `[src, src + n)` are disjoint exactly when
/// one pointer is at least `n` bytes past the other, that is when
/// `dst - src >= n` or `src - dst >= n`. This is the condition handed to
/// `VSMC_RUNTIME_ASSERT`, so the "OVERLAPPING BUFFERS" message is reported
/// only when the two ranges really share bytes. Exactly adjacent buffers and
/// `n == 0` pass.
///
/// The previous form tested `dst - src <= n && src - dst <= n`, which is the
/// opposite predicate: it accepted overlapping buffers and rejected buffers
/// that were further than `n` bytes apart.
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_MEMCPY(dst, src, n) \
    VSMC_RUNTIME_ASSERT((                                                    \
                static_cast<const char *>(dst) -                             \
                static_cast<const char *>(src) >=                            \
                static_cast<std::ptrdiff_t>(n) ||                            \
                static_cast<const char *>(src) -                             \
                static_cast<const char *>(dst) >=                            \
                static_cast<std::ptrdiff_t>(n)),                             \
            ("**vsmc::memcpy** OVERLAPPING BUFFERS"))
/// \brief Runtime assertion on the destination pointer of `func`
///
/// \param ISA  SIMD instruction set passed on to `cstring_is_aligned`
/// \param dst  Destination pointer to be checked
/// \param func Name of the calling function; it is stringified into the
/// assertion message
///
/// \details
/// The assertion passes when
/// `::vsmc::internal::cstring_is_aligned<ISA>(dst)` is non-zero.
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_ALIGNED_DST(ISA, dst, func) \
    VSMC_RUNTIME_ASSERT(                                                     \
            (0 != ::vsmc::internal::cstring_is_aligned<ISA>(dst)),           \
            ("**vsmc::"#func" DESTINATION POINTER IS NOT ALIGNED"))
/// \brief Runtime assertion on the source pointer of `func`
///
/// \param ISA  SIMD instruction set passed on to `cstring_is_aligned`
/// \param src  Source pointer to be checked
/// \param func Name of the calling function; it is stringified into the
/// assertion message
///
/// \details
/// The assertion passes when
/// `::vsmc::internal::cstring_is_aligned<ISA>(src)` is non-zero.
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_ALIGNED_SRC(ISA, src, func) \
    VSMC_RUNTIME_ASSERT(                                                     \
            (0 != ::vsmc::internal::cstring_is_aligned<ISA>(src)),           \
            ("**vsmc::"#func" SOURCE POINTER IS NOT ALIGNED"))
141 #define VSMC_DEFINE_UTILITY_CSTRING_SET(ISA, da, nt, c, m,\
144 inline void memset_n<ISA, 1, da, nt> (void *dst, int ch, std::size_t n) \
149 c *dstc = static_cast<c *>(dst); \
150 m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
151 if (n >= traits::SIMDTrait<ISA>::alignment) { \
152 n -= traits::SIMDTrait<ISA>::alignment; \
154 dstc += traits::SIMDTrait<ISA>::alignment / sizeof(c); \
156 memset_0<ISA>(dstc, ch, n); \
160 inline void memset_n<ISA, 2, da, nt> (void *dst, int ch, std::size_t n) \
165 c *dstc = static_cast<c *>(dst); \
166 m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
167 if (n >= traits::SIMDTrait<ISA>::alignment * 2) { \
168 n -= traits::SIMDTrait<ISA>::alignment * 2; \
169 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
170 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
171 dstc += traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c); \
173 memset_n<ISA, 1, da, nt>(dstc, ch, n); \
177 inline void memset_n<ISA, 4, da, nt> (void *dst, int ch, std::size_t n) \
182 c *dstc = static_cast<c *>(dst); \
183 m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
184 if (n >= traits::SIMDTrait<ISA>::alignment * 4) { \
185 n -= traits::SIMDTrait<ISA>::alignment * 4; \
186 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
187 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
188 store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m0); \
189 store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m0); \
190 dstc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
192 memset_n<ISA, 2, da, nt>(dstc, ch, n); \
196 inline void memset_n<ISA, 8, da, nt> (void *dst, int ch, std::size_t n) \
201 c *dstc = static_cast<c *>(dst); \
202 m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
203 if (n >= traits::SIMDTrait<ISA>::alignment * 8) { \
204 n -= traits::SIMDTrait<ISA>::alignment * 8; \
205 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
206 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
207 store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m0); \
208 store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m0); \
209 store(dstc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c), m0); \
210 store(dstc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c), m0); \
211 store(dstc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c), m0); \
212 store(dstc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c), m0); \
213 dstc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
215 memset_n<ISA, 4, da, nt>(dstc, ch, n); \
219 inline void memset_l<ISA, 4, da, nt> (void *dst, int ch, std::size_t n) \
224 c *dstc = static_cast<c *>(dst); \
225 m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
226 while (n >= traits::SIMDTrait<ISA>::alignment * 4) { \
227 n -= traits::SIMDTrait<ISA>::alignment * 4; \
228 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
229 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
230 store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m0); \
231 store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m0); \
232 dstc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
234 memset_n<ISA, 2, da, nt>(dstc, ch, n); \
238 inline void memset_l<ISA, 8, da, nt> (void *dst, int ch, std::size_t n) \
243 c *dstc = static_cast<c *>(dst); \
244 m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
245 while (n >= traits::SIMDTrait<ISA>::alignment * 8) { \
246 n -= traits::SIMDTrait<ISA>::alignment * 8; \
247 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
248 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
249 store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m0); \
250 store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m0); \
251 store(dstc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c), m0); \
252 store(dstc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c), m0); \
253 store(dstc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c), m0); \
254 store(dstc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c), m0); \
255 dstc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
257 memset_n<ISA, 4, da, nt>(dstc, ch, n); \
260 #define VSMC_DEFINE_UTILITY_CSTRING_CPY(ISA, sa, da, nt, c, m, load, store) \
262 inline void memcpy_n<ISA, 1, sa, da, nt> ( \
263 void *dst, const void *src, std::size_t n) \
268 c *dstc = static_cast<c *>(dst); \
269 const c *srcc = static_cast<const c *>(src); \
270 if (n >= traits::SIMDTrait<ISA>::alignment) { \
271 n -= traits::SIMDTrait<ISA>::alignment; \
274 dstc += traits::SIMDTrait<ISA>::alignment / sizeof(c); \
275 srcc += traits::SIMDTrait<ISA>::alignment / sizeof(c); \
277 memcpy_0<ISA>(dstc, srcc, n); \
281 inline void memcpy_n<ISA, 2, sa, da, nt> ( \
282 void *dst, const void *src, std::size_t n) \
287 c *dstc = static_cast<c *>(dst); \
288 const c *srcc = static_cast<const c *>(src); \
289 if (n >= traits::SIMDTrait<ISA>::alignment * 2) { \
290 n -= traits::SIMDTrait<ISA>::alignment * 2; \
291 m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
292 m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
293 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
294 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
295 dstc += traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c); \
296 srcc += traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c); \
298 memcpy_n<ISA, 1, sa, da, nt>(dstc, srcc, n); \
303 inline void memcpy_n<ISA, 4, sa, da, nt> ( \
304 void *dst, const void *src, std::size_t n) \
309 c *dstc = static_cast<c *>(dst); \
310 const c *srcc = static_cast<const c *>(src); \
311 if (n >= traits::SIMDTrait<ISA>::alignment * 4) { \
312 n -= traits::SIMDTrait<ISA>::alignment * 4; \
313 m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
314 m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
315 m m2 = load(srcc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c));\
316 m m3 = load(srcc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c));\
317 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
318 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
319 store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m2); \
320 store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m3); \
321 dstc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
322 srcc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
324 memcpy_n<ISA, 2, sa, da, nt>(dstc, srcc, n); \
328 inline void memcpy_n<ISA, 8, sa, da, nt> ( \
329 void *dst, const void *src, std::size_t n) \
334 c *dstc = static_cast<c *>(dst); \
335 const c *srcc = static_cast<const c *>(src); \
336 if (n >= traits::SIMDTrait<ISA>::alignment * 8) { \
337 n -= traits::SIMDTrait<ISA>::alignment * 8; \
338 m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
339 m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
340 m m2 = load(srcc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c));\
341 m m3 = load(srcc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c));\
342 m m4 = load(srcc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c));\
343 m m5 = load(srcc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c));\
344 m m6 = load(srcc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c));\
345 m m7 = load(srcc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c));\
346 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
347 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
348 store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m2); \
349 store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m3); \
350 store(dstc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c), m4); \
351 store(dstc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c), m5); \
352 store(dstc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c), m6); \
353 store(dstc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c), m7); \
354 dstc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
355 srcc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
357 memcpy_n<ISA, 4, sa, da, nt>(dstc, srcc, n); \
361 inline void memcpy_l<ISA, 4, sa, da, nt> ( \
362 void *dst, const void *src, std::size_t n) \
367 c *dstc = static_cast<c *>(dst); \
368 const c *srcc = static_cast<const c *>(src); \
369 while (n >= traits::SIMDTrait<ISA>::alignment * 4) { \
370 n -= traits::SIMDTrait<ISA>::alignment * 4; \
371 m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
372 m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
373 m m2 = load(srcc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c));\
374 m m3 = load(srcc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c));\
375 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
376 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
377 store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m2); \
378 store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m3); \
379 dstc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
380 srcc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
382 memcpy_n<ISA, 2, sa, da, nt>(dstc, srcc, n); \
386 inline void memcpy_l<ISA, 8, sa, da, nt> ( \
387 void *dst, const void *src, std::size_t n) \
392 c *dstc = static_cast<c *>(dst); \
393 const c *srcc = static_cast<const c *>(src); \
394 while (n >= traits::SIMDTrait<ISA>::alignment * 8) { \
395 n -= traits::SIMDTrait<ISA>::alignment * 8; \
396 m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
397 m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
398 m m2 = load(srcc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c));\
399 m m3 = load(srcc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c));\
400 m m4 = load(srcc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c));\
401 m m5 = load(srcc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c));\
402 m m6 = load(srcc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c));\
403 m m7 = load(srcc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c));\
404 store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
405 store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
406 store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m2); \
407 store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m3); \
408 store(dstc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c), m4); \
409 store(dstc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c), m5); \
410 store(dstc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c), m6); \
411 store(dstc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c), m7); \
412 dstc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
413 srcc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
415 memcpy_n<ISA, 4, sa, da, nt>(dstc, srcc, n); \
425 return reinterpret_cast<uintptr_t
>(ptr) %
430 inline void memset_0 (
void *dst,
int ch, std::size_t n)
437 inline void memcpy_0 (
void *dst,
const void *src, std::size_t n)
443 template <SIMD, std::
size_t,
bool,
bool>
444 inline void memset_n (
void *,
int, std::size_t);
446 template <SIMD, std::
size_t,
bool,
bool>
447 inline void memset_l (
void *,
int, std::size_t);
449 template <SIMD, std::
size_t,
bool,
bool,
bool>
450 inline void memcpy_n (
void *,
const void *, std::size_t);
452 template <SIMD, std::
size_t,
bool,
bool,
bool>
453 inline void memcpy_l (
void *,
const void *, std::size_t);
458 _mm_castsi128_pd, _mm_set1_epi8, _mm_storeu_pd)
460 _mm_castsi128_pd, _mm_set1_epi8, _mm_storeu_pd)
462 _mm_castsi128_pd, _mm_set1_epi8, _mm_store_pd)
464 _mm_castsi128_pd, _mm_set1_epi8, _mm_stream_pd)
467 _mm_loadu_pd, _mm_storeu_pd)
469 _mm_loadu_pd, _mm_storeu_pd)
471 _mm_loadu_pd, _mm_store_pd)
473 _mm_loadu_pd, _mm_stream_pd)
475 _mm_load_pd, _mm_storeu_pd)
477 _mm_load_pd, _mm_storeu_pd)
479 _mm_load_pd, _mm_store_pd)
481 _mm_load_pd, _mm_stream_pd)
483 #endif // VSMC_HAS_SSE2
488 _mm256_castsi256_pd, _mm256_set1_epi8, _mm256_storeu_pd)
490 _mm256_castsi256_pd, _mm256_set1_epi8, _mm256_storeu_pd)
492 _mm256_castsi256_pd, _mm256_set1_epi8, _mm256_store_pd)
494 _mm256_castsi256_pd, _mm256_set1_epi8, _mm256_stream_pd)
497 _mm256_loadu_pd, _mm256_storeu_pd)
499 _mm256_loadu_pd, _mm256_storeu_pd)
501 _mm256_loadu_pd, _mm256_store_pd)
503 _mm256_loadu_pd, _mm256_stream_pd)
505 _mm256_load_pd, _mm256_storeu_pd)
507 _mm256_load_pd, _mm256_storeu_pd)
509 _mm256_load_pd, _mm256_store_pd)
511 _mm256_load_pd, _mm256_stream_pd)
513 #endif // VSMC_HAS_AVX
515 template <SIMD ISA, std::
size_t N>
519 case 0: memset_n<ISA, N, false, false>(dst, ch, n);
break;
520 case 1: memset_n<ISA, N, false, true >(dst, ch, n);
break;
521 case 2: memset_n<ISA, N, true, false>(dst, ch, n);
break;
522 case 3: memset_n<ISA, N, true, true >(dst, ch, n);
break;
527 template <SIMD ISA, std::
size_t N>
531 case 0: memset_l<ISA, N, false, false>(dst, ch, n);
break;
532 case 1: memset_l<ISA, N, false, true >(dst, ch, n);
break;
533 case 2: memset_l<ISA, N, true, false>(dst, ch, n);
break;
534 case 3: memset_l<ISA, N, true, true >(dst, ch, n);
break;
539 template <SIMD ISA, std::
size_t N>
544 case 0 : memcpy_n<ISA, N, false, false, false>(dst, src, n);
break;
545 case 1 : memcpy_n<ISA, N, false, false, true >(dst, src, n);
break;
546 case 2 : memcpy_n<ISA, N, false, true, false>(dst, src, n);
break;
547 case 3 : memcpy_n<ISA, N, false, true, true >(dst, src, n);
break;
548 case 4 : memcpy_n<ISA, N, true, false, false>(dst, src, n);
break;
549 case 5 : memcpy_n<ISA, N, true, false, true >(dst, src, n);
break;
550 case 6 : memcpy_n<ISA, N, true, true, false>(dst, src, n);
break;
551 case 7 : memcpy_n<ISA, N, true, true, true >(dst, src, n);
break;
556 template <SIMD ISA, std::
size_t N>
561 case 0 : memcpy_l<ISA, N, false, false, false>(dst, src, n);
break;
562 case 1 : memcpy_l<ISA, N, false, false, true >(dst, src, n);
break;
563 case 2 : memcpy_l<ISA, N, false, true, false>(dst, src, n);
break;
564 case 3 : memcpy_l<ISA, N, false, true, true >(dst, src, n);
break;
565 case 4 : memcpy_l<ISA, N, true, false, false>(dst, src, n);
break;
566 case 5 : memcpy_l<ISA, N, true, false, true >(dst, src, n);
break;
567 case 6 : memcpy_l<ISA, N, true, true, false>(dst, src, n);
break;
568 case 7 : memcpy_l<ISA, N, true, true, true >(dst, src, n);
break;
609 unsigned cache_index = 0;
611 unsigned cache_level = 0;
612 for (
unsigned index = 0; index != cache_index_max; ++index) {
614 if (level <= max_level_ && level > cache_level) {
621 threshold_ = 1U << 18;
625 void set (std::size_t threshold) {threshold_ = threshold;}
628 std::size_t
get ()
const {
return threshold_;}
632 unsigned over (std::size_t n)
const {
return n > threshold_ ? 1 : 0;}
636 std::size_t threshold_;
637 std::size_t max_level_;
641 {
if (threshold_ == 0)
set();}
643 CStringNonTemporalThreshold (
const CStringNonTemporalThreshold &);
645 CStringNonTemporalThreshold &operator= (
646 const CStringNonTemporalThreshold &);
655 memset_0<ISA>(dst, ch, n);
659 unsigned flag = cstring_is_aligned<ISA>(dst);
662 memset_n_switch<ISA, traits::SIMDTrait<ISA>::grainsize>(
667 char *dstc =
static_cast<char *
>(dst);
668 std::size_t offset =
reinterpret_cast<uintptr_t
>(dstc) % 64;
670 offset = 64 - offset;
672 dstc, ch, offset, flag);
677 memset_l_switch<ISA, traits::SIMDTrait<ISA>::grainsize>(dstc, ch, n, flag);
683 inline void *
memcpy_simd (
void *dst,
const void *src, std::size_t n)
689 memcpy_0<ISA>(dst, src, n);
693 unsigned flag = cstring_is_aligned<ISA>(dst);
694 flag |= cstring_is_aligned<ISA>(src) << 1;
697 memcpy_n_switch<ISA, traits::SIMDTrait<ISA>::grainsize>(
702 char *dstc =
static_cast<char *
>(dst);
703 const char *srcc =
static_cast<const char *
>(src);
704 std::size_t offset =
reinterpret_cast<uintptr_t
>(dstc) % 64;
706 offset = 64 - offset;
708 dstc, srcc, offset, flag);
714 flag |= cstring_is_aligned<ISA>(srcc) << 1;
715 memcpy_l_switch<ISA, traits::SIMDTrait<ISA>::grainsize>(
716 dstc, srcc, n, flag);
728 memset_n<ISA, traits::SIMDTrait<ISA>::grainsize,
true,
true>(
733 memset_l<ISA, traits::SIMDTrait<ISA>::grainsize,
true,
true>(dst, ch, n);
749 memcpy_n<ISA, traits::SIMDTrait<ISA>::grainsize,
true,
true,
true>(
754 char *dstc =
static_cast<char *
>(dst);
755 const char *srcc =
static_cast<const char *
>(src);
756 std::size_t offset =
reinterpret_cast<uintptr_t
>(srcc) % 64;
758 offset = 64 - offset;
760 true,
true,
true>(dstc, srcc, offset);
765 memcpy_l<ISA, traits::SIMDTrait<ISA>::grainsize,
true,
true,
true>(
780 inline void *
memcpy_std (
void *dst,
const void *src, std::size_t n)
788 {
return internal::memset_simd<SSE2>(dst, ch, n);}
792 inline void *
memcpy_sse2 (
void *dst,
const void *src, std::size_t n)
793 {
return internal::memcpy_simd<SSE2>(dst, src, n);}
800 {
return internal::memset_simd_nt<SSE2>(dst, ch, n);}
807 {
return internal::memcpy_simd_nt<SSE2>(dst, src, n);}
809 #endif // VSMC_HAS_SSE2
816 {
return internal::memset_simd<AVX>(dst, ch, n);}
820 inline void *
memcpy_avx (
void *dst,
const void *src, std::size_t n)
821 {
return internal::memcpy_simd<AVX>(dst, src, n);}
828 {
return internal::memset_simd_nt<AVX>(dst, ch, n);}
835 {
return internal::memcpy_simd_nt<AVX>(dst, src, n);}
837 #endif // VSMC_HAS_AVX
852 void *
memset (
void *dst,
int ch, std::size_t n)
const
853 {
return memset_(dst, ch, n);}
855 void *
memcpy (
void *dst,
const void *src, std::size_t n)
const
856 {
return memcpy_(dst, src, n);}
858 void *
memset_nt (
void *dst,
int ch, std::size_t n)
const
859 {
return memset_nt_(dst, ch, n);}
861 void *
memcpy_nt (
void *dst,
const void *src, std::size_t n)
const
862 {
return memcpy_nt_(dst, src, n);}
866 void *(*memset_) (
void *,
int, std::size_t);
867 void *(*memcpy_) (
void *,
const void *, std::size_t);
868 void *(*memset_nt_) (
void *, int, std::size_t);
869 void *(*memcpy_nt_) (
void *,
const void *, std::size_t);
879 if (CPUID::has_feature<CPUIDFeatureSSE2>()) {
885 #endif // VSMC_HAS_SSE2
888 if (CPUID::has_feature<CPUIDFeatureAVX>()) {
894 #endif // VSMC_HAS_AVX
897 CStringRuntimeDispatch (
const CStringRuntimeDispatch &);
899 CStringRuntimeDispatch &operator= (
const CStringRuntimeDispatch &);
906 inline void *
memset (
void *dst,
int ch, std::size_t n)
908 #if VSMC_CSTRING_RUNTIME_DISPATCH
918 #endif // VSMC_CSTRING_RUNTIME_DISPATCH
923 inline void *
memcpy (
void *dst,
const void *src, std::size_t n)
926 #if VSMC_CSTRING_RUNTIME_DISPATCH
936 #endif // VSMC_CSTRING_RUNTIME_DISPATCH
944 inline void *
memset_nt (
void *dst,
int ch, std::size_t n)
946 #if VSMC_CSTRING_RUNTIME_DISPATCH
956 #endif // VSMC_CSTRING_RUNTIME_DISPATCH
964 inline void *
memcpy_nt (
void *dst,
const void *src, std::size_t n)
967 #if VSMC_CSTRING_RUNTIME_DISPATCH
977 #endif // VSMC_CSTRING_RUNTIME_DISPATCH
982 #endif // VSMC_UTILITY_CSTRING_HPP
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_ALIGNED_SRC(ISA, src, func)
void memset_l(void *, int, std::size_t)
void * memcpy_avx_nt(void *dst, const void *src, std::size_t n)
AVX optimized memcpy with non-temporal store regardless of size.
void set(std::size_t threshold)
Set the threshold to a specific size.
static CStringRuntimeDispatch & instance()
void * memcpy_simd_nt(void *dst, const void *src, std::size_t n)
void * memset_nt(void *dst, int ch, std::size_t n)
SIMD optimized memset with non-temporal store regardless of size.
void * memcpy(void *dst, const void *src, std::size_t n) const
static cache_param_type cache_param(unsigned cache_index)
Get the cache parameters (EAX = 0x04; EAX, EBX, ECX, EDX)
void * memset_sse2(void *dst, int ch, std::size_t n)
SSE2 optimized memset with non-temporal store for large buffers.
void set()
Set the threshold to default.
void memcpy_n_switch(void *dst, const void *src, std::size_t n, unsigned flag)
void * memset_avx(void *dst, int ch, std::size_t n)
AVX optimized memset with non-temporal store for large buffers.
void memcpy_l_switch(void *dst, const void *src, std::size_t n, unsigned flag)
void * memset_sse2_nt(void *dst, int ch, std::size_t n)
SSE2 optimized memset with non-temporal store regardless of size.
void * memset(void *dst, int ch, std::size_t n) const
void * memset(void *dst, int ch, std::size_t n)
SIMD optimized memset with non-temporal store for large buffers.
void * memcpy_sse2_nt(void *dst, const void *src, std::size_t n)
SSE2 optimized memcpy with non-temporal store regardless of size.
#define VSMC_DEFINE_UTILITY_CSTRING_SET(ISA, da, nt, c, m, cast, set1, store)
unsigned cstring_is_aligned(const void *ptr)
void * memcpy_simd(void *dst, const void *src, std::size_t n)
unsigned level() const
The level of this cache.
The buffer-size threshold above which memcpy uses non-temporal instructions.
#define VSMC_CSTRING_NON_TEMPORAL_THRESHOLD
Threshold above which non-temporal copy shall be used (0 for auto)
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_ALIGNED_DST(ISA, dst, func)
void memcpy_n(void *, const void *, std::size_t)
void memset_n(void *, int, std::size_t)
void * memcpy_nt(void *dst, const void *src, std::size_t n)
SIMD optimized memcpy with non-temporal store regardless of size.
void * memset_simd(void *dst, int ch, std::size_t n)
void * memset_simd_nt(void *dst, int ch, std::size_t n)
unsigned size() const
Cache size in bytes.
std::size_t max_level() const
The maximum level of cache considered in set().
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_MEMCPY(dst, src, n)
void * memcpy_std(void *dst, const void *src, std::size_t n)
Direct call to std::memcpy
void * memset_avx_nt(void *dst, int ch, std::size_t n)
AVX optimized memset with non-temporal store regardless of size.
#define VSMC_DEFINE_UTILITY_CSTRING_CPY(ISA, sa, da, nt, c, m, load, store)
void * memcpy(void *dst, const void *src, std::size_t n)
SIMD optimized memcpy with non-temporal store for large buffers.
void * memset_nt(void *dst, int ch, std::size_t n) const
static unsigned cache_param_num()
Get the number of caches.
void * memcpy_nt(void *dst, const void *src, std::size_t n) const
void memcpy_0(void *dst, const void *src, std::size_t n)
void memset_l_switch(void *dst, int ch, std::size_t n, unsigned flag)
void max_level(std::size_t level)
Set new maximum level of cache considered in set().
static CStringNonTemporalThreshold & instance()
Singleton instance.
void * memcpy_sse2(void *dst, const void *src, std::size_t n)
SSE2 optimized memcpy with non-temporal store for large buffers.
void * memcpy_avx(void *dst, const void *src, std::size_t n)
AVX optimized memcpy with non-temporal store for large buffers.
unsigned over(std::size_t n) const
Given a number of bytes, return a flag indicating whether it is over the threshold.
void memcpy_l(void *, const void *, std::size_t)
void memset_n_switch(void *dst, int ch, std::size_t n, unsigned flag)
void * memset_std(void *dst, int ch, std::size_t n)
Direct call to std::memset
void memset_0(void *dst, int ch, std::size_t n)