vSMC
vSMC: Scalable Monte Carlo
cstring.hpp
Go to the documentation of this file.
1 //============================================================================
2 // vSMC/include/vsmc/utility/cstring.hpp
3 //----------------------------------------------------------------------------
4 // vSMC: Scalable Monte Carlo
5 //----------------------------------------------------------------------------
6 // Copyright (c) 2013,2014, Yan Zhou
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // Redistributions of source code must retain the above copyright notice,
13 // this list of conditions and the following disclaimer.
14 //
15 // Redistributions in binary form must reproduce the above copyright notice,
16 // this list of conditions and the following disclaimer in the documentation
17 // and/or other materials provided with the distribution.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 // POSSIBILITY OF SUCH DAMAGE.
30 //============================================================================
31 
93 
94 #ifndef VSMC_UTILITY_CSTRING_HPP
95 #define VSMC_UTILITY_CSTRING_HPP
96 
97 #include <vsmc/internal/common.hpp>
98 #include <vsmc/utility/cpuid.hpp>
99 
100 #if VSMC_HAS_SSE2
101 #include <emmintrin.h>
102 #endif
103 
104 #if VSMC_HAS_AVX
105 #include <immintrin.h>
106 #endif
107 
/// \brief Turn on runtime dispatch among std/SSE2/AVX implementations based
/// on CPUID feature detection (0: resolve the implementation at compile time)
#ifndef VSMC_CSTRING_RUNTIME_DISPATCH
#define VSMC_CSTRING_RUNTIME_DISPATCH 0
#endif

/// \brief Buffer size in bytes above which non-temporal stores shall be used
/// (0: derive automatically from the cache hierarchy on first use)
#ifndef VSMC_CSTRING_NON_TEMPORAL_THRESHOLD
#define VSMC_CSTRING_NON_TEMPORAL_THRESHOLD 0
#endif
119 
/// \brief Assert that [dst, dst+n) and [src, src+n) do not overlap.
///
/// Fix: the original condition asserted |dst - src| <= n, which is TRUE
/// exactly when the buffers overlap (or touch), so the assertion fired for
/// every well-separated copy and passed for overlapping ones.  Non-overlap
/// requires |dst - src| >= n.  dst == src is explicitly allowed because
/// memcpy_simd/memcpy_simd_nt short-circuit that case.
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_MEMCPY(dst, src, n) \
    VSMC_RUNTIME_ASSERT(( \
                static_cast<const char *>(dst) == \
                static_cast<const char *>(src) || \
                static_cast<const char *>(dst) - \
                static_cast<const char *>(src) >= \
                static_cast<std::ptrdiff_t>(n) || \
                static_cast<const char *>(src) - \
                static_cast<const char *>(dst) >= \
                static_cast<std::ptrdiff_t>(n)), \
            ("**vsmc::memcpy** OVERLAPPING BUFFERS"))
129 
/// \brief Assert that the destination pointer satisfies the SIMD alignment
/// requirement of the given ISA (cstring_is_aligned returns non-zero when
/// aligned)
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_ALIGNED_DST(ISA, dst, func) \
    VSMC_RUNTIME_ASSERT( \
            (::vsmc::internal::cstring_is_aligned<ISA>(dst) != 0), \
            ("**vsmc::"#func" DESTINATION POINTER IS NOT ALIGNED"))

/// \brief Assert that the source pointer satisfies the SIMD alignment
/// requirement of the given ISA
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_ALIGNED_SRC(ISA, src, func) \
    VSMC_RUNTIME_ASSERT( \
            (::vsmc::internal::cstring_is_aligned<ISA>(src) != 0), \
            ("**vsmc::"#func" SOURCE POINTER IS NOT ALIGNED"))
139 
140 
/// \brief Define memset_n and memset_l specializations for one
/// (ISA, destination-aligned \c da, non-temporal \c nt) combination.
///
/// \param c     scalar element type used for pointer arithmetic (matches
///              the store intrinsic's pointer parameter)
/// \param m     SIMD vector type
/// \param cast  intrinsic casting the integer broadcast vector to \c m
/// \param set1  intrinsic broadcasting the fill byte into every lane
/// \param store store intrinsic (unaligned, aligned, or streaming)
///
/// memset_n<ISA, K, da, nt> writes at most K SIMD registers worth of bytes
/// and recurses to the K/2 specialization for the tail; memset_l<ISA, K, da,
/// nt> loops over K-register chunks for large buffers.  The final scalar
/// tail goes to memset_0 (std::memset).
#define VSMC_DEFINE_UTILITY_CSTRING_SET(ISA, da, nt, c, m,\
        cast, set1, store) \
template <> \
inline void memset_n<ISA, 1, da, nt> (void *dst, int ch, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
    if (n >= traits::SIMDTrait<ISA>::alignment) { \
        n -= traits::SIMDTrait<ISA>::alignment; \
        store(dstc, m0); \
        dstc += traits::SIMDTrait<ISA>::alignment / sizeof(c); \
    } \
    memset_0<ISA>(dstc, ch, n); \
} \
 \
template <> \
inline void memset_n<ISA, 2, da, nt> (void *dst, int ch, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
    if (n >= traits::SIMDTrait<ISA>::alignment * 2) { \
        n -= traits::SIMDTrait<ISA>::alignment * 2; \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
        dstc += traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c); \
    } \
    memset_n<ISA, 1, da, nt>(dstc, ch, n); \
} \
 \
template <> \
inline void memset_n<ISA, 4, da, nt> (void *dst, int ch, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
    if (n >= traits::SIMDTrait<ISA>::alignment * 4) { \
        n -= traits::SIMDTrait<ISA>::alignment * 4; \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m0); \
        dstc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
    } \
    memset_n<ISA, 2, da, nt>(dstc, ch, n); \
} \
 \
template <> \
inline void memset_n<ISA, 8, da, nt> (void *dst, int ch, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
    if (n >= traits::SIMDTrait<ISA>::alignment * 8) { \
        n -= traits::SIMDTrait<ISA>::alignment * 8; \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c), m0); \
        dstc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
    } \
    memset_n<ISA, 4, da, nt>(dstc, ch, n); \
} \
 \
template <> \
inline void memset_l<ISA, 4, da, nt> (void *dst, int ch, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
    while (n >= traits::SIMDTrait<ISA>::alignment * 4) { \
        n -= traits::SIMDTrait<ISA>::alignment * 4; \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m0); \
        dstc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
    } \
    memset_n<ISA, 2, da, nt>(dstc, ch, n); \
} \
 \
template <> \
inline void memset_l<ISA, 8, da, nt> (void *dst, int ch, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    m m0 = cast(set1(static_cast<char>(static_cast<unsigned char>(ch)))); \
    while (n >= traits::SIMDTrait<ISA>::alignment * 8) { \
        n -= traits::SIMDTrait<ISA>::alignment * 8; \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c), m0); \
        dstc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
    } \
    memset_n<ISA, 4, da, nt>(dstc, ch, n); \
}
259 
/// \brief Define memcpy_n and memcpy_l specializations for one
/// (ISA, source-aligned \c sa, destination-aligned \c da, non-temporal
/// \c nt) combination.
///
/// \param c     scalar element type used for pointer arithmetic
/// \param m     SIMD vector type
/// \param load  load intrinsic (aligned or unaligned)
/// \param store store intrinsic (unaligned, aligned, or streaming)
///
/// memcpy_n<ISA, K, sa, da, nt> copies at most K SIMD registers and recurses
/// to the K/2 specialization for the tail; memcpy_l<ISA, K, sa, da, nt>
/// loops over K-register chunks for large buffers.  The final scalar tail
/// goes to memcpy_0 (std::memcpy).
#define VSMC_DEFINE_UTILITY_CSTRING_CPY(ISA, sa, da, nt, c, m, load, store) \
template <> \
inline void memcpy_n<ISA, 1, sa, da, nt> ( \
        void *dst, const void *src, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    const c *srcc = static_cast<const c *>(src); \
    if (n >= traits::SIMDTrait<ISA>::alignment) { \
        n -= traits::SIMDTrait<ISA>::alignment; \
        m m0 = load(srcc); \
        store(dstc, m0); \
        dstc += traits::SIMDTrait<ISA>::alignment / sizeof(c); \
        srcc += traits::SIMDTrait<ISA>::alignment / sizeof(c); \
    } \
    memcpy_0<ISA>(dstc, srcc, n); \
} \
 \
template <> \
inline void memcpy_n<ISA, 2, sa, da, nt> ( \
        void *dst, const void *src, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    const c *srcc = static_cast<const c *>(src); \
    if (n >= traits::SIMDTrait<ISA>::alignment * 2) { \
        n -= traits::SIMDTrait<ISA>::alignment * 2; \
        m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
        m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
        dstc += traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c); \
        srcc += traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c); \
    } \
    memcpy_n<ISA, 1, sa, da, nt>(dstc, srcc, n); \
} \
 \
template <> \
inline void memcpy_n<ISA, 4, sa, da, nt> ( \
        void *dst, const void *src, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    const c *srcc = static_cast<const c *>(src); \
    if (n >= traits::SIMDTrait<ISA>::alignment * 4) { \
        n -= traits::SIMDTrait<ISA>::alignment * 4; \
        m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
        m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
        m m2 = load(srcc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c));\
        m m3 = load(srcc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c));\
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m2); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m3); \
        dstc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
        srcc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
    } \
    memcpy_n<ISA, 2, sa, da, nt>(dstc, srcc, n); \
} \
 \
template <> \
inline void memcpy_n<ISA, 8, sa, da, nt> ( \
        void *dst, const void *src, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    const c *srcc = static_cast<const c *>(src); \
    if (n >= traits::SIMDTrait<ISA>::alignment * 8) { \
        n -= traits::SIMDTrait<ISA>::alignment * 8; \
        m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
        m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
        m m2 = load(srcc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c));\
        m m3 = load(srcc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c));\
        m m4 = load(srcc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c));\
        m m5 = load(srcc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c));\
        m m6 = load(srcc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c));\
        m m7 = load(srcc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c));\
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m2); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m3); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c), m4); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c), m5); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c), m6); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c), m7); \
        dstc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
        srcc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
    } \
    memcpy_n<ISA, 4, sa, da, nt>(dstc, srcc, n); \
} \
 \
template <> \
inline void memcpy_l<ISA, 4, sa, da, nt> ( \
        void *dst, const void *src, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    const c *srcc = static_cast<const c *>(src); \
    while (n >= traits::SIMDTrait<ISA>::alignment * 4) { \
        n -= traits::SIMDTrait<ISA>::alignment * 4; \
        m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
        m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
        m m2 = load(srcc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c));\
        m m3 = load(srcc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c));\
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m2); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m3); \
        dstc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
        srcc += traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c); \
    } \
    memcpy_n<ISA, 2, sa, da, nt>(dstc, srcc, n); \
} \
 \
template <> \
inline void memcpy_l<ISA, 8, sa, da, nt> ( \
        void *dst, const void *src, std::size_t n) \
{ \
    if (n == 0) \
        return; \
 \
    c *dstc = static_cast<c *>(dst); \
    const c *srcc = static_cast<const c *>(src); \
    while (n >= traits::SIMDTrait<ISA>::alignment * 8) { \
        n -= traits::SIMDTrait<ISA>::alignment * 8; \
        m m0 = load(srcc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c));\
        m m1 = load(srcc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c));\
        m m2 = load(srcc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c));\
        m m3 = load(srcc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c));\
        m m4 = load(srcc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c));\
        m m5 = load(srcc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c));\
        m m6 = load(srcc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c));\
        m m7 = load(srcc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c));\
        store(dstc + traits::SIMDTrait<ISA>::alignment * 0 / sizeof(c), m0); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 1 / sizeof(c), m1); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 2 / sizeof(c), m2); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 3 / sizeof(c), m3); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 4 / sizeof(c), m4); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 5 / sizeof(c), m5); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 6 / sizeof(c), m6); \
        store(dstc + traits::SIMDTrait<ISA>::alignment * 7 / sizeof(c), m7); \
        dstc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
        srcc += traits::SIMDTrait<ISA>::alignment * 8 / sizeof(c); \
    } \
    memcpy_n<ISA, 4, sa, da, nt>(dstc, srcc, n); \
}
417 
418 namespace vsmc {
419 
420 namespace internal {
421 
422 template <SIMD ISA>
423 inline unsigned cstring_is_aligned (const void *ptr)
424 {
425  return reinterpret_cast<uintptr_t>(ptr) %
427 }
428 
429 template <SIMD>
430 inline void memset_0 (void *dst, int ch, std::size_t n)
431 {
432  if (n != 0)
433  std::memset(dst, ch, n);
434 }
435 
436 template <SIMD>
437 inline void memcpy_0 (void *dst, const void *src, std::size_t n)
438 {
439  if (n != 0)
440  std::memcpy(dst, src, n);
441 }
442 
// Primary templates; the SET/CPY macros above provide full specializations
// for each (ISA, unroll factor, alignment flags, non-temporal) combination.

// memset_n<ISA, K, da, nt>: fill at most K SIMD registers worth of bytes,
// recursing on smaller K for the tail (da: aligned store, nt: streaming).
template <SIMD, std::size_t, bool, bool>
inline void memset_n (void *, int, std::size_t);

// memset_l<ISA, K, da, nt>: loop over K-register chunks for large buffers.
template <SIMD, std::size_t, bool, bool>
inline void memset_l (void *, int, std::size_t);

// memcpy_n<ISA, K, sa, da, nt>: copy at most K SIMD registers; bools are
// source aligned, destination aligned, non-temporal store.
template <SIMD, std::size_t, bool, bool, bool>
inline void memcpy_n (void *, const void *, std::size_t);

// memcpy_l<ISA, K, sa, da, nt>: looped version for large buffers.
template <SIMD, std::size_t, bool, bool, bool>
inline void memcpy_l (void *, const void *, std::size_t);
454 
#if VSMC_HAS_SSE2

// memset specializations: (da, nt) selects the store intrinsic -- unaligned
// (storeu), aligned (store), or streaming (stream).  Note (false, true)
// falls back to unaligned stores: streaming requires alignment.
VSMC_DEFINE_UTILITY_CSTRING_SET(SSE2, false, false, double, __m128d,
        _mm_castsi128_pd, _mm_set1_epi8, _mm_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_SET(SSE2, false, true, double, __m128d,
        _mm_castsi128_pd, _mm_set1_epi8, _mm_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_SET(SSE2, true, false, double, __m128d,
        _mm_castsi128_pd, _mm_set1_epi8, _mm_store_pd)
VSMC_DEFINE_UTILITY_CSTRING_SET(SSE2, true, true, double, __m128d,
        _mm_castsi128_pd, _mm_set1_epi8, _mm_stream_pd)

// memcpy specializations: (sa, da, nt) selects load/store intrinsics; the
// streaming store is used only when the destination is aligned.
VSMC_DEFINE_UTILITY_CSTRING_CPY(SSE2, false, false, false, double, __m128d,
        _mm_loadu_pd, _mm_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(SSE2, false, false, true, double, __m128d,
        _mm_loadu_pd, _mm_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(SSE2, false, true, false, double, __m128d,
        _mm_loadu_pd, _mm_store_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(SSE2, false, true, true, double, __m128d,
        _mm_loadu_pd, _mm_stream_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(SSE2, true, false, false, double, __m128d,
        _mm_load_pd, _mm_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(SSE2, true, false, true, double, __m128d,
        _mm_load_pd, _mm_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(SSE2, true, true, false, double, __m128d,
        _mm_load_pd, _mm_store_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(SSE2, true, true, true, double, __m128d,
        _mm_load_pd, _mm_stream_pd)

#endif // VSMC_HAS_SSE2

#if VSMC_HAS_AVX

// Same layout as the SSE2 instantiations, with 256-bit registers.
VSMC_DEFINE_UTILITY_CSTRING_SET(AVX, false, false, double, __m256d,
        _mm256_castsi256_pd, _mm256_set1_epi8, _mm256_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_SET(AVX, false, true, double, __m256d,
        _mm256_castsi256_pd, _mm256_set1_epi8, _mm256_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_SET(AVX, true, false, double, __m256d,
        _mm256_castsi256_pd, _mm256_set1_epi8, _mm256_store_pd)
VSMC_DEFINE_UTILITY_CSTRING_SET(AVX, true, true, double, __m256d,
        _mm256_castsi256_pd, _mm256_set1_epi8, _mm256_stream_pd)

VSMC_DEFINE_UTILITY_CSTRING_CPY(AVX, false, false, false, double, __m256d,
        _mm256_loadu_pd, _mm256_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(AVX, false, false, true, double, __m256d,
        _mm256_loadu_pd, _mm256_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(AVX, false, true, false, double, __m256d,
        _mm256_loadu_pd, _mm256_store_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(AVX, false, true, true, double, __m256d,
        _mm256_loadu_pd, _mm256_stream_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(AVX, true, false, false, double, __m256d,
        _mm256_load_pd, _mm256_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(AVX, true, false, true, double, __m256d,
        _mm256_load_pd, _mm256_storeu_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(AVX, true, true, false, double, __m256d,
        _mm256_load_pd, _mm256_store_pd)
VSMC_DEFINE_UTILITY_CSTRING_CPY(AVX, true, true, true, double, __m256d,
        _mm256_load_pd, _mm256_stream_pd)

#endif // VSMC_HAS_AVX
514 
// Dispatch memset_n on a runtime flag: bit 0 selects non-temporal stores,
// bit 1 selects the aligned-destination variant.  Flags above 3 are a
// deliberate no-op.
template <SIMD ISA, std::size_t N>
inline void memset_n_switch (void *dst, int ch, std::size_t n, unsigned flag)
{
    switch (flag) {
        case 0: memset_n<ISA, N, false, false>(dst, ch, n); break;
        case 1: memset_n<ISA, N, false, true >(dst, ch, n); break;
        case 2: memset_n<ISA, N, true, false>(dst, ch, n); break;
        case 3: memset_n<ISA, N, true, true >(dst, ch, n); break;
        default : break;
    }
}
526 
// Looped counterpart of memset_n_switch for large buffers; same flag
// encoding (bit 0: non-temporal, bit 1: destination aligned).
template <SIMD ISA, std::size_t N>
inline void memset_l_switch (void *dst, int ch, std::size_t n, unsigned flag)
{
    switch (flag) {
        case 0: memset_l<ISA, N, false, false>(dst, ch, n); break;
        case 1: memset_l<ISA, N, false, true >(dst, ch, n); break;
        case 2: memset_l<ISA, N, true, false>(dst, ch, n); break;
        case 3: memset_l<ISA, N, true, true >(dst, ch, n); break;
        default : break;
    }
}
538 
// Dispatch memcpy_n on a runtime flag: bit 0 = non-temporal store, bit 1 =
// destination aligned, bit 2 = source aligned.  Flags above 7 are a no-op.
template <SIMD ISA, std::size_t N>
inline void memcpy_n_switch (void *dst, const void *src, std::size_t n,
        unsigned flag)
{
    switch (flag) {
        case 0 : memcpy_n<ISA, N, false, false, false>(dst, src, n); break;
        case 1 : memcpy_n<ISA, N, false, false, true >(dst, src, n); break;
        case 2 : memcpy_n<ISA, N, false, true, false>(dst, src, n); break;
        case 3 : memcpy_n<ISA, N, false, true, true >(dst, src, n); break;
        case 4 : memcpy_n<ISA, N, true, false, false>(dst, src, n); break;
        case 5 : memcpy_n<ISA, N, true, false, true >(dst, src, n); break;
        case 6 : memcpy_n<ISA, N, true, true, false>(dst, src, n); break;
        case 7 : memcpy_n<ISA, N, true, true, true >(dst, src, n); break;
        default : break;
    }
}
555 
// Looped counterpart of memcpy_n_switch for large buffers; same flag
// encoding (bit 0: non-temporal, bit 1: dst aligned, bit 2: src aligned).
template <SIMD ISA, std::size_t N>
inline void memcpy_l_switch (void *dst, const void *src, std::size_t n,
        unsigned flag)
{
    switch (flag) {
        case 0 : memcpy_l<ISA, N, false, false, false>(dst, src, n); break;
        case 1 : memcpy_l<ISA, N, false, false, true >(dst, src, n); break;
        case 2 : memcpy_l<ISA, N, false, true, false>(dst, src, n); break;
        case 3 : memcpy_l<ISA, N, false, true, true >(dst, src, n); break;
        case 4 : memcpy_l<ISA, N, true, false, false>(dst, src, n); break;
        case 5 : memcpy_l<ISA, N, true, false, true >(dst, src, n); break;
        case 6 : memcpy_l<ISA, N, true, true, false>(dst, src, n); break;
        case 7 : memcpy_l<ISA, N, true, true, true >(dst, src, n); break;
        default : break;
    }
}
572 
573 } // namespace internal
574 
579 {
580  public :
581 
588  {
589  static CStringNonTemporalThreshold ntt;
590 
591  return ntt;
592  }
593 
595  std::size_t max_level () const {return max_level_;}
596 
601  void max_level (std::size_t level) {max_level_ = level; set();}
602 
607  void set ()
608  {
609  unsigned cache_index = 0;
610  unsigned cache_index_max = CPUID::cache_param_num();
611  unsigned cache_level = 0;
612  for (unsigned index = 0; index != cache_index_max; ++index) {
613  unsigned level = CPUID::cache_param(index).level();
614  if (level <= max_level_ && level > cache_level) {
615  cache_index = index;
616  cache_level = level;
617  }
618  }
619  threshold_ = CPUID::cache_param(cache_index).size() / 2;
620  if (threshold_ == 0)
621  threshold_ = 1U << 18;
622  }
623 
625  void set (std::size_t threshold) {threshold_ = threshold;}
626 
628  std::size_t get () const {return threshold_;}
629 
632  unsigned over (std::size_t n) const {return n > threshold_ ? 1 : 0;}
633 
634  private :
635 
636  std::size_t threshold_;
637  std::size_t max_level_;
638 
640  threshold_(VSMC_CSTRING_NON_TEMPORAL_THRESHOLD), max_level_(3)
641  {if (threshold_ == 0) set();}
642 
643  CStringNonTemporalThreshold (const CStringNonTemporalThreshold &);
644 
645  CStringNonTemporalThreshold &operator= (
646  const CStringNonTemporalThreshold &);
647 }; // class CStringNonTemporalThreshold
648 
649 namespace internal {
650 
651 template <SIMD ISA>
652 inline void *memset_simd (void *dst, int ch, std::size_t n)
653 {
655  memset_0<ISA>(dst, ch, n);
656  return dst;
657  }
658 
659  unsigned flag = cstring_is_aligned<ISA>(dst);
662  memset_n_switch<ISA, traits::SIMDTrait<ISA>::grainsize>(
663  dst, ch, n, flag);
664  return dst;
665  }
666 
667  char *dstc = static_cast<char *>(dst);
668  std::size_t offset = reinterpret_cast<uintptr_t>(dstc) % 64;
669  if (offset != 0) {
670  offset = 64 - offset;
672  dstc, ch, offset, flag);
673  n -= offset;
674  dstc += offset;
675  }
677  memset_l_switch<ISA, traits::SIMDTrait<ISA>::grainsize>(dstc, ch, n, flag);
678 
679  return dst;
680 }
681 
682 template <SIMD ISA>
683 inline void *memcpy_simd (void *dst, const void *src, std::size_t n)
684 {
685  if (dst == src)
686  return dst;
687 
689  memcpy_0<ISA>(dst, src, n);
690  return dst;
691  }
692 
693  unsigned flag = cstring_is_aligned<ISA>(dst);
694  flag |= cstring_is_aligned<ISA>(src) << 1;
697  memcpy_n_switch<ISA, traits::SIMDTrait<ISA>::grainsize>(
698  dst, src, n, flag);
699  return dst;
700  }
701 
702  char *dstc = static_cast<char *>(dst);
703  const char *srcc = static_cast<const char *>(src);
704  std::size_t offset = reinterpret_cast<uintptr_t>(dstc) % 64;
705  if (offset != 0) {
706  offset = 64 - offset;
708  dstc, srcc, offset, flag);
709  n -= offset;
710  dstc += offset;
711  srcc += offset;
712  }
714  flag |= cstring_is_aligned<ISA>(srcc) << 1;
715  memcpy_l_switch<ISA, traits::SIMDTrait<ISA>::grainsize>(
716  dstc, srcc, n, flag);
717 
718  return dst;
719 }
720 
721 template <SIMD ISA>
722 inline void *memset_simd_nt (void *dst, int ch, std::size_t n)
723 {
725 
728  memset_n<ISA, traits::SIMDTrait<ISA>::grainsize, true, true>(
729  dst, ch, n);
730  return dst;
731  }
732 
733  memset_l<ISA, traits::SIMDTrait<ISA>::grainsize, true, true>(dst, ch, n);
734 
735  return dst;
736 }
737 
738 template <SIMD ISA>
739 inline void *memcpy_simd_nt (void *dst, const void *src, std::size_t n)
740 {
741  if (dst == src)
742  return dst;
743 
746 
749  memcpy_n<ISA, traits::SIMDTrait<ISA>::grainsize, true, true, true>(
750  dst, src, n);
751  return dst;
752  }
753 
754  char *dstc = static_cast<char *>(dst);
755  const char *srcc = static_cast<const char *>(src);
756  std::size_t offset = reinterpret_cast<uintptr_t>(srcc) % 64;
757  if (offset != 0) {
758  offset = 64 - offset;
760  true, true, true>(dstc, srcc, offset);
761  n -= offset;
762  dstc += offset;
763  srcc += offset;
764  }
765  memcpy_l<ISA, traits::SIMDTrait<ISA>::grainsize, true, true, true>(
766  dstc, srcc, n);
767 
768  return dst;
769 }
770 
771 } // namespace vsmc::internal
772 
/// \brief memset backed directly by the standard library implementation.
inline void *memset_std (void *dst, int ch, std::size_t n)
{
    return std::memset(dst, ch, n);
}
777 
/// \brief memcpy backed directly by the standard library implementation;
/// inherits std::memcpy's precondition that the buffers do not overlap.
inline void *memcpy_std (void *dst, const void *src, std::size_t n)
{
    return std::memcpy(dst, src, n);
}
782 
783 #if VSMC_HAS_SSE2
784 
/// \brief SSE2 optimized memset with non-temporal store for large buffers
inline void *memset_sse2 (void *dst, int ch, std::size_t n)
{return internal::memset_simd<SSE2>(dst, ch, n);}

/// \brief SSE2 optimized memcpy with non-temporal store for large buffers
inline void *memcpy_sse2 (void *dst, const void *src, std::size_t n)
{return internal::memcpy_simd<SSE2>(dst, src, n);}

/// \brief SSE2 optimized memset with non-temporal store regardless of size
inline void *memset_sse2_nt (void *dst, int ch, std::size_t n)
{return internal::memset_simd_nt<SSE2>(dst, ch, n);}

/// \brief SSE2 optimized memcpy with non-temporal store regardless of size
inline void *memcpy_sse2_nt (void *dst, const void *src, std::size_t n)
{return internal::memcpy_simd_nt<SSE2>(dst, src, n);}
808 
809 #endif // VSMC_HAS_SSE2
810 
811 #if VSMC_HAS_AVX
812 
/// \brief AVX optimized memset with non-temporal store for large buffers
inline void *memset_avx (void *dst, int ch, std::size_t n)
{return internal::memset_simd<AVX>(dst, ch, n);}

/// \brief AVX optimized memcpy with non-temporal store for large buffers
inline void *memcpy_avx (void *dst, const void *src, std::size_t n)
{return internal::memcpy_simd<AVX>(dst, src, n);}

/// \brief AVX optimized memset with non-temporal store regardless of size
inline void *memset_avx_nt (void *dst, int ch, std::size_t n)
{return internal::memset_simd_nt<AVX>(dst, ch, n);}

/// \brief AVX optimized memcpy with non-temporal store regardless of size
inline void *memcpy_avx_nt (void *dst, const void *src, std::size_t n)
{return internal::memcpy_simd_nt<AVX>(dst, src, n);}
836 
837 #endif // VSMC_HAS_AVX
838 
839 namespace internal {
840 
842 {
843  public :
844 
846  {
847  static CStringRuntimeDispatch dispatch;
848 
849  return dispatch;
850  }
851 
852  void *memset (void *dst, int ch, std::size_t n) const
853  {return memset_(dst, ch, n);}
854 
855  void *memcpy (void *dst, const void *src, std::size_t n) const
856  {return memcpy_(dst, src, n);}
857 
858  void *memset_nt (void *dst, int ch, std::size_t n) const
859  {return memset_nt_(dst, ch, n);}
860 
861  void *memcpy_nt (void *dst, const void *src, std::size_t n) const
862  {return memcpy_nt_(dst, src, n);}
863 
864  private :
865 
866  void *(*memset_) (void *, int, std::size_t);
867  void *(*memcpy_) (void *, const void *, std::size_t);
868  void *(*memset_nt_) (void *, int, std::size_t);
869  void *(*memcpy_nt_) (void *, const void *, std::size_t);
870 
872  {
873  memset_ = memset_std;
874  memcpy_ = memcpy_std;
875  memset_nt_ = memset_std;
876  memcpy_nt_ = memcpy_std;
877 
878 #if VSMC_HAS_SSE2
879  if (CPUID::has_feature<CPUIDFeatureSSE2>()) {
880  memset_ = ::vsmc::memset_sse2;
881  memcpy_ = ::vsmc::memcpy_sse2;
882  memset_nt_ = ::vsmc::memset_sse2_nt;
883  memcpy_nt_ = ::vsmc::memcpy_sse2_nt;
884  }
885 #endif // VSMC_HAS_SSE2
886 
887 #if VSMC_HAS_AVX
888  if (CPUID::has_feature<CPUIDFeatureAVX>()) {
889  memset_ = ::vsmc::memset_avx;
890  memcpy_ = ::vsmc::memcpy_avx;
891  memset_nt_ = ::vsmc::memset_avx_nt;
892  memcpy_nt_ = ::vsmc::memcpy_avx_nt;
893  }
894 #endif // VSMC_HAS_AVX
895  }
896 
897  CStringRuntimeDispatch (const CStringRuntimeDispatch &);
898 
899  CStringRuntimeDispatch &operator= (const CStringRuntimeDispatch &);
900 }; // class CStringRuntimeDispatc
901 
902 } // namespace vsmc::internal
903 
906 inline void *memset (void *dst, int ch, std::size_t n)
907 {
908 #if VSMC_CSTRING_RUNTIME_DISPATCH
910 #else
911 #if VSMC_HAS_AVX
912  return memset_avx(dst, ch, n);
913 #elif VSMC_HAS_SSE2
914  return memset_sse2(dst, ch, n);
915 #else
916  return memset_std(dst, ch, n);
917 #endif
918 #endif // VSMC_CSTRING_RUNTIME_DISPATCH
919 }
920 
923 inline void *memcpy (void *dst, const void *src, std::size_t n)
924 {
926 #if VSMC_CSTRING_RUNTIME_DISPATCH
928 #else
929 #if VSMC_HAS_AVX
930  return memcpy_avx(dst, src, n);
931 #elif VSMC_HAS_SSE2
932  return memcpy_sse2(dst, src, n);
933 #else
934  return memcpy_std(dst, src, n);
935 #endif
936 #endif // VSMC_CSTRING_RUNTIME_DISPATCH
937 }
938 
944 inline void *memset_nt (void *dst, int ch, std::size_t n)
945 {
946 #if VSMC_CSTRING_RUNTIME_DISPATCH
948 #else
949 #if VSMC_HAS_AVX
950  return memset_avx_nt(dst, ch, n);
951 #elif VSMC_HAS_SSE2
952  return memset_sse2_nt(dst, ch, n);
953 #else
954  return memset_std(dst, ch, n);
955 #endif
956 #endif // VSMC_CSTRING_RUNTIME_DISPATCH
957 }
958 
964 inline void *memcpy_nt (void *dst, const void *src, std::size_t n)
965 {
967 #if VSMC_CSTRING_RUNTIME_DISPATCH
969 #else
970 #if VSMC_HAS_AVX
971  return memcpy_avx_nt(dst, src, n);
972 #elif VSMC_HAS_SSE2
973  return memcpy_sse2_nt(dst, src, n);
974 #else
975  return memcpy_std(dst, src, n);
976 #endif
977 #endif // VSMC_CSTRING_RUNTIME_DISPATCH
978 }
979 
980 } // namespace vsmc
981 
982 #endif // VSMC_UTILITY_CSTRING_HPP
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_ALIGNED_SRC(ISA, src, func)
Definition: cstring.hpp:135
Definition: adapter.hpp:37
void memset_l(void *, int, std::size_t)
void * memcpy_avx_nt(void *dst, const void *src, std::size_t n)
AVX optimized memcpy with non-temporal store regardless of size.
Definition: cstring.hpp:834
void set(std::size_t threshold)
Set the threshold to a specific size.
Definition: cstring.hpp:625
static CStringRuntimeDispatch & instance()
Definition: cstring.hpp:845
void * memcpy_simd_nt(void *dst, const void *src, std::size_t n)
Definition: cstring.hpp:739
SIMD traits.
Definition: traits.hpp:173
void * memset_nt(void *dst, int ch, std::size_t n)
SIMD optimized memset with non-temporal store regardless of size.
Definition: cstring.hpp:944
void * memcpy(void *dst, const void *src, std::size_t n) const
Definition: cstring.hpp:855
static cache_param_type cache_param(unsigned cache_index)
Get the cache parameters (EAX = 0x04; EAX, EBX, ECX, EDX)
Definition: cpuid.hpp:637
void * memset_sse2(void *dst, int ch, std::size_t n)
SSE2 optimized memset with non-temporal store for large buffers.
Definition: cstring.hpp:787
void set()
Set the threshold to default.
Definition: cstring.hpp:607
void memcpy_n_switch(void *dst, const void *src, std::size_t n, unsigned flag)
Definition: cstring.hpp:540
void * memset_avx(void *dst, int ch, std::size_t n)
AVX optimized memset with non-temporal store for large buffers.
Definition: cstring.hpp:815
void memcpy_l_switch(void *dst, const void *src, std::size_t n, unsigned flag)
Definition: cstring.hpp:557
void * memset_sse2_nt(void *dst, int ch, std::size_t n)
SSE2 optimized memset with non-temporal store regardless of size.
Definition: cstring.hpp:799
void * memset(void *dst, int ch, std::size_t n) const
Definition: cstring.hpp:852
void * memset(void *dst, int ch, std::size_t n)
SIMD optimized memset with non-temporal store for large buffers.
Definition: cstring.hpp:906
void * memcpy_sse2_nt(void *dst, const void *src, std::size_t n)
SSE2 optimized memcpy with non-temporal store regardless of size.
Definition: cstring.hpp:806
#define VSMC_DEFINE_UTILITY_CSTRING_SET(ISA, da, nt, c, m, cast, set1, store)
Definition: cstring.hpp:141
unsigned cstring_is_aligned(const void *ptr)
Definition: cstring.hpp:423
void * memcpy_simd(void *dst, const void *src, std::size_t n)
Definition: cstring.hpp:683
unsigned level() const
The level of this cache.
Definition: cpuid.hpp:475
The threshold of buffer size above which memcpy use non-temporal instructions.
Definition: cstring.hpp:578
#define VSMC_CSTRING_NON_TEMPORAL_THRESHOLD
Threshold above which non-temporal copy shall be used (0 for auto)
Definition: cstring.hpp:117
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_ALIGNED_DST(ISA, dst, func)
Definition: cstring.hpp:130
void memcpy_n(void *, const void *, std::size_t)
void memset_n(void *, int, std::size_t)
void * memcpy_nt(void *dst, const void *src, std::size_t n)
SIMD optimized memcpy with non-temporal store regardless of size.
Definition: cstring.hpp:964
void * memset_simd(void *dst, int ch, std::size_t n)
Definition: cstring.hpp:652
void * memset_simd_nt(void *dst, int ch, std::size_t n)
Definition: cstring.hpp:722
unsigned size() const
Cache size in byte.
Definition: cpuid.hpp:498
std::size_t max_level() const
The maximum level of cache considered in set().
Definition: cstring.hpp:595
#define VSMC_RUNTIME_ASSERT_UTILITY_CSTRING_MEMCPY(dst, src, n)
Definition: cstring.hpp:120
void * memcpy_std(void *dst, const void *src, std::size_t n)
Direct call to std::memcpy
Definition: cstring.hpp:780
void * memset_avx_nt(void *dst, int ch, std::size_t n)
AVX optimized memset with non-temporal store regardless of size.
Definition: cstring.hpp:827
#define VSMC_DEFINE_UTILITY_CSTRING_CPY(ISA, sa, da, nt, c, m, load, store)
Definition: cstring.hpp:260
void * memcpy(void *dst, const void *src, std::size_t n)
SIMD optimized memcpy with non-temporal store for large buffers.
Definition: cstring.hpp:923
void * memset_nt(void *dst, int ch, std::size_t n) const
Definition: cstring.hpp:858
static unsigned cache_param_num()
Get the number of caches.
Definition: cpuid.hpp:619
void * memcpy_nt(void *dst, const void *src, std::size_t n) const
Definition: cstring.hpp:861
void memcpy_0(void *dst, const void *src, std::size_t n)
Definition: cstring.hpp:437
void memset_l_switch(void *dst, int ch, std::size_t n, unsigned flag)
Definition: cstring.hpp:528
void max_level(std::size_t level)
Set new maximum level of cache considered in set().
Definition: cstring.hpp:601
static CStringNonTemporalThreshold & instance()
Singleton instance.
Definition: cstring.hpp:587
void * memcpy_sse2(void *dst, const void *src, std::size_t n)
SSE2 optimized memcpy with non-temporal store for large buffers.
Definition: cstring.hpp:792
void * memcpy_avx(void *dst, const void *src, std::size_t n)
AVX optimized memcpy with non-temporal store for large buffers.
Definition: cstring.hpp:820
unsigned over(std::size_t n) const
Given a number of bytes, return a flag indicating whether it is over the threshold.
Definition: cstring.hpp:632
void memcpy_l(void *, const void *, std::size_t)
void memset_n_switch(void *dst, int ch, std::size_t n, unsigned flag)
Definition: cstring.hpp:516
void * memset_std(void *dst, int ch, std::size_t n)
Direct call to std::memset
Definition: cstring.hpp:775
void memset_0(void *dst, int ch, std::size_t n)
Definition: cstring.hpp:430