vSMC
vSMC: Scalable Monte Carlo
simd.hpp
Go to the documentation of this file.
1 //============================================================================
2 // vSMC/include/vsmc/utility/simd.hpp
3 //----------------------------------------------------------------------------
4 // vSMC: Scalable Monte Carlo
5 //----------------------------------------------------------------------------
6 // Copyright (c) 2013-2016, Yan Zhou
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // Redistributions of source code must retain the above copyright notice,
13 // this list of conditions and the following disclaimer.
14 //
15 // Redistributions in binary form must reproduce the above copyright notice,
16 // this list of conditions and the following disclaimer in the documentation
17 // and/or other materials provided with the distribution.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 // POSSIBILITY OF SUCH DAMAGE.
30 //============================================================================
31 
32 #ifndef VSMC_UTILITY_SIMD_HPP
33 #define VSMC_UTILITY_SIMD_HPP
34 
35 #include <vsmc/internal/common.hpp>
36 
// Generate, for a SIMD integer wrapper templated on `T`, the overloads that
// complement an existing `Type op Type` operator:
//   - assign(Type &, const Type &)  compound assignment with a vector
//   - bin(const Type &, CType)      vector op scalar
//   - bin(CType, const Type &)      scalar op vector
//   - assign(Type &, CType)         compound assignment with a scalar
// The scalar operand is broadcast to every lane through `Type::set1`.
// Every generated function applies `op` itself; a hard-coded `+` here would
// make the `-`, `&`, `|` and `^` scalar overloads silently perform addition.
#define VSMC_DEFINE_UTILITY_SIMD_INTEGER_BINARY_OP( \
    Type, CType, op, bin, assign) \
    template <typename T> \
    inline Type &assign(Type &a, const Type &b) \
    { \
        a = a op b; \
 \
        return a; \
    } \
 \
    template <typename T> \
    inline Type bin(const Type &a, CType b) \
    { \
        Type x; \
        x.set1(b); \
 \
        return a op x; \
    } \
 \
    template <typename T> \
    inline Type bin(CType a, const Type &b) \
    { \
        Type x; \
        x.set1(a); \
 \
        return x op b; \
    } \
 \
    template <typename T> \
    inline Type &assign(Type &a, CType b) \
    { \
        a = a op b; \
 \
        return a; \
    }
72 
// Generate, for a non-template SIMD floating point wrapper, the overloads
// that complement an existing `Type op Type` operator:
//   - assign(Type &, const Type &)  compound assignment with a vector
//   - bin(const Type &, CType)      vector op scalar
//   - bin(CType, const Type &)      scalar op vector
//   - assign(Type &, CType)         compound assignment with a scalar
// The scalar operand is broadcast to every lane through `Type::set1`.
// Every generated function applies `op` itself; a hard-coded `+` here would
// make the `-`, `*` and `/` scalar overloads silently perform addition.
#define VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(Type, CType, op, bin, assign) \
    inline Type &assign(Type &a, const Type &b) \
    { \
        a = a op b; \
 \
        return a; \
    } \
 \
    inline Type bin(const Type &a, CType b) \
    { \
        Type x; \
        x.set1(b); \
 \
        return a op x; \
    } \
 \
    inline Type bin(CType a, const Type &b) \
    { \
        Type x; \
        x.set1(a); \
 \
        return x op b; \
    } \
 \
    inline Type &assign(Type &a, CType b) \
    { \
        a = a op b; \
 \
        return a; \
    }
103 
104 #if VSMC_HAS_SSE2
105 #include <emmintrin.h>
106 
107 namespace vsmc
108 {
109 
112 template <typename IntType = __m128i>
113 class M128I
114 {
115  public:
116  using value_type = IntType;
117 
118  M128I() = default;
119 
120  M128I(const __m128i &value) : value_(value) {}
121 
122  template <typename T>
123  M128I(const M128I<T> &other) : value_(other.value())
124  {
125  }
126 
127  template <typename T>
129  {
130  value_ = other.value();
131 
132  return *this;
133  }
134 
135  static constexpr std::size_t size()
136  {
137  return sizeof(__m128i) / sizeof(IntType);
138  }
139 
140  __m128i &value() { return value_; }
141  const __m128i &value() const { return value_; }
142 
143  __m128i *data() { return &value_; }
144  const __m128i *data() const { return &value_; }
145 
146  template <typename T>
147  void load_a(const T *mem)
148  {
149  value_ = _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
150  }
151 
152  template <typename T>
153  void load_u(const T *mem)
154  {
155  value_ = _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem));
156  }
157 
158  template <typename T>
159  void load(const T *mem)
160  {
161  reinterpret_cast<std::uintptr_t>(mem) % 16 == 0 ? load_a(mem) :
162  load_u(mem);
163  }
164 
165  template <typename T>
166  void store_a(T *mem) const
167  {
168  _mm_store_si128(reinterpret_cast<__m128i *>(mem), value_);
169  }
170 
171  template <typename T>
172  void store_u(T *mem) const
173  {
174  _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), value_);
175  }
176 
177  template <typename T>
178  void store(T *mem) const
179  {
180  reinterpret_cast<std::uintptr_t>(mem) % 16 == 0 ? store_a(mem) :
181  store_u(mem);
182  }
183 
184  void set0() { value_ = _mm_setzero_si128(); }
185 
186  template <typename T>
187  void set1(T n)
188  {
189  value_ = set1(n, std::integral_constant<std::size_t, sizeof(T)>());
190  }
191 
192  template <typename T>
193  void set(T e1, T e0)
194  {
195  value_ = _mm_set_epi64x(
196  static_cast<VSMC_INT64>(e1), static_cast<VSMC_INT64>(e0));
197  }
198 
199  template <typename T>
200  void set(T e3, T e2, T e1, T e0)
201  {
202  value_ = _mm_set_epi32(static_cast<int>(e3), static_cast<int>(e2),
203  static_cast<int>(e1), static_cast<int>(e0));
204  }
205 
206  template <typename T>
207  void set(T e7, T e6, T e5, T e4, T e3, T e2, T e1, T e0)
208  {
209  value_ = _mm_set_epi16(static_cast<short>(e7), static_cast<short>(e6),
210  static_cast<short>(e5), static_cast<short>(e4),
211  static_cast<short>(e3), static_cast<short>(e2),
212  static_cast<short>(e1), static_cast<short>(e0));
213  }
214 
215  template <typename T>
216  void set(T e15, T e14, T e13, T e12, T e11, T e10, T e9, T e8, T e7, T e6,
217  T e5, T e4, T e3, T e2, T e1, T e0)
218  {
219  value_ = _mm_set_epi8(static_cast<char>(e15), static_cast<char>(e14),
220  static_cast<char>(e13), static_cast<char>(e12),
221  static_cast<char>(e11), static_cast<char>(e10),
222  static_cast<char>(e9), static_cast<char>(e8),
223  static_cast<char>(e7), static_cast<char>(e6),
224  static_cast<char>(e5), static_cast<char>(e4),
225  static_cast<char>(e3), static_cast<char>(e2),
226  static_cast<char>(e1), static_cast<char>(e0));
227  }
228 
229  private:
230  __m128i value_;
231 
232  template <typename T>
233  __m128i set1(T n, std::integral_constant<std::size_t, sizeof(std::int8_t)>)
234  {
235  return _mm_set1_epi8(static_cast<char>(n));
236  }
237 
238  template <typename T>
239  __m128i set1(
240  T n, std::integral_constant<std::size_t, sizeof(std::int16_t)>)
241  {
242  return _mm_set1_epi16(static_cast<short>(n));
243  }
244 
245  template <typename T>
246  __m128i set1(
247  T n, std::integral_constant<std::size_t, sizeof(std::int32_t)>)
248  {
249  return _mm_set1_epi32(static_cast<int>(n));
250  }
251 
252  template <typename T>
253  __m128i set1(
254  T n, std::integral_constant<std::size_t, sizeof(std::int64_t)>)
255  {
256  return _mm_set1_epi64x(static_cast<VSMC_INT64>(n));
257  }
258 }; // class M128I
259 
260 namespace internal
261 {
262 
263 template <typename T>
264 inline M128I<T> m128i_add(const M128I<T> &a, const M128I<T> &b,
265  std::integral_constant<std::size_t, sizeof(std::int8_t)>)
266 {
267  return M128I<T>(_mm_add_epi8(a.value(), b.value()));
268 }
269 
270 template <typename T>
271 inline M128I<T> m128i_add(const M128I<T> &a, const M128I<T> &b,
272  std::integral_constant<std::size_t, sizeof(std::int16_t)>)
273 {
274  return M128I<T>(_mm_add_epi16(a.value(), b.value()));
275 }
276 
277 template <typename T>
278 inline M128I<T> m128i_add(const M128I<T> &a, const M128I<T> &b,
279  std::integral_constant<std::size_t, sizeof(std::int32_t)>)
280 {
281  return M128I<T>(_mm_add_epi32(a.value(), b.value()));
282 }
283 
284 template <typename T>
285 inline M128I<T> m128i_add(const M128I<T> &a, const M128I<T> &b,
286  std::integral_constant<std::size_t, sizeof(std::int64_t)>)
287 {
288  return M128I<T>(_mm_add_epi64(a.value(), b.value()));
289 }
290 
291 template <typename T>
292 inline M128I<T> m128i_sub(const M128I<T> &a, const M128I<T> &b,
293  std::integral_constant<std::size_t, sizeof(std::int8_t)>)
294 {
295  return M128I<T>(_mm_sub_epi8(a.value(), b.value()));
296 }
297 
298 template <typename T>
299 inline M128I<T> m128i_sub(const M128I<T> &a, const M128I<T> &b,
300  std::integral_constant<std::size_t, sizeof(std::int16_t)>)
301 {
302  return M128I<T>(_mm_sub_epi16(a.value(), b.value()));
303 }
304 
305 template <typename T>
306 inline M128I<T> m128i_sub(const M128I<T> &a, const M128I<T> &b,
307  std::integral_constant<std::size_t, sizeof(std::int32_t)>)
308 {
309  return M128I<T>(_mm_sub_epi32(a.value(), b.value()));
310 }
311 
312 template <typename T>
313 inline M128I<T> m128i_sub(const M128I<T> &a, const M128I<T> &b,
314  std::integral_constant<std::size_t, sizeof(std::int64_t)>)
315 {
316  return M128I<T>(_mm_sub_epi64(a.value(), b.value()));
317 }
318 
319 template <typename T>
320 inline M128I<T> m128i_slli(const M128I<T> &a, int imm8,
321  std::integral_constant<std::size_t, sizeof(std::int8_t)>)
322 {
323  return M128I<T>(_mm_slli_epi8(a.value(), imm8));
324 }
325 
326 template <typename T>
327 inline M128I<T> m128i_slli(const M128I<T> &a, int imm8,
328  std::integral_constant<std::size_t, sizeof(std::int16_t)>)
329 {
330  return M128I<T>(_mm_slli_epi16(a.value(), imm8));
331 }
332 
333 template <typename T>
334 inline M128I<T> m128i_slli(const M128I<T> &a, int imm8,
335  std::integral_constant<std::size_t, sizeof(std::int32_t)>)
336 {
337  return M128I<T>(_mm_slli_epi32(a.value(), imm8));
338 }
339 
340 template <typename T>
341 inline M128I<T> m128i_slli(const M128I<T> &a, int imm8,
342  std::integral_constant<std::size_t, sizeof(std::int64_t)>)
343 {
344  return M128I<T>(_mm_slli_epi64(a.value(), imm8));
345 }
346 
347 template <typename T>
348 inline M128I<T> m128i_srli(const M128I<T> &a, int imm8,
349  std::integral_constant<std::size_t, sizeof(std::int8_t)>)
350 {
351  return M128I<T>(_mm_srli_epi8(a.value(), imm8));
352 }
353 
354 template <typename T>
355 inline M128I<T> m128i_srli(const M128I<T> &a, int imm8,
356  std::integral_constant<std::size_t, sizeof(std::int16_t)>)
357 {
358  return M128I<T>(_mm_srli_epi16(a.value(), imm8));
359 }
360 
361 template <typename T>
362 inline M128I<T> m128i_srli(const M128I<T> &a, int imm8,
363  std::integral_constant<std::size_t, sizeof(std::int32_t)>)
364 {
365  return M128I<T>(_mm_srli_epi32(a.value(), imm8));
366 }
367 
368 template <typename T>
369 inline M128I<T> m128i_srli(const M128I<T> &a, int imm8,
370  std::integral_constant<std::size_t, sizeof(std::int64_t)>)
371 {
372  return M128I<T>(_mm_srli_epi64(a.value(), imm8));
373 }
374 
375 } // namespace internal
376 
377 template <typename T>
378 inline bool operator==(const M128I<T> &a, const M128I<T> &b)
379 {
380  std::array<std::uint64_t, 2> sa;
381  std::array<std::uint64_t, 2> sb;
382  a.store_u(sa.data());
383  b.store_u(sb.data());
384 
385  return sa == sb;
386 }
387 
388 template <typename T>
389 inline bool operator!=(const M128I<T> &a, const M128I<T> &b)
390 {
391  return !(a == b);
392 }
393 
394 template <typename CharT, typename Traits, typename T>
395 inline std::basic_ostream<CharT, Traits> &operator<<(
396  std::basic_ostream<CharT, Traits> &os, const M128I<T> &a)
397 {
398  if (!os.good())
399  return os;
400 
401  std::array<T, M128I<T>::size()> sa;
402  a.store_u(sa.data());
403  os << sa;
404 
405  return os;
406 }
407 
408 template <typename CharT, typename Traits, typename T>
409 inline std::basic_istream<CharT, Traits> &operator>>(
410  std::basic_istream<CharT, Traits> &is, M128I<T> &a)
411 {
412  if (!is.good())
413  return is;
414 
415  std::array<T, M128I<T>::size()> sa;
416  is >> sa;
417 
418  if (is.good())
419  a.load_u(sa.data());
420 
421  return is;
422 }
423 
424 template <typename T>
425 inline M128I<T> operator+(const M128I<T> &a, const M128I<T> &b)
426 {
427  return internal::m128i_add(
428  a, b, std::integral_constant<std::size_t, sizeof(T)>());
429 }
430 
431 template <typename T>
432 inline M128I<T> operator-(const M128I<T> &a, const M128I<T> &b)
433 {
434  return internal::m128i_sub(
435  a, b, std::integral_constant<std::size_t, sizeof(T)>());
436 }
437 
438 template <typename T>
439 inline M128I<T> operator&(const M128I<T> &a, const M128I<T> &b)
440 {
441  return M128I<T>(_mm_and_si128(a.value(), b.value()));
442 }
443 
444 template <typename T>
445 inline M128I<T> operator|(const M128I<T> &a, const M128I<T> &b)
446 {
447  return M128I<T>(_mm_or_si128(a.value(), b.value()));
448 }
449 
450 template <typename T>
451 inline M128I<T> operator^(const M128I<T> &a, const M128I<T> &b)
452 {
453  return M128I<T>(_mm_xor_si128(a.value(), b.value()));
454 }
455 
456 template <typename T>
457 inline M128I<T> operator<<(const M128I<T> &a, int imm8)
458 {
459  return internal::m128i_slli(
460  a, imm8, std::integral_constant<std::size_t, sizeof(T)>());
461 }
462 
463 template <typename T>
464 inline M128I<T> operator<<=(M128I<T> &a, int imm8)
465 {
466  a = a << imm8;
467 
468  return a;
469 }
470 
471 template <typename T>
472 inline M128I<T> operator>>(const M128I<T> &a, int imm8)
473 {
474  return internal::m128i_srli(
475  a, imm8, std::integral_constant<std::size_t, sizeof(T)>());
476 }
477 
478 template <typename T>
479 inline M128I<T> operator>>=(M128I<T> &a, int imm8)
480 {
481  a = a << imm8;
482 
483  return a;
484 }
485 
487  M128I<T>, T, +, operator+, operator+=)
489  M128I<T>, T, -, operator-, operator-=)
491  M128I<T>, T, &, operator&, operator&=)
493  M128I<T>, T, |, operator|, operator|=)
495  M128I<T>, T, ^, operator^, operator^=)
496 
499 class M128
500 {
501  public:
502  M128() = default;
503 
504  M128(const __m128 &value) : value_(value) {}
505 
506  static constexpr std::size_t size() { return 4; }
507 
508  __m128 &value() { return value_; }
509  const __m128 &value() const { return value_; }
510 
511  __m128 *data() { return &value_; }
512  const __m128 *data() const { return &value_; }
513 
514  template <typename T>
515  void load_a(const T *mem)
516  {
517  value_ = _mm_load_ps(reinterpret_cast<const float *>(mem));
518  }
519 
520  template <typename T>
521  void load_u(const T *mem)
522  {
523  value_ = _mm_loadu_ps(reinterpret_cast<const float *>(mem));
524  }
525 
526  template <typename T>
527  void load(const T *mem)
528  {
529  reinterpret_cast<std::uintptr_t>(mem) % 16 == 0 ? load_a(mem) :
530  load_u(mem);
531  }
532 
533  template <typename T>
534  void store_a(T *mem) const
535  {
536  _mm_store_ps(reinterpret_cast<float *>(mem), value_);
537  }
538 
539  template <typename T>
540  void store_u(T *mem) const
541  {
542  _mm_storeu_ps(reinterpret_cast<float *>(mem), value_);
543  }
544 
545  template <typename T>
546  void store(T *mem) const
547  {
548  reinterpret_cast<std::uintptr_t>(mem) % 16 == 0 ? store_a(mem) :
549  store_u(mem);
550  }
551 
552  void set0() { value_ = _mm_setzero_ps(); }
553 
554  void set1(float e) { value_ = _mm_set1_ps(e); }
555 
556  void set(float e3, float e2, float e1, float e0)
557  {
558  value_ = _mm_set_ps(e3, e2, e1, e0);
559  }
560 
561  private:
562  __m128 value_;
563 }; // class M128
564 
565 inline bool operator==(const M128 &a, const M128 &b)
566 {
567  std::array<float, 4> sa;
568  std::array<float, 4> sb;
569  a.store_u(sa.data());
570  b.store_u(sb.data());
571 
572  return sa == sb;
573 }
574 
575 inline bool operator!=(const M128 &a, const M128 &b) { return !(a == b); }
576 
577 template <typename CharT, typename Traits>
578 inline std::basic_ostream<CharT, Traits> &operator<<(
579  std::basic_ostream<CharT, Traits> &os, const M128 &a)
580 {
581  if (!os.good())
582  return os;
583 
584  std::array<float, 4> sa;
585  a.store_u(sa.data());
586  os << sa;
587 
588  return os;
589 }
590 
591 template <typename CharT, typename Traits>
592 inline std::basic_istream<CharT, Traits> &operator>>(
593  std::basic_istream<CharT, Traits> &is, M128 &a)
594 {
595  if (!is.good())
596  return is;
597 
598  std::array<float, 4> sa;
599  is >> sa;
600 
601  if (is.good())
602  a.load_u(sa.data());
603 
604  return is;
605 }
606 
607 inline M128 operator+(const M128 &a, const M128 &b)
608 {
609  return M128(_mm_add_ps(a.value(), b.value()));
610 }
611 
612 inline M128 operator-(const M128 &a, const M128 &b)
613 {
614  return M128(_mm_sub_ps(a.value(), b.value()));
615 }
616 
617 inline M128 operator*(const M128 &a, const M128 &b)
618 {
619  return M128(_mm_mul_ps(a.value(), b.value()));
620 }
621 
622 inline M128 operator/(const M128 &a, const M128 &b)
623 {
624  return M128(_mm_div_ps(a.value(), b.value()));
625 }
626 
627 VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(M128, float, +, operator+, operator+=)
628 VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(M128, float, -, operator-, operator-=)
629 VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(M128, float, *, operator*, operator*=)
630 VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(M128, float, /, operator/, operator/=)
631 
634 class M128D
635 {
636  public:
637  M128D() = default;
638 
639  M128D(const __m128d &value) : value_(value) {}
640 
641  static constexpr std::size_t size() { return 2; }
642 
643  __m128d &value() { return value_; }
644  const __m128d &value() const { return value_; }
645 
646  __m128d *data() { return &value_; }
647  const __m128d *data() const { return &value_; }
648 
649  template <typename T>
650  void load_a(const T *mem)
651  {
652  value_ = _mm_load_pd(reinterpret_cast<const double *>(mem));
653  }
654 
655  template <typename T>
656  void load_u(const T *mem)
657  {
658  value_ = _mm_loadu_pd(reinterpret_cast<const double *>(mem));
659  }
660 
661  template <typename T>
662  void load(const T *mem)
663  {
664  reinterpret_cast<std::uintptr_t>(mem) % 16 == 0 ? load_a(mem) :
665  load_u(mem);
666  }
667 
668  template <typename T>
669  void store_a(T *mem) const
670  {
671  _mm_store_pd(reinterpret_cast<double *>(mem), value_);
672  }
673 
674  template <typename T>
675  void store_u(T *mem) const
676  {
677  _mm_storeu_pd(reinterpret_cast<double *>(mem), value_);
678  }
679 
680  template <typename T>
681  void store(T *mem) const
682  {
683  reinterpret_cast<std::uintptr_t>(mem) % 16 == 0 ? store_a(mem) :
684  store_u(mem);
685  }
686 
687  void set0() { value_ = _mm_setzero_pd(); }
688 
689  void set1(double e) { value_ = _mm_set1_pd(e); }
690 
691  void set(double e1, double e0) { value_ = _mm_set_pd(e1, e0); }
692 
693  private:
694  __m128d value_;
695 }; // class M128D
696 
697 inline bool operator==(const M128D &a, const M128D &b)
698 {
699  std::array<double, 2> sa;
700  std::array<double, 2> sb;
701  a.store_u(sa.data());
702  b.store_u(sb.data());
703 
704  return sa == sb;
705 }
706 
707 inline bool operator!=(const M128D &a, const M128D &b) { return !(a == b); }
708 
709 template <typename CharT, typename Traits>
710 inline std::basic_ostream<CharT, Traits> &operator<<(
711  std::basic_ostream<CharT, Traits> &os, const M128D &a)
712 {
713  if (!os.good())
714  return os;
715 
716  std::array<double, 2> sa;
717  a.store_u(sa.data());
718  os << sa;
719 
720  return os;
721 }
722 
723 template <typename CharT, typename Traits>
724 inline std::basic_istream<CharT, Traits> &operator>>(
725  std::basic_istream<CharT, Traits> &is, M128D &a)
726 {
727  if (!is.good())
728  return is;
729 
730  std::array<double, 2> sa;
731  is >> sa;
732 
733  if (is.good())
734  a.load_u(sa.data());
735 
736  return is;
737 }
738 
739 inline M128D operator+(const M128D &a, const M128D &b)
740 {
741  return M128D(_mm_add_pd(a.value(), b.value()));
742 }
743 
744 inline M128D operator-(const M128D &a, const M128D &b)
745 {
746  return M128D(_mm_sub_pd(a.value(), b.value()));
747 }
748 
749 inline M128D operator*(const M128D &a, const M128D &b)
750 {
751  return M128D(_mm_mul_pd(a.value(), b.value()));
752 }
753 
754 inline M128D operator/(const M128D &a, const M128D &b)
755 {
756  return M128D(_mm_div_pd(a.value(), b.value()));
757 }
758 
760  M128D, double, +, operator+, operator+=)
762  M128D, double, -, operator-, operator-=)
764  M128D, double, *, operator*, operator*=)
766  M128D, double, /, operator/, operator/=)
767 
768 namespace internal
769 {
770 
771 template <typename RealType>
773 
774 template <>
775 class M128TypeTrait<float>
776 {
777  public:
778  using type = M128;
779 };
780 
781 template <>
782 class M128TypeTrait<double>
783 {
784  public:
785  using type = M128D;
786 };
787 
788 } // namespace vsmc::internal
789 
792 template <typename T>
793 using M128Type = typename std::conditional<std::is_integral<T>::value,
795 
796 #endif // VSMC_HAS_SSE2
797 
798 #if VSMC_HAS_AVX2
799 #include <immintrin.h>
800 
803 template <typename IntType = __m256i>
804 class M256I
805 {
806  public:
807  using value_type = IntType;
808 
809  M256I() = default;
810 
811  M256I(const __m256i &value) : value_(value) {}
812 
813  template <typename T>
814  M256I(const M256I<T> &other) : value_(other.value())
815  {
816  }
817 
818  template <typename T>
820  {
821  value_ = other.value_;
822 
823  return *this;
824  }
825 
826  static constexpr std::size_t size()
827  {
828  return sizeof(__m256i) / sizeof(IntType);
829  }
830 
831  __m256i &value() { return value_; }
832  const __m256i &value() const { return value_; }
833 
834  __m256i *data() { return &value_; }
835  const __m256i *data() const { return &value_; }
836 
837  template <typename T>
838  void load_a(const T *mem)
839  {
840  value_ = _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
841  }
842 
843  template <typename T>
844  void load_u(const T *mem)
845  {
846  value_ = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
847  }
848 
849  template <typename T>
850  void load(const T *mem)
851  {
852  reinterpret_cast<std::uintptr_t>(mem) % 32 == 0 ? load_a(mem) :
853  load_u(mem);
854  }
855 
856  template <typename T>
857  void store_a(T *mem) const
858  {
859  _mm256_store_si256(reinterpret_cast<__m256i *>(mem), value_);
860  }
861 
862  template <typename T>
863  void store_u(T *mem) const
864  {
865  _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), value_);
866  }
867 
868  template <typename T>
869  void store(T *mem) const
870  {
871  reinterpret_cast<std::uintptr_t>(mem) % 32 == 0 ? store_a(mem) :
872  store_u(mem);
873  }
874 
875  void set0() { value_ = _mm256_setzero_si256(); }
876 
877  template <typename T>
878  void set1(T n)
879  {
880  value_ = set1(n, std::integral_constant<std::size_t, sizeof(T)>());
881  }
882 
883  template <typename T>
884  void set(T e3, T e2, T e1, T e0)
885  {
886  value_ = _mm256_set_epi64x(static_cast<VSMC_INT64>(e3),
887  static_cast<VSMC_INT64>(e2), static_cast<VSMC_INT64>(e1),
888  static_cast<VSMC_INT64>(e0));
889  }
890 
891  template <typename T>
892  void set(T e7, T e6, T e5, T e4, T e3, T e2, T e1, T e0)
893  {
894  value_ = _mm256_set_epi32(static_cast<int>(e7), static_cast<int>(e6),
895  static_cast<int>(e5), static_cast<int>(e4), static_cast<int>(e3),
896  static_cast<int>(e2), static_cast<int>(e1), static_cast<int>(e0));
897  }
898 
899  template <typename T>
900  void set(T e15, T e14, T e13, T e12, T e11, T e10, T e9, T e8, T e7, T e6,
901  T e5, T e4, T e3, T e2, T e1, T e0)
902  {
903  value_ =
904  _mm256_set_epi16(static_cast<short>(e15), static_cast<short>(e14),
905  static_cast<short>(e13), static_cast<short>(e12),
906  static_cast<short>(e11), static_cast<short>(e10),
907  static_cast<short>(e9), static_cast<short>(e8),
908  static_cast<short>(e7), static_cast<short>(e6),
909  static_cast<short>(e5), static_cast<short>(e4),
910  static_cast<short>(e3), static_cast<short>(e2),
911  static_cast<short>(e1), static_cast<short>(e0));
912  }
913 
914  template <typename T>
915  void set(T e31, T e30, T e29, T e28, T e27, T e26, T e25, T e24, T e23,
916  T e22, T e21, T e20, T e19, T e18, T e17, T e16, T e15, T e14, T e13,
917  T e12, T e11, T e10, T e9, T e8, T e7, T e6, T e5, T e4, T e3, T e2,
918  T e1, T e0)
919  {
920  value_ =
921  _mm256_set_epi8(static_cast<char>(e31), static_cast<char>(e30),
922  static_cast<char>(e29), static_cast<char>(e28),
923  static_cast<char>(e27), static_cast<char>(e26),
924  static_cast<char>(e25), static_cast<char>(e24),
925  static_cast<char>(e23), static_cast<char>(e22),
926  static_cast<char>(e21), static_cast<char>(e20),
927  static_cast<char>(e19), static_cast<char>(e18),
928  static_cast<char>(e17), static_cast<char>(e16),
929  static_cast<char>(e15), static_cast<char>(e14),
930  static_cast<char>(e13), static_cast<char>(e12),
931  static_cast<char>(e11), static_cast<char>(e10),
932  static_cast<char>(e9), static_cast<char>(e8),
933  static_cast<char>(e7), static_cast<char>(e6),
934  static_cast<char>(e5), static_cast<char>(e4),
935  static_cast<char>(e3), static_cast<char>(e2),
936  static_cast<char>(e1), static_cast<char>(e0));
937  }
938 
939  private:
940  __m256i value_;
941 
942  template <typename T>
943  __m256i set1(T n, std::integral_constant<std::size_t, sizeof(std::int8_t)>)
944  {
945  return _mm256_set1_epi8(static_cast<char>(n));
946  }
947 
948  template <typename T>
949  __m256i set1(
950  T n, std::integral_constant<std::size_t, sizeof(std::int16_t)>)
951  {
952  return _mm256_set1_epi16(static_cast<short>(n));
953  }
954 
955  template <typename T>
956  __m256i set1(
957  T n, std::integral_constant<std::size_t, sizeof(std::int32_t)>)
958  {
959  return _mm256_set1_epi32(static_cast<int>(n));
960  }
961 
962  template <typename T>
963  __m256i set1(
964  T n, std::integral_constant<std::size_t, sizeof(std::int64_t)>)
965  {
966  return _mm256_set1_epi64x(static_cast<long long>(n));
967  }
968 }; // class M256I
969 
970 namespace internal
971 {
972 
973 template <typename T>
974 inline M256I<T> m256i_add(const M256I<T> &a, const M256I<T> &b,
975  std::integral_constant<std::size_t, sizeof(std::int8_t)>)
976 {
977  return M256I<T>(_mm256_add_epi8(a.value(), b.value()));
978 }
979 
980 template <typename T>
981 inline M256I<T> m256i_add(const M256I<T> &a, const M256I<T> &b,
982  std::integral_constant<std::size_t, sizeof(std::int16_t)>)
983 {
984  return M256I<T>(_mm256_add_epi16(a.value(), b.value()));
985 }
986 
987 template <typename T>
988 inline M256I<T> m256i_add(const M256I<T> &a, const M256I<T> &b,
989  std::integral_constant<std::size_t, sizeof(std::int32_t)>)
990 {
991  return M256I<T>(_mm256_add_epi32(a.value(), b.value()));
992 }
993 
994 template <typename T>
995 inline M256I<T> m256i_add(const M256I<T> &a, const M256I<T> &b,
996  std::integral_constant<std::size_t, sizeof(std::int64_t)>)
997 {
998  return M256I<T>(_mm256_add_epi64(a.value(), b.value()));
999 }
1000 
1001 template <typename T>
1002 inline M256I<T> m256i_sub(const M256I<T> &a, const M256I<T> &b,
1003  std::integral_constant<std::size_t, sizeof(std::int8_t)>)
1004 {
1005  return M256I<T>(_mm256_sub_epi8(a.value(), b.value()));
1006 }
1007 
1008 template <typename T>
1009 inline M256I<T> m256i_sub(const M256I<T> &a, const M256I<T> &b,
1010  std::integral_constant<std::size_t, sizeof(std::int16_t)>)
1011 {
1012  return M256I<T>(_mm256_sub_epi16(a.value(), b.value()));
1013 }
1014 
1015 template <typename T>
1016 inline M256I<T> m256i_sub(const M256I<T> &a, const M256I<T> &b,
1017  std::integral_constant<std::size_t, sizeof(std::int32_t)>)
1018 {
1019  return M256I<T>(_mm256_sub_epi32(a.value(), b.value()));
1020 }
1021 
1022 template <typename T>
1023 inline M256I<T> m256i_sub(const M256I<T> &a, const M256I<T> &b,
1024  std::integral_constant<std::size_t, sizeof(std::int64_t)>)
1025 {
1026  return M256I<T>(_mm256_sub_epi64(a.value(), b.value()));
1027 }
1028 
1029 template <typename T>
1030 inline M256I<T> m256i_slli(const M256I<T> &a, int imm8,
1031  std::integral_constant<std::size_t, sizeof(std::int8_t)>)
1032 {
1033  return M256I<T>(_mm256_slli_epi8(a.value(), imm8));
1034 }
1035 
1036 template <typename T>
1037 inline M256I<T> m256i_slli(const M256I<T> &a, int imm8,
1038  std::integral_constant<std::size_t, sizeof(std::int16_t)>)
1039 {
1040  return M256I<T>(_mm256_slli_epi16(a.value(), imm8));
1041 }
1042 
1043 template <typename T>
1044 inline M256I<T> m256i_slli(const M256I<T> &a, int imm8,
1045  std::integral_constant<std::size_t, sizeof(std::int32_t)>)
1046 {
1047  return M256I<T>(_mm256_slli_epi32(a.value(), imm8));
1048 }
1049 
1050 template <typename T>
1051 inline M256I<T> m256i_slli(const M256I<T> &a, int imm8,
1052  std::integral_constant<std::size_t, sizeof(std::int64_t)>)
1053 {
1054  return M256I<T>(_mm256_slli_epi64(a.value(), imm8));
1055 }
1056 
1057 template <typename T>
1058 inline M256I<T> m256i_srli(const M256I<T> &a, int imm8,
1059  std::integral_constant<std::size_t, sizeof(std::int8_t)>)
1060 {
1061  return M256I<T>(_mm256_srli_epi8(a.value(), imm8));
1062 }
1063 
1064 template <typename T>
1065 inline M256I<T> m256i_srli(const M256I<T> &a, int imm8,
1066  std::integral_constant<std::size_t, sizeof(std::int16_t)>)
1067 {
1068  return M256I<T>(_mm256_srli_epi16(a.value(), imm8));
1069 }
1070 
1071 template <typename T>
1072 inline M256I<T> m256i_srli(const M256I<T> &a, int imm8,
1073  std::integral_constant<std::size_t, sizeof(std::int32_t)>)
1074 {
1075  return M256I<T>(_mm256_srli_epi32(a.value(), imm8));
1076 }
1077 
1078 template <typename T>
1079 inline M256I<T> m256i_srli(const M256I<T> &a, int imm8,
1080  std::integral_constant<std::size_t, sizeof(std::int64_t)>)
1081 {
1082  return M256I<T>(_mm256_srli_epi64(a.value(), imm8));
1083 }
1084 
1085 } // namespace vsmc::internal
1086 
1087 template <typename T>
1088 inline bool operator==(const M256I<T> &a, const M256I<T> &b)
1089 {
1090  std::array<std::uint64_t, 4> sa;
1091  std::array<std::uint64_t, 4> sb;
1092  a.store_u(sa.data());
1093  b.store_u(sb.data());
1094 
1095  return sa == sb;
1096 }
1097 
1098 template <typename T>
1099 inline bool operator!=(const M256I<T> &a, const M256I<T> &b)
1100 {
1101  return !(a == b);
1102 }
1103 
1104 template <typename CharT, typename Traits, typename T>
1105 inline std::basic_ostream<CharT, Traits> &operator<<(
1106  std::basic_ostream<CharT, Traits> &os, const M256I<T> &a)
1107 {
1108  if (!os.good())
1109  return os;
1110 
1111  std::array<T, M256I<T>::size()> sa;
1112  a.store_u(sa.data());
1113  os << sa;
1114 
1115  return os;
1116 }
1117 
1118 template <typename CharT, typename Traits, typename T>
1119 inline std::basic_istream<CharT, Traits> &operator>>(
1120  std::basic_istream<CharT, Traits> &is, M256I<T> &a)
1121 {
1122  if (!is.good())
1123  return is;
1124 
1125  std::array<T, M256I<T>::size()> sa;
1126  is >> sa;
1127 
1128  if (is.good())
1129  a.load_u(sa.data());
1130 
1131  return is;
1132 }
1133 
1134 template <typename T>
1135 inline M256I<T> operator+(const M256I<T> &a, const M256I<T> &b)
1136 {
1137  return internal::m256i_add(
1138  a, b, std::integral_constant<std::size_t, sizeof(T)>());
1139 }
1140 
1141 template <typename T>
1142 inline M256I<T> operator-(const M256I<T> &a, const M256I<T> &b)
1143 {
1144  return internal::m256i_sub(
1145  a, b, std::integral_constant<std::size_t, sizeof(T)>());
1146 }
1147 
1148 template <typename T>
1149 inline M256I<T> operator&(const M256I<T> &a, const M256I<T> &b)
1150 {
1151  return M256I<T>(_mm256_and_si256(a.value(), b.value()));
1152 }
1153 
1154 template <typename T>
1155 inline M256I<T> operator|(const M256I<T> &a, const M256I<T> &b)
1156 {
1157  return M256I<T>(_mm256_or_si256(a.value(), b.value()));
1158 }
1159 
1160 template <typename T>
1161 inline M256I<T> operator^(const M256I<T> &a, const M256I<T> &b)
1162 {
1163  return M256I<T>(_mm256_xor_si256(a.value(), b.value()));
1164 }
1165 
1166 template <typename T>
1167 inline M256I<T> operator<<(const M256I<T> &a, int imm8)
1168 {
1169  return internal::m256i_slli(
1170  a, imm8, std::integral_constant<std::size_t, sizeof(T)>());
1171 }
1172 
1173 template <typename T>
1174 inline M256I<T> operator<<=(M256I<T> &a, int imm8)
1175 {
1176  a = a << imm8;
1177 
1178  return a;
1179 }
1180 
1181 template <typename T>
1182 inline M256I<T> operator>>(const M256I<T> &a, int imm8)
1183 {
1184  return internal::m256i_srli(
1185  a, imm8, std::integral_constant<std::size_t, sizeof(T)>());
1186 }
1187 
1188 template <typename T>
1189 inline M256I<T> operator>>=(M256I<T> &a, int imm8)
1190 {
1191  a = a << imm8;
1192 
1193  return a;
1194 }
1195 
1197  M256I<T>, T, +, operator+, operator+=)
1199  M256I<T>, T, -, operator-, operator-=)
1201  M256I<T>, T, &, operator&, operator&=)
1203  M256I<T>, T, |, operator|, operator|=)
1205  M256I<T>, T, ^, operator^, operator^=)
1206 
1209 class M256
1210 {
1211  public:
1212  M256() = default;
1213 
1214  M256(const __m256 &value) : value_(value) {}
1215 
1216  static constexpr std::size_t size() { return 8; }
1217 
1218  __m256 &value() { return value_; }
1219  const __m256 &value() const { return value_; }
1220 
1221  __m256 *data() { return &value_; }
1222  const __m256 *data() const { return &value_; }
1223 
1224  template <typename T>
1225  void load_a(const T *mem)
1226  {
1227  value_ = _mm256_load_ps(reinterpret_cast<const float *>(mem));
1228  }
1229 
1230  template <typename T>
1231  void load_u(const T *mem)
1232  {
1233  value_ = _mm256_loadu_ps(reinterpret_cast<const float *>(mem));
1234  }
1235 
1236  template <typename T>
1237  void load(const T *mem)
1238  {
1239  reinterpret_cast<std::uintptr_t>(mem) % 32 == 0 ? load_a(mem) :
1240  load_u(mem);
1241  }
1242 
1243  template <typename T>
1244  void store_a(T *mem) const
1245  {
1246  _mm256_store_ps(reinterpret_cast<float *>(mem), value_);
1247  }
1248 
1249  template <typename T>
1250  void store_u(T *mem) const
1251  {
1252  _mm256_storeu_ps(reinterpret_cast<float *>(mem), value_);
1253  }
1254 
1255  template <typename T>
1256  void store(T *mem) const
1257  {
1258  reinterpret_cast<std::uintptr_t>(mem) % 32 == 0 ? store_a(mem) :
1259  store_u(mem);
1260  }
1261 
1262  void set0() { value_ = _mm256_setzero_ps(); }
1263 
1264  void set1(float e) { value_ = _mm256_set1_ps(e); }
1265 
1266  void set(float e7, float e6, float e5, float e4, float e3, float e2,
1267  float e1, float e0)
1268  {
1269  value_ = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
1270  }
1271 
1272  private:
1273  __m256 value_;
1274 }; // class M256
1275 
1276 inline bool operator==(const M256 &a, const M256 &b)
1277 {
1278  std::array<float, 8> sa;
1279  std::array<float, 8> sb;
1280  a.store_u(sa.data());
1281  b.store_u(sb.data());
1282 
1283  return sa == sb;
1284 }
1285 
1286 inline bool operator!=(const M256 &a, const M256 &b) { return !(a == b); }
1287 
1288 template <typename CharT, typename Traits>
1289 inline std::basic_ostream<CharT, Traits> &operator<<(
1290  std::basic_ostream<CharT, Traits> &os, const M256 &a)
1291 {
1292  if (!os.good())
1293  return os;
1294 
1295  std::array<float, 8> sa;
1296  a.store_u(sa.data());
1297  os << sa;
1298 
1299  return os;
1300 }
1301 
1302 template <typename CharT, typename Traits>
1303 inline std::basic_istream<CharT, Traits> &operator>>(
1304  std::basic_istream<CharT, Traits> &is, M256 &a)
1305 {
1306  if (!is.good())
1307  return is;
1308 
1309  std::array<float, 8> sa;
1310  is >> sa;
1311 
1312  if (is.good())
1313  a.load_u(sa.data());
1314 
1315  return is;
1316 }
1317 
1318 inline M256 operator+(const M256 &a, const M256 &b)
1319 {
1320  return M256(_mm256_add_ps(a.value(), b.value()));
1321 }
1322 
1323 inline M256 operator-(const M256 &a, const M256 &b)
1324 {
1325  return M256(_mm256_sub_ps(a.value(), b.value()));
1326 }
1327 
1328 inline M256 operator*(const M256 &a, const M256 &b)
1329 {
1330  return M256(_mm256_mul_ps(a.value(), b.value()));
1331 }
1332 
1333 inline M256 operator/(const M256 &a, const M256 &b)
1334 {
1335  return M256(_mm256_div_ps(a.value(), b.value()));
1336 }
1337 
1338 VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(M256, float, +, operator+, operator+=)
1339 VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(M256, float, -, operator-, operator-=)
1340 VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(M256, float, *, operator*, operator*=)
1341 VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(M256, float, /, operator/, operator/=)
1342 
1345 class M256D
1346 {
1347  public:
1348  M256D() = default;
1349 
1350  M256D(const __m256d &value) : value_(value) {}
1351 
1352  static constexpr std::size_t size() { return 4; }
1353 
1354  __m256d &value() { return value_; }
1355  const __m256d &value() const { return value_; }
1356 
1357  __m256d *data() { return &value_; }
1358  const __m256d *data() const { return &value_; }
1359 
1360  template <typename T>
1361  void load_a(const T *mem)
1362  {
1363  value_ = _mm256_load_pd(reinterpret_cast<const double *>(mem));
1364  }
1365 
1366  template <typename T>
1367  void load_u(const T *mem)
1368  {
1369  value_ = _mm256_loadu_pd(reinterpret_cast<const double *>(mem));
1370  }
1371 
1372  template <typename T>
1373  void load(const T *mem)
1374  {
1375  reinterpret_cast<std::uintptr_t>(mem) % 32 == 0 ? load_a(mem) :
1376  load_u(mem);
1377  }
1378 
1379  template <typename T>
1380  void store_a(T *mem) const
1381  {
1382  _mm256_store_pd(reinterpret_cast<double *>(mem), value_);
1383  }
1384 
1385  template <typename T>
1386  void store_u(T *mem) const
1387  {
1388  _mm256_storeu_pd(reinterpret_cast<double *>(mem), value_);
1389  }
1390 
1391  template <typename T>
1392  void store(T *mem) const
1393  {
1394  reinterpret_cast<std::uintptr_t>(mem) % 32 == 0 ? store_a(mem) :
1395  store_u(mem);
1396  }
1397 
1398  void set0() { value_ = _mm256_setzero_pd(); }
1399 
1400  void set1(double e) { value_ = _mm256_set1_pd(e); }
1401 
1402  void set(double e3, double e2, double e1, double e0)
1403  {
1404  value_ = _mm256_set_pd(e3, e2, e1, e0);
1405  }
1406 
1407  private:
1408  __m256d value_;
1409 }; // class M256D
1410 
1411 inline bool operator==(const M256D &a, const M256D &b)
1412 {
1413  std::array<double, 4> sa;
1414  std::array<double, 4> sb;
1415  a.store_u(sa.data());
1416  b.store_u(sb.data());
1417 
1418  return sa == sb;
1419 }
1420 
1421 inline bool operator!=(const M256D &a, const M256D &b) { return !(a == b); }
1422 
1423 template <typename CharT, typename Traits>
1424 inline std::basic_ostream<CharT, Traits> &operator<<(
1425  std::basic_ostream<CharT, Traits> &os, const M256D &a)
1426 {
1427  if (!os.good())
1428  return os;
1429 
1430  std::array<double, 4> sa;
1431  a.store_u(sa.data());
1432  os << sa;
1433 
1434  return os;
1435 }
1436 
1437 template <typename CharT, typename Traits>
1438 inline std::basic_istream<CharT, Traits> &operator>>(
1439  std::basic_istream<CharT, Traits> &is, M256D &a)
1440 {
1441  if (!is.good())
1442  return is;
1443 
1444  std::array<double, 4> sa;
1445  is >> sa;
1446 
1447  if (is.good())
1448  a.load_u(sa.data());
1449 
1450  return is;
1451 }
1452 
1453 inline M256D operator+(const M256D &a, const M256D &b)
1454 {
1455  return M256D(_mm256_add_pd(a.value(), b.value()));
1456 }
1457 
1458 inline M256D operator-(const M256D &a, const M256D &b)
1459 {
1460  return M256D(_mm256_sub_pd(a.value(), b.value()));
1461 }
1462 
1463 inline M256D operator*(const M256D &a, const M256D &b)
1464 {
1465  return M256D(_mm256_mul_pd(a.value(), b.value()));
1466 }
1467 
1468 inline M256D operator/(const M256D &a, const M256D &b)
1469 {
1470  return M256D(_mm256_div_pd(a.value(), b.value()));
1471 }
1472 
1474  M256D, double, +, operator+, operator+=)
1476  M256D, double, -, operator-, operator-=)
1478  M256D, double, *, operator*, operator*=)
1480  M256D, double, /, operator/, operator/=)
1481 
1482 namespace internal
1483 {
1484 
1485 template <typename RealType>
1487 
1488 template <>
1489 class M256TypeTrait<float>
1490 {
1491  public:
1492  using type = M256;
1493 };
1494 
1495 template <>
1496 class M256TypeTrait<double>
1497 {
1498  public:
1499  using type = M256D;
1500 };
1501 
1502 } // namespace vsmc::internal
1503 
1506 template <typename T>
1507 using M256Type = typename std::conditional<std::is_integral<T>::value,
1509 
1510 #endif // VSMC_HAS_AVX2
1511 
1512 } // namespace vsmc
1513 
1514 #endif // VSMC_UTILITY_SIMD_HPP
__m256i * data()
Definition: simd.hpp:834
Definition: monitor.hpp:49
const __m256i * data() const
Definition: simd.hpp:835
__m128
Definition: simd.hpp:499
void load(const T *mem)
Definition: simd.hpp:850
M256I(const __m256i &value)
Definition: simd.hpp:811
M128I< T > operator>>=(M128I< T > &a, int imm8)
Definition: simd.hpp:479
void set0()
Definition: simd.hpp:1262
const __m128 & value() const
Definition: simd.hpp:509
const __m128i & value() const
Definition: simd.hpp:141
__m256 * data()
Definition: simd.hpp:1221
void load_u(const T *mem)
Definition: simd.hpp:521
void set0()
Definition: simd.hpp:1398
void store(T *mem) const
Definition: simd.hpp:546
M128I< T > m128i_srli(const M128I< T > &a, int imm8, std::integral_constant< std::size_t, sizeof(std::int8_t)>)
Definition: simd.hpp:348
M256I< T > m256i_add(const M256I< T > &a, const M256I< T > &b, std::integral_constant< std::size_t, sizeof(std::int8_t)>)
Definition: simd.hpp:974
void load_a(const T *mem)
Definition: simd.hpp:838
SingleParticle< T > operator-(const SingleParticle< T > &sp, IntType n)
Using __m256i as integer vector.
Definition: simd.hpp:804
void store_a(T *mem) const
Definition: simd.hpp:1380
M128I< T > m128i_slli(const M128I< T > &a, int imm8, std::integral_constant< std::size_t, sizeof(std::int8_t)>)
Definition: simd.hpp:320
M128I()=default
void store_a(T *mem) const
Definition: simd.hpp:669
void store_a(T *mem) const
Definition: simd.hpp:857
M128 operator/(const M128 &a, const M128 &b)
Definition: simd.hpp:622
__m256i & value()
Definition: simd.hpp:831
void load_u(const T *mem)
Definition: simd.hpp:1367
const __m256 & value() const
Definition: simd.hpp:1219
M256I< T > m256i_srli(const M256I< T > &a, int imm8, std::integral_constant< std::size_t, sizeof(std::int8_t)>)
Definition: simd.hpp:1058
Using __m128i as integer vector.
Definition: simd.hpp:113
const __m128 * data() const
Definition: simd.hpp:512
static constexpr std::size_t size()
Definition: simd.hpp:826
M128I< T > m128i_add(const M128I< T > &a, const M128I< T > &b, std::integral_constant< std::size_t, sizeof(std::int8_t)>)
Definition: simd.hpp:264
__m256 & value()
Definition: simd.hpp:1218
void store_u(T *mem) const
Definition: simd.hpp:863
void set1(float e)
Definition: simd.hpp:554
const __m128d * data() const
Definition: simd.hpp:647
M256I< T > m256i_slli(const M256I< T > &a, int imm8, std::integral_constant< std::size_t, sizeof(std::int8_t)>)
Definition: simd.hpp:1030
void load_a(const T *mem)
Definition: simd.hpp:1225
void load(const T *mem)
Definition: simd.hpp:1237
M256(const __m256 &value)
Definition: simd.hpp:1214
__m128i & value()
Definition: simd.hpp:140
M256I< T > m256i_slli(const M256I< T > &a, int imm8, std::integral_constant< std::size_t, sizeof(std::int64_t)>)
Definition: simd.hpp:1051
M128I< T > m128i_sub(const M128I< T > &a, const M128I< T > &b, std::integral_constant< std::size_t, sizeof(std::int64_t)>)
Definition: simd.hpp:313
void store(T *mem) const
Definition: simd.hpp:1392
bool operator!=(const SingleParticle< T > &sp1, const SingleParticle< T > &sp2)
void store(T *mem) const
Definition: simd.hpp:1256
__m128i * data()
Definition: simd.hpp:143
__m128 & value()
Definition: simd.hpp:508
void load_u(const T *mem)
Definition: simd.hpp:844
void set1(T n)
Definition: simd.hpp:878
M128I(const __m128i &value)
Definition: simd.hpp:120
void store_u(T *mem) const
Definition: simd.hpp:1386
void load_a(const T *mem)
Definition: simd.hpp:147
M128I< T > operator&(const M128I< T > &a, const M128I< T > &b)
Definition: simd.hpp:439
void load_u(const T *mem)
Definition: simd.hpp:153
__m256d * data()
Definition: simd.hpp:1357
static constexpr std::size_t size()
Definition: simd.hpp:506
M128 operator*(const M128 &a, const M128 &b)
Definition: simd.hpp:617
static constexpr std::size_t size()
Definition: simd.hpp:135
void load_u(const T *mem)
Definition: simd.hpp:1231
void store_u(T *mem) const
Definition: simd.hpp:1250
const __m256d & value() const
Definition: simd.hpp:1355
const __m256i & value() const
Definition: simd.hpp:832
void store_u(T *mem) const
Definition: simd.hpp:675
const __m128i * data() const
Definition: simd.hpp:144
std::basic_ostream< CharT, Traits > & operator<<(std::basic_ostream< CharT, Traits > &os, const Sampler< T > &sampler)
Definition: sampler.hpp:861
static constexpr std::size_t size()
Definition: simd.hpp:1216
M256D(const __m256d &value)
Definition: simd.hpp:1350
bool operator==(const SingleParticle< T > &sp1, const SingleParticle< T > &sp2)
M128I< T > m128i_slli(const M128I< T > &a, int imm8, std::integral_constant< std::size_t, sizeof(std::int64_t)>)
Definition: simd.hpp:341
void load_u(const T *mem)
Definition: simd.hpp:656
static constexpr std::size_t size()
Definition: simd.hpp:641
void load(const T *mem)
Definition: simd.hpp:159
M128I< IntType > & operator=(const M128I< T > &other)
Definition: simd.hpp:128
void set0()
Definition: simd.hpp:687
M128I(const M128I< T > &other)
Definition: simd.hpp:123
M128I< T > operator^(const M128I< T > &a, const M128I< T > &b)
Definition: simd.hpp:451
M256I< T > m256i_add(const M256I< T > &a, const M256I< T > &b, std::integral_constant< std::size_t, sizeof(std::int64_t)>)
Definition: simd.hpp:995
void load(const T *mem)
Definition: simd.hpp:527
typename std::conditional< std::is_integral< T >::value, M128I< T >, typename internal::M128TypeTrait< T >::type >::type M128Type
floating point SSE2 type
Definition: simd.hpp:794
void store(T *mem) const
Definition: simd.hpp:178
void store_a(T *mem) const
Definition: simd.hpp:534
void store_u(T *mem) const
Definition: simd.hpp:540
M256I< T > m256i_srli(const M256I< T > &a, int imm8, std::integral_constant< std::size_t, sizeof(std::int64_t)>)
Definition: simd.hpp:1079
__m256d & value()
Definition: simd.hpp:1354
__m128d & value()
Definition: simd.hpp:643
void store_u(T *mem) const
Definition: simd.hpp:172
void set0()
Definition: simd.hpp:184
const __m256 * data() const
Definition: simd.hpp:1222
M128I< T > m128i_sub(const M128I< T > &a, const M128I< T > &b, std::integral_constant< std::size_t, sizeof(std::int8_t)>)
Definition: simd.hpp:292
void set1(double e)
Definition: simd.hpp:1400
void set0()
Definition: simd.hpp:875
#define VSMC_DEFINE_UTILITY_SIMD_INTEGER_BINARY_OP( Type, CType, op, bin, assign)
Definition: simd.hpp:37
M256I< T > m256i_sub(const M256I< T > &a, const M256I< T > &b, std::integral_constant< std::size_t, sizeof(std::int8_t)>)
Definition: simd.hpp:1002
M128I< T > m128i_srli(const M128I< T > &a, int imm8, std::integral_constant< std::size_t, sizeof(std::int64_t)>)
Definition: simd.hpp:369
void store(T *mem) const
Definition: simd.hpp:869
void store(T *mem) const
Definition: simd.hpp:681
__m256
Definition: simd.hpp:1209
M128(const __m128 &value)
Definition: simd.hpp:504
typename std::conditional< std::is_integral< T >::value, M256I< T >, typename internal::M256TypeTrait< T >::type >::type M256Type
Integer or floating point AVX2 vector type
Definition: simd.hpp:1508
void load_a(const T *mem)
Definition: simd.hpp:515
M128D(const __m128d &value)
Definition: simd.hpp:639
static constexpr std::size_t size()
Definition: simd.hpp:1352
M128I< T > m128i_add(const M128I< T > &a, const M128I< T > &b, std::integral_constant< std::size_t, sizeof(std::int64_t)>)
Definition: simd.hpp:285
IntType value_type
Definition: simd.hpp:807
void set1(T n)
Definition: simd.hpp:187
void set0()
Definition: simd.hpp:552
void load(const T *mem)
Definition: simd.hpp:1373
void set1(float e)
Definition: simd.hpp:1264
__m128 * data()
Definition: simd.hpp:511
const __m256d * data() const
Definition: simd.hpp:1358
M256I(const M256I< T > &other)
Definition: simd.hpp:814
M256I< T > m256i_sub(const M256I< T > &a, const M256I< T > &b, std::integral_constant< std::size_t, sizeof(std::int64_t)>)
Definition: simd.hpp:1023
__m256d
Definition: simd.hpp:1345
M256I< IntType > & operator=(const M256I< T > &other)
Definition: simd.hpp:819
__m128d * data()
Definition: simd.hpp:646
void store_a(T *mem) const
Definition: simd.hpp:1244
void store_a(T *mem) const
Definition: simd.hpp:166
std::basic_istream< CharT, Traits > & operator>>(std::basic_istream< CharT, Traits > &is, std::array< T, N > &ary)
Definition: common.hpp:165
void load_a(const T *mem)
Definition: simd.hpp:1361
void load_a(const T *mem)
Definition: simd.hpp:650
#define VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(Type, CType, op, bin, assign)
Definition: simd.hpp:73
const __m128d & value() const
Definition: simd.hpp:644
void load(const T *mem)
Definition: simd.hpp:662
M128I< T > operator|(const M128I< T > &a, const M128I< T > &b)
Definition: simd.hpp:445
SingleParticle< T > operator+(const SingleParticle< T > &sp, IntType n)
__m128i value_type
Definition: simd.hpp:116
void set1(double e)
Definition: simd.hpp:689
__m128d
Definition: simd.hpp:634