32 #ifndef VSMC_UTILITY_SIMD_HPP    33 #define VSMC_UTILITY_SIMD_HPP    37 #define VSMC_DEFINE_UTILITY_SIMD_INTEGER_BINARY_OP(                           \    38     Type, CType, op, bin, assign)                                             \    39     template <typename T>                                                     \    40     inline Type &assign(Type &a, const Type &b)                               \    47     template <typename T>                                                     \    48     inline Type bin(const Type &a, CType b)                                   \    56     template <typename T>                                                     \    57     inline Type bin(CType a, const Type &b)                                   \    65     template <typename T>                                                     \    66     inline Type &assign(Type &a, CType b)                                     \    73 #define VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(Type, CType, op, bin, assign) \    74     inline Type &assign(Type &a, const Type &b)                               \    81     inline Type bin(const Type &a, CType b)                                   \    89     inline Type bin(CType a, const Type &b)                                   \    97     inline Type &assign(Type &a, CType b)                                     \   105 #include <emmintrin.h>   112 template <
typename IntType = __m128i>
   116     using value_type = IntType;
   120     M128I(
const __m128i &value) : value_(value) {}
   122     template <
typename T>
   123     M128I(
const M128I<T> &other)
   124         : value_(other.value())
   128     template <
typename T>
   129     M128I<IntType> &operator=(
const M128I<T> &other)
   131         value_ = other.value();
   136     static constexpr std::size_t size()
   138         return sizeof(__m128i) / 
sizeof(IntType);
   141     __m128i &value() { 
return value_; }
   142     const __m128i &value()
 const { 
return value_; }
   144     __m128i *data() { 
return &value_; }
   145     const __m128i *data()
 const { 
return &value_; }
   147     template <
typename T>
   148     void load_a(
const T *mem)
   150         value_ = _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
   153     template <
typename T>
   154     void load_u(
const T *mem)
   156         value_ = _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem));
   159     template <
typename T>
   160     void load(
const T *mem)
   162         reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? load_a(mem) :
   166     template <
typename T>
   167     void store_a(T *mem)
 const   169         _mm_store_si128(reinterpret_cast<__m128i *>(mem), value_);
   172     template <
typename T>
   173     void store_u(T *mem)
 const   175         _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), value_);
   178     template <
typename T>
   179     void store(T *mem)
 const   181         reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? store_a(mem) :
   185     void set0() { value_ = _mm_setzero_si128(); }
   187     template <
typename T>
   190         value_ = set1(n, std::integral_constant<std::size_t, 
sizeof(T)>());
   193     template <
typename T>
   196         value_ = _mm_set_epi64x(
   197             static_cast<VSMC_INT64>(e1), static_cast<VSMC_INT64>(e0));
   200     template <
typename T>
   201     void set(T e3, T e2, T e1, T e0)
   203         value_ = _mm_set_epi32(static_cast<int>(e3), static_cast<int>(e2),
   204             static_cast<int>(e1), static_cast<int>(e0));
   207     template <
typename T>
   208     void set(T e7, T e6, T e5, T e4, T e3, T e2, T e1, T e0)
   210         value_ = _mm_set_epi16(static_cast<short>(e7), static_cast<short>(e6),
   211             static_cast<short>(e5), static_cast<short>(e4),
   212             static_cast<short>(e3), static_cast<short>(e2),
   213             static_cast<short>(e1), static_cast<short>(e0));
   216     template <
typename T>
   217     void set(T e15, T e14, T e13, T e12, T e11, T e10, T e9, T e8, T e7, T e6,
   218         T e5, T e4, T e3, T e2, T e1, T e0)
   220         value_ = _mm_set_epi8(static_cast<char>(e15), static_cast<char>(e14),
   221             static_cast<char>(e13), static_cast<char>(e12),
   222             static_cast<char>(e11), static_cast<char>(e10),
   223             static_cast<char>(e9), static_cast<char>(e8),
   224             static_cast<char>(e7), static_cast<char>(e6),
   225             static_cast<char>(e5), static_cast<char>(e4),
   226             static_cast<char>(e3), static_cast<char>(e2),
   227             static_cast<char>(e1), static_cast<char>(e0));
   233     template <
typename T>
   234     __m128i set1(T n, std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
   236         return _mm_set1_epi8(static_cast<char>(n));
   239     template <
typename T>
   241         T n, std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
   243         return _mm_set1_epi16(static_cast<short>(n));
   246     template <
typename T>
   248         T n, std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
   250         return _mm_set1_epi32(static_cast<int>(n));
   253     template <
typename T>
   255         T n, std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
   257         return _mm_set1_epi64x(static_cast<VSMC_INT64>(n));
   264 template <
typename T>
   265 inline M128I<T> m128i_add(
const M128I<T> &a, 
const M128I<T> &b,
   266     std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
   268     return M128I<T>(_mm_add_epi8(a.value(), b.value()));
   271 template <
typename T>
   272 inline M128I<T> m128i_add(
const M128I<T> &a, 
const M128I<T> &b,
   273     std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
   275     return M128I<T>(_mm_add_epi16(a.value(), b.value()));
   278 template <
typename T>
   279 inline M128I<T> m128i_add(
const M128I<T> &a, 
const M128I<T> &b,
   280     std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
   282     return M128I<T>(_mm_add_epi32(a.value(), b.value()));
   285 template <
typename T>
   286 inline M128I<T> m128i_add(
const M128I<T> &a, 
const M128I<T> &b,
   287     std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
   289     return M128I<T>(_mm_add_epi64(a.value(), b.value()));
   292 template <
typename T>
   293 inline M128I<T> m128i_sub(
const M128I<T> &a, 
const M128I<T> &b,
   294     std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
   296     return M128I<T>(_mm_sub_epi8(a.value(), b.value()));
   299 template <
typename T>
   300 inline M128I<T> m128i_sub(
const M128I<T> &a, 
const M128I<T> &b,
   301     std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
   303     return M128I<T>(_mm_sub_epi16(a.value(), b.value()));
   306 template <
typename T>
   307 inline M128I<T> m128i_sub(
const M128I<T> &a, 
const M128I<T> &b,
   308     std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
   310     return M128I<T>(_mm_sub_epi32(a.value(), b.value()));
   313 template <
typename T>
   314 inline M128I<T> m128i_sub(
const M128I<T> &a, 
const M128I<T> &b,
   315     std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
   317     return M128I<T>(_mm_sub_epi64(a.value(), b.value()));
   320 template <
typename T>
   321 inline M128I<T> m128i_slli(
const M128I<T> &a, 
int imm8,
   322     std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
   324     return M128I<T>(_mm_slli_epi8(a.value(), imm8));
   327 template <
typename T>
   328 inline M128I<T> m128i_slli(
const M128I<T> &a, 
int imm8,
   329     std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
   331     return M128I<T>(_mm_slli_epi16(a.value(), imm8));
   334 template <
typename T>
   335 inline M128I<T> m128i_slli(
const M128I<T> &a, 
int imm8,
   336     std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
   338     return M128I<T>(_mm_slli_epi32(a.value(), imm8));
   341 template <
typename T>
   342 inline M128I<T> m128i_slli(
const M128I<T> &a, 
int imm8,
   343     std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
   345     return M128I<T>(_mm_slli_epi64(a.value(), imm8));
   348 template <
typename T>
   349 inline M128I<T> m128i_srli(
const M128I<T> &a, 
int imm8,
   350     std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
   352     return M128I<T>(_mm_srli_epi8(a.value(), imm8));
   355 template <
typename T>
   356 inline M128I<T> m128i_srli(
const M128I<T> &a, 
int imm8,
   357     std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
   359     return M128I<T>(_mm_srli_epi16(a.value(), imm8));
   362 template <
typename T>
   363 inline M128I<T> m128i_srli(
const M128I<T> &a, 
int imm8,
   364     std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
   366     return M128I<T>(_mm_srli_epi32(a.value(), imm8));
   369 template <
typename T>
   370 inline M128I<T> m128i_srli(
const M128I<T> &a, 
int imm8,
   371     std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
   373     return M128I<T>(_mm_srli_epi64(a.value(), imm8));
   378 template <
typename T>
   379 inline bool operator==(
const M128I<T> &a, 
const M128I<T> &b)
   381     std::array<std::uint64_t, 2> sa;
   382     std::array<std::uint64_t, 2> sb;
   383     a.store_u(sa.data());
   384     b.store_u(sb.data());
   389 template <
typename T>
   390 inline bool operator!=(
const M128I<T> &a, 
const M128I<T> &b)
   395 template <
typename CharT, 
typename Traits, 
typename T>
   396 inline std::basic_ostream<CharT, Traits> &
operator<<(
   397     std::basic_ostream<CharT, Traits> &os, 
const M128I<T> &a)
   402     std::array<T, M128I<T>::size()> sa;
   403     a.store_u(sa.data());
   409 template <
typename CharT, 
typename Traits, 
typename T>
   410 inline std::basic_istream<CharT, Traits> &
operator>>(
   411     std::basic_istream<CharT, Traits> &is, M128I<T> &a)
   416     std::array<T, M128I<T>::size()> sa;
   425 template <
typename T>
   426 inline M128I<T> operator+(
const M128I<T> &a, 
const M128I<T> &b)
   428     return internal::m128i_add(
   429         a, b, std::integral_constant<std::size_t, 
sizeof(T)>());
   432 template <
typename T>
   433 inline M128I<T> operator-(
const M128I<T> &a, 
const M128I<T> &b)
   435     return internal::m128i_sub(
   436         a, b, std::integral_constant<std::size_t, 
sizeof(T)>());
   439 template <
typename T>
   440 inline M128I<T> operator&(
const M128I<T> &a, 
const M128I<T> &b)
   442     return M128I<T>(_mm_and_si128(a.value(), b.value()));
   445 template <
typename T>
   446 inline M128I<T> operator|(
const M128I<T> &a, 
const M128I<T> &b)
   448     return M128I<T>(_mm_or_si128(a.value(), b.value()));
   451 template <
typename T>
   452 inline M128I<T> operator^(
const M128I<T> &a, 
const M128I<T> &b) {
   453     return M128I<T>(_mm_xor_si128(a.value(), b.value()));
   456 template <
typename T>
   457 inline M128I<T> operator<<(const M128I<T> &a, 
int imm8)
   459     return internal::m128i_slli(
   460         a, imm8, std::integral_constant<std::size_t, 
sizeof(T)>());
   463 template <
typename T>
   464 inline M128I<T> operator<<=(M128I<T> &a, 
int imm8)
   471 template <
typename T>
   472 inline M128I<T> 
operator>>(
const M128I<T> &a, 
int imm8)
   474     return internal::m128i_srli(
   475         a, imm8, std::integral_constant<std::size_t, 
sizeof(T)>());
   478 template <
typename T>
   479 inline M128I<T> operator>>=(M128I<T> &a, 
int imm8)
   487     M128I<T>, T, +, 
operator+, 
operator+=)
   489     M128I<T>, T, -, 
operator-, 
operator-=)
   491     M128I<T>, T, &, 
operator&, 
operator&=)
   493     M128I<T>, T, |, 
operator|, 
operator|=)
   495     M128I<T>, T, ^, 
operator^, 
operator^=)
   504     M128(
const __m128 &value) : value_(value) {}
   /// \brief Element count reported by this wrapper (always 4)
   static constexpr std::size_t size()
   {
       return std::size_t{4};
   }
   508     __m128 &value() { 
return value_; }
   509     const __m128 &value()
 const { 
return value_; }
   511     __m128 *data() { 
return &value_; }
   512     const __m128 *data()
 const { 
return &value_; }
   514     template <
typename T>
   515     void load_a(
const T *mem)
   517         value_ = _mm_load_ps(reinterpret_cast<const float *>(mem));
   520     template <
typename T>
   521     void load_u(
const T *mem)
   523         value_ = _mm_loadu_ps(reinterpret_cast<const float *>(mem));
   526     template <
typename T>
   527     void load(
const T *mem)
   529         reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? load_a(mem) :
   533     template <
typename T>
   534     void store_a(T *mem)
 const   536         _mm_store_ps(reinterpret_cast<float *>(mem), value_);
   539     template <
typename T>
   540     void store_u(T *mem)
 const   542         _mm_storeu_ps(reinterpret_cast<float *>(mem), value_);
   545     template <
typename T>
   546     void store(T *mem)
 const   548         reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? store_a(mem) :
   552     void set0() { value_ = _mm_setzero_ps(); }
   554     void set1(
float e) { value_ = _mm_set1_ps(e); }
   556     void set(
float e3, 
float e2, 
float e1, 
float e0)
   558         value_ = _mm_set_ps(e3, e2, e1, e0);
   565 inline bool operator==(
const M128 &a, 
const M128 &b)
   567     std::array<float, 4> sa;
   568     std::array<float, 4> sb;
   569     a.store_u(sa.data());
   570     b.store_u(sb.data());
   575 inline bool operator!=(
const M128 &a, 
const M128 &b) { 
return !(a == b); }
   577 template <
typename CharT, 
typename Traits>
   578 inline std::basic_ostream<CharT, Traits> &
operator<<(
   579     std::basic_ostream<CharT, Traits> &os, 
const M128 &a)
   584     std::array<float, 4> sa;
   585     a.store_u(sa.data());
   591 template <
typename CharT, 
typename Traits>
   592 inline std::basic_istream<CharT, Traits> &
operator>>(
   593     std::basic_istream<CharT, Traits> &is, M128 &a)
   598     std::array<float, 4> sa;
   607 inline M128 operator+(
const M128 &a, 
const M128 &b)
   609     return M128(_mm_add_ps(a.value(), b.value()));
   612 inline M128 operator-(
const M128 &a, 
const M128 &b)
   614     return M128(_mm_sub_ps(a.value(), b.value()));
   617 inline M128 operator*(
const M128 &a, 
const M128 &b)
   619     return M128(_mm_mul_ps(a.value(), b.value()));
   622 inline M128 operator/(
const M128 &a, 
const M128 &b)
   624     return M128(_mm_div_ps(a.value(), b.value()));
   639     M128D(
const __m128d &value) : value_(value) {}
   /// \brief Element count reported by this wrapper (always 2)
   static constexpr std::size_t size()
   {
       return std::size_t{2};
   }
   643     __m128d &value() { 
return value_; }
   644     const __m128d &value()
 const { 
return value_; }
   646     __m128d *data() { 
return &value_; }
   647     const __m128d *data()
 const { 
return &value_; }
   649     template <
typename T>
   650     void load_a(
const T *mem)
   652         value_ = _mm_load_pd(reinterpret_cast<const double *>(mem));
   655     template <
typename T>
   656     void load_u(
const T *mem)
   658         value_ = _mm_loadu_pd(reinterpret_cast<const double *>(mem));
   661     template <
typename T>
   662     void load(
const T *mem)
   664         reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? load_a(mem) :
   668     template <
typename T>
   669     void store_a(T *mem)
 const   671         _mm_store_pd(reinterpret_cast<double *>(mem), value_);
   674     template <
typename T>
   675     void store_u(T *mem)
 const   677         _mm_storeu_pd(reinterpret_cast<double *>(mem), value_);
   680     template <
typename T>
   681     void store(T *mem)
 const   683         reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? store_a(mem) :
   687     void set0() { value_ = _mm_setzero_pd(); }
   689     void set1(
double e) { value_ = _mm_set1_pd(e); }
   691     void set(
double e1, 
double e0) { value_ = _mm_set_pd(e1, e0); }
   697 inline bool operator==(
const M128D &a, 
const M128D &b)
   699     std::array<double, 2> sa;
   700     std::array<double, 2> sb;
   701     a.store_u(sa.data());
   702     b.store_u(sb.data());
   707 inline bool operator!=(
const M128D &a, 
const M128D &b) { 
return !(a == b); }
   709 template <
typename CharT, 
typename Traits>
   710 inline std::basic_ostream<CharT, Traits> &
operator<<(
   711     std::basic_ostream<CharT, Traits> &os, 
const M128D &a)
   716     std::array<double, 2> sa;
   717     a.store_u(sa.data());
   723 template <
typename CharT, 
typename Traits>
   724 inline std::basic_istream<CharT, Traits> &
operator>>(
   725     std::basic_istream<CharT, Traits> &is, M128D &a)
   730     std::array<double, 2> sa;
   739 inline M128D operator+(
const M128D &a, 
const M128D &b)
   741     return M128D(_mm_add_pd(a.value(), b.value()));
   744 inline M128D operator-(
const M128D &a, 
const M128D &b)
   746     return M128D(_mm_sub_pd(a.value(), b.value()));
   749 inline M128D operator*(
const M128D &a, 
const M128D &b)
   751     return M128D(_mm_mul_pd(a.value(), b.value()));
   754 inline M128D operator/(
const M128D &a, 
const M128D &b)
   756     return M128D(_mm_div_pd(a.value(), b.value()));
   760     M128D, 
double, +, 
operator+, 
operator+=)
   762     M128D, 
double, -, 
operator-, 
operator-=)
   764     M128D, 
double, *, 
operator*, 
operator*=)
   766     M128D, 
double, /, 
operator/, 
operator/=)
   771 template <
typename RealType>
   775 class M128TypeTrait<float>
   782 class M128TypeTrait<double>
   791 template <
typename T>
   792 using M128Type = 
typename std::conditional<std::is_integral<T>::value,
   793     M128I<T>, 
typename internal::M128TypeTrait<T>::type>::type;
   795 #endif // VSMC_HAS_SSE2   798 #include <immintrin.h>   802 template <
typename IntType = __m256i>
   806     using value_type = IntType;
   810     M256I(
const __m256i &value) : value_(value) {}
   812     template <
typename T>
   813     M256I(
const M256I<T> &other)
   814         : value_(other.value())
   818     template <
typename T>
   819     M256I<IntType> &operator=(
const M256I<T> &other)
   821         value_ = other.value_;
   826     static constexpr std::size_t size()
   828         return sizeof(__m256i) / 
sizeof(IntType);
   831     __m256i &value() { 
return value_; }
   832     const __m256i &value()
 const { 
return value_; }
   834     __m256i *data() { 
return &value_; }
   835     const __m256i *data()
 const { 
return &value_; }
   837     template <
typename T>
   838     void load_a(
const T *mem)
   840         value_ = _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
   843     template <
typename T>
   844     void load_u(
const T *mem)
   846         value_ = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
   849     template <
typename T>
   850     void load(
const T *mem)
   852         reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? load_a(mem) :
   856     template <
typename T>
   857     void store_a(T *mem)
 const   859         _mm256_store_si256(reinterpret_cast<__m256i *>(mem), value_);
   862     template <
typename T>
   863     void store_u(T *mem)
 const   865         _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), value_);
   868     template <
typename T>
   869     void store(T *mem)
 const   871         reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? store_a(mem) :
   875     void set0() { value_ = _mm256_setzero_si256(); }
   877     template <
typename T>
   880         value_ = set1(n, std::integral_constant<std::size_t, 
sizeof(T)>());
   883     template <
typename T>
   884     void set(T e3, T e2, T e1, T e0)
   886         value_ = _mm256_set_epi64x(static_cast<VSMC_INT64>(e3),
   887             static_cast<VSMC_INT64>(e2), static_cast<VSMC_INT64>(e1),
   888             static_cast<VSMC_INT64>(e0));
   891     template <
typename T>
   892     void set(T e7, T e6, T e5, T e4, T e3, T e2, T e1, T e0)
   894         value_ = _mm256_set_epi32(static_cast<int>(e7), static_cast<int>(e6),
   895             static_cast<int>(e5), static_cast<int>(e4), static_cast<int>(e3),
   896             static_cast<int>(e2), static_cast<int>(e1), static_cast<int>(e0));
   899     template <
typename T>
   900     void set(T e15, T e14, T e13, T e12, T e11, T e10, T e9, T e8, T e7, T e6,
   901         T e5, T e4, T e3, T e2, T e1, T e0)
   904             _mm256_set_epi16(static_cast<short>(e15), static_cast<short>(e14),
   905                 static_cast<short>(e13), static_cast<short>(e12),
   906                 static_cast<short>(e11), static_cast<short>(e10),
   907                 static_cast<short>(e9), static_cast<short>(e8),
   908                 static_cast<short>(e7), static_cast<short>(e6),
   909                 static_cast<short>(e5), static_cast<short>(e4),
   910                 static_cast<short>(e3), static_cast<short>(e2),
   911                 static_cast<short>(e1), static_cast<short>(e0));
   914     template <
typename T>
   915     void set(T e31, T e30, T e29, T e28, T e27, T e26, T e25, T e24, T e23,
   916         T e22, T e21, T e20, T e19, T e18, T e17, T e16, T e15, T e14, T e13,
   917         T e12, T e11, T e10, T e9, T e8, T e7, T e6, T e5, T e4, T e3, T e2,
   921             _mm256_set_epi8(static_cast<char>(e31), static_cast<char>(e30),
   922                 static_cast<char>(e29), static_cast<char>(e28),
   923                 static_cast<char>(e27), static_cast<char>(e26),
   924                 static_cast<char>(e25), static_cast<char>(e24),
   925                 static_cast<char>(e23), static_cast<char>(e22),
   926                 static_cast<char>(e21), static_cast<char>(e20),
   927                 static_cast<char>(e19), static_cast<char>(e18),
   928                 static_cast<char>(e17), static_cast<char>(e16),
   929                 static_cast<char>(e15), static_cast<char>(e14),
   930                 static_cast<char>(e13), static_cast<char>(e12),
   931                 static_cast<char>(e11), static_cast<char>(e10),
   932                 static_cast<char>(e9), static_cast<char>(e8),
   933                 static_cast<char>(e7), static_cast<char>(e6),
   934                 static_cast<char>(e5), static_cast<char>(e4),
   935                 static_cast<char>(e3), static_cast<char>(e2),
   936                 static_cast<char>(e1), static_cast<char>(e0));
   942     template <
typename T>
   943     __m256i set1(T n, std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
   945         return _mm256_set1_epi8(static_cast<char>(n));
   948     template <
typename T>
   950         T n, std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
   952         return _mm256_set1_epi16(static_cast<short>(n));
   955     template <
typename T>
   957         T n, std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
   959         return _mm256_set1_epi32(static_cast<int>(n));
   962     template <
typename T>
   964         T n, std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
   966         return _mm256_set1_epi64x(static_cast<long long>(n));
   973 template <
typename T>
   974 inline M256I<T> m256i_add(
const M256I<T> &a, 
const M256I<T> &b,
   975     std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
   977     return M256I<T>(_mm256_add_epi8(a.value(), b.value()));
   980 template <
typename T>
   981 inline M256I<T> m256i_add(
const M256I<T> &a, 
const M256I<T> &b,
   982     std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
   984     return M256I<T>(_mm256_add_epi16(a.value(), b.value()));
   987 template <
typename T>
   988 inline M256I<T> m256i_add(
const M256I<T> &a, 
const M256I<T> &b,
   989     std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
   991     return M256I<T>(_mm256_add_epi32(a.value(), b.value()));
   994 template <
typename T>
   995 inline M256I<T> m256i_add(
const M256I<T> &a, 
const M256I<T> &b,
   996     std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
   998     return M256I<T>(_mm256_add_epi64(a.value(), b.value()));
  1001 template <
typename T>
  1002 inline M256I<T> m256i_sub(
const M256I<T> &a, 
const M256I<T> &b,
  1003     std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
  1005     return M256I<T>(_mm256_sub_epi8(a.value(), b.value()));
  1008 template <
typename T>
  1009 inline M256I<T> m256i_sub(
const M256I<T> &a, 
const M256I<T> &b,
  1010     std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
  1012     return M256I<T>(_mm256_sub_epi16(a.value(), b.value()));
  1015 template <
typename T>
  1016 inline M256I<T> m256i_sub(
const M256I<T> &a, 
const M256I<T> &b,
  1017     std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
  1019     return M256I<T>(_mm256_sub_epi32(a.value(), b.value()));
  1022 template <
typename T>
  1023 inline M256I<T> m256i_sub(
const M256I<T> &a, 
const M256I<T> &b,
  1024     std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
  1026     return M256I<T>(_mm256_sub_epi64(a.value(), b.value()));
  1029 template <
typename T>
  1030 inline M256I<T> m256i_slli(
const M256I<T> &a, 
int imm8,
  1031     std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
  1033     return M256I<T>(_mm256_slli_epi8(a.value(), imm8));
  1036 template <
typename T>
  1037 inline M256I<T> m256i_slli(
const M256I<T> &a, 
int imm8,
  1038     std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
  1040     return M256I<T>(_mm256_slli_epi16(a.value(), imm8));
  1043 template <
typename T>
  1044 inline M256I<T> m256i_slli(
const M256I<T> &a, 
int imm8,
  1045     std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
  1047     return M256I<T>(_mm256_slli_epi32(a.value(), imm8));
  1050 template <
typename T>
  1051 inline M256I<T> m256i_slli(
const M256I<T> &a, 
int imm8,
  1052     std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
  1054     return M256I<T>(_mm256_slli_epi64(a.value(), imm8));
  1057 template <
typename T>
  1058 inline M256I<T> m256i_srli(
const M256I<T> &a, 
int imm8,
  1059     std::integral_constant<std::size_t, 
sizeof(std::int8_t)>)
  1061     return M256I<T>(_mm256_srli_epi8(a.value(), imm8));
  1064 template <
typename T>
  1065 inline M256I<T> m256i_srli(
const M256I<T> &a, 
int imm8,
  1066     std::integral_constant<std::size_t, 
sizeof(std::int16_t)>)
  1068     return M256I<T>(_mm256_srli_epi16(a.value(), imm8));
  1071 template <
typename T>
  1072 inline M256I<T> m256i_srli(
const M256I<T> &a, 
int imm8,
  1073     std::integral_constant<std::size_t, 
sizeof(std::int32_t)>)
  1075     return M256I<T>(_mm256_srli_epi32(a.value(), imm8));
  1078 template <
typename T>
  1079 inline M256I<T> m256i_srli(
const M256I<T> &a, 
int imm8,
  1080     std::integral_constant<std::size_t, 
sizeof(std::int64_t)>)
  1082     return M256I<T>(_mm256_srli_epi64(a.value(), imm8));
  1087 template <
typename T>
  1088 inline bool operator==(
const M256I<T> &a, 
const M256I<T> &b)
  1090     std::array<std::uint64_t, 4> sa;
  1091     std::array<std::uint64_t, 4> sb;
  1092     a.store_u(sa.data());
  1093     b.store_u(sb.data());
  1098 template <
typename T>
  1099 inline bool operator!=(
const M256I<T> &a, 
const M256I<T> &b)
  1104 template <
typename CharT, 
typename Traits, 
typename T>
  1105 inline std::basic_ostream<CharT, Traits> &
operator<<(
  1106     std::basic_ostream<CharT, Traits> &os, 
const M256I<T> &a)
  1111     std::array<T, M256I<T>::size()> sa;
  1112     a.store_u(sa.data());
  1118 template <
typename CharT, 
typename Traits, 
typename T>
  1119 inline std::basic_istream<CharT, Traits> &
operator>>(
  1120     std::basic_istream<CharT, Traits> &is, M256I<T> &a)
  1125     std::array<T, M256I<T>::size()> sa;
  1129         a.load_u(sa.data());
  1134 template <
typename T>
  1135 inline M256I<T> operator+(
const M256I<T> &a, 
const M256I<T> &b)
  1137     return internal::m256i_add(
  1138         a, b, std::integral_constant<std::size_t, 
sizeof(T)>());
  1141 template <
typename T>
  1142 inline M256I<T> operator-(
const M256I<T> &a, 
const M256I<T> &b)
  1144     return internal::m256i_sub(
  1145         a, b, std::integral_constant<std::size_t, 
sizeof(T)>());
  1148 template <
typename T>
  1149 inline M256I<T> operator&(
const M256I<T> &a, 
const M256I<T> &b)
  1151     return M256I<T>(_mm256_and_si256(a.value(), b.value()));
  1154 template <
typename T>
  1155 inline M256I<T> operator|(
const M256I<T> &a, 
const M256I<T> &b)
  1157     return M256I<T>(_mm256_or_si256(a.value(), b.value()));
  1160 template <
typename T>
  1161 inline M256I<T> operator^(
const M256I<T> &a, 
const M256I<T> &b) {
  1162     return M256I<T>(_mm256_xor_si256(a.value(), b.value()));
  1165 template <
typename T>
  1166 inline M256I<T> operator<<(const M256I<T> &a, 
int imm8)
  1168     return internal::m256i_slli(
  1169         a, imm8, std::integral_constant<std::size_t, 
sizeof(T)>());
  1172 template <
typename T>
  1173 inline M256I<T> operator<<=(M256I<T> &a, 
int imm8)
  1180 template <
typename T>
  1181 inline M256I<T> 
operator>>(
const M256I<T> &a, 
int imm8)
  1183     return internal::m256i_srli(
  1184         a, imm8, std::integral_constant<std::size_t, 
sizeof(T)>());
  1187 template <
typename T>
  1188 inline M256I<T> operator>>=(M256I<T> &a, 
int imm8)
  1196     M256I<T>, T, +, 
operator+, 
operator+=)
  1198     M256I<T>, T, -, 
operator-, 
operator-=)
  1200     M256I<T>, T, &, 
operator&, 
operator&=)
  1202     M256I<T>, T, |, 
operator|, 
operator|=)
  1204     M256I<T>, T, ^, 
operator^, 
operator^=)
  1213     M256(
const __m256 &value) : value_(value) {}
  /// \brief Element count reported by this wrapper (always 8)
  static constexpr std::size_t size()
  {
      return std::size_t{8};
  }
  1217     __m256 &value() { 
return value_; }
  1218     const __m256 &value()
 const { 
return value_; }
  1220     __m256 *data() { 
return &value_; }
  1221     const __m256 *data()
 const { 
return &value_; }
  1223     template <
typename T>
  1224     void load_a(
const T *mem)
  1226         value_ = _mm256_load_ps(reinterpret_cast<const float *>(mem));
  1229     template <
typename T>
  1230     void load_u(
const T *mem)
  1232         value_ = _mm256_loadu_ps(reinterpret_cast<const float *>(mem));
  1235     template <
typename T>
  1236     void load(
const T *mem)
  1238         reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? load_a(mem) :
  1242     template <
typename T>
  1243     void store_a(T *mem)
 const  1245         _mm256_store_ps(reinterpret_cast<float *>(mem), value_);
  1248     template <
typename T>
  1249     void store_u(T *mem)
 const  1251         _mm256_storeu_ps(reinterpret_cast<float *>(mem), value_);
  1254     template <
typename T>
  1255     void store(T *mem)
 const  1257         reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? store_a(mem) :
  1261     void set0() { value_ = _mm256_setzero_ps(); }
  1263     void set1(
float e) { value_ = _mm256_set1_ps(e); }
  1265     void set(
float e7, 
float e6, 
float e5, 
float e4, 
float e3, 
float e2,
  1268         value_ = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
  1275 inline bool operator==(
const M256 &a, 
const M256 &b)
  1277     std::array<float, 8> sa;
  1278     std::array<float, 8> sb;
  1279     a.store_u(sa.data());
  1280     b.store_u(sb.data());
  1285 inline bool operator!=(
const M256 &a, 
const M256 &b) { 
return !(a == b); }
  1287 template <
typename CharT, 
typename Traits>
  1288 inline std::basic_ostream<CharT, Traits> &
operator<<(
  1289     std::basic_ostream<CharT, Traits> &os, 
const M256 &a)
  1294     std::array<float, 8> sa;
  1295     a.store_u(sa.data());
  1301 template <
typename CharT, 
typename Traits>
  1302 inline std::basic_istream<CharT, Traits> &
operator>>(
  1303     std::basic_istream<CharT, Traits> &is, M256 &a)
  1308     std::array<float, 8> sa;
  1312         a.load_u(sa.data());
  1317 inline M256 operator+(
const M256 &a, 
const M256 &b)
  1319     return M256(_mm256_add_ps(a.value(), b.value()));
  1322 inline M256 operator-(
const M256 &a, 
const M256 &b)
  1324     return M256(_mm256_sub_ps(a.value(), b.value()));
  1327 inline M256 operator*(
const M256 &a, 
const M256 &b)
  1329     return M256(_mm256_mul_ps(a.value(), b.value()));
  1332 inline M256 operator/(
const M256 &a, 
const M256 &b)
  1334     return M256(_mm256_div_ps(a.value(), b.value()));
  1349     M256D(
const __m256d &value) : value_(value) {}
  /// \brief Number of elements reported for this type; always 4
  static constexpr std::size_t size()
  {
      return 4;
  }
  1353     __m256d &value() { 
return value_; }
  1354     const __m256d &value()
 const { 
return value_; }
  1356     __m256d *data() { 
return &value_; }
  1357     const __m256d *data()
 const { 
return &value_; }
  1359     template <
typename T>
  1360     void load_a(
const T *mem)
  1362         value_ = _mm256_load_pd(reinterpret_cast<const double *>(mem));
  1365     template <
typename T>
  1366     void load_u(
const T *mem)
  1368         value_ = _mm256_loadu_pd(reinterpret_cast<const double *>(mem));
  /// \brief Load from `mem`, choosing the load routine by address alignment
  ///
  /// The address of `mem` is converted to `std::uintptr_t`; when it is a
  /// multiple of 32, `load_a(mem)` is selected.
  /// NOTE(review): the alternative operand of the conditional is not visible
  /// in this excerpt; presumably it is `load_u(mem)` -- confirm against the
  /// full source.
  1371     template <
typename T>
  1372     void load(
const T *mem)
  1374         reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? load_a(mem) :
  1378     template <
typename T>
  1379     void store_a(T *mem)
 const  1381         _mm256_store_pd(reinterpret_cast<double *>(mem), value_);
  1384     template <
typename T>
  1385     void store_u(T *mem)
 const  1387         _mm256_storeu_pd(reinterpret_cast<double *>(mem), value_);
  /// \brief Store to `mem`, choosing the store routine by address alignment
  ///
  /// The address of `mem` is converted to `std::uintptr_t`; when it is a
  /// multiple of 32, `store_a(mem)` is selected.
  /// NOTE(review): the alternative operand of the conditional is not visible
  /// in this excerpt; presumably it is `store_u(mem)` -- confirm against the
  /// full source.
  1390     template <
typename T>
  1391     void store(T *mem)
 const  1393         reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? store_a(mem) :
  1397     void set0() { value_ = _mm256_setzero_pd(); }
  1399     void set1(
double e) { value_ = _mm256_set1_pd(e); }
  1401     void set(
double e3, 
double e2, 
double e1, 
double e0)
  1403         value_ = _mm256_set_pd(e3, e2, e1, e0);
  /// \brief Equality comparison of two M256D objects
  ///
  /// Each operand is written with `store_u` into a local
  /// `std::array<double, 4>`.
  /// NOTE(review): the statement that produces the result is not visible in
  /// this excerpt; presumably the two arrays are compared -- confirm.
  1410 inline bool operator==(
const M256D &a, 
const M256D &b)
  1412     std::array<double, 4> sa;
  1413     std::array<double, 4> sb;
  1414     a.store_u(sa.data());
  1415     b.store_u(sb.data());
  1420 inline bool operator!=(
const M256D &a, 
const M256D &b) { 
return !(a == b); }
  /// \brief Output stream operator for M256D
  ///
  /// The value of `a` is written with `store_u` into a local
  /// `std::array<double, 4>`.
  /// NOTE(review): the statements that touch `os` (any state check, the
  /// actual output, and the return) are not visible in this excerpt --
  /// confirm against the full source.
  1422 template <
typename CharT, 
typename Traits>
  1423 inline std::basic_ostream<CharT, Traits> &
operator<<(
  1424     std::basic_ostream<CharT, Traits> &os, 
const M256D &a)
  1429     std::array<double, 4> sa;
  1430     a.store_u(sa.data());
  /// \brief Input stream operator for M256D
  ///
  /// `a` is filled with `load_u` from a local `std::array<double, 4>`.
  /// NOTE(review): the statements that read from `is` into the array, the
  /// condition guarding the load, and the return are not visible in this
  /// excerpt -- confirm against the full source.
  1436 template <
typename CharT, 
typename Traits>
  1437 inline std::basic_istream<CharT, Traits> &
operator>>(
  1438     std::basic_istream<CharT, Traits> &is, M256D &a)
  1443     std::array<double, 4> sa;
  1447         a.load_u(sa.data());
  1452 inline M256D operator+(
const M256D &a, 
const M256D &b)
  1454     return M256D(_mm256_add_pd(a.value(), b.value()));
  1457 inline M256D operator-(
const M256D &a, 
const M256D &b)
  1459     return M256D(_mm256_sub_pd(a.value(), b.value()));
  1462 inline M256D operator*(
const M256D &a, 
const M256D &b)
  1464     return M256D(_mm256_mul_pd(a.value(), b.value()));
  1467 inline M256D operator/(
const M256D &a, 
const M256D &b)
  1469     return M256D(_mm256_div_pd(a.value(), b.value()));
  // Argument lists (Type, CType, op, bin, assign) of four macro invocations
  // pairing M256D with double for +, -, * and / together with the matching
  // compound-assignment operators.
  // NOTE(review): the line carrying the macro name before each argument list
  // is not visible in this excerpt; presumably it is
  // VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP -- confirm against the full source.
  1473     M256D, 
double, +, 
operator+, 
operator+=)
  1475     M256D, 
double, -, 
operator-, 
operator-=)
  1477     M256D, 
double, *, 
operator*, 
operator*=)
  1479     M256D, 
double, /, 
operator/, 
operator/=)
  // Trait class template whose nested `type` is read by the M256Type alias
  // for non-integral element types. The primary template is only declared
  // here; specializations for float and double follow.
  // NOTE(review): the `template <>` introducers and the bodies of the two
  // specializations are not visible in this excerpt -- confirm their contents
  // against the full source.
  1484 template <
typename RealType>
  1485 class M256TypeTrait;
  1488 class M256TypeTrait<float>
  1495 class M256TypeTrait<double>
  1504 template <
typename T>
  1505 using M256Type = 
typename std::conditional<std::is_integral<T>::value,
  1506     M256I<T>, 
typename internal::M256TypeTrait<T>::type>::type;
  1508 #endif // VSMC_HAS_AVX2  1512 #endif // VSMC_UTILITY_SIMD_HPP 
std::basic_ostream< CharT, Traits > & operator<<(std::basic_ostream< CharT, Traits > &os, const Sampler< T > &sampler)
 
#define VSMC_DEFINE_UTILITY_SIMD_INTEGER_BINARY_OP(                                                   Type, CType, op, bin, assign)                                                                                      
 
bool operator==(const MKLBase< MKLPtr, Derived > &ptr1, const MKLBase< MKLPtr, Derived > &ptr2)
Comparison of equality of two MKLBase objects. 
 
std::basic_istream< CharT, Traits > & operator>>(std::basic_istream< CharT, Traits > &is, std::array< T, N > &ary)
 
bool operator!=(const MKLBase< MKLPtr, Derived > &ptr1, const MKLBase< MKLPtr, Derived > &ptr2)
Comparison of inequality of two MKLBase objects. 
 
#define VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(Type, CType, op, bin, assign)