32 #ifndef VSMC_UTILITY_SIMD_HPP 33 #define VSMC_UTILITY_SIMD_HPP 37 #define VSMC_DEFINE_UTILITY_SIMD_INTEGER_BINARY_OP( \ 38 Type, CType, op, bin, assign) \ 39 template <typename T> \ 40 inline Type &assign(Type &a, const Type &b) \ 47 template <typename T> \ 48 inline Type bin(const Type &a, CType b) \ 56 template <typename T> \ 57 inline Type bin(CType a, const Type &b) \ 65 template <typename T> \ 66 inline Type &assign(Type &a, CType b) \ 73 #define VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(Type, CType, op, bin, assign) \ 74 inline Type &assign(Type &a, const Type &b) \ 81 inline Type bin(const Type &a, CType b) \ 89 inline Type bin(CType a, const Type &b) \ 97 inline Type &assign(Type &a, CType b) \ 105 #include <emmintrin.h> 112 template <
typename IntType = __m128i>
116 using value_type = IntType;
120 M128I(
const __m128i &value) : value_(value) {}
122 template <
typename T>
123 M128I(
const M128I<T> &other)
124 : value_(other.value())
128 template <
typename T>
129 M128I<IntType> &operator=(
const M128I<T> &other)
131 value_ = other.value();
136 static constexpr std::size_t size()
138 return sizeof(__m128i) /
sizeof(IntType);
141 __m128i &value() {
return value_; }
142 const __m128i &value()
const {
return value_; }
144 __m128i *data() {
return &value_; }
145 const __m128i *data()
const {
return &value_; }
147 template <
typename T>
148 void load_a(
const T *mem)
150 value_ = _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
153 template <
typename T>
154 void load_u(
const T *mem)
156 value_ = _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem));
159 template <
typename T>
160 void load(
const T *mem)
162 reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? load_a(mem) :
166 template <
typename T>
167 void store_a(T *mem)
const 169 _mm_store_si128(reinterpret_cast<__m128i *>(mem), value_);
172 template <
typename T>
173 void store_u(T *mem)
const 175 _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), value_);
178 template <
typename T>
179 void store(T *mem)
const 181 reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? store_a(mem) :
185 void set0() { value_ = _mm_setzero_si128(); }
187 template <
typename T>
190 value_ = set1(n, std::integral_constant<std::size_t,
sizeof(T)>());
193 template <
typename T>
196 value_ = _mm_set_epi64x(
197 static_cast<VSMC_INT64>(e1), static_cast<VSMC_INT64>(e0));
200 template <
typename T>
201 void set(T e3, T e2, T e1, T e0)
203 value_ = _mm_set_epi32(static_cast<int>(e3), static_cast<int>(e2),
204 static_cast<int>(e1), static_cast<int>(e0));
207 template <
typename T>
208 void set(T e7, T e6, T e5, T e4, T e3, T e2, T e1, T e0)
210 value_ = _mm_set_epi16(static_cast<short>(e7), static_cast<short>(e6),
211 static_cast<short>(e5), static_cast<short>(e4),
212 static_cast<short>(e3), static_cast<short>(e2),
213 static_cast<short>(e1), static_cast<short>(e0));
216 template <
typename T>
217 void set(T e15, T e14, T e13, T e12, T e11, T e10, T e9, T e8, T e7, T e6,
218 T e5, T e4, T e3, T e2, T e1, T e0)
220 value_ = _mm_set_epi8(static_cast<char>(e15), static_cast<char>(e14),
221 static_cast<char>(e13), static_cast<char>(e12),
222 static_cast<char>(e11), static_cast<char>(e10),
223 static_cast<char>(e9), static_cast<char>(e8),
224 static_cast<char>(e7), static_cast<char>(e6),
225 static_cast<char>(e5), static_cast<char>(e4),
226 static_cast<char>(e3), static_cast<char>(e2),
227 static_cast<char>(e1), static_cast<char>(e0));
233 template <
typename T>
234 __m128i set1(T n, std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
236 return _mm_set1_epi8(static_cast<char>(n));
239 template <
typename T>
241 T n, std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
243 return _mm_set1_epi16(static_cast<short>(n));
246 template <
typename T>
248 T n, std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
250 return _mm_set1_epi32(static_cast<int>(n));
253 template <
typename T>
255 T n, std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
257 return _mm_set1_epi64x(static_cast<VSMC_INT64>(n));
264 template <
typename T>
265 inline M128I<T> m128i_add(
const M128I<T> &a,
const M128I<T> &b,
266 std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
268 return M128I<T>(_mm_add_epi8(a.value(), b.value()));
271 template <
typename T>
272 inline M128I<T> m128i_add(
const M128I<T> &a,
const M128I<T> &b,
273 std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
275 return M128I<T>(_mm_add_epi16(a.value(), b.value()));
278 template <
typename T>
279 inline M128I<T> m128i_add(
const M128I<T> &a,
const M128I<T> &b,
280 std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
282 return M128I<T>(_mm_add_epi32(a.value(), b.value()));
285 template <
typename T>
286 inline M128I<T> m128i_add(
const M128I<T> &a,
const M128I<T> &b,
287 std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
289 return M128I<T>(_mm_add_epi64(a.value(), b.value()));
292 template <
typename T>
293 inline M128I<T> m128i_sub(
const M128I<T> &a,
const M128I<T> &b,
294 std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
296 return M128I<T>(_mm_sub_epi8(a.value(), b.value()));
299 template <
typename T>
300 inline M128I<T> m128i_sub(
const M128I<T> &a,
const M128I<T> &b,
301 std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
303 return M128I<T>(_mm_sub_epi16(a.value(), b.value()));
306 template <
typename T>
307 inline M128I<T> m128i_sub(
const M128I<T> &a,
const M128I<T> &b,
308 std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
310 return M128I<T>(_mm_sub_epi32(a.value(), b.value()));
313 template <
typename T>
314 inline M128I<T> m128i_sub(
const M128I<T> &a,
const M128I<T> &b,
315 std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
317 return M128I<T>(_mm_sub_epi64(a.value(), b.value()));
320 template <
typename T>
321 inline M128I<T> m128i_slli(
const M128I<T> &a,
int imm8,
322 std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
324 return M128I<T>(_mm_slli_epi8(a.value(), imm8));
327 template <
typename T>
328 inline M128I<T> m128i_slli(
const M128I<T> &a,
int imm8,
329 std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
331 return M128I<T>(_mm_slli_epi16(a.value(), imm8));
334 template <
typename T>
335 inline M128I<T> m128i_slli(
const M128I<T> &a,
int imm8,
336 std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
338 return M128I<T>(_mm_slli_epi32(a.value(), imm8));
341 template <
typename T>
342 inline M128I<T> m128i_slli(
const M128I<T> &a,
int imm8,
343 std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
345 return M128I<T>(_mm_slli_epi64(a.value(), imm8));
348 template <
typename T>
349 inline M128I<T> m128i_srli(
const M128I<T> &a,
int imm8,
350 std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
352 return M128I<T>(_mm_srli_epi8(a.value(), imm8));
355 template <
typename T>
356 inline M128I<T> m128i_srli(
const M128I<T> &a,
int imm8,
357 std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
359 return M128I<T>(_mm_srli_epi16(a.value(), imm8));
362 template <
typename T>
363 inline M128I<T> m128i_srli(
const M128I<T> &a,
int imm8,
364 std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
366 return M128I<T>(_mm_srli_epi32(a.value(), imm8));
369 template <
typename T>
370 inline M128I<T> m128i_srli(
const M128I<T> &a,
int imm8,
371 std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
373 return M128I<T>(_mm_srli_epi64(a.value(), imm8));
378 template <
typename T>
379 inline bool operator==(
const M128I<T> &a,
const M128I<T> &b)
381 std::array<std::uint64_t, 2> sa;
382 std::array<std::uint64_t, 2> sb;
383 a.store_u(sa.data());
384 b.store_u(sb.data());
389 template <
typename T>
390 inline bool operator!=(
const M128I<T> &a,
const M128I<T> &b)
395 template <
typename CharT,
typename Traits,
typename T>
396 inline std::basic_ostream<CharT, Traits> &
operator<<(
397 std::basic_ostream<CharT, Traits> &os,
const M128I<T> &a)
402 std::array<T, M128I<T>::size()> sa;
403 a.store_u(sa.data());
409 template <
typename CharT,
typename Traits,
typename T>
410 inline std::basic_istream<CharT, Traits> &
operator>>(
411 std::basic_istream<CharT, Traits> &is, M128I<T> &a)
416 std::array<T, M128I<T>::size()> sa;
425 template <
typename T>
426 inline M128I<T> operator+(
const M128I<T> &a,
const M128I<T> &b)
428 return internal::m128i_add(
429 a, b, std::integral_constant<std::size_t,
sizeof(T)>());
432 template <
typename T>
433 inline M128I<T> operator-(
const M128I<T> &a,
const M128I<T> &b)
435 return internal::m128i_sub(
436 a, b, std::integral_constant<std::size_t,
sizeof(T)>());
439 template <
typename T>
440 inline M128I<T> operator&(
const M128I<T> &a,
const M128I<T> &b)
442 return M128I<T>(_mm_and_si128(a.value(), b.value()));
445 template <
typename T>
446 inline M128I<T> operator|(
const M128I<T> &a,
const M128I<T> &b)
448 return M128I<T>(_mm_or_si128(a.value(), b.value()));
451 template <
typename T>
452 inline M128I<T> operator^(
const M128I<T> &a,
const M128I<T> &b) {
453 return M128I<T>(_mm_xor_si128(a.value(), b.value()));
456 template <
typename T>
457 inline M128I<T> operator<<(const M128I<T> &a,
int imm8)
459 return internal::m128i_slli(
460 a, imm8, std::integral_constant<std::size_t,
sizeof(T)>());
463 template <
typename T>
464 inline M128I<T> operator<<=(M128I<T> &a,
int imm8)
471 template <
typename T>
472 inline M128I<T>
operator>>(
const M128I<T> &a,
int imm8)
474 return internal::m128i_srli(
475 a, imm8, std::integral_constant<std::size_t,
sizeof(T)>());
478 template <
typename T>
479 inline M128I<T> operator>>=(M128I<T> &a,
int imm8)
487 M128I<T>, T, +,
operator+,
operator+=)
489 M128I<T>, T, -,
operator-,
operator-=)
491 M128I<T>, T, &,
operator&,
operator&=)
493 M128I<T>, T, |,
operator|,
operator|=)
495 M128I<T>, T, ^,
operator^,
operator^=)
504 M128(
const __m128 &value) : value_(value) {}
506 static constexpr std::size_t size() {
return 4; }
508 __m128 &value() {
return value_; }
509 const __m128 &value()
const {
return value_; }
511 __m128 *data() {
return &value_; }
512 const __m128 *data()
const {
return &value_; }
514 template <
typename T>
515 void load_a(
const T *mem)
517 value_ = _mm_load_ps(reinterpret_cast<const float *>(mem));
520 template <
typename T>
521 void load_u(
const T *mem)
523 value_ = _mm_loadu_ps(reinterpret_cast<const float *>(mem));
526 template <
typename T>
527 void load(
const T *mem)
529 reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? load_a(mem) :
533 template <
typename T>
534 void store_a(T *mem)
const 536 _mm_store_ps(reinterpret_cast<float *>(mem), value_);
539 template <
typename T>
540 void store_u(T *mem)
const 542 _mm_storeu_ps(reinterpret_cast<float *>(mem), value_);
545 template <
typename T>
546 void store(T *mem)
const 548 reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? store_a(mem) :
552 void set0() { value_ = _mm_setzero_ps(); }
554 void set1(
float e) { value_ = _mm_set1_ps(e); }
556 void set(
float e3,
float e2,
float e1,
float e0)
558 value_ = _mm_set_ps(e3, e2, e1, e0);
565 inline bool operator==(
const M128 &a,
const M128 &b)
567 std::array<float, 4> sa;
568 std::array<float, 4> sb;
569 a.store_u(sa.data());
570 b.store_u(sb.data());
575 inline bool operator!=(
const M128 &a,
const M128 &b) {
return !(a == b); }
577 template <
typename CharT,
typename Traits>
578 inline std::basic_ostream<CharT, Traits> &
operator<<(
579 std::basic_ostream<CharT, Traits> &os,
const M128 &a)
584 std::array<float, 4> sa;
585 a.store_u(sa.data());
591 template <
typename CharT,
typename Traits>
592 inline std::basic_istream<CharT, Traits> &
operator>>(
593 std::basic_istream<CharT, Traits> &is, M128 &a)
598 std::array<float, 4> sa;
607 inline M128 operator+(
const M128 &a,
const M128 &b)
609 return M128(_mm_add_ps(a.value(), b.value()));
612 inline M128 operator-(
const M128 &a,
const M128 &b)
614 return M128(_mm_sub_ps(a.value(), b.value()));
617 inline M128 operator*(
const M128 &a,
const M128 &b)
619 return M128(_mm_mul_ps(a.value(), b.value()));
622 inline M128 operator/(
const M128 &a,
const M128 &b)
624 return M128(_mm_div_ps(a.value(), b.value()));
639 M128D(
const __m128d &value) : value_(value) {}
641 static constexpr std::size_t size() {
return 2; }
643 __m128d &value() {
return value_; }
644 const __m128d &value()
const {
return value_; }
646 __m128d *data() {
return &value_; }
647 const __m128d *data()
const {
return &value_; }
649 template <
typename T>
650 void load_a(
const T *mem)
652 value_ = _mm_load_pd(reinterpret_cast<const double *>(mem));
655 template <
typename T>
656 void load_u(
const T *mem)
658 value_ = _mm_loadu_pd(reinterpret_cast<const double *>(mem));
661 template <
typename T>
662 void load(
const T *mem)
664 reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? load_a(mem) :
668 template <
typename T>
669 void store_a(T *mem)
const 671 _mm_store_pd(reinterpret_cast<double *>(mem), value_);
674 template <
typename T>
675 void store_u(T *mem)
const 677 _mm_storeu_pd(reinterpret_cast<double *>(mem), value_);
680 template <
typename T>
681 void store(T *mem)
const 683 reinterpret_cast<std::uintptr_t
>(mem) % 16 == 0 ? store_a(mem) :
687 void set0() { value_ = _mm_setzero_pd(); }
689 void set1(
double e) { value_ = _mm_set1_pd(e); }
691 void set(
double e1,
double e0) { value_ = _mm_set_pd(e1, e0); }
697 inline bool operator==(
const M128D &a,
const M128D &b)
699 std::array<double, 2> sa;
700 std::array<double, 2> sb;
701 a.store_u(sa.data());
702 b.store_u(sb.data());
707 inline bool operator!=(
const M128D &a,
const M128D &b) {
return !(a == b); }
709 template <
typename CharT,
typename Traits>
710 inline std::basic_ostream<CharT, Traits> &
operator<<(
711 std::basic_ostream<CharT, Traits> &os,
const M128D &a)
716 std::array<double, 2> sa;
717 a.store_u(sa.data());
723 template <
typename CharT,
typename Traits>
724 inline std::basic_istream<CharT, Traits> &
operator>>(
725 std::basic_istream<CharT, Traits> &is, M128D &a)
730 std::array<double, 2> sa;
739 inline M128D operator+(
const M128D &a,
const M128D &b)
741 return M128D(_mm_add_pd(a.value(), b.value()));
744 inline M128D operator-(
const M128D &a,
const M128D &b)
746 return M128D(_mm_sub_pd(a.value(), b.value()));
749 inline M128D operator*(
const M128D &a,
const M128D &b)
751 return M128D(_mm_mul_pd(a.value(), b.value()));
754 inline M128D operator/(
const M128D &a,
const M128D &b)
756 return M128D(_mm_div_pd(a.value(), b.value()));
760 M128D,
double, +,
operator+,
operator+=)
762 M128D,
double, -,
operator-,
operator-=)
764 M128D,
double, *,
operator*,
operator*=)
766 M128D,
double, /,
operator/,
operator/=)
771 template <
typename RealType>
775 class M128TypeTrait<float>
782 class M128TypeTrait<double>
791 template <
typename T>
792 using M128Type =
typename std::conditional<std::is_integral<T>::value,
793 M128I<T>,
typename internal::M128TypeTrait<T>::type>::type;
795 #endif // VSMC_HAS_SSE2 798 #include <immintrin.h> 802 template <
typename IntType = __m256i>
806 using value_type = IntType;
810 M256I(
const __m256i &value) : value_(value) {}
812 template <
typename T>
813 M256I(
const M256I<T> &other)
814 : value_(other.value())
818 template <
typename T>
819 M256I<IntType> &operator=(
const M256I<T> &other)
821 value_ = other.value_;
826 static constexpr std::size_t size()
828 return sizeof(__m256i) /
sizeof(IntType);
831 __m256i &value() {
return value_; }
832 const __m256i &value()
const {
return value_; }
834 __m256i *data() {
return &value_; }
835 const __m256i *data()
const {
return &value_; }
837 template <
typename T>
838 void load_a(
const T *mem)
840 value_ = _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
843 template <
typename T>
844 void load_u(
const T *mem)
846 value_ = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
849 template <
typename T>
850 void load(
const T *mem)
852 reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? load_a(mem) :
856 template <
typename T>
857 void store_a(T *mem)
const 859 _mm256_store_si256(reinterpret_cast<__m256i *>(mem), value_);
862 template <
typename T>
863 void store_u(T *mem)
const 865 _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), value_);
868 template <
typename T>
869 void store(T *mem)
const 871 reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? store_a(mem) :
875 void set0() { value_ = _mm256_setzero_si256(); }
877 template <
typename T>
880 value_ = set1(n, std::integral_constant<std::size_t,
sizeof(T)>());
883 template <
typename T>
884 void set(T e3, T e2, T e1, T e0)
886 value_ = _mm256_set_epi64x(static_cast<VSMC_INT64>(e3),
887 static_cast<VSMC_INT64>(e2), static_cast<VSMC_INT64>(e1),
888 static_cast<VSMC_INT64>(e0));
891 template <
typename T>
892 void set(T e7, T e6, T e5, T e4, T e3, T e2, T e1, T e0)
894 value_ = _mm256_set_epi32(static_cast<int>(e7), static_cast<int>(e6),
895 static_cast<int>(e5), static_cast<int>(e4), static_cast<int>(e3),
896 static_cast<int>(e2), static_cast<int>(e1), static_cast<int>(e0));
899 template <
typename T>
900 void set(T e15, T e14, T e13, T e12, T e11, T e10, T e9, T e8, T e7, T e6,
901 T e5, T e4, T e3, T e2, T e1, T e0)
904 _mm256_set_epi16(static_cast<short>(e15), static_cast<short>(e14),
905 static_cast<short>(e13), static_cast<short>(e12),
906 static_cast<short>(e11), static_cast<short>(e10),
907 static_cast<short>(e9), static_cast<short>(e8),
908 static_cast<short>(e7), static_cast<short>(e6),
909 static_cast<short>(e5), static_cast<short>(e4),
910 static_cast<short>(e3), static_cast<short>(e2),
911 static_cast<short>(e1), static_cast<short>(e0));
914 template <
typename T>
915 void set(T e31, T e30, T e29, T e28, T e27, T e26, T e25, T e24, T e23,
916 T e22, T e21, T e20, T e19, T e18, T e17, T e16, T e15, T e14, T e13,
917 T e12, T e11, T e10, T e9, T e8, T e7, T e6, T e5, T e4, T e3, T e2,
921 _mm256_set_epi8(static_cast<char>(e31), static_cast<char>(e30),
922 static_cast<char>(e29), static_cast<char>(e28),
923 static_cast<char>(e27), static_cast<char>(e26),
924 static_cast<char>(e25), static_cast<char>(e24),
925 static_cast<char>(e23), static_cast<char>(e22),
926 static_cast<char>(e21), static_cast<char>(e20),
927 static_cast<char>(e19), static_cast<char>(e18),
928 static_cast<char>(e17), static_cast<char>(e16),
929 static_cast<char>(e15), static_cast<char>(e14),
930 static_cast<char>(e13), static_cast<char>(e12),
931 static_cast<char>(e11), static_cast<char>(e10),
932 static_cast<char>(e9), static_cast<char>(e8),
933 static_cast<char>(e7), static_cast<char>(e6),
934 static_cast<char>(e5), static_cast<char>(e4),
935 static_cast<char>(e3), static_cast<char>(e2),
936 static_cast<char>(e1), static_cast<char>(e0));
942 template <
typename T>
943 __m256i set1(T n, std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
945 return _mm256_set1_epi8(static_cast<char>(n));
948 template <
typename T>
950 T n, std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
952 return _mm256_set1_epi16(static_cast<short>(n));
955 template <
typename T>
957 T n, std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
959 return _mm256_set1_epi32(static_cast<int>(n));
962 template <
typename T>
964 T n, std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
966 return _mm256_set1_epi64x(static_cast<long long>(n));
973 template <
typename T>
974 inline M256I<T> m256i_add(
const M256I<T> &a,
const M256I<T> &b,
975 std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
977 return M256I<T>(_mm256_add_epi8(a.value(), b.value()));
980 template <
typename T>
981 inline M256I<T> m256i_add(
const M256I<T> &a,
const M256I<T> &b,
982 std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
984 return M256I<T>(_mm256_add_epi16(a.value(), b.value()));
987 template <
typename T>
988 inline M256I<T> m256i_add(
const M256I<T> &a,
const M256I<T> &b,
989 std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
991 return M256I<T>(_mm256_add_epi32(a.value(), b.value()));
994 template <
typename T>
995 inline M256I<T> m256i_add(
const M256I<T> &a,
const M256I<T> &b,
996 std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
998 return M256I<T>(_mm256_add_epi64(a.value(), b.value()));
1001 template <
typename T>
1002 inline M256I<T> m256i_sub(
const M256I<T> &a,
const M256I<T> &b,
1003 std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
1005 return M256I<T>(_mm256_sub_epi8(a.value(), b.value()));
1008 template <
typename T>
1009 inline M256I<T> m256i_sub(
const M256I<T> &a,
const M256I<T> &b,
1010 std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
1012 return M256I<T>(_mm256_sub_epi16(a.value(), b.value()));
1015 template <
typename T>
1016 inline M256I<T> m256i_sub(
const M256I<T> &a,
const M256I<T> &b,
1017 std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
1019 return M256I<T>(_mm256_sub_epi32(a.value(), b.value()));
1022 template <
typename T>
1023 inline M256I<T> m256i_sub(
const M256I<T> &a,
const M256I<T> &b,
1024 std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
1026 return M256I<T>(_mm256_sub_epi64(a.value(), b.value()));
1029 template <
typename T>
1030 inline M256I<T> m256i_slli(
const M256I<T> &a,
int imm8,
1031 std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
1033 return M256I<T>(_mm256_slli_epi8(a.value(), imm8));
1036 template <
typename T>
1037 inline M256I<T> m256i_slli(
const M256I<T> &a,
int imm8,
1038 std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
1040 return M256I<T>(_mm256_slli_epi16(a.value(), imm8));
1043 template <
typename T>
1044 inline M256I<T> m256i_slli(
const M256I<T> &a,
int imm8,
1045 std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
1047 return M256I<T>(_mm256_slli_epi32(a.value(), imm8));
1050 template <
typename T>
1051 inline M256I<T> m256i_slli(
const M256I<T> &a,
int imm8,
1052 std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
1054 return M256I<T>(_mm256_slli_epi64(a.value(), imm8));
1057 template <
typename T>
1058 inline M256I<T> m256i_srli(
const M256I<T> &a,
int imm8,
1059 std::integral_constant<std::size_t,
sizeof(std::int8_t)>)
1061 return M256I<T>(_mm256_srli_epi8(a.value(), imm8));
1064 template <
typename T>
1065 inline M256I<T> m256i_srli(
const M256I<T> &a,
int imm8,
1066 std::integral_constant<std::size_t,
sizeof(std::int16_t)>)
1068 return M256I<T>(_mm256_srli_epi16(a.value(), imm8));
1071 template <
typename T>
1072 inline M256I<T> m256i_srli(
const M256I<T> &a,
int imm8,
1073 std::integral_constant<std::size_t,
sizeof(std::int32_t)>)
1075 return M256I<T>(_mm256_srli_epi32(a.value(), imm8));
1078 template <
typename T>
1079 inline M256I<T> m256i_srli(
const M256I<T> &a,
int imm8,
1080 std::integral_constant<std::size_t,
sizeof(std::int64_t)>)
1082 return M256I<T>(_mm256_srli_epi64(a.value(), imm8));
1087 template <
typename T>
1088 inline bool operator==(
const M256I<T> &a,
const M256I<T> &b)
1090 std::array<std::uint64_t, 4> sa;
1091 std::array<std::uint64_t, 4> sb;
1092 a.store_u(sa.data());
1093 b.store_u(sb.data());
1098 template <
typename T>
1099 inline bool operator!=(
const M256I<T> &a,
const M256I<T> &b)
1104 template <
typename CharT,
typename Traits,
typename T>
1105 inline std::basic_ostream<CharT, Traits> &
operator<<(
1106 std::basic_ostream<CharT, Traits> &os,
const M256I<T> &a)
1111 std::array<T, M256I<T>::size()> sa;
1112 a.store_u(sa.data());
1118 template <
typename CharT,
typename Traits,
typename T>
1119 inline std::basic_istream<CharT, Traits> &
operator>>(
1120 std::basic_istream<CharT, Traits> &is, M256I<T> &a)
1125 std::array<T, M256I<T>::size()> sa;
1129 a.load_u(sa.data());
1134 template <
typename T>
1135 inline M256I<T> operator+(
const M256I<T> &a,
const M256I<T> &b)
1137 return internal::m256i_add(
1138 a, b, std::integral_constant<std::size_t,
sizeof(T)>());
1141 template <
typename T>
1142 inline M256I<T> operator-(
const M256I<T> &a,
const M256I<T> &b)
1144 return internal::m256i_sub(
1145 a, b, std::integral_constant<std::size_t,
sizeof(T)>());
1148 template <
typename T>
1149 inline M256I<T> operator&(
const M256I<T> &a,
const M256I<T> &b)
1151 return M256I<T>(_mm256_and_si256(a.value(), b.value()));
1154 template <
typename T>
1155 inline M256I<T> operator|(
const M256I<T> &a,
const M256I<T> &b)
1157 return M256I<T>(_mm256_or_si256(a.value(), b.value()));
1160 template <
typename T>
1161 inline M256I<T> operator^(
const M256I<T> &a,
const M256I<T> &b) {
1162 return M256I<T>(_mm256_xor_si256(a.value(), b.value()));
1165 template <
typename T>
1166 inline M256I<T> operator<<(const M256I<T> &a,
int imm8)
1168 return internal::m256i_slli(
1169 a, imm8, std::integral_constant<std::size_t,
sizeof(T)>());
1172 template <
typename T>
1173 inline M256I<T> operator<<=(M256I<T> &a,
int imm8)
1180 template <
typename T>
1181 inline M256I<T>
operator>>(
const M256I<T> &a,
int imm8)
1183 return internal::m256i_srli(
1184 a, imm8, std::integral_constant<std::size_t,
sizeof(T)>());
1187 template <
typename T>
1188 inline M256I<T> operator>>=(M256I<T> &a,
int imm8)
1196 M256I<T>, T, +,
operator+,
operator+=)
1198 M256I<T>, T, -,
operator-,
operator-=)
1200 M256I<T>, T, &,
operator&,
operator&=)
1202 M256I<T>, T, |,
operator|,
operator|=)
1204 M256I<T>, T, ^,
operator^,
operator^=)
1213 M256(
const __m256 &value) : value_(value) {}
1215 static constexpr std::size_t size() {
return 8; }
1217 __m256 &value() {
return value_; }
1218 const __m256 &value()
const {
return value_; }
1220 __m256 *data() {
return &value_; }
1221 const __m256 *data()
const {
return &value_; }
1223 template <
typename T>
1224 void load_a(
const T *mem)
1226 value_ = _mm256_load_ps(reinterpret_cast<const float *>(mem));
1229 template <
typename T>
1230 void load_u(
const T *mem)
1232 value_ = _mm256_loadu_ps(reinterpret_cast<const float *>(mem));
1235 template <
typename T>
1236 void load(
const T *mem)
1238 reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? load_a(mem) :
1242 template <
typename T>
1243 void store_a(T *mem)
const 1245 _mm256_store_ps(reinterpret_cast<float *>(mem), value_);
1248 template <
typename T>
1249 void store_u(T *mem)
const 1251 _mm256_storeu_ps(reinterpret_cast<float *>(mem), value_);
1254 template <
typename T>
1255 void store(T *mem)
const 1257 reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? store_a(mem) :
1261 void set0() { value_ = _mm256_setzero_ps(); }
1263 void set1(
float e) { value_ = _mm256_set1_ps(e); }
1265 void set(
float e7,
float e6,
float e5,
float e4,
float e3,
float e2,
1268 value_ = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
1275 inline bool operator==(
const M256 &a,
const M256 &b)
1277 std::array<float, 8> sa;
1278 std::array<float, 8> sb;
1279 a.store_u(sa.data());
1280 b.store_u(sb.data());
1285 inline bool operator!=(
const M256 &a,
const M256 &b) {
return !(a == b); }
1287 template <
typename CharT,
typename Traits>
1288 inline std::basic_ostream<CharT, Traits> &
operator<<(
1289 std::basic_ostream<CharT, Traits> &os,
const M256 &a)
1294 std::array<float, 8> sa;
1295 a.store_u(sa.data());
1301 template <
typename CharT,
typename Traits>
1302 inline std::basic_istream<CharT, Traits> &
operator>>(
1303 std::basic_istream<CharT, Traits> &is, M256 &a)
1308 std::array<float, 8> sa;
1312 a.load_u(sa.data());
1317 inline M256 operator+(
const M256 &a,
const M256 &b)
1319 return M256(_mm256_add_ps(a.value(), b.value()));
1322 inline M256 operator-(
const M256 &a,
const M256 &b)
1324 return M256(_mm256_sub_ps(a.value(), b.value()));
1327 inline M256 operator*(
const M256 &a,
const M256 &b)
1329 return M256(_mm256_mul_ps(a.value(), b.value()));
1332 inline M256 operator/(
const M256 &a,
const M256 &b)
1334 return M256(_mm256_div_ps(a.value(), b.value()));
1349 M256D(
const __m256d &value) : value_(value) {}
1351 static constexpr std::size_t size() {
return 4; }
1353 __m256d &value() {
return value_; }
1354 const __m256d &value()
const {
return value_; }
1356 __m256d *data() {
return &value_; }
1357 const __m256d *data()
const {
return &value_; }
1359 template <
typename T>
1360 void load_a(
const T *mem)
1362 value_ = _mm256_load_pd(reinterpret_cast<const double *>(mem));
1365 template <
typename T>
1366 void load_u(
const T *mem)
1368 value_ = _mm256_loadu_pd(reinterpret_cast<const double *>(mem));
1371 template <
typename T>
1372 void load(
const T *mem)
1374 reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? load_a(mem) :
1378 template <
typename T>
1379 void store_a(T *mem)
const 1381 _mm256_store_pd(reinterpret_cast<double *>(mem), value_);
1384 template <
typename T>
1385 void store_u(T *mem)
const 1387 _mm256_storeu_pd(reinterpret_cast<double *>(mem), value_);
1390 template <
typename T>
1391 void store(T *mem)
const 1393 reinterpret_cast<std::uintptr_t
>(mem) % 32 == 0 ? store_a(mem) :
1397 void set0() { value_ = _mm256_setzero_pd(); }
1399 void set1(
double e) { value_ = _mm256_set1_pd(e); }
1401 void set(
double e3,
double e2,
double e1,
double e0)
1403 value_ = _mm256_set_pd(e3, e2, e1, e0);
1410 inline bool operator==(
const M256D &a,
const M256D &b)
1412 std::array<double, 4> sa;
1413 std::array<double, 4> sb;
1414 a.store_u(sa.data());
1415 b.store_u(sb.data());
1420 inline bool operator!=(
const M256D &a,
const M256D &b) {
return !(a == b); }
1422 template <
typename CharT,
typename Traits>
1423 inline std::basic_ostream<CharT, Traits> &
operator<<(
1424 std::basic_ostream<CharT, Traits> &os,
const M256D &a)
1429 std::array<double, 4> sa;
1430 a.store_u(sa.data());
1436 template <
typename CharT,
typename Traits>
1437 inline std::basic_istream<CharT, Traits> &
operator>>(
1438 std::basic_istream<CharT, Traits> &is, M256D &a)
1443 std::array<double, 4> sa;
1447 a.load_u(sa.data());
1452 inline M256D operator+(
const M256D &a,
const M256D &b)
1454 return M256D(_mm256_add_pd(a.value(), b.value()));
1457 inline M256D operator-(
const M256D &a,
const M256D &b)
1459 return M256D(_mm256_sub_pd(a.value(), b.value()));
1462 inline M256D operator*(
const M256D &a,
const M256D &b)
1464 return M256D(_mm256_mul_pd(a.value(), b.value()));
1467 inline M256D operator/(
const M256D &a,
const M256D &b)
1469 return M256D(_mm256_div_pd(a.value(), b.value()));
1473 M256D,
double, +,
operator+,
operator+=)
1475 M256D,
double, -,
operator-,
operator-=)
1477 M256D,
double, *,
operator*,
operator*=)
1479 M256D,
double, /,
operator/,
operator/=)
1484 template <
typename RealType>
1485 class M256TypeTrait;
1488 class M256TypeTrait<float>
1495 class M256TypeTrait<double>
1504 template <
typename T>
1505 using M256Type =
typename std::conditional<std::is_integral<T>::value,
1506 M256I<T>,
typename internal::M256TypeTrait<T>::type>::type;
1508 #endif // VSMC_HAS_AVX2 1512 #endif // VSMC_UTILITY_SIMD_HPP
std::basic_ostream< CharT, Traits > & operator<<(std::basic_ostream< CharT, Traits > &os, const Sampler< T > &sampler)
#define VSMC_DEFINE_UTILITY_SIMD_INTEGER_BINARY_OP( Type, CType, op, bin, assign)
bool operator==(const MKLBase< MKLPtr, Derived > &ptr1, const MKLBase< MKLPtr, Derived > &ptr2)
Comparison of equality of two MKLBase objects.
std::basic_istream< CharT, Traits > & operator>>(std::basic_istream< CharT, Traits > &is, std::array< T, N > &ary)
bool operator!=(const MKLBase< MKLPtr, Derived > &ptr1, const MKLBase< MKLPtr, Derived > &ptr2)
Comparison of inequality of two MKLBase objects.
#define VSMC_DEFINE_UTILITY_SIMD_REAL_BINARY_OP(Type, CType, op, bin, assign)