#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif

// local SimdLib headers (names assumed from the types and traits used below)
#include "allocator.hpp"
#include "sse2.hpp"
#include "traits.hpp"

#include <cmath>
#include <cstdint>
#include <vector>
namespace tinysimd::abi
{

// default: no AVX2 backend for this scalar type / width combination
template <typename scalarType, int width = 0> struct avx2
{
    using type = void;
};

} // namespace tinysimd::abi
#if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

namespace tinysimd
{

// forward declaration of concrete types
template <typename T> struct avx2Long4;
template <typename T> struct avx2Int8;
struct avx2Double4;
struct avx2Float8;
struct avx2Mask4;
struct avx2Mask8;

namespace abi
{
// mapping between abstract types and concrete floating point types
template <> struct avx2<double>
{
    using type = avx2Double4;
};
template <> struct avx2<float>
{
    using type = avx2Float8;
};
// generic index mapping
template <> struct avx2<std::int64_t>
{
    using type = avx2Long4<std::int64_t>;
};
template <> struct avx2<std::uint64_t>
{
    using type = avx2Long4<std::uint64_t>;
};
#if defined(__APPLE__)
template <> struct avx2<std::size_t>
{
    using type = avx2Long4<std::size_t>;
};
#endif
template <> struct avx2<std::int32_t>
{
    using type = avx2Int8<std::int32_t>;
};
template <> struct avx2<std::uint32_t>
{
    using type = avx2Int8<std::uint32_t>;
};
// specialized index mapping
template <> struct avx2<std::int64_t, 4>
{
    using type = avx2Long4<std::int64_t>;
};
template <> struct avx2<std::uint64_t, 4>
{
    using type = avx2Long4<std::uint64_t>;
};
#if defined(__APPLE__)
template <> struct avx2<std::size_t, 4>
{
    using type = avx2Long4<std::size_t>;
};
#endif
template <> struct avx2<std::int32_t, 4>
{
    using type = sse2Int4<std::int32_t>;
};
template <> struct avx2<std::uint32_t, 4>
{
    using type = sse2Int4<std::uint32_t>;
};
template <> struct avx2<std::int32_t, 8>
{
    using type = avx2Int8<std::int32_t>;
};
template <> struct avx2<std::uint32_t, 8>
{
    using type = avx2Int8<std::uint32_t>;
};
// bool mapping
template <> struct avx2<bool, 4>
{
    using type = avx2Mask4;
};
template <> struct avx2<bool, 8>
{
    using type = avx2Mask8;
};

} // namespace abi
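
// Illustrative sketch (not part of this header): how the abi::avx2 trait is
// typically resolved by a dispatch layer. The optional width tag selects
// between the 4-lane and 8-lane integer types. The alias names below are
// hypothetical; the mapped types mirror the specializations above.
//
//     using vec_t  = tinysimd::abi::avx2<double>::type;            // avx2Double4
//     using idx_t  = tinysimd::abi::avx2<std::uint64_t, 4>::type;  // avx2Long4<std::uint64_t>
//     using mask_t = tinysimd::abi::avx2<bool, vec_t::width>::type; // avx2Mask4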
// concrete types

template <typename T> struct avx2Int8
{
    static_assert(std::is_integral_v<T> && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 32;

    using scalarType  = T;
    using vectorType  = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Int8()                    = default;
    inline avx2Int8(const avx2Int8 &rhs) = default;
    inline avx2Int8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Int8(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }
    explicit inline avx2Int8(scalarArray &rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
    }

    // copy assignment
    inline avx2Int8 &operator=(const avx2Int8 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }

    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
{
    return _mm256_add_epi32(lhs._data, rhs._data);
}

template <typename T, typename U,
          typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
{
    return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
}
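
// Usage sketch (illustrative only): selecting the aligned/unaligned overloads
// via the library's flag tags. The flag names is_aligned / is_not_aligned are
// assumed from the traits header; buffer names are hypothetical.
//
//     alignas(avx2Int8<int>::alignment) int abuf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//     avx2Int8<int> v;
//     v.load(abuf, is_aligned);        // aligned load overload
//     v = v + 10;                      // operator+(vector, scalar)
//     int ubuf[8];
//     v.store(ubuf, is_not_aligned);   // unaligned store overload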
template <typename T> struct avx2Long4
{
    static_assert(std::is_integral_v<T> && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width     = 4;
    static constexpr unsigned int alignment = 32;

    using scalarType  = T;
    using vectorType  = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Long4()                     = default;
    inline avx2Long4(const avx2Long4 &rhs) = default;
    inline avx2Long4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Long4(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }
    explicit inline avx2Long4(scalarArray &rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
    }

    // copy assignment
    inline avx2Long4 &operator=(const avx2Long4 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }

    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
{
    return _mm256_add_epi64(lhs._data, rhs._data);
}

template <typename T, typename U,
          typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
{
    return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
}
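
// Usage sketch (illustrative only): avx2Long4 is used further below as a
// vector of 64-bit gather/scatter offsets. The offset values here are
// hypothetical.
//
//     alignas(avx2Long4<size_t>::alignment) size_t offsets[4] = {0, 16, 32, 48};
//     avx2Long4<size_t> idx(offsets);   // explicit ctor from scalarArray
//     idx = idx + 1;                    // advance all four offsets by one entry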
struct avx2Double4
{
    static constexpr unsigned width     = 4;
    static constexpr unsigned alignment = 32;

    using scalarType      = double;
    using scalarIndexType = std::uint64_t;
    using vectorType      = __m256d;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Double4()                       = default;
    inline avx2Double4(const avx2Double4 &rhs) = default;
    inline avx2Double4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Double4(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

    // copy assignment
    inline avx2Double4 &operator=(const avx2Double4 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_pd(p, _data);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_stream_pd(p, _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_pd(p);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_pd(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
    // gather/scatter with sse2 index vectors
    template <typename T>
    inline void gather(scalarType const *p, const sse2Int4<T> &indices)
    {
        _data = _mm256_i32gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
    {
        // no scatter intrinsics for AVX2
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm_extract_epi32(indices._data, 0)] = tmp[0];
        out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
    }
#endif

    // gather/scatter with avx2 index vectors
    template <typename T>
    inline void gather(scalarType const *p, const avx2Long4<T> &indices)
    {
        _data = _mm256_i64gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
    {
        // no scatter intrinsics for AVX2
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
    }

    // fma: this = a * b + this
    inline void fma(const avx2Double4 &a, const avx2Double4 &b)
    {
        _data = _mm256_fmadd_pd(a._data, b._data, _data);
    }

    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx2Double4 rhs)
    {
        _data = _mm256_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx2Double4 rhs)
    {
        _data = _mm256_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx2Double4 rhs)
    {
        _data = _mm256_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx2Double4 rhs)
    {
        _data = _mm256_div_pd(_data, rhs._data);
    }
};
inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_add_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_sub_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator-(avx2Double4 in)
{
    // flip the sign bit
    return _mm256_xor_pd(in._data, _mm256_set1_pd(-0.0));
}

inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_mul_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_div_pd(lhs._data, rhs._data);
}

inline avx2Double4 sqrt(avx2Double4 in)
{
    return _mm256_sqrt_pd(in._data);
}

inline avx2Double4 abs(avx2Double4 in)
{
    // there is no avx2 abs intrinsic for floating point
    static const __m256d sign_mask = _mm256_set1_pd(-0.); // sign bit only
    return _mm256_andnot_pd(sign_mask, in._data);         // !sign_mask & x
}

inline avx2Double4 min(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_min_pd(lhs._data, rhs._data);
}

inline avx2Double4 max(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_max_pd(lhs._data, rhs._data);
}

inline avx2Double4 log(avx2Double4 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm256_log_pd(in._data);
#else
    // there is no avx2 log intrinsic; fall back to scalar std::log per lane
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    avx2Double4 ret;
    ret.load(tmp);
    return ret;
#endif
}
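
// Usage sketch (illustrative only): a fused multiply-add over four doubles,
// equivalent to acc[k] = a[k] * b[k] + acc[k] for k = 0..3. Buffer names are
// hypothetical; is_aligned is assumed to be the library's alignment flag.
//
//     alignas(avx2Double4::alignment) double x[4] = {1.0, 2.0, 3.0, 4.0};
//     avx2Double4 a, b(2.0), acc(0.0);
//     a.load(x, is_aligned);
//     acc.fma(a, b);                 // acc = a * b + acc
//     avx2Double4 y = sqrt(abs(-acc));
//     y.store(x, is_aligned);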
inline void load_unalign_interleave(
    const double *in, const std::uint32_t dataLen,
    std::vector<avx2Double4, allocator<avx2Double4>> &out)
{
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0] = in[i];
        tmp[1] = in[i + dataLen];
        tmp[2] = in[i + 2 * dataLen];
        tmp[3] = in[i + 3 * dataLen];
        out[i].load(tmp, is_aligned);
    }
}
inline void load_interleave(
    const double *in, std::uint32_t dataLen,
    std::vector<avx2Double4, allocator<avx2Double4>> &out)
{
    alignas(avx2Double4::alignment)
        size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    constexpr uint16_t unrl = 4;
    size_t nBlocks          = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }

    // spillover loop
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}
inline void deinterleave_unalign_store(
    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
    const std::uint32_t dataLen, double *out)
{
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp, is_aligned);
        out[i]               = tmp[0];
        out[i + dataLen]     = tmp[1];
        out[i + 2 * dataLen] = tmp[2];
        out[i + 3 * dataLen] = tmp[3];
    }
}
inline void deinterleave_store(
    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
    std::uint32_t dataLen, double *out)
{
    alignas(avx2Double4::alignment)
        size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
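
// Layout sketch (illustrative only): load_interleave transposes dataLen
// groups of 4 values, stored field-major with stride dataLen, into dataLen
// 4-wide vectors, so out[i] holds {in[i], in[i + dataLen], in[i + 2*dataLen],
// in[i + 3*dataLen]}; deinterleave_store is the inverse. A hypothetical
// round trip (n is the per-field length, of type std::uint32_t):
//
//     std::vector<double> field(4 * n);
//     std::vector<avx2Double4, allocator<avx2Double4>> work(n);
//     load_interleave(field.data(), n, work);
//     for (auto &v : work)
//         v = v * v;                       // elementwise work on 4 fields at once
//     deinterleave_store(work, n, field.data());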
struct avx2Float8
{
    static constexpr unsigned width     = 8;
    static constexpr unsigned alignment = 32;

    using scalarType      = float;
    using scalarIndexType = std::uint32_t;
    using vectorType      = __m256;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Float8()                      = default;
    inline avx2Float8(const avx2Float8 &rhs) = default;
    inline avx2Float8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Float8(const scalarType rhs)
    {
        _data = _mm256_set1_ps(rhs);
    }

    // copy assignment
    inline avx2Float8 &operator=(const avx2Float8 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_ps(p, _data);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_stream_ps(p, _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_ps(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_ps(p);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_ps(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_ps(rhs);
    }

    // gather/scatter with avx2 index vectors
    template <typename T>
    inline void gather(scalarType const *p, const avx2Int8<T> &indices)
    {
        _data = _mm256_i32gather_ps(p, indices._data, 4);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
    {
        // no scatter intrinsics for AVX2
        alignas(alignment) scalarArray tmp;
        _mm256_store_ps(tmp, _data);

        out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
        out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
        out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
        out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
        out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
    }

    // fma: this = a * b + this
    inline void fma(const avx2Float8 &a, const avx2Float8 &b)
    {
        _data = _mm256_fmadd_ps(a._data, b._data, _data);
    }

    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx2Float8 rhs)
    {
        _data = _mm256_add_ps(_data, rhs._data);
    }

    inline void operator-=(avx2Float8 rhs)
    {
        _data = _mm256_sub_ps(_data, rhs._data);
    }

    inline void operator*=(avx2Float8 rhs)
    {
        _data = _mm256_mul_ps(_data, rhs._data);
    }

    inline void operator/=(avx2Float8 rhs)
    {
        _data = _mm256_div_ps(_data, rhs._data);
    }
};
inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_add_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_sub_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator-(avx2Float8 in)
{
    // flip the sign bit
    return _mm256_xor_ps(in._data, _mm256_set1_ps(-0.0));
}

inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_mul_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_div_ps(lhs._data, rhs._data);
}

inline avx2Float8 sqrt(avx2Float8 in)
{
    return _mm256_sqrt_ps(in._data);
}

inline avx2Float8 abs(avx2Float8 in)
{
    // there is no avx2 abs intrinsic for floating point
    static const __m256 sign_mask = _mm256_set1_ps(-0.); // sign bit only
    return _mm256_andnot_ps(sign_mask, in._data);        // !sign_mask & x
}

inline avx2Float8 min(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_min_ps(lhs._data, rhs._data);
}

inline avx2Float8 max(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_max_ps(lhs._data, rhs._data);
}

inline avx2Float8 log(avx2Float8 in)
{
    // there is no avx2 log intrinsic; fall back to scalar std::log per lane
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    tmp[4] = std::log(tmp[4]);
    tmp[5] = std::log(tmp[5]);
    tmp[6] = std::log(tmp[6]);
    tmp[7] = std::log(tmp[7]);
    avx2Float8 ret;
    ret.load(tmp);
    return ret;
}
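
// Usage sketch (illustrative only): gathering eight floats through a vector
// of 32-bit indices. The index values are hypothetical, and base is assumed
// to point at a sufficiently large float array.
//
//     alignas(avx2Int8<std::uint32_t>::alignment)
//         std::uint32_t idx_buf[8] = {0, 8, 16, 24, 32, 40, 48, 56};
//     avx2Int8<std::uint32_t> idx(idx_buf);
//     avx2Float8 v;
//     v.gather(base, idx);   // v[k] = base[idx_buf[k]]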
inline void load_unalign_interleave(
    const double *in, const std::uint32_t dataLen,
    std::vector<avx2Float8, allocator<avx2Float8>> &out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0] = in[i];
        tmp[1] = in[i + dataLen];
        tmp[2] = in[i + 2 * dataLen];
        tmp[3] = in[i + 3 * dataLen];
        tmp[4] = in[i + 4 * dataLen];
        tmp[5] = in[i + 5 * dataLen];
        tmp[6] = in[i + 6 * dataLen];
        tmp[7] = in[i + 7 * dataLen];
        out[i].load(tmp, is_aligned);
    }
}
inline void load_interleave(
    const float *in, std::uint32_t dataLen,
    std::vector<avx2Float8, allocator<avx2Float8>> &out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx2Int8<avx2Float8::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    size_t nBlocks = dataLen / 4;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[4 * i + 0].gather(in, index0);
        out[4 * i + 1].gather(in, index1);
        out[4 * i + 2].gather(in, index2);
        out[4 * i + 3].gather(in, index3);
        index0 = index0 + 4;
        index1 = index1 + 4;
        index2 = index2 + 4;
        index3 = index3 + 4;
    }

    // spillover loop
    for (size_t i = 4 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}
inline void deinterleave_unalign_store(
    const std::vector<avx2Float8, allocator<avx2Float8>> &in,
    const std::uint32_t dataLen, double *out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp, is_aligned);
        out[i]               = tmp[0];
        out[i + dataLen]     = tmp[1];
        out[i + 2 * dataLen] = tmp[2];
        out[i + 3 * dataLen] = tmp[3];
        out[i + 4 * dataLen] = tmp[4];
        out[i + 5 * dataLen] = tmp[5];
        out[i + 6 * dataLen] = tmp[6];
        out[i + 7 * dataLen] = tmp[7];
    }
}
inline void deinterleave_store(
    const std::vector<avx2Float8, allocator<avx2Float8>> &in,
    std::uint32_t dataLen, float *out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx2Int8<avx2Float8::scalarIndexType>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
// mask types: integer vectors where true is all bits set and false is zero
struct avx2Mask4 : avx2Long4<std::uint64_t>
{
    // bring in ctors
    using avx2Long4::avx2Long4;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
{
    return reinterpret_cast<__m256i>(
        _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
}

inline bool operator&&(avx2Mask4 lhs, bool rhs)
{
    // true only if every lane of lhs is true_v
    bool tmp =
        _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));
    return tmp && rhs;
}

struct avx2Mask8 : avx2Int8<std::uint32_t>
{
    // bring in ctors
    using avx2Int8::avx2Int8;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
{
    // predicate 1 == _CMP_LT_OS, i.e. rhs < lhs
    return reinterpret_cast<__m256i>(_mm256_cmp_ps(rhs._data, lhs._data, 1));
}

inline bool operator&&(avx2Mask8 lhs, bool rhs)
{
    // true only if every lane of lhs is true_v
    bool tmp =
        _mm256_testc_si256(lhs._data, _mm256_set1_epi32(avx2Mask8::true_v));
    return tmp && rhs;
}

} // namespace tinysimd

#endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H