#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif

#include <cmath>
#include <cstdint>
#include <type_traits>
#include <vector>

// SimdLib companions (assumed include names): trait tags, aligned allocator, sse2Int4
#include "allocator.hpp"
#include "sse2.hpp"
#include "traits.hpp"

// abstract-to-concrete type mapping; specialised below when AVX2 is available
template <typename scalarType, int width = 0> struct avx2
{
    using type = void;
};
#if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

// forward declaration of concrete types
template <typename T> struct avx2Long4;
template <typename T> struct avx2Int8;
struct avx2Double4;
struct avx2Float8;
struct avx2Mask4;
struct avx2Mask8;

// mapping between abstract types and concrete floating point types
template <> struct avx2<double> { using type = avx2Double4; };
template <> struct avx2<float> { using type = avx2Float8; };
// generic index mapping (index width matches the floating point width)
template <> struct avx2<std::int64_t> { using type = avx2Long4<std::int64_t>; };
template <> struct avx2<std::uint64_t> { using type = avx2Long4<std::uint64_t>; };
#if defined(__APPLE__)
template <> struct avx2<std::size_t> { using type = avx2Long4<std::size_t>; };
#endif
template <> struct avx2<std::int32_t> { using type = avx2Int8<std::int32_t>; };
template <> struct avx2<std::uint32_t> { using type = avx2Int8<std::uint32_t>; };
// specialised index mapping with an explicit vector width
template <> struct avx2<std::int64_t, 4> { using type = avx2Long4<std::int64_t>; };
template <> struct avx2<std::uint64_t, 4> { using type = avx2Long4<std::uint64_t>; };
#if defined(__APPLE__)
template <> struct avx2<std::size_t, 4> { using type = avx2Long4<std::size_t>; };
#endif
template <> struct avx2<std::int32_t, 4> { using type = sse2Int4<std::int32_t>; };
template <> struct avx2<std::uint32_t, 4> { using type = sse2Int4<std::uint32_t>; };
template <> struct avx2<std::int32_t, 8> { using type = avx2Int8<std::int32_t>; };
template <> struct avx2<std::uint32_t, 8> { using type = avx2Int8<std::uint32_t>; };
// bool mapping onto the mask types
template <> struct avx2<bool, 4> { using type = avx2Mask4; };
template <> struct avx2<bool, 8> { using type = avx2Mask8; };
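
// Illustrative sketch (not part of the original header): the avx2 trait is the
// abstract-to-concrete mapping a generic simd<T> front end would query, e.g.
//
//   using vec_t = avx2<double>::type;            // avx2Double4, 4 lanes
//   using idx_t = avx2<std::uint64_t, 4>::type;  // matching 64-bit index vector
//   static_assert(vec_t::width == 4 && vec_t::alignment == 32);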
template <typename T> struct avx2Int8
{
    static_assert(std::is_integral_v<T> && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 32;

    using scalarType  = T;
    using vectorType  = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Int8()                    = default;
    inline avx2Int8(const avx2Int8 &rhs) = default;
    inline avx2Int8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Int8(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }
    explicit inline avx2Int8(scalarArray &rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
    }

    // copy assignment
    inline avx2Int8 &operator=(const avx2Int8 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }

    // subscript operators are convenient but expensive, do not use in hot loops
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};
template <typename T>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
{
    return _mm256_add_epi32(lhs._data, rhs._data);
}

template <typename T, typename U,
          typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
{
    return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
}
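
// Usage sketch (illustrative only), assuming a 32-byte aligned destination:
//
//   alignas(32) std::int32_t buf[8];
//   avx2Int8<std::int32_t> a(1), b(2);
//   (a + b + 3).store(buf);   // buf = {6, 6, 6, 6, 6, 6, 6, 6}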
template <typename T> struct avx2Long4
{
    static_assert(std::is_integral_v<T> && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width     = 4;
    static constexpr unsigned int alignment = 32;

    using scalarType  = T;
    using vectorType  = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Long4()                     = default;
    inline avx2Long4(const avx2Long4 &rhs) = default;
    inline avx2Long4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Long4(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }
    explicit inline avx2Long4(scalarArray &rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
    }

    // copy assignment
    inline avx2Long4 &operator=(const avx2Long4 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }

    // subscript operators are convenient but expensive, do not use in hot loops
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
{
    return _mm256_add_epi64(lhs._data, rhs._data);
}

template <typename T, typename U,
          typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
{
    return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
}
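
// Usage sketch (illustrative only): avx2Long4 is mainly used as an index
// vector for gather/scatter, e.g. building strided offsets for some block
// length n, as done by load_interleave() further down:
//
//   alignas(32) size_t offsets[4] = {0, n, 2 * n, 3 * n};
//   avx2Long4<size_t> index0(offsets); // {0, n, 2n, 3n}
//   avx2Long4<size_t> index1 = index0 + 1;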
struct avx2Double4
{
    static constexpr unsigned width     = 4;
    static constexpr unsigned alignment = 32;

    using scalarType  = double;
    using vectorType  = __m256d;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Double4()                       = default;
    inline avx2Double4(const avx2Double4 &rhs) = default;
    inline avx2Double4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Double4(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

    // copy assignment
    inline avx2Double4 &operator=(const avx2Double4 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_pd(p, _data);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_stream_pd(p, _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_pd(p);
    }

    template <class flag, typename std::enable_if<
                              is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_pd(p);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_pd(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
    // gather/scatter with 32-bit (sse2) indices
    template <typename T>
    inline void gather(scalarType const *p, const sse2Int4<T> &indices)
    {
        _data = _mm256_i32gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
    {
        // no scatter intrinsic in AVX2: spill to an aligned buffer
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm_extract_epi32(indices._data, 0)] = tmp[0];
        out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
    }
#endif

    // gather/scatter with 64-bit (avx2) indices
    template <typename T>
    inline void gather(scalarType const *p, const avx2Long4<T> &indices)
    {
        _data = _mm256_i64gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
    {
        // no scatter intrinsic in AVX2: spill to an aligned buffer
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
    }

    // fused multiply-add: this <- this + a * b
    inline void fma(const avx2Double4 &a, const avx2Double4 &b)
    {
        _data = _mm256_fmadd_pd(a._data, b._data, _data);
    }

    // subscript operators are convenient but expensive, do not use in hot loops
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx2Double4 rhs)
    {
        _data = _mm256_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx2Double4 rhs)
    {
        _data = _mm256_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx2Double4 rhs)
    {
        _data = _mm256_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx2Double4 rhs)
    {
        _data = _mm256_div_pd(_data, rhs._data);
    }
};
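
// Usage sketch (illustrative only): a fused multiply-add over aligned data,
// where in0, in1 and out are assumed to be 32-byte aligned double pointers:
//
//   avx2Double4 x, y, acc(0.0);
//   x.load(in0);
//   y.load(in1);
//   acc.fma(x, y);   // acc += x * y, one _mm256_fmadd_pd per 4 doubles
//   acc.store(out);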
// binary arithmetic operators
inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_add_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_sub_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_mul_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_div_pd(lhs._data, rhs._data);
}

inline avx2Double4 sqrt(avx2Double4 in)
{
    return _mm256_sqrt_pd(in._data);
}

inline avx2Double4 abs(avx2Double4 in)
{
    // -0.0 has only the sign bit set; andnot clears it, yielding |in|
    static const __m256d sign_mask = _mm256_set1_pd(-0.);
    return _mm256_andnot_pd(sign_mask, in._data);
}
inline avx2Double4 log(avx2Double4 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm256_log_pd(in._data);
#else
    // no AVX2 log intrinsic: scalar std::log fallback via an aligned buffer
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    in.store(tmp);
    for (unsigned int i = 0; i < avx2Double4::width; ++i)
        tmp[i] = std::log(tmp[i]);
    avx2Double4 ret;
    ret.load(tmp);
    return ret;
#endif
}
// loads 4 consecutive blocks of dataLen doubles from (possibly unaligned)
// memory and interleaves them into out[i]
inline void load_unalign_interleave(
    const double *in, const std::uint32_t dataLen,
    std::vector<avx2Double4, allocator<avx2Double4>> &out)
{
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0] = in[i];
        tmp[1] = in[i + dataLen];
        tmp[2] = in[i + 2 * dataLen];
        tmp[3] = in[i + 3 * dataLen];
        out[i].load(tmp);
    }
}

inline void load_interleave(
    const double *in, const std::uint32_t dataLen,
    std::vector<avx2Double4, allocator<avx2Double4>> &out)
{
    alignas(avx2Double4::alignment)
        size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    constexpr uint16_t unrl = 4;
    size_t nBlocks          = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }

    // spillover loop
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}

// de-interleaves in[i] and stores back into 4 consecutive blocks of dataLen
inline void deinterleave_unalign_store(
    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
    const std::uint32_t dataLen, double *out)
{
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp);
        out[i]               = tmp[0];
        out[i + dataLen]     = tmp[1];
        out[i + 2 * dataLen] = tmp[2];
        out[i + 3 * dataLen] = tmp[3];
    }
}

inline void deinterleave_store(
    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
    const std::uint32_t dataLen, double *out)
{
    alignas(avx2Double4::alignment)
        size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
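
// Usage sketch (illustrative only): the interleave helpers convert between a
// block layout (4 consecutive blocks of dataLen doubles) and a vector layout
// where out[i] holds element i of each of the 4 blocks:
//
//   std::vector<avx2Double4, allocator<avx2Double4>> work(dataLen);
//   load_interleave(in, dataLen, work);     // work[i] = {in[i], in[i+dataLen], ...}
//   for (auto &v : work) v.fma(v, v);       // some vectorised kernel
//   deinterleave_store(work, dataLen, out); // scatter back to block layout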
struct avx2Float8
{
    static constexpr unsigned width     = 8;
    static constexpr unsigned alignment = 32;

    using scalarType      = float;
    using scalarIndexType = std::uint32_t;
    using vectorType      = __m256;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Float8()                      = default;
    inline avx2Float8(const avx2Float8 &rhs) = default;
    inline avx2Float8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Float8(const scalarType rhs)
    {
        _data = _mm256_set1_ps(rhs);
    }

    // copy assignment
    inline avx2Float8 &operator=(const avx2Float8 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_ps(p, _data);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_stream_ps(p, _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_ps(p);
    }

    template <class flag, typename std::enable_if<
                              is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_ps(p);
    }

    template <class flag, typename std::enable_if<
                              !is_requiring_alignment_v<flag>, bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_ps(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_ps(rhs);
    }

    // gather/scatter with 32-bit (avx2) indices
    template <typename T>
    inline void gather(scalarType const *p, const avx2Int8<T> &indices)
    {
        _data = _mm256_i32gather_ps(p, indices._data, 4);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
    {
        // no scatter intrinsic in AVX2: spill to an aligned buffer
        alignas(alignment) scalarArray tmp;
        _mm256_store_ps(tmp, _data);

        out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
        out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
        out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
        out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
        out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
    }

    // fused multiply-add: this <- this + a * b
    inline void fma(const avx2Float8 &a, const avx2Float8 &b)
    {
        _data = _mm256_fmadd_ps(a._data, b._data, _data);
    }

    // subscript operators are convenient but expensive, do not use in hot loops
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx2Float8 rhs)
    {
        _data = _mm256_add_ps(_data, rhs._data);
    }

    inline void operator-=(avx2Float8 rhs)
    {
        _data = _mm256_sub_ps(_data, rhs._data);
    }

    inline void operator*=(avx2Float8 rhs)
    {
        _data = _mm256_mul_ps(_data, rhs._data);
    }

    inline void operator/=(avx2Float8 rhs)
    {
        _data = _mm256_div_ps(_data, rhs._data);
    }
};
// binary arithmetic operators
inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_add_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_sub_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_mul_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_div_ps(lhs._data, rhs._data);
}

inline avx2Float8 sqrt(avx2Float8 in)
{
    return _mm256_sqrt_ps(in._data);
}

inline avx2Float8 abs(avx2Float8 in)
{
    // -0.0 has only the sign bit set; andnot clears it, yielding |in|
    static const __m256 sign_mask = _mm256_set1_ps(-0.);
    return _mm256_andnot_ps(sign_mask, in._data);
}
inline avx2Float8 log(avx2Float8 in)
{
    // no AVX2 log intrinsic: scalar std::log fallback via an aligned buffer
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    in.store(tmp);
    for (unsigned int i = 0; i < avx2Float8::width; ++i)
        tmp[i] = std::log(tmp[i]);
    avx2Float8 ret;
    ret.load(tmp);
    return ret;
}
inline void load_unalign_interleave(
    const float *in, const std::uint32_t dataLen,
    std::vector<avx2Float8, allocator<avx2Float8>> &out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0] = in[i];
        tmp[1] = in[i + dataLen];
        tmp[2] = in[i + 2 * dataLen];
        tmp[3] = in[i + 3 * dataLen];
        tmp[4] = in[i + 4 * dataLen];
        tmp[5] = in[i + 5 * dataLen];
        tmp[6] = in[i + 6 * dataLen];
        tmp[7] = in[i + 7 * dataLen];
        out[i].load(tmp);
    }
}

inline void load_interleave(
    const float *in, const std::uint32_t dataLen,
    std::vector<avx2Float8, allocator<avx2Float8>> &out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};

    using index_t = avx2Int8<avx2Float8::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    size_t nBlocks = dataLen / 4;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[4 * i + 0].gather(in, index0);
        out[4 * i + 1].gather(in, index1);
        out[4 * i + 2].gather(in, index2);
        out[4 * i + 3].gather(in, index3);
        index0 = index0 + 4;
        index1 = index1 + 4;
        index2 = index2 + 4;
        index3 = index3 + 4;
    }

    // spillover loop
    for (size_t i = 4 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}

inline void deinterleave_unalign_store(
    const std::vector<avx2Float8, allocator<avx2Float8>> &in,
    const std::uint32_t dataLen, float *out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp);
        out[i]               = tmp[0];
        out[i + dataLen]     = tmp[1];
        out[i + 2 * dataLen] = tmp[2];
        out[i + 3 * dataLen] = tmp[3];
        out[i + 4 * dataLen] = tmp[4];
        out[i + 5 * dataLen] = tmp[5];
        out[i + 6 * dataLen] = tmp[6];
        out[i + 7 * dataLen] = tmp[7];
    }
}

inline void deinterleave_store(
    const std::vector<avx2Float8, allocator<avx2Float8>> &in,
    const std::uint32_t dataLen, float *out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx2Int8<avx2Float8::scalarIndexType>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
// mask types: integer vectors whose lanes are all-ones (true) or all-zeros
// (false), produced by comparisons and reduced by operator&&
struct avx2Mask4 : avx2Long4<std::uint64_t>
{
    // bring in ctors
    using avx2Long4::avx2Long4;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
{
    return reinterpret_cast<__m256i>(
        _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
}

inline bool operator&&(avx2Mask4 lhs, bool rhs)
{
    bool tmp =
        _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));

    return tmp && rhs;
}

struct avx2Mask8 : avx2Int8<std::uint32_t>
{
    // bring in ctors
    using avx2Int8::avx2Int8;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
{
    // 1 == _CMP_LT_OS: lhs > rhs  <=>  rhs < lhs
    return reinterpret_cast<__m256i>(_mm256_cmp_ps(rhs._data, lhs._data, 1));
}

inline bool operator&&(avx2Mask8 lhs, bool rhs)
{
    // set1_epi32 (not epi64x) so that all 8 32-bit lanes are checked
    bool tmp =
        _mm256_testc_si256(lhs._data, _mm256_set1_epi32(avx2Mask8::true_v));

    return tmp && rhs;
}
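
// Usage sketch (illustrative only): comparisons produce a full-lane mask and
// operator&& reduces it to a scalar bool (true only if every lane passed):
//
//   avx2Double4 a(2.0), b(1.0);
//   avx2Mask4 m = a > b;   // all lanes true
//   bool all = m && true;  // horizontal reduction via _mm256_testc_si256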

#endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H