#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif

#include <cmath>
#include <cstdint>
#include <type_traits>
#include <vector>

// The allocator, the alignment/streaming load-store tags (is_aligned,
// is_requiring_alignment_v, is_streaming_v) and the avx2Int8 type used below
// are provided by the companion SimdLib headers.
template <typename scalarType, int width = 0> struct avx512
{
    // default: no vector type available for this scalar type / width
    using type = void;
};
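// The avx512 struct above is the abi mapping: each specialisation below maps a
// scalar type (and optionally a vector width) to a concrete vector wrapper via
// its nested `type` alias.  Minimal usage sketch (illustrative only; it assumes
// the usual tinysimd::abi namespace wrapping of this header):
//
//   using vec_t = typename tinysimd::abi::avx512<double>::type; // avx512Double8
//   vec_t a(1.0), b(2.5);
//   vec_t c = a + b; // lane-wise addition over 8 doubles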
#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
// forward declaration of concrete types
template <typename T> struct avx512Long8;
template <typename T> struct avx512Int16;
struct avx512Double8;
struct avx512Float16;
struct avx512Mask8;
struct avx512Mask16;
// mapping between abstract types and concrete floating point types
template <> struct avx512<double>
{
    using type = avx512Double8;
};
template <> struct avx512<float>
{
    using type = avx512Float16;
};
// generic index mapping
template <> struct avx512<std::int64_t>
{
    using type = avx512Long8<std::int64_t>;
};
template <> struct avx512<std::uint64_t>
{
    using type = avx512Long8<std::uint64_t>;
};
#if defined(__APPLE__)
template <> struct avx512<std::size_t>
{
    using type = avx512Long8<std::size_t>;
};
#endif
template <> struct avx512<std::int32_t>
{
    using type = avx512Int16<std::int32_t>;
};
template <> struct avx512<std::uint32_t>
{
    using type = avx512Int16<std::uint32_t>;
};
// specialised index mapping (explicit width)
template <> struct avx512<std::int64_t, 8>
{
    using type = avx512Long8<std::int64_t>;
};
template <> struct avx512<std::uint64_t, 8>
{
    using type = avx512Long8<std::uint64_t>;
};
#if defined(__APPLE__)
template <> struct avx512<std::size_t, 8>
{
    using type = avx512Long8<std::size_t>;
};
#endif
template <> struct avx512<std::int32_t, 8>
{
    using type = avx2Int8<std::int32_t>;
};
template <> struct avx512<std::uint32_t, 8>
{
    using type = avx2Int8<std::uint32_t>;
};
template <> struct avx512<std::int32_t, 16>
{
    using type = avx512Int16<std::int32_t>;
};
template <> struct avx512<std::uint32_t, 16>
{
    using type = avx512Int16<std::uint32_t>;
};
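// Note: for 32-bit integers a width of 8 only fills 256 bits, so those
// specialisations reuse the AVX2 avx2Int8 wrapper; the full 512-bit
// avx512Int16 wrapper is selected for width 16.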
template <> struct avx512<bool, 8>
{
    using type = avx512Mask8;
};
template <> struct avx512<bool, 16>
{
    using type = avx512Mask16;
};
template <typename T> struct avx512Int16
{
    static_assert(std::is_integral_v<T> && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int width     = 16;
    static constexpr unsigned int alignment = 64;

    using scalarType  = T;
    using vectorType  = __m512i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;
    inline avx512Int16()                       = default;
    inline avx512Int16(const avx512Int16 &rhs) = default;
    inline avx512Int16(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Int16(const scalarType rhs)
    {
        _data = _mm512_set1_epi32(rhs);
    }
    explicit inline avx512Int16(scalarArray &rhs)
    {
        _data = _mm512_load_epi32(rhs);
    }

    inline avx512Int16 &operator=(const avx512Int16 &) = default;
    inline void store(scalarType *p) const
    {
        _mm512_store_epi32(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_epi32(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_epi32(p, _data);
    }
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_epi32(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_epi32(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        // _mm512_loadu_epi32 is not implemented by all compilers; since this
        // is a bit-wise load with no extension, the generic unaligned 512-bit
        // load below is equivalent.
        _data = _mm512_loadu_si512(p);
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_epi32(rhs);
    }
    // subscript operators are convenient but expensive, so they should not be
    // used in optimised kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};
template <typename T>
inline avx512Int16<T> operator+(avx512Int16<T> lhs, avx512Int16<T> rhs)
{
    return _mm512_add_epi32(lhs._data, rhs._data);
}

template <typename T, typename U,
          typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
inline avx512Int16<T> operator+(avx512Int16<T> lhs, U rhs)
{
    return _mm512_add_epi32(lhs._data, _mm512_set1_epi32(rhs));
}
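// Usage sketch for the integer wrapper (illustrative only; the local variables
// are hypothetical):
//
//   avx512Int16<std::int32_t> a(1), b(2); // broadcast constructors
//   auto c = a + b;                       // lane-wise add, every c[k] == 3
//   auto d = a + 5;                       // the scalar operand is broadcast first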
template <typename T> struct avx512Long8
{
    static_assert(std::is_integral_v<T> && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType  = T;
    using vectorType  = __m512i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;
    inline avx512Long8()                       = default;
    inline avx512Long8(const avx512Long8 &rhs) = default;
    inline avx512Long8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Long8(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }
    explicit inline avx512Long8(scalarArray &rhs)
    {
        _data = _mm512_load_epi64(rhs);
    }

    inline avx512Long8 &operator=(const avx512Long8 &) = default;
    inline void store(scalarType *p) const
    {
        _mm512_store_epi64(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_epi64(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_epi64(p, _data);
    }
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_epi64(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_epi64(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        // _mm512_loadu_epi64 is not implemented by all compilers; since this
        // is a bit-wise load with no extension, the generic unaligned 512-bit
        // load below is equivalent.
        _data = _mm512_loadu_si512(p);
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }
    // subscript operators are convenient but expensive, so they should not be
    // used in optimised kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};
template <typename T>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
{
    return _mm512_add_epi64(lhs._data, rhs._data);
}

template <typename T, typename U,
          typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
{
    return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
}
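// avx512Long8 also serves as the 64-bit index vector for the gather/scatter
// members of avx512Double8 below; the vector-plus-scalar operator+ lets a whole
// set of lane indices be advanced at once, e.g. (sketch):
//
//   avx512Long8<std::uint64_t> idx(0UL); // every lane starts at 0
//   idx = idx + 4;                       // every lane advanced by 4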
struct avx512Double8
{
    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType      = double;
    using scalarIndexType = std::uint64_t;
    using vectorType      = __m512d;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;
    inline avx512Double8()                         = default;
    inline avx512Double8(const avx512Double8 &rhs) = default;
    inline avx512Double8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Double8(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }

    inline avx512Double8 &operator=(const avx512Double8 &) = default;
    inline void store(scalarType *p) const
    {
        _mm512_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_stream_pd(p, _data);
    }
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_loadu_pd(p);
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }
    // gather/scatter with 32-bit indices (avx2Int8)
    template <typename T>
    inline void gather(scalarType const *p, const avx2Int8<T> &indices)
    {
        _data = _mm512_i32gather_pd(indices._data, p, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
    {
        _mm512_i32scatter_pd(out, indices._data, _data, 8);
    }

    // gather/scatter with 64-bit indices (avx512Long8)
    template <typename T>
    inline void gather(scalarType const *p, const avx512Long8<T> &indices)
    {
        _data = _mm512_i64gather_pd(indices._data, p, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
    {
        _mm512_i64scatter_pd(out, indices._data, _data, 8);
    }
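    // Gather/scatter semantics (illustrative sketch, not additional API): for a
    // vector v and an index vector idx of either supported kind,
    //
    //   v.gather(p, idx);    // v[k]        = p[idx[k]] for each of the 8 lanes
    //   v.scatter(out, idx); // out[idx[k]] = v[k]
    //
    // The scale factor of 8 passed to the intrinsics is sizeof(double), so the
    // indices count elements, not bytes.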
    inline void fma(const avx512Double8 &a, const avx512Double8 &b)
    {
        _data = _mm512_fmadd_pd(a._data, b._data, _data);
    }
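    // fma updates this vector in place as _data = a * b + _data, performed as a
    // single fused multiply-add (one rounding step instead of two).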
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
    inline void operator+=(avx512Double8 rhs)
    {
        _data = _mm512_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx512Double8 rhs)
    {
        _data = _mm512_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx512Double8 rhs)
    {
        _data = _mm512_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx512Double8 rhs)
    {
        _data = _mm512_div_pd(_data, rhs._data);
    }
};
inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_add_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_sub_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_mul_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_div_pd(lhs._data, rhs._data);
}
inline avx512Double8 sqrt(avx512Double8 in)
{
    return _mm512_sqrt_pd(in._data);
}

inline avx512Double8 abs(avx512Double8 in)
{
    return _mm512_abs_pd(in._data);
}
inline avx512Double8 log(avx512Double8 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm512_log_pd(in._data);
#else
    // there is no AVX512 log intrinsic without SVML; fall back to scalar log
    // on each lane
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    in.store(tmp, is_aligned);
    for (size_t i = 0; i < avx512Double8::width; ++i)
    {
        tmp[i] = std::log(tmp[i]);
    }
    avx512Double8 ret;
    ret.load(tmp, is_aligned);
    return ret;
#endif
}
inline void load_unalign_interleave(
    const double *in, const size_t dataLen,
    std::vector<avx512Double8, allocator<avx512Double8>> &out)
{
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0] = in[i];
        tmp[1] = in[i + dataLen];
        tmp[2] = in[i + 2 * dataLen];
        tmp[3] = in[i + 3 * dataLen];
        tmp[4] = in[i + 4 * dataLen];
        tmp[5] = in[i + 5 * dataLen];
        tmp[6] = in[i + 6 * dataLen];
        tmp[7] = in[i + 7 * dataLen];
        out[i].load(tmp, is_aligned);
    }
}
inline void load_interleave(
    const double *in, const size_t dataLen,
    std::vector<avx512Double8, allocator<avx512Double8>> &out)
{
    alignas(avx512Double8::alignment)
        avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
            0,           dataLen,     2 * dataLen, 3 * dataLen,
            4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};

    using index_t = avx512Long8<avx512Double8::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    constexpr uint16_t unrl = 4;
    size_t nBlocks          = dataLen / unrl;
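    // The main loop below is unrolled by a factor of 4 using four independent
    // index vectors, so several gathers can be in flight at once instead of
    // each iteration waiting on the previous gather to complete.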
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }
    // spillover loop for the remaining dataLen % unrl entries
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}
inline void deinterleave_unalign_store(
    const std::vector<avx512Double8, allocator<avx512Double8>> &in,
    const size_t dataLen, double *out)
{
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp, is_aligned);
        out[i]               = tmp[0];
        out[i + dataLen]     = tmp[1];
        out[i + 2 * dataLen] = tmp[2];
        out[i + 3 * dataLen] = tmp[3];
        out[i + 4 * dataLen] = tmp[4];
        out[i + 5 * dataLen] = tmp[5];
        out[i + 6 * dataLen] = tmp[6];
        out[i + 7 * dataLen] = tmp[7];
    }
}
inline void deinterleave_store(
    const std::vector<avx512Double8, allocator<avx512Double8>> &in,
    const size_t dataLen, double *out)
{
    alignas(avx512Double8::alignment)
        avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
            0,           dataLen,     2 * dataLen, 3 * dataLen,
            4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx512Long8<avx512Double8::scalarIndexType>;
    index_t index0(tmp);
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
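// Usage sketch for the interleave helpers (illustrative only; `nq`, `in` and
// `out` are hypothetical): with 8 fields of nq doubles stored back-to-back in
// `in`, load_interleave packs them into structure-of-arrays form and
// deinterleave_store writes them back:
//
//   std::vector<avx512Double8, allocator<avx512Double8>> work(nq);
//   load_interleave(in, nq, work);     // work[i][k] = in[i + k * nq]
//   // ... operate on work ...
//   deinterleave_store(work, nq, out); // out[i + k * nq] = work[i][k]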
struct avx512Float16
{
    static constexpr unsigned int width     = 16;
    static constexpr unsigned int alignment = 64;

    using scalarType      = float;
    using scalarIndexType = std::uint32_t;
    using vectorType      = __m512;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;
    inline avx512Float16()                         = default;
    inline avx512Float16(const avx512Float16 &rhs) = default;
    inline avx512Float16(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Float16(const scalarType rhs)
    {
        _data = _mm512_set1_ps(rhs);
    }

    inline avx512Float16 &operator=(const avx512Float16 &) = default;
    inline void store(scalarType *p) const
    {
        _mm512_store_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_stream_ps(p, _data);
    }
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_ps(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_ps(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_loadu_ps(p);
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_ps(rhs);
    }
    // gather/scatter with 32-bit indices (avx512Int16)
    template <typename T>
    inline void gather(scalarType const *p, const avx512Int16<T> &indices)
    {
        _data = _mm512_i32gather_ps(indices._data, p, sizeof(scalarType));
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx512Int16<T> &indices) const
    {
        _mm512_i32scatter_ps(out, indices._data, _data, sizeof(scalarType));
    }
    inline void fma(const avx512Float16 &a, const avx512Float16 &b)
    {
        _data = _mm512_fmadd_ps(a._data, b._data, _data);
    }
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
    inline void operator+=(avx512Float16 rhs)
    {
        _data = _mm512_add_ps(_data, rhs._data);
    }

    inline void operator-=(avx512Float16 rhs)
    {
        _data = _mm512_sub_ps(_data, rhs._data);
    }

    inline void operator*=(avx512Float16 rhs)
    {
        _data = _mm512_mul_ps(_data, rhs._data);
    }

    inline void operator/=(avx512Float16 rhs)
    {
        _data = _mm512_div_ps(_data, rhs._data);
    }
};
inline avx512Float16 operator+(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_add_ps(lhs._data, rhs._data);
}

inline avx512Float16 operator-(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_sub_ps(lhs._data, rhs._data);
}

inline avx512Float16 operator*(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_mul_ps(lhs._data, rhs._data);
}

inline avx512Float16 operator/(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_div_ps(lhs._data, rhs._data);
}
inline avx512Float16 sqrt(avx512Float16 in)
{
    return _mm512_sqrt_ps(in._data);
}

inline avx512Float16 abs(avx512Float16 in)
{
    return _mm512_abs_ps(in._data);
}
inline avx512Float16 log(avx512Float16 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm512_log_ps(in._data);
#else
    // there is no AVX512 log intrinsic without SVML; fall back to scalar log
    // on each lane
    alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
    in.store(tmp, is_aligned);
    for (size_t i = 0; i < avx512Float16::width; ++i)
    {
        tmp[i] = std::log(tmp[i]);
    }
    avx512Float16 ret;
    ret.load(tmp, is_aligned);
    return ret;
#endif
}
inline void load_unalign_interleave(
    const float *in, const size_t dataLen,
    std::vector<avx512Float16, allocator<avx512Float16>> &out)
{
    alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0]  = in[i];
        tmp[1]  = in[i + dataLen];
        tmp[2]  = in[i + 2 * dataLen];
        tmp[3]  = in[i + 3 * dataLen];
        tmp[4]  = in[i + 4 * dataLen];
        tmp[5]  = in[i + 5 * dataLen];
        tmp[6]  = in[i + 6 * dataLen];
        tmp[7]  = in[i + 7 * dataLen];
        tmp[8]  = in[i + 8 * dataLen];
        tmp[9]  = in[i + 9 * dataLen];
        tmp[10] = in[i + 10 * dataLen];
        tmp[11] = in[i + 11 * dataLen];
        tmp[12] = in[i + 12 * dataLen];
        tmp[13] = in[i + 13 * dataLen];
        tmp[14] = in[i + 14 * dataLen];
        tmp[15] = in[i + 15 * dataLen];
        out[i].load(tmp, is_aligned);
    }
}
inline void load_interleave(
    const float *in, const std::uint32_t dataLen,
    std::vector<avx512Float16, allocator<avx512Float16>> &out)
{
    // dataLen is taken as a 32-bit value so the index entries below match the
    // 32-bit avx512Int16 index type without narrowing
    alignas(avx512Float16::alignment)
        avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
            0,            dataLen,      2 * dataLen,  3 * dataLen,
            4 * dataLen,  5 * dataLen,  6 * dataLen,  7 * dataLen,
            8 * dataLen,  9 * dataLen,  10 * dataLen, 11 * dataLen,
            12 * dataLen, 13 * dataLen, 14 * dataLen, 15 * dataLen};

    using index_t = avx512Int16<avx512Float16::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;
    constexpr uint16_t unrl = 4;
    size_t nBlocks          = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }
    // spillover loop for the remaining dataLen % unrl entries
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}
inline void deinterleave_unalign_store(
    const std::vector<avx512Float16, allocator<avx512Float16>> &in,
    const size_t dataLen, float *out)
{
    alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp, is_aligned);
        out[i]                = tmp[0];
        out[i + dataLen]      = tmp[1];
        out[i + 2 * dataLen]  = tmp[2];
        out[i + 3 * dataLen]  = tmp[3];
        out[i + 4 * dataLen]  = tmp[4];
        out[i + 5 * dataLen]  = tmp[5];
        out[i + 6 * dataLen]  = tmp[6];
        out[i + 7 * dataLen]  = tmp[7];
        out[i + 8 * dataLen]  = tmp[8];
        out[i + 9 * dataLen]  = tmp[9];
        out[i + 10 * dataLen] = tmp[10];
        out[i + 11 * dataLen] = tmp[11];
        out[i + 12 * dataLen] = tmp[12];
        out[i + 13 * dataLen] = tmp[13];
        out[i + 14 * dataLen] = tmp[14];
        out[i + 15 * dataLen] = tmp[15];
    }
}
inline void deinterleave_store(
    const std::vector<avx512Float16, allocator<avx512Float16>> &in,
    const std::uint32_t dataLen, float *out)
{
    // 32-bit index entries to match the avx512Int16 index type
    alignas(avx512Float16::alignment)
        avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
            0,            dataLen,      2 * dataLen,  3 * dataLen,
            4 * dataLen,  5 * dataLen,  6 * dataLen,  7 * dataLen,
            8 * dataLen,  9 * dataLen,  10 * dataLen, 11 * dataLen,
            12 * dataLen, 13 * dataLen, 14 * dataLen, 15 * dataLen};
    using index_t = avx512Int16<avx512Float16::scalarIndexType>;
    index_t index0(tmp);
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
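// The float helpers above mirror the double versions one-for-one; the only
// differences are the vector width (16 lanes instead of 8) and the use of
// 32-bit avx512Int16 index vectors for the gather/scatter based variants.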
struct avx512Mask8 : avx512Long8<std::uint64_t>
{
    // bring in the base-class constructors
    using avx512Long8::avx512Long8;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};
inline avx512Mask8 operator>(avx512Double8 lhs, avx512Double8 rhs)
{
    __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
    return _mm512_maskz_set1_epi64(mask, avx512Mask8::true_v);
}
inline bool operator&&(avx512Mask8 lhs, bool rhs)
{
    __m512i val_true = _mm512_set1_epi64(avx512Mask8::true_v);
    __mmask8 mask    = _mm512_test_epi64_mask(lhs._data, val_true);
    unsigned int tmp = _cvtmask16_u32(mask);
    return tmp && rhs;
}
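// Mask usage sketch (illustrative only): comparisons return a full-width mask
// vector whose lanes are either all-ones (true_v) or zero, and operator&&
// reduces that mask against a scalar bool, e.g.
//
//   avx512Double8 a(1.0), b(2.0);
//   avx512Mask8 m = a > b;    // per-lane comparison
//   if (m && true) { /* branch on the reduced mask */ }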
struct avx512Mask16 : avx512Int16<std::uint32_t>
{
    // bring in the base-class constructors
    using avx512Int16::avx512Int16;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};
inline avx512Mask16 operator>(avx512Float16 lhs, avx512Float16 rhs)
{
    __mmask16 mask = _mm512_cmp_ps_mask(lhs._data, rhs._data, _CMP_GT_OQ);
    return _mm512_maskz_set1_epi32(mask, avx512Mask16::true_v);
}
inline bool operator&&(avx512Mask16 lhs, bool rhs)
{
    __m512i val_true = _mm512_set1_epi32(avx512Mask16::true_v);
    __mmask16 mask   = _mm512_test_epi32_mask(lhs._data, val_true);
    unsigned int tmp = _cvtmask16_u32(mask);
    return tmp && rhs;
}

#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H