#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif

#include "allocator.hpp"
#include "sse2.hpp"
#include "traits.hpp"
#include <cmath>
#include <cstdint>
#include <type_traits>
#include <vector>

namespace tinysimd::abi
{

// default: no concrete vector type mapped for this scalar type / width
template <typename scalarType, int width = 0> struct avx2
{
    using type = void;
};

} // namespace tinysimd::abi

#if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

namespace tinysimd
{

// forward declaration of concrete types
template <typename T> struct avx2Long4;
template <typename T> struct avx2Int8;
struct avx2Double4;
struct avx2Float8;
struct avx2Mask4;
struct avx2Mask8;

namespace abi
{

// mapping between abstract types and concrete floating point types
template <> struct avx2<double>
{
    using type = avx2Double4;
};
template <> struct avx2<float>
{
    using type = avx2Float8;
};

// generic index mapping
template <> struct avx2<std::int64_t>
{
    using type = avx2Long4<std::int64_t>;
};
template <> struct avx2<std::uint64_t>
{
    using type = avx2Long4<std::uint64_t>;
};
#if defined(__APPLE__)
template <> struct avx2<std::size_t>
{
    using type = avx2Long4<std::size_t>;
};
#endif
template <> struct avx2<std::int32_t>
{
    using type = avx2Int8<std::int32_t>;
};
template <> struct avx2<std::uint32_t>
{
    using type = avx2Int8<std::uint32_t>;
};

// specialised index mapping for a fixed vector width
template <> struct avx2<std::int64_t, 4>
{
    using type = avx2Long4<std::int64_t>;
};
template <> struct avx2<std::uint64_t, 4>
{
    using type = avx2Long4<std::uint64_t>;
};
#if defined(__APPLE__)
template <> struct avx2<std::size_t, 4>
{
    using type = avx2Long4<std::size_t>;
};
#endif
template <> struct avx2<std::int32_t, 4>
{
    using type = sse2Int4<std::int32_t>;
};
template <> struct avx2<std::uint32_t, 4>
{
    using type = sse2Int4<std::uint32_t>;
};
template <> struct avx2<std::int32_t, 8>
{
    using type = avx2Int8<std::int32_t>;
};
template <> struct avx2<std::uint32_t, 8>
{
    using type = avx2Int8<std::uint32_t>;
};

// bool mapping
template <> struct avx2<bool, 4>
{
    using type = avx2Mask4;
};
template <> struct avx2<bool, 8>
{
    using type = avx2Mask8;
};

} // namespace abi
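
// Illustrative usage sketch (not library code): kernel code does not name the
// concrete structs directly, it asks the abi trait for the vector type that
// matches a scalar type and optional width. Names below are examples only.
//
//   using vec_t = tinysimd::abi::avx2<double>::type; // -> avx2Double4
//   static_assert(vec_t::width == 4, "4 doubles per AVX2 register");
//
//   using idx_t = tinysimd::abi::avx2<std::uint64_t, 4>::type; // -> avx2Long4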

// concrete types

template <typename T> struct avx2Int8
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 32;

    using scalarType  = T;
    using vectorType  = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Int8()                    = default;
    inline avx2Int8(const avx2Int8 &rhs) = default;
    inline avx2Int8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Int8(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }
    explicit inline avx2Int8(scalarArray &rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
    }

    // copy assignment
    inline avx2Int8 &operator=(const avx2Int8 &) = default;

    // store packed
    inline void store(scalarType *p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // store packed, aligned, non-streaming
    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // store packed, unaligned
    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    // load packed, aligned, non-streaming
    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    // load packed, unaligned
    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
    }

    // broadcast a scalar to all lanes
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive;
    // they should not be used in optimised kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
{
    return _mm256_add_epi32(lhs._data, rhs._data);
}

template <
    typename T, typename U,
    typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
{
    return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
}
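
// Illustrative sketch (not library code): avx2Int8 is typically used as a
// vector of gather/scatter offsets; the scalar overload of operator+ above
// broadcasts the stride to every lane. Names below are examples only.
//
//   alignas(32) std::uint32_t idx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   avx2Int8<std::uint32_t> index(idx);
//   index = index + 8u; // every lane advances by 8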

template <typename T> struct avx2Long4
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width     = 4;
    static constexpr unsigned int alignment = 32;

    using scalarType  = T;
    using vectorType  = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Long4()                     = default;
    inline avx2Long4(const avx2Long4 &rhs) = default;
    inline avx2Long4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Long4(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }
    explicit inline avx2Long4(scalarArray &rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
    }

    // copy assignment
    inline avx2Long4 &operator=(const avx2Long4 &) = default;

    // store packed
    inline void store(scalarType *p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // store packed, aligned, non-streaming
    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // store packed, unaligned
    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    // load packed, aligned, non-streaming
    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    // load packed, unaligned
    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
    }

    // broadcast a scalar to all lanes
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive;
    // they should not be used in optimised kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
{
    return _mm256_add_epi64(lhs._data, rhs._data);
}

template <
    typename T, typename U,
    typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
{
    return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
}

////////////////////////////////////////////////////////////////////////////////

struct avx2Double4
{
    static constexpr unsigned width     = 4;
    static constexpr unsigned alignment = 32;

    using scalarType      = double;
    using scalarIndexType = std::uint64_t;
    using vectorType      = __m256d;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Double4()                       = default;
    inline avx2Double4(const avx2Double4 &rhs) = default;
    inline avx2Double4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Double4(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

    // copy assignment
    inline avx2Double4 &operator=(const avx2Double4 &) = default;

    // store packed
    inline void store(scalarType *p) const
    {
        _mm256_store_pd(p, _data);
    }

    // store packed, aligned, non-streaming
    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_pd(p, _data);
    }

    // store packed, unaligned
    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_pd(p, _data);
    }

    // store packed, streaming (non-temporal)
    template <class flag, typename std::enable_if<is_streaming<flag>::value,
                                                  bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_stream_pd(p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_pd(p);
    }

    // load packed, aligned
    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_pd(p);
    }

    // load packed, unaligned
    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_pd(p);
    }

    // broadcast a scalar to all lanes
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
    // gather/scatter with sse2 32-bit indices
    template <typename T>
    inline void gather(scalarType const *p, const sse2Int4<T> &indices)
    {
        _data = _mm256_i32gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
    {
        // no scatter intrinsics in AVX2: spill to a stack buffer
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm_extract_epi32(indices._data, 0)] = tmp[0];
        out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
    }
#endif

    // gather/scatter with avx2 64-bit indices
    template <typename T>
    inline void gather(scalarType const *p, const avx2Long4<T> &indices)
    {
        _data = _mm256_i64gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
    {
        // no scatter intrinsics in AVX2: spill to a stack buffer
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
    }

    // fma: this = this + a * b
    inline void fma(const avx2Double4 &a, const avx2Double4 &b)
    {
        _data = _mm256_fmadd_pd(a._data, b._data, _data);
    }

    // subscript
    // subscript operators are convenient but expensive;
    // they should not be used in optimised kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx2Double4 rhs)
    {
        _data = _mm256_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx2Double4 rhs)
    {
        _data = _mm256_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx2Double4 rhs)
    {
        _data = _mm256_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx2Double4 rhs)
    {
        _data = _mm256_div_pd(_data, rhs._data);
    }
};
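
// Illustrative sketch (not library code): indexed access through the gather /
// scatter members above. The pointers src/dst and the offsets are examples.
//
//   alignas(32) std::uint64_t off[4] = {0, 5, 9, 12};
//   avx2Long4<std::uint64_t> idx(off);
//   avx2Double4 v;
//   v.gather(src, idx);  // v[k] = src[off[k]]
//   v.scatter(dst, idx); // dst[off[k]] = v[k]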

inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_add_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_sub_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_mul_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_div_pd(lhs._data, rhs._data);
}

inline avx2Double4 sqrt(avx2Double4 in)
{
    return _mm256_sqrt_pd(in._data);
}

inline avx2Double4 abs(avx2Double4 in)
{
    // there is no avx2 abs intrinsic for doubles: clear the sign bit instead
    static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
    return _mm256_andnot_pd(sign_mask, in._data);         // !sign_mask & x
}

inline avx2Double4 log(avx2Double4 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm256_log_pd(in._data);
#else
    // there is no avx2 log intrinsic: fall back to scalar std::log per lane
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    avx2Double4 ret;
    ret.load(tmp);
    return ret;
#endif
}
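
// Illustrative sketch (not library code) of the value-type API defined above;
// variable names are examples only.
//
//   avx2Double4 x(2.0), y(3.0), acc(1.0);
//   acc.fma(x, y);             // acc = acc + x * y  -> 7.0 in every lane
//   avx2Double4 r = sqrt(x * y + acc) / avx2Double4(2.0);
//   double lane0  = r[0];      // scalar access, convenient but slow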

// interleaved load/store helpers: lane j of out[i] maps to in[i + j * dataLen]

inline void load_unalign_interleave(
    const double *in, const std::uint32_t dataLen,
    std::vector<avx2Double4, allocator<avx2Double4>> &out)
{
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0] = in[i];
        tmp[1] = in[i + dataLen];
        tmp[2] = in[i + 2 * dataLen];
        tmp[3] = in[i + 3 * dataLen];
        out[i].load(tmp, is_aligned);
    }
}

inline void load_interleave(
    const double *in, std::uint32_t dataLen,
    std::vector<avx2Double4, allocator<avx2Double4>> &out)
{
    alignas(avx2Double4::alignment)
        size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    constexpr uint16_t unrl = 4;
    size_t nBlocks          = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }

    // spillover loop
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}

inline void deinterleave_unalign_store(
    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
    const std::uint32_t dataLen, double *out)
{
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp, is_aligned);
        out[i]               = tmp[0];
        out[i + dataLen]     = tmp[1];
        out[i + 2 * dataLen] = tmp[2];
        out[i + 3 * dataLen] = tmp[3];
    }
}

inline void deinterleave_store(
    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
    std::uint32_t dataLen, double *out)
{
    alignas(avx2Double4::alignment)
        size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
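
// Illustrative sketch (not library code): a typical round trip. dataLen is the
// number of SIMD vectors per field, so the scalar array holds width * dataLen
// values, one field after another. scalarIn/scalarOut and the kernel body are
// placeholders.
//
//   std::vector<avx2Double4, allocator<avx2Double4>> work(dataLen);
//   load_interleave(scalarIn, dataLen, work);     // gather into registers
//   for (auto &v : work)
//       v.fma(v, avx2Double4(2.0));               // some vectorised kernel
//   deinterleave_store(work, dataLen, scalarOut); // scatter back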

////////////////////////////////////////////////////////////////////////////////

struct avx2Float8
{
    static constexpr unsigned width     = 8;
    static constexpr unsigned alignment = 32;

    using scalarType      = float;
    using scalarIndexType = std::uint32_t;
    using vectorType      = __m256;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Float8()                      = default;
    inline avx2Float8(const avx2Float8 &rhs) = default;
    inline avx2Float8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Float8(const scalarType rhs)
    {
        _data = _mm256_set1_ps(rhs);
    }

    // copy assignment
    inline avx2Float8 &operator=(const avx2Float8 &) = default;

    // store packed
    inline void store(scalarType *p) const
    {
        _mm256_store_ps(p, _data);
    }

    // store packed, aligned, non-streaming
    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_ps(p, _data);
    }

    // store packed, unaligned
    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_ps(p, _data);
    }

    // store packed, streaming (non-temporal)
    template <class flag, typename std::enable_if<is_streaming<flag>::value,
                                                  bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_stream_ps(p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_ps(p);
    }

    // load packed, aligned
    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_ps(p);
    }

    // load packed, unaligned
    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_ps(p);
    }

    // broadcast a scalar to all lanes
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_ps(rhs);
    }

    // gather/scatter with avx2 32-bit indices
    template <typename T>
    inline void gather(scalarType const *p, const avx2Int8<T> &indices)
    {
        _data = _mm256_i32gather_ps(p, indices._data, 4);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
    {
        // no scatter intrinsics in AVX2: spill to a stack buffer
        alignas(alignment) scalarArray tmp;
        _mm256_store_ps(tmp, _data);

        out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
        out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
        out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
        out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
        out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
    }

    // fma: this = this + a * b
    inline void fma(const avx2Float8 &a, const avx2Float8 &b)
    {
        _data = _mm256_fmadd_ps(a._data, b._data, _data);
    }

    // subscript
    // subscript operators are convenient but expensive;
    // they should not be used in optimised kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx2Float8 rhs)
    {
        _data = _mm256_add_ps(_data, rhs._data);
    }

    inline void operator-=(avx2Float8 rhs)
    {
        _data = _mm256_sub_ps(_data, rhs._data);
    }

    inline void operator*=(avx2Float8 rhs)
    {
        _data = _mm256_mul_ps(_data, rhs._data);
    }

    inline void operator/=(avx2Float8 rhs)
    {
        _data = _mm256_div_ps(_data, rhs._data);
    }
};

inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_add_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_sub_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_mul_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_div_ps(lhs._data, rhs._data);
}

inline avx2Float8 sqrt(avx2Float8 in)
{
    return _mm256_sqrt_ps(in._data);
}

inline avx2Float8 abs(avx2Float8 in)
{
    // there is no avx2 abs intrinsic for floats: clear the sign bit instead
    static const __m256 sign_mask = _mm256_set1_ps(-0.); // -0.f = 1 << 31
    return _mm256_andnot_ps(sign_mask, in._data);        // !sign_mask & x
}

inline avx2Float8 log(avx2Float8 in)
{
    // there is no avx2 log intrinsic: fall back to scalar std::log per lane
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    tmp[4] = std::log(tmp[4]);
    tmp[5] = std::log(tmp[5]);
    tmp[6] = std::log(tmp[6]);
    tmp[7] = std::log(tmp[7]);
    avx2Float8 ret;
    ret.load(tmp);
    return ret;
}

inline void load_unalign_interleave(
    const double *in, const std::uint32_t dataLen,
    std::vector<avx2Float8, allocator<avx2Float8>> &out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0] = in[i];
        tmp[1] = in[i + dataLen];
        tmp[2] = in[i + 2 * dataLen];
        tmp[3] = in[i + 3 * dataLen];
        tmp[4] = in[i + 4 * dataLen];
        tmp[5] = in[i + 5 * dataLen];
        tmp[6] = in[i + 6 * dataLen];
        tmp[7] = in[i + 7 * dataLen];
        out[i].load(tmp, is_aligned);
    }
}

inline void load_interleave(
    const float *in, std::uint32_t dataLen,
    std::vector<avx2Float8, allocator<avx2Float8>> &out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};

    using index_t = avx2Int8<avx2Float8::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    size_t nBlocks = dataLen / 4;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[4 * i + 0].gather(in, index0);
        out[4 * i + 1].gather(in, index1);
        out[4 * i + 2].gather(in, index2);
        out[4 * i + 3].gather(in, index3);
        index0 = index0 + 4;
        index1 = index1 + 4;
        index2 = index2 + 4;
        index3 = index3 + 4;
    }

    // spillover loop
    for (size_t i = 4 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}

inline void deinterleave_unalign_store(
    const std::vector<avx2Float8, allocator<avx2Float8>> &in,
    const std::uint32_t dataLen, double *out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp, is_aligned);
        out[i]               = tmp[0];
        out[i + dataLen]     = tmp[1];
        out[i + 2 * dataLen] = tmp[2];
        out[i + 3 * dataLen] = tmp[3];
        out[i + 4 * dataLen] = tmp[4];
        out[i + 5 * dataLen] = tmp[5];
        out[i + 6 * dataLen] = tmp[6];
        out[i + 7 * dataLen] = tmp[7];
    }
}

inline void deinterleave_store(
    const std::vector<avx2Float8, allocator<avx2Float8>> &in,
    std::uint32_t dataLen, float *out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx2Int8<avx2Float8::scalarIndexType>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}

////////////////////////////////////////////////////////////////////////////////
// mask types
// masks are integer types with special properties to allow for
// significantly faster comparisons
////////////////////////////////////////////////////////////////////////////////

struct avx2Mask4 : avx2Long4<std::uint64_t>
{
    // bring in ctors
    using avx2Long4::avx2Long4;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
{
    return reinterpret_cast<__m256i>(
        _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
}

inline bool operator&&(avx2Mask4 lhs, bool rhs)
{
    // true only if every lane of lhs is set (and rhs is true)
    bool tmp =
        _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));

    return tmp && rhs;
}

struct avx2Mask8 : avx2Int8<std::uint32_t>
{
    // bring in ctors
    using avx2Int8::avx2Int8;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
{
    // lhs > rhs expressed as rhs < lhs (_CMP_LT_OS == 1)
    return reinterpret_cast<__m256i>(_mm256_cmp_ps(rhs._data, lhs._data, 1));
}

inline bool operator&&(avx2Mask8 lhs, bool rhs)
{
    // true only if every 32-bit lane of lhs is set (and rhs is true)
    bool tmp =
        _mm256_testc_si256(lhs._data, _mm256_set1_epi32(avx2Mask8::true_v));

    return tmp && rhs;
}
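
// Illustrative sketch (not library code): masks reduce a lane-wise comparison
// to a single branchable bool; names below are examples only.
//
//   avx2Double4 a(2.0), b(1.0);
//   avx2Mask4 m = a > b; // every lane compares true here
//   if (m && true)       // true only if *all* lanes compared true
//   {
//       // safe to take the vectorised fast path
//   }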

} // namespace tinysimd

#endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H