#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif

#include <cmath>
#include <cstdint>
#include <type_traits>
#include <vector>

// tinysimd support headers: the aligned allocator<> and the load/store traits
// (is_aligned, is_requiring_alignment_v, is_streaming_v) used below, plus the
// avx2Int8 index type referenced by the width-8 integer mappings.
#include "allocator.hpp"
#include "avx2.hpp"
#include "traits.hpp"
// forward declaration of the abi mapping; specialised below for the scalar
// types that have an AVX512 implementation
template <typename scalarType, int width = 0> struct avx512;

#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

// forward declaration of concrete vector types
template <typename T> struct avx512Long8;
template <typename T> struct avx512Int16;
struct avx512Double8;
struct avx512Float16;
struct avx512Mask8;
struct avx512Mask16;
// mapping between abstract types and concrete floating-point types
template <> struct avx512<double>        { using type = avx512Double8; };
template <> struct avx512<float>         { using type = avx512Float16; };

// generic integer/index mapping (index width matches the scalar width)
template <> struct avx512<std::int64_t>  { using type = avx512Long8<std::int64_t>; };
template <> struct avx512<std::uint64_t> { using type = avx512Long8<std::uint64_t>; };
#if defined(__APPLE__)
// on macOS std::size_t is a distinct type from std::uint64_t
template <> struct avx512<std::size_t>   { using type = avx512Long8<std::size_t>; };
#endif
template <> struct avx512<std::int32_t>  { using type = avx512Int16<std::int32_t>; };
template <> struct avx512<std::uint32_t> { using type = avx512Int16<std::uint32_t>; };

// integer/index mapping for an explicitly requested vector width
template <> struct avx512<std::int64_t, 8>   { using type = avx512Long8<std::int64_t>; };
template <> struct avx512<std::uint64_t, 8>  { using type = avx512Long8<std::uint64_t>; };
#if defined(__APPLE__)
template <> struct avx512<std::size_t, 8>    { using type = avx512Long8<std::size_t>; };
#endif
template <> struct avx512<std::int32_t, 8>   { using type = avx2Int8<std::int32_t>; };
template <> struct avx512<std::uint32_t, 8>  { using type = avx2Int8<std::uint32_t>; };
template <> struct avx512<std::int32_t, 16>  { using type = avx512Int16<std::int32_t>; };
template <> struct avx512<std::uint32_t, 16> { using type = avx512Int16<std::uint32_t>; };

// boolean mask mapping
template <> struct avx512<bool, 8>  { using type = avx512Mask8; };
template <> struct avx512<bool, 16> { using type = avx512Mask16; };
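// Usage sketch (illustrative only, not part of this header): the abi trait is
// how client code is expected to pick a concrete vector type for a scalar type
// and an optional width. The aliases vec_t/index_t below are hypothetical
// client-side names.
//
//   using vec_t   = avx512<double>::type;                        // avx512Double8
//   using index_t = avx512<std::uint64_t, vec_t::width>::type;   // avx512Long8
//   static_assert(vec_t::width == 8 && vec_t::alignment == 64);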
////////////////////////////////////////////////////////////////////////////////
// concrete vector types
////////////////////////////////////////////////////////////////////////////////

template <typename T> struct avx512Int16
{
    static_assert(std::is_integral_v<T> && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int width     = 16;
    static constexpr unsigned int alignment = 64;

    using scalarType  = T;
    using vectorType  = __m512i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx512Int16()                       = default;
    inline avx512Int16(const avx512Int16 &rhs) = default;
    inline avx512Int16(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Int16(const scalarType rhs)
    {
        _data = _mm512_set1_epi32(rhs);
    }
    explicit inline avx512Int16(scalarArray &rhs)
    {
        _data = _mm512_load_epi32(rhs);
    }

    // copy assignment
    inline avx512Int16 &operator=(const avx512Int16 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm512_store_epi32(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_epi32(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_epi32(p, _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_epi32(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_epi32(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_loadu_si512(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_epi32(rhs);
    }

    // subscript: convenient but goes through memory, avoid in hot loops
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx512Int16<T> operator+(avx512Int16<T> lhs, avx512Int16<T> rhs)
{
    return _mm512_add_epi32(lhs._data, rhs._data);
}

template <typename T, typename U,
          typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
inline avx512Int16<T> operator+(avx512Int16<T> lhs, U rhs)
{
    return _mm512_add_epi32(lhs._data, _mm512_set1_epi32(rhs));
}
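// Usage sketch (illustrative only): avx512Int16 packs sixteen 32-bit integers
// and is used below both as the gather/scatter index type for avx512Float16
// and as the storage type behind avx512Mask16.
//
//   alignas(avx512Int16<std::uint32_t>::alignment) std::uint32_t idx[16] = {
//       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
//   avx512Int16<std::uint32_t> offsets(idx); // aligned-array constructor
//   offsets = offsets + 16u;                 // advance every lane by 16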
template <typename T> struct avx512Long8
{
    static_assert(std::is_integral_v<T> && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType  = T;
    using vectorType  = __m512i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx512Long8()                       = default;
    inline avx512Long8(const avx512Long8 &rhs) = default;
    inline avx512Long8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Long8(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }
    explicit inline avx512Long8(scalarArray &rhs)
    {
        _data = _mm512_load_epi64(rhs);
    }

    // copy assignment
    inline avx512Long8 &operator=(const avx512Long8 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm512_store_epi64(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_epi64(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_epi64(p, _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_epi64(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_epi64(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_loadu_si512(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }

    // subscript: convenient but goes through memory, avoid in hot loops
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
{
    return _mm512_add_epi64(lhs._data, rhs._data);
}

template <typename T, typename U,
          typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
{
    return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
}
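// Usage sketch (illustrative only): avx512Long8 is the 64-bit integer vector
// used as the gather/scatter index type for avx512Double8; the interleave
// helpers further down build such index vectors and advance them by a scalar.
//
//   alignas(avx512Long8<std::uint64_t>::alignment) std::uint64_t idx[8] = {
//       0, 8, 16, 24, 32, 40, 48, 56};
//   avx512Long8<std::uint64_t> offsets(idx);
//   offsets = offsets + 1u; // shift every lane by one element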
////////////////////////////////////////////////////////////////////////////////
// double-precision vector type
////////////////////////////////////////////////////////////////////////////////

struct avx512Double8
{
    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType      = double;
    using scalarIndexType = std::uint64_t;
    using vectorType      = __m512d;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx512Double8()                         = default;
    inline avx512Double8(const avx512Double8 &rhs) = default;
    inline avx512Double8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Double8(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }

    // copy assignment
    inline avx512Double8 &operator=(const avx512Double8 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm512_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_stream_pd(p, _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_loadu_pd(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }

    // gather/scatter with 32-bit index vectors
    template <typename T>
    inline void gather(scalarType const *p, const avx2Int8<T> &indices)
    {
        _data = _mm512_i32gather_pd(indices._data, p, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
    {
        _mm512_i32scatter_pd(out, indices._data, _data, 8);
    }

    // gather/scatter with 64-bit index vectors
    template <typename T>
    inline void gather(scalarType const *p, const avx512Long8<T> &indices)
    {
        _data = _mm512_i64gather_pd(indices._data, p, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
    {
        _mm512_i64scatter_pd(out, indices._data, _data, 8);
    }

    // fused multiply-add: this += a * b
    inline void fma(const avx512Double8 &a, const avx512Double8 &b)
    {
        _data = _mm512_fmadd_pd(a._data, b._data, _data);
    }

    // subscript: convenient but goes through memory, avoid in hot loops
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // in-place arithmetic
    inline void operator+=(avx512Double8 rhs)
    {
        _data = _mm512_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx512Double8 rhs)
    {
        _data = _mm512_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx512Double8 rhs)
    {
        _data = _mm512_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx512Double8 rhs)
    {
        _data = _mm512_div_pd(_data, rhs._data);
    }
};
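// Usage sketch (illustrative only; the buffers a, b, c, src and the index
// array are hypothetical): a fused multiply-accumulate over aligned buffers
// and a gather through a 64-bit index vector, using only members defined
// above.
//
//   alignas(avx512Double8::alignment) double a[8], b[8], c[8];
//   avx512Double8 va, vb, vc;
//   va.load(a, is_aligned);
//   vb.load(b, is_aligned);
//   vc.load(c, is_aligned);
//   vc.fma(va, vb);            // vc += va * vb
//   vc.store(c, is_aligned);
//
//   alignas(64) std::uint64_t idx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
//   avx512Long8<std::uint64_t> indices(idx);
//   va.gather(src, indices);   // va[j] = src[idx[j]], src is a const double*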
inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_add_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_sub_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator-(avx512Double8 in)
{
    return _mm512_sub_pd(_mm512_set1_pd(-0.0), in._data);
}

inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_mul_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_div_pd(lhs._data, rhs._data);
}

inline avx512Double8 sqrt(avx512Double8 in)
{
    return _mm512_sqrt_pd(in._data);
}

inline avx512Double8 abs(avx512Double8 in)
{
    return _mm512_abs_pd(in._data);
}

inline avx512Double8 min(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_min_pd(lhs._data, rhs._data);
}

inline avx512Double8 max(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_max_pd(lhs._data, rhs._data);
}
inline avx512Double8 log(avx512Double8 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm512_log_pd(in._data);
#else
    // no AVX512 log intrinsic without SVML: scalar fallback through memory
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    in.store(tmp, is_aligned);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    tmp[4] = std::log(tmp[4]);
    tmp[5] = std::log(tmp[5]);
    tmp[6] = std::log(tmp[6]);
    tmp[7] = std::log(tmp[7]);
    avx512Double8 ret;
    ret.load(tmp, is_aligned);
    return ret;
#endif
}
////////////////////////////////////////////////////////////////////////////////
// interleaved (structure-of-arrays) load / store helpers
////////////////////////////////////////////////////////////////////////////////

inline void load_unalign_interleave(
    const double *in, const std::uint32_t dataLen,
    std::vector<avx512Double8, allocator<avx512Double8>> &out)
{
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0] = in[i];
        tmp[1] = in[i + dataLen];
        tmp[2] = in[i + 2 * dataLen];
        tmp[3] = in[i + 3 * dataLen];
        tmp[4] = in[i + 4 * dataLen];
        tmp[5] = in[i + 5 * dataLen];
        tmp[6] = in[i + 6 * dataLen];
        tmp[7] = in[i + 7 * dataLen];
        out[i].load(tmp, is_aligned);
    }
}
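// Layout sketch (illustrative only): load_unalign_interleave treats `in` as
// width (= 8) consecutive fields of dataLen doubles each and transposes them
// into one vector per index i, i.e.
//
//   out[i][j] == in[i + j * dataLen]   for j = 0, ..., 7
//
// so each of the 8 scalar fields occupies one lane. The gather-based
// load_interleave below produces the same result using index vectors.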
inline void load_interleave(
    const double *in, std::uint32_t dataLen,
    std::vector<avx512Double8, allocator<avx512Double8>> &out)
{
    alignas(avx512Double8::alignment)
        avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
            0,           dataLen,     2 * dataLen, 3 * dataLen,
            4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};

    using index_t = avx512Long8<avx512Double8::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    constexpr uint16_t unrl = 4;
    size_t nBlocks          = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }

    // spillover loop
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}
inline void deinterleave_unalign_store(
    const std::vector<avx512Double8, allocator<avx512Double8>> &in,
    const std::uint32_t dataLen, double *out)
{
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp, is_aligned);
        out[i]               = tmp[0];
        out[i + dataLen]     = tmp[1];
        out[i + 2 * dataLen] = tmp[2];
        out[i + 3 * dataLen] = tmp[3];
        out[i + 4 * dataLen] = tmp[4];
        out[i + 5 * dataLen] = tmp[5];
        out[i + 6 * dataLen] = tmp[6];
        out[i + 7 * dataLen] = tmp[7];
    }
}
inline void deinterleave_store(
    const std::vector<avx512Double8, allocator<avx512Double8>> &in,
    std::uint32_t dataLen, double *out)
{
    alignas(avx512Double8::alignment)
        avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
            0,           dataLen,     2 * dataLen, 3 * dataLen,
            4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx512Long8<avx512Double8::scalarIndexType>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
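// Usage sketch (illustrative only; `in`, `out` and `dataLen` are the same
// hypothetical buffers/size as in the signatures above): the intended round
// trip is load_interleave -> vector arithmetic -> deinterleave_store, with the
// working vector sized as dataLen elements.
//
//   std::vector<avx512Double8, allocator<avx512Double8>> work(dataLen);
//   load_interleave(in, dataLen, work);       // in:  8 * dataLen doubles
//   for (auto &v : work)
//       v = v * v;                            // any vector arithmetic
//   deinterleave_store(work, dataLen, out);   // out: 8 * dataLen doubles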
////////////////////////////////////////////////////////////////////////////////
// single-precision vector type
////////////////////////////////////////////////////////////////////////////////

struct avx512Float16
{
    static constexpr unsigned int width     = 16;
    static constexpr unsigned int alignment = 64;

    using scalarType      = float;
    using scalarIndexType = std::uint32_t;
    using vectorType      = __m512;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx512Float16()                         = default;
    inline avx512Float16(const avx512Float16 &rhs) = default;
    inline avx512Float16(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Float16(const scalarType rhs)
    {
        _data = _mm512_set1_ps(rhs);
    }

    // copy assignment
    inline avx512Float16 &operator=(const avx512Float16 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm512_store_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag> &&
                                          !is_streaming_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_stream_ps(p, _data);
    }

    // load
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_ps(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_ps(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment_v<flag>,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_loadu_ps(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_ps(rhs);
    }

    // gather/scatter with 32-bit index vectors
    template <typename T>
    inline void gather(scalarType const *p, const avx512Int16<T> &indices)
    {
        _data = _mm512_i32gather_ps(indices._data, p, sizeof(scalarType));
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx512Int16<T> &indices) const
    {
        _mm512_i32scatter_ps(out, indices._data, _data, sizeof(scalarType));
    }

    // fused multiply-add: this += a * b
    inline void fma(const avx512Float16 &a, const avx512Float16 &b)
    {
        _data = _mm512_fmadd_ps(a._data, b._data, _data);
    }

    // subscript: convenient but goes through memory, avoid in hot loops
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // in-place arithmetic
    inline void operator+=(avx512Float16 rhs)
    {
        _data = _mm512_add_ps(_data, rhs._data);
    }

    inline void operator-=(avx512Float16 rhs)
    {
        _data = _mm512_sub_ps(_data, rhs._data);
    }

    inline void operator*=(avx512Float16 rhs)
    {
        _data = _mm512_mul_ps(_data, rhs._data);
    }

    inline void operator/=(avx512Float16 rhs)
    {
        _data = _mm512_div_ps(_data, rhs._data);
    }
};
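// Usage sketch (illustrative only; src and dst are hypothetical buffers):
// avx512Float16 mirrors avx512Double8 with sixteen single-precision lanes and
// 32-bit gather/scatter indices.
//
//   alignas(avx512Float16::alignment) std::uint32_t idx[16] = {
//       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
//   avx512Int16<std::uint32_t> indices(idx);
//   avx512Float16 v;
//   v.gather(src, indices);   // src is a const float*; v[j] = src[idx[j]]
//   v.scatter(dst, indices);  // dst is a float*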
inline avx512Float16 operator+(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_add_ps(lhs._data, rhs._data);
}

inline avx512Float16 operator-(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_sub_ps(lhs._data, rhs._data);
}

inline avx512Float16 operator-(avx512Float16 in)
{
    return _mm512_sub_ps(_mm512_set1_ps(-0.0), in._data);
}

inline avx512Float16 operator*(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_mul_ps(lhs._data, rhs._data);
}

inline avx512Float16 operator/(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_div_ps(lhs._data, rhs._data);
}

inline avx512Float16 sqrt(avx512Float16 in)
{
    return _mm512_sqrt_ps(in._data);
}

inline avx512Float16 abs(avx512Float16 in)
{
    return _mm512_abs_ps(in._data);
}

inline avx512Float16 min(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_min_ps(lhs._data, rhs._data);
}

inline avx512Float16 max(avx512Float16 lhs, avx512Float16 rhs)
{
    return _mm512_max_ps(lhs._data, rhs._data);
}
inline avx512Float16 log(avx512Float16 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm512_log_ps(in._data);
#else
    // no AVX512 log intrinsic without SVML: scalar fallback through memory
    alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
    in.store(tmp, is_aligned);
    tmp[0]  = std::log(tmp[0]);
    tmp[1]  = std::log(tmp[1]);
    tmp[2]  = std::log(tmp[2]);
    tmp[3]  = std::log(tmp[3]);
    tmp[4]  = std::log(tmp[4]);
    tmp[5]  = std::log(tmp[5]);
    tmp[6]  = std::log(tmp[6]);
    tmp[7]  = std::log(tmp[7]);
    tmp[8]  = std::log(tmp[8]);
    tmp[9]  = std::log(tmp[9]);
    tmp[10] = std::log(tmp[10]);
    tmp[11] = std::log(tmp[11]);
    tmp[12] = std::log(tmp[12]);
    tmp[13] = std::log(tmp[13]);
    tmp[14] = std::log(tmp[14]);
    tmp[15] = std::log(tmp[15]);
    avx512Float16 ret;
    ret.load(tmp, is_aligned);
    return ret;
#endif
}
inline void load_unalign_interleave(
    const double *in, const std::uint32_t dataLen,
    std::vector<avx512Float16, allocator<avx512Float16>> &out)
{
    alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        tmp[0]  = in[i];
        tmp[1]  = in[i + dataLen];
        tmp[2]  = in[i + 2 * dataLen];
        tmp[3]  = in[i + 3 * dataLen];
        tmp[4]  = in[i + 4 * dataLen];
        tmp[5]  = in[i + 5 * dataLen];
        tmp[6]  = in[i + 6 * dataLen];
        tmp[7]  = in[i + 7 * dataLen];
        tmp[8]  = in[i + 8 * dataLen];
        tmp[9]  = in[i + 9 * dataLen];
        tmp[10] = in[i + 10 * dataLen];
        tmp[11] = in[i + 11 * dataLen];
        tmp[12] = in[i + 12 * dataLen];
        tmp[13] = in[i + 13 * dataLen];
        tmp[14] = in[i + 14 * dataLen];
        tmp[15] = in[i + 15 * dataLen];
        out[i].load(tmp, is_aligned);
    }
}
inline void load_interleave(
    const float *in, std::uint32_t dataLen,
    std::vector<avx512Float16, allocator<avx512Float16>> &out)
{
    alignas(avx512Float16::alignment)
        avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
            0,            dataLen,      2 * dataLen,  3 * dataLen,
            4 * dataLen,  5 * dataLen,  6 * dataLen,  7 * dataLen,
            8 * dataLen,  9 * dataLen,  10 * dataLen, 11 * dataLen,
            12 * dataLen, 13 * dataLen, 14 * dataLen, 15 * dataLen};

    using index_t = avx512Int16<avx512Float16::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    constexpr uint16_t unrl = 4;
    size_t nBlocks          = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }

    // spillover loop
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}
inline void deinterleave_unalign_store(
    const std::vector<avx512Float16, allocator<avx512Float16>> &in,
    const std::uint32_t dataLen, double *out)
{
    alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].store(tmp, is_aligned);
        out[i]                = tmp[0];
        out[i + dataLen]      = tmp[1];
        out[i + 2 * dataLen]  = tmp[2];
        out[i + 3 * dataLen]  = tmp[3];
        out[i + 4 * dataLen]  = tmp[4];
        out[i + 5 * dataLen]  = tmp[5];
        out[i + 6 * dataLen]  = tmp[6];
        out[i + 7 * dataLen]  = tmp[7];
        out[i + 8 * dataLen]  = tmp[8];
        out[i + 9 * dataLen]  = tmp[9];
        out[i + 10 * dataLen] = tmp[10];
        out[i + 11 * dataLen] = tmp[11];
        out[i + 12 * dataLen] = tmp[12];
        out[i + 13 * dataLen] = tmp[13];
        out[i + 14 * dataLen] = tmp[14];
        out[i + 15 * dataLen] = tmp[15];
    }
}
inline void deinterleave_store(
    const std::vector<avx512Float16, allocator<avx512Float16>> &in,
    std::uint32_t dataLen, float *out)
{
    alignas(avx512Float16::alignment)
        avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
            0,            dataLen,      2 * dataLen,  3 * dataLen,
            4 * dataLen,  5 * dataLen,  6 * dataLen,  7 * dataLen,
            8 * dataLen,  9 * dataLen,  10 * dataLen, 11 * dataLen,
            12 * dataLen, 13 * dataLen, 14 * dataLen, 15 * dataLen};
    using index_t = avx512Int16<avx512Float16::scalarIndexType>;

    index_t index0(tmp);
    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
////////////////////////////////////////////////////////////////////////////////
// mask types: vectors of all-ones / all-zeros lanes used as booleans
////////////////////////////////////////////////////////////////////////////////

struct avx512Mask8 : avx512Long8<std::uint64_t>
{
    // bring in the avx512Long8 ctors
    using avx512Long8::avx512Long8;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};

inline avx512Mask8 operator>(avx512Double8 lhs, avx512Double8 rhs)
{
    __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
    return _mm512_maskz_set1_epi64(mask, avx512Mask8::true_v);
}

inline bool operator&&(avx512Mask8 lhs, bool rhs)
{
    __m512i val_true = _mm512_set1_epi64(avx512Mask8::true_v);
    __mmask8 mask    = _mm512_test_epi64_mask(lhs._data, val_true);
    unsigned int tmp = _cvtmask16_u32(mask);
    return tmp && rhs;
}
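// Usage sketch (illustrative only): a comparison yields a mask vector whose
// lanes are all ones (true_v) or all zeros; as written above, operator&&
// collapses it to a scalar bool so it can drive an ordinary branch, and the
// result is true when at least one lane is set.
//
//   avx512Double8 a(1.0), b(2.0);
//   avx512Mask8 m = b > a;   // every lane true
//   if (m && true)
//   {
//       // taken when at least one lane of m is non-zero
//   }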
struct avx512Mask16 : avx512Int16<std::uint32_t>
{
    // bring in the avx512Int16 ctors
    using avx512Int16::avx512Int16;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};

inline avx512Mask16 operator>(avx512Float16 lhs, avx512Float16 rhs)
{
    __mmask16 mask = _mm512_cmp_ps_mask(lhs._data, rhs._data, _CMP_GT_OQ);
    return _mm512_maskz_set1_epi32(mask, avx512Mask16::true_v);
}

inline bool operator&&(avx512Mask16 lhs, bool rhs)
{
    __m512i val_true = _mm512_set1_epi32(avx512Mask16::true_v);
    __mmask16 mask   = _mm512_test_epi32_mask(lhs._data, val_true);
    unsigned int tmp = _cvtmask16_u32(mask);
    return tmp && rhs;
}

#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H