#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif

#include "allocator.hpp"
#include "traits.hpp"
#include <cmath>
#include <cstdint>
#include <type_traits>
#include <vector>
// The abstract-to-concrete type mapping; the unspecialized default maps to
// void, i.e. "no SVE backend available".
template <typename scalarType, int width = 0> struct sve
{
    using type = void;
};

#if __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)
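// Everything below requires a fixed vector length: __ARM_FEATURE_SVE_BITS is
// only non-zero when the compiler pins the SVE width, e.g. (assuming a
// GCC/clang-style toolchain) via -msve-vector-bits=512. Vector-length-agnostic
// builds fall through to the empty default trait above.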
// from VLA to VLST: sizeless SVE types cannot be class data members, so the
// vector length is fixed at compile time via the arm_sve_vector_bits attribute
typedef svfloat64_t svfloat64_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svint64_t svint64_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svuint64_t svuint64_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat32_t svfloat32_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svint32_t svint32_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svuint32_t svuint32_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svbool_t svbool_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
// forward declaration of the concrete vector types
template <typename T> struct sveInt64;
template <typename T> struct sveInt32;
struct sveFloat64;
struct sveFloat32;
struct sveMask64;
struct sveMask32;
// mapping between abstract types and concrete floating point types
template <> struct sve<double>
{
    using type = sveFloat64;
};
template <> struct sve<float>
{
    using type = sveFloat32;
};

// generic index mapping
template <> struct sve<std::int64_t>
{
    using type = sveInt64<std::int64_t>;
};
template <> struct sve<std::uint64_t>
{
    using type = sveInt64<std::uint64_t>;
};
template <> struct sve<std::int32_t>
{
    using type = sveInt32<std::int32_t>;
};
template <> struct sve<std::uint32_t>
{
    using type = sveInt32<std::uint32_t>;
};

// specialized index mapping
template <> struct sve<std::int64_t, __ARM_FEATURE_SVE_BITS / 64>
{
    using type = sveInt64<std::int64_t>;
};
template <> struct sve<std::uint64_t, __ARM_FEATURE_SVE_BITS / 64>
{
    using type = sveInt64<std::uint64_t>;
};
// the number of lanes dictates the simd type: a 32-bit index used with
// 64-bit lanes is widened (sign- or zero-extended) on load
template <> struct sve<std::int32_t, __ARM_FEATURE_SVE_BITS / 64>
{
    using type = sveInt64<std::int64_t>;
};
template <> struct sve<std::uint32_t, __ARM_FEATURE_SVE_BITS / 64>
{
    using type = sveInt64<std::uint64_t>;
};
template <> struct sve<std::int32_t, __ARM_FEATURE_SVE_BITS / 32>
{
    using type = sveInt32<std::int32_t>;
};
template <> struct sve<std::uint32_t, __ARM_FEATURE_SVE_BITS / 32>
{
    using type = sveInt32<std::uint32_t>;
};

// bool mapping
template <> struct sve<bool, __ARM_FEATURE_SVE_BITS / 64>
{
    using type = sveMask64;
};
template <> struct sve<bool, __ARM_FEATURE_SVE_BITS / 32>
{
    using type = sveMask32;
};
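// Usage sketch (illustrative, not part of the header): the trait resolves
// abstract scalar/width pairs to the concrete wrappers defined below. The
// aliases are hypothetical, and the size_t line assumes an LP64 target where
// size_t is std::uint64_t.
//
//   using vec_t  = typename sve<double>::type;                 // sveFloat64
//   using idx_t  = typename sve<size_t, vec_t::width>::type;   // sveInt64
//   using mask_t = typename sve<bool, vec_t::width>::type;     // sveMask64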
// concrete types
template <typename T> struct sveInt32
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int alignment =
        __ARM_FEATURE_SVE_BITS / sizeof(T);
    static constexpr unsigned int width = alignment / 8;

    using scalarType = T;
    using vectorType =
        typename std::conditional<std::is_signed<T>::value, svint32_vlst_t,
                                  svuint32_vlst_t>::type;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;
    // ctors
    inline sveInt32() = default;
    inline sveInt32(const sveInt32 &rhs) = default;
    inline sveInt32(const vectorType &rhs) : _data(rhs)
    {
    }
    inline sveInt32(const scalarType rhs)
    {
        _data = svdup_s32(rhs);
    }
    explicit inline sveInt32(scalarArray &rhs)
    {
        _data = svld1(svptrue_b32(), rhs);
    }
    // store packed
    inline void store(scalarType *p) const
    {
        svst1(svptrue_b32(), p, _data);
    }

    // tag-dispatched overloads: SVE has no alignment requirements, so all
    // flavours are identical to the packed version
    template <typename TAG,
              typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
    inline void store(scalarType *p, TAG) const
    {
        svst1(svptrue_b32(), p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = svld1(svptrue_b32(), p);
    }

    template <typename TAG,
              typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
    inline void load(const scalarType *p, TAG)
    {
        _data = svld1(svptrue_b32(), p);
    }
    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = svdup_s32(rhs);
    }

    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
    // unary ops
    inline void operator+=(sveInt32 rhs)
    {
        _data = svadd_x(svptrue_b32(), _data, rhs._data);
    }

    inline void operator-=(sveInt32 rhs)
    {
        _data = svsub_x(svptrue_b32(), _data, rhs._data);
    }

    inline void operator*=(sveInt32 rhs)
    {
        _data = svmul_x(svptrue_b32(), _data, rhs._data);
    }

    inline void operator/=(sveInt32 rhs)
    {
        _data = svdiv_x(svptrue_b32(), _data, rhs._data);
    }
};
template <typename T>
inline sveInt32<T> operator+(sveInt32<T> lhs, sveInt32<T> rhs)
{
    return svadd_x(svptrue_b32(), lhs._data, rhs._data);
}

template <typename T>
inline sveInt32<T> operator+(sveInt32<T> lhs, T rhs)
{
    return svadd_x(svptrue_b32(), lhs._data, sveInt32<T>(rhs)._data);
}

template <typename T>
inline sveInt32<T> operator-(sveInt32<T> lhs, sveInt32<T> rhs)
{
    return svsub_x(svptrue_b32(), lhs._data, rhs._data);
}

template <typename T>
inline sveInt32<T> operator*(sveInt32<T> lhs, sveInt32<T> rhs)
{
    return svmul_x(svptrue_b32(), lhs._data, rhs._data);
}

template <typename T>
inline sveInt32<T> operator/(sveInt32<T> lhs, sveInt32<T> rhs)
{
    return svdiv_x(svptrue_b32(), lhs._data, rhs._data);
}

template <typename T>
inline sveInt32<T> abs(sveInt32<T> in)
{
    return svabs_x(svptrue_b32(), in._data);
}
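// Illustrative sketch (not part of the header): sveInt32 behaves like a
// fixed-width pack of 32-bit integers; `a`, `b`, `c` are hypothetical names.
//
//   sveInt32<std::int32_t> a(2), b(3);
//   a += b;          // lane-wise svadd: every lane now holds 5
//   auto c = a * b;  // lane-wise svmul: every lane of c holds 15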
template <typename T> struct sveInt64
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int alignment =
        __ARM_FEATURE_SVE_BITS / sizeof(T);
    static constexpr unsigned int width = alignment / 8;

    using scalarType = T;
    using vectorType =
        typename std::conditional<std::is_signed<T>::value, svint64_vlst_t,
                                  svuint64_vlst_t>::type;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;
    // ctors
    inline sveInt64() = default;
    inline sveInt64(const sveInt64 &rhs) = default;
    inline sveInt64(const vectorType &rhs) : _data(rhs)
    {
    }
    inline sveInt64(const scalarType rhs)
    {
        _data = svdup_s64(rhs);
    }
    explicit inline sveInt64(scalarArray &rhs)
    {
        _data = svld1(svptrue_b64(), rhs);
    }
    // store packed
    inline void store(scalarType *p) const
    {
        svst1(svptrue_b64(), p, _data);
    }

    // tag-dispatched overloads: SVE has no alignment requirements, so all
    // flavours are identical to the packed version
    template <typename TAG,
              typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
    inline void store(scalarType *p, TAG) const
    {
        svst1(svptrue_b64(), p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = svld1(svptrue_b64(), p);
    }

    template <typename TAG,
              typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
    inline void load(const scalarType *p, TAG)
    {
        _data = svld1(svptrue_b64(), p);
    }
    // load packed from a 32-bit array, sign-extending to 64 bits
    template <typename I32,
              typename std::enable_if<std::is_integral<I32>::value &&
                                          std::is_signed<scalarType>::value &&
                                          sizeof(I32) == 4,
                                      bool>::type = 0>
    inline void load(const I32 *p)
    {
        _data = svld1sw_s64(svptrue_b64(), p);
    }

    // load packed from a 32-bit array, zero-extending to 64 bits
    // (svld1uw_u64 so the result matches the unsigned vectorType)
    template <typename I32,
              typename std::enable_if<std::is_integral<I32>::value &&
                                          !std::is_signed<scalarType>::value &&
                                          sizeof(I32) == 4,
                                      bool>::type = 0>
    inline void load(const I32 *p)
    {
        _data = svld1uw_u64(svptrue_b64(), p);
    }

    // tag-dispatched variants of the widening loads
    template <typename I32, typename TAG,
              typename std::enable_if<
                  is_load_tag<TAG>::value && std::is_integral<I32>::value &&
                      std::is_signed<scalarType>::value && sizeof(I32) == 4,
                  bool>::type = 0>
    inline void load(const I32 *p, TAG)
    {
        _data = svld1sw_s64(svptrue_b64(), p);
    }

    template <typename I32, typename TAG,
              typename std::enable_if<
                  is_load_tag<TAG>::value && std::is_integral<I32>::value &&
                      !std::is_signed<scalarType>::value && sizeof(I32) == 4,
                  bool>::type = 0>
    inline void load(const I32 *p, TAG)
    {
        _data = svld1uw_u64(svptrue_b64(), p);
    }
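    // Illustrative sketch (not part of the header): with 64-bit lanes, an
    // index vector can be filled straight from a 32-bit array; the signedness
    // of the scalar type picks sign- or zero-extension. `offsets32` and `idx`
    // are hypothetical names.
    //
    //   std::int32_t offsets32[sveInt64<std::int64_t>::width] = {0, 1, 2, 3};
    //   sveInt64<std::int64_t> idx;
    //   idx.load(offsets32); // svld1sw_s64: sign-extends each element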
    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = svdup_s64(rhs);
    }

    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }
    // unary ops
    inline void operator+=(sveInt64 rhs)
    {
        _data = svadd_x(svptrue_b64(), _data, rhs._data);
    }

    inline void operator-=(sveInt64 rhs)
    {
        _data = svsub_x(svptrue_b64(), _data, rhs._data);
    }

    inline void operator*=(sveInt64 rhs)
    {
        _data = svmul_x(svptrue_b64(), _data, rhs._data);
    }

    inline void operator/=(sveInt64 rhs)
    {
        _data = svdiv_x(svptrue_b64(), _data, rhs._data);
    }
};
template <typename T>
inline sveInt64<T> operator+(sveInt64<T> lhs, sveInt64<T> rhs)
{
    return svadd_x(svptrue_b64(), lhs._data, rhs._data);
}

template <typename T>
inline sveInt64<T> operator+(sveInt64<T> lhs, T rhs)
{
    return svadd_x(svptrue_b64(), lhs._data, sveInt64<T>(rhs)._data);
}

template <typename T>
inline sveInt64<T> operator-(sveInt64<T> lhs, sveInt64<T> rhs)
{
    return svsub_x(svptrue_b64(), lhs._data, rhs._data);
}

template <typename T>
inline sveInt64<T> operator*(sveInt64<T> lhs, sveInt64<T> rhs)
{
    return svmul_x(svptrue_b64(), lhs._data, rhs._data);
}

template <typename T>
inline sveInt64<T> operator/(sveInt64<T> lhs, sveInt64<T> rhs)
{
    return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
}

template <typename T>
inline sveInt64<T> abs(sveInt64<T> in)
{
    return svabs_x(svptrue_b64(), in._data);
}
struct sveFloat32
{
    static constexpr unsigned int alignment =
        __ARM_FEATURE_SVE_BITS / sizeof(float);
    static constexpr unsigned int width = alignment / 8;

    using scalarType      = float;
    using scalarIndexType = std::uint32_t;
    using vectorType      = svfloat32_vlst_t;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;
    // ctors
    inline sveFloat32() = default;
    inline sveFloat32(const sveFloat32 &rhs) = default;
    inline sveFloat32(const vectorType &rhs) : _data(rhs)
    {
    }
    inline sveFloat32(const scalarType rhs)
    {
        _data = svdup_f32(rhs);
    }
    // store packed
    inline void store(scalarType *p) const
    {
        svst1_f32(svptrue_b32(), p, _data);
    }

    // tag-dispatched overloads: SVE has no alignment requirements
    template <typename T,
              typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
    inline void store(scalarType *p, T) const
    {
        svst1_f32(svptrue_b32(), p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = svld1_f32(svptrue_b32(), p);
    }

    template <typename T,
              typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
    inline void load(const scalarType *p, T)
    {
        _data = svld1_f32(svptrue_b32(), p);
    }
    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = svdup_f32(rhs);
    }
    // gather/scatter with 32-bit indices
    template <typename T>
    inline void gather(scalarType const *p, const sveInt32<T> &indices)
    {
        _data = svld1_gather_index(svptrue_b32(), p, indices._data);
    }

    template <typename T>
    inline void scatter(scalarType *out, const sveInt32<T> &indices) const
    {
        svst1_scatter_index(svptrue_b32(), out, indices._data, _data);
    }
    // fused multiply-add: this = a * b + this
    inline void fma(const sveFloat32 &a, const sveFloat32 &b)
    {
        _data = svmad_x(svptrue_b32(), a._data, b._data, _data);
    }
    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
    // unary ops
    inline void operator+=(sveFloat32 rhs)
    {
        _data = svadd_x(svptrue_b32(), _data, rhs._data);
    }

    inline void operator-=(sveFloat32 rhs)
    {
        _data = svsub_x(svptrue_b32(), _data, rhs._data);
    }

    inline void operator*=(sveFloat32 rhs)
    {
        _data = svmul_x(svptrue_b32(), _data, rhs._data);
    }

    inline void operator/=(sveFloat32 rhs)
    {
        _data = svdiv_x(svptrue_b32(), _data, rhs._data);
    }
};
inline sveFloat32 operator+(sveFloat32 lhs, sveFloat32 rhs)
{
    return svadd_x(svptrue_b32(), lhs._data, rhs._data);
}

inline sveFloat32 operator-(sveFloat32 lhs, sveFloat32 rhs)
{
    return svsub_x(svptrue_b32(), lhs._data, rhs._data);
}

inline sveFloat32 operator*(sveFloat32 lhs, sveFloat32 rhs)
{
    return svmul_x(svptrue_b32(), lhs._data, rhs._data);
}

inline sveFloat32 operator/(sveFloat32 lhs, sveFloat32 rhs)
{
    return svdiv_x(svptrue_b32(), lhs._data, rhs._data);
}

inline sveFloat32 sqrt(sveFloat32 in)
{
    return svsqrt_x(svptrue_b32(), in._data);
}

inline sveFloat32 abs(sveFloat32 in)
{
    return svabs_x(svptrue_b32(), in._data);
}
inline sveFloat32 log(sveFloat32 in)
{
    // there is no sve log intrinsic; fall back to scalar std::log per lane
    alignas(sveFloat32::alignment) sveFloat32::scalarArray tmp;
    in.store(tmp);
    for (size_t i = 0; i < sveFloat32::width; ++i)
    {
        tmp[i] = std::log(tmp[i]);
    }
    sveFloat32 ret;
    ret.load(tmp);
    return ret;
}
inline void load_interleave(const float *in, std::uint32_t dataLen,
                            std::vector<sveFloat32, allocator<sveFloat32>> &out)
{
    alignas(sveFloat32::alignment)
        sveFloat32::scalarIndexType tmp[sveFloat32::width] = {};

    // populate the scalar index array (width known at compile time)
    for (size_t i = 0; i < sveFloat32::width; ++i)
    {
        tmp[i] = i * dataLen;
    }

    using index_t = sveInt32<sveFloat32::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1u;

    // 2x unrolled loop
    size_t nBlocks = dataLen / 2;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[2 * i + 0].gather(in, index0);
        out[2 * i + 1].gather(in, index1);
        index0 = index0 + 2u;
        index1 = index1 + 2u;
    }

    // spillover loop
    for (size_t i = 2 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1u;
    }
}
inline void deinterleave_store(
    const std::vector<sveFloat32, allocator<sveFloat32>> &in,
    std::uint32_t dataLen, float *out)
{
    alignas(sveFloat32::alignment)
        sveFloat32::scalarIndexType tmp[sveFloat32::width] = {};

    // populate the scalar index array (width known at compile time)
    for (size_t i = 0; i < sveFloat32::width; ++i)
    {
        tmp[i] = i * dataLen;
    }

    using index_t = sveInt32<sveFloat32::scalarIndexType>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1u;
    }
}
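// Illustrative round trip (not part of the header): gather `n` interleaved
// scalars per lane into SIMD registers, operate, then scatter them back.
// `data`, `work`, and `n` are hypothetical names.
//
//   std::uint32_t n = 4; // scalars per lane
//   std::vector<float> data(n * sveFloat32::width);
//   std::vector<sveFloat32, allocator<sveFloat32>> work(n);
//   load_interleave(data.data(), n, work);
//   deinterleave_store(work, n, data.data());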
struct sveFloat64
{
    static constexpr unsigned int alignment =
        __ARM_FEATURE_SVE_BITS / sizeof(double);
    static constexpr unsigned int width = alignment / 8;

    using scalarType      = double;
    using scalarIndexType = std::uint64_t;
    using vectorType      = svfloat64_vlst_t;
    using scalarArray     = scalarType[width];

    // storage
    vectorType _data;
    // ctors
    inline sveFloat64() = default;
    inline sveFloat64(const sveFloat64 &rhs) = default;
    inline sveFloat64(const vectorType &rhs) : _data(rhs)
    {
    }
    inline sveFloat64(const scalarType rhs)
    {
        _data = svdup_f64(rhs);
    }
    // store packed
    inline void store(scalarType *p) const
    {
        svst1_f64(svptrue_b64(), p, _data);
    }

    // tag-dispatched overloads: SVE has no alignment requirements
    template <typename T,
              typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
    inline void store(scalarType *p, T) const
    {
        svst1_f64(svptrue_b64(), p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = svld1_f64(svptrue_b64(), p);
    }

    template <typename T,
              typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
    inline void load(const scalarType *p, T)
    {
        _data = svld1_f64(svptrue_b64(), p);
    }
    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = svdup_f64(rhs);
    }
    // gather/scatter with 64-bit indices
    template <typename T>
    inline void gather(scalarType const *p, const sveInt64<T> &indices)
    {
        _data = svld1_gather_index(svptrue_b64(), p, indices._data);
    }

    template <typename T>
    inline void scatter(scalarType *out, const sveInt64<T> &indices) const
    {
        svst1_scatter_index(svptrue_b64(), out, indices._data, _data);
    }
    // fused multiply-add: this = a * b + this
    inline void fma(const sveFloat64 &a, const sveFloat64 &b)
    {
        _data = svmad_x(svptrue_b64(), a._data, b._data, _data);
    }
    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
    // unary ops
    inline void operator+=(sveFloat64 rhs)
    {
        _data = svadd_x(svptrue_b64(), _data, rhs._data);
    }

    inline void operator-=(sveFloat64 rhs)
    {
        _data = svsub_x(svptrue_b64(), _data, rhs._data);
    }

    inline void operator*=(sveFloat64 rhs)
    {
        _data = svmul_x(svptrue_b64(), _data, rhs._data);
    }

    inline void operator/=(sveFloat64 rhs)
    {
        _data = svdiv_x(svptrue_b64(), _data, rhs._data);
    }
};
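// Illustrative kernel sketch (not part of the header): a fused
// multiply-accumulate over packed doubles; `x`, `y`, `acc`, `data`, and
// `result` are hypothetical names.
//
//   sveFloat64 x, y(2.0), acc(0.0);
//   x.load(&data[i]);   // packed load; SVE imposes no alignment requirement
//   acc.fma(x, y);      // acc = x * y + acc
//   acc.store(&result[i]);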
inline sveFloat64 operator+(sveFloat64 lhs, sveFloat64 rhs)
{
    return svadd_x(svptrue_b64(), lhs._data, rhs._data);
}

inline sveFloat64 operator-(sveFloat64 lhs, sveFloat64 rhs)
{
    return svsub_x(svptrue_b64(), lhs._data, rhs._data);
}

inline sveFloat64 operator*(sveFloat64 lhs, sveFloat64 rhs)
{
    return svmul_x(svptrue_b64(), lhs._data, rhs._data);
}

inline sveFloat64 operator/(sveFloat64 lhs, sveFloat64 rhs)
{
    return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
}

inline sveFloat64 sqrt(sveFloat64 in)
{
    return svsqrt_x(svptrue_b64(), in._data);
}

inline sveFloat64 abs(sveFloat64 in)
{
    return svabs_x(svptrue_b64(), in._data);
}
inline sveFloat64 log(sveFloat64 in)
{
    // there is no sve log intrinsic; fall back to scalar std::log per lane
    alignas(sveFloat64::alignment) sveFloat64::scalarArray tmp;
    in.store(tmp);
    for (size_t i = 0; i < sveFloat64::width; ++i)
    {
        tmp[i] = std::log(tmp[i]);
    }
    sveFloat64 ret;
    ret.load(tmp);
    return ret;
}
inline void load_interleave(const double *in, std::uint32_t dataLen,
                            std::vector<sveFloat64, allocator<sveFloat64>> &out)
{
    alignas(sveFloat64::alignment) size_t tmp[sveFloat64::width] = {};

    // populate the scalar index array (width known at compile time)
    for (size_t i = 0; i < sveFloat64::width; ++i)
    {
        tmp[i] = i * dataLen;
    }

    using index_t = sveInt64<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1ul;

    // 2x unrolled loop
    size_t nBlocks = dataLen / 2;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[2 * i + 0].gather(in, index0);
        out[2 * i + 1].gather(in, index1);
        index0 = index0 + 2ul;
        index1 = index1 + 2ul;
    }

    // spillover loop
    for (size_t i = 2 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1ul;
    }
}
inline void deinterleave_store(
    const std::vector<sveFloat64, allocator<sveFloat64>> &in,
    std::uint32_t dataLen, double *out)
{
    alignas(sveFloat64::alignment) size_t tmp[sveFloat64::width] = {};

    // populate the scalar index array (width known at compile time)
    for (size_t i = 0; i < sveFloat64::width; ++i)
    {
        tmp[i] = i * dataLen;
    }

    using index_t = sveInt64<size_t>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1ul;
    }
}
// mask type: a mask is an integer type with special semantics
struct sveMask64 : sveInt64<std::uint64_t>
{
    // bring in the sveInt64 constructors
    using sveInt64::sveInt64;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};
inline sveMask64 operator>(sveFloat64 lhs, sveFloat64 rhs)
{
    // set active lanes where lhs > rhs
    svbool_vlst_t mask = svcmpgt(svptrue_b64(), lhs._data, rhs._data);
    // convert the predicate into a vector of true_v/false_v
    sveMask64::vectorType sveTrue_v = svdup_u64(sveMask64::true_v);
    return svand_z(mask, sveTrue_v, sveTrue_v);
}

inline bool operator&&(sveMask64 lhs, bool rhs)
{
    // are any lanes non-zero?
    sveMask64::vectorType sveFalse_v = svdup_u64(sveMask64::false_v);
    svbool_vlst_t mask = svcmpne(svptrue_b64(), lhs._data, sveFalse_v);
    bool tmp = svptest_any(svptrue_b64(), mask);
    return tmp && rhs;
}
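// Illustrative sketch (not part of the header): comparisons yield a mask,
// and the mask-to-bool && lets scalar-style convergence checks compile
// unchanged for SIMD types. `residual` and `tol` are hypothetical names.
//
//   sveFloat64 residual, tol(1e-12);
//   bool anyAbove = (residual > tol) && true; // true if any lane exceeds tol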
struct sveMask32 : sveInt32<std::uint32_t>
{
    // bring in the sveInt32 constructors
    using sveInt32::sveInt32;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};
inline sveMask32 operator>(sveFloat32 lhs, sveFloat32 rhs)
{
    // set active lanes where lhs > rhs
    svbool_vlst_t mask = svcmpgt(svptrue_b32(), lhs._data, rhs._data);
    // convert the predicate into a vector of true_v/false_v
    sveMask32::vectorType sveTrue_v = svdup_u32(sveMask32::true_v);
    return svand_z(mask, sveTrue_v, sveTrue_v);
}

inline bool operator&&(sveMask32 lhs, bool rhs)
{
    // are any lanes non-zero?
    sveMask32::vectorType sveFalse_v = svdup_u32(sveMask32::false_v);
    svbool_vlst_t mask = svcmpne(svptrue_b32(), lhs._data, sveFalse_v);
    bool tmp = svptest_any(svptrue_b32(), mask);
    return tmp && rhs;
}
#endif // __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H