#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif

#include "allocator.hpp"
#include "traits.hpp"
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <vector>

namespace tinysimd
{
namespace abi
{

// generic template: no concrete vector type available for this scalar
template <typename scalarType> struct sve
{
    using type = void;
};

} // namespace abi
#if __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)
// VLA (sizeless) SVE types cannot be used as class members; fix the vector
// length at compile time so that the wrappers below are complete types.
typedef svfloat64_t svfloat64_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svint64_t svint64_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svuint64_t svuint64_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svbool_t svbool_vlst_t
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
// forward declaration of concrete types
template <typename T> struct sveLong;
struct sveDouble;
struct sveMask;
namespace abi
{

// mapping between abstract types and concrete types
template <> struct sve<double>
{
    using type = sveDouble;
};
template <> struct sve<std::int64_t>
{
    using type = sveLong<std::int64_t>;
};
template <> struct sve<std::uint64_t>
{
    using type = sveLong<std::uint64_t>;
};
template <> struct sve<bool>
{
    using type = sveMask;
};

} // namespace abi
// concrete types
template <typename T> struct sveLong
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int alignment =
        __ARM_FEATURE_SVE_BITS / sizeof(T);
    static constexpr unsigned int width = alignment / 8;

    using scalarType = T;
    using vectorType =
        typename std::conditional<std::is_signed<T>::value, svint64_vlst_t,
                                  svuint64_vlst_t>::type;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;
    // ctors
    inline sveLong() = default;
    inline sveLong(const sveLong &rhs) = default;
    inline sveLong(const vectorType &rhs) : _data(rhs)
    {
    }
    inline sveLong(const scalarType rhs)
    {
        // dispatch on signedness so that the dup intrinsic matches
        // vectorType (svdup_s64 alone would not convert for unsigned T)
        if constexpr (std::is_signed<T>::value)
        {
            _data = svdup_s64(rhs);
        }
        else
        {
            _data = svdup_u64(rhs);
        }
    }
    explicit inline sveLong(scalarArray &rhs)
    {
        _data = svld1(svptrue_b64(), rhs);
    }
    // store packed
    inline void store(scalarType *p) const
    {
        svst1(svptrue_b64(), p, _data);
    }
    // store packed, overload with alignment tag dispatch
    template <typename TAG,
              typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
    inline void store(scalarType *p, TAG) const
    {
        svst1(svptrue_b64(), p, _data);
    }
    // load packed
    inline void load(const scalarType *p)
    {
        _data = svld1(svptrue_b64(), p);
    }
    // load packed, overload with alignment tag dispatch
    template <typename TAG,
              typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
    inline void load(const scalarType *p, TAG)
    {
        _data = svld1(svptrue_b64(), p);
    }
    // broadcast a scalar to all lanes, reusing the broadcasting ctor
    inline void broadcast(const scalarType rhs)
    {
        *this = sveLong(rhs);
    }
    // element access (slow: round-trips through memory)
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }
    // update operators
    inline void operator+=(sveLong rhs)
    {
        _data = svadd_x(svptrue_b64(), _data, rhs._data);
    }
    inline void operator-=(sveLong rhs)
    {
        _data = svsub_x(svptrue_b64(), _data, rhs._data);
    }
    inline void operator*=(sveLong rhs)
    {
        _data = svmul_x(svptrue_b64(), _data, rhs._data);
    }
    inline void operator/=(sveLong rhs)
    {
        _data = svdiv_x(svptrue_b64(), _data, rhs._data);
    }
};
template <typename T>
inline sveLong<T> operator+(sveLong<T> lhs, sveLong<T> rhs)
{
    return svadd_x(svptrue_b64(), lhs._data, rhs._data);
}

template <typename T> inline sveLong<T> operator+(sveLong<T> lhs, T rhs)
{
    return svadd_x(svptrue_b64(), lhs._data, sveLong<T>(rhs)._data);
}

template <typename T>
inline sveLong<T> operator-(sveLong<T> lhs, sveLong<T> rhs)
{
    return svsub_x(svptrue_b64(), lhs._data, rhs._data);
}

template <typename T>
inline sveLong<T> operator*(sveLong<T> lhs, sveLong<T> rhs)
{
    return svmul_x(svptrue_b64(), lhs._data, rhs._data);
}

template <typename T>
inline sveLong<T> operator/(sveLong<T> lhs, sveLong<T> rhs)
{
    return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
}

// N.B. svabs is only defined for signed element types; only instantiate
// this for signed T
template <typename T> inline sveLong<T> abs(sveLong<T> in)
{
    return svabs_x(svptrue_b64(), in._data);
}
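// Usage sketch (illustrative, not part of the original header): index
// arithmetic with sveLong; the variable names are hypothetical.
//
//   alignas(sveLong<std::int64_t>::alignment)
//       std::int64_t raw[sveLong<std::int64_t>::width] = {0, 1, 2, 3};
//   sveLong<std::int64_t> idx(raw);    // explicit load from aligned array
//   idx += sveLong<std::int64_t>(10);  // lane-wise add of a broadcast value
//   idx = idx + std::int64_t(2);       // scalar overload of operator+
//   idx.store(raw);                    // write lanes back to memory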
struct sveDouble
{
    static constexpr unsigned int alignment =
        __ARM_FEATURE_SVE_BITS / sizeof(double);
    static constexpr unsigned int width = alignment / 8;

    using scalarType = double;
    using vectorType = svfloat64_vlst_t;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;
    // ctors
    inline sveDouble() = default;
    inline sveDouble(const sveDouble &rhs) = default;
    inline sveDouble(const vectorType &rhs) : _data(rhs)
    {
    }
    inline sveDouble(const scalarType rhs)
    {
        _data = svdup_f64(rhs);
    }
    // store packed
    inline void store(scalarType *p) const
    {
        svst1_f64(svptrue_b64(), p, _data);
    }

    // store packed, overload with alignment tag dispatch
    template <typename T,
              typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
    inline void store(scalarType *p, T) const
    {
        svst1_f64(svptrue_b64(), p, _data);
    }
    // load packed
    inline void load(const scalarType *p)
    {
        _data = svld1_f64(svptrue_b64(), p);
    }

    // load packed, overload with alignment tag dispatch
    template <typename T,
              typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
    inline void load(const scalarType *p, T)
    {
        _data = svld1_f64(svptrue_b64(), p);
    }
    // broadcast a scalar to all lanes
    inline void broadcast(const scalarType rhs)
    {
        _data = svdup_f64(rhs);
    }
    // gather: lane i reads p[indices[i]]
    template <typename T>
    inline void gather(scalarType const *p, const sveLong<T> &indices)
    {
        _data = svld1_gather_index(svptrue_b64(), p, indices._data);
    }
    // scatter: lane i writes to out[indices[i]]
    template <typename T>
    inline void scatter(scalarType *out, const sveLong<T> &indices) const
    {
        svst1_scatter_index(svptrue_b64(), out, indices._data, _data);
    }
    // fused multiply-add: this = a * b + this
    inline void fma(const sveDouble &a, const sveDouble &b)
    {
        _data = svmad_x(svptrue_b64(), a._data, b._data, _data);
    }
    // element access (slow: round-trips through memory)
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp);
        return tmp[i];
    }
    // update operators
    inline void operator+=(sveDouble rhs)
    {
        _data = svadd_x(svptrue_b64(), _data, rhs._data);
    }
    inline void operator-=(sveDouble rhs)
    {
        _data = svsub_x(svptrue_b64(), _data, rhs._data);
    }
    inline void operator*=(sveDouble rhs)
    {
        _data = svmul_x(svptrue_b64(), _data, rhs._data);
    }
    inline void operator/=(sveDouble rhs)
    {
        _data = svdiv_x(svptrue_b64(), _data, rhs._data);
    }
};
inline sveDouble operator+(sveDouble lhs, sveDouble rhs)
{
    return svadd_x(svptrue_b64(), lhs._data, rhs._data);
}

inline sveDouble operator-(sveDouble lhs, sveDouble rhs)
{
    return svsub_x(svptrue_b64(), lhs._data, rhs._data);
}

inline sveDouble operator*(sveDouble lhs, sveDouble rhs)
{
    return svmul_x(svptrue_b64(), lhs._data, rhs._data);
}

inline sveDouble operator/(sveDouble lhs, sveDouble rhs)
{
    return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
}
inline sveDouble sqrt(sveDouble in)
{
    return svsqrt_x(svptrue_b64(), in._data);
}

inline sveDouble abs(sveDouble in)
{
    return svabs_x(svptrue_b64(), in._data);
}
inline sveDouble log(sveDouble in)
{
    // no SVE intrinsic for log: fall back to scalar evaluation through memory
    alignas(sveDouble::alignment) sveDouble::scalarArray tmp;
    in.store(tmp);
    for (size_t i = 0; i < sveDouble::width; ++i)
    {
        tmp[i] = std::log(tmp[i]);
    }
    sveDouble ret;
    ret.load(tmp);
    return ret;
}
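// Usage sketch (illustrative, not part of the original header): lane-wise
// arithmetic and math on sveDouble; the variable names are hypothetical.
//
//   sveDouble a(2.0), b(0.5);          // broadcast ctors
//   sveDouble c = a * b + sveDouble(1.0);
//   c.fma(a, b);                       // c = a * b + c via svmad
//   sveDouble r = sqrt(abs(c));        // lane-wise intrinsics
//   sveDouble l = log(c);              // scalar fallback above
//   double first = r[0];               // element access through memory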
inline void load_interleave(const double *in, size_t dataLen,
                            std::vector<sveDouble, allocator<sveDouble>> &out)
{
    alignas(sveDouble::alignment) size_t tmp[sveDouble::width] = {};

    // populate scalar index: lane i starts at in + i * dataLen
    for (size_t i = 0; i < sveDouble::width; ++i)
    {
        tmp[i] = i * dataLen;
    }

    using index_t = sveLong<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1ul;

    // 2x unrolled loop
    size_t nBlocks = dataLen / 2;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[2 * i + 0].gather(in, index0);
        out[2 * i + 1].gather(in, index1);
        index0 = index0 + 2ul;
        index1 = index1 + 2ul;
    }

    // spillover loop
    for (size_t i = 2 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1ul;
    }
}
inline void deinterleave_store(
    const std::vector<sveDouble, allocator<sveDouble>> &in, size_t dataLen,
    double *out)
{
    alignas(sveDouble::alignment) size_t tmp[sveDouble::width] = {};

    // populate scalar index: lane i starts at out + i * dataLen
    for (size_t i = 0; i < sveDouble::width; ++i)
    {
        tmp[i] = i * dataLen;
    }

    using index_t = sveLong<size_t>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1ul;
    }
}
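// Usage sketch (illustrative, not part of the original header): round-trip
// an interleaved array through the vector wrappers. Sizes and names here
// are hypothetical.
//
//   constexpr size_t dataLen = 3; // e.g. x, y, z per point
//   std::vector<double> data(dataLen * sveDouble::width);
//   std::vector<sveDouble, allocator<sveDouble>> regs(dataLen);
//   load_interleave(data.data(), dataLen, regs);    // gather into registers
//   for (auto &v : regs)
//       v = v * sveDouble(2.0);                     // work lane-wise
//   deinterleave_store(regs, dataLen, data.data()); // scatter back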
// mask type
// mask is an integer type with special properties: if a lane is true all of
// its bits are set to 1, otherwise all of its bits are set to 0
struct sveMask : sveLong<std::uint64_t>
{
    // bring in ctors
    using sveLong::sveLong;

    static constexpr scalarType true_v = -1;
    static constexpr scalarType false_v = 0;
};
// compare greater than, lane-wise
inline sveMask operator>(sveDouble lhs, sveDouble rhs)
{
    // predicate of lanes where lhs > rhs
    svbool_vlst_t mask = svcmpgt(svptrue_b64(), lhs._data, rhs._data);
    // expand the predicate into an integer mask: active lanes keep true_v,
    // inactive lanes are zeroed by the _z predication
    sveMask::vectorType sveTrue_v = svdup_u64(sveMask::true_v);
    return svand_z(mask, sveTrue_v, sveTrue_v);
}
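// Usage sketch (illustrative, not part of the original header): branch on a
// lane-wise comparison; variable names are hypothetical.
//
//   sveDouble x(1.0), y(2.0);
//   sveMask m = y > x;   // every lane of m holds sveMask::true_v here
//   if (m && true)       // reduced to a scalar bool by operator&& below
//   {
//       // at least one lane satisfied the comparison
//   }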
// logical and with a scalar bool; rhs is combined with a reduction of the
// mask that tests whether any lane is set
inline bool operator&&(sveMask lhs, bool rhs)
{
    // predicate of lanes that differ from false_v
    sveMask::vectorType sveFalse_v = svdup_u64(sveMask::false_v);
    svbool_vlst_t mask = svcmpne(svptrue_b64(), lhs._data, sveFalse_v);
    // is any lane active?
    bool tmp = svptest_any(svptrue_b64(), mask);
    return tmp && rhs;
}

#endif // __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)

} // namespace tinysimd

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H