Nektar++
sve.hpp
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: sve.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Vector type using Armv8 Scalable Vector Extension (SVE).
32//
33///////////////////////////////////////////////////////////////////////////////
34
35#ifndef NEKTAR_LIB_LIBUTILITIES_SIMDLIB_SVE_H
36#define NEKTAR_LIB_LIBUTILITIES_SIMDLIB_SVE_H
37
38#if defined(__ARM_FEATURE_SVE)
39#include <arm_acle.h>
40#include <arm_sve.h>
41#endif
42
43#include "allocator.hpp"
44#include "traits.hpp"
45#include <vector>
#include <cmath>
#include <cstdint>
#include <type_traits>
46
47namespace tinysimd::abi
48{
49template <typename scalarType, int width = 0> struct sve
50{
51 using type = void;
52};
53
54} // namespace tinysimd::abi
55
56// requires clang >= 12.0.0 or gcc >= 10
57// requires -msve-vector-bits=<length>
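// An illustrative invocation (a sketch only; the exact flags, and how
// NEKTAR_ENABLE_SIMD_SVE gets defined, depend on the toolchain and the
// build system configuration):
//
//   g++ -O3 -march=armv8-a+sve -msve-vector-bits=512 \
//       -DNEKTAR_ENABLE_SIMD_SVE ...
//
// Fixing the vector length with -msve-vector-bits=<length> makes the compiler
// define __ARM_FEATURE_SVE_BITS to that value, which enables the fixed-width
// (VLST) types declared below.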
58#if __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)
59
60namespace tinysimd
61{
62
63// from VLA (vector-length agnostic) to VLST (vector-length specific type):
64// C++ does not allow incomplete class member types, so to get around that
65// we force a known vector size at compile time
66typedef svfloat64_t svfloat64_vlst_t
67 __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
68typedef svint64_t svint64_vlst_t
69 __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
70typedef svuint64_t svuint64_vlst_t
71 __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
72typedef svfloat32_t svfloat32_vlst_t
73 __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
74typedef svint32_t svint32_vlst_t
75 __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
76typedef svuint32_t svuint32_vlst_t
77 __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
78typedef svbool_t svbool_vlst_t
79 __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
80
81// forward declaration of concrete types
82template <typename T> struct sveInt64;
83template <typename T> struct sveInt32;
84struct sveFloat64;
85struct sveFloat32;
86struct sveMask64;
87struct sveMask32;
88
89namespace abi
90{
91
92// mapping between abstract types and concrete floating point types
93template <> struct sve<double>
94{
95 using type = sveFloat64;
96};
97template <> struct sve<float>
98{
99 using type = sveFloat32;
100};
101// generic index mapping
102// assumes the index type has the same width as the floating point type
103template <> struct sve<std::int64_t>
104{
105 using type = sveInt64<std::int64_t>;
106};
107template <> struct sve<std::uint64_t>
108{
109 using type = sveInt64<std::uint64_t>;
110};
111template <> struct sve<std::int32_t>
112{
113 using type = sveInt32<std::int32_t>;
114};
115template <> struct sve<std::uint32_t>
116{
117 using type = sveInt32<std::uint32_t>;
118};
119// specialized index mapping
120template <> struct sve<std::int64_t, __ARM_FEATURE_SVE_BITS / 64>
121{
122 using type = sveInt64<std::int64_t>;
123};
124template <> struct sve<std::uint64_t, __ARM_FEATURE_SVE_BITS / 64>
125{
126 using type = sveInt64<std::uint64_t>;
127};
128// the number of lanes dictates the simd type;
129// we then need to make sure we can properly load from
130// a 32 bit pointer into a 64 bit vector (zero or sign extend)
131template <> struct sve<std::int32_t, __ARM_FEATURE_SVE_BITS / 64>
132{
133 using type = sveInt64<std::int64_t>;
134};
135template <> struct sve<std::uint32_t, __ARM_FEATURE_SVE_BITS / 64>
136{
137 using type = sveInt64<std::uint64_t>;
138};
139template <> struct sve<std::int32_t, __ARM_FEATURE_SVE_BITS / 32>
140{
141 using type = sveInt32<std::int32_t>;
142};
143template <> struct sve<std::uint32_t, __ARM_FEATURE_SVE_BITS / 32>
144{
145 using type = sveInt32<std::uint32_t>;
146};
147// bool mapping
148template <> struct sve<bool, __ARM_FEATURE_SVE_BITS / 64>
149{
150 using type = sveMask64;
151};
152template <> struct sve<bool, __ARM_FEATURE_SVE_BITS / 32>
153{
154 using type = sveMask32;
155};
156
157} // namespace abi
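// Brief illustration of how the mapping above is consumed (a sketch; generic
// code would normally reach these types through the tinysimd wrapper rather
// than naming them directly):
//
//   using vec_t   = typename tinysimd::abi::sve<double>::type;        // sveFloat64
//   using index_t = typename tinysimd::abi::sve<std::uint64_t>::type; // sveInt64<std::uint64_t>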
158
159// concrete types; could add an enable_if to restrict the allowed integer types...
160template <typename T> struct sveInt32
161{
162 static_assert(std::is_integral<T>::value && sizeof(T) == 4,
163 "4 bytes Integral required.");
164
165 static constexpr unsigned int alignment =
166 __ARM_FEATURE_SVE_BITS / sizeof(T);
167 static constexpr unsigned int width = alignment / 8;
168
169 using scalarType = T;
170 using vectorType =
171 typename std::conditional<std::is_signed<T>::value, svint32_vlst_t,
172 svuint32_vlst_t>::type;
173 using scalarArray = scalarType[width];
174
175 // storage
176 vectorType _data;
177
178 // ctors
179 inline sveInt32() = default;
180 inline sveInt32(const sveInt32 &rhs) = default;
181 inline sveInt32(const vectorType &rhs) : _data(rhs)
182 {
183 }
184 inline sveInt32(const scalarType rhs)
185 {
186 _data = svdup_s32(rhs);
187 }
188 explicit inline sveInt32(scalarArray &rhs)
189 {
190 _data = svld1(svptrue_b32(), rhs);
191 }
192
193 // store packed
194 inline void store(scalarType *p) const
195 {
196 svst1(svptrue_b32(), p, _data);
197 }
198 // refer to x86_64 implementations
199 // sve has no requirements on alignment
200 // nevertheless we should accept valid tags for compatibility
201 template <typename TAG,
202 typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
203 inline void store(scalarType *p, TAG) const
204 {
205 svst1(svptrue_b32(), p, _data);
206 }
207
208 // load packed
209 inline void load(const scalarType *p)
210 {
211 _data = svld1(svptrue_b32(), p);
212 }
213 // refer to x86_64 implementations
214 // sve has no requirements on alignment
215 // nevertheless we should accept valid tags for compatibility
216 template <typename TAG,
217 typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
218 inline void load(const scalarType *p, TAG)
219 {
220 _data = svld1(svptrue_b32(), p);
221 }
222
223 // broadcast
224 inline void broadcast(const scalarType rhs)
225 {
226 _data = svdup(rhs);
227 }
228
229 // subscript
230 // subscript operators are convenient but expensive
231 // should not be used in optimized kernels
232 inline scalarType operator[](size_t i) const
233 {
234 alignas(alignment) scalarArray tmp;
235 store(tmp, is_aligned);
236 return tmp[i];
237 }
238
239 inline scalarType &operator[](size_t i)
240 {
241 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
242 return tmp[i];
243 }
244
245 // unary ops
246 inline void operator+=(sveInt32 rhs)
247 {
248 _data = svadd_x(svptrue_b32(), _data, rhs._data);
249 }
250
251 inline void operator-=(sveInt32 rhs)
252 {
253 _data = svsub_x(svptrue_b32(), _data, rhs._data);
254 }
255
256 inline void operator*=(sveInt32 rhs)
257 {
258 _data = svmul_x(svptrue_b32(), _data, rhs._data);
259 }
260
261 inline void operator/=(sveInt32 rhs)
262 {
263 _data = svdiv_x(svptrue_b32(), _data, rhs._data);
264 }
265};
266
267template <typename T>
268inline sveInt32<T> operator+(sveInt32<T> lhs, sveInt32<T> rhs)
269{
270 return svadd_x(svptrue_b32(), lhs._data, rhs._data);
271}
272
273template <typename T> inline sveInt32<T> operator+(sveInt32<T> lhs, T rhs)
274{
275 return svadd_x(svptrue_b32(), lhs._data, sveInt32<T>(rhs)._data);
276}
277
278template <typename T>
279inline sveInt32<T> operator-(sveInt32<T> lhs, sveInt32<T> rhs)
280{
281 return svsub_x(svptrue_b32(), lhs._data, rhs._data);
282}
283
284template <typename T>
285inline sveInt32<T> operator*(sveInt32<T> lhs, sveInt32<T> rhs)
286{
287 return svmul_x(svptrue_b32(), lhs._data, rhs._data);
288}
289
290template <typename T>
291inline sveInt32<T> operator/(sveInt32<T> lhs, sveInt32<T> rhs)
292{
293 return svdiv_x(svptrue_b32(), lhs._data, rhs._data);
294}
295
296template <typename T> inline sveInt32<T> abs(sveInt32<T> in)
297{
298 return svabs_x(svptrue_b32(), in._data);
299}
300
301////////////////////////////////////////////////////////////////////////////////
302
303template <typename T> struct sveInt64
304{
305 static_assert(std::is_integral<T>::value && sizeof(T) == 8,
306 "8 bytes Integral required.");
307
308 static constexpr unsigned int alignment =
309 __ARM_FEATURE_SVE_BITS / sizeof(T);
310 static constexpr unsigned int width = alignment / 8;
311
312 using scalarType = T;
313 using vectorType =
314 typename std::conditional<std::is_signed<T>::value, svint64_vlst_t,
315 svuint64_vlst_t>::type;
316 using scalarArray = scalarType[width];
317
318 // storage
319 vectorType _data;
320
321 // ctors
322 inline sveInt64() = default;
323 inline sveInt64(const sveInt64 &rhs) = default;
324 inline sveInt64(const vectorType &rhs) : _data(rhs)
325 {
326 }
327 inline sveInt64(const scalarType rhs)
328 {
329 _data = svdup_s64(rhs);
330 }
331 explicit inline sveInt64(scalarArray &rhs)
332 {
333 _data = svld1(svptrue_b64(), rhs);
334 }
335
336 // store packed
337 inline void store(scalarType *p) const
338 {
339 svst1(svptrue_b64(), p, _data);
340 }
341 // refer to x86_64 implementations
342 // sve has no requirements on alignment
343 // nevertheless we should accept valid tags for compatibility
344 template <typename TAG,
345 typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
346 inline void store(scalarType *p, TAG) const
347 {
348 svst1(svptrue_b64(), p, _data);
349 }
350
351 // load packed
352 inline void load(const scalarType *p)
353 {
354 _data = svld1(svptrue_b64(), p);
355 }
356 // refer to x86_64 implementations
357 // sve has no requirements on alignment
358 // nevertheless we should accept valid tags for compatibility
359 template <typename TAG,
360 typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
361 inline void load(const scalarType *p, TAG)
362 {
363 _data = svld1(svptrue_b64(), p);
364 }
365
366 // load packed from 32 bit
367 template <typename I32,
368 typename std::enable_if<std::is_integral<I32>::value &&
369 std::is_signed<scalarType>::value &&
370 sizeof(I32) == 4,
371 bool>::type = 0>
372 inline void load(const I32 *p)
373 {
374 _data = svld1sw_s64(svptrue_b64(), p);
375 }
376 template <typename I32,
377 typename std::enable_if<std::is_integral<I32>::value &&
378 !std::is_signed<scalarType>::value &&
379 sizeof(I32) == 4,
380 bool>::type = 0>
381 inline void load(const I32 *p)
382 {
383 _data = svld1uw_u64(svptrue_b64(), p);
384 }
385 template <typename I32, typename TAG,
386 typename std::enable_if<
387 is_load_tag<TAG>::value && std::is_integral<I32>::value &&
388 std::is_signed<scalarType>::value && sizeof(I32) == 4,
389 bool>::type = 0>
390 inline void load(const I32 *p, TAG)
391 {
392 _data = svld1sw_s64(svptrue_b64(), p);
393 }
394 template <typename I32, typename TAG,
395 typename std::enable_if<
396 is_load_tag<TAG>::value && std::is_integral<I32>::value &&
397 !std::is_signed<scalarType>::value && sizeof(I32) == 4,
398 bool>::type = 0>
399 inline void load(const I32 *p, TAG)
400 {
401 _data = svld1uw_u64(svptrue_b64(), p);
402 }
403
404 // broadcast
405 inline void broadcast(const scalarType rhs)
406 {
407 _data = svdup(rhs);
408 }
409
410 // subscript
411 // subscript operators are convenient but expensive
412 // should not be used in optimized kernels
413 inline scalarType operator[](size_t i) const
414 {
415 alignas(alignment) scalarArray tmp;
416 store(tmp, is_aligned);
417 return tmp[i];
418 }
419
420 // unary ops
421 inline void operator+=(sveInt64 rhs)
422 {
423 _data = svadd_x(svptrue_b64(), _data, rhs._data);
424 }
425
426 inline void operator-=(sveInt64 rhs)
427 {
428 _data = svsub_x(svptrue_b64(), _data, rhs._data);
429 }
430
431 inline void operator*=(sveInt64 rhs)
432 {
433 _data = svmul_x(svptrue_b64(), _data, rhs._data);
434 }
435
436 inline void operator/=(sveInt64 rhs)
437 {
438 _data = svdiv_x(svptrue_b64(), _data, rhs._data);
439 }
440};
441
442template <typename T>
443inline sveInt64<T> operator+(sveInt64<T> lhs, sveInt64<T> rhs)
444{
445 return svadd_x(svptrue_b64(), lhs._data, rhs._data);
446}
447
448template <typename T> inline sveInt64<T> operator+(sveInt64<T> lhs, T rhs)
449{
450 return svadd_x(svptrue_b64(), lhs._data, sveInt64<T>(rhs)._data);
451}
452
453template <typename T>
454inline sveInt64<T> operator-(sveInt64<T> lhs, sveInt64<T> rhs)
455{
456 return svsub_x(svptrue_b64(), lhs._data, rhs._data);
457}
458
459template <typename T>
460inline sveInt64<T> operator*(sveInt64<T> lhs, sveInt64<T> rhs)
461{
462 return svmul_x(svptrue_b64(), lhs._data, rhs._data);
463}
464
465template <typename T>
466inline sveInt64<T> operator/(sveInt64<T> lhs, sveInt64<T> rhs)
467{
468 return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
469}
470
471template <typename T> inline sveInt64<T> abs(sveInt64<T> in)
472{
473 return svabs_x(svptrue_b64(), in._data);
474}
475
476////////////////////////////////////////////////////////////////////////////////
477
478struct sveFloat32
479{
480 static constexpr unsigned int alignment =
481 __ARM_FEATURE_SVE_BITS / sizeof(float);
482 static constexpr unsigned int width = alignment / 8;
483
484 using scalarType = float;
485 using scalarIndexType = std::uint32_t;
486 using vectorType = svfloat32_vlst_t;
487 using scalarArray = scalarType[width];
488
489 // storage
490 vectorType _data;
491
492 // ctors
493 inline sveFloat32() = default;
494 inline sveFloat32(const sveFloat32 &rhs) = default;
495 inline sveFloat32(const vectorType &rhs) : _data(rhs)
496 {
497 }
498 inline sveFloat32(const scalarType rhs)
499 {
500 _data = svdup_f32(rhs);
501 }
502
503 // store packed
504 inline void store(scalarType *p) const
505 {
506 svst1_f32(svptrue_b32(), p, _data);
507 }
508 // refer to x86_64 implementations
509 // sve has no requirements on alignment
510 // nevertheless we should accept valid tags for compatibility
511 template <typename T,
512 typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
513 inline void store(scalarType *p, T) const
514 {
515 svst1_f32(svptrue_b32(), p, _data);
516 }
517
518 // load packed
519 inline void load(const scalarType *p)
520 {
521 _data = svld1_f32(svptrue_b32(), p);
522 }
523 // refer to x86_64 implementations
524 // sve has no requirements on alignment
525 // nevertheless we should accept valid tags for compatibility
526 template <typename T,
527 typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
528 inline void load(const scalarType *p, T)
529 {
530 _data = svld1_f32(svptrue_b32(), p);
531 }
532
533 // broadcast
534 inline void broadcast(const scalarType rhs)
535 {
536 _data = svdup_f32(rhs);
537 }
538
539 // gather/scatter
540 template <typename T>
541 inline void gather(scalarType const *p, const sveInt32<T> &indices)
542 {
543 _data = svld1_gather_index(svptrue_b32(), p, indices._data);
544 }
545
546 template <typename T>
547 inline void scatter(scalarType *out, const sveInt32<T> &indices) const
548 {
549 svst1_scatter_index(svptrue_b32(), out, indices._data, _data);
550 }
551
552 // fma
553 // this = this + a * b
554 inline void fma(const sveFloat32 &a, const sveFloat32 &b)
555 {
556 _data = svmad_x(svptrue_b32(), a._data, b._data, _data);
557 }
558
559 // subscript
560 // subscript operators are convenient but expensive
561 // should not be used in optimized kernels
562 inline scalarType operator[](size_t i) const
563 {
564 alignas(alignment) scalarArray tmp;
565 store(tmp, is_aligned);
566 return tmp[i];
567 }
568
569 inline scalarType &operator[](size_t i)
570 {
571 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
572 return tmp[i];
573 }
574
575 // unary ops
576 inline void operator+=(sveFloat32 rhs)
577 {
578 _data = svadd_x(svptrue_b32(), _data, rhs._data);
579 }
580
581 inline void operator-=(sveFloat32 rhs)
582 {
583 _data = svsub_x(svptrue_b32(), _data, rhs._data);
584 }
585
586 inline void operator*=(sveFloat32 rhs)
587 {
588 _data = svmul_x(svptrue_b32(), _data, rhs._data);
589 }
590
591 inline void operator/=(sveFloat32 rhs)
592 {
593 _data = svdiv_x(svptrue_b32(), _data, rhs._data);
594 }
595};
596
597inline sveFloat32 operator+(sveFloat32 lhs, sveFloat32 rhs)
598{
599 return svadd_x(svptrue_b32(), lhs._data, rhs._data);
600}
601
602inline sveFloat32 operator-(sveFloat32 lhs, sveFloat32 rhs)
603{
604 return svsub_x(svptrue_b32(), lhs._data, rhs._data);
605}
606
607inline sveFloat32 operator*(sveFloat32 lhs, sveFloat32 rhs)
608{
609 return svmul_x(svptrue_b32(), lhs._data, rhs._data);
610}
611
612inline sveFloat32 operator/(sveFloat32 lhs, sveFloat32 rhs)
613{
614 return svdiv_x(svptrue_b32(), lhs._data, rhs._data);
615}
616
617inline sveFloat32 sqrt(sveFloat32 in)
618{
619 return svsqrt_x(svptrue_b32(), in._data);
620}
621
622inline sveFloat32 abs(sveFloat32 in)
623{
624 return svabs_x(svptrue_b32(), in._data);
625}
626
627inline sveFloat32 log(sveFloat32 in)
628{
629 // there is no sve log intrinsic
630 // this is a dreadful implementation and is simply a stop gap measure
631 alignas(sveFloat32::alignment) sveFloat32::scalarArray tmp;
632 in.store(tmp);
633 for (size_t i = 0; i < sveFloat32::width; ++i)
634 {
635 tmp[i] = std::log(tmp[i]);
636 }
637 sveFloat32 ret;
638 ret.load(tmp);
639 return ret;
640}
641
642inline void load_interleave(const float *in, std::uint32_t dataLen,
643 std::vector<sveFloat32, allocator<sveFloat32>> &out)
644{
645
646 alignas(sveFloat32::alignment)
647 sveFloat32::scalarIndexType tmp[sveFloat32::width] = {};
648
649 // populate the scalar index array; its size is only
650 // known at compile time (via __ARM_FEATURE_SVE_BITS)
651 for (size_t i = 0; i < sveFloat32::width; ++i)
652 {
653 tmp[i] = i * dataLen;
654 }
655
656 using index_t = sveInt32<sveFloat32::scalarIndexType>;
657 index_t index0(tmp);
658 index_t index1 = index0 + 1u;
659
660 // 2x unrolled loop -- minimum width is 2
661 size_t nBlocks = dataLen / 2;
662 for (size_t i = 0; i < nBlocks; ++i)
663 {
664 out[2 * i + 0].gather(in, index0);
665 out[2 * i + 1].gather(in, index1);
666 index0 = index0 + 2u;
667 index1 = index1 + 2u;
668 }
669
670 // spillover loop
671 for (size_t i = 2 * nBlocks; i < dataLen; ++i)
672 {
673 out[i].gather(in, index0);
674 index0 = index0 + 1u;
675 }
676}
677
678inline void deinterleave_store(
679 const std::vector<sveFloat32, allocator<sveFloat32>> &in,
680 std::uint32_t dataLen, float *out)
681{
682 alignas(sveFloat32::alignment)
683 sveFloat32::scalarIndexType tmp[sveFloat32::width] = {};
684
685 // populate the scalar index array; its size is only
686 // known at compile time (via __ARM_FEATURE_SVE_BITS)
687 for (size_t i = 0; i < sveFloat32::width; ++i)
688 {
689 tmp[i] = i * dataLen;
690 }
691
692 using index_t = sveInt32<sveFloat32::scalarIndexType>;
693 index_t index0(tmp);
694
695 for (size_t i = 0; i < dataLen; ++i)
696 {
697 in[i].scatter(out, index0);
698 index0 = index0 + 1u;
699 }
700}
701
702////////////////////////////////////////////////////////////////////////////////
703
704struct sveFloat64
705{
706 static constexpr unsigned int alignment =
707 __ARM_FEATURE_SVE_BITS / sizeof(double);
708 static constexpr unsigned int width = alignment / 8;
709
710 using scalarType = double;
711 using scalarIndexType = std::uint64_t;
712 using vectorType = svfloat64_vlst_t;
713 using scalarArray = scalarType[width];
714
715 // storage
716 vectorType _data;
717
718 // ctors
719 inline sveFloat64() = default;
720 inline sveFloat64(const sveFloat64 &rhs) = default;
721 inline sveFloat64(const vectorType &rhs) : _data(rhs)
722 {
723 }
724 inline sveFloat64(const scalarType rhs)
725 {
726 _data = svdup_f64(rhs);
727 }
728
729 // store packed
730 inline void store(scalarType *p) const
731 {
732 svst1_f64(svptrue_b64(), p, _data);
733 }
734 // refer to x86_64 implementations
735 // sve has no requirements on alignment
736 // nevertheless we should accept valid tags for compatibility
737 template <typename T,
738 typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
739 inline void store(scalarType *p, T) const
740 {
741 svst1_f64(svptrue_b64(), p, _data);
742 }
743
744 // load packed
745 inline void load(const scalarType *p)
746 {
747 _data = svld1_f64(svptrue_b64(), p);
748 }
749 // refer to x86_64 implementations
750 // sve has no requirements on alignment
751 // nevertheless we should accept valid tags for compatibility
752 template <typename T,
753 typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
754 inline void load(const scalarType *p, T)
755 {
756 _data = svld1_f64(svptrue_b64(), p);
757 }
758
759 // broadcast
760 inline void broadcast(const scalarType rhs)
761 {
762 _data = svdup_f64(rhs);
763 }
764
765 // gather/scatter
766 template <typename T>
767 inline void gather(scalarType const *p, const sveInt64<T> &indices)
768 {
769 _data = svld1_gather_index(svptrue_b64(), p, indices._data);
770 }
771
772 template <typename T>
773 inline void scatter(scalarType *out, const sveInt64<T> &indices) const
774 {
775 svst1_scatter_index(svptrue_b64(), out, indices._data, _data);
776 }
777
778 // fma
779 // this = this + a * b
780 inline void fma(const sveFloat64 &a, const sveFloat64 &b)
781 {
782 _data = svmad_x(svptrue_b64(), a._data, b._data, _data);
783 }
784
785 // subscript
786 // subscript operators are convenient but expensive
787 // should not be used in optimized kernels
788 inline scalarType operator[](size_t i) const
789 {
790 alignas(alignment) scalarArray tmp;
791 store(tmp, is_aligned);
792 return tmp[i];
793 }
794
795 inline scalarType &operator[](size_t i)
796 {
797 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
798 return tmp[i];
799 }
800
801 // unary ops
802 inline void operator+=(sveFloat64 rhs)
803 {
804 _data = svadd_x(svptrue_b64(), _data, rhs._data);
805 }
806
807 inline void operator-=(sveFloat64 rhs)
808 {
809 _data = svsub_x(svptrue_b64(), _data, rhs._data);
810 }
811
812 inline void operator*=(sveFloat64 rhs)
813 {
814 _data = svmul_x(svptrue_b64(), _data, rhs._data);
815 }
816
817 inline void operator/=(sveFloat64 rhs)
818 {
819 _data = svdiv_x(svptrue_b64(), _data, rhs._data);
820 }
821};
822
823inline sveFloat64 operator+(sveFloat64 lhs, sveFloat64 rhs)
824{
825 return svadd_x(svptrue_b64(), lhs._data, rhs._data);
826}
827
828inline sveFloat64 operator-(sveFloat64 lhs, sveFloat64 rhs)
829{
830 return svsub_x(svptrue_b64(), lhs._data, rhs._data);
831}
832
833inline sveFloat64 operator*(sveFloat64 lhs, sveFloat64 rhs)
834{
835 return svmul_x(svptrue_b64(), lhs._data, rhs._data);
836}
837
838inline sveFloat64 operator/(sveFloat64 lhs, sveFloat64 rhs)
839{
840 return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
841}
842
843inline sveFloat64 sqrt(sveFloat64 in)
844{
845 return svsqrt_x(svptrue_b64(), in._data);
846}
847
848inline sveFloat64 abs(sveFloat64 in)
849{
850 return svabs_x(svptrue_b64(), in._data);
851}
852
853inline sveFloat64 log(sveFloat64 in)
854{
855 // there is no sve log intrinsic
856 // this is a dreadful implementation and is simply a stop gap measure
857 alignas(sveFloat64::alignment) sveFloat64::scalarArray tmp;
858 in.store(tmp);
859 for (size_t i = 0; i < sveFloat64::width; ++i)
860 {
861 tmp[i] = std::log(tmp[i]);
862 }
863 sveFloat64 ret;
864 ret.load(tmp);
865 return ret;
866}
867
868inline void load_interleave(const double *in, std::uint32_t dataLen,
869 std::vector<sveFloat64, allocator<sveFloat64>> &out)
870{
871
872 alignas(sveFloat64::alignment) size_t tmp[sveFloat64::width] = {};
873
874 // populate the scalar index array; its size is only
875 // known at compile time (via __ARM_FEATURE_SVE_BITS)
876 for (size_t i = 0; i < sveFloat64::width; ++i)
877 {
878 tmp[i] = i * dataLen;
879 }
880
881 using index_t = sveInt64<size_t>;
882 index_t index0(tmp);
883 index_t index1 = index0 + 1ul;
884
885 // 2x unrolled loop -- minimum width is 2
886 size_t nBlocks = dataLen / 2;
887 for (size_t i = 0; i < nBlocks; ++i)
888 {
889 out[2 * i + 0].gather(in, index0);
890 out[2 * i + 1].gather(in, index1);
891 index0 = index0 + 2ul;
892 index1 = index1 + 2ul;
893 }
894
895 // spillover loop
896 for (size_t i = 2 * nBlocks; i < dataLen; ++i)
897 {
898 out[i].gather(in, index0);
899 index0 = index0 + 1ul;
900 }
901}
902
903inline void deinterleave_store(
904 const std::vector<sveFloat64, allocator<sveFloat64>> &in,
905 std::uint32_t dataLen, double *out)
906{
907 alignas(sveFloat64::alignment) size_t tmp[sveFloat64::width] = {};
908
909 // populate the scalar index array; its size is only
910 // known at compile time (via __ARM_FEATURE_SVE_BITS)
911 for (size_t i = 0; i < sveFloat64::width; ++i)
912 {
913 tmp[i] = i * dataLen;
914 }
915
916 using index_t = sveInt64<size_t>;
917 index_t index0(tmp);
918
919 for (size_t i = 0; i < dataLen; ++i)
920 {
921 in[i].scatter(out, index0);
922 index0 = index0 + 1ul;
923 }
924}
925
926////////////////////////////////////////////////////////////////////////////////
927
928// mask type
929// a mask is an integer type with special properties (broad boolean vector)
930// the only defined and allowed values of a broad boolean vector are
931// false = 0x0 and true = all bits set (e.g. 0xFFFFFFFF for 32 bit lanes)
932//
933// VERY LIMITED SUPPORT...just enough to make cubic eos work...
934//
935struct sveMask64 : sveInt64<std::uint64_t>
936{
937 // bring in ctors
938 using sveInt64::sveInt64;
939
940 static constexpr scalarType true_v = -1;
941 static constexpr scalarType false_v = 0;
942};
943
944inline sveMask64 operator>(sveFloat64 lhs, sveFloat64 rhs)
945{
946 // set mask
947 svbool_vlst_t mask = svcmpgt(svptrue_b64(), lhs._data, rhs._data);
948 // abuse zeroing predication (inactive lanes set to zero) to convert
949 sveMask64::vectorType sveTrue_v = svdup_u64(sveMask64::true_v);
950 return svand_z(mask, sveTrue_v, sveTrue_v);
951}
952
953// logical and
954inline bool operator&&(sveMask64 lhs, bool rhs)
955{
956 // set mask
957 sveMask64::vectorType sveFalse_v = svdup_u64(sveMask64::false_v);
958 svbool_vlst_t mask = svcmpne(svptrue_b64(), lhs._data, sveFalse_v);
959 // is any lane not equal to false, i.e. true?
960 bool tmp = svptest_any(svptrue_b64(), mask);
961 return tmp && rhs;
962}
963
964////////////////////////////////////////////////////////////////////////////////
965
966struct sveMask32 : sveInt32<std::uint32_t>
967{
968 // bring in ctors
969 using sveInt32::sveInt32;
970
971 static constexpr scalarType true_v = -1;
972 static constexpr scalarType false_v = 0;
973};
974
975inline sveMask32 operator>(sveFloat32 lhs, sveFloat32 rhs)
976{
977 // set mask
978 svbool_vlst_t mask = svcmpgt(svptrue_b32(), lhs._data, rhs._data);
979 // abuse zeroing predication (inactive lanes set to zero) to convert
980 sveMask32::vectorType sveTrue_v = svdup_u32(sveMask32::true_v);
981 return svand_z(mask, sveTrue_v, sveTrue_v);
982}
983
984// logical and
985inline bool operator&&(sveMask32 lhs, bool rhs)
986{
987 // set mask
988 sveMask32::vectorType sveFalse_v = svdup_u32(sveMask32::false_v);
989 svbool_vlst_t mask = svcmpne(svptrue_b32(), lhs._data, sveFalse_v);
990 // is any lane not equal to false, i.e. true?
991 bool tmp = svptest_any(svptrue_b32(), mask);
992 return tmp && rhs;
993}
994
995} // namespace tinysimd
996
997#endif // __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)
998
999#endif