Nektar++
sve.hpp
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: sve.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Vector type using Armv8 Scalable Vector Extension (SVE).
32 //
33 ///////////////////////////////////////////////////////////////////////////////
34 
35 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H
36 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H
37 
38 #if defined(__ARM_FEATURE_SVE)
39 #include <arm_acle.h>
40 #include <arm_sve.h>
41 #endif
42 
43 #include "allocator.hpp"
44 #include "traits.hpp"
45 #include <vector>
46 
47 namespace tinysimd
48 {
49 
50 namespace abi
51 {
52 
53 template <typename scalarType, int width = 0> struct sve
54 {
55  using type = void;
56 };
57 
58 } // namespace abi
59 
60 // requires clang >= 12.0.0 or gcc >= 10
61 // requires -msve-vector-bits=<length>
62 #if __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)
63 // from VLA to VLST
64 // C++ does not allow sizeless (incomplete) types as class data members
65 // to get around that we force a known size at compile time
66 typedef svfloat64_t svfloat64_vlst_t
67  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
68 typedef svint64_t svint64_vlst_t
69  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
70 typedef svuint64_t svuint64_vlst_t
71  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
72 typedef svfloat32_t svfloat32_vlst_t
73  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
74 typedef svint32_t svint32_vlst_t
75  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
76 typedef svuint32_t svuint32_vlst_t
77  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
78 typedef svbool_t svbool_vlst_t
79  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
80 
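// Editorial sketch (not part of the upstream header), assuming a build with
// -msve-vector-bits set: the arm_sve_vector_bits attribute gives the VLST
// typedefs above a size that is fixed at compile time, unlike the sizeless
// VLA types they wrap.
static_assert(sizeof(svfloat64_vlst_t) == __ARM_FEATURE_SVE_BITS / 8,
              "fixed-length SVE vector spans the full register width");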
81 // forward declaration of concrete types
82 template <typename T> struct sveInt64;
83 template <typename T> struct sveInt32;
84 struct sveFloat64;
85 struct sveFloat32;
86 struct sveMask64;
87 struct sveMask32;
88 
89 namespace abi
90 {
91 
92 // mapping between abstract types and concrete floating point types
93 template <> struct sve<double>
94 {
95  using type = sveFloat64;
96 };
97 template <> struct sve<float>
98 {
99  using type = sveFloat32;
100 };
101 // generic index mapping
102 // assumes the index type has the same width as the floating point type
103 template <> struct sve<std::int64_t>
104 {
105  using type = sveInt64<std::int64_t>;
106 };
107 template <> struct sve<std::uint64_t>
108 {
109  using type = sveInt64<std::uint64_t>;
110 };
111 template <> struct sve<std::int32_t>
112 {
113  using type = sveInt32<std::int32_t>;
114 };
115 template <> struct sve<std::uint32_t>
116 {
117  using type = sveInt32<std::uint32_t>;
118 };
119 // specialized index mapping
120 template <> struct sve<std::int64_t, __ARM_FEATURE_SVE_BITS / 64>
121 {
122  using type = sveInt64<std::int64_t>;
123 };
124 template <> struct sve<std::uint64_t, __ARM_FEATURE_SVE_BITS / 64>
125 {
126  using type = sveInt64<std::uint64_t>;
127 };
128 // the number of lanes dictates the SIMD type;
129 // we then need to make sure that we can properly load
130 // from a 32-bit pointer into a 64-bit vector (zero- or sign-extended)
131 template <> struct sve<std::int32_t, __ARM_FEATURE_SVE_BITS / 64>
132 {
133  using type = sveInt64<std::int64_t>;
134 };
135 template <> struct sve<std::uint32_t, __ARM_FEATURE_SVE_BITS / 64>
136 {
137  using type = sveInt64<std::uint64_t>;
138 };
139 template <> struct sve<std::int32_t, __ARM_FEATURE_SVE_BITS / 32>
140 {
141  using type = sveInt32<std::int32_t>;
142 };
143 template <> struct sve<std::uint32_t, __ARM_FEATURE_SVE_BITS / 32>
144 {
145  using type = sveInt32<std::uint32_t>;
146 };
147 // bool mapping
148 template <> struct sve<bool, __ARM_FEATURE_SVE_BITS / 64>
149 {
150  using type = sveMask64;
151 };
152 template <> struct sve<bool, __ARM_FEATURE_SVE_BITS / 32>
153 {
154  using type = sveMask32;
155 };
156 
157 } // namespace abi
158 
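// Editorial sketch (not part of the upstream header): the mappings above can
// be verified at compile time. For example, requesting a 32-bit index type
// with one lane per 64-bit value resolves to a 64-bit integer vector, so that
// index vectors match the lane width of sveFloat64 gathers.
static_assert(
    std::is_same<abi::sve<double>::type, sveFloat64>::value &&
        std::is_same<
            abi::sve<std::uint32_t, __ARM_FEATURE_SVE_BITS / 64>::type,
            sveInt64<std::uint64_t>>::value,
    "abi::sve selects the concrete SVE wrapper types declared above");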
159 // concrete types; could add an enable_if to allow only the supported integer types...
160 template <typename T> struct sveInt32
161 {
162  static_assert(std::is_integral<T>::value && sizeof(T) == 4,
163  "4-byte integral type required.");
164 
165  static constexpr unsigned int alignment =
166  __ARM_FEATURE_SVE_BITS / sizeof(T);
167  static constexpr unsigned int width = alignment / 8;
168 
169  using scalarType = T;
170  using vectorType =
171  typename std::conditional<std::is_signed<T>::value, svint32_vlst_t,
172  svuint32_vlst_t>::type;
173  using scalarArray = scalarType[width];
174 
175  // storage
176  vectorType _data;
177 
178  // ctors
179  inline sveInt32() = default;
180  inline sveInt32(const sveInt32 &rhs) = default;
181  inline sveInt32(const vectorType &rhs) : _data(rhs)
182  {
183  }
184  inline sveInt32(const scalarType rhs)
185  {
186  _data = svdup_s32(rhs);
187  }
188  explicit inline sveInt32(scalarArray &rhs)
189  {
190  _data = svld1(svptrue_b32(), rhs);
191  }
192 
193  // store packed
194  inline void store(scalarType *p) const
195  {
196  svst1(svptrue_b32(), p, _data);
197  }
198  // refer to x86_64 implementations
199  // sve has no requirements on alignment
200  // nevertheless we should accept valid tags for compatibility
201  template <typename TAG,
202  typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
203  inline void store(scalarType *p, TAG) const
204  {
205  svst1(svptrue_b32(), p, _data);
206  }
207 
208  // load packed
209  inline void load(const scalarType *p)
210  {
211  _data = svld1(svptrue_b32(), p);
212  }
213  // refer to x86_64 implementations
214  // sve has no requirements on alignment
215  // nevertheless we should accept valid tags for compatibility
216  template <typename TAG,
217  typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
218  inline void load(const scalarType *p, TAG)
219  {
220  _data = svld1(svptrue_b32(), p);
221  }
222 
223  // broadcast
224  inline void broadcast(const scalarType rhs)
225  {
226  _data = svdup(rhs);
227  }
228 
229  // subscript
230  // subscript operators are convenient but expensive
231  // should not be used in optimized kernels
232  inline scalarType operator[](size_t i) const
233  {
234  alignas(alignment) scalarArray tmp;
235  store(tmp, is_aligned);
236  return tmp[i];
237  }
238 
239  inline scalarType &operator[](size_t i)
240  {
241  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
242  return tmp[i];
243  }
244 
245  // unary ops
246  inline void operator+=(sveInt32 rhs)
247  {
248  _data = svadd_x(svptrue_b32(), _data, rhs._data);
249  }
250 
251  inline void operator-=(sveInt32 rhs)
252  {
253  _data = svsub_x(svptrue_b32(), _data, rhs._data);
254  }
255 
256  inline void operator*=(sveInt32 rhs)
257  {
258  _data = svmul_x(svptrue_b32(), _data, rhs._data);
259  }
260 
261  inline void operator/=(sveInt32 rhs)
262  {
263  _data = svdiv_x(svptrue_b32(), _data, rhs._data);
264  }
265 };
266 
267 template <typename T>
268 inline sveInt32<T> operator+(sveInt32<T> lhs, sveInt32<T> rhs)
269 {
270  return svadd_x(svptrue_b32(), lhs._data, rhs._data);
271 }
272 
273 template <typename T> inline sveInt32<T> operator+(sveInt32<T> lhs, T rhs)
274 {
275  return svadd_x(svptrue_b32(), lhs._data, sveInt32<T>(rhs)._data);
276 }
277 
278 template <typename T>
279 inline sveInt32<T> operator-(sveInt32<T> lhs, sveInt32<T> rhs)
280 {
281  return svsub_x(svptrue_b32(), lhs._data, rhs._data);
282 }
283 
284 template <typename T>
285 inline sveInt32<T> operator*(sveInt32<T> lhs, sveInt32<T> rhs)
286 {
287  return svmul_x(svptrue_b32(), lhs._data, rhs._data);
288 }
289 
290 template <typename T>
291 inline sveInt32<T> operator/(sveInt32<T> lhs, sveInt32<T> rhs)
292 {
293  return svdiv_x(svptrue_b32(), lhs._data, rhs._data);
294 }
295 
296 template <typename T> inline sveInt32<T> abs(sveInt32<T> in)
297 {
298  return svabs_x(svptrue_b32(), in._data);
299 }
300 
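// Editorial usage sketch (not part of the upstream header): the helper name
// below is hypothetical and only illustrates the sveInt32 API defined above.
inline sveInt32<std::int32_t> sve_example_shift_indices(const std::int32_t *idx,
                                                        std::int32_t shift)
{
    sveInt32<std::int32_t> v;
    v.load(idx);                        // packed load, no alignment requirement
    v += sveInt32<std::int32_t>(shift); // lane-wise add of a broadcast value
    return v;
}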
301 ////////////////////////////////////////////////////////////////////////////////
302 
303 template <typename T> struct sveInt64
304 {
305  static_assert(std::is_integral<T>::value && sizeof(T) == 8,
306  "8-byte integral type required.");
307 
308  static constexpr unsigned int alignment =
309  __ARM_FEATURE_SVE_BITS / sizeof(T);
310  static constexpr unsigned int width = alignment / 8;
311 
312  using scalarType = T;
313  using vectorType =
314  typename std::conditional<std::is_signed<T>::value, svint64_vlst_t,
315  svuint64_vlst_t>::type;
316  using scalarArray = scalarType[width];
317 
318  // storage
319  vectorType _data;
320 
321  // ctors
322  inline sveInt64() = default;
323  inline sveInt64(const sveInt64 &rhs) = default;
324  inline sveInt64(const vectorType &rhs) : _data(rhs)
325  {
326  }
327  inline sveInt64(const scalarType rhs)
328  {
329  _data = svdup_s64(rhs);
330  }
331  explicit inline sveInt64(scalarArray &rhs)
332  {
333  _data = svld1(svptrue_b64(), rhs);
334  }
335 
336  // store packed
337  inline void store(scalarType *p) const
338  {
339  svst1(svptrue_b64(), p, _data);
340  }
341  // refer to x86_64 implementations
342  // sve has no requirements on alignment
343  // nevertheless we should accept valid tags for compatibility
344  template <typename TAG,
345  typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
346  inline void store(scalarType *p, TAG) const
347  {
348  svst1(svptrue_b64(), p, _data);
349  }
350 
351  // load packed
352  inline void load(const scalarType *p)
353  {
354  _data = svld1(svptrue_b64(), p);
355  }
356  // refer to x86_64 implementations
357  // sve has no requirements on alignment
358  // nevertheless we should accept valid tags for compatibility
359  template <typename TAG,
360  typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
361  inline void load(const scalarType *p, TAG)
362  {
363  _data = svld1(svptrue_b64(), p);
364  }
365 
366  // load packed from 32 bit
367  template <typename I32,
368  typename std::enable_if<std::is_integral<I32>::value &&
369  std::is_signed<scalarType>::value &&
370  sizeof(I32) == 4,
371  bool>::type = 0>
372  inline void load(const I32 *p)
373  {
374  _data = svld1sw_s64(svptrue_b64(), p);
375  }
376  template <typename I32,
377  typename std::enable_if<std::is_integral<I32>::value &&
378  !std::is_signed<scalarType>::value &&
379  sizeof(I32) == 4,
380  bool>::type = 0>
381  inline void load(const I32 *p)
382  {
383  _data = svld1uw_u64(svptrue_b64(), p);
384  }
385  template <typename I32, typename TAG,
386  typename std::enable_if<
387  is_load_tag<TAG>::value && std::is_integral<I32>::value &&
388  std::is_signed<scalarType>::value && sizeof(I32) == 4,
389  bool>::type = 0>
390  inline void load(const I32 *p, TAG)
391  {
392  _data = svld1sw_s64(svptrue_b64(), p);
393  }
394  template <typename I32, typename TAG,
395  typename std::enable_if<
396  is_load_tag<TAG>::value && std::is_integral<I32>::value &&
397  !std::is_signed<scalarType>::value && sizeof(I32) == 4,
398  bool>::type = 0>
399  inline void load(const I32 *p, TAG)
400  {
401  _data = svld1uw_u64(svptrue_b64(), p);
402  }
403 
404  // broadcast
405  inline void broadcast(const scalarType rhs)
406  {
407  _data = svdup(rhs);
408  }
409 
410  // subscript
411  // subscript operators are convenient but expensive
412  // should not be used in optimized kernels
413  inline scalarType operator[](size_t i) const
414  {
415  alignas(alignment) scalarArray tmp;
416  store(tmp, is_aligned);
417  return tmp[i];
418  }
419 
420  // unary ops
421  inline void operator+=(sveInt64 rhs)
422  {
423  _data = svadd_x(svptrue_b64(), _data, rhs._data);
424  }
425 
426  inline void operator-=(sveInt64 rhs)
427  {
428  _data = svsub_x(svptrue_b64(), _data, rhs._data);
429  }
430 
431  inline void operator*=(sveInt64 rhs)
432  {
433  _data = svmul_x(svptrue_b64(), _data, rhs._data);
434  }
435 
436  inline void operator/=(sveInt64 rhs)
437  {
438  _data = svdiv_x(svptrue_b64(), _data, rhs._data);
439  }
440 };
441 
442 template <typename T>
443 inline sveInt64<T> operator+(sveInt64<T> lhs, sveInt64<T> rhs)
444 {
445  return svadd_x(svptrue_b64(), lhs._data, rhs._data);
446 }
447 
448 template <typename T> inline sveInt64<T> operator+(sveInt64<T> lhs, T rhs)
449 {
450  return svadd_x(svptrue_b64(), lhs._data, sveInt64<T>(rhs)._data);
451 }
452 
453 template <typename T>
454 inline sveInt64<T> operator-(sveInt64<T> lhs, sveInt64<T> rhs)
455 {
456  return svsub_x(svptrue_b64(), lhs._data, rhs._data);
457 }
458 
459 template <typename T>
460 inline sveInt64<T> operator*(sveInt64<T> lhs, sveInt64<T> rhs)
461 {
462  return svmul_x(svptrue_b64(), lhs._data, rhs._data);
463 }
464 
465 template <typename T>
466 inline sveInt64<T> operator/(sveInt64<T> lhs, sveInt64<T> rhs)
467 {
468  return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
469 }
470 
471 template <typename T> inline sveInt64<T> abs(sveInt64<T> in)
472 {
473  return svabs_x(svptrue_b64(), in._data);
474 }
475 
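// Editorial usage sketch (not part of the upstream header): a 64-bit index
// vector can be filled straight from packed 32-bit indices; the widening load
// above sign-extends each element (or zero-extends for unsigned scalar types).
// The helper name is hypothetical.
inline sveInt64<std::int64_t> sve_example_widen_indices(const std::int32_t *idx)
{
    sveInt64<std::int64_t> v;
    v.load(idx); // svld1sw_s64: widen each int32 to int64 on load
    return v;
}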
476 ////////////////////////////////////////////////////////////////////////////////
477 
478 struct sveFloat32
479 {
480  static constexpr unsigned int alignment =
481  __ARM_FEATURE_SVE_BITS / sizeof(float);
482  static constexpr unsigned int width = alignment / 8;
483 
484  using scalarType = float;
485  using scalarIndexType = std::uint32_t;
486  using vectorType = svfloat32_vlst_t;
487  using scalarArray = scalarType[width];
488 
489  // storage
490  vectorType _data;
491 
492  // ctors
493  inline sveFloat32() = default;
494  inline sveFloat32(const sveFloat32 &rhs) = default;
495  inline sveFloat32(const vectorType &rhs) : _data(rhs)
496  {
497  }
498  inline sveFloat32(const scalarType rhs)
499  {
500  _data = svdup_f32(rhs);
501  }
502 
503  // store packed
504  inline void store(scalarType *p) const
505  {
506  svst1_f32(svptrue_b32(), p, _data);
507  }
508  // refer to x86_64 implementations
509  // sve has no requirements on alignment
510  // nevertheless we should accept valid tags for compatibility
511  template <typename T,
512  typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
513  inline void store(scalarType *p, T) const
514  {
515  svst1_f32(svptrue_b32(), p, _data);
516  }
517 
518  // load packed
519  inline void load(const scalarType *p)
520  {
521  _data = svld1_f32(svptrue_b32(), p);
522  }
523  // refer to x86_64 implementations
524  // sve has no requirements on alignment
525  // nevertheless we should accept valid tags for compatibility
526  template <typename T,
527  typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
528  inline void load(const scalarType *p, T)
529  {
530  _data = svld1_f32(svptrue_b32(), p);
531  }
532 
533  // broadcast
534  inline void broadcast(const scalarType rhs)
535  {
536  _data = svdup_f32(rhs);
537  }
538 
539  // gather/scatter
540  template <typename T>
541  inline void gather(scalarType const *p, const sveInt32<T> &indices)
542  {
543  _data = svld1_gather_index(svptrue_b32(), p, indices._data);
544  }
545 
546  template <typename T>
547  inline void scatter(scalarType *out, const sveInt32<T> &indices) const
548  {
549  svst1_scatter_index(svptrue_b32(), out, indices._data, _data);
550  }
551 
552  // fma
553  // this = this + a * b
554  inline void fma(const sveFloat32 &a, const sveFloat32 &b)
555  {
556  _data = svmad_x(svptrue_b32(), a._data, b._data, _data);
557  }
558 
559  // subscript
560  // subscript operators are convenient but expensive
561  // should not be used in optimized kernels
562  inline scalarType operator[](size_t i) const
563  {
564  alignas(alignment) scalarArray tmp;
565  store(tmp, is_aligned);
566  return tmp[i];
567  }
568 
569  inline scalarType &operator[](size_t i)
570  {
571  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
572  return tmp[i];
573  }
574 
575  // unary ops
576  inline void operator+=(sveFloat32 rhs)
577  {
578  _data = svadd_x(svptrue_b32(), _data, rhs._data);
579  }
580 
581  inline void operator-=(sveFloat32 rhs)
582  {
583  _data = svsub_x(svptrue_b32(), _data, rhs._data);
584  }
585 
586  inline void operator*=(sveFloat32 rhs)
587  {
588  _data = svmul_x(svptrue_b32(), _data, rhs._data);
589  }
590 
591  inline void operator/=(sveFloat32 rhs)
592  {
593  _data = svdiv_x(svptrue_b32(), _data, rhs._data);
594  }
595 };
596 
597 inline sveFloat32 operator+(sveFloat32 lhs, sveFloat32 rhs)
598 {
599  return svadd_x(svptrue_b32(), lhs._data, rhs._data);
600 }
601 
602 inline sveFloat32 operator-(sveFloat32 lhs, sveFloat32 rhs)
603 {
604  return svsub_x(svptrue_b32(), lhs._data, rhs._data);
605 }
606 
607 inline sveFloat32 operator*(sveFloat32 lhs, sveFloat32 rhs)
608 {
609  return svmul_x(svptrue_b32(), lhs._data, rhs._data);
610 }
611 
612 inline sveFloat32 operator/(sveFloat32 lhs, sveFloat32 rhs)
613 {
614  return svdiv_x(svptrue_b32(), lhs._data, rhs._data);
615 }
616 
617 inline sveFloat32 sqrt(sveFloat32 in)
618 {
619  return svsqrt_x(svptrue_b32(), in._data);
620 }
621 
622 inline sveFloat32 abs(sveFloat32 in)
623 {
624  return svabs_x(svptrue_b32(), in._data);
625 }
626 
627 inline sveFloat32 log(sveFloat32 in)
628 {
629  // there is no SVE log intrinsic
630  // this scalar fallback is a dreadful implementation and simply a stop-gap measure
631  alignas(sveFloat32::alignment) sveFloat32::scalarArray tmp;
632  in.store(tmp);
633  for (size_t i = 0; i < sveFloat32::width; ++i)
634  {
635  tmp[i] = std::log(tmp[i]);
636  }
637  sveFloat32 ret;
638  ret.load(tmp);
639  return ret;
640 }
641 
642 inline void load_interleave(const float *in, std::uint32_t dataLen,
643  std::vector<sveFloat32, allocator<sveFloat32>> &out)
644 {
645 
646  alignas(sveFloat32::alignment)
647  sveFloat32::scalarIndexType tmp[sveFloat32::width] = {};
648 
649  // populate the scalar index array
650  // (its size is only known at compile time)
651  for (size_t i = 0; i < sveFloat32::width; ++i)
652  {
653  tmp[i] = i * dataLen;
654  }
655 
656  using index_t = sveInt32<sveFloat32::scalarIndexType>;
657  index_t index0(tmp);
658  index_t index1 = index0 + 1u;
659 
660  // 2x unrolled loop -- minimum width is 2
661  size_t nBlocks = dataLen / 2;
662  for (size_t i = 0; i < nBlocks; ++i)
663  {
664  out[2 * i + 0].gather(in, index0);
665  out[2 * i + 1].gather(in, index1);
666  index0 = index0 + 2u;
667  index1 = index1 + 2u;
668  }
669 
670  // spillover loop
671  for (size_t i = 2 * nBlocks; i < dataLen; ++i)
672  {
673  out[i].gather(in, index0);
674  index0 = index0 + 1u;
675  }
676 }
677 
678 inline void deinterleave_store(
679  const std::vector<sveFloat32, allocator<sveFloat32>> &in,
680  std::uint32_t dataLen, float *out)
681 {
682  alignas(sveFloat32::alignment)
683  sveFloat32::scalarIndexType tmp[sveFloat32::width] = {};
684 
685  // populate the scalar index array
686  // (its size is only known at compile time)
687  for (size_t i = 0; i < sveFloat32::width; ++i)
688  {
689  tmp[i] = i * dataLen;
690  }
691 
692  using index_t = sveInt32<sveFloat32::scalarIndexType>;
693  index_t index0(tmp);
694 
695  for (size_t i = 0; i < dataLen; ++i)
696  {
697  in[i].scatter(out, index0);
698  index0 = index0 + 1u;
699  }
700 }
701 
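// Editorial usage sketch (not part of the upstream header): load_interleave
// transposes sveFloat32::width consecutive blocks of dataLen scalars into
// dataLen vectors (one lane per block), and deinterleave_store reverses the
// transposition. The helper name and the assumption that `data` holds
// sveFloat32::width * dataLen floats are illustrative only.
inline void sve_example_scale_blocks(float *data, std::uint32_t dataLen)
{
    std::vector<sveFloat32, allocator<sveFloat32>> tmp(dataLen);
    load_interleave(data, dataLen, tmp);    // gather one lane from each block
    for (std::uint32_t i = 0; i < dataLen; ++i)
    {
        tmp[i] *= sveFloat32(2.0f);         // operate on all blocks at once
    }
    deinterleave_store(tmp, dataLen, data); // scatter the lanes back
}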
702 ////////////////////////////////////////////////////////////////////////////////
703 
704 struct sveFloat64
705 {
706  static constexpr unsigned int alignment =
707  __ARM_FEATURE_SVE_BITS / sizeof(double);
708  static constexpr unsigned int width = alignment / 8;
709 
710  using scalarType = double;
711  using scalarIndexType = std::uint64_t;
712  using vectorType = svfloat64_vlst_t;
713  using scalarArray = scalarType[width];
714 
715  // storage
716  vectorType _data;
717 
718  // ctors
719  inline sveFloat64() = default;
720  inline sveFloat64(const sveFloat64 &rhs) = default;
721  inline sveFloat64(const vectorType &rhs) : _data(rhs)
722  {
723  }
724  inline sveFloat64(const scalarType rhs)
725  {
726  _data = svdup_f64(rhs);
727  }
728 
729  // store packed
730  inline void store(scalarType *p) const
731  {
732  svst1_f64(svptrue_b64(), p, _data);
733  }
734  // refer to x86_64 implementations
735  // sve has no requirements on alignment
736  // nevertheless we should accept valid tags for compatibility
737  template <typename T,
738  typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
739  inline void store(scalarType *p, T) const
740  {
741  svst1_f64(svptrue_b64(), p, _data);
742  }
743 
744  // load packed
745  inline void load(const scalarType *p)
746  {
747  _data = svld1_f64(svptrue_b64(), p);
748  }
749  // refer to x86_64 implementations
750  // sve has no requirements on alignment
751  // nevertheless we should accept valid tags for compatibility
752  template <typename T,
753  typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
754  inline void load(const scalarType *p, T)
755  {
756  _data = svld1_f64(svptrue_b64(), p);
757  }
758 
759  // broadcast
760  inline void broadcast(const scalarType rhs)
761  {
762  _data = svdup_f64(rhs);
763  }
764 
765  // gather/scatter
766  template <typename T>
767  inline void gather(scalarType const *p, const sveInt64<T> &indices)
768  {
769  _data = svld1_gather_index(svptrue_b64(), p, indices._data);
770  }
771 
772  template <typename T>
773  inline void scatter(scalarType *out, const sveInt64<T> &indices) const
774  {
775  svst1_scatter_index(svptrue_b64(), out, indices._data, _data);
776  }
777 
778  // fma
779  // this = this + a * b
780  inline void fma(const sveFloat64 &a, const sveFloat64 &b)
781  {
782  _data = svmad_x(svptrue_b64(), a._data, b._data, _data);
783  }
784 
785  // subscript
786  // subscript operators are convenient but expensive
787  // should not be used in optimized kernels
788  inline scalarType operator[](size_t i) const
789  {
790  alignas(alignment) scalarArray tmp;
791  store(tmp, is_aligned);
792  return tmp[i];
793  }
794 
795  inline scalarType &operator[](size_t i)
796  {
797  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
798  return tmp[i];
799  }
800 
801  // unary ops
802  inline void operator+=(sveFloat64 rhs)
803  {
804  _data = svadd_x(svptrue_b64(), _data, rhs._data);
805  }
806 
807  inline void operator-=(sveFloat64 rhs)
808  {
809  _data = svsub_x(svptrue_b64(), _data, rhs._data);
810  }
811 
812  inline void operator*=(sveFloat64 rhs)
813  {
814  _data = svmul_x(svptrue_b64(), _data, rhs._data);
815  }
816 
817  inline void operator/=(sveFloat64 rhs)
818  {
819  _data = svdiv_x(svptrue_b64(), _data, rhs._data);
820  }
821 };
822 
823 inline sveFloat64 operator+(sveFloat64 lhs, sveFloat64 rhs)
824 {
825  return svadd_x(svptrue_b64(), lhs._data, rhs._data);
826 }
827 
828 inline sveFloat64 operator-(sveFloat64 lhs, sveFloat64 rhs)
829 {
830  return svsub_x(svptrue_b64(), lhs._data, rhs._data);
831 }
832 
833 inline sveFloat64 operator*(sveFloat64 lhs, sveFloat64 rhs)
834 {
835  return svmul_x(svptrue_b64(), lhs._data, rhs._data);
836 }
837 
838 inline sveFloat64 operator/(sveFloat64 lhs, sveFloat64 rhs)
839 {
840  return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
841 }
842 
843 inline sveFloat64 sqrt(sveFloat64 in)
844 {
845  return svsqrt_x(svptrue_b64(), in._data);
846 }
847 
848 inline sveFloat64 abs(sveFloat64 in)
849 {
850  return svabs_x(svptrue_b64(), in._data);
851 }
852 
853 inline sveFloat64 log(sveFloat64 in)
854 {
855  // there is no SVE log intrinsic
856  // this scalar fallback is a dreadful implementation and simply a stop-gap measure
857  alignas(sveFloat64::alignment) sveFloat64::scalarArray tmp;
858  in.store(tmp);
859  for (size_t i = 0; i < sveFloat64::width; ++i)
860  {
861  tmp[i] = std::log(tmp[i]);
862  }
863  sveFloat64 ret;
864  ret.load(tmp);
865  return ret;
866 }
867 
868 inline void load_interleave(const double *in, std::uint32_t dataLen,
869  std::vector<sveFloat64, allocator<sveFloat64>> &out)
870 {
871 
872  alignas(sveFloat64::alignment) size_t tmp[sveFloat64::width] = {};
873 
874  // populate the scalar index array
875  // (its size is only known at compile time)
876  for (size_t i = 0; i < sveFloat64::width; ++i)
877  {
878  tmp[i] = i * dataLen;
879  }
880 
881  using index_t = sveInt64<size_t>;
882  index_t index0(tmp);
883  index_t index1 = index0 + 1ul;
884 
885  // 2x unrolled loop -- minimum width is 2
886  size_t nBlocks = dataLen / 2;
887  for (size_t i = 0; i < nBlocks; ++i)
888  {
889  out[2 * i + 0].gather(in, index0);
890  out[2 * i + 1].gather(in, index1);
891  index0 = index0 + 2ul;
892  index1 = index1 + 2ul;
893  }
894 
895  // spillover loop
896  for (size_t i = 2 * nBlocks; i < dataLen; ++i)
897  {
898  out[i].gather(in, index0);
899  index0 = index0 + 1ul;
900  }
901 }
902 
903 inline void deinterleave_store(
904  const std::vector<sveFloat64, allocator<sveFloat64>> &in,
905  std::uint32_t dataLen, double *out)
906 {
907  alignas(sveFloat64::alignment) size_t tmp[sveFloat64::width] = {};
908 
909  // populate the scalar index array
910  // (its size is only known at compile time)
911  for (size_t i = 0; i < sveFloat64::width; ++i)
912  {
913  tmp[i] = i * dataLen;
914  }
915 
916  using index_t = sveInt64<size_t>;
917  index_t index0(tmp);
918 
919  for (size_t i = 0; i < dataLen; ++i)
920  {
921  in[i].scatter(out, index0);
922  index0 = index0 + 1ul;
923  }
924 }
925 
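// Editorial usage sketch (not part of the upstream header): a fused
// multiply-add over one vector's worth of doubles, y[i] += a[i] * b[i].
// The helper name and the assumption that each array holds at least
// sveFloat64::width elements are illustrative only.
inline void sve_example_fma(double *y, const double *a, const double *b)
{
    sveFloat64 va, vb, vy;
    va.load(a);
    vb.load(b);
    vy.load(y);
    vy.fma(va, vb); // vy = vy + va * vb
    vy.store(y);
}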
926 ////////////////////////////////////////////////////////////////////////////////
927 
928 // mask type
929 // mask is an integer type with special properties (broad boolean vector)
930 // broad boolean vectors are defined so that the only allowed values are:
931 // false = 0x0 and true = all bits set (0xFFFFFFFF / 0xFFFFFFFFFFFFFFFF)
932 //
933 // VERY LIMITED SUPPORT...just enough to make cubic eos work...
934 //
935 struct sveMask64 : sveInt64<std::uint64_t>
936 {
937  // bring in ctors
938  using sveInt64::sveInt64;
939 
940  static constexpr scalarType true_v = -1;
941  static constexpr scalarType false_v = 0;
942 };
943 
944 inline sveMask64 operator>(sveFloat64 lhs, sveFloat64 rhs)
945 {
946  // set mask
947  svbool_vlst_t mask = svcmpgt(svptrue_b64(), lhs._data, rhs._data);
948  // abuse set inactive to zero to convert
949  sveMask64::vectorType sveTrue_v = svdup_u64(sveMask64::true_v);
950  return svand_z(mask, sveTrue_v, sveTrue_v);
951 }
952 
953 // logical and
954 inline bool operator&&(sveMask64 lhs, bool rhs)
955 {
956  // set mask
957  sveMask64::vectorType sveFalse_v = svdup_u64(sveMask64::false_v);
958  svbool_vlst_t mask = svcmpne(svptrue_b64(), lhs._data, sveFalse_v);
959  // is any lane not equal to false (zero)?
960  bool tmp = svptest_any(svptrue_b64(), mask);
961  return tmp && rhs;
962 }
963 
964 ////////////////////////////////////////////////////////////////////////////////
965 
966 struct sveMask32 : sveInt32<std::uint32_t>
967 {
968  // bring in ctors
969  using sveInt32::sveInt32;
970 
971  static constexpr scalarType true_v = -1;
972  static constexpr scalarType false_v = 0;
973 };
974 
975 inline sveMask32 operator>(sveFloat32 lhs, sveFloat32 rhs)
976 {
977  // set mask
978  svbool_vlst_t mask = svcmpgt(svptrue_b32(), lhs._data, rhs._data);
979  // abuse set inactive to zero to convert
980  sveMask32::vectorType sveTrue_v = svdup_u32(sveMask32::true_v);
981  return svand_z(mask, sveTrue_v, sveTrue_v);
982 }
983 
984 // logical and
985 inline bool operator&&(sveMask32 lhs, bool rhs)
986 {
987  // set mask
988  sveMask32::vectorType sveFalse_v = svdup_u32(sveMask32::false_v);
989  svbool_vlst_t mask = svcmpne(svptrue_b32(), lhs._data, sveFalse_v);
990  // is any lane not equal to false (zero)?
991  bool tmp = svptest_any(svptrue_b32(), mask);
992  return tmp && rhs;
993 }
994 
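// Editorial usage sketch (not part of the upstream header): the mask support
// above is only meant for whole-vector tests; operator> produces a broad
// boolean vector and operator&& reduces it with "any lane true" semantics.
// The helper name is hypothetical.
inline bool sve_example_any_greater(sveFloat64 lhs, sveFloat64 rhs)
{
    sveMask64 m = lhs > rhs; // all-ones lanes where lhs > rhs, zero elsewhere
    return m && true;        // true if at least one lane compared greater
}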
995 #endif // __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)
996 
997 } // namespace tinysimd
998 #endif