Nektar++
sve.hpp
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: sve.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Vector type using Armv8 Scalable Vector Extension (SVE).
32 //
33 ///////////////////////////////////////////////////////////////////////////////
34 
35 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H
36 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H
37 
38 #if defined(__ARM_FEATURE_SVE)
39 #include <arm_acle.h>
40 #include <arm_sve.h>
41 #endif
42 
43 #include "allocator.hpp"
44 #include "traits.hpp"
45 #include <vector>
46 
47 namespace tinysimd
48 {
49 
50 namespace abi
51 {
52 
53 template <typename scalarType> struct sve
54 {
55  using type = void;
56 };
57 
58 } // namespace abi
59 
60 // requires clang >= 12.0.0 or gcc >= 10
61 // requires -msve-vector-bits=<length>
62 #if __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)
63 // convert from VLA (sizeless) to VLST (fixed-length) types:
64 // C++ does not allow sizeless SVE types as class data members,
65 // so we force a known vector size at compile time
66 typedef svfloat64_t svfloat64_vlst_t
67  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
68 typedef svint64_t svint64_vlst_t
69  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
70 typedef svuint64_t svuint64_vlst_t
71  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
72 typedef svbool_t svbool_vlst_t
73  __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
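// Illustrative note: the arm_sve_vector_bits attribute turns the sizeless
// (VLA) ACLE types into fixed-length (VLST) types whose size is known at
// compile time, which is what allows them to appear as class data members
// below. For instance, the following check would hold in this translation
// unit:
//
//   static_assert(sizeof(svfloat64_vlst_t) == __ARM_FEATURE_SVE_BITS / 8,
//                 "VLST types have a fixed, known size");
//
// VLA and VLST values of the same element type convert implicitly, so the
// results of the ACLE intrinsics used below can be assigned directly to the
// fixed-length members.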
74 
75 // forward declaration of concrete types
76 template <typename T> struct sveLong;
77 struct sveDouble;
78 struct sveMask;
79 
80 namespace abi
81 {
82 
83 // mapping between abstract types and concrete types
84 template <> struct sve<double>
85 {
86  using type = sveDouble;
87 };
88 template <> struct sve<std::int64_t>
89 {
90  using type = sveLong<std::int64_t>;
91 };
92 template <> struct sve<std::uint64_t>
93 {
94  using type = sveLong<std::uint64_t>;
95 };
96 template <> struct sve<bool>
97 {
98  using type = sveMask;
99 };
100 
101 } // namespace abi
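// Illustrative sketch of how this mapping is meant to be consumed: the traits
// layer resolves the abstract abi tag to a concrete vector type when this
// backend is enabled, and to void otherwise, e.g.
//
//   using vec_t = typename tinysimd::abi::sve<double>::type; // sveDouble here
//   static_assert(std::is_same<vec_t, tinysimd::sveDouble>::value,
//                 "double maps to sveDouble when SVE is enabled");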
102 
103 // concrete types; sveLong could use enable_if, but the static_assert below already restricts T to 64-bit integral types
104 template <typename T> struct sveLong
105 {
106  static_assert(std::is_integral<T>::value && sizeof(T) == 8,
107  "8-byte integral type required.");
108 
109  static constexpr unsigned int alignment =
110  __ARM_FEATURE_SVE_BITS / sizeof(T);
111  static constexpr unsigned int width = alignment / 8;
112 
113  using scalarType = T;
114  using vectorType =
115  typename std::conditional<std::is_signed<T>::value, svint64_vlst_t,
116  svuint64_vlst_t>::type;
117  using scalarArray = scalarType[width];
118 
119  // storage
120  vectorType _data;
121 
122  // ctors
123  inline sveLong() = default;
124  inline sveLong(const sveLong &rhs) = default;
125  inline sveLong(const vectorType &rhs) : _data(rhs)
126  {
127  }
128  inline sveLong(const scalarType rhs)
129  {
130  _data = svdup(rhs); // overloaded svdup handles both signed and unsigned T
131  }
132  explicit inline sveLong(scalarArray &rhs)
133  {
134  _data = svld1(svptrue_b64(), rhs);
135  }
136 
137  // store packed
138  inline void store(scalarType *p) const
139  {
140  svst1(svptrue_b64(), p, _data);
141  }
142  // unlike the x86_64 implementations, SVE has no alignment
143  // requirements on loads/stores; we nevertheless accept valid
144  // alignment tags for interface compatibility
145  template <typename TAG,
146  typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
147  inline void store(scalarType *p, TAG) const
148  {
149  svst1(svptrue_b64(), p, _data);
150  }
151 
152  // load packed
153  inline void load(const scalarType *p)
154  {
155  _data = svld1(svptrue_b64(), p);
156  }
157  // unlike the x86_64 implementations, SVE has no alignment
158  // requirements on loads/stores; we nevertheless accept valid
159  // alignment tags for interface compatibility
160  template <typename TAG,
161  typename std::enable_if<is_load_tag<TAG>::value, bool>::type = 0>
162  inline void load(const scalarType *p, TAG)
163  {
164  _data = svld1(svptrue_b64(), p);
165  }
166 
167  // broadcast
168  inline void broadcast(const scalarType rhs)
169  {
170  _data = svdup(rhs);
171  }
172 
173  // subscript
174  // subscript operators are convenient but expensive
175  // should not be used in optimized kernels
176  inline scalarType operator[](size_t i) const
177  {
178  alignas(alignment) scalarArray tmp;
179  store(tmp, is_aligned);
180  return tmp[i];
181  }
182 
183  // compound assignment operators
184  inline void operator+=(sveLong rhs)
185  {
186  _data = svadd_x(svptrue_b64(), _data, rhs._data);
187  }
188 
189  inline void operator-=(sveLong rhs)
190  {
191  _data = svsub_x(svptrue_b64(), _data, rhs._data);
192  }
193 
194  inline void operator*=(sveLong rhs)
195  {
196  _data = svmul_x(svptrue_b64(), _data, rhs._data);
197  }
198 
199  inline void operator/=(sveLong rhs)
200  {
201  _data = svdiv_x(svptrue_b64(), _data, rhs._data);
202  }
203 };
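// Illustrative usage sketch for sveLong (assumes compilation with
// -msve-vector-bits=<N> so that width and alignment are fixed):
//
//   using vec_t = tinysimd::sveLong<std::int64_t>;
//   alignas(vec_t::alignment) std::int64_t buf[vec_t::width] = {};
//   vec_t a(2);                  // broadcast 2 into every lane
//   a += vec_t(3);               // lane-wise add: every lane now holds 5
//   a.store(buf);                // packed store (alignment tag optional)
//   std::int64_t lane0 = a[0];   // convenient but slow, avoid in hot kernels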
204 
205 template <typename T>
206 inline sveLong<T> operator+(sveLong<T> lhs, sveLong<T> rhs)
207 {
208  return svadd_x(svptrue_b64(), lhs._data, rhs._data);
209 }
210 
211 template <typename T> inline sveLong<T> operator+(sveLong<T> lhs, T rhs)
212 {
213  return svadd_x(svptrue_b64(), lhs._data, sveLong<T>(rhs)._data);
214 }
215 
216 template <typename T>
217 inline sveLong<T> operator-(sveLong<T> lhs, sveLong<T> rhs)
218 {
219  return svsub_x(svptrue_b64(), lhs._data, rhs._data);
220 }
221 
222 template <typename T>
223 inline sveLong<T> operator*(sveLong<T> lhs, sveLong<T> rhs)
224 {
225  return svmul_x(svptrue_b64(), lhs._data, rhs._data);
226 }
227 
228 template <typename T>
229 inline sveLong<T> operator/(sveLong<T> lhs, sveLong<T> rhs)
230 {
231  return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
232 }
233 
234 template <typename T> inline sveLong<T> abs(sveLong<T> in)
235 {
236  return svabs_x(svptrue_b64(), in._data);
237 }
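// The free operators above return new vectors and can be chained; a minimal
// sketch with every lane holding the same value for clarity:
//
//   tinysimd::sveLong<std::int64_t> x(6), y(3);
//   auto sum  = x + y;       // every lane: 9
//   auto prod = x * y;       // every lane: 18
//   auto quot = prod / y;    // every lane: 6
//   auto mag  = abs(y - x);  // every lane: 3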
238 
239 ////////////////////////////////////////////////////////////////////////////////
240 
241 struct sveDouble
242 {
243  static constexpr unsigned int alignment =
244  __ARM_FEATURE_SVE_BITS / sizeof(double);
245  static constexpr unsigned int width = alignment / 8;
246 
247  using scalarType = double;
248  using vectorType = svfloat64_vlst_t;
249  using scalarArray = scalarType[width];
250 
251  // storage
252  vectorType _data;
253 
254  // ctors
255  inline sveDouble() = default;
256  inline sveDouble(const sveDouble &rhs) = default;
257  inline sveDouble(const vectorType &rhs) : _data(rhs)
258  {
259  }
260  inline sveDouble(const scalarType rhs)
261  {
262  _data = svdup_f64(rhs);
263  }
264 
265  // store packed
266  inline void store(scalarType *p) const
267  {
268  svst1_f64(svptrue_b64(), p, _data);
269  }
270  // unlike the x86_64 implementations, SVE has no alignment
271  // requirements on loads/stores; we nevertheless accept valid
272  // alignment tags for interface compatibility
273  template <typename T,
274  typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
275  inline void store(scalarType *p, T) const
276  {
277  svst1_f64(svptrue_b64(), p, _data);
278  }
279 
280  // load packed
281  inline void load(const scalarType *p)
282  {
283  _data = svld1_f64(svptrue_b64(), p);
284  }
285  // unlike the x86_64 implementations, SVE has no alignment
286  // requirements on loads/stores; we nevertheless accept valid
287  // alignment tags for interface compatibility
288  template <typename T,
289  typename std::enable_if<is_load_tag<T>::value, bool>::type = 0>
290  inline void load(const scalarType *p, T)
291  {
292  _data = svld1_f64(svptrue_b64(), p);
293  }
294 
295  // broadcast
296  inline void broadcast(const scalarType rhs)
297  {
298  _data = svdup_f64(rhs);
299  }
300 
301  // gather/scatter
302  template <typename T>
303  inline void gather(scalarType const *p, const sveLong<T> &indices)
304  {
305  _data = svld1_gather_index(svptrue_b64(), p, indices._data);
306  }
307 
308  template <typename T>
309  inline void scatter(scalarType *out, const sveLong<T> &indices) const
310  {
311  svst1_scatter_index(svptrue_b64(), out, indices._data, _data);
312  }
313 
314  // fma
315  // this = this + a * b
316  inline void fma(const sveDouble &a, const sveDouble &b)
317  {
318  _data = svmad_x(svptrue_b64(), a._data, b._data, _data);
319  }
320 
321  // subscript
322  // subscript operators are convenient but expensive
323  // should not be used in optimized kernels
324  inline scalarType operator[](size_t i) const
325  {
326  alignas(alignment) scalarArray tmp;
327  store(tmp, is_aligned);
328  return tmp[i];
329  }
330 
331  // compound assignment operators
332  inline void operator+=(sveDouble rhs)
333  {
334  _data = svadd_x(svptrue_b64(), _data, rhs._data);
335  }
336 
337  inline void operator-=(sveDouble rhs)
338  {
339  _data = svsub_x(svptrue_b64(), _data, rhs._data);
340  }
341 
342  inline void operator*=(sveDouble rhs)
343  {
344  _data = svmul_x(svptrue_b64(), _data, rhs._data);
345  }
346 
347  inline void operator/=(sveDouble rhs)
348  {
349  _data = svdiv_x(svptrue_b64(), _data, rhs._data);
350  }
351 };
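// Illustrative usage sketch for sveDouble: fma accumulates in place, while
// gather/scatter address memory through a 64-bit integer index vector
// (dataPtr below is a hypothetical pointer to at least 2 * width doubles):
//
//   tinysimd::sveDouble acc(1.0), a(2.0), b(3.0);
//   acc.fma(a, b);                       // acc = acc + a * b -> every lane 7.0
//
//   using idx_t = tinysimd::sveLong<std::uint64_t>;
//   alignas(idx_t::alignment) std::uint64_t offsets[idx_t::width];
//   for (unsigned int i = 0; i < idx_t::width; ++i)
//   {
//       offsets[i] = 2 * i;              // every other element
//   }
//   idx_t idx(offsets);
//   tinysimd::sveDouble v;
//   v.gather(dataPtr, idx);              // v[i] = dataPtr[2 * i]
//   v.scatter(dataPtr, idx);             // write lanes back to the same slots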
352 
353 inline sveDouble operator+(sveDouble lhs, sveDouble rhs)
354 {
355  return svadd_x(svptrue_b64(), lhs._data, rhs._data);
356 }
357 
358 inline sveDouble operator-(sveDouble lhs, sveDouble rhs)
359 {
360  return svsub_x(svptrue_b64(), lhs._data, rhs._data);
361 }
362 
363 inline sveDouble operator*(sveDouble lhs, sveDouble rhs)
364 {
365  return svmul_x(svptrue_b64(), lhs._data, rhs._data);
366 }
367 
368 inline sveDouble operator/(sveDouble lhs, sveDouble rhs)
369 {
370  return svdiv_x(svptrue_b64(), lhs._data, rhs._data);
371 }
372 
373 inline sveDouble sqrt(sveDouble in)
374 {
375  return svsqrt_x(svptrue_b64(), in._data);
376 }
377 
378 inline sveDouble abs(sveDouble in)
379 {
380  return svabs_x(svptrue_b64(), in._data);
381 }
382 
383 inline sveDouble log(sveDouble in)
384 {
385  // there is no SVE log intrinsic
386  // this scalar fallback is a stop-gap measure until a vectorised log is available
387  alignas(sveDouble::alignment) sveDouble::scalarArray tmp;
388  in.store(tmp);
389  for (size_t i = 0; i < sveDouble::width; ++i)
390  {
391  tmp[i] = std::log(tmp[i]);
392  }
393  sveDouble ret;
394  ret.load(tmp);
395  return ret;
396 }
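// Example of the scalar fallback above (correct, but considerably slower than
// the vectorised arithmetic):
//
//   tinysimd::sveDouble x(1.0);
//   tinysimd::sveDouble y = log(x);   // every lane holds std::log(1.0) == 0.0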
397 
398 inline void load_interleave(const double *in, size_t dataLen,
399  std::vector<sveDouble, allocator<sveDouble>> &out)
400 {
401 
402  alignas(sveDouble::alignment) size_t tmp[sveDouble::width] = {};
403 
404  // populate the scalar index array
405  // (width is known at compile time, values depend on dataLen)
406  for (size_t i = 0; i < sveDouble::width; ++i)
407  {
408  tmp[i] = i * dataLen;
409  }
410 
411  using index_t = sveLong<size_t>;
412  index_t index0(tmp);
413  index_t index1 = index0 + 1ul;
414 
415  // 2x unrolled loop -- minimum width is 2
416  size_t nBlocks = dataLen / 2;
417  for (size_t i = 0; i < nBlocks; ++i)
418  {
419  out[2 * i + 0].gather(in, index0);
420  out[2 * i + 1].gather(in, index1);
421  index0 = index0 + 2ul;
422  index1 = index1 + 2ul;
423  }
424 
425  // spillover loop
426  for (size_t i = 2 * nBlocks; i < dataLen; ++i)
427  {
428  out[i].gather(in, index0);
429  index0 = index0 + 1ul;
430  }
431 }
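// Illustrative sketch of the layout transposition performed by load_interleave,
// assuming width == 4 and dataLen == 3:
//
//   in  = { a0, a1, a2,   b0, b1, b2,   c0, c1, c2,   d0, d1, d2 }
//
//   out[0] = { a0, b0, c0, d0 }
//   out[1] = { a1, b1, c1, d1 }
//   out[2] = { a2, b2, c2, d2 }
//
// A hypothetical call site, mirroring the signature above:
//
//   std::vector<sveDouble, allocator<sveDouble>> blocks(dataLen);
//   load_interleave(in, dataLen, blocks);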
432 
433 inline void deinterleave_store(
434  const std::vector<sveDouble, allocator<sveDouble>> &in, size_t dataLen,
435  double *out)
436 {
437  alignas(sveDouble::alignment) size_t tmp[sveDouble::width] = {};
438 
439  // populate the scalar index array
440  // (width is known at compile time, values depend on dataLen)
441  for (size_t i = 0; i < sveDouble::width; ++i)
442  {
443  tmp[i] = i * dataLen;
444  }
445 
446  using index_t = sveLong<size_t>;
447  index_t index0(tmp);
448 
449  for (size_t i = 0; i < dataLen; ++i)
450  {
451  in[i].scatter(out, index0);
452  index0 = index0 + 1ul;
453  }
454 }
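// deinterleave_store is the inverse operation: each vector in 'in' is
// scattered back to the block layout sketched above, so that (hypothetically)
//
//   deinterleave_store(blocks, dataLen, out);
//
// reconstructs the original scalar array from the interleaved vectors.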
455 
456 ////////////////////////////////////////////////////////////////////////////////
457 
458 // mask type
459 // the mask is an integer type with special properties (broad boolean vector)
460 // the only allowed lane values are:
461 // false = 0x0 and true = all bits set (0xFFFFFFFFFFFFFFFF)
462 //
463 // VERY LIMITED SUPPORT...just enough to make the cubic EOS work...
464 //
465 struct sveMask : sveLong<std::uint64_t>
466 {
467  // bring in ctors
468  using sveLong::sveLong;
469 
470  static constexpr scalarType true_v = -1;
471  static constexpr scalarType false_v = 0;
472 };
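// A minimal sketch of the mask representation: every lane holds either
// sveMask::true_v (all bits set) or sveMask::false_v (zero):
//
//   tinysimd::sveMask m(tinysimd::sveMask::true_v);   // broadcast "true"
//   std::uint64_t lane0 = m[0];                       // == sveMask::true_v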
473 
474 inline sveMask operator>(sveDouble lhs, sveDouble rhs)
475 {
476  // set mask
477  svbool_vlst_t mask = svcmpgt(svptrue_b64(), lhs._data, rhs._data);
478  // use zeroing predication to convert the predicate into a broad boolean vector
479  sveMask::vectorType sveTrue_v = svdup_u64(sveMask::true_v);
480  return svand_z(mask, sveTrue_v, sveTrue_v);
481 }
482 
483 // logical and
484 inline bool operator&&(sveMask lhs, bool rhs)
485 {
486  // set mask
487  sveMask::vectorType sveFalse_v = svdup_u64(sveMask::false_v);
488  svbool_vlst_t mask = svcmpne(svptrue_b64(), lhs._data, sveFalse_v);
489  // is any lane non-zero (i.e. true)?
490  bool tmp = svptest_any(svptrue_b64(), mask);
491  return tmp && rhs;
492 }
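// Typical use (illustrative): take a scalar branch if any lane satisfies the
// comparison, as needed e.g. by the cubic EOS kernels mentioned above:
//
//   tinysimd::sveDouble rho(1.0), rhoMin(1e-12);
//   if (rho > rhoMin && true)
//   {
//       // at least one lane of rho exceeds rhoMin
//   }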
493 
494 #endif // __ARM_FEATURE_SVE_BITS > 0 && defined(NEKTAR_ENABLE_SIMD_SVE)
495 
496 } // namespace tinysimd
497 #endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_SVE_H