Nektar++
avx512.hpp
///////////////////////////////////////////////////////////////////////////////
//
// File: avx512.hpp
//
// For more information, please see: http://www.nektar.info
//
// The MIT License
//
// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
// Department of Aeronautics, Imperial College London (UK), and Scientific
// Computing and Imaging Institute, University of Utah (USA).
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Description: Vector type using AVX512 extension.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H

#include <immintrin.h>
#include <cmath>
#include <cstdint>
#include <type_traits>
#include <vector>
#include "allocator.hpp"
#include "traits.hpp"
#include "avx2.hpp"

namespace tinysimd
{

namespace abi
{

template <typename scalarType>
struct avx512
{
    using type = void;
};

} // namespace abi


#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

// forward declaration of concrete types
template<typename T> struct avx512Long8;
struct avx512Double8;
struct avx512Mask;

namespace abi
{

// mapping between abstract types and concrete types
template <> struct avx512<double> { using type = avx512Double8; };
template <> struct avx512<std::int64_t> { using type = avx512Long8<std::int64_t>; };
template <> struct avx512<std::uint64_t> { using type = avx512Long8<std::uint64_t>; };
template <> struct avx512<bool> { using type = avx512Mask; };

} // namespace abi

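// Usage sketch (illustrative only, not part of this header): the abi trait
// above maps an abstract scalar type to the concrete vector type, e.g.
//
//     using vec_t = tinysimd::abi::avx512<double>::type;   // avx512Double8
//     static_assert(vec_t::width == 8, "8 doubles per 512-bit register");
//     alignas(vec_t::alignment) double buf[vec_t::width] = {};
//     vec_t v;
//     v.load(buf);    // aligned packed load
//     v.store(buf);   // aligned packed store
//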
// concrete types; an enable_if could restrict this to long and unsigned long only
template<typename T>
struct avx512Long8
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 8,
        "8-byte integral type required.");

    static constexpr unsigned int width = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType = T;
    using vectorType = __m512i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx512Long8() = default;
    inline avx512Long8(const avx512Long8& rhs) = default;
    inline avx512Long8(const vectorType& rhs) : _data(rhs){}
    inline avx512Long8(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }
    explicit inline avx512Long8(scalarArray& rhs)
    {
        _data = _mm512_load_epi64(rhs);
    }

    // store
    inline void store(scalarType* p) const
    {
        _mm512_store_epi64(p, _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value &&
            !is_streaming<flag>::value, bool
        >::type = 0
    >
    inline void store(scalarType* p, flag) const
    {
        _mm512_store_epi64(p, _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
        >::type = 0
    >
    inline void store(scalarType* p, flag) const
    {
        _mm512_storeu_epi64(p, _data);
    }

    inline void load(const scalarType* p)
    {
        _data = _mm512_load_epi64(p);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value &&
            !is_streaming<flag>::value, bool
        >::type = 0
    >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm512_load_epi64(p);
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
        >::type = 0
    >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm512_loadu_epi64(p);
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType& operator[](size_t i)
    {
        scalarType* tmp = reinterpret_cast<scalarType*>(&_data);
        return tmp[i];
    }

};

template<typename T>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
{
    return _mm512_add_epi64(lhs._data, rhs._data);
}

template<typename T, typename U, typename = typename std::enable_if<
    std::is_arithmetic<U>::value>::type>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
{
    return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
}
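
// Usage sketch (illustrative only): avx512Long8 mostly serves as a vector of
// gather/scatter indices, built from a broadcast value and advanced with the
// scalar operator+ above, e.g.
//
//     using index_t = avx512Long8<std::uint64_t>;
//     index_t idx(std::uint64_t(0));   // broadcast 0 to all eight lanes
//     index_t next = idx + 1;          // per-lane increment: all lanes hold 1
//     idx.broadcast(16);               // all eight lanes now hold 16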

////////////////////////////////////////////////////////////////////////////////

struct avx512Double8
{
    static constexpr unsigned int width = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType = double;
    using vectorType = __m512d;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx512Double8() = default;
    inline avx512Double8(const avx512Double8& rhs) = default;
    inline avx512Double8(const vectorType& rhs) : _data(rhs){}
    inline avx512Double8(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }

    // store
    inline void store(scalarType* p) const
    {
        _mm512_store_pd(p, _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value &&
            !is_streaming<flag>::value, bool
        >::type = 0
    >
    inline void store(scalarType* p, flag) const
    {
        _mm512_store_pd(p, _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
        >::type = 0
    >
    inline void store(scalarType* p, flag) const
    {
        _mm512_storeu_pd(p, _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_streaming<flag>::value, bool
        >::type = 0
    >
    inline void store(scalarType* p, flag) const
    {
        _mm512_stream_pd(p, _data);
    }

    // load packed
    inline void load(const scalarType* p)
    {
        _data = _mm512_load_pd(p);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value, bool
        >::type = 0
    >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm512_load_pd(p);
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
        >::type = 0
    >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm512_loadu_pd(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }

    // gather/scatter

    template <typename T>
    inline void gather(scalarType const* p, const avx2Int8<T>& indices)
    {
        _data = _mm512_i32gather_pd(indices._data, p, 8);
    }

    template <typename T>
    inline void scatter(scalarType* out, const avx2Int8<T>& indices) const
    {
        _mm512_i32scatter_pd(out, indices._data, _data, 8);
    }

    template <typename T>
    inline void gather(scalarType const* p, const avx512Long8<T>& indices)
    {
        _data = _mm512_i64gather_pd(indices._data, p, 8);
    }

    template <typename T>
    inline void scatter(scalarType* out, const avx512Long8<T>& indices) const
    {
        _mm512_i64scatter_pd(out, indices._data, _data, 8);
    }

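    // Worked example (comment only; p, q, tmp and n are hypothetical caller
    // names): with indices = {0, n, 2n, ..., 7n}, gather reads p[0], p[n],
    // ..., p[7n] into the eight lanes and scatter writes them back out; the
    // scale factor 8 turns each index into a byte offset for double-sized
    // elements.
    //
    //     avx512Long8<std::uint64_t> idx(tmp);   // tmp = {0, n, ..., 7n}
    //     avx512Double8 v;
    //     v.gather(p, idx);    // lane j reads p[j*n]
    //     v.scatter(q, idx);   // lane j writes q[j*n]
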
    // fma
    // this = this + a * b
    inline void fma(const avx512Double8& a, const avx512Double8& b)
    {
        _data = _mm512_fmadd_pd(a._data, b._data, _data);
    }
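
    // Worked example: with 1.0 in every lane of this, 2.0 in a and 3.0 in b,
    // fma(a, b) leaves 1.0 + 2.0 * 3.0 = 7.0 in every lane, computed with a
    // single rounding (fused multiply-add).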


    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    // (see the usage sketch after this struct)
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType& operator[](size_t i)
    {
        scalarType* tmp = reinterpret_cast<scalarType*>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx512Double8 rhs)
    {
        _data = _mm512_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx512Double8 rhs)
    {
        _data = _mm512_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx512Double8 rhs)
    {
        _data = _mm512_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx512Double8 rhs)
    {
        _data = _mm512_div_pd(_data, rhs._data);
    }

};

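// Usage sketch (illustrative only): prefer one bulk store() over repeated
// subscripting when reading results back, e.g.
//
//     avx512Double8 a(1.0), b(2.0);
//     avx512Double8 c = a * b + a;   // per-lane arithmetic, 3.0 in every lane
//     alignas(avx512Double8::alignment) double buf[avx512Double8::width];
//     c.store(buf);                  // one aligned store instead of 8 operator[] calls
//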
inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_add_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_sub_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_mul_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_div_pd(lhs._data, rhs._data);
}

inline avx512Double8 sqrt(avx512Double8 in)
{
    return _mm512_sqrt_pd(in._data);
}

inline avx512Double8 abs(avx512Double8 in)
{
    return _mm512_abs_pd(in._data);
}

inline avx512Double8 log(avx512Double8 in)
{
    // there is no AVX512 log intrinsic;
    // this is a dreadful implementation and is simply a stop-gap measure
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    tmp[4] = std::log(tmp[4]);
    tmp[5] = std::log(tmp[5]);
    tmp[6] = std::log(tmp[6]);
    tmp[7] = std::log(tmp[7]);
    avx512Double8 ret;
    ret.load(tmp);
    return ret;
}

inline void load_interleave(
    const double* in,
    size_t dataLen,
    std::vector<avx512Double8, allocator<avx512Double8>> &out)
{
    alignas(avx512Double8::alignment) size_t tmp[avx512Double8::width] =
        {0, dataLen, 2*dataLen, 3*dataLen, 4*dataLen, 5*dataLen, 6*dataLen,
        7*dataLen};

    using index_t = avx512Long8<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    size_t nBlocks = dataLen / 4;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[4*i + 0].gather(in, index0);
        out[4*i + 1].gather(in, index1);
        out[4*i + 2].gather(in, index2);
        out[4*i + 3].gather(in, index3);
        index0 = index0 + 4;
        index1 = index1 + 4;
        index2 = index2 + 4;
        index3 = index3 + 4;
    }

    // spillover loop
    for (size_t i = 4 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}

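// Worked example: with dataLen = 3 and in pointing at eight length-3 arrays
// stored back to back, {a0,a1,a2, b0,b1,b2, ..., h0,h1,h2}, the gathers above
// produce out[0] = {a0,b0,...,h0}, out[1] = {a1,b1,...,h1} and
// out[2] = {a2,b2,...,h2}: one vector per scalar index, one lane per array.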

inline void deinterleave_store(
    const std::vector<avx512Double8, allocator<avx512Double8>> &in,
    size_t dataLen,
    double *out)
{
#if 0
    double *out0 = out;
    double *out1 = out + dataLen;
    double *out2 = out + 2 * dataLen;
    double *out3 = out + 3 * dataLen;
    double *out4 = out + 4 * dataLen;
    double *out5 = out + 5 * dataLen;
    double *out6 = out + 6 * dataLen;
    double *out7 = out + 7 * dataLen;

    for (size_t i = 0; i < dataLen; ++i)
    {
        out0[i] = in[i][0];
        out1[i] = in[i][1];
        out2[i] = in[i][2];
        out3[i] = in[i][3];
        out4[i] = in[i][4];
        out5[i] = in[i][5];
        out6[i] = in[i][6];
        out7[i] = in[i][7];
    }
#else

    // size_t nBlocks = dataLen / 4;

    alignas(avx512Double8::alignment) size_t tmp[avx512Double8::width] =
        {0, dataLen, 2*dataLen, 3*dataLen, 4*dataLen, 5*dataLen, 6*dataLen,
        7*dataLen};
    using index_t = avx512Long8<size_t>;
    index_t index0(tmp);
    // index_t index1 = index0 + 1;
    // index_t index2 = index0 + 2;
    // index_t index3 = index0 + 3;

    // // 4x unrolled loop
    // for (size_t i = 0; i < nBlocks; ++i)
    // {
    //     in[i].scatter(out, index0);
    //     in[i+1].scatter(out, index1);
    //     in[i+2].scatter(out, index2);
    //     in[i+3].scatter(out, index3);
    //     index0 = index0 + 4;
    //     index1 = index1 + 4;
    //     index2 = index2 + 4;
    //     index3 = index3 + 4;
    // }

    // // spillover loop
    // for (size_t i = 4 * nBlocks; i < dataLen; ++i)
    // {
    //     in[i].scatter(out, index0);
    //     index0 = index0 + 1;
    // }

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
#endif

}

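// Usage sketch (illustrative only; in, out and dataLen are hypothetical
// caller-side names): load_interleave and deinterleave_store are inverses,
// so a typical kernel looks like
//
//     std::vector<avx512Double8, allocator<avx512Double8>> work(dataLen);
//     load_interleave(in, dataLen, work);      // gather 8 arrays into lanes
//     for (auto &w : work) { w.fma(w, w); }    // any per-lane arithmetic
//     deinterleave_store(work, dataLen, out);  // scatter back to the original layout
//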
////////////////////////////////////////////////////////////////////////////////

// mask type
// AVX512 supports a narrow boolean type, i.e. a bitwise mask: __mmask8
//
// VERY LIMITED SUPPORT...just enough to make the cubic EOS work...
// NOT TESTED
//
struct avx512Mask
{
    static constexpr unsigned int width = 1;
    static constexpr unsigned int alignment = 8;

    using scalarType = bool;
    using vectorType = __mmask8;
    using scalarArray = std::uint8_t;

    // storage
    vectorType _data;

    // true false type
    static constexpr scalarType true_v = true;
    static constexpr scalarType false_v = false;

    // ctors
    inline avx512Mask() = default;
    inline avx512Mask(const avx512Mask& rhs) = default;
    inline avx512Mask(const vectorType& rhs) : _data(rhs){}
    inline avx512Mask(const scalarType rhs)
    {
        // broadcast the boolean to all 8 mask bits
        _data = static_cast<vectorType>(rhs ? 0xFF : 0x00);
    }
    explicit inline avx512Mask(scalarArray& rhs)
    {
        _data = _load_mask8(reinterpret_cast<vectorType*>(&rhs));
    }

    // load
    inline void load(scalarArray* p)
    {
        _data = _load_mask8(reinterpret_cast<vectorType*>(p));
    }

};

inline avx512Mask operator>(avx512Double8 lhs, avx512Double8 rhs)
{
    // rhs < lhs is equivalent to lhs > rhs
    return _mm512_cmp_pd_mask(rhs._data, lhs._data, _CMP_LT_OS);
}

inline bool operator&&(avx512Mask lhs, bool rhs)
{
    // _ktestc returns 1 only if every bit set in mask_true is also set in lhs,
    // i.e. every lane compared true
    avx512Mask::vectorType mask_true = 0xFF;
    bool tmp = _ktestc_mask8_u8(lhs._data, _load_mask8(&mask_true));
    return tmp && rhs;
}

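// Usage sketch (illustrative only, matching the limited support above):
//
//     avx512Double8 x(2.0), y(1.0);
//     avx512Mask m = x > y;   // per-lane comparison producing a bitwise mask
//     if (m && true)          // true only when every lane compared true
//     {
//         // all lanes of x exceed y
//     }
//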
#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

} // namespace tinysimd
#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H