Nektar++
avx512.hpp
///////////////////////////////////////////////////////////////////////////////
//
// File: avx512.hpp
//
// For more information, please see: http://www.nektar.info
//
// The MIT License
//
// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
// Department of Aeronautics, Imperial College London (UK), and Scientific
// Computing and Imaging Institute, University of Utah (USA).
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Description: Vector type using avx512 extension.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif
#include "allocator.hpp"
#include "avx2.hpp"
#include "traits.hpp"
#include <vector>

namespace tinysimd
{

namespace abi
{

template <typename scalarType> struct avx512
{
    using type = void;
};

} // namespace abi

#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

// forward declaration of concrete types
template <typename T> struct avx512Long8;
struct avx512Double8;
struct avx512Mask;

namespace abi
{

// mapping between abstract types and concrete types
template <> struct avx512<double>
{
    using type = avx512Double8;
};
template <> struct avx512<std::int64_t>
{
    using type = avx512Long8<std::int64_t>;
};
template <> struct avx512<std::uint64_t>
{
    using type = avx512Long8<std::uint64_t>;
};
template <> struct avx512<bool>
{
    using type = avx512Mask;
};

} // namespace abi
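
// Illustrative note (not from the original header): one way downstream code
// can resolve the concrete vector type is through the abi mapping above,
// rather than naming avx512Double8 directly. A minimal sketch; the alias
// name vec_t is our own:
//
//     using vec_t = typename tinysimd::abi::avx512<double>::type;
//     static_assert(std::is_same<vec_t, avx512Double8>::value, "");
//     static_assert(vec_t::width == 8, "8 doubles per 512-bit register");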

// concrete types, could add enable if to allow only unsigned long and long...
template <typename T> struct avx512Long8
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType = T;
    using vectorType = __m512i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx512Long8() = default;
    inline avx512Long8(const avx512Long8 &rhs) = default;
    inline avx512Long8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Long8(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }
    explicit inline avx512Long8(scalarArray &rhs)
    {
        _data = _mm512_load_epi64(rhs);
    }

    // store
    inline void store(scalarType *p) const
    {
        _mm512_store_epi64(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_epi64(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_epi64(p, _data);
    }

    inline void load(const scalarType *p)
    {
        _data = _mm512_load_epi64(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_epi64(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        // even though the Intel intrinsics manual lists
        // __m512i _mm512_loadu_epi64 (void const* mem_addr)
        // it is not implemented in some compilers (gcc);
        // since this is a bitwise load with no extension,
        // the following instruction is equivalent
        _data = _mm512_loadu_si512(p);
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }
};

template <typename T>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
{
    return _mm512_add_epi64(lhs._data, rhs._data);
}

template <
    typename T, typename U,
    typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
{
    return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
}
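
// Illustrative sketch (not from the original header): building an index
// vector from an aligned scalarArray and advancing it with the scalar
// operator+ overload above, in the same way load_interleave() does further
// down. The variable names are our own.
//
//     alignas(avx512Long8<std::uint64_t>::alignment)
//         std::uint64_t idx[avx512Long8<std::uint64_t>::width] = {0, 1, 2, 3,
//                                                                 4, 5, 6, 7};
//     avx512Long8<std::uint64_t> index(idx); // aligned load via scalarArray ctor
//     index = index + 8;            // vector + scalar: add 8 to every lane
//     index.store(idx, is_aligned); // idx is now {8, 9, ..., 15}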

////////////////////////////////////////////////////////////////////////////////

struct avx512Double8
{
    static constexpr unsigned int width = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType = double;
    using vectorType = __m512d;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx512Double8() = default;
    inline avx512Double8(const avx512Double8 &rhs) = default;
    inline avx512Double8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Double8(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }

    // store
    inline void store(scalarType *p) const
    {
        _mm512_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_pd(p, _data);
    }

    template <class flag, typename std::enable_if<is_streaming<flag>::value,
                                                  bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_stream_pd(p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_loadu_pd(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }

    // gather/scatter
    // template <typename T>
    // inline void gather(scalarType const* p, const avx2Int8<T>& indices)
    // {
    //     _data = _mm512_i32gather_pd(indices._data, p, 8);
    // }

    // template <typename T>
    // inline void scatter(scalarType* out, const avx2Int8<T>& indices) const
    // {
    //     _mm512_i32scatter_pd(out, indices._data, _data, 8);
    // }

    template <typename T>
    inline void gather(scalarType const *p, const avx512Long8<T> &indices)
    {
        _data = _mm512_i64gather_pd(indices._data, p, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
    {
        _mm512_i64scatter_pd(out, indices._data, _data, 8);
    }

    // fma
    // this = this + a * b
    inline void fma(const avx512Double8 &a, const avx512Double8 &b)
    {
        _data = _mm512_fmadd_pd(a._data, b._data, _data);
    }

    // subscript
    // subscript operators are convenient but expensive;
    // they should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx512Double8 rhs)
    {
        _data = _mm512_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx512Double8 rhs)
    {
        _data = _mm512_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx512Double8 rhs)
    {
        _data = _mm512_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx512Double8 rhs)
    {
        _data = _mm512_div_pd(_data, rhs._data);
    }
};
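
// Illustrative sketch (not from the original header): the usual pattern is to
// load/store through the flag-dispatched overloads and accumulate with fma();
// operator[] is only meant for inspection. Variable names are our own.
//
//     avx512Double8 acc(0.0), a(2.0), b(3.0);
//     acc.fma(a, b);          // acc = acc + a * b via a fused multiply-add
//     double first = acc[0];  // == 6.0, but avoid this in hot loops
//
//     alignas(avx512Double8::alignment) double buf[avx512Double8::width];
//     acc.store(buf, is_aligned); // picks the aligned, non-streaming overload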

inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_add_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_sub_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_mul_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_div_pd(lhs._data, rhs._data);
}

inline avx512Double8 sqrt(avx512Double8 in)
{
    return _mm512_sqrt_pd(in._data);
}

inline avx512Double8 abs(avx512Double8 in)
{
    return _mm512_abs_pd(in._data);
}

inline avx512Double8 log(avx512Double8 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm512_log_pd(in._data);
#else
    // there is no avx512 log intrinsic
    // this is a dreadful implementation and is simply a stop gap measure
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    tmp[4] = std::log(tmp[4]);
    tmp[5] = std::log(tmp[5]);
    tmp[6] = std::log(tmp[6]);
    tmp[7] = std::log(tmp[7]);
    avx512Double8 ret;
    ret.load(tmp);
    return ret;
#endif
}
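
// Illustrative sketch (not from the original header): the free operators and
// math functions above all act lane-wise, so scalar expressions carry over
// unchanged. Values below are our own example data.
//
//     avx512Double8 x(4.0), y(2.0);
//     avx512Double8 r = sqrt(x) / y + abs(avx512Double8(-3.0)); // every lane == 4.0
//     avx512Double8 l = log(x); // SVML intrinsic if available, scalar fallback otherwise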

inline void load_interleave(
    const double *in, size_t dataLen,
    std::vector<avx512Double8, allocator<avx512Double8>> &out)
{

    alignas(avx512Double8::alignment) size_t tmp[avx512Double8::width] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};

    using index_t = avx512Long8<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    constexpr uint16_t unrl = 4;
    size_t nBlocks = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }

    // spillover loop
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}
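
// Illustrative note (not from the original header): load_interleave()
// transposes `width` back-to-back scalar blocks of length dataLen into
// dataLen vectors, i.e. out[i][j] == in[j * dataLen + i]. A minimal usage
// sketch, with names of our own choosing:
//
//     std::vector<avx512Double8, allocator<avx512Double8>> w(dataLen);
//     load_interleave(in, dataLen, w); // w[i] holds element i of all 8 blocks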

inline void deinterleave_store(
    const std::vector<avx512Double8, allocator<avx512Double8>> &in,
    size_t dataLen, double *out)
{
    // size_t nBlocks = dataLen / 4;

    alignas(avx512Double8::alignment) size_t tmp[avx512Double8::width] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx512Long8<size_t>;
    index_t index0(tmp);
    // index_t index1 = index0 + 1;
    // index_t index2 = index0 + 2;
    // index_t index3 = index0 + 3;

    // // 4x unrolled loop
    // for (size_t i = 0; i < nBlocks; ++i)
    // {
    //     in[i].scatter(out, index0);
    //     in[i+1].scatter(out, index1);
    //     in[i+2].scatter(out, index2);
    //     in[i+3].scatter(out, index3);
    //     index0 = index0 + 4;
    //     index1 = index1 + 4;
    //     index2 = index2 + 4;
    //     index3 = index3 + 4;
    // }

    // // spillover loop
    // for (size_t i = 4 * nBlocks; i < dataLen; ++i)
    // {
    //     in[i].scatter(out, index0);
    //     index0 = index0 + 1;
    // }

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
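
// Illustrative note (not from the original header): deinterleave_store() is
// the inverse of load_interleave(), writing out[j * dataLen + i] = in[i][j],
// so the two calls below round-trip the data. Names are our own.
//
//     std::vector<avx512Double8, allocator<avx512Double8>> w(dataLen);
//     load_interleave(in, dataLen, w);
//     deinterleave_store(w, dataLen, out); // out now matches in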

////////////////////////////////////////////////////////////////////////////////

// mask type
// mask is an int type with special properties (broad boolean vector)
// broad boolean vectors defined and allowed values are:
// false=0x0 and true=0xFFFFFFFF
//
// VERY LIMITED SUPPORT...just enough to make cubic eos work...
//
struct avx512Mask : avx512Long8<std::uint64_t>
{
    // bring in ctors
    using avx512Long8::avx512Long8;

    static constexpr scalarType true_v = -1;
    static constexpr scalarType false_v = 0;
};

inline avx512Mask operator>(avx512Double8 lhs, avx512Double8 rhs)
{
    __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
    return _mm512_maskz_set1_epi64(mask, avx512Mask::true_v);
}

inline bool operator&&(avx512Mask lhs, bool rhs)
{
    __m512i val_true = _mm512_set1_epi64(avx512Mask::true_v);
    __mmask8 mask = _mm512_test_epi64_mask(lhs._data, val_true);
    unsigned int tmp = _cvtmask16_u32(mask);
    return tmp && rhs;
}
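
// Illustrative sketch (not from the original header): the mask support above
// is meant for branch decisions of the form "does any lane satisfy the
// comparison (and is the extra flag set)?", e.g. in the cubic EOS kernels.
// Values are our own example data.
//
//     avx512Double8 a(1.0), b(2.0);
//     avx512Mask anyGreater = (b > a); // every lane true here
//     if (anyGreater && true)          // true when at least one lane is set
//     {
//         // take the slow path for the whole vector
//     }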

#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

} // namespace tinysimd
#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H