Nektar++
avx2.hpp
///////////////////////////////////////////////////////////////////////////////
//
// File: avx2.hpp
//
// For more information, please see: http://www.nektar.info
//
// The MIT License
//
// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
// Department of Aeronautics, Imperial College London (UK), and Scientific
// Computing and Imaging Institute, University of Utah (USA).
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Description: Vector type using avx2 extension.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H

#include <immintrin.h>

#include <cmath>
#include <cstdint>
#include <type_traits>
#include <vector>

#include "allocator.hpp"
#include "traits.hpp"
#include "sse2.hpp"

namespace tinysimd
{

namespace abi
{

template <typename scalarType>
struct avx2
{
    using type = void;
};

} // namespace abi


#if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

// forward declaration of concrete types
template<typename T> struct avx2Int8;
template<typename T> struct avx2Long4;
struct avx2Double4;
struct avx2Mask;

namespace abi
{

// mapping between abstract types and concrete types
template <> struct avx2<double> { using type = avx2Double4; };
template <> struct avx2<std::int64_t> { using type = avx2Long4<std::int64_t>; };
template <> struct avx2<std::uint64_t> { using type = avx2Long4<std::uint64_t>; };
template <> struct avx2<bool> { using type = avx2Mask; };
#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
// these types are used for indices only
// they should be enabled only with avx512, otherwise they get selected by the
// wrapper instead of sse2Int4. The concrete type avx2Int8 remains available in
// case someone wants to use it explicitly
template <> struct avx2<std::int32_t> { using type = avx2Int8<std::int32_t>; };
template <> struct avx2<std::uint32_t> { using type = avx2Int8<std::uint32_t>; };
#endif

} // namespace abi

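// Usage sketch (added for illustration, not part of the original header):
// generic code selects the concrete vector type through the abi mapping, e.g.
//
//     using vec_t = tinysimd::abi::avx2<double>::type; // avx2Double4
//     static_assert(vec_t::width == 4, "four doubles per 256-bit register");
//     vec_t v(1.0); // broadcast constructor
//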
// concrete types
template<typename T>
struct avx2Int8
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int width = 8;
    static constexpr unsigned int alignment = 32;

    using scalarType = T;
    using vectorType = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Int8() = default;
    inline avx2Int8(const avx2Int8& rhs) = default;
    inline avx2Int8(const vectorType& rhs) : _data(rhs) {}
    inline avx2Int8(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }
    explicit inline avx2Int8(scalarArray& rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType*>(rhs));
    }

    // store
    inline void store(scalarType* p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType*>(p), _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value &&
            !is_streaming<flag>::value, bool
            >::type = 0
        >
    inline void store(scalarType* p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType*>(p), _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
            >::type = 0
        >
    inline void store(scalarType* p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType*>(p), _data);
    }

    inline void load(const scalarType* p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType*>(p));
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value &&
            !is_streaming<flag>::value, bool
            >::type = 0
        >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType*>(p));
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
            >::type = 0
        >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType*>(p));
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType& operator[](size_t i)
    {
        scalarType* tmp = reinterpret_cast<scalarType*>(&_data);
        return tmp[i];
    }

};

template<typename T>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
{
    return _mm256_add_epi32(lhs._data, rhs._data);
}

template<typename T, typename U, typename = typename std::enable_if<
    std::is_arithmetic<U>::value>::type>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
{
    return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
}

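// Usage sketch (added for illustration, not part of the original header): the
// unnamed flag argument of store/load is a tag type from traits.hpp (e.g.
// is_aligned) that selects the aligned or unaligned intrinsic at compile time:
//
//     alignas(avx2Int8<std::int32_t>::alignment)
//         std::int32_t buf[avx2Int8<std::int32_t>::width] = {};
//     avx2Int8<std::int32_t> idx(7);
//     idx.store(buf, is_aligned); // dispatches to _mm256_store_si256
//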
////////////////////////////////////////////////////////////////////////////////

template<typename T>
struct avx2Long4
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width = 4;
    static constexpr unsigned int alignment = 32;

    using scalarType = T;
    using vectorType = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Long4() = default;
    inline avx2Long4(const avx2Long4& rhs) = default;
    inline avx2Long4(const vectorType& rhs) : _data(rhs) {}
    inline avx2Long4(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }
    explicit inline avx2Long4(scalarArray& rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType*>(rhs));
    }

    // store
    inline void store(scalarType* p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType*>(p), _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value &&
            !is_streaming<flag>::value, bool
            >::type = 0
        >
    inline void store(scalarType* p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType*>(p), _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
            >::type = 0
        >
    inline void store(scalarType* p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType*>(p), _data);
    }

    inline void load(const scalarType* p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType*>(p));
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value &&
            !is_streaming<flag>::value, bool
            >::type = 0
        >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType*>(p));
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
            >::type = 0
        >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType*>(p));
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType& operator[](size_t i)
    {
        scalarType* tmp = reinterpret_cast<scalarType*>(&_data);
        return tmp[i];
    }

};

template<typename T>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
{
    return _mm256_add_epi64(lhs._data, rhs._data);
}

template<typename T, typename U, typename = typename std::enable_if<
    std::is_arithmetic<U>::value>::type>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
{
    return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
}

////////////////////////////////////////////////////////////////////////////////

struct avx2Double4
{
    static constexpr unsigned width = 4;
    static constexpr unsigned alignment = 32;

    using scalarType = double;
    using vectorType = __m256d;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Double4() = default;
    inline avx2Double4(const avx2Double4& rhs) = default;
    inline avx2Double4(const vectorType& rhs) : _data(rhs) {}
    inline avx2Double4(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

    // store
    inline void store(scalarType* p) const
    {
        _mm256_store_pd(p, _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value &&
            !is_streaming<flag>::value, bool
            >::type = 0
        >
    inline void store(scalarType* p, flag) const
    {
        _mm256_store_pd(p, _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
            >::type = 0
        >
    inline void store(scalarType* p, flag) const
    {
        _mm256_storeu_pd(p, _data);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_streaming<flag>::value, bool
            >::type = 0
        >
    inline void store(scalarType* p, flag) const
    {
        _mm256_stream_pd(p, _data);
    }

    // load packed
    inline void load(const scalarType* p)
    {
        _data = _mm256_load_pd(p);
    }

    template
    <
        class flag,
        typename std::enable_if<
            is_requiring_alignment<flag>::value, bool
            >::type = 0
        >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm256_load_pd(p);
    }

    template
    <
        class flag,
        typename std::enable_if<
            !is_requiring_alignment<flag>::value, bool
            >::type = 0
        >
    inline void load(const scalarType* p, flag)
    {
        _data = _mm256_loadu_pd(p);
    }

    // load random
    inline void load(scalarType const* a, scalarType const* b,
                     scalarType const* c, scalarType const* d)
    {
        __m128d t1, t2, t3, t4;
        __m256d t5;
        t1 = _mm_load_sd(a);             // SSE2
        t2 = _mm_loadh_pd(t1, b);        // SSE2
        t3 = _mm_load_sd(c);             // SSE2
        t4 = _mm_loadh_pd(t3, d);        // SSE2
        t5 = _mm256_castpd128_pd256(t2); // cast __m128d -> __m256d
        _data = _mm256_insertf128_pd(t5, t4, 1);
    }

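    // Note (added for clarity): the four-pointer overload above fills the
    // register lane by lane, i.e. _data = { *a, *b, *c, *d }; it is the
    // building block for loading values that are not contiguous in memory.
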
    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

    // gather/scatter with sse2
    template <typename T>
    inline void gather(scalarType const* p, const sse2Int4<T>& indices)
    {
        _data = _mm256_i32gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType* out, const sse2Int4<T>& indices) const
    {
        // no scatter intrinsics for AVX2
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
        out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
    }

    // gather/scatter with avx2
    template <typename T>
    inline void gather(scalarType const* p, const avx2Long4<T>& indices)
    {
        _data = _mm256_i64gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType* out, const avx2Long4<T>& indices) const
    {
        // no scatter intrinsics for AVX2
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
    }

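    // Usage sketch (added for illustration, not part of the original header),
    // assuming in/out point to double arrays of sufficient size: gather reads
    // _data[i] = p[indices[i]] and scatter writes out[indices[i]] = _data[i]:
    //
    //     alignas(32) std::uint64_t offsets[4] = {0, 8, 16, 24};
    //     avx2Long4<std::uint64_t> idx(offsets);
    //     avx2Double4 v;
    //     v.gather(in, idx);   // v = { in[0], in[8], in[16], in[24] }
    //     v.scatter(out, idx); // writes v back to the same four locations
    //
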
    // fma
    // this = this + a * b
    inline void fma(const avx2Double4& a, const avx2Double4& b)
    {
        _data = _mm256_fmadd_pd(a._data, b._data, _data);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType& operator[](size_t i)
    {
        scalarType* tmp = reinterpret_cast<scalarType*>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx2Double4 rhs)
    {
        _data = _mm256_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx2Double4 rhs)
    {
        _data = _mm256_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx2Double4 rhs)
    {
        _data = _mm256_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx2Double4 rhs)
    {
        _data = _mm256_div_pd(_data, rhs._data);
    }

};

inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_add_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_sub_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_mul_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_div_pd(lhs._data, rhs._data);
}

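// Usage sketch (added for illustration, not part of the original header): the
// free operators and the fma member compose per lane, e.g.
//
//     avx2Double4 a(2.0), b(3.0), c(1.0);
//     avx2Double4 d = (a + b) * c - a / b; // element-wise arithmetic
//     c.fma(a, b);                         // c = c + a * b in one instruction
//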
inline avx2Double4 sqrt(avx2Double4 in)
{
    return _mm256_sqrt_pd(in._data);
}

inline avx2Double4 abs(avx2Double4 in)
{
    // there is no avx2 _mm256_abs_pd intrinsic
    static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
    return _mm256_andnot_pd(sign_mask, in._data);         // !sign_mask & x
}

inline avx2Double4 log(avx2Double4 in)
{
    // there is no avx2 log intrinsic
    // this is a dreadful implementation and is simply a stop gap measure
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    avx2Double4 ret;
    ret.load(tmp);
    return ret;
}

inline void load_interleave(
    const double* in,
    size_t dataLen,
    std::vector<avx2Double4, allocator<avx2Double4>> &out)
{
    size_t nBlocks = dataLen / 4;

    alignas(32) size_t tmp[4] = {0, dataLen, 2*dataLen, 3*dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[4*i + 0].gather(in, index0);
        out[4*i + 1].gather(in, index1);
        out[4*i + 2].gather(in, index2);
        out[4*i + 3].gather(in, index3);
        index0 = index0 + 4;
        index1 = index1 + 4;
        index2 = index2 + 4;
        index3 = index3 + 4;
    }

    // spillover loop
    for (size_t i = 4 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}

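// Note (added for clarity): load_interleave transposes four scalar fields of
// length dataLen, stored back to back in 'in', into dataLen SIMD vectors in
// 'out', so that out[i][j] = in[j * dataLen + i]; the gather indices advance
// by one element per iteration while keeping a stride of dataLen between
// lanes.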

inline void deinterleave_store(
    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
    size_t dataLen,
    double *out)
{
    // size_t nBlocks = dataLen / 4;

    alignas(32) size_t tmp[4] = {0, dataLen, 2*dataLen, 3*dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}

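// Usage sketch (added for illustration, not part of the original header; the
// buffer names are hypothetical): deinterleave_store inverts load_interleave,
// so the pair round-trips a block of four interleaved fields:
//
//     constexpr size_t dataLen = 16;
//     std::vector<double> in(4 * dataLen), back(4 * dataLen);
//     std::vector<avx2Double4, allocator<avx2Double4>> work(dataLen);
//     load_interleave(in.data(), dataLen, work);      // scalar -> SIMD lanes
//     deinterleave_store(work, dataLen, back.data()); // SIMD lanes -> scalar
//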

////////////////////////////////////////////////////////////////////////////////

// mask type
// mask is an int type with special properties (broad boolean vector)
// broad boolean vectors: the defined and allowed values are
// false=0x0 and true=0xFFFFFFFF
//
// VERY LIMITED SUPPORT...just enough to make cubic eos work...
//
struct avx2Mask : avx2Long4<std::uint64_t>
{
    // bring in ctors
    using avx2Long4::avx2Long4;

    static constexpr scalarType true_v = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask operator>(avx2Double4 lhs, avx2Double4 rhs)
{
    // predicate 1 == _CMP_LT_OS, i.e. rhs < lhs
    return _mm256_castpd_si256(_mm256_cmp_pd(rhs._data, lhs._data, 1));
}

inline bool operator&&(avx2Mask lhs, bool rhs)
{
    bool tmp = _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask::true_v));

    return tmp && rhs;
}

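// Usage sketch (added for illustration, not part of the original header): the
// mask is consumed through operator&&, which AND-reduces the lanes, e.g.
//
//     avx2Double4 x(2.0), y(1.0);
//     bool allGreater = ((x > y) && true); // true only if every lane of x > y
//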


#endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

} // namespace tinysimd

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H