Nektar++
avx2.hpp
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: avx2.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Vector types using the AVX2 extension.
32 //
33 ///////////////////////////////////////////////////////////////////////////////
34 
35 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
36 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
37 
38 #if defined(__x86_64__)
39 #include <immintrin.h>
40 #if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41 #define TINYSIMD_HAS_SVML
42 #endif
43 #endif
44 #include "allocator.hpp"
45 #include "sse2.hpp"
46 #include "traits.hpp"
47 #include <cmath>
48 #include <vector>
49 
50 namespace tinysimd
51 {
52 
53 namespace abi
54 {
55 
56 template <typename scalarType, int width = 0> struct avx2
57 {
58  using type = void;
59 };
60 
61 } // namespace abi
62 
63 #if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
64 
65 // forward declaration of concrete types
66 template <typename T> struct avx2Long4;
67 template <typename T> struct avx2Int8;
68 struct avx2Double4;
69 struct avx2Float8;
70 struct avx2Mask4;
71 struct avx2Mask8;
72 
73 namespace abi
74 {
75 
76 // mapping between abstract types and concrete floating point types
77 template <> struct avx2<double>
78 {
79  using type = avx2Double4;
80 };
81 template <> struct avx2<float>
82 {
83  using type = avx2Float8;
84 };
85 // generic index mapping
86 // assumes index type width same as floating point type
87 template <> struct avx2<std::int64_t>
88 {
89  using type = avx2Long4<std::int64_t>;
90 };
91 template <> struct avx2<std::uint64_t>
92 {
93  using type = avx2Long4<std::uint64_t>;
94 };
95 #if defined(__APPLE__)
96 template <> struct avx2<std::size_t>
97 {
98  using type = avx2Long4<std::size_t>;
99 };
100 #endif
101 template <> struct avx2<std::int32_t>
102 {
103  using type = avx2Int8<std::int32_t>;
104 };
105 template <> struct avx2<std::uint32_t>
106 {
107  using type = avx2Int8<std::uint32_t>;
108 };
109 // specialized index mapping
110 template <> struct avx2<std::int64_t, 4>
111 {
112  using type = avx2Long4<std::int64_t>;
113 };
114 template <> struct avx2<std::uint64_t, 4>
115 {
116  using type = avx2Long4<std::uint64_t>;
117 };
118 #if defined(__APPLE__)
119 template <> struct avx2<std::size_t, 4>
120 {
121  using type = avx2Long4<std::size_t>;
122 };
123 #endif
124 template <> struct avx2<std::int32_t, 4>
125 {
126  using type = sse2Int4<std::int32_t>;
127 };
128 template <> struct avx2<std::uint32_t, 4>
129 {
130  using type = sse2Int4<std::uint32_t>;
131 };
132 template <> struct avx2<std::int32_t, 8>
133 {
134  using type = avx2Int8<std::int32_t>;
135 };
136 template <> struct avx2<std::uint32_t, 8>
137 {
138  using type = avx2Int8<std::uint32_t>;
139 };
140 // bool mapping
141 template <> struct avx2<bool, 4>
142 {
143  using type = avx2Mask4;
144 };
145 template <> struct avx2<bool, 8>
146 {
147  using type = avx2Mask8;
148 };
149 
150 } // namespace abi
151 
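Given the specializations above, the abstract-to-concrete mapping can be checked at compile time. A minimal sketch, assuming the AVX2 branch is active and std::is_same is reachable through traits.hpp:

static_assert(std::is_same<abi::avx2<double>::type, avx2Double4>::value,
              "double maps to the 4-wide AVX2 vector");
static_assert(std::is_same<abi::avx2<std::uint32_t, 4>::type,
                           sse2Int4<std::uint32_t>>::value,
              "4-wide 32-bit indices fall back to the SSE2 vector");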
152 // concrete types
153 template <typename T> struct avx2Int8
154 {
155  static_assert(std::is_integral<T>::value && sizeof(T) == 4,
156  "4-byte integral type required.");
157 
158  static constexpr unsigned int width = 8;
159  static constexpr unsigned int alignment = 32;
160 
161  using scalarType = T;
162  using vectorType = __m256i;
163  using scalarArray = scalarType[width];
164 
165  // storage
166  vectorType _data;
167 
168  // ctors
169  inline avx2Int8() = default;
170  inline avx2Int8(const avx2Int8 &rhs) = default;
171  inline avx2Int8(const vectorType &rhs) : _data(rhs)
172  {
173  }
174  inline avx2Int8(const scalarType rhs)
175  {
176  _data = _mm256_set1_epi32(rhs);
177  }
178  explicit inline avx2Int8(scalarArray &rhs)
179  {
180  _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
181  }
182 
183  // copy assignment
184  inline avx2Int8 &operator=(const avx2Int8 &) = default;
185 
186  // store
187  inline void store(scalarType *p) const
188  {
189  _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
190  }
191 
192  template <class flag,
193  typename std::enable_if<is_requiring_alignment<flag>::value &&
194  !is_streaming<flag>::value,
195  bool>::type = 0>
196  inline void store(scalarType *p, flag) const
197  {
198  _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
199  }
200 
201  template <class flag,
202  typename std::enable_if<!is_requiring_alignment<flag>::value,
203  bool>::type = 0>
204  inline void store(scalarType *p, flag) const
205  {
206  _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
207  }
208 
209  inline void load(const scalarType *p)
210  {
211  _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
212  }
213 
214  template <class flag,
215  typename std::enable_if<is_requiring_alignment<flag>::value &&
216  !is_streaming<flag>::value,
217  bool>::type = 0>
218  inline void load(const scalarType *p, flag)
219  {
220  _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
221  }
222 
223  template <class flag,
224  typename std::enable_if<!is_requiring_alignment<flag>::value,
225  bool>::type = 0>
226  inline void load(const scalarType *p, flag)
227  {
228  _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
229  }
230 
231  inline void broadcast(const scalarType rhs)
232  {
233  _data = _mm256_set1_epi32(rhs);
234  }
235 
236  // subscript
237  // subscript operators are convenient but expensive
238  // should not be used in optimized kernels
239  inline scalarType operator[](size_t i) const
240  {
241  alignas(alignment) scalarArray tmp;
242  store(tmp, is_aligned);
243  return tmp[i];
244  }
245 
246  inline scalarType &operator[](size_t i)
247  {
248  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
249  return tmp[i];
250  }
251 };
252 
253 template <typename T>
254 inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
255 {
256  return _mm256_add_epi32(lhs._data, rhs._data);
257 }
258 
259 template <
260  typename T, typename U,
261  typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
262 inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
263 {
264  return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
265 }
266 
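A minimal usage sketch for the 8-lane integer vector (the helper name is illustrative): broadcast a scalar, add a scalar to every lane through the arithmetic overload above, and read the result back through an aligned store.

inline std::uint32_t avx2Int8_sum_sketch()
{
    avx2Int8<std::uint32_t> base(100u);      // broadcast 100 into all 8 lanes
    avx2Int8<std::uint32_t> sum = base + 8u; // scalar overload: add 8 per lane
    alignas(avx2Int8<std::uint32_t>::alignment) std::uint32_t tmp[8];
    sum.store(tmp, is_aligned);              // aligned store into scratch
    return tmp[0];                           // every lane now holds 108
}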
267 ////////////////////////////////////////////////////////////////////////////////
268 
269 template <typename T> struct avx2Long4
270 {
271  static_assert(std::is_integral<T>::value && sizeof(T) == 8,
272  "8-byte integral type required.");
273 
274  static constexpr unsigned int width = 4;
275  static constexpr unsigned int alignment = 32;
276 
277  using scalarType = T;
278  using vectorType = __m256i;
279  using scalarArray = scalarType[width];
280 
281  // storage
282  vectorType _data;
283 
284  // ctors
285  inline avx2Long4() = default;
286  inline avx2Long4(const avx2Long4 &rhs) = default;
287  inline avx2Long4(const vectorType &rhs) : _data(rhs)
288  {
289  }
290  inline avx2Long4(const scalarType rhs)
291  {
292  _data = _mm256_set1_epi64x(rhs);
293  }
294  explicit inline avx2Long4(scalarArray &rhs)
295  {
296  _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
297  }
298 
299  // copy assignment
300  inline avx2Long4 &operator=(const avx2Long4 &) = default;
301 
302  // store
303  inline void store(scalarType *p) const
304  {
305  _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
306  }
307 
308  template <class flag,
309  typename std::enable_if<is_requiring_alignment<flag>::value &&
310  !is_streaming<flag>::value,
311  bool>::type = 0>
312  inline void store(scalarType *p, flag) const
313  {
314  _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
315  }
316 
317  template <class flag,
318  typename std::enable_if<!is_requiring_alignment<flag>::value,
319  bool>::type = 0>
320  inline void store(scalarType *p, flag) const
321  {
322  _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
323  }
324 
325  inline void load(const scalarType *p)
326  {
327  _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
328  }
329 
330  template <class flag,
331  typename std::enable_if<is_requiring_alignment<flag>::value &&
332  !is_streaming<flag>::value,
333  bool>::type = 0>
334  inline void load(const scalarType *p, flag)
335  {
336  _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
337  }
338 
339  template <class flag,
340  typename std::enable_if<!is_requiring_alignment<flag>::value,
341  bool>::type = 0>
342  inline void load(const scalarType *p, flag)
343  {
344  _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
345  }
346 
347  inline void broadcast(const scalarType rhs)
348  {
349  _data = _mm256_set1_epi64x(rhs);
350  }
351 
352  // subscript
353  // subscript operators are convenient but expensive
354  // should not be used in optimized kernels
355  inline scalarType operator[](size_t i) const
356  {
357  alignas(alignment) scalarArray tmp;
358  store(tmp, is_aligned);
359  return tmp[i];
360  }
361 
362  inline scalarType &operator[](size_t i)
363  {
364  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
365  return tmp[i];
366  }
367 };
368 
369 template <typename T>
370 inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
371 {
372  return _mm256_add_epi64(lhs._data, rhs._data);
373 }
374 
375 template <
376  typename T, typename U,
377  typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
378 inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
379 {
380  return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
381 }
382 
383 ////////////////////////////////////////////////////////////////////////////////
384 
385 struct avx2Double4
386 {
387  static constexpr unsigned width = 4;
388  static constexpr unsigned alignment = 32;
389 
390  using scalarType = double;
391  using scalarIndexType = std::uint64_t;
392  using vectorType = __m256d;
393  using scalarArray = scalarType[width];
394 
395  // storage
396  vectorType _data;
397 
398  // ctors
399  inline avx2Double4() = default;
400  inline avx2Double4(const avx2Double4 &rhs) = default;
401  inline avx2Double4(const vectorType &rhs) : _data(rhs)
402  {
403  }
404  inline avx2Double4(const scalarType rhs)
405  {
406  _data = _mm256_set1_pd(rhs);
407  }
408 
409  // copy assignment
410  inline avx2Double4 &operator=(const avx2Double4 &) = default;
411 
412  // store
413  inline void store(scalarType *p) const
414  {
415  _mm256_store_pd(p, _data);
416  }
417 
418  template <class flag,
419  typename std::enable_if<is_requiring_alignment<flag>::value &&
420  !is_streaming<flag>::value,
421  bool>::type = 0>
422  inline void store(scalarType *p, flag) const
423  {
424  _mm256_store_pd(p, _data);
425  }
426 
427  template <class flag,
428  typename std::enable_if<!is_requiring_alignment<flag>::value,
429  bool>::type = 0>
430  inline void store(scalarType *p, flag) const
431  {
432  _mm256_storeu_pd(p, _data);
433  }
434 
435  template <class flag, typename std::enable_if<is_streaming<flag>::value,
436  bool>::type = 0>
437  inline void store(scalarType *p, flag) const
438  {
439  _mm256_stream_pd(p, _data);
440  }
441 
442  // load packed
443  inline void load(const scalarType *p)
444  {
445  _data = _mm256_load_pd(p);
446  }
447 
448  template <class flag,
449  typename std::enable_if<is_requiring_alignment<flag>::value,
450  bool>::type = 0>
451  inline void load(const scalarType *p, flag)
452  {
453  _data = _mm256_load_pd(p);
454  }
455 
456  template <class flag,
457  typename std::enable_if<!is_requiring_alignment<flag>::value,
458  bool>::type = 0>
459  inline void load(const scalarType *p, flag)
460  {
461  _data = _mm256_loadu_pd(p);
462  }
463 
464  // broadcast
465  inline void broadcast(const scalarType rhs)
466  {
467  _data = _mm256_set1_pd(rhs);
468  }
469 
470 #if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
471  // gather/scatter with sse2
472  template <typename T>
473  inline void gather(scalarType const *p, const sse2Int4<T> &indices)
474  {
475  _data = _mm256_i32gather_pd(p, indices._data, 8);
476  }
477 
478  template <typename T>
479  inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
480  {
481  // no scatter intrinsics for AVX2
482  alignas(alignment) scalarArray tmp;
483  _mm256_store_pd(tmp, _data);
484 
485  out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
486  out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
487  out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
488  out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
489  }
490 #endif
491 
492  // gather/scatter with avx2
493  template <typename T>
494  inline void gather(scalarType const *p, const avx2Long4<T> &indices)
495  {
496  _data = _mm256_i64gather_pd(p, indices._data, 8);
497  }
498 
499  template <typename T>
500  inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
501  {
502  // no scatter intrinsics for AVX2
503  alignas(alignment) scalarArray tmp;
504  _mm256_store_pd(tmp, _data);
505 
506  out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
507  out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
508  out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
509  out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
510  }
511 
512  // fma
513  // this = this + a * b
514  inline void fma(const avx2Double4 &a, const avx2Double4 &b)
515  {
516  _data = _mm256_fmadd_pd(a._data, b._data, _data);
517  }
518 
519  // subscript
520  // subscript operators are convenient but expensive
521  // should not be used in optimized kernels
522  inline scalarType operator[](size_t i) const
523  {
524  alignas(alignment) scalarArray tmp;
525  store(tmp, is_aligned);
526  return tmp[i];
527  }
528 
529  inline scalarType &operator[](size_t i)
530  {
531  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
532  return tmp[i];
533  }
534 
535  // unary ops
536  inline void operator+=(avx2Double4 rhs)
537  {
538  _data = _mm256_add_pd(_data, rhs._data);
539  }
540 
541  inline void operator-=(avx2Double4 rhs)
542  {
543  _data = _mm256_sub_pd(_data, rhs._data);
544  }
545 
546  inline void operator*=(avx2Double4 rhs)
547  {
548  _data = _mm256_mul_pd(_data, rhs._data);
549  }
550 
551  inline void operator/=(avx2Double4 rhs)
552  {
553  _data = _mm256_div_pd(_data, rhs._data);
554  }
555 };
556 
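The 64-bit index vector defined earlier is what drives avx2Double4 gathers and scatters. A minimal sketch, with an illustrative function name and index values; in must reach at least in[6] and out at least out[3]:

inline void gather_scatter_sketch(const double *in, double *out)
{
    alignas(avx2Long4<std::uint64_t>::alignment)
        std::uint64_t gatherIdx[4] = {0, 2, 4, 6};
    alignas(avx2Long4<std::uint64_t>::alignment)
        std::uint64_t scatterIdx[4] = {0, 1, 2, 3};
    avx2Double4 v;
    v.gather(in, avx2Long4<std::uint64_t>(gatherIdx));    // v = {in[0], in[2], in[4], in[6]}
    v.scatter(out, avx2Long4<std::uint64_t>(scatterIdx)); // out[0..3] = v[0..3]
}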
557 inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
558 {
559  return _mm256_add_pd(lhs._data, rhs._data);
560 }
561 
562 inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
563 {
564  return _mm256_sub_pd(lhs._data, rhs._data);
565 }
566 
567 inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
568 {
569  return _mm256_mul_pd(lhs._data, rhs._data);
570 }
571 
572 inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
573 {
574  return _mm256_div_pd(lhs._data, rhs._data);
575 }
576 
577 inline avx2Double4 sqrt(avx2Double4 in)
578 {
579  return _mm256_sqrt_pd(in._data);
580 }
581 
582 inline avx2Double4 abs(avx2Double4 in)
583 {
584  // there is no avx2 _mm256_abs_pd intrinsic
585  static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
586  return _mm256_andnot_pd(sign_mask, in._data); // !sign_mask & x
587 }
588 
589 inline avx2Double4 log(avx2Double4 in)
590 {
591 #if defined(TINYSIMD_HAS_SVML)
592  return _mm256_log_pd(in._data);
593 #else
594  // there is no avx2 log intrinsic
595  // this is a dreadful implementation and is simply a stop gap measure
596  alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
597  in.store(tmp);
598  tmp[0] = std::log(tmp[0]);
599  tmp[1] = std::log(tmp[1]);
600  tmp[2] = std::log(tmp[2]);
601  tmp[3] = std::log(tmp[3]);
602  avx2Double4 ret;
603  ret.load(tmp);
604  return ret;
605 #endif
606 }
607 
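With the overloaded operators and the fma member above, a register-wide expression such as sqrt(x*x + y*y) can be written directly on avx2Double4 values. A minimal sketch with an illustrative name:

inline avx2Double4 fused_norm_sketch(avx2Double4 x, avx2Double4 y)
{
    avx2Double4 acc = x * x; // element-wise product
    acc.fma(y, y);           // acc = acc + y * y (fused multiply-add)
    return sqrt(acc);        // element-wise square root
}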
608 inline void load_interleave(
609  const double *in, std::uint32_t dataLen,
610  std::vector<avx2Double4, allocator<avx2Double4>> &out)
611 {
612  alignas(avx2Double4::alignment)
613  size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
614  using index_t = avx2Long4<size_t>;
615  index_t index0(tmp);
616  index_t index1 = index0 + 1;
617  index_t index2 = index0 + 2;
618  index_t index3 = index0 + 3;
619 
620  // 4x unrolled loop
621  constexpr uint16_t unrl = 4;
622  size_t nBlocks = dataLen / unrl;
623  for (size_t i = 0; i < nBlocks; ++i)
624  {
625  out[unrl * i + 0].gather(in, index0);
626  out[unrl * i + 1].gather(in, index1);
627  out[unrl * i + 2].gather(in, index2);
628  out[unrl * i + 3].gather(in, index3);
629  index0 = index0 + unrl;
630  index1 = index1 + unrl;
631  index2 = index2 + unrl;
632  index3 = index3 + unrl;
633  }
634 
635  // spillover loop
636  for (size_t i = unrl * nBlocks; i < dataLen; ++i)
637  {
638  out[i].gather(in, index0);
639  index0 = index0 + 1;
640  }
641 }
642 
643 inline void deinterleave_store(
644  const std::vector<avx2Double4, allocator<avx2Double4>> &in,
645  std::uint32_t dataLen, double *out)
646 {
647  alignas(avx2Double4::alignment)
648  size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
649  using index_t = avx2Long4<size_t>;
650  index_t index0(tmp);
651 
652  for (size_t i = 0; i < dataLen; ++i)
653  {
654  in[i].scatter(out, index0);
655  index0 = index0 + 1;
656  }
657 }
658 
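A round trip through the two helpers above, assuming in and out each hold avx2Double4::width * dataLen doubles (the function name is illustrative):

inline void interleave_roundtrip_sketch(const double *in, double *out,
                                        std::uint32_t dataLen)
{
    std::vector<avx2Double4, allocator<avx2Double4>> tmp(dataLen);
    load_interleave(in, dataLen, tmp);     // gather 4 strided blocks into registers
    deinterleave_store(tmp, dataLen, out); // write them back to scalar storage
}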
659 //////////////////////////////////////////////////////////////////////////////
660 
661 struct avx2Float8
662 {
663  static constexpr unsigned width = 8;
664  static constexpr unsigned alignment = 32;
665 
666  using scalarType = float;
667  using scalarIndexType = std::uint32_t;
668  using vectorType = __m256;
669  using scalarArray = scalarType[width];
670 
671  // storage
672  vectorType _data;
673 
674  // ctors
675  inline avx2Float8() = default;
676  inline avx2Float8(const avx2Float8 &rhs) = default;
677  inline avx2Float8(const vectorType &rhs) : _data(rhs)
678  {
679  }
680  inline avx2Float8(const scalarType rhs)
681  {
682  _data = _mm256_set1_ps(rhs);
683  }
684 
685  // copy assignment
686  inline avx2Float8 &operator=(const avx2Float8 &) = default;
687 
688  // store
689  inline void store(scalarType *p) const
690  {
691  _mm256_store_ps(p, _data);
692  }
693 
694  template <class flag,
695  typename std::enable_if<is_requiring_alignment<flag>::value &&
696  !is_streaming<flag>::value,
697  bool>::type = 0>
698  inline void store(scalarType *p, flag) const
699  {
700  _mm256_store_ps(p, _data);
701  }
702 
703  template <class flag,
704  typename std::enable_if<!is_requiring_alignment<flag>::value,
705  bool>::type = 0>
706  inline void store(scalarType *p, flag) const
707  {
708  _mm256_storeu_ps(p, _data);
709  }
710 
711  template <class flag, typename std::enable_if<is_streaming<flag>::value,
712  bool>::type = 0>
713  inline void store(scalarType *p, flag) const
714  {
715  _mm256_stream_ps(p, _data);
716  }
717 
718  // load packed
719  inline void load(const scalarType *p)
720  {
721  _data = _mm256_load_ps(p);
722  }
723 
724  template <class flag,
725  typename std::enable_if<is_requiring_alignment<flag>::value,
726  bool>::type = 0>
727  inline void load(const scalarType *p, flag)
728  {
729  _data = _mm256_load_ps(p);
730  }
731 
732  template <class flag,
733  typename std::enable_if<!is_requiring_alignment<flag>::value,
734  bool>::type = 0>
735  inline void load(const scalarType *p, flag)
736  {
737  _data = _mm256_loadu_ps(p);
738  }
739 
740  // broadcast
741  inline void broadcast(const scalarType rhs)
742  {
743  _data = _mm256_set1_ps(rhs);
744  }
745 
746  // gather/scatter with avx2
747  template <typename T>
748  inline void gather(scalarType const *p, const avx2Int8<T> &indices)
749  {
750  _data = _mm256_i32gather_ps(p, indices._data, 4);
751  }
752 
753  template <typename T>
754  inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
755  {
756  // no scatter intrinsics for AVX2
757  alignas(alignment) scalarArray tmp;
758  _mm256_store_ps(tmp, _data);
759 
760  out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
761  out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
762  out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
763  out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
764  out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
765  out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
766  out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
767  out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
768  }
769 
770  // fma
771  // this = this + a * b
772  inline void fma(const avx2Float8 &a, const avx2Float8 &b)
773  {
774  _data = _mm256_fmadd_ps(a._data, b._data, _data);
775  }
776 
777  // subscript
778  // subscript operators are convenient but expensive
779  // should not be used in optimized kernels
780  inline scalarType operator[](size_t i) const
781  {
782  alignas(alignment) scalarArray tmp;
783  store(tmp, is_aligned);
784  return tmp[i];
785  }
786 
787  inline scalarType &operator[](size_t i)
788  {
789  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
790  return tmp[i];
791  }
792 
793  inline void operator+=(avx2Float8 rhs)
794  {
795  _data = _mm256_add_ps(_data, rhs._data);
796  }
797 
798  inline void operator-=(avx2Float8 rhs)
799  {
800  _data = _mm256_sub_ps(_data, rhs._data);
801  }
802 
803  inline void operator*=(avx2Float8 rhs)
804  {
805  _data = _mm256_mul_ps(_data, rhs._data);
806  }
807 
808  inline void operator/=(avx2Float8 rhs)
809  {
810  _data = _mm256_div_ps(_data, rhs._data);
811  }
812 };
813 
814 inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
815 {
816  return _mm256_add_ps(lhs._data, rhs._data);
817 }
818 
819 inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
820 {
821  return _mm256_sub_ps(lhs._data, rhs._data);
822 }
823 
824 inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
825 {
826  return _mm256_mul_ps(lhs._data, rhs._data);
827 }
828 
829 inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
830 {
831  return _mm256_div_ps(lhs._data, rhs._data);
832 }
833 
834 inline avx2Float8 sqrt(avx2Float8 in)
835 {
836  return _mm256_sqrt_ps(in._data);
837 }
838 
839 inline avx2Float8 abs(avx2Float8 in)
840 {
841  // there is no avx2 _mm256_abs_ps intrinsic
842  static const __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
843  return _mm256_andnot_ps(sign_mask, in._data); // !sign_mask & x
844 }
845 
846 inline avx2Float8 log(avx2Float8 in)
847 {
848  // there is no avx2 log intrinsic
849  // this is a dreadful implementation and is simply a stop gap measure
850  alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
851  in.store(tmp);
852  tmp[0] = std::log(tmp[0]);
853  tmp[1] = std::log(tmp[1]);
854  tmp[2] = std::log(tmp[2]);
855  tmp[3] = std::log(tmp[3]);
856  tmp[4] = std::log(tmp[4]);
857  tmp[5] = std::log(tmp[5]);
858  tmp[6] = std::log(tmp[6]);
859  tmp[7] = std::log(tmp[7]);
860  avx2Float8 ret;
861  ret.load(tmp);
862  return ret;
863 }
864 
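The float path mirrors the double one but works on 8 lanes at a time. A minimal axpy-style sketch, assuming x and y are 32-byte aligned and hold at least 8 floats (the function name is illustrative):

inline void saxpy_sketch(float alpha, const float *x, float *y)
{
    avx2Float8 va(alpha), vx, vy;
    vx.load(x, is_aligned);
    vy.load(y, is_aligned);
    vy.fma(va, vx);          // vy = vy + alpha * x, element-wise
    vy.store(y, is_aligned); // y[0:8] updated in place
}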
865 inline void load_interleave(const float *in, std::uint32_t dataLen,
866  std::vector<avx2Float8, allocator<avx2Float8>> &out)
867 {
868 
869  alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
870  0, dataLen, 2 * dataLen, 3 * dataLen,
871  4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
872 
873  using index_t = avx2Int8<avx2Float8::scalarIndexType>;
874  index_t index0(tmp);
875  index_t index1 = index0 + 1;
876  index_t index2 = index0 + 2;
877  index_t index3 = index0 + 3;
878 
879  // 4x unrolled loop
880  size_t nBlocks = dataLen / 4;
881  for (size_t i = 0; i < nBlocks; ++i)
882  {
883  out[4 * i + 0].gather(in, index0);
884  out[4 * i + 1].gather(in, index1);
885  out[4 * i + 2].gather(in, index2);
886  out[4 * i + 3].gather(in, index3);
887  index0 = index0 + 4;
888  index1 = index1 + 4;
889  index2 = index2 + 4;
890  index3 = index3 + 4;
891  }
892 
893  // spillover loop
894  for (size_t i = 4 * nBlocks; i < dataLen; ++i)
895  {
896  out[i].gather(in, index0);
897  index0 = index0 + 1;
898  }
899 }
900 
901 inline void deinterleave_store(
902  const std::vector<avx2Float8, allocator<avx2Float8>> &in,
903  std::uint32_t dataLen, float *out)
904 {
905  alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
906  0, dataLen, 2 * dataLen, 3 * dataLen,
907  4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
908  using index_t = avx2Int8<avx2Float8::scalarIndexType>;
909  index_t index0(tmp);
910 
911  for (size_t i = 0; i < dataLen; ++i)
912  {
913  in[i].scatter(out, index0);
914  index0 = index0 + 1;
915  }
916 }
917 
918 ////////////////////////////////////////////////////////////////////////////////
919 
920 // mask type
921 // mask is a int type with special properties (broad boolean vector)
922 // broad boolean vectors defined and allowed values are:
923 // false = 0x0 and true = all bits set (0xFF...FF)
924 //
925 // VERY LIMITED SUPPORT...just enough to make cubic eos work...
926 //
927 struct avx2Mask4 : avx2Long4<std::uint64_t>
928 {
929  // bring in ctors
930  using avx2Long4::avx2Long4;
931 
932  static constexpr scalarType true_v = -1;
933  static constexpr scalarType false_v = 0;
934 };
935 
936 inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
937 {
938  return _mm256_castpd_si256(
939  _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
940 }
941 
942 inline bool operator&&(avx2Mask4 lhs, bool rhs)
943 {
944  bool tmp =
945  _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));
946 
947  return tmp && rhs;
948 }
949 
950 struct avx2Mask8 : avx2Int8<std::uint32_t>
951 {
952  // bring in ctors
953  using avx2Int8::avx2Int8;
954 
955  static constexpr scalarType true_v = -1;
956  static constexpr scalarType false_v = 0;
957 };
958 
959 inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
960 {
961  return _mm256_castps_si256(_mm256_cmp_ps(lhs._data, rhs._data, _CMP_GT_OQ));
962 }
963 
964 inline bool operator&&(avx2Mask8 lhs, bool rhs)
965 {
966  bool tmp =
967  _mm256_testc_si256(lhs._data, _mm256_set1_epi32(avx2Mask8::true_v));
968 
969  return tmp && rhs;
970 }
971 
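A minimal sketch of how the mask types combine a lane-wise comparison with the scalar && overloads above (the function name is illustrative):

inline bool all_positive_sketch(avx2Double4 x)
{
    avx2Mask4 positive = x > avx2Double4(0.0); // broad boolean, one lane per double
    return positive && true;                   // true only if every lane is true
}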
972 #endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
973 
974 } // namespace tinysimd
975 #endif