Nektar++
avx2.hpp
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: avx2.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Vector types using the AVX2 extension.
32 //
33 ///////////////////////////////////////////////////////////////////////////////
34 
35 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
36 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
37 
38 #if defined(__x86_64__)
39 #include <immintrin.h>
40 #if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41 #define TINYSIMD_HAS_SVML
42 #endif
43 #endif
44 #include "allocator.hpp"
45 #include "sse2.hpp"
46 #include "traits.hpp"
47 #include <cmath>
48 #include <vector>
49 
50 namespace tinysimd
51 {
52 
53 namespace abi
54 {
55 
56 template <typename scalarType, int width = 0> struct avx2
57 {
58  using type = void;
59 };
60 
61 } // namespace abi
62 
63 #if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
64 
65 // forward declaration of concrete types
66 template <typename T> struct avx2Int8;
67 template <typename T> struct avx2Long4;
68 struct avx2Double4;
69 struct avx2Float8;
70 struct avx2Mask4;
71 struct avx2Mask8;
72 
73 namespace abi
74 {
75 
76 // mapping between abstract types and concrete types
77 template <> struct avx2<double>
78 {
79  using type = avx2Double4;
80 };
81 template <> struct avx2<float>
82 {
83  using type = avx2Float8;
84 };
85 template <> struct avx2<std::int64_t>
86 {
87  using type = avx2Long4<std::int64_t>;
88 };
89 template <> struct avx2<std::uint64_t>
90 {
91  using type = avx2Long4<std::uint64_t>;
92 };
93 template <> struct avx2<std::int32_t>
94 {
95  using type = avx2Int8<std::int32_t>;
96 };
97 template <> struct avx2<std::uint32_t>
98 {
99  using type = avx2Int8<std::uint32_t>;
100 };
101 template <> struct avx2<bool, 4>
102 {
103  using type = avx2Mask4;
104 };
105 template <> struct avx2<bool, 8>
106 {
107  using type = avx2Mask8;
108 };
109 
110 } // namespace abi
111 
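// Editor's note (illustrative sketch, not part of the original header).
// The specialisations above are the whole "abi" mechanism: a scalar type,
// plus an optional boolean lane count, selects a concrete vector type. The
// checks below simply restate that mapping and are editorial additions.
static_assert(std::is_same<abi::avx2<double>::type, avx2Double4>::value,
              "double maps to the 4-lane AVX2 register");
static_assert(std::is_same<abi::avx2<bool, 8>::type, avx2Mask8>::value,
              "8-lane masks accompany the 8-lane float vector");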
112 // concrete types
113 template <typename T> struct avx2Int8
114 {
115  static_assert(std::is_integral<T>::value && sizeof(T) == 4,
116  "4-byte integral type required.");
117 
118  static constexpr unsigned int width = 8;
119  static constexpr unsigned int alignment = 32;
120 
121  using scalarType = T;
122  using vectorType = __m256i;
123  using scalarArray = scalarType[width];
124 
125  // storage
126  vectorType _data;
127 
128  // ctors
129  inline avx2Int8() = default;
130  inline avx2Int8(const avx2Int8 &rhs) = default;
131  inline avx2Int8(const vectorType &rhs) : _data(rhs)
132  {
133  }
134  inline avx2Int8(const scalarType rhs)
135  {
136  _data = _mm256_set1_epi32(rhs);
137  }
138  explicit inline avx2Int8(scalarArray &rhs)
139  {
140  _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
141  }
142 
143  // store
144  inline void store(scalarType *p) const
145  {
146  _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
147  }
148 
149  template <class flag,
150  typename std::enable_if<is_requiring_alignment<flag>::value &&
151  !is_streaming<flag>::value,
152  bool>::type = 0>
153  inline void store(scalarType *p, flag) const
154  {
155  _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
156  }
157 
158  template <class flag,
159  typename std::enable_if<!is_requiring_alignment<flag>::value,
160  bool>::type = 0>
161  inline void store(scalarType *p, flag) const
162  {
163  _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
164  }
165 
166  inline void load(const scalarType *p)
167  {
168  _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
169  }
170 
171  template <class flag,
172  typename std::enable_if<is_requiring_alignment<flag>::value &&
173  !is_streaming<flag>::value,
174  bool>::type = 0>
175  inline void load(const scalarType *p, flag)
176  {
177  _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
178  }
179 
180  template <class flag,
181  typename std::enable_if<!is_requiring_alignment<flag>::value,
182  bool>::type = 0>
183  inline void load(const scalarType *p, flag)
184  {
185  _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
186  }
187 
188  inline void broadcast(const scalarType rhs)
189  {
190  _data = _mm256_set1_epi32(rhs);
191  }
192 
193  // subscript
194  // subscript operators are convenient but expensive
195  // should not be used in optimized kernels
196  inline scalarType operator[](size_t i) const
197  {
198  alignas(alignment) scalarArray tmp;
199  store(tmp, is_aligned);
200  return tmp[i];
201  }
202 };
203 
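// Editor's sketch (illustrative; not part of the original header). The
// store()/load() overloads above are tag-dispatched: a flag type carrying
// the is_requiring_alignment trait (e.g. the is_aligned tag used in
// operator[]) selects the aligned intrinsic at compile time, while any flag
// without that trait falls through to the unaligned _mm256_storeu_si256
// overload. The helper name below is editor-invented.
inline void example_avx2Int8_store(const avx2Int8<std::int32_t> &v,
                                   std::int32_t *p)
{
    // p must be 32-byte aligned for both calls: the flag-less overload and
    // the is_aligned overload both issue _mm256_store_si256.
    v.store(p);
    v.store(p, is_aligned);
}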
204 template <typename T>
205 inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
206 {
207  return _mm256_add_epi32(lhs._data, rhs._data);
208 }
209 
210 template <
211  typename T, typename U,
212  typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
213 inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
214 {
215  return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
216 }
217 
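// Editor's sketch (illustrative; not part of the original header). A minimal
// round trip through avx2Int8: aligned load, lane-wise addition of a
// broadcast scalar via the operator+ overloads above, aligned store. The
// function name is editor-invented.
inline void example_avx2Int8_add(const std::int32_t *in, std::int32_t *out)
{
    // both pointers are assumed to address 8 values with 32-byte alignment
    avx2Int8<std::int32_t> a;
    a.load(in);   // _mm256_load_si256
    a = a + 10;   // broadcast 10 and add it to every lane
    a.store(out); // out[i] == in[i] + 10 for i = 0..7
}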
218 ////////////////////////////////////////////////////////////////////////////////
219 
220 template <typename T> struct avx2Long4
221 {
222  static_assert(std::is_integral<T>::value && sizeof(T) == 8,
223  "8-byte integral type required.");
224 
225  static constexpr unsigned int width = 4;
226  static constexpr unsigned int alignment = 32;
227 
228  using scalarType = T;
229  using vectorType = __m256i;
230  using scalarArray = scalarType[width];
231 
232  // storage
233  vectorType _data;
234 
235  // ctors
236  inline avx2Long4() = default;
237  inline avx2Long4(const avx2Long4 &rhs) = default;
238  inline avx2Long4(const vectorType &rhs) : _data(rhs)
239  {
240  }
241  inline avx2Long4(const scalarType rhs)
242  {
243  _data = _mm256_set1_epi64x(rhs);
244  }
245  explicit inline avx2Long4(scalarArray &rhs)
246  {
247  _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
248  }
249 
250  // store
251  inline void store(scalarType *p) const
252  {
253  _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
254  }
255 
256  template <class flag,
257  typename std::enable_if<is_requiring_alignment<flag>::value &&
258  !is_streaming<flag>::value,
259  bool>::type = 0>
260  inline void store(scalarType *p, flag) const
261  {
262  _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
263  }
264 
265  template <class flag,
266  typename std::enable_if<!is_requiring_alignment<flag>::value,
267  bool>::type = 0>
268  inline void store(scalarType *p, flag) const
269  {
270  _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
271  }
272 
273  inline void load(const scalarType *p)
274  {
275  _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
276  }
277 
278  template <class flag,
279  typename std::enable_if<is_requiring_alignment<flag>::value &&
280  !is_streaming<flag>::value,
281  bool>::type = 0>
282  inline void load(const scalarType *p, flag)
283  {
284  _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
285  }
286 
287  template <class flag,
288  typename std::enable_if<!is_requiring_alignment<flag>::value,
289  bool>::type = 0>
290  inline void load(const scalarType *p, flag)
291  {
292  _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
293  }
294 
295  inline void broadcast(const scalarType rhs)
296  {
297  _data = _mm256_set1_epi64x(rhs);
298  }
299 
300  // subscript
301  // subscript operators are convenient but expensive
302  // should not be used in optimized kernels
303  inline scalarType operator[](size_t i) const
304  {
305  alignas(alignment) scalarArray tmp;
306  store(tmp, is_aligned);
307  return tmp[i];
308  }
309 };
310 
311 template <typename T>
312 inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
313 {
314  return _mm256_add_epi64(lhs._data, rhs._data);
315 }
316 
317 template <
318  typename T, typename U,
319  typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
320 inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
321 {
322  return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
323 }
324 
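// Editor's sketch (illustrative; not part of the original header). avx2Long4
// doubles as the 64-bit index vector for the double-precision gather/scatter
// further down; a strided index set is built exactly like the tmp arrays in
// load_interleave below. The function name is editor-invented.
inline avx2Long4<std::uint64_t> example_strided_indices(std::uint64_t stride)
{
    using index_t = avx2Long4<std::uint64_t>;
    alignas(index_t::alignment)
        std::uint64_t tmp[index_t::width] = {0, stride, 2 * stride, 3 * stride};
    index_t index(tmp); // explicit ctor: aligned load of the four indices
    return index + 1;   // advance every lane by one element
}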
325 ////////////////////////////////////////////////////////////////////////////////
326 
327 struct avx2Double4
328 {
329  static constexpr unsigned width = 4;
330  static constexpr unsigned alignment = 32;
331 
332  using scalarType = double;
333  using scalarIndexType = std::uint64_t;
334  using vectorType = __m256d;
335  using scalarArray = scalarType[width];
336 
337  // storage
338  vectorType _data;
339 
340  // ctors
341  inline avx2Double4() = default;
342  inline avx2Double4(const avx2Double4 &rhs) = default;
343  inline avx2Double4(const vectorType &rhs) : _data(rhs)
344  {
345  }
346  inline avx2Double4(const scalarType rhs)
347  {
348  _data = _mm256_set1_pd(rhs);
349  }
350 
351  // store
352  inline void store(scalarType *p) const
353  {
354  _mm256_store_pd(p, _data);
355  }
356 
357  template <class flag,
358  typename std::enable_if<is_requiring_alignment<flag>::value &&
359  !is_streaming<flag>::value,
360  bool>::type = 0>
361  inline void store(scalarType *p, flag) const
362  {
363  _mm256_store_pd(p, _data);
364  }
365 
366  template <class flag,
367  typename std::enable_if<!is_requiring_alignment<flag>::value,
368  bool>::type = 0>
369  inline void store(scalarType *p, flag) const
370  {
371  _mm256_storeu_pd(p, _data);
372  }
373 
374  template <class flag, typename std::enable_if<is_streaming<flag>::value,
375  bool>::type = 0>
376  inline void store(scalarType *p, flag) const
377  {
378  _mm256_stream_pd(p, _data);
379  }
380 
381  // load packed
382  inline void load(const scalarType *p)
383  {
384  _data = _mm256_load_pd(p);
385  }
386 
387  template <class flag,
388  typename std::enable_if<is_requiring_alignment<flag>::value,
389  bool>::type = 0>
390  inline void load(const scalarType *p, flag)
391  {
392  _data = _mm256_load_pd(p);
393  }
394 
395  template <class flag,
396  typename std::enable_if<!is_requiring_alignment<flag>::value,
397  bool>::type = 0>
398  inline void load(const scalarType *p, flag)
399  {
400  _data = _mm256_loadu_pd(p);
401  }
402 
403  // broadcast
404  inline void broadcast(const scalarType rhs)
405  {
406  _data = _mm256_set1_pd(rhs);
407  }
408 
409 #if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
410  // gather/scatter with sse2
411  template <typename T>
412  inline void gather(scalarType const *p, const sse2Int4<T> &indices)
413  {
414  _data = _mm256_i32gather_pd(p, indices._data, 8);
415  }
416 
417  template <typename T>
418  inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
419  {
420  // no scatter intrinsics for AVX2
421  alignas(alignment) scalarArray tmp;
422  _mm256_store_pd(tmp, _data);
423 
424  out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
425  out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
426  out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
427  out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
428  }
429 #endif
430 
431  // gather/scatter with avx2
432  template <typename T>
433  inline void gather(scalarType const *p, const avx2Long4<T> &indices)
434  {
435  _data = _mm256_i64gather_pd(p, indices._data, 8);
436  }
437 
438  template <typename T>
439  inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
440  {
441  // no scatter intrinsics for AVX2
442  alignas(alignment) scalarArray tmp;
443  _mm256_store_pd(tmp, _data);
444 
445  out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
446  out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
447  out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
448  out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
449  }
450 
451  // fma
452  // this = this + a * b
453  inline void fma(const avx2Double4 &a, const avx2Double4 &b)
454  {
455  _data = _mm256_fmadd_pd(a._data, b._data, _data);
456  }
457 
458  // subscript
459  // subscript operators are convenient but expensive
460  // should not be used in optimized kernels
461  inline scalarType operator[](size_t i) const
462  {
463  alignas(alignment) scalarArray tmp;
464  store(tmp, is_aligned);
465  return tmp[i];
466  }
467 
468  // unary ops
469  inline void operator+=(avx2Double4 rhs)
470  {
471  _data = _mm256_add_pd(_data, rhs._data);
472  }
473 
474  inline void operator-=(avx2Double4 rhs)
475  {
476  _data = _mm256_sub_pd(_data, rhs._data);
477  }
478 
479  inline void operator*=(avx2Double4 rhs)
480  {
481  _data = _mm256_mul_pd(_data, rhs._data);
482  }
483 
484  inline void operator/=(avx2Double4 rhs)
485  {
486  _data = _mm256_div_pd(_data, rhs._data);
487  }
488 };
489 
490 inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
491 {
492  return _mm256_add_pd(lhs._data, rhs._data);
493 }
494 
495 inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
496 {
497  return _mm256_sub_pd(lhs._data, rhs._data);
498 }
499 
500 inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
501 {
502  return _mm256_mul_pd(lhs._data, rhs._data);
503 }
504 
505 inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
506 {
507  return _mm256_div_pd(lhs._data, rhs._data);
508 }
509 
510 inline avx2Double4 sqrt(avx2Double4 in)
511 {
512  return _mm256_sqrt_pd(in._data);
513 }
514 
515 inline avx2Double4 abs(avx2Double4 in)
516 {
517  // there is no avx2 _mm256_abs_pd intrinsic
518  static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
519  return _mm256_andnot_pd(sign_mask, in._data); // !sign_mask & x
520 }
521 
522 inline avx2Double4 log(avx2Double4 in)
523 {
524 #if defined(TINYSIMD_HAS_SVML)
525  return _mm256_log_pd(in._data);
526 #else
527  // there is no avx2 log intrinsic
528  // this scalar fallback is slow and is simply a stop-gap measure
529  alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
530  in.store(tmp);
531  tmp[0] = std::log(tmp[0]);
532  tmp[1] = std::log(tmp[1]);
533  tmp[2] = std::log(tmp[2]);
534  tmp[3] = std::log(tmp[3]);
535  avx2Double4 ret;
536  ret.load(tmp);
537  return ret;
538 #endif
539 }
540 
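// Editor's sketch (illustrative; not part of the original header). The fma
// member and the aligned load/store above compose into the usual padded
// "y += a * x" kernel; n is assumed to be a multiple of avx2Double4::width
// and both arrays 32-byte aligned. The function name is editor-invented.
inline void example_daxpy(double a, const double *x, double *y, size_t n)
{
    avx2Double4 va(a); // broadcast the scalar factor once
    for (size_t i = 0; i < n; i += avx2Double4::width)
    {
        avx2Double4 vx, vy;
        vx.load(x + i);
        vy.load(y + i);
        vy.fma(va, vx); // vy = vy + va * vx
        vy.store(y + i);
    }
}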
541 inline void load_interleave(
542  const double *in, size_t dataLen,
543  std::vector<avx2Double4, allocator<avx2Double4>> &out)
544 {
545  alignas(avx2Double4::alignment)
546  size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
547  using index_t = avx2Long4<size_t>;
548  index_t index0(tmp);
549  index_t index1 = index0 + 1;
550  index_t index2 = index0 + 2;
551  index_t index3 = index0 + 3;
552 
553  // 4x unrolled loop
554  constexpr uint16_t unrl = 4;
555  size_t nBlocks = dataLen / unrl;
556  for (size_t i = 0; i < nBlocks; ++i)
557  {
558  out[unrl * i + 0].gather(in, index0);
559  out[unrl * i + 1].gather(in, index1);
560  out[unrl * i + 2].gather(in, index2);
561  out[unrl * i + 3].gather(in, index3);
562  index0 = index0 + unrl;
563  index1 = index1 + unrl;
564  index2 = index2 + unrl;
565  index3 = index3 + unrl;
566  }
567 
568  // spillover loop
569  for (size_t i = unrl * nBlocks; i < dataLen; ++i)
570  {
571  out[i].gather(in, index0);
572  index0 = index0 + 1;
573  }
574 }
575 
576 inline void deinterleave_store(
577  const std::vector<avx2Double4, allocator<avx2Double4>> &in, size_t dataLen,
578  double *out)
579 {
580  alignas(avx2Double4::alignment)
581  size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
582  using index_t = avx2Long4<size_t>;
583  index_t index0(tmp);
584 
585  for (size_t i = 0; i < dataLen; ++i)
586  {
587  in[i].scatter(out, index0);
588  index0 = index0 + 1;
589  }
590 }
591 
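// Editor's sketch (illustrative; not part of the original header). The two
// helpers above are inverses: load_interleave gathers lane j of out[i] from
// in[j * dataLen + i] (four contiguous blocks of dataLen doubles each), and
// deinterleave_store scatters it back. A typical round trip, with an
// editor-invented function name:
inline void example_interleaved_scale(double *data, size_t dataLen)
{
    std::vector<avx2Double4, allocator<avx2Double4>> work(dataLen);
    load_interleave(data, dataLen, work);

    for (size_t i = 0; i < dataLen; ++i)
    {
        work[i] = work[i] * avx2Double4(2.0); // same op on all four blocks
    }

    deinterleave_store(work, dataLen, data);
}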
592 //////////////////////////////////////////////////////////////////////////////
593 
594 struct avx2Float8
595 {
596  static constexpr unsigned width = 8;
597  static constexpr unsigned alignment = 32;
598 
599  using scalarType = float;
600  using scalarIndexType = std::uint32_t;
601  using vectorType = __m256;
602  using scalarArray = scalarType[width];
603 
604  // storage
605  vectorType _data;
606 
607  // ctors
608  inline avx2Float8() = default;
609  inline avx2Float8(const avx2Float8 &rhs) = default;
610  inline avx2Float8(const vectorType &rhs) : _data(rhs)
611  {
612  }
613  inline avx2Float8(const scalarType rhs)
614  {
615  _data = _mm256_set1_ps(rhs);
616  }
617 
618  // store
619  inline void store(scalarType *p) const
620  {
621  _mm256_store_ps(p, _data);
622  }
623 
624  template <class flag,
625  typename std::enable_if<is_requiring_alignment<flag>::value &&
626  !is_streaming<flag>::value,
627  bool>::type = 0>
628  inline void store(scalarType *p, flag) const
629  {
630  _mm256_store_ps(p, _data);
631  }
632 
633  template <class flag,
634  typename std::enable_if<!is_requiring_alignment<flag>::value,
635  bool>::type = 0>
636  inline void store(scalarType *p, flag) const
637  {
638  _mm256_storeu_ps(p, _data);
639  }
640 
641  template <class flag, typename std::enable_if<is_streaming<flag>::value,
642  bool>::type = 0>
643  inline void store(scalarType *p, flag) const
644  {
645  _mm256_stream_ps(p, _data);
646  }
647 
648  // load packed
649  inline void load(const scalarType *p)
650  {
651  _data = _mm256_load_ps(p);
652  }
653 
654  template <class flag,
655  typename std::enable_if<is_requiring_alignment<flag>::value,
656  bool>::type = 0>
657  inline void load(const scalarType *p, flag)
658  {
659  _data = _mm256_load_ps(p);
660  }
661 
662  template <class flag,
663  typename std::enable_if<!is_requiring_alignment<flag>::value,
664  bool>::type = 0>
665  inline void load(const scalarType *p, flag)
666  {
667  _data = _mm256_loadu_ps(p);
668  }
669 
670  // broadcast
671  inline void broadcast(const scalarType rhs)
672  {
673  _data = _mm256_set1_ps(rhs);
674  }
675 
676  // gather/scatter with avx2
677  template <typename T>
678  inline void gather(scalarType const *p, const avx2Int8<T> &indices)
679  {
680  _data = _mm256_i32gather_ps(p, indices._data, 4);
681  }
682 
683  template <typename T>
684  inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
685  {
686  // no scatter intrinsics for AVX2
687  alignas(alignment) scalarArray tmp;
688  _mm256_store_ps(tmp, _data);
689 
690  out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
691  out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
692  out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
693  out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
694  out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
695  out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
696  out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
697  out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
698  }
699 
700  // fma
701  // this = this + a * b
702  inline void fma(const avx2Float8 &a, const avx2Float8 &b)
703  {
704  _data = _mm256_fmadd_ps(a._data, b._data, _data);
705  }
706 
707  // subscript
708  // subscript operators are convenient but expensive
709  // should not be used in optimized kernels
710  inline scalarType operator[](size_t i) const
711  {
712  alignas(alignment) scalarArray tmp;
713  store(tmp, is_aligned);
714  return tmp[i];
715  }
716 
717  inline scalarType &operator[](size_t i)
718  {
719  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
720  return tmp[i];
721  }
722 
723  // unary ops
724  inline void operator+=(avx2Float8 rhs)
725  {
726  _data = _mm256_add_ps(_data, rhs._data);
727  }
728 
729  inline void operator-=(avx2Float8 rhs)
730  {
731  _data = _mm256_sub_ps(_data, rhs._data);
732  }
733 
734  inline void operator*=(avx2Float8 rhs)
735  {
736  _data = _mm256_mul_ps(_data, rhs._data);
737  }
738 
739  inline void operator/=(avx2Float8 rhs)
740  {
741  _data = _mm256_div_ps(_data, rhs._data);
742  }
743 };
744 
745 inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
746 {
747  return _mm256_add_ps(lhs._data, rhs._data);
748 }
749 
750 inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
751 {
752  return _mm256_sub_ps(lhs._data, rhs._data);
753 }
754 
755 inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
756 {
757  return _mm256_mul_ps(lhs._data, rhs._data);
758 }
759 
760 inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
761 {
762  return _mm256_div_ps(lhs._data, rhs._data);
763 }
764 
765 inline avx2Float8 sqrt(avx2Float8 in)
766 {
767  return _mm256_sqrt_ps(in._data);
768 }
769 
770 inline avx2Float8 abs(avx2Float8 in)
771 {
772  // there is no avx2 _mm256_abs_ps intrinsic
773  static const __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
774  return _mm256_andnot_ps(sign_mask, in._data); // !sign_mask & x
775 }
776 
777 inline avx2Float8 log(avx2Float8 in)
778 {
779  // there is no avx2 log intrinsic
780  // this scalar fallback is slow and is simply a stop-gap measure
781  alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
782  in.store(tmp);
783  tmp[0] = std::log(tmp[0]);
784  tmp[1] = std::log(tmp[1]);
785  tmp[2] = std::log(tmp[2]);
786  tmp[3] = std::log(tmp[3]);
787  tmp[4] = std::log(tmp[4]);
788  tmp[5] = std::log(tmp[5]);
789  tmp[6] = std::log(tmp[6]);
790  tmp[7] = std::log(tmp[7]);
791  avx2Float8 ret;
792  ret.load(tmp);
793  return ret;
794 }
795 
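// Editor's sketch (illustrative; not part of the original header). The
// 32-bit-index gather of avx2Float8 is the primitive that the float
// load_interleave below builds on; base must cover index 7 * stride.
// The function name is editor-invented.
inline avx2Float8 example_gather_stride(const float *base, std::uint32_t stride)
{
    using index_t = avx2Int8<std::uint32_t>;
    alignas(index_t::alignment) std::uint32_t idx[index_t::width] = {
        0,          stride,     2 * stride, 3 * stride,
        4 * stride, 5 * stride, 6 * stride, 7 * stride};
    index_t indices(idx); // explicit ctor: aligned load of the indices
    avx2Float8 v;
    v.gather(base, indices); // _mm256_i32gather_ps with scale 4
    return v;
}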
796 inline void load_interleave(const float *in, std::uint32_t dataLen,
797  std::vector<avx2Float8, allocator<avx2Float8>> &out)
798 {
799 
800  alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
801  0, dataLen, 2 * dataLen, 3 * dataLen,
802  4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
803 
804  using index_t = avx2Int8<avx2Float8::scalarIndexType>;
805  index_t index0(tmp);
806  index_t index1 = index0 + 1;
807  index_t index2 = index0 + 2;
808  index_t index3 = index0 + 3;
809 
810  // 4x unrolled loop
811  size_t nBlocks = dataLen / 4;
812  for (size_t i = 0; i < nBlocks; ++i)
813  {
814  out[4 * i + 0].gather(in, index0);
815  out[4 * i + 1].gather(in, index1);
816  out[4 * i + 2].gather(in, index2);
817  out[4 * i + 3].gather(in, index3);
818  index0 = index0 + 4;
819  index1 = index1 + 4;
820  index2 = index2 + 4;
821  index3 = index3 + 4;
822  }
823 
824  // spillover loop
825  for (size_t i = 4 * nBlocks; i < dataLen; ++i)
826  {
827  out[i].gather(in, index0);
828  index0 = index0 + 1;
829  }
830 }
831 
832 inline void deinterleave_store(
833  const std::vector<avx2Float8, allocator<avx2Float8>> &in,
834  std::uint32_t dataLen, float *out)
835 {
836  alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
837  0, dataLen, 2 * dataLen, 3 * dataLen,
838  4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
839  using index_t = avx2Int8<avx2Float8::scalarIndexType>;
840  index_t index0(tmp);
841 
842  for (size_t i = 0; i < dataLen; ++i)
843  {
844  in[i].scatter(out, index0);
845  index0 = index0 + 1;
846  }
847 }
848 
849 ////////////////////////////////////////////////////////////////////////////////
850 
851 // mask type
852 // a mask is an integer type with special properties (broad boolean vector)
853 // the only values a broad boolean lane may hold are:
854 // false=0x0 and true=0xFFFFFFFF
855 //
856 // VERY LIMITED SUPPORT...just enough to make cubic eos work...
857 //
858 struct avx2Mask4 : avx2Long4<std::uint64_t>
859 {
860  // bring in ctors
861  using avx2Long4::avx2Long4;
862 
863  static constexpr scalarType true_v = -1;
864  static constexpr scalarType false_v = 0;
865 };
866 
867 inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
868 {
869  return _mm256_castpd_si256(
870  _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
871 }
872 
873 inline bool operator&&(avx2Mask4 lhs, bool rhs)
874 {
875  bool tmp =
876  _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));
877 
878  return tmp && rhs;
879 }
880 
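// Editor's sketch (illustrative; not part of the original header). The mask
// support above is deliberately minimal: operator> yields a broad-boolean
// vector and operator&& collapses it to "every lane is true", which is just
// enough for a guarded scalar branch. The function name is editor-invented.
inline avx2Double4 example_guarded_sqrt(avx2Double4 x)
{
    if ((x > avx2Double4(0.0)) && true) // true only if all four lanes are > 0
    {
        return sqrt(x); // safe: no lane is negative
    }
    return avx2Double4(0.0); // fallback when any lane fails the test
}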
881 struct avx2Mask8 : avx2Int8<std::uint32_t>
882 {
883  // bring in ctors
884  using avx2Int8::avx2Int8;
885 
886  static constexpr scalarType true_v = -1;
887  static constexpr scalarType false_v = 0;
888 };
889 
890 inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
891 {
892  return _mm256_castps_si256(_mm256_cmp_ps(lhs._data, rhs._data, _CMP_GT_OQ));
893 }
894 
895 inline bool operator&&(avx2Mask8 lhs, bool rhs)
896 {
897  bool tmp =
898  _mm256_testc_si256(lhs._data, _mm256_set1_epi32(avx2Mask8::true_v));
899 
900  return tmp && rhs;
901 }
902 
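// Editor's sketch (illustrative; not part of the original header). The same
// all-lanes-true collapse for the 8-lane float mask; the helper name is
// editor-invented.
inline bool example_all_greater(avx2Float8 lhs, avx2Float8 rhs)
{
    return (lhs > rhs) && true; // true only if every lane satisfies lhs > rhs
}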
903 #endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
904 
905 } // namespace tinysimd
906 #endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H