Nektar++
avx2.hpp
///////////////////////////////////////////////////////////////////////////////
//
// File: avx2.hpp
//
// For more information, please see: http://www.nektar.info
//
// The MIT License
//
// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
// Department of Aeronautics, Imperial College London (UK), and Scientific
// Computing and Imaging Institute, University of Utah (USA).
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Description: Vector type using avx2 extension.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif
#include "allocator.hpp"
#include "sse2.hpp"
#include "traits.hpp"
#include <cmath>
#include <vector>

namespace tinysimd::abi
{

template <typename scalarType, int width = 0> struct avx2
{
    using type = void;
};

} // namespace tinysimd::abi

#if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)

namespace tinysimd
{

// forward declaration of concrete types
template <typename T> struct avx2Long4;
template <typename T> struct avx2Int8;
struct avx2Double4;
struct avx2Float8;
struct avx2Mask4;
struct avx2Mask8;

namespace abi
{

// mapping between abstract types and concrete floating point types
template <> struct avx2<double>
{
    using type = avx2Double4;
};
template <> struct avx2<float>
{
    using type = avx2Float8;
};
// generic index mapping
// assumes index type width same as floating point type
template <> struct avx2<std::int64_t>
{
    using type = avx2Long4<std::int64_t>;
};
template <> struct avx2<std::uint64_t>
{
    using type = avx2Long4<std::uint64_t>;
};
#if defined(__APPLE__)
template <> struct avx2<std::size_t>
{
    using type = avx2Long4<std::size_t>;
};
#endif
template <> struct avx2<std::int32_t>
{
    using type = avx2Int8<std::int32_t>;
};
template <> struct avx2<std::uint32_t>
{
    using type = avx2Int8<std::uint32_t>;
};
// specialized index mapping
template <> struct avx2<std::int64_t, 4>
{
    using type = avx2Long4<std::int64_t>;
};
template <> struct avx2<std::uint64_t, 4>
{
    using type = avx2Long4<std::uint64_t>;
};
#if defined(__APPLE__)
template <> struct avx2<std::size_t, 4>
{
    using type = avx2Long4<std::size_t>;
};
#endif
template <> struct avx2<std::int32_t, 4>
{
    using type = sse2Int4<std::int32_t>;
};
template <> struct avx2<std::uint32_t, 4>
{
    using type = sse2Int4<std::uint32_t>;
};
template <> struct avx2<std::int32_t, 8>
{
    using type = avx2Int8<std::int32_t>;
};
template <> struct avx2<std::uint32_t, 8>
{
    using type = avx2Int8<std::uint32_t>;
};
// bool mapping
template <> struct avx2<bool, 4>
{
    using type = avx2Mask4;
};
template <> struct avx2<bool, 8>
{
    using type = avx2Mask8;
};

} // namespace abi

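The abi::avx2 trait above is the only lookup mechanism this file exposes: a scalar type (plus an optional width) selects one of the concrete vector structs, and anything without a specialization falls back to the primary template, i.e. void. A minimal sketch of how that resolution behaves, assuming the header is compiled with __AVX2__ and NEKTAR_ENABLE_SIMD_AVX2 defined:

    // sketch: resolving concrete types through tinysimd::abi::avx2
    #include <type_traits>

    using vec_d = tinysimd::abi::avx2<double>::type;           // avx2Double4
    using vec_f = tinysimd::abi::avx2<float>::type;            // avx2Float8
    using idx_t = tinysimd::abi::avx2<std::uint32_t, 4>::type; // sse2Int4<std::uint32_t>

    static_assert(std::is_same<vec_d, tinysimd::avx2Double4>::value, "");
    static_assert(vec_d::width == 4 && vec_f::width == 8, "");
    // unmapped combinations fall back to the primary template, i.e. void
    static_assert(std::is_same<tinysimd::abi::avx2<bool>::type, void>::value, "");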
// concrete types
template <typename T> struct avx2Int8
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int width = 8;
    static constexpr unsigned int alignment = 32;

    using scalarType = T;
    using vectorType = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Int8() = default;
    inline avx2Int8(const avx2Int8 &rhs) = default;
    inline avx2Int8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Int8(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }
    explicit inline avx2Int8(scalarArray &rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
    }

    // copy assignment
    inline avx2Int8 &operator=(const avx2Int8 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    inline void load(const scalarType *p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi32(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
{
    return _mm256_add_epi32(lhs._data, rhs._data);
}

template <
    typename T, typename U,
    typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
{
    return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
}
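The integer wrappers follow the same pattern as the floating point types below: plain load/store plus tag-dispatched overloads, where tag types declared in traits.hpp select the aligned, unaligned or (for the floating point types) streaming intrinsics at compile time; is_aligned is the only tag used inside this file. A minimal usage sketch, with the buffer name and values chosen purely for illustration:

    // sketch: filling and reading back an avx2Int8<std::uint32_t>
    inline void int8_example()
    {
        alignas(tinysimd::avx2Int8<std::uint32_t>::alignment)
            std::uint32_t buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};

        tinysimd::avx2Int8<std::uint32_t> a, b(10u); // b = broadcast of 10
        a.load(buf, tinysimd::is_aligned);           // aligned load via tag dispatch
        auto c = a + b;                              // element-wise add; a + 10u also works
        c.store(buf, tinysimd::is_aligned);          // buf becomes {10, 11, ..., 17}
    }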

////////////////////////////////////////////////////////////////////////////////

template <typename T> struct avx2Long4
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width = 4;
    static constexpr unsigned int alignment = 32;

    using scalarType = T;
    using vectorType = __m256i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Long4() = default;
    inline avx2Long4(const avx2Long4 &rhs) = default;
    inline avx2Long4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Long4(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }
    explicit inline avx2Long4(scalarArray &rhs)
    {
        _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
    }

    // copy assignment
    inline avx2Long4 &operator=(const avx2Long4 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
    }

    inline void load(const scalarType *p)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_epi64x(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }
};

template <typename T>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
{
    return _mm256_add_epi64(lhs._data, rhs._data);
}

template <
    typename T, typename U,
    typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
{
    return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
}

////////////////////////////////////////////////////////////////////////////////

struct avx2Double4
{
    static constexpr unsigned width = 4;
    static constexpr unsigned alignment = 32;

    using scalarType = double;
    using scalarIndexType = std::uint64_t;
    using vectorType = __m256d;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Double4() = default;
    inline avx2Double4(const avx2Double4 &rhs) = default;
    inline avx2Double4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Double4(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

    // copy assignment
    inline avx2Double4 &operator=(const avx2Double4 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_pd(p, _data);
    }

    template <class flag, typename std::enable_if<is_streaming<flag>::value,
                                                  bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_stream_pd(p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_pd(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_pd(rhs);
    }

#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
    // gather/scatter with sse2
    template <typename T>
    inline void gather(scalarType const *p, const sse2Int4<T> &indices)
    {
        _data = _mm256_i32gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
    {
        // no scatter intrinsics for AVX2
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
        out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
    }
#endif

    // gather scatter with avx2
    template <typename T>
    inline void gather(scalarType const *p, const avx2Long4<T> &indices)
    {
        _data = _mm256_i64gather_pd(p, indices._data, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
    {
        // no scatter intrinsics for AVX2
        alignas(alignment) scalarArray tmp;
        _mm256_store_pd(tmp, _data);

        out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
    }

    // fma
    // this = this + a * b
    inline void fma(const avx2Double4 &a, const avx2Double4 &b)
    {
        _data = _mm256_fmadd_pd(a._data, b._data, _data);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    // unary ops
    inline void operator+=(avx2Double4 rhs)
    {
        _data = _mm256_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx2Double4 rhs)
    {
        _data = _mm256_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx2Double4 rhs)
    {
        _data = _mm256_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx2Double4 rhs)
    {
        _data = _mm256_div_pd(_data, rhs._data);
    }
};

inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_add_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_sub_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_mul_pd(lhs._data, rhs._data);
}

inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
{
    return _mm256_div_pd(lhs._data, rhs._data);
}

inline avx2Double4 sqrt(avx2Double4 in)
{
    return _mm256_sqrt_pd(in._data);
}

inline avx2Double4 abs(avx2Double4 in)
{
    // there is no avx2 _mm256_abs_pd intrinsic
    static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
    return _mm256_andnot_pd(sign_mask, in._data);         // !sign_mask & x
}

inline avx2Double4 log(avx2Double4 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm256_log_pd(in._data);
#else
    // there is no avx2 log intrinsic
    // this is a dreadful implementation and is simply a stop gap measure
    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    avx2Double4 ret;
    ret.load(tmp);
    return ret;
#endif
}
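Taken together, avx2Double4 packs four doubles, supports the usual arithmetic through the overloads above, accumulates with fma (this = this + a * b), and emulates scatter with a store plus extracts, since AVX2 provides gathers but no scatter instruction. A minimal usage sketch, with buffer names and values chosen purely for illustration:

    // sketch: y = y + a * x on four doubles, then a gather in reverse order
    inline void double4_example()
    {
        alignas(tinysimd::avx2Double4::alignment) double x[4] = {1.0, 2.0, 3.0, 4.0};
        alignas(tinysimd::avx2Double4::alignment) double y[4] = {0.5, 0.5, 0.5, 0.5};

        tinysimd::avx2Double4 vx, vy, va(2.0); // va = broadcast of 2.0
        vx.load(x, tinysimd::is_aligned);
        vy.load(y, tinysimd::is_aligned);
        vy.fma(va, vx);                    // vy = vy + va * vx
        vy.store(y, tinysimd::is_aligned); // y = {2.5, 4.5, 6.5, 8.5}

        // gather y[3], y[2], y[1], y[0] using 64-bit indices
        alignas(32) std::uint64_t idx[4] = {3, 2, 1, 0};
        tinysimd::avx2Long4<std::uint64_t> indices(idx);
        tinysimd::avx2Double4 rev;
        rev.gather(y, indices); // rev[0] == y[3], ..., rev[3] == y[0]
    }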

inline void load_interleave(
    const double *in, std::uint32_t dataLen,
    std::vector<avx2Double4, allocator<avx2Double4>> &out)
{
    alignas(avx2Double4::alignment)
        size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    constexpr uint16_t unrl = 4;
    size_t nBlocks = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }

    // spillover loop
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}

inline void deinterleave_store(
    const std::vector<avx2Double4, allocator<avx2Double4>> &in,
    std::uint32_t dataLen, double *out)
{
    alignas(avx2Double4::alignment)
        size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
    using index_t = avx2Long4<size_t>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
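load_interleave and deinterleave_store convert between the scalar layout, where the input holds avx2Double4::width == 4 consecutive blocks of dataLen doubles (one block per SIMD lane), and the vector layout, where out[i] packs element i of every block into one avx2Double4. A minimal round-trip sketch, with sizes and names chosen for illustration and assuming the aligned allocator declared in allocator.hpp lives in namespace tinysimd, as its unqualified use above suggests:

    // sketch: 4 blocks of 5 doubles -> 5 vectors of width 4 -> back again
    inline void interleave_example()
    {
        constexpr std::uint32_t dataLen = 5;
        std::vector<double> scalars(4 * dataLen), restored(4 * dataLen);
        for (size_t i = 0; i < scalars.size(); ++i)
            scalars[i] = static_cast<double>(i);

        std::vector<tinysimd::avx2Double4,
                    tinysimd::allocator<tinysimd::avx2Double4>> vecs(dataLen);
        tinysimd::load_interleave(scalars.data(), dataLen, vecs);
        // vecs[i][lane] == scalars[lane * dataLen + i]

        tinysimd::deinterleave_store(vecs, dataLen, restored.data());
        // restored now equals scalars
    }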

//////////////////////////////////////////////////////////////////////////////

struct avx2Float8
{
    static constexpr unsigned width = 8;
    static constexpr unsigned alignment = 32;

    using scalarType = float;
    using scalarIndexType = std::uint32_t;
    using vectorType = __m256;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline avx2Float8() = default;
    inline avx2Float8(const avx2Float8 &rhs) = default;
    inline avx2Float8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx2Float8(const scalarType rhs)
    {
        _data = _mm256_set1_ps(rhs);
    }

    // copy assignment
    inline avx2Float8 &operator=(const avx2Float8 &) = default;

    // store
    inline void store(scalarType *p) const
    {
        _mm256_store_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_store_ps(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_storeu_ps(p, _data);
    }

    template <class flag, typename std::enable_if<is_streaming<flag>::value,
                                                  bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm256_stream_ps(p, _data);
    }

    // load packed
    inline void load(const scalarType *p)
    {
        _data = _mm256_load_ps(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_load_ps(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm256_loadu_ps(p);
    }

    // broadcast
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm256_set1_ps(rhs);
    }

    // gather scatter with avx2
    template <typename T>
    inline void gather(scalarType const *p, const avx2Int8<T> &indices)
    {
        _data = _mm256_i32gather_ps(p, indices._data, 4);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
    {
        // no scatter intrinsics for AVX2
        alignas(alignment) scalarArray tmp;
        _mm256_store_ps(tmp, _data);

        out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
        out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
        out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
        out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
        out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
        out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
    }

    // fma
    // this = this + a * b
    inline void fma(const avx2Float8 &a, const avx2Float8 &b)
    {
        _data = _mm256_fmadd_ps(a._data, b._data, _data);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }

    inline scalarType &operator[](size_t i)
    {
        scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
        return tmp[i];
    }

    inline void operator+=(avx2Float8 rhs)
    {
        _data = _mm256_add_ps(_data, rhs._data);
    }

    inline void operator-=(avx2Float8 rhs)
    {
        _data = _mm256_sub_ps(_data, rhs._data);
    }

    inline void operator*=(avx2Float8 rhs)
    {
        _data = _mm256_mul_ps(_data, rhs._data);
    }

    inline void operator/=(avx2Float8 rhs)
    {
        _data = _mm256_div_ps(_data, rhs._data);
    }
};

inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_add_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_sub_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_mul_ps(lhs._data, rhs._data);
}

inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
{
    return _mm256_div_ps(lhs._data, rhs._data);
}

inline avx2Float8 sqrt(avx2Float8 in)
{
    return _mm256_sqrt_ps(in._data);
}

inline avx2Float8 abs(avx2Float8 in)
{
    // there is no avx2 _mm256_abs_ps intrinsic
    static const __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
    return _mm256_andnot_ps(sign_mask, in._data);         // !sign_mask & x
}

inline avx2Float8 log(avx2Float8 in)
{
    // there is no avx2 log intrinsic
    // this is a dreadful implementation and is simply a stop gap measure
    alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
    in.store(tmp);
    tmp[0] = std::log(tmp[0]);
    tmp[1] = std::log(tmp[1]);
    tmp[2] = std::log(tmp[2]);
    tmp[3] = std::log(tmp[3]);
    tmp[4] = std::log(tmp[4]);
    tmp[5] = std::log(tmp[5]);
    tmp[6] = std::log(tmp[6]);
    tmp[7] = std::log(tmp[7]);
    avx2Float8 ret;
    ret.load(tmp);
    return ret;
}
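avx2Float8 mirrors avx2Double4 but packs eight single-precision lanes and takes 32-bit avx2Int8 indices for gather/scatter. A short sketch of the element-wise helpers above, with values chosen purely for illustration:

    // sketch: eight floats through abs and sqrt
    inline void float8_example()
    {
        alignas(tinysimd::avx2Float8::alignment)
            float v[8] = {1.f, -4.f, 9.f, -16.f, 25.f, -36.f, 49.f, -64.f};

        tinysimd::avx2Float8 a;
        a.load(v, tinysimd::is_aligned);
        tinysimd::avx2Float8 r = tinysimd::sqrt(tinysimd::abs(a)); // {1, 2, ..., 8}
        r.store(v, tinysimd::is_aligned);
    }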

inline void load_interleave(const float *in, std::uint32_t dataLen,
                            std::vector<avx2Float8, allocator<avx2Float8>> &out)
{

    alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
        0, dataLen, 2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};

    using index_t = avx2Int8<avx2Float8::scalarIndexType>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop
    size_t nBlocks = dataLen / 4;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[4 * i + 0].gather(in, index0);
        out[4 * i + 1].gather(in, index1);
        out[4 * i + 2].gather(in, index2);
        out[4 * i + 3].gather(in, index3);
        index0 = index0 + 4;
        index1 = index1 + 4;
        index2 = index2 + 4;
        index3 = index3 + 4;
    }

    // spillover loop
    for (size_t i = 4 * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}

inline void deinterleave_store(
    const std::vector<avx2Float8, allocator<avx2Float8>> &in,
    std::uint32_t dataLen, float *out)
{
    alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
        0, dataLen, 2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx2Int8<avx2Float8::scalarIndexType>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}

////////////////////////////////////////////////////////////////////////////////

// mask type
// mask is an integer type with special properties (broad boolean vector)
// the defined and allowed values for broad boolean vectors are:
// false=0x0 and true=0xFFFFFFFF
//
// VERY LIMITED SUPPORT...just enough to make the cubic EOS work...
//
struct avx2Mask4 : avx2Long4<std::uint64_t>
{
    // bring in ctors
    using avx2Long4::avx2Long4;

    static constexpr scalarType true_v = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
{
    return reinterpret_cast<__m256i>(
        _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
}

inline bool operator&&(avx2Mask4 lhs, bool rhs)
{
    bool tmp =
        _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));

    return tmp && rhs;
}

struct avx2Mask8 : avx2Int8<std::uint32_t>
{
    // bring in ctors
    using avx2Int8::avx2Int8;

    static constexpr scalarType true_v = -1;
    static constexpr scalarType false_v = 0;
};

inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
{
    // predicate 1 is _CMP_LT_OS: rhs < lhs is equivalent to lhs > rhs
    return reinterpret_cast<__m256i>(_mm256_cmp_ps(rhs._data, lhs._data, 1));
}

inline bool operator&&(avx2Mask8 lhs, bool rhs)
{
    bool tmp =
        _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask8::true_v));

    return tmp && rhs;
}

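The mask types exist only to support this pattern: a vector comparison produces a broad boolean, and operator&& collapses it with _mm256_testc_si256, returning true only when every lane of the mask is set. A minimal sketch, with values chosen for illustration:

    // sketch: "are all lanes of a greater than b?"
    inline bool all_lanes_greater(tinysimd::avx2Double4 a, tinysimd::avx2Double4 b)
    {
        tinysimd::avx2Mask4 m = a > b; // each lane is 0x0 or 0xFFFFFFFFFFFFFFFF
        return m && true;              // true only if every lane compared true
    }
    // all_lanes_greater(avx2Double4(2.0), avx2Double4(1.0)) -> true
    // all_lanes_greater(avx2Double4(2.0), avx2Double4(3.0)) -> false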
} // namespace tinysimd
#endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H