Nektar++
avx2.hpp
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: avx2.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Vector types using the AVX2 extension.
32//
33///////////////////////////////////////////////////////////////////////////////
34
35#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
36#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
37
38#if defined(__x86_64__)
39#include <immintrin.h>
40#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41#define TINYSIMD_HAS_SVML
42#endif
43#endif
44#include "allocator.hpp"
45#include "sse2.hpp"
46#include "traits.hpp"
47#include <cmath>
48#include <vector>
49
50namespace tinysimd::abi
51{
52
53template <typename scalarType, int width = 0> struct avx2
54{
55 using type = void;
56};
57
58} // namespace tinysimd::abi
59
60#if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
61
62namespace tinysimd
63{
64
65// forward declaration of concrete types
66template <typename T> struct avx2Long4;
67template <typename T> struct avx2Int8;
68struct avx2Double4;
69struct avx2Float8;
70struct avx2Mask4;
71struct avx2Mask8;
72
73namespace abi
74{
75
76// mapping between abstract types and concrete floating point types
77template <> struct avx2<double>
78{
79 using type = avx2Double4;
80};
81template <> struct avx2<float>
82{
83 using type = avx2Float8;
84};
85// generic index mapping
86// assumes index type width same as floating point type
87template <> struct avx2<std::int64_t>
88{
89 using type = avx2Long4<std::int64_t>;
90};
91template <> struct avx2<std::uint64_t>
92{
93 using type = avx2Long4<std::uint64_t>;
94};
95#if defined(__APPLE__)
96template <> struct avx2<std::size_t>
97{
98 using type = avx2Long4<std::size_t>;
99};
100#endif
101template <> struct avx2<std::int32_t>
102{
103 using type = avx2Int8<std::int32_t>;
104};
105template <> struct avx2<std::uint32_t>
106{
107 using type = avx2Int8<std::uint32_t>;
108};
109// specialized index mapping
110template <> struct avx2<std::int64_t, 4>
111{
112 using type = avx2Long4<std::int64_t>;
113};
114template <> struct avx2<std::uint64_t, 4>
115{
116 using type = avx2Long4<std::uint64_t>;
117};
118#if defined(__APPLE__)
119template <> struct avx2<std::size_t, 4>
120{
121 using type = avx2Long4<std::size_t>;
122};
123#endif
124template <> struct avx2<std::int32_t, 4>
125{
126 using type = sse2Int4<std::int32_t>;
127};
128template <> struct avx2<std::uint32_t, 4>
129{
130 using type = sse2Int4<std::uint32_t>;
131};
132template <> struct avx2<std::int32_t, 8>
133{
134 using type = avx2Int8<std::int32_t>;
135};
136template <> struct avx2<std::uint32_t, 8>
137{
138 using type = avx2Int8<std::uint32_t>;
139};
140// bool mapping
141template <> struct avx2<bool, 4>
142{
143 using type = avx2Mask4;
144};
145template <> struct avx2<bool, 8>
146{
147 using type = avx2Mask8;
148};
149
150} // namespace abi
151
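The abi::avx2 trait above is the lookup table that maps a scalar type (and, optionally, a vector width) to a concrete SIMD type. A minimal sketch of how such a lookup resolves, assuming this header is included and AVX2 is enabled (the aliases vec_d, vec_f and idx_t are illustrative only):

    using vec_d = tinysimd::abi::avx2<double>::type;           // avx2Double4
    using vec_f = tinysimd::abi::avx2<float>::type;            // avx2Float8
    using idx_t = tinysimd::abi::avx2<std::uint32_t, 8>::type; // avx2Int8<std::uint32_t>
    static_assert(vec_d::width == 4, "four doubles per __m256d");
    static_assert(vec_f::width == 8, "eight floats per __m256");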
152// concrete types
153template <typename T> struct avx2Int8
154{
155 static_assert(std::is_integral<T>::value && sizeof(T) == 4,
156 "4 bytes Integral required.");
157
158 static constexpr unsigned int width = 8;
159 static constexpr unsigned int alignment = 32;
160
161 using scalarType = T;
162 using vectorType = __m256i;
163 using scalarArray = scalarType[width];
164
165 // storage
166 vectorType _data;
167
168 // ctors
169 inline avx2Int8() = default;
170 inline avx2Int8(const avx2Int8 &rhs) = default;
171 inline avx2Int8(const vectorType &rhs) : _data(rhs)
172 {
173 }
174 inline avx2Int8(const scalarType rhs)
175 {
176 _data = _mm256_set1_epi32(rhs);
177 }
178 explicit inline avx2Int8(scalarArray &rhs)
179 {
180 _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
181 }
182
183 // copy assignment
184 inline avx2Int8 &operator=(const avx2Int8 &) = default;
185
186 // store
187 inline void store(scalarType *p) const
188 {
189 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
190 }
191
192 template <class flag,
193 typename std::enable_if<is_requiring_alignment<flag>::value &&
194 !is_streaming<flag>::value,
195 bool>::type = 0>
196 inline void store(scalarType *p, flag) const
197 {
198 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
199 }
200
201 template <class flag,
202 typename std::enable_if<!is_requiring_alignment<flag>::value,
203 bool>::type = 0>
204 inline void store(scalarType *p, flag) const
205 {
206 _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
207 }
208
209 inline void load(const scalarType *p)
210 {
211 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
212 }
213
214 template <class flag,
215 typename std::enable_if<is_requiring_alignment<flag>::value &&
216 !is_streaming<flag>::value,
217 bool>::type = 0>
218 inline void load(const scalarType *p, flag)
219 {
220 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
221 }
222
223 template <class flag,
224 typename std::enable_if<!is_requiring_alignment<flag>::value,
225 bool>::type = 0>
226 inline void load(const scalarType *p, flag)
227 {
228 _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
229 }
230
231 inline void broadcast(const scalarType rhs)
232 {
233 _data = _mm256_set1_epi32(rhs);
234 }
235
236 // subscript
237 // subscript operators are convenient but expensive
238 // should not be used in optimized kernels
239 inline scalarType operator[](size_t i) const
240 {
241 alignas(alignment) scalarArray tmp;
242 store(tmp, is_aligned);
243 return tmp[i];
244 }
245
246 inline scalarType &operator[](size_t i)
247 {
248 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
249 return tmp[i];
250 }
251};
252
253template <typename T>
254inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
255{
256 return _mm256_add_epi32(lhs._data, rhs._data);
257}
258
259template <
260 typename T, typename U,
261 typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
262inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
263{
264 return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
265}
266
267////////////////////////////////////////////////////////////////////////////////
268
269template <typename T> struct avx2Long4
270{
271 static_assert(std::is_integral<T>::value && sizeof(T) == 8,
272 "8 bytes Integral required.");
273
274 static constexpr unsigned int width = 4;
275 static constexpr unsigned int alignment = 32;
276
277 using scalarType = T;
278 using vectorType = __m256i;
279 using scalarArray = scalarType[width];
280
281 // storage
282 vectorType _data;
283
284 // ctors
285 inline avx2Long4() = default;
286 inline avx2Long4(const avx2Long4 &rhs) = default;
287 inline avx2Long4(const vectorType &rhs) : _data(rhs)
288 {
289 }
290 inline avx2Long4(const scalarType rhs)
291 {
292 _data = _mm256_set1_epi64x(rhs);
293 }
294 explicit inline avx2Long4(scalarArray &rhs)
295 {
296 _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
297 }
298
299 // copy assignment
300 inline avx2Long4 &operator=(const avx2Long4 &) = default;
301
302 // store
303 inline void store(scalarType *p) const
304 {
305 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
306 }
307
308 template <class flag,
309 typename std::enable_if<is_requiring_alignment<flag>::value &&
310 !is_streaming<flag>::value,
311 bool>::type = 0>
312 inline void store(scalarType *p, flag) const
313 {
314 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
315 }
316
317 template <class flag,
318 typename std::enable_if<!is_requiring_alignment<flag>::value,
319 bool>::type = 0>
320 inline void store(scalarType *p, flag) const
321 {
322 _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
323 }
324
325 inline void load(const scalarType *p)
326 {
327 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
328 }
329
330 template <class flag,
331 typename std::enable_if<is_requiring_alignment<flag>::value &&
332 !is_streaming<flag>::value,
333 bool>::type = 0>
334 inline void load(const scalarType *p, flag)
335 {
336 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
337 }
338
339 template <class flag,
340 typename std::enable_if<!is_requiring_alignment<flag>::value,
341 bool>::type = 0>
342 inline void load(const scalarType *p, flag)
343 {
344 _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
345 }
346
347 inline void broadcast(const scalarType rhs)
348 {
349 _data = _mm256_set1_epi64x(rhs);
350 }
351
352 // subscript
353 // subscript operators are convenient but expensive
354 // should not be used in optimized kernels
355 inline scalarType operator[](size_t i) const
356 {
357 alignas(alignment) scalarArray tmp;
358 store(tmp, is_aligned);
359 return tmp[i];
360 }
361
362 inline scalarType &operator[](size_t i)
363 {
364 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
365 return tmp[i];
366 }
367};
368
369template <typename T>
370inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
371{
372 return _mm256_add_epi64(lhs._data, rhs._data);
373}
374
375template <
376 typename T, typename U,
377 typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
378inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
379{
380 return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
381}
382
383////////////////////////////////////////////////////////////////////////////////
384
385struct avx2Double4
386{
387 static constexpr unsigned width = 4;
388 static constexpr unsigned alignment = 32;
389
390 using scalarType = double;
391 using scalarIndexType = std::uint64_t;
392 using vectorType = __m256d;
393 using scalarArray = scalarType[width];
394
395 // storage
396 vectorType _data;
397
398 // ctors
399 inline avx2Double4() = default;
400 inline avx2Double4(const avx2Double4 &rhs) = default;
401 inline avx2Double4(const vectorType &rhs) : _data(rhs)
402 {
403 }
404 inline avx2Double4(const scalarType rhs)
405 {
406 _data = _mm256_set1_pd(rhs);
407 }
408
409 // copy assignment
410 inline avx2Double4 &operator=(const avx2Double4 &) = default;
411
412 // store
413 inline void store(scalarType *p) const
414 {
415 _mm256_store_pd(p, _data);
416 }
417
418 template <class flag,
419 typename std::enable_if<is_requiring_alignment<flag>::value &&
420 !is_streaming<flag>::value,
421 bool>::type = 0>
422 inline void store(scalarType *p, flag) const
423 {
424 _mm256_store_pd(p, _data);
425 }
426
427 template <class flag,
428 typename std::enable_if<!is_requiring_alignment<flag>::value,
429 bool>::type = 0>
430 inline void store(scalarType *p, flag) const
431 {
432 _mm256_storeu_pd(p, _data);
433 }
434
435 template <class flag, typename std::enable_if<is_streaming<flag>::value,
436 bool>::type = 0>
437 inline void store(scalarType *p, flag) const
438 {
439 _mm256_stream_pd(p, _data);
440 }
441
442 // load packed
443 inline void load(const scalarType *p)
444 {
445 _data = _mm256_load_pd(p);
446 }
447
448 template <class flag,
449 typename std::enable_if<is_requiring_alignment<flag>::value,
450 bool>::type = 0>
451 inline void load(const scalarType *p, flag)
452 {
453 _data = _mm256_load_pd(p);
454 }
455
456 template <class flag,
457 typename std::enable_if<!is_requiring_alignment<flag>::value,
458 bool>::type = 0>
459 inline void load(const scalarType *p, flag)
460 {
461 _data = _mm256_loadu_pd(p);
462 }
463
464 // broadcast
465 inline void broadcast(const scalarType rhs)
466 {
467 _data = _mm256_set1_pd(rhs);
468 }
469
470#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
471 // gather/scatter with sse2
472 template <typename T>
473 inline void gather(scalarType const *p, const sse2Int4<T> &indices)
474 {
475 _data = _mm256_i32gather_pd(p, indices._data, 8);
476 }
477
478 template <typename T>
479 inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
480 {
481 // no scatter intrinsics for AVX2
482 alignas(alignment) scalarArray tmp;
483 _mm256_store_pd(tmp, _data);
484
485 out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
486 out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
487 out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
488 out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
489 }
490#endif
491
492 // gather scatter with avx2
493 template <typename T>
494 inline void gather(scalarType const *p, const avx2Long4<T> &indices)
495 {
496 _data = _mm256_i64gather_pd(p, indices._data, 8);
497 }
498
499 template <typename T>
500 inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
501 {
502 // no scatter intrinsics for AVX2
503 alignas(alignment) scalarArray tmp;
504 _mm256_store_pd(tmp, _data);
505
506 out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
507 out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
508 out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
509 out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
510 }
511
512 // fma
513 // this = this + a * b
514 inline void fma(const avx2Double4 &a, const avx2Double4 &b)
515 {
516 _data = _mm256_fmadd_pd(a._data, b._data, _data);
517 }
518
519 // subscript
520 // subscript operators are convenient but expensive
521 // should not be used in optimized kernels
522 inline scalarType operator[](size_t i) const
523 {
524 alignas(alignment) scalarArray tmp;
525 store(tmp, is_aligned);
526 return tmp[i];
527 }
528
529 inline scalarType &operator[](size_t i)
530 {
531 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
532 return tmp[i];
533 }
534
535 // unary ops
536 inline void operator+=(avx2Double4 rhs)
537 {
538 _data = _mm256_add_pd(_data, rhs._data);
539 }
540
541 inline void operator-=(avx2Double4 rhs)
542 {
543 _data = _mm256_sub_pd(_data, rhs._data);
544 }
545
546 inline void operator*=(avx2Double4 rhs)
547 {
548 _data = _mm256_mul_pd(_data, rhs._data);
549 }
550
551 inline void operator/=(avx2Double4 rhs)
552 {
553 _data = _mm256_div_pd(_data, rhs._data);
554 }
555};
556
557inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
558{
559 return _mm256_add_pd(lhs._data, rhs._data);
560}
561
562inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
563{
564 return _mm256_sub_pd(lhs._data, rhs._data);
565}
566
567inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
568{
569 return _mm256_mul_pd(lhs._data, rhs._data);
570}
571
572inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
573{
574 return _mm256_div_pd(lhs._data, rhs._data);
575}
576
577inline avx2Double4 sqrt(avx2Double4 in)
578{
579 return _mm256_sqrt_pd(in._data);
580}
581
582inline avx2Double4 abs(avx2Double4 in)
583{
584 // there is no avx2 _mm256_abs_pd intrinsic
585 static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
586 return _mm256_andnot_pd(sign_mask, in._data); // !sign_mask & x
587}
588
589inline avx2Double4 log(avx2Double4 in)
590{
591#if defined(TINYSIMD_HAS_SVML)
592 return _mm256_log_pd(in._data);
593#else
594 // there is no avx2 log intrinsic
595 // this is a dreadful implementation and is simply a stop gap measure
596 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
597 in.store(tmp);
598 tmp[0] = std::log(tmp[0]);
599 tmp[1] = std::log(tmp[1]);
600 tmp[2] = std::log(tmp[2]);
601 tmp[3] = std::log(tmp[3]);
602 avx2Double4 ret;
603 ret.load(tmp);
604 return ret;
605#endif
606}
607
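For reference, a minimal usage sketch of avx2Double4 (illustrative only, not part of this header; assumes an AVX2-enabled build and 32-byte aligned buffers):

    alignas(tinysimd::avx2Double4::alignment) double a[4] = {1.0, 2.0, 3.0, 4.0};
    alignas(tinysimd::avx2Double4::alignment) double b[4] = {5.0, 6.0, 7.0, 8.0};
    alignas(tinysimd::avx2Double4::alignment) double c[4] = {0.0, 0.0, 0.0, 0.0};

    tinysimd::avx2Double4 va, vb, vc;
    va.load(a);     // aligned packed load
    vb.load(b);
    vc.load(c);
    vc.fma(va, vb); // vc = vc + va * vb
    vc.store(c);    // c = {5, 12, 21, 32}

In production code these types are normally reached through the generic abi mapping above rather than named directly.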
608inline void load_unalign_interleave(
609 const double *in, const std::uint32_t dataLen,
610 std::vector<avx2Double4, allocator<avx2Double4>> &out)
611{
612 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
613 for (size_t i = 0; i < dataLen; ++i)
614 {
615 tmp[0] = in[i];
616 tmp[1] = in[i + dataLen];
617 tmp[2] = in[i + 2 * dataLen];
618 tmp[3] = in[i + 3 * dataLen];
619 out[i].load(tmp);
620 }
621}
622
623inline void load_interleave(
624 const double *in, std::uint32_t dataLen,
625 std::vector<avx2Double4, allocator<avx2Double4>> &out)
626{
627 alignas(avx2Double4::alignment)
628 size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
629 using index_t = avx2Long4<size_t>;
630 index_t index0(tmp);
631 index_t index1 = index0 + 1;
632 index_t index2 = index0 + 2;
633 index_t index3 = index0 + 3;
634
635 // 4x unrolled loop
636 constexpr std::uint16_t unrl = 4;
637 size_t nBlocks = dataLen / unrl;
638 for (size_t i = 0; i < nBlocks; ++i)
639 {
640 out[unrl * i + 0].gather(in, index0);
641 out[unrl * i + 1].gather(in, index1);
642 out[unrl * i + 2].gather(in, index2);
643 out[unrl * i + 3].gather(in, index3);
644 index0 = index0 + unrl;
645 index1 = index1 + unrl;
646 index2 = index2 + unrl;
647 index3 = index3 + unrl;
648 }
649
650 // spillover loop
651 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
652 {
653 out[i].gather(in, index0);
654 index0 = index0 + 1;
655 }
656}
657
659 const std::vector<avx2Double4, allocator<avx2Double4>> &in,
660 const std::uint32_t dataLen, double *out)
661{
662 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
663 for (size_t i = 0; i < dataLen; ++i)
664 {
665 in[i].store(tmp);
666 out[i] = tmp[0];
667 out[i + dataLen] = tmp[1];
668 out[i + 2 * dataLen] = tmp[2];
669 out[i + 3 * dataLen] = tmp[3];
670 }
671}
672
673inline void deinterleave_store(
674 const std::vector<avx2Double4, allocator<avx2Double4>> &in,
675 std::uint32_t dataLen, double *out)
676{
677 alignas(avx2Double4::alignment)
678 size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
679 using index_t = avx2Long4<size_t>;
680 index_t index0(tmp);
681
682 for (size_t i = 0; i < dataLen; ++i)
683 {
684 in[i].scatter(out, index0);
685 index0 = index0 + 1;
686 }
687}
688
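load_interleave and deinterleave_store convert between a contiguous scalar layout of width blocks of length dataLen and the vector-of-SIMD layout used by the kernels; the *_unalign_* variants do the same element by element for buffers without alignment guarantees. A small round-trip sketch, assuming AVX2 is enabled (only the tinysimd calls come from this header, the rest is illustrative):

    constexpr std::uint32_t dataLen = 6;
    std::vector<double> in(4 * dataLen), out(4 * dataLen);
    for (std::size_t i = 0; i < in.size(); ++i)
        in[i] = static_cast<double>(i);

    std::vector<tinysimd::avx2Double4, tinysimd::allocator<tinysimd::avx2Double4>>
        work(dataLen);

    tinysimd::load_interleave(in.data(), dataLen, work);     // gather the 4 blocks into lanes
    tinysimd::deinterleave_store(work, dataLen, out.data()); // scatter lanes back; out == in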
689//////////////////////////////////////////////////////////////////////////////
690
691struct avx2Float8
692{
693 static constexpr unsigned width = 8;
694 static constexpr unsigned alignment = 32;
695
696 using scalarType = float;
697 using scalarIndexType = std::uint32_t;
698 using vectorType = __m256;
699 using scalarArray = scalarType[width];
700
701 // storage
702 vectorType _data;
703
704 // ctors
705 inline avx2Float8() = default;
706 inline avx2Float8(const avx2Float8 &rhs) = default;
707 inline avx2Float8(const vectorType &rhs) : _data(rhs)
708 {
709 }
710 inline avx2Float8(const scalarType rhs)
711 {
712 _data = _mm256_set1_ps(rhs);
713 }
714
715 // copy assignment
716 inline avx2Float8 &operator=(const avx2Float8 &) = default;
717
718 // store
719 inline void store(scalarType *p) const
720 {
721 _mm256_store_ps(p, _data);
722 }
723
724 template <class flag,
725 typename std::enable_if<is_requiring_alignment<flag>::value &&
726 !is_streaming<flag>::value,
727 bool>::type = 0>
728 inline void store(scalarType *p, flag) const
729 {
730 _mm256_store_ps(p, _data);
731 }
732
733 template <class flag,
734 typename std::enable_if<!is_requiring_alignment<flag>::value,
735 bool>::type = 0>
736 inline void store(scalarType *p, flag) const
737 {
738 _mm256_storeu_ps(p, _data);
739 }
740
741 template <class flag, typename std::enable_if<is_streaming<flag>::value,
742 bool>::type = 0>
743 inline void store(scalarType *p, flag) const
744 {
745 _mm256_stream_ps(p, _data);
746 }
747
748 // load packed
749 inline void load(const scalarType *p)
750 {
751 _data = _mm256_load_ps(p);
752 }
753
754 template <class flag,
755 typename std::enable_if<is_requiring_alignment<flag>::value,
756 bool>::type = 0>
757 inline void load(const scalarType *p, flag)
758 {
759 _data = _mm256_load_ps(p);
760 }
761
762 template <class flag,
763 typename std::enable_if<!is_requiring_alignment<flag>::value,
764 bool>::type = 0>
765 inline void load(const scalarType *p, flag)
766 {
767 _data = _mm256_loadu_ps(p);
768 }
769
770 // broadcast
771 inline void broadcast(const scalarType rhs)
772 {
773 _data = _mm256_set1_ps(rhs);
774 }
775
776 // gather scatter with avx2
777 template <typename T>
778 inline void gather(scalarType const *p, const avx2Int8<T> &indices)
779 {
780 _data = _mm256_i32gather_ps(p, indices._data, 4);
781 }
782
783 template <typename T>
784 inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
785 {
786 // no scatter intrinsics for AVX2
787 alignas(alignment) scalarArray tmp;
788 _mm256_store_ps(tmp, _data);
789
790 out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
791 out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
792 out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
793 out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
794 out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
795 out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
796 out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
797 out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
798 }
799
800 // fma
801 // this = this + a * b
802 inline void fma(const avx2Float8 &a, const avx2Float8 &b)
803 {
804 _data = _mm256_fmadd_ps(a._data, b._data, _data);
805 }
806
807 // subscript
808 // subscript operators are convenient but expensive
809 // should not be used in optimized kernels
810 inline scalarType operator[](size_t i) const
811 {
812 alignas(alignment) scalarArray tmp;
813 store(tmp, is_aligned);
814 return tmp[i];
815 }
816
817 inline scalarType &operator[](size_t i)
818 {
819 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
820 return tmp[i];
821 }
822
823 inline void operator+=(avx2Float8 rhs)
824 {
825 _data = _mm256_add_ps(_data, rhs._data);
826 }
827
828 inline void operator-=(avx2Float8 rhs)
829 {
830 _data = _mm256_sub_ps(_data, rhs._data);
831 }
832
833 inline void operator*=(avx2Float8 rhs)
834 {
835 _data = _mm256_mul_ps(_data, rhs._data);
836 }
837
838 inline void operator/=(avx2Float8 rhs)
839 {
840 _data = _mm256_div_ps(_data, rhs._data);
841 }
842};
843
844inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
845{
846 return _mm256_add_ps(lhs._data, rhs._data);
847}
848
849inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
850{
851 return _mm256_sub_ps(lhs._data, rhs._data);
852}
853
854inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
855{
856 return _mm256_mul_ps(lhs._data, rhs._data);
857}
858
859inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
860{
861 return _mm256_div_ps(lhs._data, rhs._data);
862}
863
864inline avx2Float8 sqrt(avx2Float8 in)
865{
866 return _mm256_sqrt_ps(in._data);
867}
868
869inline avx2Float8 abs(avx2Float8 in)
870{
871 // there is no avx2 _mm256_abs_ps intrinsic
872 static const __m256 sign_mask = _mm256_set1_ps(-0.); // -0.f = 1 << 31
873 return _mm256_andnot_ps(sign_mask, in._data); // !sign_mask & x
874}
875
876inline avx2Float8 log(avx2Float8 in)
877{
878 // there is no avx2 log intrinsic
879 // this is a dreadful implementation and is simply a stop gap measure
880 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
881 in.store(tmp);
882 tmp[0] = std::log(tmp[0]);
883 tmp[1] = std::log(tmp[1]);
884 tmp[2] = std::log(tmp[2]);
885 tmp[3] = std::log(tmp[3]);
886 tmp[4] = std::log(tmp[4]);
887 tmp[5] = std::log(tmp[5]);
888 tmp[6] = std::log(tmp[6]);
889 tmp[7] = std::log(tmp[7]);
890 avx2Float8 ret;
891 ret.load(tmp);
892 return ret;
893}
894
895inline void load_unalign_interleave(
896 const double *in, const std::uint32_t dataLen,
897 std::vector<avx2Float8, allocator<avx2Float8>> &out)
898{
899 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
900 for (size_t i = 0; i < dataLen; ++i)
901 {
902 tmp[0] = in[i];
903 tmp[1] = in[i + dataLen];
904 tmp[2] = in[i + 2 * dataLen];
905 tmp[3] = in[i + 3 * dataLen];
906 tmp[4] = in[i + 4 * dataLen];
907 tmp[5] = in[i + 5 * dataLen];
908 tmp[6] = in[i + 6 * dataLen];
909 tmp[7] = in[i + 7 * dataLen];
910 out[i].load(tmp);
911 }
912}
913
914inline void load_interleave(const float *in, std::uint32_t dataLen,
915 std::vector<avx2Float8, allocator<avx2Float8>> &out)
916{
917
918 alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
919 0, dataLen, 2 * dataLen, 3 * dataLen,
920 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
921
922 using index_t = avx2Int8<avx2Float8::scalarIndexType>;
923 index_t index0(tmp);
924 index_t index1 = index0 + 1;
925 index_t index2 = index0 + 2;
926 index_t index3 = index0 + 3;
927
928 // 4x unrolled loop
929 size_t nBlocks = dataLen / 4;
930 for (size_t i = 0; i < nBlocks; ++i)
931 {
932 out[4 * i + 0].gather(in, index0);
933 out[4 * i + 1].gather(in, index1);
934 out[4 * i + 2].gather(in, index2);
935 out[4 * i + 3].gather(in, index3);
936 index0 = index0 + 4;
937 index1 = index1 + 4;
938 index2 = index2 + 4;
939 index3 = index3 + 4;
940 }
941
942 // spillover loop
943 for (size_t i = 4 * nBlocks; i < dataLen; ++i)
944 {
945 out[i].gather(in, index0);
946 index0 = index0 + 1;
947 }
948}
949
950inline void deinterleave_unalign_store(
951 const std::vector<avx2Float8, allocator<avx2Float8>> &in,
952 const std::uint32_t dataLen, double *out)
953{
954 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
955 for (size_t i = 0; i < dataLen; ++i)
956 {
957 in[i].store(tmp);
958 out[i] = tmp[0];
959 out[i + dataLen] = tmp[1];
960 out[i + 2 * dataLen] = tmp[2];
961 out[i + 3 * dataLen] = tmp[3];
962 out[i + 4 * dataLen] = tmp[4];
963 out[i + 5 * dataLen] = tmp[5];
964 out[i + 6 * dataLen] = tmp[6];
965 out[i + 7 * dataLen] = tmp[7];
966 }
967}
968
969inline void deinterleave_store(
970 const std::vector<avx2Float8, allocator<avx2Float8>> &in,
971 std::uint32_t dataLen, float *out)
972{
973 alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
974 0, dataLen, 2 * dataLen, 3 * dataLen,
975 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
976 using index_t = avx2Int8<avx2Float8::scalarIndexType>;
977 index_t index0(tmp);
978
979 for (size_t i = 0; i < dataLen; ++i)
980 {
981 in[i].scatter(out, index0);
982 index0 = index0 + 1;
983 }
984}
985
986////////////////////////////////////////////////////////////////////////////////
987
988// mask type
989// mask is an int type with special properties (broad boolean vector)
990// the only defined and allowed lane values of a broad boolean vector are:
991// false = 0x0 and true = all bits set (0xFF...F)
992//
993// VERY LIMITED SUPPORT...just enough to make cubic eos work...
994//
995struct avx2Mask4 : avx2Long4<std::uint64_t>
996{
997 // bring in ctors
998 using avx2Long4::avx2Long4;
999
1000 static constexpr scalarType true_v = -1;
1001 static constexpr scalarType false_v = 0;
1002};
1003
1004inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
1005{
1006 return reinterpret_cast<__m256i>(
1007 _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
1008}
1009
1010inline bool operator&&(avx2Mask4 lhs, bool rhs)
1011{
1012 bool tmp =
1013 _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));
1014
1015 return tmp && rhs;
1016}
1017
1018struct avx2Mask8 : avx2Int8<std::uint32_t>
1019{
1020 // bring in ctors
1021 using avx2Int8::avx2Int8;
1022
1023 static constexpr scalarType true_v = -1;
1024 static constexpr scalarType false_v = 0;
1025};
1026
1027inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
1028{
1029 return reinterpret_cast<__m256i>(_mm256_cmp_ps(rhs._data, lhs._data, _CMP_LT_OS)); // rhs < lhs, i.e. lhs > rhs
1030}
1031
1032inline bool operator&&(avx2Mask8 lhs, bool rhs)
1033{
1034 bool tmp =
1035 _mm256_testc_si256(lhs._data, _mm256_set1_epi32(avx2Mask8::true_v)); // all 32-bit lanes must be true
1036
1037 return tmp && rhs;
1038}
1039
1040} // namespace tinysimd
1041#endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
1042#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
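A short sketch of how the broad boolean masks are meant to be used, mirroring the limited operator> / operator&& support above (illustrative only, assuming AVX2 is enabled):

    tinysimd::avx2Double4 x(2.0), y(1.0);
    tinysimd::avx2Mask4 m = x > y; // each 64-bit lane: all ones if x[i] > y[i], else 0
    if (m && true)                 // true only if every lane compared true
    {
        // all four lanes of x exceed y
    }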