Nektar++
avx2.hpp
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: avx2.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Vector type using avx2 extension.
32//
33///////////////////////////////////////////////////////////////////////////////
34
35#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
36#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
37
38#if defined(__x86_64__)
39#include <immintrin.h>
40#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41#define TINYSIMD_HAS_SVML
42#endif
43#endif
44#include "allocator.hpp"
45#include "sse2.hpp"
46#include "traits.hpp"
47#include <cmath>
48#include <vector>
49
50namespace tinysimd::abi
51{
52
53template <typename scalarType, int width = 0> struct avx2
54{
55 using type = void;
56};
57
58} // namespace tinysimd::abi
59
60#if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
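// Note: the concrete AVX2 types below are only compiled when the compiler
// advertises AVX2 (__AVX2__, e.g. via -mavx2 or -march=native; -mfma is also
// needed for the fused multiply-add intrinsics used here) and the build opts
// in through NEKTAR_ENABLE_SIMD_AVX2 (typically a CMake option). Otherwise
// only the empty abi::avx2 fallback above is visible.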
61
62namespace tinysimd
63{
64
65// forward declaration of concrete types
66template <typename T> struct avx2Long4;
67template <typename T> struct avx2Int8;
68struct avx2Double4;
69struct avx2Float8;
70struct avx2Mask4;
71struct avx2Mask8;
72
73namespace abi
74{
75
76// mapping between abstract types and concrete floating point types
77template <> struct avx2<double>
78{
79 using type = avx2Double4;
80};
81template <> struct avx2<float>
82{
83 using type = avx2Float8;
84};
85// generic index mapping
86// assumes index type width same as floating point type
87template <> struct avx2<std::int64_t>
88{
89 using type = avx2Long4<std::int64_t>;
90};
91template <> struct avx2<std::uint64_t>
92{
93 using type = avx2Long4<std::uint64_t>;
94};
95#if defined(__APPLE__)
96template <> struct avx2<std::size_t>
97{
98 using type = avx2Long4<std::size_t>;
99};
100#endif
101template <> struct avx2<std::int32_t>
102{
103 using type = avx2Int8<std::int32_t>;
104};
105template <> struct avx2<std::uint32_t>
106{
107 using type = avx2Int8<std::uint32_t>;
108};
109// specialized index mapping
110template <> struct avx2<std::int64_t, 4>
111{
112 using type = avx2Long4<std::int64_t>;
113};
114template <> struct avx2<std::uint64_t, 4>
115{
116 using type = avx2Long4<std::uint64_t>;
117};
118#if defined(__APPLE__)
119template <> struct avx2<std::size_t, 4>
120{
121 using type = avx2Long4<std::size_t>;
122};
123#endif
124template <> struct avx2<std::int32_t, 4>
125{
126 using type = sse2Int4<std::int32_t>;
127};
128template <> struct avx2<std::uint32_t, 4>
129{
130 using type = sse2Int4<std::uint32_t>;
131};
132template <> struct avx2<std::int32_t, 8>
133{
134 using type = avx2Int8<std::int32_t>;
135};
136template <> struct avx2<std::uint32_t, 8>
137{
138 using type = avx2Int8<std::uint32_t>;
139};
140// bool mapping
141template <> struct avx2<bool, 4>
142{
143 using type = avx2Mask4;
144};
145template <> struct avx2<bool, 8>
146{
147 using type = avx2Mask8;
148};
149
150} // namespace abi
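// Usage sketch (illustrative; the alias name vec_t below is not part of this
// header): kernels normally pick the concrete vector type through these abi
// mappings rather than naming avx2Double4 directly, e.g.
//
//   using vec_t = typename tinysimd::abi::avx2<double>::type; // avx2Double4
//   static_assert(vec_t::width == 4 && vec_t::alignment == 32);
//
// so the same kernel source can be retargeted to other instruction sets.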
151
152// concrete types
153template <typename T> struct avx2Int8
154{
155 static_assert(std::is_integral_v<T> && sizeof(T) == 4,
156 "4 bytes Integral required.");
157
158 static constexpr unsigned int width = 8;
159 static constexpr unsigned int alignment = 32;
160
161 using scalarType = T;
162 using vectorType = __m256i;
163 using scalarArray = scalarType[width];
164
165 // storage
166 vectorType _data;
167
168 // ctors
169 inline avx2Int8() = default;
170 inline avx2Int8(const avx2Int8 &rhs) = default;
171 inline avx2Int8(const vectorType &rhs) : _data(rhs)
172 {
173 }
174 inline avx2Int8(const scalarType rhs)
175 {
176 _data = _mm256_set1_epi32(rhs);
177 }
178 explicit inline avx2Int8(scalarArray &rhs)
179 {
180 _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
181 }
182
183 // copy assignment
184 inline avx2Int8 &operator=(const avx2Int8 &) = default;
185
186 // store
187 inline void store(scalarType *p) const
188 {
189 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
190 }
191
192 template <class flag,
193 typename std::enable_if<is_requiring_alignment_v<flag> &&
194 !is_streaming_v<flag>,
195 bool>::type = 0>
196 inline void store(scalarType *p, flag) const
197 {
198 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
199 }
200
201 template <class flag, typename std::enable_if<
202 !is_requiring_alignment_v<flag>, bool>::type = 0>
203 inline void store(scalarType *p, flag) const
204 {
205 _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
206 }
207
208 inline void load(const scalarType *p)
209 {
210 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
211 }
212
213 template <class flag,
214 typename std::enable_if<is_requiring_alignment_v<flag> &&
215 !is_streaming_v<flag>,
216 bool>::type = 0>
217 inline void load(const scalarType *p, flag)
218 {
219 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
220 }
221
222 template <class flag, typename std::enable_if<
223 !is_requiring_alignment_v<flag>, bool>::type = 0>
224 inline void load(const scalarType *p, flag)
225 {
226 _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
227 }
228
229 inline void broadcast(const scalarType rhs)
230 {
231 _data = _mm256_set1_epi32(rhs);
232 }
233
234 // subscript
235 // subscript operators are convenient but expensive
236 // should not be used in optimized kernels
237 inline scalarType operator[](size_t i) const
238 {
239 alignas(alignment) scalarArray tmp;
240 store(tmp, is_aligned);
241 return tmp[i];
242 }
243
244 inline scalarType &operator[](size_t i)
245 {
246 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
247 return tmp[i];
248 }
249};
250
251template <typename T>
252inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
253{
254 return _mm256_add_epi32(lhs._data, rhs._data);
255}
256
257template <typename T, typename U,
258 typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
259inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
260{
261 return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
262}
263
264////////////////////////////////////////////////////////////////////////////////
265
266template <typename T> struct avx2Long4
267{
268 static_assert(std::is_integral_v<T> && sizeof(T) == 8,
269 "8 bytes Integral required.");
270
271 static constexpr unsigned int width = 4;
272 static constexpr unsigned int alignment = 32;
273
274 using scalarType = T;
275 using vectorType = __m256i;
276 using scalarArray = scalarType[width];
277
278 // storage
279 vectorType _data;
280
281 // ctors
282 inline avx2Long4() = default;
283 inline avx2Long4(const avx2Long4 &rhs) = default;
284 inline avx2Long4(const vectorType &rhs) : _data(rhs)
285 {
286 }
287 inline avx2Long4(const scalarType rhs)
288 {
289 _data = _mm256_set1_epi64x(rhs);
290 }
291 explicit inline avx2Long4(scalarArray &rhs)
292 {
293 _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
294 }
295
296 // copy assignment
297 inline avx2Long4 &operator=(const avx2Long4 &) = default;
298
299 // store
300 inline void store(scalarType *p) const
301 {
302 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
303 }
304
305 template <class flag,
306 typename std::enable_if<is_requiring_alignment_v<flag> &&
307 !is_streaming_v<flag>,
308 bool>::type = 0>
309 inline void store(scalarType *p, flag) const
310 {
311 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
312 }
313
314 template <class flag, typename std::enable_if<
315 !is_requiring_alignment_v<flag>, bool>::type = 0>
316 inline void store(scalarType *p, flag) const
317 {
318 _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
319 }
320
321 inline void load(const scalarType *p)
322 {
323 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
324 }
325
326 template <class flag,
327 typename std::enable_if<is_requiring_alignment_v<flag> &&
328 !is_streaming_v<flag>,
329 bool>::type = 0>
330 inline void load(const scalarType *p, flag)
331 {
332 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
333 }
334
335 template <class flag, typename std::enable_if<
336 !is_requiring_alignment_v<flag>, bool>::type = 0>
337 inline void load(const scalarType *p, flag)
338 {
339 _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
340 }
341
342 inline void broadcast(const scalarType rhs)
343 {
344 _data = _mm256_set1_epi64x(rhs);
345 }
346
347 // subscript
348 // subscript operators are convenient but expensive
349 // should not be used in optimized kernels
350 inline scalarType operator[](size_t i) const
351 {
352 alignas(alignment) scalarArray tmp;
353 store(tmp, is_aligned);
354 return tmp[i];
355 }
356
357 inline scalarType &operator[](size_t i)
358 {
359 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
360 return tmp[i];
361 }
362};
363
364template <typename T>
365inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
366{
367 return _mm256_add_epi64(lhs._data, rhs._data);
368}
369
370template <typename T, typename U,
371 typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
372inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
373{
374 return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
375}
376
377////////////////////////////////////////////////////////////////////////////////
378
379struct avx2Double4
380{
381 static constexpr unsigned width = 4;
382 static constexpr unsigned alignment = 32;
383
384 using scalarType = double;
385 using scalarIndexType = std::uint64_t;
386 using vectorType = __m256d;
387 using scalarArray = scalarType[width];
388
389 // storage
390 vectorType _data;
391
392 // ctors
393 inline avx2Double4() = default;
394 inline avx2Double4(const avx2Double4 &rhs) = default;
395 inline avx2Double4(const vectorType &rhs) : _data(rhs)
396 {
397 }
398 inline avx2Double4(const scalarType rhs)
399 {
400 _data = _mm256_set1_pd(rhs);
401 }
402
403 // copy assignment
404 inline avx2Double4 &operator=(const avx2Double4 &) = default;
405
406 // store
407 inline void store(scalarType *p) const
408 {
409 _mm256_store_pd(p, _data);
410 }
411
412 template <class flag,
413 typename std::enable_if<is_requiring_alignment_v<flag> &&
414 !is_streaming_v<flag>,
415 bool>::type = 0>
416 inline void store(scalarType *p, flag) const
417 {
418 _mm256_store_pd(p, _data);
419 }
420
421 template <class flag, typename std::enable_if<
422 !is_requiring_alignment_v<flag>, bool>::type = 0>
423 inline void store(scalarType *p, flag) const
424 {
425 _mm256_storeu_pd(p, _data);
426 }
427
428 template <class flag,
429 typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
430 inline void store(scalarType *p, flag) const
431 {
432 _mm256_stream_pd(p, _data);
433 }
434
435 // load packed
436 inline void load(const scalarType *p)
437 {
438 _data = _mm256_load_pd(p);
439 }
440
441 template <class flag, typename std::enable_if<
442 is_requiring_alignment_v<flag>, bool>::type = 0>
443 inline void load(const scalarType *p, flag)
444 {
445 _data = _mm256_load_pd(p);
446 }
447
448 template <class flag, typename std::enable_if<
449 !is_requiring_alignment_v<flag>, bool>::type = 0>
450 inline void load(const scalarType *p, flag)
451 {
452 _data = _mm256_loadu_pd(p);
453 }
454
455 // broadcast
456 inline void broadcast(const scalarType rhs)
457 {
458 _data = _mm256_set1_pd(rhs);
459 }
460
461#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
462 // gather/scatter with sse2
463 template <typename T>
464 inline void gather(scalarType const *p, const sse2Int4<T> &indices)
465 {
466 _data = _mm256_i32gather_pd(p, indices._data, 8);
467 }
468
469 template <typename T>
470 inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
471 {
472 // no scatter intrinsics for AVX2
473 alignas(alignment) scalarArray tmp;
474 _mm256_store_pd(tmp, _data);
475
476 out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
477 out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
478 out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
479 out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
480 }
481#endif
482
483 // gather/scatter with avx2
484 template <typename T>
485 inline void gather(scalarType const *p, const avx2Long4<T> &indices)
486 {
487 _data = _mm256_i64gather_pd(p, indices._data, 8);
488 }
489
490 template <typename T>
491 inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
492 {
493 // no scatter intrinsics for AVX2
494 alignas(alignment) scalarArray tmp;
495 _mm256_store_pd(tmp, _data);
496
497 out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
498 out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
499 out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
500 out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
501 }
502
503 // fma
504 // this = this + a * b
505 inline void fma(const avx2Double4 &a, const avx2Double4 &b)
506 {
507 _data = _mm256_fmadd_pd(a._data, b._data, _data);
508 }
509
510 // subscript
511 // subscript operators are convenient but expensive
512 // should not be used in optimized kernels
513 inline scalarType operator[](size_t i) const
514 {
515 alignas(alignment) scalarArray tmp;
516 store(tmp, is_aligned);
517 return tmp[i];
518 }
519
520 inline scalarType &operator[](size_t i)
521 {
522 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
523 return tmp[i];
524 }
525
526 // unary ops
527 inline void operator+=(avx2Double4 rhs)
528 {
529 _data = _mm256_add_pd(_data, rhs._data);
530 }
531
532 inline void operator-=(avx2Double4 rhs)
533 {
534 _data = _mm256_sub_pd(_data, rhs._data);
535 }
536
537 inline void operator*=(avx2Double4 rhs)
538 {
539 _data = _mm256_mul_pd(_data, rhs._data);
540 }
541
542 inline void operator/=(avx2Double4 rhs)
543 {
544 _data = _mm256_div_pd(_data, rhs._data);
545 }
546};
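// Usage sketch (illustrative only; the function and variable names below are
// not part of this header): a y += a * x kernel over aligned arrays whose
// length is a multiple of avx2Double4::width.
//
//   inline void daxpy4(size_t n, const double *a, const double *x, double *y)
//   {
//       for (size_t i = 0; i < n; i += avx2Double4::width)
//       {
//           avx2Double4 va, vx, vy;
//           va.load(a + i, is_aligned);
//           vx.load(x + i, is_aligned);
//           vy.load(y + i, is_aligned);
//           vy.fma(va, vx); // vy = vy + va * vx
//           vy.store(y + i, is_aligned);
//       }
//   }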
547
548inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
549{
550 return _mm256_add_pd(lhs._data, rhs._data);
551}
552
553inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
554{
555 return _mm256_sub_pd(lhs._data, rhs._data);
556}
557
558inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
559{
560 return _mm256_mul_pd(lhs._data, rhs._data);
561}
562
563inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
564{
565 return _mm256_div_pd(lhs._data, rhs._data);
566}
567
568inline avx2Double4 sqrt(avx2Double4 in)
569{
570 return _mm256_sqrt_pd(in._data);
571}
572
573inline avx2Double4 abs(avx2Double4 in)
574{
575 // there is no avx2 _mm256_abs_pd intrinsic
576 static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
577 return _mm256_andnot_pd(sign_mask, in._data); // !sign_mask & x
578}
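// For example, abs(-2.5): -2.5 has sign bit 63 set; andnot with the -0.0 mask
// clears exactly that bit, yielding +2.5, while +2.5 (sign bit already clear)
// passes through unchanged.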
579
580inline avx2Double4 log(avx2Double4 in)
581{
582#if defined(TINYSIMD_HAS_SVML)
583 return _mm256_log_pd(in._data);
584#else
585 // there is no avx2 log intrinsic
586 // this is a dreadful implementation and is simply a stop gap measure
587 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
588 in.store(tmp);
589 tmp[0] = std::log(tmp[0]);
590 tmp[1] = std::log(tmp[1]);
591 tmp[2] = std::log(tmp[2]);
592 tmp[3] = std::log(tmp[3]);
593 avx2Double4 ret;
594 ret.load(tmp);
595 return ret;
596#endif
597}
598
599inline void load_unalign_interleave(
600 const double *in, const std::uint32_t dataLen,
601 std::vector<avx2Double4, allocator<avx2Double4>> &out)
602{
603 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
604 for (size_t i = 0; i < dataLen; ++i)
605 {
606 tmp[0] = in[i];
607 tmp[1] = in[i + dataLen];
608 tmp[2] = in[i + 2 * dataLen];
609 tmp[3] = in[i + 3 * dataLen];
610 out[i].load(tmp);
611 }
612}
613
614inline void load_interleave(
615 const double *in, std::uint32_t dataLen,
616 std::vector<avx2Double4, allocator<avx2Double4>> &out)
617{
618 alignas(avx2Double4::alignment)
619 size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
620 using index_t = avx2Long4<size_t>;
621 index_t index0(tmp);
622 index_t index1 = index0 + 1;
623 index_t index2 = index0 + 2;
624 index_t index3 = index0 + 3;
625
626 // 4x unrolled loop
627 constexpr uint16_t unrl = 4;
628 size_t nBlocks = dataLen / unrl;
629 for (size_t i = 0; i < nBlocks; ++i)
630 {
631 out[unrl * i + 0].gather(in, index0);
632 out[unrl * i + 1].gather(in, index1);
633 out[unrl * i + 2].gather(in, index2);
634 out[unrl * i + 3].gather(in, index3);
635 index0 = index0 + unrl;
636 index1 = index1 + unrl;
637 index2 = index2 + unrl;
638 index3 = index3 + unrl;
639 }
640
641 // spillover loop
642 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
643 {
644 out[i].gather(in, index0);
645 index0 = index0 + 1;
646 }
647}
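// Layout sketch (illustrative): with dataLen = 3 the input holds four
// consecutive scalar blocks
//   in = [ a0 a1 a2 | b0 b1 b2 | c0 c1 c2 | d0 d1 d2 ]
// and after load_interleave each vector gathers one entry per block:
//   out[i] = { a_i, b_i, c_i, d_i }   for i = 0, 1, 2
// i.e. lane j of out[i] is in[i + j * dataLen].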
648
649inline void deinterleave_unalign_store(
650 const std::vector<avx2Double4, allocator<avx2Double4>> &in,
651 const std::uint32_t dataLen, double *out)
652{
653 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
654 for (size_t i = 0; i < dataLen; ++i)
655 {
656 in[i].store(tmp);
657 out[i] = tmp[0];
658 out[i + dataLen] = tmp[1];
659 out[i + 2 * dataLen] = tmp[2];
660 out[i + 3 * dataLen] = tmp[3];
661 }
662}
663
664inline void deinterleave_store(
665 const std::vector<avx2Double4, allocator<avx2Double4>> &in,
666 std::uint32_t dataLen, double *out)
667{
668 alignas(avx2Double4::alignment)
669 size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
670 using index_t = avx2Long4<size_t>;
671 index_t index0(tmp);
672
673 for (size_t i = 0; i < dataLen; ++i)
674 {
675 in[i].scatter(out, index0);
676 index0 = index0 + 1;
677 }
678}
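// Usage sketch (illustrative; the buffer names in, out and work are
// hypothetical): transposing a block of 4 * dataLen doubles into vector form
// and back.
//
//   std::vector<avx2Double4, allocator<avx2Double4>> work(dataLen);
//   load_interleave(in, dataLen, work);     // in:  4 * dataLen doubles
//   // ... operate on work[0..dataLen) with vector arithmetic ...
//   deinterleave_store(work, dataLen, out); // out: 4 * dataLen doubles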
679
680//////////////////////////////////////////////////////////////////////////////
681
682struct avx2Float8
683{
684 static constexpr unsigned width = 8;
685 static constexpr unsigned alignment = 32;
686
687 using scalarType = float;
688 using scalarIndexType = std::uint32_t;
689 using vectorType = __m256;
690 using scalarArray = scalarType[width];
691
692 // storage
693 vectorType _data;
694
695 // ctors
696 inline avx2Float8() = default;
697 inline avx2Float8(const avx2Float8 &rhs) = default;
698 inline avx2Float8(const vectorType &rhs) : _data(rhs)
699 {
700 }
701 inline avx2Float8(const scalarType rhs)
702 {
703 _data = _mm256_set1_ps(rhs);
704 }
705
706 // copy assignment
707 inline avx2Float8 &operator=(const avx2Float8 &) = default;
708
709 // store
710 inline void store(scalarType *p) const
711 {
712 _mm256_store_ps(p, _data);
713 }
714
715 template <class flag,
716 typename std::enable_if<is_requiring_alignment_v<flag> &&
717 !is_streaming_v<flag>,
718 bool>::type = 0>
719 inline void store(scalarType *p, flag) const
720 {
721 _mm256_store_ps(p, _data);
722 }
723
724 template <class flag, typename std::enable_if<
725 !is_requiring_alignment_v<flag>, bool>::type = 0>
726 inline void store(scalarType *p, flag) const
727 {
728 _mm256_storeu_ps(p, _data);
729 }
730
731 template <class flag,
732 typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
733 inline void store(scalarType *p, flag) const
734 {
735 _mm256_stream_ps(p, _data);
736 }
737
738 // load packed
739 inline void load(const scalarType *p)
740 {
741 _data = _mm256_load_ps(p);
742 }
743
744 template <class flag, typename std::enable_if<
745 is_requiring_alignment_v<flag>, bool>::type = 0>
746 inline void load(const scalarType *p, flag)
747 {
748 _data = _mm256_load_ps(p);
749 }
750
751 template <class flag, typename std::enable_if<
752 !is_requiring_alignment_v<flag>, bool>::type = 0>
753 inline void load(const scalarType *p, flag)
754 {
755 _data = _mm256_loadu_ps(p);
756 }
757
758 // broadcast
759 inline void broadcast(const scalarType rhs)
760 {
761 _data = _mm256_set1_ps(rhs);
762 }
763
764 // gather/scatter with avx2
765 template <typename T>
766 inline void gather(scalarType const *p, const avx2Int8<T> &indices)
767 {
768 _data = _mm256_i32gather_ps(p, indices._data, 4);
769 }
770
771 template <typename T>
772 inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
773 {
774 // no scatter intrinsics for AVX2
775 alignas(alignment) scalarArray tmp;
776 _mm256_store_ps(tmp, _data);
777
778 out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
779 out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
780 out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
781 out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
782 out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
783 out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
784 out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
785 out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
786 }
787
788 // fma
789 // this = this + a * b
790 inline void fma(const avx2Float8 &a, const avx2Float8 &b)
791 {
792 _data = _mm256_fmadd_ps(a._data, b._data, _data);
793 }
794
795 // subscript
796 // subscript operators are convenient but expensive
797 // should not be used in optimized kernels
798 inline scalarType operator[](size_t i) const
799 {
800 alignas(alignment) scalarArray tmp;
801 store(tmp, is_aligned);
802 return tmp[i];
803 }
804
805 inline scalarType &operator[](size_t i)
806 {
807 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
808 return tmp[i];
809 }
810
811 inline void operator+=(avx2Float8 rhs)
812 {
813 _data = _mm256_add_ps(_data, rhs._data);
814 }
815
816 inline void operator-=(avx2Float8 rhs)
817 {
818 _data = _mm256_sub_ps(_data, rhs._data);
819 }
820
821 inline void operator*=(avx2Float8 rhs)
822 {
823 _data = _mm256_mul_ps(_data, rhs._data);
824 }
825
826 inline void operator/=(avx2Float8 rhs)
827 {
828 _data = _mm256_div_ps(_data, rhs._data);
829 }
830};
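// Usage sketch (illustrative; the names idx, vidx, v and src below are
// hypothetical): an indexed load of 8 floats through the AVX2 gather path.
//
//   alignas(avx2Int8<std::uint32_t>::alignment)
//       std::uint32_t idx[avx2Float8::width] = {0, 3, 6, 9, 12, 15, 18, 21};
//   avx2Int8<std::uint32_t> vidx(idx);
//   avx2Float8 v;
//   v.gather(src, vidx); // lane j <- src[idx[j]], src being a const float*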
831
832inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
833{
834 return _mm256_add_ps(lhs._data, rhs._data);
835}
836
837inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
838{
839 return _mm256_sub_ps(lhs._data, rhs._data);
840}
841
842inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
843{
844 return _mm256_mul_ps(lhs._data, rhs._data);
845}
846
847inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
848{
849 return _mm256_div_ps(lhs._data, rhs._data);
850}
851
852inline avx2Float8 sqrt(avx2Float8 in)
853{
854 return _mm256_sqrt_ps(in._data);
855}
856
857inline avx2Float8 abs(avx2Float8 in)
858{
859 // there is no avx2 _mm256_abs_ps intrinsic
860 static const __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
861 return _mm256_andnot_ps(sign_mask, in._data); // !sign_mask & x
862}
863
864inline avx2Float8 log(avx2Float8 in)
865{
866 // there is no avx2 log intrinsic
867 // this is a dreadful implementation and is simply a stop gap measure
868 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
869 in.store(tmp);
870 tmp[0] = std::log(tmp[0]);
871 tmp[1] = std::log(tmp[1]);
872 tmp[2] = std::log(tmp[2]);
873 tmp[3] = std::log(tmp[3]);
874 tmp[4] = std::log(tmp[4]);
875 tmp[5] = std::log(tmp[5]);
876 tmp[6] = std::log(tmp[6]);
877 tmp[7] = std::log(tmp[7]);
878 avx2Float8 ret;
879 ret.load(tmp);
880 return ret;
881}
882
883inline void load_unalign_interleave(
884 const float *in, const std::uint32_t dataLen,
885 std::vector<avx2Float8, allocator<avx2Float8>> &out)
886{
887 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
888 for (size_t i = 0; i < dataLen; ++i)
889 {
890 tmp[0] = in[i];
891 tmp[1] = in[i + dataLen];
892 tmp[2] = in[i + 2 * dataLen];
893 tmp[3] = in[i + 3 * dataLen];
894 tmp[4] = in[i + 4 * dataLen];
895 tmp[5] = in[i + 5 * dataLen];
896 tmp[6] = in[i + 6 * dataLen];
897 tmp[7] = in[i + 7 * dataLen];
898 out[i].load(tmp);
899 }
900}
901
902inline void load_interleave(const float *in, std::uint32_t dataLen,
903 std::vector<avx2Float8, allocator<avx2Float8>> &out)
904{
905
906 alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
907 0, dataLen, 2 * dataLen, 3 * dataLen,
908 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
909
910 using index_t = avx2Int8<avx2Float8::scalarIndexType>;
911 index_t index0(tmp);
912 index_t index1 = index0 + 1;
913 index_t index2 = index0 + 2;
914 index_t index3 = index0 + 3;
915
916 // 4x unrolled loop
917 size_t nBlocks = dataLen / 4;
918 for (size_t i = 0; i < nBlocks; ++i)
919 {
920 out[4 * i + 0].gather(in, index0);
921 out[4 * i + 1].gather(in, index1);
922 out[4 * i + 2].gather(in, index2);
923 out[4 * i + 3].gather(in, index3);
924 index0 = index0 + 4;
925 index1 = index1 + 4;
926 index2 = index2 + 4;
927 index3 = index3 + 4;
928 }
929
930 // spillover loop
931 for (size_t i = 4 * nBlocks; i < dataLen; ++i)
932 {
933 out[i].gather(in, index0);
934 index0 = index0 + 1;
935 }
936}
937
938inline void deinterleave_unalign_store(
939 const std::vector<avx2Float8, allocator<avx2Float8>> &in,
940 const std::uint32_t dataLen, float *out)
941{
942 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
943 for (size_t i = 0; i < dataLen; ++i)
944 {
945 in[i].store(tmp);
946 out[i] = tmp[0];
947 out[i + dataLen] = tmp[1];
948 out[i + 2 * dataLen] = tmp[2];
949 out[i + 3 * dataLen] = tmp[3];
950 out[i + 4 * dataLen] = tmp[4];
951 out[i + 5 * dataLen] = tmp[5];
952 out[i + 6 * dataLen] = tmp[6];
953 out[i + 7 * dataLen] = tmp[7];
954 }
955}
956
957inline void deinterleave_store(
958 const std::vector<avx2Float8, allocator<avx2Float8>> &in,
959 std::uint32_t dataLen, float *out)
960{
961 alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
962 0, dataLen, 2 * dataLen, 3 * dataLen,
963 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
964 using index_t = avx2Int8<avx2Float8::scalarIndexType>;
965 index_t index0(tmp);
966
967 for (size_t i = 0; i < dataLen; ++i)
968 {
969 in[i].scatter(out, index0);
970 index0 = index0 + 1;
971 }
972}
973
974////////////////////////////////////////////////////////////////////////////////
975
976// mask type
977// mask is an int type with special properties (broad boolean vector)
978// broad boolean vector: the only defined and allowed values are
979// false=0x0 and true=0xFFFFFFFF
980//
981// VERY LIMITED SUPPORT...just enough to make cubic eos work...
982//
983struct avx2Mask4 : avx2Long4<std::uint64_t>
984{
985 // bring in ctors
986 using avx2Long4::avx2Long4;
987
988 static constexpr scalarType true_v = -1;
989 static constexpr scalarType false_v = 0;
990};
991
992inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
993{
994 return reinterpret_cast<__m256i>(
995 _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
996}
997
998inline bool operator&&(avx2Mask4 lhs, bool rhs)
999{
1000 bool tmp =
1001 _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));
1002
1003 return tmp && rhs;
1004}
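// Usage sketch (illustrative): the comparison produces a lane-wise mask and
// operator&& collapses it to a scalar bool that is true only when every lane
// satisfies the comparison, e.g.
//
//   avx2Double4 a(2.0), b(1.0);
//   if (a > b && true)
//   {
//       // taken: all four lanes of a exceed the corresponding lanes of b
//   }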
1005
1006struct avx2Mask8 : avx2Int8<std::uint32_t>
1007{
1008 // bring in ctors
1009 using avx2Int8::avx2Int8;
1010
1011 static constexpr scalarType true_v = -1;
1012 static constexpr scalarType false_v = 0;
1013};
1014
1015inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
1016{
1017 return reinterpret_cast<__m256i>(_mm256_cmp_ps(lhs._data, rhs._data, _CMP_GT_OQ));
1018}
1019
1020inline bool operator&&(avx2Mask8 lhs, bool rhs)
1021{
1022 bool tmp =
1023 _mm256_testc_si256(lhs._data, _mm256_set1_epi32(avx2Mask8::true_v));
1024
1025 return tmp && rhs;
1026}
1027
1028} // namespace tinysimd
1029#endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
1030#endif