Nektar++
avx2.hpp
Go to the documentation of this file.
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: avx2.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Vector type using avx2 extension.
32//
33///////////////////////////////////////////////////////////////////////////////
34
35#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
36#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H
37
38#if defined(__x86_64__)
39#include <immintrin.h>
40#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41#define TINYSIMD_HAS_SVML
42#endif
43#endif
44#include "allocator.hpp"
45#include "sse2.hpp"
46#include "traits.hpp"
47#include <cmath>
48#include <vector>
49
50namespace tinysimd::abi
51{
52
53template <typename scalarType, int width = 0> struct avx2
54{
55 using type = void;
56};
57
58} // namespace tinysimd::abi
59
60#if defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
61
62namespace tinysimd
63{
64
65// forward declaration of concrete types
66template <typename T> struct avx2Long4;
67template <typename T> struct avx2Int8;
68struct avx2Double4;
69struct avx2Float8;
70struct avx2Mask4;
71struct avx2Mask8;
72
73namespace abi
74{
75
76// mapping between abstract types and concrete floating point types
77template <> struct avx2<double>
78{
79 using type = avx2Double4;
80};
81template <> struct avx2<float>
82{
83 using type = avx2Float8;
84};
85// generic index mapping
86// assumes the index type has the same width as the floating point type
87template <> struct avx2<std::int64_t>
88{
89 using type = avx2Long4<std::int64_t>;
90};
91template <> struct avx2<std::uint64_t>
92{
93 using type = avx2Long4<std::uint64_t>;
94};
95#if defined(__APPLE__)
96template <> struct avx2<std::size_t>
97{
98 using type = avx2Long4<std::size_t>;
99};
100#endif
101template <> struct avx2<std::int32_t>
102{
103 using type = avx2Int8<std::int32_t>;
104};
105template <> struct avx2<std::uint32_t>
106{
107 using type = avx2Int8<std::uint32_t>;
108};
109// specialized index mapping
110template <> struct avx2<std::int64_t, 4>
111{
112 using type = avx2Long4<std::int64_t>;
113};
114template <> struct avx2<std::uint64_t, 4>
115{
116 using type = avx2Long4<std::uint64_t>;
117};
118#if defined(__APPLE__)
119template <> struct avx2<std::size_t, 4>
120{
121 using type = avx2Long4<std::size_t>;
122};
123#endif
124template <> struct avx2<std::int32_t, 4>
125{
126 using type = sse2Int4<std::int32_t>;
127};
128template <> struct avx2<std::uint32_t, 4>
129{
130 using type = sse2Int4<std::uint32_t>;
131};
132template <> struct avx2<std::int32_t, 8>
133{
134 using type = avx2Int8<std::int32_t>;
135};
136template <> struct avx2<std::uint32_t, 8>
137{
138 using type = avx2Int8<std::uint32_t>;
139};
140// bool mapping
141template <> struct avx2<bool, 4>
142{
143 using type = avx2Mask4;
144};
145template <> struct avx2<bool, 8>
146{
147 using type = avx2Mask8;
148};
149
150} // namespace abi
151
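// Example (sketch of user code; the example_* names are placeholders, not
// library API): how the abi trait above is typically consumed, assuming this
// header is included and the AVX2 branch below is compiled in. Generic kernels
// ask abi::avx2 for the concrete vector type of a scalar type and, optionally,
// a lane count, instead of naming avx2Double4 / avx2Float8 directly.
using example_vec_t = tinysimd::abi::avx2<double>::type;        // avx2Double4
using example_idx_t = tinysimd::abi::avx2<std::uint64_t>::type; // avx2Long4
using example_sgl_t = tinysimd::abi::avx2<float, 8>::type;      // avx2Float8
static_assert(std::is_same_v<example_vec_t, tinysimd::avx2Double4>,
              "double maps to the 4-lane AVX2 vector type");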
152// concrete types
153template <typename T> struct avx2Int8
154{
155 static_assert(std::is_integral_v<T> && sizeof(T) == 4,
156 "4 bytes Integral required.");
157
158 static constexpr unsigned int width = 8;
159 static constexpr unsigned int alignment = 32;
160
161 using scalarType = T;
162 using vectorType = __m256i;
163 using scalarArray = scalarType[width];
164
165 // storage
166 vectorType _data;
167
168 // ctors
169 inline avx2Int8() = default;
170 inline avx2Int8(const avx2Int8 &rhs) = default;
171 inline avx2Int8(const vectorType &rhs) : _data(rhs)
172 {
173 }
174 inline avx2Int8(const scalarType rhs)
175 {
176 _data = _mm256_set1_epi32(rhs);
177 }
178 explicit inline avx2Int8(scalarArray &rhs)
179 {
180 _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
181 }
182
183 // copy assignment
184 inline avx2Int8 &operator=(const avx2Int8 &) = default;
185
186 // store
187 inline void store(scalarType *p) const
188 {
189 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
190 }
191
192 template <class flag,
193 typename std::enable_if<is_requiring_alignment_v<flag> &&
194 !is_streaming_v<flag>,
195 bool>::type = 0>
196 inline void store(scalarType *p, flag) const
197 {
198 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
199 }
200
201 template <class flag, typename std::enable_if<
202 !is_requiring_alignment_v<flag>, bool>::type = 0>
203 inline void store(scalarType *p, flag) const
204 {
205 _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
206 }
207
208 inline void load(const scalarType *p)
209 {
210 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
211 }
212
213 template <class flag,
214 typename std::enable_if<is_requiring_alignment_v<flag> &&
215 !is_streaming_v<flag>,
216 bool>::type = 0>
217 inline void load(const scalarType *p, flag)
218 {
219 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
220 }
221
222 template <class flag, typename std::enable_if<
223 !is_requiring_alignment_v<flag>, bool>::type = 0>
224 inline void load(const scalarType *p, flag)
225 {
226 _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
227 }
228
229 inline void broadcast(const scalarType rhs)
230 {
231 _data = _mm256_set1_epi32(rhs);
232 }
233
234 // subscript
235 // subscript operators are convenient but expensive
236 // should not be used in optimized kernels
237 inline scalarType operator[](size_t i) const
238 {
239 alignas(alignment) scalarArray tmp;
240 store(tmp, is_aligned);
241 return tmp[i];
242 }
243
244 inline scalarType &operator[](size_t i)
245 {
246 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
247 return tmp[i];
248 }
249};
250
251template <typename T>
252inline avx2Int8<T> operator+(avx2Int8<T> lhs, avx2Int8<T> rhs)
253{
254 return _mm256_add_epi32(lhs._data, rhs._data);
255}
256
257template <typename T, typename U,
258 typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
259inline avx2Int8<T> operator+(avx2Int8<T> lhs, U rhs)
260{
261 return _mm256_add_epi32(lhs._data, _mm256_set1_epi32(rhs));
262}
263
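// Example (sketch of user code; example_int8_roundtrip is a placeholder name):
// basic load / arithmetic / store with avx2Int8, using the is_aligned tag from
// traits.hpp to select the aligned store overload, as the subscript helper
// above does internally.
inline void example_int8_roundtrip()
{
    using namespace tinysimd;
    alignas(avx2Int8<std::int32_t>::alignment)
        std::int32_t buf[avx2Int8<std::int32_t>::width] = {0, 1, 2, 3,
                                                           4, 5, 6, 7};
    avx2Int8<std::int32_t> v;
    v.load(buf);              // aligned packed load (_mm256_load_si256)
    v = v + 10;               // scalar rhs is broadcast via _mm256_set1_epi32
    v.store(buf, is_aligned); // aligned store overload
}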
264////////////////////////////////////////////////////////////////////////////////
265
266template <typename T> struct avx2Long4
267{
268 static_assert(std::is_integral_v<T> && sizeof(T) == 8,
269 "8 bytes Integral required.");
270
271 static constexpr unsigned int width = 4;
272 static constexpr unsigned int alignment = 32;
273
274 using scalarType = T;
275 using vectorType = __m256i;
276 using scalarArray = scalarType[width];
277
278 // storage
279 vectorType _data;
280
281 // ctors
282 inline avx2Long4() = default;
283 inline avx2Long4(const avx2Long4 &rhs) = default;
284 inline avx2Long4(const vectorType &rhs) : _data(rhs)
285 {
286 }
287 inline avx2Long4(const scalarType rhs)
288 {
289 _data = _mm256_set1_epi64x(rhs);
290 }
291 explicit inline avx2Long4(scalarArray &rhs)
292 {
293 _data = _mm256_load_si256(reinterpret_cast<vectorType *>(rhs));
294 }
295
296 // copy assignment
297 inline avx2Long4 &operator=(const avx2Long4 &) = default;
298
299 // store
300 inline void store(scalarType *p) const
301 {
302 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
303 }
304
305 template <class flag,
306 typename std::enable_if<is_requiring_alignment_v<flag> &&
307 !is_streaming_v<flag>,
308 bool>::type = 0>
309 inline void store(scalarType *p, flag) const
310 {
311 _mm256_store_si256(reinterpret_cast<vectorType *>(p), _data);
312 }
313
314 template <class flag, typename std::enable_if<
315 !is_requiring_alignment_v<flag>, bool>::type = 0>
316 inline void store(scalarType *p, flag) const
317 {
318 _mm256_storeu_si256(reinterpret_cast<vectorType *>(p), _data);
319 }
320
321 inline void load(const scalarType *p)
322 {
323 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
324 }
325
326 template <class flag,
327 typename std::enable_if<is_requiring_alignment_v<flag> &&
328 !is_streaming_v<flag>,
329 bool>::type = 0>
330 inline void load(const scalarType *p, flag)
331 {
332 _data = _mm256_load_si256(reinterpret_cast<const vectorType *>(p));
333 }
334
335 template <class flag, typename std::enable_if<
336 !is_requiring_alignment_v<flag>, bool>::type = 0>
337 inline void load(const scalarType *p, flag)
338 {
339 _data = _mm256_loadu_si256(reinterpret_cast<const vectorType *>(p));
340 }
341
342 inline void broadcast(const scalarType rhs)
343 {
344 _data = _mm256_set1_epi64x(rhs);
345 }
346
347 // subscript
348 // subscript operators are convenient but expensive
349 // should not be used in optimized kernels
350 inline scalarType operator[](size_t i) const
351 {
352 alignas(alignment) scalarArray tmp;
353 store(tmp, is_aligned);
354 return tmp[i];
355 }
356
357 inline scalarType &operator[](size_t i)
358 {
359 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
360 return tmp[i];
361 }
362};
363
364template <typename T>
365inline avx2Long4<T> operator+(avx2Long4<T> lhs, avx2Long4<T> rhs)
366{
367 return _mm256_add_epi64(lhs._data, rhs._data);
368}
369
370template <typename T, typename U,
371 typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
372inline avx2Long4<T> operator+(avx2Long4<T> lhs, U rhs)
373{
374 return _mm256_add_epi64(lhs._data, _mm256_set1_epi64x(rhs));
375}
376
377////////////////////////////////////////////////////////////////////////////////
378
379struct avx2Double4
380{
381 static constexpr unsigned width = 4;
382 static constexpr unsigned alignment = 32;
383
384 using scalarType = double;
385 using scalarIndexType = std::uint64_t;
386 using vectorType = __m256d;
387 using scalarArray = scalarType[width];
388
389 // storage
390 vectorType _data;
391
392 // ctors
393 inline avx2Double4() = default;
394 inline avx2Double4(const avx2Double4 &rhs) = default;
395 inline avx2Double4(const vectorType &rhs) : _data(rhs)
396 {
397 }
398 inline avx2Double4(const scalarType rhs)
399 {
400 _data = _mm256_set1_pd(rhs);
401 }
402
403 // copy assignment
404 inline avx2Double4 &operator=(const avx2Double4 &) = default;
405
406 // store
407 inline void store(scalarType *p) const
408 {
409 _mm256_store_pd(p, _data);
410 }
411
412 template <class flag,
413 typename std::enable_if<is_requiring_alignment_v<flag> &&
414 !is_streaming_v<flag>,
415 bool>::type = 0>
416 inline void store(scalarType *p, flag) const
417 {
418 _mm256_store_pd(p, _data);
419 }
420
421 template <class flag, typename std::enable_if<
422 !is_requiring_alignment_v<flag>, bool>::type = 0>
423 inline void store(scalarType *p, flag) const
424 {
425 _mm256_storeu_pd(p, _data);
426 }
427
428 template <class flag,
429 typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
430 inline void store(scalarType *p, flag) const
431 {
432 _mm256_stream_pd(p, _data);
433 }
434
435 // load packed
436 inline void load(const scalarType *p)
437 {
438 _data = _mm256_load_pd(p);
439 }
440
441 template <class flag, typename std::enable_if<
442 is_requiring_alignment_v<flag>, bool>::type = 0>
443 inline void load(const scalarType *p, flag)
444 {
445 _data = _mm256_load_pd(p);
446 }
447
448 template <class flag, typename std::enable_if<
449 !is_requiring_alignment_v<flag>, bool>::type = 0>
450 inline void load(const scalarType *p, flag)
451 {
452 _data = _mm256_loadu_pd(p);
453 }
454
455 // broadcast
456 inline void broadcast(const scalarType rhs)
457 {
458 _data = _mm256_set1_pd(rhs);
459 }
460
461#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
462 // gather/scatter with sse2
463 template <typename T>
464 inline void gather(scalarType const *p, const sse2Int4<T> &indices)
465 {
466 _data = _mm256_i32gather_pd(p, indices._data, 8);
467 }
468
469 template <typename T>
470 inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
471 {
472 // no scatter intrinsics for AVX2
473 alignas(alignment) scalarArray tmp;
474 _mm256_store_pd(tmp, _data);
475
476 out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
477 out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
478 out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
479 out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
480 }
481#endif
482
483 // gather scatter with avx2
484 template <typename T>
485 inline void gather(scalarType const *p, const avx2Long4<T> &indices)
486 {
487 _data = _mm256_i64gather_pd(p, indices._data, 8);
488 }
489
490 template <typename T>
491 inline void scatter(scalarType *out, const avx2Long4<T> &indices) const
492 {
493 // no scatter intrinsics for AVX2
494 alignas(alignment) scalarArray tmp;
495 _mm256_store_pd(tmp, _data);
496
497 out[_mm256_extract_epi64(indices._data, 0)] = tmp[0];
498 out[_mm256_extract_epi64(indices._data, 1)] = tmp[1];
499 out[_mm256_extract_epi64(indices._data, 2)] = tmp[2];
500 out[_mm256_extract_epi64(indices._data, 3)] = tmp[3];
501 }
502
503 // fma
504 // this = this + a * b
505 inline void fma(const avx2Double4 &a, const avx2Double4 &b)
506 {
507 _data = _mm256_fmadd_pd(a._data, b._data, _data);
508 }
509
510 // subscript
511 // subscript operators are convenient but expensive
512 // should not be used in optimized kernels
513 inline scalarType operator[](size_t i) const
514 {
515 alignas(alignment) scalarArray tmp;
516 store(tmp, is_aligned);
517 return tmp[i];
518 }
519
520 inline scalarType &operator[](size_t i)
521 {
522 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
523 return tmp[i];
524 }
525
526 // unary ops
527 inline void operator+=(avx2Double4 rhs)
528 {
529 _data = _mm256_add_pd(_data, rhs._data);
530 }
531
532 inline void operator-=(avx2Double4 rhs)
533 {
534 _data = _mm256_sub_pd(_data, rhs._data);
535 }
536
537 inline void operator*=(avx2Double4 rhs)
538 {
539 _data = _mm256_mul_pd(_data, rhs._data);
540 }
541
542 inline void operator/=(avx2Double4 rhs)
543 {
544 _data = _mm256_div_pd(_data, rhs._data);
545 }
546};
547
548inline avx2Double4 operator+(avx2Double4 lhs, avx2Double4 rhs)
549{
550 return _mm256_add_pd(lhs._data, rhs._data);
551}
552
553inline avx2Double4 operator-(avx2Double4 lhs, avx2Double4 rhs)
554{
555 return _mm256_sub_pd(lhs._data, rhs._data);
556}
557
558inline avx2Double4 operator-(avx2Double4 in)
559{
560 return _mm256_xor_pd(in._data, _mm256_set1_pd(-0.0));
561}
562
563inline avx2Double4 operator*(avx2Double4 lhs, avx2Double4 rhs)
564{
565 return _mm256_mul_pd(lhs._data, rhs._data);
566}
567
568inline avx2Double4 operator/(avx2Double4 lhs, avx2Double4 rhs)
569{
570 return _mm256_div_pd(lhs._data, rhs._data);
571}
572
573inline avx2Double4 sqrt(avx2Double4 in)
574{
575 return _mm256_sqrt_pd(in._data);
576}
577
578inline avx2Double4 abs(avx2Double4 in)
579{
580 // there is no avx2 _mm256_abs_pd intrinsic
581 static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
582 return _mm256_andnot_pd(sign_mask, in._data); // !sign_mask & x
583}
584
585inline avx2Double4 min(avx2Double4 lhs, avx2Double4 rhs)
586{
587 return _mm256_min_pd(lhs._data, rhs._data);
588}
589
590inline avx2Double4 max(avx2Double4 lhs, avx2Double4 rhs)
591{
592 return _mm256_max_pd(lhs._data, rhs._data);
593}
594
595inline avx2Double4 log(avx2Double4 in)
596{
597#if defined(TINYSIMD_HAS_SVML)
598 return _mm256_log_pd(in._data);
599#else
600 // there is no avx2 log intrinsic
601 // this is a dreadful implementation and is simply a stop gap measure
602 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
603 in.store(tmp);
604 tmp[0] = std::log(tmp[0]);
605 tmp[1] = std::log(tmp[1]);
606 tmp[2] = std::log(tmp[2]);
607 tmp[3] = std::log(tmp[3]);
608 avx2Double4 ret;
609 ret.load(tmp);
610 return ret;
611#endif
612}
613
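// Example (sketch of user code; example_dot4 is a placeholder name):
// elementwise arithmetic with avx2Double4, fma accumulation, and the
// store-to-aligned-array pattern recommended above over the subscript
// operators in hot code.
inline double example_dot4(const double *a, const double *b)
{
    using namespace tinysimd;
    // assumes a and b each point to at least 4 doubles, 32-byte aligned
    avx2Double4 va, vb, acc(0.0);
    va.load(a);
    vb.load(b);
    acc.fma(va, vb); // acc = acc + va * vb

    alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
    acc.store(tmp, is_aligned);
    return tmp[0] + tmp[1] + tmp[2] + tmp[3]; // horizontal reduction
}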
614inline void load_unalign_interleave(
615 const double *in, const std::uint32_t dataLen,
616 std::vector<avx2Double4, allocator<avx2Double4>> &out)
617{
618 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
619 for (size_t i = 0; i < dataLen; ++i)
620 {
621 tmp[0] = in[i];
622 tmp[1] = in[i + dataLen];
623 tmp[2] = in[i + 2 * dataLen];
624 tmp[3] = in[i + 3 * dataLen];
625 out[i].load(tmp);
626 }
627}
628
629inline void load_interleave(
630 const double *in, std::uint32_t dataLen,
631 std::vector<avx2Double4, allocator<avx2Double4>> &out)
632{
633 alignas(avx2Double4::alignment)
634 size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
635 using index_t = avx2Long4<size_t>;
636 index_t index0(tmp);
637 index_t index1 = index0 + 1;
638 index_t index2 = index0 + 2;
639 index_t index3 = index0 + 3;
640
641 // 4x unrolled loop
642 constexpr uint16_t unrl = 4;
643 size_t nBlocks = dataLen / unrl;
644 for (size_t i = 0; i < nBlocks; ++i)
645 {
646 out[unrl * i + 0].gather(in, index0);
647 out[unrl * i + 1].gather(in, index1);
648 out[unrl * i + 2].gather(in, index2);
649 out[unrl * i + 3].gather(in, index3);
650 index0 = index0 + unrl;
651 index1 = index1 + unrl;
652 index2 = index2 + unrl;
653 index3 = index3 + unrl;
654 }
655
656 // spillover loop
657 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
658 {
659 out[i].gather(in, index0);
660 index0 = index0 + 1;
661 }
662}
663
664inline void deinterleave_unalign_store(
665 const std::vector<avx2Double4, allocator<avx2Double4>> &in,
666 const std::uint32_t dataLen, double *out)
667{
668 alignas(avx2Double4::alignment) avx2Double4::scalarArray tmp;
669 for (size_t i = 0; i < dataLen; ++i)
670 {
671 in[i].store(tmp);
672 out[i] = tmp[0];
673 out[i + dataLen] = tmp[1];
674 out[i + 2 * dataLen] = tmp[2];
675 out[i + 3 * dataLen] = tmp[3];
676 }
677}
678
679inline void deinterleave_store(
680 const std::vector<avx2Double4, allocator<avx2Double4>> &in,
681 std::uint32_t dataLen, double *out)
682{
683 alignas(avx2Double4::alignment)
684 size_t tmp[avx2Double4::width] = {0, dataLen, 2 * dataLen, 3 * dataLen};
685 using index_t = avx2Long4<size_t>;
686 index_t index0(tmp);
687
688 for (size_t i = 0; i < dataLen; ++i)
689 {
690 in[i].scatter(out, index0);
691 index0 = index0 + 1;
692 }
693}
694
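// Example (sketch of user code; example_interleave_roundtrip is a placeholder
// name): round trip through load_interleave / deinterleave_store. The flat
// arrays hold width * dataLen doubles laid out as 4 consecutive blocks of
// length dataLen; each avx2Double4 in the work vector holds one entry from
// each block.
inline void example_interleave_roundtrip(const double *in, double *out,
                                         std::uint32_t dataLen)
{
    using namespace tinysimd;
    std::vector<avx2Double4, allocator<avx2Double4>> work(dataLen);
    load_interleave(in, dataLen, work);
    for (auto &v : work)
    {
        v = v * v; // some elementwise kernel
    }
    deinterleave_store(work, dataLen, out);
}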
695//////////////////////////////////////////////////////////////////////////////
696
697struct avx2Float8
698{
699 static constexpr unsigned width = 8;
700 static constexpr unsigned alignment = 32;
701
702 using scalarType = float;
703 using scalarIndexType = std::uint32_t;
704 using vectorType = __m256;
705 using scalarArray = scalarType[width];
706
707 // storage
708 vectorType _data;
709
710 // ctors
711 inline avx2Float8() = default;
712 inline avx2Float8(const avx2Float8 &rhs) = default;
713 inline avx2Float8(const vectorType &rhs) : _data(rhs)
714 {
715 }
716 inline avx2Float8(const scalarType rhs)
717 {
718 _data = _mm256_set1_ps(rhs);
719 }
720
721 // copy assignment
722 inline avx2Float8 &operator=(const avx2Float8 &) = default;
723
724 // store
725 inline void store(scalarType *p) const
726 {
727 _mm256_store_ps(p, _data);
728 }
729
730 template <class flag,
731 typename std::enable_if<is_requiring_alignment_v<flag> &&
732 !is_streaming_v<flag>,
733 bool>::type = 0>
734 inline void store(scalarType *p, flag) const
735 {
736 _mm256_store_ps(p, _data);
737 }
738
739 template <class flag, typename std::enable_if<
740 !is_requiring_alignment_v<flag>, bool>::type = 0>
741 inline void store(scalarType *p, flag) const
742 {
743 _mm256_storeu_ps(p, _data);
744 }
745
746 template <class flag,
747 typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
748 inline void store(scalarType *p, flag) const
749 {
750 _mm256_stream_ps(p, _data);
751 }
752
753 // load packed
754 inline void load(const scalarType *p)
755 {
756 _data = _mm256_load_ps(p);
757 }
758
759 template <class flag, typename std::enable_if<
760 is_requiring_alignment_v<flag>, bool>::type = 0>
761 inline void load(const scalarType *p, flag)
762 {
763 _data = _mm256_load_ps(p);
764 }
765
766 template <class flag, typename std::enable_if<
767 !is_requiring_alignment_v<flag>, bool>::type = 0>
768 inline void load(const scalarType *p, flag)
769 {
770 _data = _mm256_loadu_ps(p);
771 }
772
773 // broadcast
774 inline void broadcast(const scalarType rhs)
775 {
776 _data = _mm256_set1_ps(rhs);
777 }
778
779 // gather scatter with avx2
780 template <typename T>
781 inline void gather(scalarType const *p, const avx2Int8<T> &indices)
782 {
783 _data = _mm256_i32gather_ps(p, indices._data, 4);
784 }
785
786 template <typename T>
787 inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
788 {
789 // no scatter intrinsics for AVX2
790 alignas(alignment) scalarArray tmp;
791 _mm256_store_ps(tmp, _data);
792
793 out[_mm256_extract_epi32(indices._data, 0)] = tmp[0];
794 out[_mm256_extract_epi32(indices._data, 1)] = tmp[1];
795 out[_mm256_extract_epi32(indices._data, 2)] = tmp[2];
796 out[_mm256_extract_epi32(indices._data, 3)] = tmp[3];
797 out[_mm256_extract_epi32(indices._data, 4)] = tmp[4];
798 out[_mm256_extract_epi32(indices._data, 5)] = tmp[5];
799 out[_mm256_extract_epi32(indices._data, 6)] = tmp[6];
800 out[_mm256_extract_epi32(indices._data, 7)] = tmp[7];
801 }
802
803 // fma
804 // this = this + a * b
805 inline void fma(const avx2Float8 &a, const avx2Float8 &b)
806 {
807 _data = _mm256_fmadd_ps(a._data, b._data, _data);
808 }
809
810 // subscript
811 // subscript operators are convenient but expensive
812 // should not be used in optimized kernels
813 inline scalarType operator[](size_t i) const
814 {
815 alignas(alignment) scalarArray tmp;
816 store(tmp, is_aligned);
817 return tmp[i];
818 }
819
820 inline scalarType &operator[](size_t i)
821 {
822 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
823 return tmp[i];
824 }
825
826 inline void operator+=(avx2Float8 rhs)
827 {
828 _data = _mm256_add_ps(_data, rhs._data);
829 }
830
831 inline void operator-=(avx2Float8 rhs)
832 {
833 _data = _mm256_sub_ps(_data, rhs._data);
834 }
835
836 inline void operator*=(avx2Float8 rhs)
837 {
838 _data = _mm256_mul_ps(_data, rhs._data);
839 }
840
841 inline void operator/=(avx2Float8 rhs)
842 {
843 _data = _mm256_div_ps(_data, rhs._data);
844 }
845};
846
847inline avx2Float8 operator+(avx2Float8 lhs, avx2Float8 rhs)
848{
849 return _mm256_add_ps(lhs._data, rhs._data);
850}
851
852inline avx2Float8 operator-(avx2Float8 lhs, avx2Float8 rhs)
853{
854 return _mm256_sub_ps(lhs._data, rhs._data);
855}
856
857inline avx2Float8 operator-(avx2Float8 in)
858{
859 return _mm256_xor_ps(in._data, _mm256_set1_ps(-0.0));
860}
861
862inline avx2Float8 operator*(avx2Float8 lhs, avx2Float8 rhs)
863{
864 return _mm256_mul_ps(lhs._data, rhs._data);
865}
866
867inline avx2Float8 operator/(avx2Float8 lhs, avx2Float8 rhs)
868{
869 return _mm256_div_ps(lhs._data, rhs._data);
870}
871
872inline avx2Float8 sqrt(avx2Float8 in)
873{
874 return _mm256_sqrt_ps(in._data);
875}
876
877inline avx2Float8 abs(avx2Float8 in)
878{
879 // there is no avx2 _mm256_abs_ps intrinsic
880 static const __m256 sign_mask = _mm256_set1_ps(-0.); // -0.f = 1 << 31
881 return _mm256_andnot_ps(sign_mask, in._data); // !sign_mask & x
882}
883
884inline avx2Float8 min(avx2Float8 lhs, avx2Float8 rhs)
885{
886 return _mm256_min_ps(lhs._data, rhs._data);
887}
888
889inline avx2Float8 max(avx2Float8 lhs, avx2Float8 rhs)
890{
891 return _mm256_max_ps(lhs._data, rhs._data);
892}
893
894inline avx2Float8 log(avx2Float8 in)
895{
896 // there is no avx2 log intrinsic
897 // this is a dreadful implementation and is simply a stop gap measure
898 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
899 in.store(tmp);
900 tmp[0] = std::log(tmp[0]);
901 tmp[1] = std::log(tmp[1]);
902 tmp[2] = std::log(tmp[2]);
903 tmp[3] = std::log(tmp[3]);
904 tmp[4] = std::log(tmp[4]);
905 tmp[5] = std::log(tmp[5]);
906 tmp[6] = std::log(tmp[6]);
907 tmp[7] = std::log(tmp[7]);
908 avx2Float8 ret;
909 ret.load(tmp);
910 return ret;
911}
912
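// Example (sketch of user code; example_gather_scatter is a placeholder name):
// gather/scatter with 32-bit indices. Gather maps to _mm256_i32gather_ps;
// scatter falls back to scalar stores because AVX2 has no scatter instruction.
inline void example_gather_scatter(const float *src, float *dst)
{
    using namespace tinysimd;
    // assumes src and dst are valid for indices up to 14
    alignas(avx2Int8<std::uint32_t>::alignment)
        std::uint32_t idx[avx2Int8<std::uint32_t>::width] = {0, 2,  4,  6,
                                                             8, 10, 12, 14};
    avx2Int8<std::uint32_t> indices(idx);
    avx2Float8 v;
    v.gather(src, indices);  // v[i] = src[idx[i]]
    v = v + 1.0f;
    v.scatter(dst, indices); // dst[idx[i]] = v[i]
}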
913inline void load_unalign_interleave(
914 const double *in, const std::uint32_t dataLen,
915 std::vector<avx2Float8, allocator<avx2Float8>> &out)
916{
917 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
918 for (size_t i = 0; i < dataLen; ++i)
919 {
920 tmp[0] = in[i];
921 tmp[1] = in[i + dataLen];
922 tmp[2] = in[i + 2 * dataLen];
923 tmp[3] = in[i + 3 * dataLen];
924 tmp[4] = in[i + 4 * dataLen];
925 tmp[5] = in[i + 5 * dataLen];
926 tmp[6] = in[i + 6 * dataLen];
927 tmp[7] = in[i + 7 * dataLen];
928 out[i].load(tmp);
929 }
930}
931
932inline void load_interleave(const float *in, std::uint32_t dataLen,
933 std::vector<avx2Float8, allocator<avx2Float8>> &out)
934{
935
936 alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
937 0, dataLen, 2 * dataLen, 3 * dataLen,
938 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
939
940 using index_t = avx2Int8<avx2Float8::scalarIndexType>;
941 index_t index0(tmp);
942 index_t index1 = index0 + 1;
943 index_t index2 = index0 + 2;
944 index_t index3 = index0 + 3;
945
946 // 4x unrolled loop
947 size_t nBlocks = dataLen / 4;
948 for (size_t i = 0; i < nBlocks; ++i)
949 {
950 out[4 * i + 0].gather(in, index0);
951 out[4 * i + 1].gather(in, index1);
952 out[4 * i + 2].gather(in, index2);
953 out[4 * i + 3].gather(in, index3);
954 index0 = index0 + 4;
955 index1 = index1 + 4;
956 index2 = index2 + 4;
957 index3 = index3 + 4;
958 }
959
960 // spillover loop
961 for (size_t i = 4 * nBlocks; i < dataLen; ++i)
962 {
963 out[i].gather(in, index0);
964 index0 = index0 + 1;
965 }
966}
967
968inline void deinterleave_unalign_store(
969 const std::vector<avx2Float8, allocator<avx2Float8>> &in,
970 const std::uint32_t dataLen, double *out)
971{
972 alignas(avx2Float8::alignment) avx2Float8::scalarArray tmp;
973 for (size_t i = 0; i < dataLen; ++i)
974 {
975 in[i].store(tmp);
976 out[i] = tmp[0];
977 out[i + dataLen] = tmp[1];
978 out[i + 2 * dataLen] = tmp[2];
979 out[i + 3 * dataLen] = tmp[3];
980 out[i + 4 * dataLen] = tmp[4];
981 out[i + 5 * dataLen] = tmp[5];
982 out[i + 6 * dataLen] = tmp[6];
983 out[i + 7 * dataLen] = tmp[7];
984 }
985}
986
987inline void deinterleave_store(
988 const std::vector<avx2Float8, allocator<avx2Float8>> &in,
989 std::uint32_t dataLen, float *out)
990{
991 alignas(avx2Float8::alignment) avx2Float8::scalarIndexType tmp[8] = {
992 0, dataLen, 2 * dataLen, 3 * dataLen,
993 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
994 using index_t = avx2Int8<avx2Float8::scalarIndexType>;
995 index_t index0(tmp);
996
997 for (size_t i = 0; i < dataLen; ++i)
998 {
999 in[i].scatter(out, index0);
1000 index0 = index0 + 1;
1001 }
1002}
1003
1004////////////////////////////////////////////////////////////////////////////////
1005
1006// mask type
1007// a mask is an int type with special properties (broad boolean vector)
1008// the defined and allowed lane values of a broad boolean vector are:
1009// false = 0x0 and true = all bits set (0xFFFFFFFF per 32-bit lane,
1010// 0xFFFFFFFFFFFFFFFF per 64-bit lane)
1010//
1011// VERY LIMITED SUPPORT...just enough to make cubic eos work...
1012//
1013struct avx2Mask4 : avx2Long4<std::uint64_t>
1014{
1015 // bring in ctors
1016 using avx2Long4::avx2Long4;
1017
1018 static constexpr scalarType true_v = -1;
1019 static constexpr scalarType false_v = 0;
1020};
1021
1022inline avx2Mask4 operator>(avx2Double4 lhs, avx2Double4 rhs)
1023{
1024 return reinterpret_cast<__m256i>(
1025 _mm256_cmp_pd(lhs._data, rhs._data, _CMP_GT_OQ));
1026}
1027
1028inline bool operator&&(avx2Mask4 lhs, bool rhs)
1029{
1030 bool tmp =
1031 _mm256_testc_si256(lhs._data, _mm256_set1_epi64x(avx2Mask4::true_v));
1032
1033 return tmp && rhs;
1034}
1035
1036struct avx2Mask8 : avx2Int8<std::uint32_t>
1037{
1038 // bring in ctors
1039 using avx2Int8::avx2Int8;
1040
1041 static constexpr scalarType true_v = -1;
1042 static constexpr scalarType false_v = 0;
1043};
1044
1045inline avx2Mask8 operator>(avx2Float8 lhs, avx2Float8 rhs)
1046{
1047 return reinterpret_cast<__m256i>(_mm256_cmp_ps(rhs._data, lhs._data, _CMP_LT_OS)); // rhs < lhs, i.e. lhs > rhs
1048}
1049
1050inline bool operator&&(avx2Mask8 lhs, bool rhs)
1051{
1052 bool tmp =
1053 _mm256_testc_si256(lhs._data, _mm256_set1_epi32(avx2Mask8::true_v));
1054
1055 return tmp && rhs;
1056}
1057
1058} // namespace tinysimd
1059#endif // defined(__AVX2__) && defined(NEKTAR_ENABLE_SIMD_AVX2)
1060#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX2_H