Nektar++
avx512.hpp
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: avx512.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Vector type using avx512 extension.
32//
33///////////////////////////////////////////////////////////////////////////////
34
35#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
36#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
37
38#if defined(__x86_64__)
39#include <immintrin.h>
40#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41#define TINYSIMD_HAS_SVML
42#endif
43#endif
44#include "allocator.hpp"
45#include "avx2.hpp"
46#include "traits.hpp"
47#include <vector>
48
49namespace tinysimd::abi
50{
51
52template <typename scalarType, int width = 0> struct avx512
53{
54 using type = void;
55};
56
57} // namespace tinysimd::abi
58
59#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
60
61namespace tinysimd
62{
63
64// forward declaration of concrete types
65template <typename T> struct avx512Long8;
66template <typename T> struct avx512Int16;
67struct avx512Double8;
68struct avx512Float16;
69struct avx512Mask8;
70struct avx512Mask16;
71
72namespace abi
73{
74
75// mapping between abstract types and concrete floating point types
76template <> struct avx512<double>
77{
78 using type = avx512Double8;
79};
80template <> struct avx512<float>
81{
82 using type = avx512Float16;
83};
84// generic index mapping
85 // assumes the index type has the same width as the floating point type
86template <> struct avx512<std::int64_t>
87{
88 using type = avx512Long8<std::int64_t>;
89};
90template <> struct avx512<std::uint64_t>
91{
92 using type = avx512Long8<std::uint64_t>;
93};
94#if defined(__APPLE__)
95template <> struct avx512<std::size_t>
96{
97 using type = avx512Long8<std::size_t>;
98};
99#endif
100template <> struct avx512<std::int32_t>
101{
102 using type = avx512Int16<std::int32_t>;
103};
104template <> struct avx512<std::uint32_t>
105{
106 using type = avx512Int16<std::uint32_t>;
107};
108// specialized index mapping
109template <> struct avx512<std::int64_t, 8>
110{
111 using type = avx512Long8<std::int64_t>;
112};
113template <> struct avx512<std::uint64_t, 8>
114{
115 using type = avx512Long8<std::uint64_t>;
116};
117#if defined(__APPLE__)
118template <> struct avx512<std::size_t, 8>
119{
120 using type = avx512Long8<std::size_t>;
121};
122#endif
123template <> struct avx512<std::int32_t, 8>
124{
125 using type = avx2Int8<std::int32_t>;
126};
127template <> struct avx512<std::uint32_t, 8>
128{
129 using type = avx2Int8<std::uint32_t>;
130};
131template <> struct avx512<std::int32_t, 16>
132{
133 using type = avx512Int16<std::int32_t>;
134};
135template <> struct avx512<std::uint32_t, 16>
136{
137 using type = avx512Int16<std::uint32_t>;
138};
139// bool mapping
140template <> struct avx512<bool, 8>
141{
142 using type = avx512Mask8;
143};
144template <> struct avx512<bool, 16>
145{
146 using type = avx512Mask16;
147};
148
149} // namespace abi
150
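// Editorial usage sketch, not part of the original header: generic code picks
// the concrete vector type through the abi mapping above, e.g.
// abi::avx512<double>::type. Assuming <type_traits> is pulled in via
// traits.hpp, the mapping can be verified at compile time:
static_assert(std::is_same<abi::avx512<double>::type, avx512Double8>::value,
              "avx512<double> maps to 8 doubles per 512-bit register");
static_assert(std::is_same<abi::avx512<std::uint32_t, 16>::type,
                           avx512Int16<std::uint32_t>>::value,
              "avx512<uint32_t, 16> maps to 16 32-bit indices");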
151// concrete types
152
153 // could add enable_if to allow only unsigned int and int...
154template <typename T> struct avx512Int16
155{
156 static_assert(std::is_integral<T>::value && sizeof(T) == 4,
157 "4-byte integral type required.");
158
159 static constexpr unsigned int width = 16;
160 static constexpr unsigned int alignment = 64;
161
162 using scalarType = T;
163 using vectorType = __m512i;
164 using scalarArray = scalarType[width];
165
166 // storage
167 vectorType _data;
168
169 // ctors
170 inline avx512Int16() = default;
171 inline avx512Int16(const avx512Int16 &rhs) = default;
172 inline avx512Int16(const vectorType &rhs) : _data(rhs)
173 {
174 }
175 inline avx512Int16(const scalarType rhs)
176 {
177 _data = _mm512_set1_epi32(rhs);
178 }
179 explicit inline avx512Int16(scalarArray &rhs)
180 {
181 _data = _mm512_load_epi32(rhs);
182 }
183
184 // copy assignment
185 inline avx512Int16 &operator=(const avx512Int16 &) = default;
186
187 // store
188 inline void store(scalarType *p) const
189 {
190 _mm512_store_epi32(p, _data);
191 }
192
193 template <class flag,
194 typename std::enable_if<is_requiring_alignment<flag>::value &&
195 !is_streaming<flag>::value,
196 bool>::type = 0>
197 inline void store(scalarType *p, flag) const
198 {
199 _mm512_store_epi32(p, _data);
200 }
201
202 template <class flag,
203 typename std::enable_if<!is_requiring_alignment<flag>::value,
204 bool>::type = 0>
205 inline void store(scalarType *p, flag) const
206 {
207 _mm512_storeu_epi32(p, _data);
208 }
209
210 inline void load(const scalarType *p)
211 {
212 _data = _mm512_load_epi32(p);
213 }
214
215 template <class flag,
216 typename std::enable_if<is_requiring_alignment<flag>::value &&
217 !is_streaming<flag>::value,
218 bool>::type = 0>
219 inline void load(const scalarType *p, flag)
220 {
221 _data = _mm512_load_epi32(p);
222 }
223
224 template <class flag,
225 typename std::enable_if<!is_requiring_alignment<flag>::value,
226 bool>::type = 0>
227 inline void load(const scalarType *p, flag)
228 {
229 // even though the Intel intrinsics manual lists
230 // __m512i _mm512_loadu_epi32 (void const* mem_addr)
231 // it is not implemented in some compilers (gcc)
232 // since this is a bitwise load with no extension
233 // the following instruction is equivalent
234 _data = _mm512_loadu_si512(p);
235 }
236
237 inline void broadcast(const scalarType rhs)
238 {
239 _data = _mm512_set1_epi32(rhs);
240 }
241
242 // subscript
243 // subscript operators are convenient but expensive
244 // should not be used in optimized kernels
245 inline scalarType operator[](size_t i) const
246 {
247 alignas(alignment) scalarArray tmp;
248 store(tmp, is_aligned);
249 return tmp[i];
250 }
251
252 inline scalarType &operator[](size_t i)
253 {
254 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
255 return tmp[i];
256 }
257};
258
259template <typename T>
260inline avx512Int16<T> operator+(avx512Int16<T> lhs, avx512Int16<T> rhs)
261{
262 return _mm512_add_epi32(lhs._data, rhs._data);
263}
264
265template <
266 typename T, typename U,
267 typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
268inline avx512Int16<T> operator+(avx512Int16<T> lhs, U rhs)
269{
270 return _mm512_add_epi32(lhs._data, _mm512_set1_epi32(rhs));
271}
272
273////////////////////////////////////////////////////////////////////////////////
274
275template <typename T> struct avx512Long8
276{
277 static_assert(std::is_integral<T>::value && sizeof(T) == 8,
278 "8-byte integral type required.");
279
280 static constexpr unsigned int width = 8;
281 static constexpr unsigned int alignment = 64;
282
283 using scalarType = T;
284 using vectorType = __m512i;
285 using scalarArray = scalarType[width];
286
287 // storage
288 vectorType _data;
289
290 // ctors
291 inline avx512Long8() = default;
292 inline avx512Long8(const avx512Long8 &rhs) = default;
293 inline avx512Long8(const vectorType &rhs) : _data(rhs)
294 {
295 }
296 inline avx512Long8(const scalarType rhs)
297 {
298 _data = _mm512_set1_epi64(rhs);
299 }
300 explicit inline avx512Long8(scalarArray &rhs)
301 {
302 _data = _mm512_load_epi64(rhs);
303 }
304
305 // copy assignment
306 inline avx512Long8 &operator=(const avx512Long8 &) = default;
307
308 // store
309 inline void store(scalarType *p) const
310 {
311 _mm512_store_epi64(p, _data);
312 }
313
314 template <class flag,
315 typename std::enable_if<is_requiring_alignment<flag>::value &&
316 !is_streaming<flag>::value,
317 bool>::type = 0>
318 inline void store(scalarType *p, flag) const
319 {
320 _mm512_store_epi64(p, _data);
321 }
322
323 template <class flag,
324 typename std::enable_if<!is_requiring_alignment<flag>::value,
325 bool>::type = 0>
326 inline void store(scalarType *p, flag) const
327 {
328 _mm512_storeu_epi64(p, _data);
329 }
330
331 inline void load(const scalarType *p)
332 {
333 _data = _mm512_load_epi64(p);
334 }
335
336 template <class flag,
337 typename std::enable_if<is_requiring_alignment<flag>::value &&
338 !is_streaming<flag>::value,
339 bool>::type = 0>
340 inline void load(const scalarType *p, flag)
341 {
342 _data = _mm512_load_epi64(p);
343 }
344
345 template <class flag,
346 typename std::enable_if<!is_requiring_alignment<flag>::value,
347 bool>::type = 0>
348 inline void load(const scalarType *p, flag)
349 {
350 // even though the Intel intrinsics manual lists
351 // __m512i _mm512_loadu_epi64 (void const* mem_addr)
352 // it is not implemented in some compilers (gcc)
353 // since this is a bitwise load with no extension
354 // the following instruction is equivalent
355 _data = _mm512_loadu_si512(p);
356 }
357
358 inline void broadcast(const scalarType rhs)
359 {
360 _data = _mm512_set1_epi64(rhs);
361 }
362
363 // subscript
364 // subscript operators are convenient but expensive
365 // should not be used in optimized kernels
366 inline scalarType operator[](size_t i) const
367 {
368 alignas(alignment) scalarArray tmp;
369 store(tmp, is_aligned);
370 return tmp[i];
371 }
372
373 inline scalarType &operator[](size_t i)
374 {
375 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
376 return tmp[i];
377 }
378};
379
380template <typename T>
381inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
382{
383 return _mm512_add_epi64(lhs._data, rhs._data);
384}
385
386template <
387 typename T, typename U,
388 typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
389inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
390{
391 return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
392}
393
394////////////////////////////////////////////////////////////////////////////////
395
396struct avx512Double8
397{
398 static constexpr unsigned int width = 8;
399 static constexpr unsigned int alignment = 64;
400
401 using scalarType = double;
402 using scalarIndexType = std::uint64_t;
403 using vectorType = __m512d;
404 using scalarArray = scalarType[width];
405
406 // storage
407 vectorType _data;
408
409 // ctors
410 inline avx512Double8() = default;
411 inline avx512Double8(const avx512Double8 &rhs) = default;
412 inline avx512Double8(const vectorType &rhs) : _data(rhs)
413 {
414 }
415 inline avx512Double8(const scalarType rhs)
416 {
417 _data = _mm512_set1_pd(rhs);
418 }
419
420 // copy assignment
421 inline avx512Double8 &operator=(const avx512Double8 &) = default;
422
423 // store
424 inline void store(scalarType *p) const
425 {
426 _mm512_store_pd(p, _data);
427 }
428
429 template <class flag,
430 typename std::enable_if<is_requiring_alignment<flag>::value &&
431 !is_streaming<flag>::value,
432 bool>::type = 0>
433 inline void store(scalarType *p, flag) const
434 {
435 _mm512_store_pd(p, _data);
436 }
437
438 template <class flag,
439 typename std::enable_if<!is_requiring_alignment<flag>::value,
440 bool>::type = 0>
441 inline void store(scalarType *p, flag) const
442 {
443 _mm512_storeu_pd(p, _data);
444 }
445
446 template <class flag, typename std::enable_if<is_streaming<flag>::value,
447 bool>::type = 0>
448 inline void store(scalarType *p, flag) const
449 {
450 _mm512_stream_pd(p, _data);
451 }
452
453 // load packed
454 inline void load(const scalarType *p)
455 {
456 _data = _mm512_load_pd(p);
457 }
458
459 template <class flag,
460 typename std::enable_if<is_requiring_alignment<flag>::value,
461 bool>::type = 0>
462 inline void load(const scalarType *p, flag)
463 {
464 _data = _mm512_load_pd(p);
465 }
466
467 template <class flag,
468 typename std::enable_if<!is_requiring_alignment<flag>::value,
469 bool>::type = 0>
470 inline void load(const scalarType *p, flag)
471 {
472 _data = _mm512_loadu_pd(p);
473 }
474
475 // broadcast
476 inline void broadcast(const scalarType rhs)
477 {
478 _data = _mm512_set1_pd(rhs);
479 }
480
481 // gather/scatter
482 template <typename T>
483 inline void gather(scalarType const *p, const avx2Int8<T> &indices)
484 {
485 _data = _mm512_i32gather_pd(indices._data, p, 8);
486 }
487
488 template <typename T>
489 inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
490 {
491 _mm512_i32scatter_pd(out, indices._data, _data, 8);
492 }
493
494 template <typename T>
495 inline void gather(scalarType const *p, const avx512Long8<T> &indices)
496 {
497 _data = _mm512_i64gather_pd(indices._data, p, 8);
498 }
499
500 template <typename T>
501 inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
502 {
503 _mm512_i64scatter_pd(out, indices._data, _data, 8);
504 }
505
506 // fma
507 // this = this + a * b
508 inline void fma(const avx512Double8 &a, const avx512Double8 &b)
509 {
510 _data = _mm512_fmadd_pd(a._data, b._data, _data);
511 }
512
513 // subscript
514 // subscript operators are convenient but expensive
515 // should not be used in optimized kernels
516 inline scalarType operator[](size_t i) const
517 {
518 alignas(alignment) scalarArray tmp;
519 store(tmp, is_aligned);
520 return tmp[i];
521 }
522
523 inline scalarType &operator[](size_t i)
524 {
525 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
526 return tmp[i];
527 }
528
529 // unary ops
530 inline void operator+=(avx512Double8 rhs)
531 {
532 _data = _mm512_add_pd(_data, rhs._data);
533 }
534
535 inline void operator-=(avx512Double8 rhs)
536 {
537 _data = _mm512_sub_pd(_data, rhs._data);
538 }
539
540 inline void operator*=(avx512Double8 rhs)
541 {
542 _data = _mm512_mul_pd(_data, rhs._data);
543 }
544
545 inline void operator/=(avx512Double8 rhs)
546 {
547 _data = _mm512_div_pd(_data, rhs._data);
548 }
549};
550
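// Editorial usage sketch, not part of the original header: the flag-based
// store/load overloads above resolve at compile time to aligned, unaligned
// or streaming instructions. Both pointers below are assumed to be 64-byte
// aligned, e.g. memory obtained through the allocator in allocator.hpp.
inline void example_copy_aligned(const double *src, double *dst)
{
    avx512Double8 tmp;
    tmp.load(src, is_aligned);  // resolves to _mm512_load_pd
    tmp.store(dst, is_aligned); // resolves to _mm512_store_pd
}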
551inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
552{
553 return _mm512_add_pd(lhs._data, rhs._data);
554}
555
556inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
557{
558 return _mm512_sub_pd(lhs._data, rhs._data);
559}
560
561inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
562{
563 return _mm512_mul_pd(lhs._data, rhs._data);
564}
565
566inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
567{
568 return _mm512_div_pd(lhs._data, rhs._data);
569}
570
571inline avx512Double8 sqrt(avx512Double8 in)
572{
573 return _mm512_sqrt_pd(in._data);
574}
575
576inline avx512Double8 abs(avx512Double8 in)
577{
578 return _mm512_abs_pd(in._data);
579}
580
581inline avx512Double8 log(avx512Double8 in)
582{
583#if defined(TINYSIMD_HAS_SVML)
584 return _mm512_log_pd(in._data);
585#else
586 // there is no avx512 log intrinsic
587 // this is a dreadful implementation and is simply a stop gap measure
588 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
589 in.store(tmp);
590 tmp[0] = std::log(tmp[0]);
591 tmp[1] = std::log(tmp[1]);
592 tmp[2] = std::log(tmp[2]);
593 tmp[3] = std::log(tmp[3]);
594 tmp[4] = std::log(tmp[4]);
595 tmp[5] = std::log(tmp[5]);
596 tmp[6] = std::log(tmp[6]);
597 tmp[7] = std::log(tmp[7]);
598 avx512Double8 ret;
599 ret.load(tmp);
600 return ret;
601#endif
602}
603
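// Editorial usage sketch, not part of the original header: a vectorised
// y[i] += a[i] * b[i] loop built from the members above. Assumes 64-byte
// aligned pointers and n being a multiple of avx512Double8::width.
inline void example_fmadd(const double *a, const double *b, double *y,
                          size_t n)
{
    for (size_t i = 0; i < n; i += avx512Double8::width)
    {
        avx512Double8 va, vb, vy;
        va.load(a + i, is_aligned);
        vb.load(b + i, is_aligned);
        vy.load(y + i, is_aligned);
        vy.fma(va, vb); // vy = vy + va * vb, one _mm512_fmadd_pd
        vy.store(y + i, is_aligned);
    }
}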
604inline void load_unalign_interleave(
605 const double *in, const std::uint32_t dataLen,
606 std::vector<avx512Double8, allocator<avx512Double8>> &out)
607{
608 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
609 for (size_t i = 0; i < dataLen; ++i)
610 {
611 tmp[0] = in[i];
612 tmp[1] = in[i + dataLen];
613 tmp[2] = in[i + 2 * dataLen];
614 tmp[3] = in[i + 3 * dataLen];
615 tmp[4] = in[i + 4 * dataLen];
616 tmp[5] = in[i + 5 * dataLen];
617 tmp[6] = in[i + 6 * dataLen];
618 tmp[7] = in[i + 7 * dataLen];
619 out[i].load(tmp);
620 }
621}
622
623inline void load_interleave(
624 const double *in, std::uint32_t dataLen,
625 std::vector<avx512Double8, allocator<avx512Double8>> &out)
626{
627
628 alignas(avx512Double8::alignment)
629 avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
630 0, dataLen, 2 * dataLen, 3 * dataLen,
631 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
632
633 using index_t = avx512Long8<avx512Double8::scalarIndexType>;
634 index_t index0(tmp);
635 index_t index1 = index0 + 1;
636 index_t index2 = index0 + 2;
637 index_t index3 = index0 + 3;
638
639 // 4x unrolled loop
640 constexpr uint16_t unrl = 4;
641 size_t nBlocks = dataLen / unrl;
642 for (size_t i = 0; i < nBlocks; ++i)
643 {
644 out[unrl * i + 0].gather(in, index0);
645 out[unrl * i + 1].gather(in, index1);
646 out[unrl * i + 2].gather(in, index2);
647 out[unrl * i + 3].gather(in, index3);
648 index0 = index0 + unrl;
649 index1 = index1 + unrl;
650 index2 = index2 + unrl;
651 index3 = index3 + unrl;
652 }
653
654 // spillover loop
655 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
656 {
657 out[i].gather(in, index0);
658 index0 = index0 + 1;
659 }
660}
661
662 inline void deinterleave_unalign_store(
663 const std::vector<avx512Double8, allocator<avx512Double8>> &in,
664 const std::uint32_t dataLen, double *out)
665{
666 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
667 for (size_t i = 0; i < dataLen; ++i)
668 {
669 in[i].store(tmp);
670 out[i] = tmp[0];
671 out[i + dataLen] = tmp[1];
672 out[i + 2 * dataLen] = tmp[2];
673 out[i + 3 * dataLen] = tmp[3];
674 out[i + 4 * dataLen] = tmp[4];
675 out[i + 5 * dataLen] = tmp[5];
676 out[i + 6 * dataLen] = tmp[6];
677 out[i + 7 * dataLen] = tmp[7];
678 }
679}
680
681inline void deinterleave_store(
682 const std::vector<avx512Double8, allocator<avx512Double8>> &in,
683 std::uint32_t dataLen, double *out)
684{
685 // size_t nBlocks = dataLen / 4;
686
687 alignas(avx512Double8::alignment)
688 avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
689 0, dataLen, 2 * dataLen, 3 * dataLen,
690 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
691 using index_t = avx512Long8<avx512Double8::scalarIndexType>;
692 index_t index0(tmp);
693 for (size_t i = 0; i < dataLen; ++i)
694 {
695 in[i].scatter(out, index0);
696 index0 = index0 + 1;
697 }
698}
699
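// Editorial usage sketch, not part of the original header: load_interleave
// and deinterleave_store convert between dataLen scalar fields stored
// back-to-back (lane j of out[i] holds in[i + j * dataLen]) and a
// structure-of-arrays block of vectors, so scalar kernels can be evaluated
// 8 doubles at a time. The helper name below is hypothetical.
inline void example_interleave_roundtrip(const double *in,
                                         std::uint32_t dataLen, double *out)
{
    std::vector<avx512Double8, allocator<avx512Double8>> blk(dataLen);
    load_interleave(in, dataLen, blk);     // gather scalars into SIMD lanes
    // ... apply vector arithmetic to blk[i] here ...
    deinterleave_store(blk, dataLen, out); // scatter back to scalar layout
}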
700////////////////////////////////////////////////////////////////////////////////
701
702struct avx512Float16
703{
704 static constexpr unsigned int width = 16;
705 static constexpr unsigned int alignment = 64;
706
707 using scalarType = float;
708 using scalarIndexType = std::uint32_t;
709 using vectorType = __m512;
710 using scalarArray = scalarType[width];
711
712 // storage
713 vectorType _data;
714
715 // ctors
716 inline avx512Float16() = default;
717 inline avx512Float16(const avx512Float16 &rhs) = default;
718 inline avx512Float16(const vectorType &rhs) : _data(rhs)
719 {
720 }
721 inline avx512Float16(const scalarType rhs)
722 {
723 _data = _mm512_set1_ps(rhs);
724 }
725
726 // copy assignment
727 inline avx512Float16 &operator=(const avx512Float16 &) = default;
728
729 // store
730 inline void store(scalarType *p) const
731 {
732 _mm512_store_ps(p, _data);
733 }
734
735 template <class flag,
736 typename std::enable_if<is_requiring_alignment<flag>::value &&
737 !is_streaming<flag>::value,
738 bool>::type = 0>
739 inline void store(scalarType *p, flag) const
740 {
741 _mm512_store_ps(p, _data);
742 }
743
744 template <class flag,
745 typename std::enable_if<!is_requiring_alignment<flag>::value,
746 bool>::type = 0>
747 inline void store(scalarType *p, flag) const
748 {
749 _mm512_storeu_ps(p, _data);
750 }
751
752 template <class flag, typename std::enable_if<is_streaming<flag>::value,
753 bool>::type = 0>
754 inline void store(scalarType *p, flag) const
755 {
756 _mm512_stream_ps(p, _data);
757 }
758
759 // load packed
760 inline void load(const scalarType *p)
761 {
762 _data = _mm512_load_ps(p);
763 }
764
765 template <class flag,
766 typename std::enable_if<is_requiring_alignment<flag>::value,
767 bool>::type = 0>
768 inline void load(const scalarType *p, flag)
769 {
770 _data = _mm512_load_ps(p);
771 }
772
773 template <class flag,
774 typename std::enable_if<!is_requiring_alignment<flag>::value,
775 bool>::type = 0>
776 inline void load(const scalarType *p, flag)
777 {
778 _data = _mm512_loadu_ps(p);
779 }
780
781 // broadcast
782 inline void broadcast(const scalarType rhs)
783 {
784 _data = _mm512_set1_ps(rhs);
785 }
786
787 // gather/scatter
788 template <typename T>
789 inline void gather(scalarType const *p, const avx512Int16<T> &indices)
790 {
791 _data = _mm512_i32gather_ps(indices._data, p, sizeof(scalarType));
792 }
793
794 template <typename T>
795 inline void scatter(scalarType *out, const avx512Int16<T> &indices) const
796 {
797 _mm512_i32scatter_ps(out, indices._data, _data, sizeof(scalarType));
798 }
799
800 // fma
801 // this = this + a * b
802 inline void fma(const avx512Float16 &a, const avx512Float16 &b)
803 {
804 _data = _mm512_fmadd_ps(a._data, b._data, _data);
805 }
806
807 // subscript
808 // subscript operators are convenient but expensive
809 // should not be used in optimized kernels
810 inline scalarType operator[](size_t i) const
811 {
812 alignas(alignment) scalarArray tmp;
813 store(tmp, is_aligned);
814 return tmp[i];
815 }
816
817 inline scalarType &operator[](size_t i)
818 {
819 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
820 return tmp[i];
821 }
822
823 inline void operator+=(avx512Float16 rhs)
824 {
825 _data = _mm512_add_ps(_data, rhs._data);
826 }
827
828 inline void operator-=(avx512Float16 rhs)
829 {
830 _data = _mm512_sub_ps(_data, rhs._data);
831 }
832
833 inline void operator*=(avx512Float16 rhs)
834 {
835 _data = _mm512_mul_ps(_data, rhs._data);
836 }
837
838 inline void operator/=(avx512Float16 rhs)
839 {
840 _data = _mm512_div_ps(_data, rhs._data);
841 }
842};
843
844inline avx512Float16 operator+(avx512Float16 lhs, avx512Float16 rhs)
845{
846 return _mm512_add_ps(lhs._data, rhs._data);
847}
848
849inline avx512Float16 operator-(avx512Float16 lhs, avx512Float16 rhs)
850{
851 return _mm512_sub_ps(lhs._data, rhs._data);
852}
853
854inline avx512Float16 operator*(avx512Float16 lhs, avx512Float16 rhs)
855{
856 return _mm512_mul_ps(lhs._data, rhs._data);
857}
858
859inline avx512Float16 operator/(avx512Float16 lhs, avx512Float16 rhs)
860{
861 return _mm512_div_ps(lhs._data, rhs._data);
862}
863
864inline avx512Float16 sqrt(avx512Float16 in)
865{
866 return _mm512_sqrt_ps(in._data);
867}
868
869inline avx512Float16 abs(avx512Float16 in)
870{
871 return _mm512_abs_ps(in._data);
872}
873
874inline avx512Float16 log(avx512Float16 in)
875{
876#if defined(TINYSIMD_HAS_SVML)
877 return _mm512_log_ps(in._data);
878#else
879 // there is no avx512 log intrinsic
880 // this is a dreadful implementation and is simply a stop gap measure
881 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
882 in.store(tmp);
883 tmp[0] = std::log(tmp[0]);
884 tmp[1] = std::log(tmp[1]);
885 tmp[2] = std::log(tmp[2]);
886 tmp[3] = std::log(tmp[3]);
887 tmp[4] = std::log(tmp[4]);
888 tmp[5] = std::log(tmp[5]);
889 tmp[6] = std::log(tmp[6]);
890 tmp[7] = std::log(tmp[7]);
891 tmp[8] = std::log(tmp[8]);
892 tmp[9] = std::log(tmp[9]);
893 tmp[10] = std::log(tmp[10]);
894 tmp[11] = std::log(tmp[11]);
895 tmp[12] = std::log(tmp[12]);
896 tmp[13] = std::log(tmp[13]);
897 tmp[14] = std::log(tmp[14]);
898 tmp[15] = std::log(tmp[15]);
899 avx512Float16 ret;
900 ret.load(tmp);
901 return ret;
902#endif
903}
904
905inline void load_unalign_interleave(
906 const float *in, const std::uint32_t dataLen,
907 std::vector<avx512Float16, allocator<avx512Float16>> &out)
908{
909 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
910 for (size_t i = 0; i < dataLen; ++i)
911 {
912 tmp[0] = in[i];
913 tmp[1] = in[i + dataLen];
914 tmp[2] = in[i + 2 * dataLen];
915 tmp[3] = in[i + 3 * dataLen];
916 tmp[4] = in[i + 4 * dataLen];
917 tmp[5] = in[i + 5 * dataLen];
918 tmp[6] = in[i + 6 * dataLen];
919 tmp[7] = in[i + 7 * dataLen];
920 tmp[8] = in[i + 8 * dataLen];
921 tmp[9] = in[i + 9 * dataLen];
922 tmp[10] = in[i + 10 * dataLen];
923 tmp[11] = in[i + 11 * dataLen];
924 tmp[12] = in[i + 12 * dataLen];
925 tmp[13] = in[i + 13 * dataLen];
926 tmp[14] = in[i + 14 * dataLen];
927 tmp[15] = in[i + 15 * dataLen];
928 out[i].load(tmp);
929 }
930}
931
932inline void load_interleave(
933 const float *in, std::uint32_t dataLen,
934 std::vector<avx512Float16, allocator<avx512Float16>> &out)
935{
936
937 alignas(avx512Float16::alignment)
938 avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
939 0,
940 dataLen,
941 2 * dataLen,
942 3 * dataLen,
943 4 * dataLen,
944 5 * dataLen,
945 6 * dataLen,
946 7 * dataLen,
947 8 * dataLen,
948 9 * dataLen,
949 10 * dataLen,
950 11 * dataLen,
951 12 * dataLen,
952 13 * dataLen,
953 14 * dataLen,
954 15 * dataLen};
955
956 using index_t = avx512Int16<avx512Float16::scalarIndexType>;
957 index_t index0(tmp);
958 index_t index1 = index0 + 1;
959 index_t index2 = index0 + 2;
960 index_t index3 = index0 + 3;
961
962 // 4x unrolled loop
963 constexpr uint16_t unrl = 4;
964 size_t nBlocks = dataLen / unrl;
965 for (size_t i = 0; i < nBlocks; ++i)
966 {
967 out[unrl * i + 0].gather(in, index0);
968 out[unrl * i + 1].gather(in, index1);
969 out[unrl * i + 2].gather(in, index2);
970 out[unrl * i + 3].gather(in, index3);
971 index0 = index0 + unrl;
972 index1 = index1 + unrl;
973 index2 = index2 + unrl;
974 index3 = index3 + unrl;
975 }
976
977 // spillover loop
978 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
979 {
980 out[i].gather(in, index0);
981 index0 = index0 + 1;
982 }
983}
984
985 inline void deinterleave_unalign_store(
986 const std::vector<avx512Float16, allocator<avx512Float16>> &in,
987 const std::uint32_t dataLen, float *out)
988{
989 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
990 for (size_t i = 0; i < dataLen; ++i)
991 {
992 in[i].store(tmp);
993 out[i] = tmp[0];
994 out[i + dataLen] = tmp[1];
995 out[i + 2 * dataLen] = tmp[2];
996 out[i + 3 * dataLen] = tmp[3];
997 out[i + 4 * dataLen] = tmp[4];
998 out[i + 5 * dataLen] = tmp[5];
999 out[i + 6 * dataLen] = tmp[6];
1000 out[i + 7 * dataLen] = tmp[7];
1001 out[i + 8 * dataLen] = tmp[8];
1002 out[i + 9 * dataLen] = tmp[9];
1003 out[i + 10 * dataLen] = tmp[10];
1004 out[i + 11 * dataLen] = tmp[11];
1005 out[i + 12 * dataLen] = tmp[12];
1006 out[i + 13 * dataLen] = tmp[13];
1007 out[i + 14 * dataLen] = tmp[14];
1008 out[i + 15 * dataLen] = tmp[15];
1009 }
1010}
1011
1012inline void deinterleave_store(
1013 const std::vector<avx512Float16, allocator<avx512Float16>> &in,
1014 std::uint32_t dataLen, float *out)
1015{
1016 // size_t nBlocks = dataLen / 4;
1017
1018 alignas(avx512Float16::alignment)
1019 avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
1020 0,
1021 dataLen,
1022 2 * dataLen,
1023 3 * dataLen,
1024 4 * dataLen,
1025 5 * dataLen,
1026 6 * dataLen,
1027 7 * dataLen,
1028 8 * dataLen,
1029 9 * dataLen,
1030 10 * dataLen,
1031 11 * dataLen,
1032 12 * dataLen,
1033 13 * dataLen,
1034 14 * dataLen,
1035 15 * dataLen};
1036 using index_t = avx512Int16<avx512Float16::scalarIndexType>;
1037
1038 index_t index0(tmp);
1039 for (size_t i = 0; i < dataLen; ++i)
1040 {
1041 in[i].scatter(out, index0);
1042 index0 = index0 + 1;
1043 }
1044}
1045
1046////////////////////////////////////////////////////////////////////////////////
1047
1048// mask type
1049 // mask is an integer type with special properties (broad boolean vector)
1050 // the defined and allowed values for broad boolean vectors are:
1051// false=0x0 and true=0xFFFFFFFF
1052//
1053// VERY LIMITED SUPPORT...just enough to make cubic eos work...
1054//
1055struct avx512Mask8 : avx512Long8<std::uint64_t>
1056{
1057 // bring in ctors
1058 using avx512Long8::avx512Long8;
1059
1060 static constexpr scalarType true_v = -1;
1061 static constexpr scalarType false_v = 0;
1062};
1063
1064inline avx512Mask8 operator>(avx512Double8 lhs, avx512Double8 rhs)
1065{
1066 __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
1067 return _mm512_maskz_set1_epi64(mask, avx512Mask8::true_v);
1068}
1069
1070inline bool operator&&(avx512Mask8 lhs, bool rhs)
1071{
1072 __m512i val_true = _mm512_set1_epi64(avx512Mask8::true_v);
1073 __mmask8 mask = _mm512_test_epi64_mask(lhs._data, val_true);
1074 unsigned int tmp = _cvtmask16_u32(mask);
1075 return tmp && rhs;
1076}
1077
1078struct avx512Mask16 : avx512Int16<std::uint32_t>
1079{
1080 // bring in ctors
1081 using avx512Int16::avx512Int16;
1082
1083 static constexpr scalarType true_v = -1;
1084 static constexpr scalarType false_v = 0;
1085};
1086
1087inline avx512Mask16 operator>(avx512Float16 lhs, avx512Float16 rhs)
1088{
1089 __mmask16 mask = _mm512_cmp_ps_mask(lhs._data, rhs._data, _CMP_GT_OQ);
1090 return _mm512_maskz_set1_epi32(mask, avx512Mask16::true_v);
1091}
1092
1093inline bool operator&&(avx512Mask16 lhs, bool rhs)
1094{
1095 __m512i val_true = _mm512_set1_epi32(avx512Mask16::true_v);
1096 __mmask16 mask = _mm512_test_epi32_mask(lhs._data, val_true);
1097 unsigned int tmp = _cvtmask16_u32(mask);
1098 return tmp && rhs;
1099}
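// Editorial usage sketch, not part of the original header: the masks only
// support creation via operator> and an any-lane test via operator&&, which
// is enough to collapse a vector comparison into a scalar branch.
inline bool example_any_greater(avx512Double8 lhs, avx512Double8 rhs)
{
    avx512Mask8 mask = lhs > rhs; // all-ones per lane where lhs > rhs
    return mask && true;          // true if at least one lane compared true
}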
1100
1101} // namespace tinysimd
1102
1103#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
1104
1105#endif