avx512.hpp
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: avx512.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Vector type using avx512 extension.
32//
33///////////////////////////////////////////////////////////////////////////////
34
35#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
36#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
37
38#if defined(__x86_64__)
39#include <immintrin.h>
40#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41#define TINYSIMD_HAS_SVML
42#endif
43#endif
44#include "allocator.hpp"
45#include "avx2.hpp"
46#include "traits.hpp"
47#include <vector>
48
49namespace tinysimd::abi
50{
51
52template <typename scalarType, int width = 0> struct avx512
53{
54 using type = void;
55};
56
57} // namespace tinysimd::abi
58
59#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
60
61namespace tinysimd
62{
63
64// forward declaration of concrete types
65template <typename T> struct avx512Long8;
66template <typename T> struct avx512Int16;
67struct avx512Double8;
68struct avx512Float16;
69struct avx512Mask8;
70struct avx512Mask16;
71
72namespace abi
73{
74
75// mapping between abstract types and concrete floating point types
76template <> struct avx512<double>
77{
78 using type = avx512Double8;
79};
80template <> struct avx512<float>
81{
82 using type = avx512Float16;
83};
84// generic index mapping
85// assumes index type width same as floating point type
86template <> struct avx512<std::int64_t>
87{
88 using type = avx512Long8<std::int64_t>;
89};
90template <> struct avx512<std::uint64_t>
91{
92 using type = avx512Long8<std::uint64_t>;
93};
94#if defined(__APPLE__)
95template <> struct avx512<std::size_t>
96{
97 using type = avx512Long8<std::size_t>;
98};
99#endif
100template <> struct avx512<std::int32_t>
101{
102 using type = avx512Int16<std::int32_t>;
103};
104template <> struct avx512<std::uint32_t>
105{
106 using type = avx512Int16<std::uint32_t>;
107};
108// specialized index mapping
109template <> struct avx512<std::int64_t, 8>
110{
111 using type = avx512Long8<std::int64_t>;
112};
113template <> struct avx512<std::uint64_t, 8>
114{
115 using type = avx512Long8<std::uint64_t>;
116};
117#if defined(__APPLE__)
118template <> struct avx512<std::size_t, 8>
119{
120 using type = avx512Long8<std::size_t>;
121};
122#endif
123template <> struct avx512<std::int32_t, 8>
124{
125 using type = avx2Int8<std::int32_t>;
126};
127template <> struct avx512<std::uint32_t, 8>
128{
129 using type = avx2Int8<std::uint32_t>;
130};
131template <> struct avx512<std::int32_t, 16>
132{
133 using type = avx512Int16<std::int32_t>;
134};
135template <> struct avx512<std::uint32_t, 16>
136{
137 using type = avx512Int16<std::uint32_t>;
138};
139// bool mapping
140template <> struct avx512<bool, 8>
141{
142 using type = avx512Mask8;
143};
144template <> struct avx512<bool, 16>
145{
146 using type = avx512Mask16;
147};
148
149} // namespace abi
150
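// Example (illustrative only, not part of this header): with AVX512 enabled,
// the abi trait above maps a scalar type, and optionally a vector width, to
// the concrete wrapper types defined below, e.g.
//
//   using vec_t = tinysimd::abi::avx512<double>::type;            // avx512Double8
//   using idx_t = tinysimd::abi::avx512<std::uint32_t, 8>::type;  // avx2Int8<std::uint32_t>
//   static_assert(vec_t::width == 8 && vec_t::alignment == 64);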
151// concrete types
152
153// could add enable if to allow only unsigned long and long...
154template <typename T> struct avx512Int16
155{
156 static_assert(std::is_integral_v<T> && sizeof(T) == 4,
157 "4 bytes Integral required.");
158
159 static constexpr unsigned int width = 16;
160 static constexpr unsigned int alignment = 64;
161
162 using scalarType = T;
163 using vectorType = __m512i;
164 using scalarArray = scalarType[width];
165
166 // storage
167 vectorType _data;
168
169 // ctors
170 inline avx512Int16() = default;
171 inline avx512Int16(const avx512Int16 &rhs) = default;
172 inline avx512Int16(const vectorType &rhs) : _data(rhs)
173 {
174 }
175 inline avx512Int16(const scalarType rhs)
176 {
177 _data = _mm512_set1_epi32(rhs);
178 }
179 explicit inline avx512Int16(scalarArray &rhs)
180 {
181 _data = _mm512_load_epi32(rhs);
182 }
183
184 // copy assignment
185 inline avx512Int16 &operator=(const avx512Int16 &) = default;
186
187 // store
188 inline void store(scalarType *p) const
189 {
190 _mm512_store_epi32(p, _data);
191 }
192
193 template <class flag,
194 typename std::enable_if<is_requiring_alignment_v<flag> &&
195 !is_streaming_v<flag>,
196 bool>::type = 0>
197 inline void store(scalarType *p, flag) const
198 {
199 _mm512_store_epi32(p, _data);
200 }
201
202 template <class flag, typename std::enable_if<
203 !is_requiring_alignment_v<flag>, bool>::type = 0>
204 inline void store(scalarType *p, flag) const
205 {
206 _mm512_storeu_epi32(p, _data);
207 }
208
209 inline void load(const scalarType *p)
210 {
211 _data = _mm512_load_epi32(p);
212 }
213
214 template <class flag,
215 typename std::enable_if<is_requiring_alignment_v<flag> &&
216 !is_streaming_v<flag>,
217 bool>::type = 0>
218 inline void load(const scalarType *p, flag)
219 {
220 _data = _mm512_load_epi32(p);
221 }
222
223 template <class flag, typename std::enable_if<
224 !is_requiring_alignment_v<flag>, bool>::type = 0>
225 inline void load(const scalarType *p, flag)
226 {
227 // even though the intel intrinsic manual lists
228 // __m512i _mm512_loadu_epi32 (void const* mem_addr)
229 // it is not implemented in some compilers (gcc)
230 // since this is a bitwise load with no extension
231 // the following instruction is equivalent
232 _data = _mm512_loadu_si512(p);
233 }
234
235 inline void broadcast(const scalarType rhs)
236 {
237 _data = _mm512_set1_epi32(rhs);
238 }
239
240 // subscript
241 // subscript operators are convenient but expensive
242 // should not be used in optimized kernels
243 inline scalarType operator[](size_t i) const
244 {
245 alignas(alignment) scalarArray tmp;
246 store(tmp, is_aligned);
247 return tmp[i];
248 }
249
250 inline scalarType &operator[](size_t i)
251 {
252 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
253 return tmp[i];
254 }
255};
256
257template <typename T>
258inline avx512Int16<T> operator+(avx512Int16<T> lhs, avx512Int16<T> rhs)
259{
260 return _mm512_add_epi32(lhs._data, rhs._data);
261}
262
263template <typename T, typename U,
264 typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
265inline avx512Int16<T> operator+(avx512Int16<T> lhs, U rhs)
266{
267 return _mm512_add_epi32(lhs._data, _mm512_set1_epi32(rhs));
268}
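// Example (illustrative sketch, assuming an AVX512 build): basic use of the
// 16-lane 32-bit integer wrapper defined above.
//
//   avx512Int16<std::uint32_t> a(1u);       // broadcast 1 to all 16 lanes
//   avx512Int16<std::uint32_t> b = a + 3u;  // lane-wise addition of a scalar
//   alignas(avx512Int16<std::uint32_t>::alignment) std::uint32_t out[16];
//   b.store(out);                           // aligned store; every out[i] == 4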
269
270////////////////////////////////////////////////////////////////////////////////
271
272template <typename T> struct avx512Long8
273{
274 static_assert(std::is_integral_v<T> && sizeof(T) == 8,
275 "8 bytes Integral required.");
276
277 static constexpr unsigned int width = 8;
278 static constexpr unsigned int alignment = 64;
279
280 using scalarType = T;
281 using vectorType = __m512i;
282 using scalarArray = scalarType[width];
283
284 // storage
285 vectorType _data;
286
287 // ctors
288 inline avx512Long8() = default;
289 inline avx512Long8(const avx512Long8 &rhs) = default;
290 inline avx512Long8(const vectorType &rhs) : _data(rhs)
291 {
292 }
293 inline avx512Long8(const scalarType rhs)
294 {
295 _data = _mm512_set1_epi64(rhs);
296 }
297 explicit inline avx512Long8(scalarArray &rhs)
298 {
299 _data = _mm512_load_epi64(rhs);
300 }
301
302 // copy assignment
303 inline avx512Long8 &operator=(const avx512Long8 &) = default;
304
305 // store
306 inline void store(scalarType *p) const
307 {
308 _mm512_store_epi64(p, _data);
309 }
310
311 template <class flag,
312 typename std::enable_if<is_requiring_alignment_v<flag> &&
313 !is_streaming_v<flag>,
314 bool>::type = 0>
315 inline void store(scalarType *p, flag) const
316 {
317 _mm512_store_epi64(p, _data);
318 }
319
320 template <class flag, typename std::enable_if<
321 !is_requiring_alignment_v<flag>, bool>::type = 0>
322 inline void store(scalarType *p, flag) const
323 {
324 _mm512_storeu_epi64(p, _data);
325 }
326
327 inline void load(const scalarType *p)
328 {
329 _data = _mm512_load_epi64(p);
330 }
331
332 template <class flag,
333 typename std::enable_if<is_requiring_alignment_v<flag> &&
334 !is_streaming_v<flag>,
335 bool>::type = 0>
336 inline void load(const scalarType *p, flag)
337 {
338 _data = _mm512_load_epi64(p);
339 }
340
341 template <class flag, typename std::enable_if<
342 !is_requiring_alignment_v<flag>, bool>::type = 0>
343 inline void load(const scalarType *p, flag)
344 {
345 // even though the intel intrinsic manual lists
346 // __m512i _mm512_loadu_epi64 (void const* mem_addr)
347 // it is not implemented in some compilers (gcc)
348 // since this is a bitwise load with no extension
349 // the following instruction is equivalent
350 _data = _mm512_loadu_si512(p);
351 }
352
353 inline void broadcast(const scalarType rhs)
354 {
355 _data = _mm512_set1_epi64(rhs);
356 }
357
358 // subscript
359 // subscript operators are convenient but expensive
360 // should not be used in optimized kernels
361 inline scalarType operator[](size_t i) const
362 {
363 alignas(alignment) scalarArray tmp;
364 store(tmp, is_aligned);
365 return tmp[i];
366 }
367
368 inline scalarType &operator[](size_t i)
369 {
370 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
371 return tmp[i];
372 }
373};
374
375template <typename T>
376inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
377{
378 return _mm512_add_epi64(lhs._data, rhs._data);
379}
380
381template <typename T, typename U,
382 typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
383inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
384{
385 return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
386}
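// Example (illustrative sketch): avx512Long8 is mainly used as a 64-bit index
// vector for the gather/scatter members of avx512Double8 below.
//
//   alignas(avx512Long8<std::uint64_t>::alignment)
//       std::uint64_t idx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   avx512Long8<std::uint64_t> index(idx);  // aligned load of 8 indices
//   index = index + 8;                      // advance every lane by 8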
387
388////////////////////////////////////////////////////////////////////////////////
389
390struct avx512Double8
391{
392 static constexpr unsigned int width = 8;
393 static constexpr unsigned int alignment = 64;
394
395 using scalarType = double;
396 using scalarIndexType = std::uint64_t;
397 using vectorType = __m512d;
398 using scalarArray = scalarType[width];
399
400 // storage
401 vectorType _data;
402
403 // ctors
404 inline avx512Double8() = default;
405 inline avx512Double8(const avx512Double8 &rhs) = default;
406 inline avx512Double8(const vectorType &rhs) : _data(rhs)
407 {
408 }
409 inline avx512Double8(const scalarType rhs)
410 {
411 _data = _mm512_set1_pd(rhs);
412 }
413
414 // copy assignment
415 inline avx512Double8 &operator=(const avx512Double8 &) = default;
416
417 // store
418 inline void store(scalarType *p) const
419 {
420 _mm512_store_pd(p, _data);
421 }
422
423 template <class flag,
424 typename std::enable_if<is_requiring_alignment_v<flag> &&
425 !is_streaming_v<flag>,
426 bool>::type = 0>
427 inline void store(scalarType *p, flag) const
428 {
429 _mm512_store_pd(p, _data);
430 }
431
432 template <class flag, typename std::enable_if<
433 !is_requiring_alignment_v<flag>, bool>::type = 0>
434 inline void store(scalarType *p, flag) const
435 {
436 _mm512_storeu_pd(p, _data);
437 }
438
439 template <class flag,
440 typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
441 inline void store(scalarType *p, flag) const
442 {
443 _mm512_stream_pd(p, _data);
444 }
445
446 // load packed
447 inline void load(const scalarType *p)
448 {
449 _data = _mm512_load_pd(p);
450 }
451
452 template <class flag, typename std::enable_if<
453 is_requiring_alignment_v<flag>, bool>::type = 0>
454 inline void load(const scalarType *p, flag)
455 {
456 _data = _mm512_load_pd(p);
457 }
458
459 template <class flag, typename std::enable_if<
460 !is_requiring_alignment_v<flag>, bool>::type = 0>
461 inline void load(const scalarType *p, flag)
462 {
463 _data = _mm512_loadu_pd(p);
464 }
465
466 // broadcast
467 inline void broadcast(const scalarType rhs)
468 {
469 _data = _mm512_set1_pd(rhs);
470 }
471
472 // gather/scatter
473 template <typename T>
474 inline void gather(scalarType const *p, const avx2Int8<T> &indices)
475 {
476 _data = _mm512_i32gather_pd(indices._data, p, 8);
477 }
478
479 template <typename T>
480 inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
481 {
482 _mm512_i32scatter_pd(out, indices._data, _data, 8);
483 }
484
485 template <typename T>
486 inline void gather(scalarType const *p, const avx512Long8<T> &indices)
487 {
488 _data = _mm512_i64gather_pd(indices._data, p, 8);
489 }
490
491 template <typename T>
492 inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
493 {
494 _mm512_i64scatter_pd(out, indices._data, _data, 8);
495 }
496
497 // fma
498 // this = this + a * b
499 inline void fma(const avx512Double8 &a, const avx512Double8 &b)
500 {
501 _data = _mm512_fmadd_pd(a._data, b._data, _data);
502 }
503
504 // subscript
505 // subscript operators are convenient but expensive
506 // should not be used in optimized kernels
507 inline scalarType operator[](size_t i) const
508 {
509 alignas(alignment) scalarArray tmp;
510 store(tmp, is_aligned);
511 return tmp[i];
512 }
513
514 inline scalarType &operator[](size_t i)
515 {
516 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
517 return tmp[i];
518 }
519
520 // unary ops
521 inline void operator+=(avx512Double8 rhs)
522 {
523 _data = _mm512_add_pd(_data, rhs._data);
524 }
525
526 inline void operator-=(avx512Double8 rhs)
527 {
528 _data = _mm512_sub_pd(_data, rhs._data);
529 }
530
531 inline void operator*=(avx512Double8 rhs)
532 {
533 _data = _mm512_mul_pd(_data, rhs._data);
534 }
535
536 inline void operator/=(avx512Double8 rhs)
537 {
538 _data = _mm512_div_pd(_data, rhs._data);
539 }
540};
541
542inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
543{
544 return _mm512_add_pd(lhs._data, rhs._data);
545}
546
547inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
548{
549 return _mm512_sub_pd(lhs._data, rhs._data);
550}
551
552inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
553{
554 return _mm512_mul_pd(lhs._data, rhs._data);
555}
556
557inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
558{
559 return _mm512_div_pd(lhs._data, rhs._data);
560}
561
562inline avx512Double8 sqrt(avx512Double8 in)
563{
564 return _mm512_sqrt_pd(in._data);
565}
566
567inline avx512Double8 abs(avx512Double8 in)
568{
569 return _mm512_abs_pd(in._data);
570}
571
572inline avx512Double8 log(avx512Double8 in)
573{
574#if defined(TINYSIMD_HAS_SVML)
575 return _mm512_log_pd(in._data);
576#else
577 // there is no AVX512 log intrinsic outside of SVML
578 // this scalar fallback is a dreadful, stop-gap implementation
579 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
580 in.store(tmp);
581 tmp[0] = std::log(tmp[0]);
582 tmp[1] = std::log(tmp[1]);
583 tmp[2] = std::log(tmp[2]);
584 tmp[3] = std::log(tmp[3]);
585 tmp[4] = std::log(tmp[4]);
586 tmp[5] = std::log(tmp[5]);
587 tmp[6] = std::log(tmp[6]);
588 tmp[7] = std::log(tmp[7]);
589 avx512Double8 ret;
590 ret.load(tmp);
591 return ret;
592#endif
593}
594
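// Example (illustrative sketch, not part of this header): typical arithmetic
// with the 8-wide double vector and the free functions defined above.
//
//   alignas(avx512Double8::alignment) double a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   avx512Double8 x, y(2.0), z(0.5);
//   x.load(a);            // aligned packed load
//   x.fma(y, z);          // x = x + y * z
//   x = sqrt(x + y) / z;  // lane-wise sqrt, add and divide
//   x.store(a);           // aligned packed store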
595inline void load_unalign_interleave(
596 const double *in, const std::uint32_t dataLen,
597 std::vector<avx512Double8, allocator<avx512Double8>> &out)
598{
599 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
600 for (size_t i = 0; i < dataLen; ++i)
601 {
602 tmp[0] = in[i];
603 tmp[1] = in[i + dataLen];
604 tmp[2] = in[i + 2 * dataLen];
605 tmp[3] = in[i + 3 * dataLen];
606 tmp[4] = in[i + 4 * dataLen];
607 tmp[5] = in[i + 5 * dataLen];
608 tmp[6] = in[i + 6 * dataLen];
609 tmp[7] = in[i + 7 * dataLen];
610 out[i].load(tmp);
611 }
612}
613
614inline void load_interleave(
615 const double *in, std::uint32_t dataLen,
616 std::vector<avx512Double8, allocator<avx512Double8>> &out)
617{
618
619 alignas(avx512Double8::alignment)
620 avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
621 0, dataLen, 2 * dataLen, 3 * dataLen,
622 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
623
624 using index_t = avx512Long8<avx512Double8::scalarIndexType>;
625 index_t index0(tmp);
626 index_t index1 = index0 + 1;
627 index_t index2 = index0 + 2;
628 index_t index3 = index0 + 3;
629
630 // 4x unrolled loop
631 constexpr uint16_t unrl = 4;
632 size_t nBlocks = dataLen / unrl;
633 for (size_t i = 0; i < nBlocks; ++i)
634 {
635 out[unrl * i + 0].gather(in, index0);
636 out[unrl * i + 1].gather(in, index1);
637 out[unrl * i + 2].gather(in, index2);
638 out[unrl * i + 3].gather(in, index3);
639 index0 = index0 + unrl;
640 index1 = index1 + unrl;
641 index2 = index2 + unrl;
642 index3 = index3 + unrl;
643 }
644
645 // spillover loop
646 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
647 {
648 out[i].gather(in, index0);
649 index0 = index0 + 1;
650 }
651}
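// Worked example of the layout handled by load_interleave (illustrative):
// the input holds one contiguous run of dataLen doubles per vector lane, so
// for dataLen = 3 and lanes a..h
//
//   in:  a0 a1 a2 | b0 b1 b2 | c0 c1 c2 | ... | h0 h1 h2
//
// the gathers produce out[0] = {a0,b0,...,h0}, out[1] = {a1,b1,...,h1} and
// out[2] = {a2,b2,...,h2}; lane j of out[i] reads in[j * dataLen + i].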
652
653inline void deinterleave_unalign_store(
654 const std::vector<avx512Double8, allocator<avx512Double8>> &in,
655 const std::uint32_t dataLen, double *out)
656{
657 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
658 for (size_t i = 0; i < dataLen; ++i)
659 {
660 in[i].store(tmp);
661 out[i] = tmp[0];
662 out[i + dataLen] = tmp[1];
663 out[i + 2 * dataLen] = tmp[2];
664 out[i + 3 * dataLen] = tmp[3];
665 out[i + 4 * dataLen] = tmp[4];
666 out[i + 5 * dataLen] = tmp[5];
667 out[i + 6 * dataLen] = tmp[6];
668 out[i + 7 * dataLen] = tmp[7];
669 }
670}
671
672inline void deinterleave_store(
673 const std::vector<avx512Double8, allocator<avx512Double8>> &in,
674 std::uint32_t dataLen, double *out)
675{
676 // size_t nBlocks = dataLen / 4;
677
678 alignas(avx512Double8::alignment)
679 avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
680 0, dataLen, 2 * dataLen, 3 * dataLen,
681 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
682 using index_t = avx512Long8<avx512Double8::scalarIndexType>;
683 index_t index0(tmp);
684 for (size_t i = 0; i < dataLen; ++i)
685 {
686 in[i].scatter(out, index0);
687 index0 = index0 + 1;
688 }
689}
690
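// Example round trip (illustrative sketch): interleave width * dataLen
// contiguous doubles into SIMD vectors, operate on them lane-wise, then
// scatter the results back.
//
//   constexpr std::uint32_t dataLen = 32;
//   std::vector<double> in(avx512Double8::width * dataLen, 1.0), res(in.size());
//   std::vector<avx512Double8, allocator<avx512Double8>> work(dataLen);
//   load_interleave(in.data(), dataLen, work);
//   for (auto &v : work) v = v * avx512Double8(2.0);
//   deinterleave_store(work, dataLen, res.data());  // every res[k] == 2.0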
691////////////////////////////////////////////////////////////////////////////////
692
693struct avx512Float16
694{
695 static constexpr unsigned int width = 16;
696 static constexpr unsigned int alignment = 64;
697
698 using scalarType = float;
699 using scalarIndexType = std::uint32_t;
700 using vectorType = __m512;
701 using scalarArray = scalarType[width];
702
703 // storage
704 vectorType _data;
705
706 // ctors
707 inline avx512Float16() = default;
708 inline avx512Float16(const avx512Float16 &rhs) = default;
709 inline avx512Float16(const vectorType &rhs) : _data(rhs)
710 {
711 }
712 inline avx512Float16(const scalarType rhs)
713 {
714 _data = _mm512_set1_ps(rhs);
715 }
716
717 // copy assignment
718 inline avx512Float16 &operator=(const avx512Float16 &) = default;
719
720 // store
721 inline void store(scalarType *p) const
722 {
723 _mm512_store_ps(p, _data);
724 }
725
726 template <class flag,
727 typename std::enable_if<is_requiring_alignment_v<flag> &&
728 !is_streaming_v<flag>,
729 bool>::type = 0>
730 inline void store(scalarType *p, flag) const
731 {
732 _mm512_store_ps(p, _data);
733 }
734
735 template <class flag, typename std::enable_if<
736 !is_requiring_alignment_v<flag>, bool>::type = 0>
737 inline void store(scalarType *p, flag) const
738 {
739 _mm512_storeu_ps(p, _data);
740 }
741
742 template <class flag,
743 typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
744 inline void store(scalarType *p, flag) const
745 {
746 _mm512_stream_ps(p, _data);
747 }
748
749 // load packed
750 inline void load(const scalarType *p)
751 {
752 _data = _mm512_load_ps(p);
753 }
754
755 template <class flag, typename std::enable_if<
756 is_requiring_alignment_v<flag>, bool>::type = 0>
757 inline void load(const scalarType *p, flag)
758 {
759 _data = _mm512_load_ps(p);
760 }
761
762 template <class flag, typename std::enable_if<
763 !is_requiring_alignment_v<flag>, bool>::type = 0>
764 inline void load(const scalarType *p, flag)
765 {
766 _data = _mm512_loadu_ps(p);
767 }
768
769 // broadcast
770 inline void broadcast(const scalarType rhs)
771 {
772 _data = _mm512_set1_ps(rhs);
773 }
774
775 // gather/scatter
776 template <typename T>
777 inline void gather(scalarType const *p, const avx512Int16<T> &indices)
778 {
779 _data = _mm512_i32gather_ps(indices._data, p, sizeof(scalarType));
780 }
781
782 template <typename T>
783 inline void scatter(scalarType *out, const avx512Int16<T> &indices) const
784 {
785 _mm512_i32scatter_ps(out, indices._data, _data, sizeof(scalarType));
786 }
787
788 // fma
789 // this = this + a * b
790 inline void fma(const avx512Float16 &a, const avx512Float16 &b)
791 {
792 _data = _mm512_fmadd_ps(a._data, b._data, _data);
793 }
794
795 // subscript
796 // subscript operators are convenient but expensive
797 // should not be used in optimized kernels
798 inline scalarType operator[](size_t i) const
799 {
800 alignas(alignment) scalarArray tmp;
801 store(tmp, is_aligned);
802 return tmp[i];
803 }
804
805 inline scalarType &operator[](size_t i)
806 {
807 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
808 return tmp[i];
809 }
810
811 inline void operator+=(avx512Float16 rhs)
812 {
813 _data = _mm512_add_ps(_data, rhs._data);
814 }
815
816 inline void operator-=(avx512Float16 rhs)
817 {
818 _data = _mm512_sub_ps(_data, rhs._data);
819 }
820
821 inline void operator*=(avx512Float16 rhs)
822 {
823 _data = _mm512_mul_ps(_data, rhs._data);
824 }
825
826 inline void operator/=(avx512Float16 rhs)
827 {
828 _data = _mm512_div_ps(_data, rhs._data);
829 }
830};
831
832inline avx512Float16 operator+(avx512Float16 lhs, avx512Float16 rhs)
833{
834 return _mm512_add_ps(lhs._data, rhs._data);
835}
836
837inline avx512Float16 operator-(avx512Float16 lhs, avx512Float16 rhs)
838{
839 return _mm512_sub_ps(lhs._data, rhs._data);
840}
841
842inline avx512Float16 operator*(avx512Float16 lhs, avx512Float16 rhs)
843{
844 return _mm512_mul_ps(lhs._data, rhs._data);
845}
846
847inline avx512Float16 operator/(avx512Float16 lhs, avx512Float16 rhs)
848{
849 return _mm512_div_ps(lhs._data, rhs._data);
850}
851
852inline avx512Float16 sqrt(avx512Float16 in)
853{
854 return _mm512_sqrt_ps(in._data);
855}
856
857inline avx512Float16 abs(avx512Float16 in)
858{
859 return _mm512_abs_ps(in._data);
860}
861
862inline avx512Float16 log(avx512Float16 in)
863{
864#if defined(TINYSIMD_HAS_SVML)
865 return _mm512_log_ps(in._data);
866#else
867 // there is no AVX512 log intrinsic outside of SVML
868 // this scalar fallback is a dreadful, stop-gap implementation
869 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
870 in.store(tmp);
871 tmp[0] = std::log(tmp[0]);
872 tmp[1] = std::log(tmp[1]);
873 tmp[2] = std::log(tmp[2]);
874 tmp[3] = std::log(tmp[3]);
875 tmp[4] = std::log(tmp[4]);
876 tmp[5] = std::log(tmp[5]);
877 tmp[6] = std::log(tmp[6]);
878 tmp[7] = std::log(tmp[7]);
879 tmp[8] = std::log(tmp[8]);
880 tmp[9] = std::log(tmp[9]);
881 tmp[10] = std::log(tmp[10]);
882 tmp[11] = std::log(tmp[11]);
883 tmp[12] = std::log(tmp[12]);
884 tmp[13] = std::log(tmp[13]);
885 tmp[14] = std::log(tmp[14]);
886 tmp[15] = std::log(tmp[15]);
887 avx512Float16 ret;
888 ret.load(tmp);
889 return ret;
890#endif
891}
892
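// Example (illustrative sketch; src and dst are placeholder float pointers):
// avx512Float16 mirrors avx512Double8 with 16 float lanes, but its
// gather/scatter members take 32-bit avx512Int16 indices.
//
//   alignas(avx512Float16::alignment)
//       std::uint32_t idx[16] = {0, 1, 2,  3,  4,  5,  6,  7,
//                                8, 9, 10, 11, 12, 13, 14, 15};
//   avx512Int16<std::uint32_t> index(idx);
//   avx512Float16 v;
//   v.gather(src, index);   // lane j reads src[idx[j]]
//   v.scatter(dst, index);  // lane j writes dst[idx[j]]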
893inline void load_unalign_interleave(
894 const double *in, const std::uint32_t dataLen,
895 std::vector<avx512Float16, allocator<avx512Float16>> &out)
896{
897 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
898 for (size_t i = 0; i < dataLen; ++i)
899 {
900 tmp[0] = in[i];
901 tmp[1] = in[i + dataLen];
902 tmp[2] = in[i + 2 * dataLen];
903 tmp[3] = in[i + 3 * dataLen];
904 tmp[4] = in[i + 4 * dataLen];
905 tmp[5] = in[i + 5 * dataLen];
906 tmp[6] = in[i + 6 * dataLen];
907 tmp[7] = in[i + 7 * dataLen];
908 tmp[8] = in[i + 8 * dataLen];
909 tmp[9] = in[i + 9 * dataLen];
910 tmp[10] = in[i + 10 * dataLen];
911 tmp[11] = in[i + 11 * dataLen];
912 tmp[12] = in[i + 12 * dataLen];
913 tmp[13] = in[i + 13 * dataLen];
914 tmp[14] = in[i + 14 * dataLen];
915 tmp[15] = in[i + 15 * dataLen];
916 out[i].load(tmp);
917 }
918}
919
920inline void load_interleave(
921 const float *in, std::uint32_t dataLen,
922 std::vector<avx512Float16, allocator<avx512Float16>> &out)
923{
924
925 alignas(avx512Float16::alignment)
926 avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
927 0,
928 dataLen,
929 2 * dataLen,
930 3 * dataLen,
931 4 * dataLen,
932 5 * dataLen,
933 6 * dataLen,
934 7 * dataLen,
935 8 * dataLen,
936 9 * dataLen,
937 10 * dataLen,
938 11 * dataLen,
939 12 * dataLen,
940 13 * dataLen,
941 14 * dataLen,
942 15 * dataLen};
943
944 using index_t = avx512Int16<avx512Float16::scalarIndexType>;
945 index_t index0(tmp);
946 index_t index1 = index0 + 1;
947 index_t index2 = index0 + 2;
948 index_t index3 = index0 + 3;
949
950 // 4x unrolled loop
951 constexpr uint16_t unrl = 4;
952 size_t nBlocks = dataLen / unrl;
953 for (size_t i = 0; i < nBlocks; ++i)
954 {
955 out[unrl * i + 0].gather(in, index0);
956 out[unrl * i + 1].gather(in, index1);
957 out[unrl * i + 2].gather(in, index2);
958 out[unrl * i + 3].gather(in, index3);
959 index0 = index0 + unrl;
960 index1 = index1 + unrl;
961 index2 = index2 + unrl;
962 index3 = index3 + unrl;
963 }
964
965 // spillover loop
966 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
967 {
968 out[i].gather(in, index0);
969 index0 = index0 + 1;
970 }
971}
972
973inline void deinterleave_unalign_store(
974 const std::vector<avx512Float16, allocator<avx512Float16>> &in,
975 const std::uint32_t dataLen, double *out)
976{
977 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
978 for (size_t i = 0; i < dataLen; ++i)
979 {
980 in[i].store(tmp);
981 out[i] = tmp[0];
982 out[i + dataLen] = tmp[1];
983 out[i + 2 * dataLen] = tmp[2];
984 out[i + 3 * dataLen] = tmp[3];
985 out[i + 4 * dataLen] = tmp[4];
986 out[i + 5 * dataLen] = tmp[5];
987 out[i + 6 * dataLen] = tmp[6];
988 out[i + 7 * dataLen] = tmp[7];
989 out[i + 8 * dataLen] = tmp[8];
990 out[i + 9 * dataLen] = tmp[9];
991 out[i + 10 * dataLen] = tmp[10];
992 out[i + 11 * dataLen] = tmp[11];
993 out[i + 12 * dataLen] = tmp[12];
994 out[i + 13 * dataLen] = tmp[13];
995 out[i + 14 * dataLen] = tmp[14];
996 out[i + 15 * dataLen] = tmp[15];
997 }
998}
999
1000inline void deinterleave_store(
1001 const std::vector<avx512Float16, allocator<avx512Float16>> &in,
1002 std::uint32_t dataLen, float *out)
1003{
1004 // size_t nBlocks = dataLen / 4;
1005
1006 alignas(avx512Float16::alignment)
1007 avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
1008 0,
1009 dataLen,
1010 2 * dataLen,
1011 3 * dataLen,
1012 4 * dataLen,
1013 5 * dataLen,
1014 6 * dataLen,
1015 7 * dataLen,
1016 8 * dataLen,
1017 9 * dataLen,
1018 10 * dataLen,
1019 11 * dataLen,
1020 12 * dataLen,
1021 13 * dataLen,
1022 14 * dataLen,
1023 15 * dataLen};
1024 using index_t = avx512Int16<avx512Float16::scalarIndexType>;
1025
1026 index_t index0(tmp);
1027 for (size_t i = 0; i < dataLen; ++i)
1028 {
1029 in[i].scatter(out, index0);
1030 index0 = index0 + 1;
1031 }
1032}
1033
1034////////////////////////////////////////////////////////////////////////////////
1035
1036// mask type
1037// mask is an int type with special properties (broad boolean vector)
1038// the defined and allowed broad boolean values are:
1039// false=0x0 and true=0xFFFFFFFF
1040//
1041// VERY LIMITED SUPPORT...just enough to make cubic eos work...
1042//
1043struct avx512Mask8 : avx512Long8<std::uint64_t>
1044{
1045 // bring in ctors
1046 using avx512Long8::avx512Long8;
1047
1048 static constexpr scalarType true_v = -1;
1049 static constexpr scalarType false_v = 0;
1050};
1051
1052inline avx512Mask8 operator>(avx512Double8 lhs, avx512Double8 rhs)
1053{
1054 __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
1055 return _mm512_maskz_set1_epi64(mask, avx512Mask8::true_v);
1056}
1057
1058inline bool operator&&(avx512Mask8 lhs, bool rhs)
1059{
1060 __m512i val_true = _mm512_set1_epi64(avx512Mask8::true_v);
1061 __mmask8 mask = _mm512_test_epi64_mask(lhs._data, val_true);
1062 unsigned int tmp = _cvtmask16_u32(mask);
1063 return tmp && rhs;
1064}
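// Example (illustrative sketch): just enough mask support to branch on a
// lane-wise comparison, as the cubic EoS kernels do.
//
//   avx512Double8 x(1.0), y(2.0);
//   avx512Mask8 m = y > x;  // every lane set to true_v
//   if (m && true)          // true if any lane of the mask is set
//   {
//       // ... vector branch
//   }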
1065
1066struct avx512Mask16 : avx512Int16<std::uint32_t>
1067{
1068 // bring in ctors
1069 using avx512Int16::avx512Int16;
1070
1071 static constexpr scalarType true_v = -1;
1072 static constexpr scalarType false_v = 0;
1073};
1074
1075inline avx512Mask16 operator>(avx512Float16 lhs, avx512Float16 rhs)
1076{
1077 __mmask16 mask = _mm512_cmp_ps_mask(lhs._data, rhs._data, _CMP_GT_OQ);
1078 return _mm512_maskz_set1_epi32(mask, avx512Mask16::true_v);
1079}
1080
1081inline bool operator&&(avx512Mask16 lhs, bool rhs)
1082{
1083 __m512i val_true = _mm512_set1_epi32(avx512Mask16::true_v);
1084 __mmask16 mask = _mm512_test_epi32_mask(lhs._data, val_true);
1085 unsigned int tmp = _cvtmask16_u32(mask);
1086 return tmp && rhs;
1087}
1088
1089} // namespace tinysimd
1090
1091#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
1092
1093#endif