avx512.hpp
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: avx512.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Vector type using avx512 extension.
32//
33///////////////////////////////////////////////////////////////////////////////
34
35#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
36#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
37
38#if defined(__x86_64__)
39#include <immintrin.h>
40#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41#define TINYSIMD_HAS_SVML
42#endif
43#endif
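// note: SVML is Intel's Short Vector Math Library; when it is available the
// TINYSIMD_HAS_SVML guard above lets the code below use vectorised
// transcendental intrinsics such as _mm512_log_pd instead of a scalar fallback.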
44#include "allocator.hpp"
45#include "avx2.hpp"
46#include "traits.hpp"
47#include <vector>
48
49namespace tinysimd::abi
50{
51
52template <typename scalarType, int width = 0> struct avx512
53{
54 using type = void;
55};
56
57} // namespace tinysimd::abi
58
59#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
60
61namespace tinysimd
62{
63
64// forward declaration of concrete types
65template <typename T> struct avx512Long8;
66template <typename T> struct avx512Int16;
67struct avx512Double8;
68struct avx512Float16;
69struct avx512Mask8;
70struct avx512Mask16;
71
72namespace abi
73{
74
75// mapping between abstract types and concrete floating point types
76template <> struct avx512<double>
77{
78 using type = avx512Double8;
79};
80template <> struct avx512<float>
81{
82 using type = avx512Float16;
83};
84// generic index mapping
85// assumes index type width same as floating point type
86template <> struct avx512<std::int64_t>
87{
88 using type = avx512Long8<std::int64_t>;
89};
90template <> struct avx512<std::uint64_t>
91{
92 using type = avx512Long8<std::uint64_t>;
93};
94#if defined(__APPLE__)
95template <> struct avx512<std::size_t>
96{
97 using type = avx512Long8<std::size_t>;
98};
99#endif
100template <> struct avx512<std::int32_t>
101{
102 using type = avx512Int16<std::int32_t>;
103};
104template <> struct avx512<std::uint32_t>
105{
106 using type = avx512Int16<std::uint32_t>;
107};
108// specialized index mapping
109template <> struct avx512<std::int64_t, 8>
110{
111 using type = avx512Long8<std::int64_t>;
112};
113template <> struct avx512<std::uint64_t, 8>
114{
115 using type = avx512Long8<std::uint64_t>;
116};
117#if defined(__APPLE__)
118template <> struct avx512<std::size_t, 8>
119{
120 using type = avx512Long8<std::size_t>;
121};
122#endif
123template <> struct avx512<std::int32_t, 8>
124{
125 using type = avx2Int8<std::int32_t>;
126};
127template <> struct avx512<std::uint32_t, 8>
128{
129 using type = avx2Int8<std::uint32_t>;
130};
131template <> struct avx512<std::int32_t, 16>
132{
133 using type = avx512Int16<std::int32_t>;
134};
135template <> struct avx512<std::uint32_t, 16>
136{
137 using type = avx512Int16<std::uint32_t>;
138};
139// bool mapping
140template <> struct avx512<bool, 8>
141{
142 using type = avx512Mask8;
143};
144template <> struct avx512<bool, 16>
145{
146 using type = avx512Mask16;
147};
148
149} // namespace abi
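// Illustrative only (not part of the original header): the abi traits above
// resolve an abstract scalar type, and optionally a vector width, to a
// concrete vector type at compile time, e.g.
//
//   static_assert(std::is_same_v<abi::avx512<double>::type, avx512Double8>);
//   static_assert(std::is_same_v<abi::avx512<std::uint32_t, 8>::type,
//                                avx2Int8<std::uint32_t>>);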
150
151// concrete types
152
153// could add enable_if to allow only unsigned int and int...
154template <typename T> struct avx512Int16
155{
156 static_assert(std::is_integral_v<T> && sizeof(T) == 4,
157 "4 bytes Integral required.");
158
159 static constexpr unsigned int width = 16;
160 static constexpr unsigned int alignment = 64;
161
162 using scalarType = T;
163 using vectorType = __m512i;
164 using scalarArray = scalarType[width];
165
166 // storage
167 vectorType _data;
168
169 // ctors
170 inline avx512Int16() = default;
171 inline avx512Int16(const avx512Int16 &rhs) = default;
172 inline avx512Int16(const vectorType &rhs) : _data(rhs)
173 {
174 }
175 inline avx512Int16(const scalarType rhs)
176 {
177 _data = _mm512_set1_epi32(rhs);
178 }
179 explicit inline avx512Int16(scalarArray &rhs)
180 {
181 _data = _mm512_load_epi32(rhs);
182 }
183
184 // copy assignment
185 inline avx512Int16 &operator=(const avx512Int16 &) = default;
186
187 // store
188 inline void store(scalarType *p) const
189 {
190 _mm512_store_epi32(p, _data);
191 }
192
193 template <class flag,
194 typename std::enable_if<is_requiring_alignment_v<flag> &&
195 !is_streaming_v<flag>,
196 bool>::type = 0>
197 inline void store(scalarType *p, flag) const
198 {
199 _mm512_store_epi32(p, _data);
200 }
201
202 template <class flag, typename std::enable_if<
203 !is_requiring_alignment_v<flag>, bool>::type = 0>
204 inline void store(scalarType *p, flag) const
205 {
206 _mm512_storeu_epi32(p, _data);
207 }
208
209 inline void load(const scalarType *p)
210 {
211 _data = _mm512_load_epi32(p);
212 }
213
214 template <class flag,
215 typename std::enable_if<is_requiring_alignment_v<flag> &&
216 !is_streaming_v<flag>,
217 bool>::type = 0>
218 inline void load(const scalarType *p, flag)
219 {
220 _data = _mm512_load_epi32(p);
221 }
222
223 template <class flag, typename std::enable_if<
224 !is_requiring_alignment_v<flag>, bool>::type = 0>
225 inline void load(const scalarType *p, flag)
226 {
227 // even though the intel intrinsic manual lists
228 // __m512i _mm512_loadu_epi32 (void const* mem_addr)
229 // it is not implemented in some compilers (gcc)
230 // since this is a bitwise load with no extension
231 // the following instruction is equivalent
232 _data = _mm512_loadu_si512(p);
233 }
234
235 inline void broadcast(const scalarType rhs)
236 {
237 _data = _mm512_set1_epi32(rhs);
238 }
239
240 // subscript
241 // subscript operators are convenient but expensive
242 // should not be used in optimized kernels
243 inline scalarType operator[](size_t i) const
244 {
245 alignas(alignment) scalarArray tmp;
246 store(tmp, is_aligned);
247 return tmp[i];
248 }
249
250 inline scalarType &operator[](size_t i)
251 {
252 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
253 return tmp[i];
254 }
255};
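// Illustrative only (not part of the original header): loads and stores are
// tag-dispatched on the alignment/streaming flags from traits.hpp (is_aligned
// is used below; an is_not_aligned tag is assumed here). A minimal sketch:
//
//   alignas(avx512Int16<int>::alignment) int buf[avx512Int16<int>::width];
//   avx512Int16<int> v(42);       // broadcast 42 into all 16 lanes
//   v.store(buf, is_aligned);     // aligned store
//   v.load(buf, is_not_aligned);  // unaligned (bitwise) load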
256
257template <typename T>
258inline avx512Int16<T> operator+(avx512Int16<T> lhs, avx512Int16<T> rhs)
259{
260 return _mm512_add_epi32(lhs._data, rhs._data);
261}
262
263template <typename T, typename U,
264 typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
265inline avx512Int16<T> operator+(avx512Int16<T> lhs, U rhs)
266{
267 return _mm512_add_epi32(lhs._data, _mm512_set1_epi32(rhs));
268}
269
270////////////////////////////////////////////////////////////////////////////////
271
272template <typename T> struct avx512Long8
273{
274 static_assert(std::is_integral_v<T> && sizeof(T) == 8,
275 "8 bytes Integral required.");
276
277 static constexpr unsigned int width = 8;
278 static constexpr unsigned int alignment = 64;
279
280 using scalarType = T;
281 using vectorType = __m512i;
282 using scalarArray = scalarType[width];
283
284 // storage
285 vectorType _data;
286
287 // ctors
288 inline avx512Long8() = default;
289 inline avx512Long8(const avx512Long8 &rhs) = default;
290 inline avx512Long8(const vectorType &rhs) : _data(rhs)
291 {
292 }
293 inline avx512Long8(const scalarType rhs)
294 {
295 _data = _mm512_set1_epi64(rhs);
296 }
297 explicit inline avx512Long8(scalarArray &rhs)
298 {
299 _data = _mm512_load_epi64(rhs);
300 }
301
302 // copy assignment
303 inline avx512Long8 &operator=(const avx512Long8 &) = default;
304
305 // store
306 inline void store(scalarType *p) const
307 {
308 _mm512_store_epi64(p, _data);
309 }
310
311 template <class flag,
312 typename std::enable_if<is_requiring_alignment_v<flag> &&
313 !is_streaming_v<flag>,
314 bool>::type = 0>
315 inline void store(scalarType *p, flag) const
316 {
317 _mm512_store_epi64(p, _data);
318 }
319
320 template <class flag, typename std::enable_if<
321 !is_requiring_alignment_v<flag>, bool>::type = 0>
322 inline void store(scalarType *p, flag) const
323 {
324 _mm512_storeu_epi64(p, _data);
325 }
326
327 inline void load(const scalarType *p)
328 {
329 _data = _mm512_load_epi64(p);
330 }
331
332 template <class flag,
333 typename std::enable_if<is_requiring_alignment_v<flag> &&
334 !is_streaming_v<flag>,
335 bool>::type = 0>
336 inline void load(const scalarType *p, flag)
337 {
338 _data = _mm512_load_epi64(p);
339 }
340
341 template <class flag, typename std::enable_if<
342 !is_requiring_alignment_v<flag>, bool>::type = 0>
343 inline void load(const scalarType *p, flag)
344 {
345 // even though the intel intrinsic manual lists
346 // __m512i _mm512_loadu_epi64 (void const* mem_addr)
347 // it is not implemented in some compilers (gcc)
348 // since this is a bitwise load with no extension
349 // the following instruction is equivalent
350 _data = _mm512_loadu_si512(p);
351 }
352
353 inline void broadcast(const scalarType rhs)
354 {
355 _data = _mm512_set1_epi64(rhs);
356 }
357
358 // subscript
359 // subscript operators are convenient but expensive
360 // should not be used in optimized kernels
361 inline scalarType operator[](size_t i) const
362 {
363 alignas(alignment) scalarArray tmp;
364 store(tmp, is_aligned);
365 return tmp[i];
366 }
367
368 inline scalarType &operator[](size_t i)
369 {
370 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
371 return tmp[i];
372 }
373};
374
375template <typename T>
376inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
377{
378 return _mm512_add_epi64(lhs._data, rhs._data);
379}
380
381template <typename T, typename U,
382 typename = typename std::enable_if<std::is_arithmetic_v<U>>::type>
383inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
384{
385 return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
386}
387
388////////////////////////////////////////////////////////////////////////////////
389
390struct avx512Double8
391{
392 static constexpr unsigned int width = 8;
393 static constexpr unsigned int alignment = 64;
394
395 using scalarType = double;
396 using scalarIndexType = std::uint64_t;
397 using vectorType = __m512d;
398 using scalarArray = scalarType[width];
399
400 // storage
401 vectorType _data;
402
403 // ctors
404 inline avx512Double8() = default;
405 inline avx512Double8(const avx512Double8 &rhs) = default;
406 inline avx512Double8(const vectorType &rhs) : _data(rhs)
407 {
408 }
409 inline avx512Double8(const scalarType rhs)
410 {
411 _data = _mm512_set1_pd(rhs);
412 }
413
414 // copy assignment
415 inline avx512Double8 &operator=(const avx512Double8 &) = default;
416
417 // store
418 inline void store(scalarType *p) const
419 {
420 _mm512_store_pd(p, _data);
421 }
422
423 template <class flag,
424 typename std::enable_if<is_requiring_alignment_v<flag> &&
425 !is_streaming_v<flag>,
426 bool>::type = 0>
427 inline void store(scalarType *p, flag) const
428 {
429 _mm512_store_pd(p, _data);
430 }
431
432 template <class flag, typename std::enable_if<
433 !is_requiring_alignment_v<flag>, bool>::type = 0>
434 inline void store(scalarType *p, flag) const
435 {
436 _mm512_storeu_pd(p, _data);
437 }
438
439 template <class flag,
440 typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
441 inline void store(scalarType *p, flag) const
442 {
443 _mm512_stream_pd(p, _data);
444 }
445
446 // load packed
447 inline void load(const scalarType *p)
448 {
449 _data = _mm512_load_pd(p);
450 }
451
452 template <class flag, typename std::enable_if<
453 is_requiring_alignment_v<flag>, bool>::type = 0>
454 inline void load(const scalarType *p, flag)
455 {
456 _data = _mm512_load_pd(p);
457 }
458
459 template <class flag, typename std::enable_if<
460 !is_requiring_alignment_v<flag>, bool>::type = 0>
461 inline void load(const scalarType *p, flag)
462 {
463 _data = _mm512_loadu_pd(p);
464 }
465
466 // broadcast
467 inline void broadcast(const scalarType rhs)
468 {
469 _data = _mm512_set1_pd(rhs);
470 }
471
472 // gather/scatter
473 template <typename T>
474 inline void gather(scalarType const *p, const avx2Int8<T> &indices)
475 {
476 _data = _mm512_i32gather_pd(indices._data, p, 8);
477 }
478
479 template <typename T>
480 inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
481 {
482 _mm512_i32scatter_pd(out, indices._data, _data, 8);
483 }
484
485 template <typename T>
486 inline void gather(scalarType const *p, const avx512Long8<T> &indices)
487 {
488 _data = _mm512_i64gather_pd(indices._data, p, 8);
489 }
490
491 template <typename T>
492 inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
493 {
494 _mm512_i64scatter_pd(out, indices._data, _data, 8);
495 }
496
497 // fma
498 // this = this + a * b
499 inline void fma(const avx512Double8 &a, const avx512Double8 &b)
500 {
501 _data = _mm512_fmadd_pd(a._data, b._data, _data);
502 }
503
504 // subscript
505 // subscript operators are convenient but expensive
506 // should not be used in optimized kernels
507 inline scalarType operator[](size_t i) const
508 {
509 alignas(alignment) scalarArray tmp;
510 store(tmp, is_aligned);
511 return tmp[i];
512 }
513
514 inline scalarType &operator[](size_t i)
515 {
516 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
517 return tmp[i];
518 }
519
520 // compound assignment ops
521 inline void operator+=(avx512Double8 rhs)
522 {
523 _data = _mm512_add_pd(_data, rhs._data);
524 }
525
526 inline void operator-=(avx512Double8 rhs)
527 {
528 _data = _mm512_sub_pd(_data, rhs._data);
529 }
530
531 inline void operator*=(avx512Double8 rhs)
532 {
533 _data = _mm512_mul_pd(_data, rhs._data);
534 }
535
536 inline void operator/=(avx512Double8 rhs)
537 {
538 _data = _mm512_div_pd(_data, rhs._data);
539 }
540};
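// Illustrative only (not part of the original header): a minimal sketch of the
// gather/scatter and fma members, assuming x, y and out are suitably sized
// double arrays (y 64-byte aligned) and idx is a 64-byte aligned array of 8
// std::uint64_t offsets:
//
//   avx512Double8 a, b, acc(0.0);
//   avx512Long8<std::uint64_t> indices(idx); // load the 8 offsets
//   a.gather(x, indices);                    // a[j] = x[idx[j]]
//   b.load(y, is_aligned);                   // packed aligned load
//   acc.fma(a, b);                           // acc = acc + a * b
//   acc.scatter(out, indices);               // out[idx[j]] = acc[j]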
541
542inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
543{
544 return _mm512_add_pd(lhs._data, rhs._data);
545}
546
547inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
548{
549 return _mm512_sub_pd(lhs._data, rhs._data);
550}
551
552inline avx512Double8 operator-(avx512Double8 in)
553{
554 return _mm512_sub_pd(_mm512_set1_pd(-0.0), in._data);
555 // return _mm512_xor_pd(in._data, _mm512_set1_pd(-0.0));
556}
557
558inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
559{
560 return _mm512_mul_pd(lhs._data, rhs._data);
561}
562
563inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
564{
565 return _mm512_div_pd(lhs._data, rhs._data);
566}
567
568inline avx512Double8 sqrt(avx512Double8 in)
569{
570 return _mm512_sqrt_pd(in._data);
571}
572
573inline avx512Double8 abs(avx512Double8 in)
574{
575 return _mm512_abs_pd(in._data);
576}
577
578inline avx512Double8 min(avx512Double8 lhs, avx512Double8 rhs)
579{
580 return _mm512_min_pd(lhs._data, rhs._data);
581}
582
583inline avx512Double8 max(avx512Double8 lhs, avx512Double8 rhs)
584{
585 return _mm512_max_pd(lhs._data, rhs._data);
586}
587
588inline avx512Double8 log(avx512Double8 in)
589{
590#if defined(TINYSIMD_HAS_SVML)
591 return _mm512_log_pd(in._data);
592#else
593 // there is no avx512 log intrinsic
594 // this is a dreadful implementation and is simply a stop gap measure
595 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
596 in.store(tmp);
597 tmp[0] = std::log(tmp[0]);
598 tmp[1] = std::log(tmp[1]);
599 tmp[2] = std::log(tmp[2]);
600 tmp[3] = std::log(tmp[3]);
601 tmp[4] = std::log(tmp[4]);
602 tmp[5] = std::log(tmp[5]);
603 tmp[6] = std::log(tmp[6]);
604 tmp[7] = std::log(tmp[7]);
605 avx512Double8 ret;
606 ret.load(tmp);
607 return ret;
608#endif
609}
610
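// Interleave helpers: "in" holds avx512Double8::width (= 8) scalar blocks of
// length dataLen stored back-to-back, and out[i] packs element i of every
// block into one SIMD vector. load_unalign_interleave below does this with
// scalar copies through an aligned temporary, so "in" needs no particular
// alignment.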
611inline void load_unalign_interleave(
612 const double *in, const std::uint32_t dataLen,
613 std::vector<avx512Double8, allocator<avx512Double8>> &out)
614{
615 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
616 for (size_t i = 0; i < dataLen; ++i)
617 {
618 tmp[0] = in[i];
619 tmp[1] = in[i + dataLen];
620 tmp[2] = in[i + 2 * dataLen];
621 tmp[3] = in[i + 3 * dataLen];
622 tmp[4] = in[i + 4 * dataLen];
623 tmp[5] = in[i + 5 * dataLen];
624 tmp[6] = in[i + 6 * dataLen];
625 tmp[7] = in[i + 7 * dataLen];
626 out[i].load(tmp);
627 }
628}
629
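// load_interleave performs the same layout transformation as
// load_unalign_interleave above, but uses hardware gathers with 64-bit index
// vectors and a 4x unrolled loop; "in" is read with stride dataLen starting
// from offsets 0, dataLen, ..., 7 * dataLen.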
630inline void load_interleave(
631 const double *in, std::uint32_t dataLen,
632 std::vector<avx512Double8, allocator<avx512Double8>> &out)
633{
634
635 alignas(avx512Double8::alignment)
636 avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
637 0, dataLen, 2 * dataLen, 3 * dataLen,
638 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
639
640 using index_t = avx512Long8<avx512Double8::scalarIndexType>;
641 index_t index0(tmp);
642 index_t index1 = index0 + 1;
643 index_t index2 = index0 + 2;
644 index_t index3 = index0 + 3;
645
646 // 4x unrolled loop
647 constexpr uint16_t unrl = 4;
648 size_t nBlocks = dataLen / unrl;
649 for (size_t i = 0; i < nBlocks; ++i)
650 {
651 out[unrl * i + 0].gather(in, index0);
652 out[unrl * i + 1].gather(in, index1);
653 out[unrl * i + 2].gather(in, index2);
654 out[unrl * i + 3].gather(in, index3);
655 index0 = index0 + unrl;
656 index1 = index1 + unrl;
657 index2 = index2 + unrl;
658 index3 = index3 + unrl;
659 }
660
661 // spillover loop
662 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
663 {
664 out[i].gather(in, index0);
665 index0 = index0 + 1;
666 }
667}
668
669inline void deinterleave_unalign_store(
670 const std::vector<avx512Double8, allocator<avx512Double8>> &in,
671 const std::uint32_t dataLen, double *out)
672{
673 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
674 for (size_t i = 0; i < dataLen; ++i)
675 {
676 in[i].store(tmp);
677 out[i] = tmp[0];
678 out[i + dataLen] = tmp[1];
679 out[i + 2 * dataLen] = tmp[2];
680 out[i + 3 * dataLen] = tmp[3];
681 out[i + 4 * dataLen] = tmp[4];
682 out[i + 5 * dataLen] = tmp[5];
683 out[i + 6 * dataLen] = tmp[6];
684 out[i + 7 * dataLen] = tmp[7];
685 }
686}
687
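// deinterleave_store is the scatter-based inverse of load_interleave: lane j
// of in[i] is written back to out[i + j * dataLen].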
688inline void deinterleave_store(
689 const std::vector<avx512Double8, allocator<avx512Double8>> &in,
690 std::uint32_t dataLen, double *out)
691{
692 // size_t nBlocks = dataLen / 4;
693
694 alignas(avx512Double8::alignment)
695 avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
696 0, dataLen, 2 * dataLen, 3 * dataLen,
697 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
698 using index_t = avx512Long8<avx512Double8::scalarIndexType>;
699 index_t index0(tmp);
700 for (size_t i = 0; i < dataLen; ++i)
701 {
702 in[i].scatter(out, index0);
703 index0 = index0 + 1;
704 }
705}
706
707////////////////////////////////////////////////////////////////////////////////
708
709struct avx512Float16
710{
711 static constexpr unsigned int width = 16;
712 static constexpr unsigned int alignment = 64;
713
714 using scalarType = float;
715 using scalarIndexType = std::uint32_t;
716 using vectorType = __m512;
717 using scalarArray = scalarType[width];
718
719 // storage
720 vectorType _data;
721
722 // ctors
723 inline avx512Float16() = default;
724 inline avx512Float16(const avx512Float16 &rhs) = default;
725 inline avx512Float16(const vectorType &rhs) : _data(rhs)
726 {
727 }
728 inline avx512Float16(const scalarType rhs)
729 {
730 _data = _mm512_set1_ps(rhs);
731 }
732
733 // copy assignment
734 inline avx512Float16 &operator=(const avx512Float16 &) = default;
735
736 // store
737 inline void store(scalarType *p) const
738 {
739 _mm512_store_ps(p, _data);
740 }
741
742 template <class flag,
743 typename std::enable_if<is_requiring_alignment_v<flag> &&
744 !is_streaming_v<flag>,
745 bool>::type = 0>
746 inline void store(scalarType *p, flag) const
747 {
748 _mm512_store_ps(p, _data);
749 }
750
751 template <class flag, typename std::enable_if<
752 !is_requiring_alignment_v<flag>, bool>::type = 0>
753 inline void store(scalarType *p, flag) const
754 {
755 _mm512_storeu_ps(p, _data);
756 }
757
758 template <class flag,
759 typename std::enable_if<is_streaming_v<flag>, bool>::type = 0>
760 inline void store(scalarType *p, flag) const
761 {
762 _mm512_stream_ps(p, _data);
763 }
764
765 // load packed
766 inline void load(const scalarType *p)
767 {
768 _data = _mm512_load_ps(p);
769 }
770
771 template <class flag, typename std::enable_if<
772 is_requiring_alignment_v<flag>, bool>::type = 0>
773 inline void load(const scalarType *p, flag)
774 {
775 _data = _mm512_load_ps(p);
776 }
777
778 template <class flag, typename std::enable_if<
779 !is_requiring_alignment_v<flag>, bool>::type = 0>
780 inline void load(const scalarType *p, flag)
781 {
782 _data = _mm512_loadu_ps(p);
783 }
784
785 // broadcast
786 inline void broadcast(const scalarType rhs)
787 {
788 _data = _mm512_set1_ps(rhs);
789 }
790
791 // gather/scatter
792 template <typename T>
793 inline void gather(scalarType const *p, const avx512Int16<T> &indices)
794 {
795 _data = _mm512_i32gather_ps(indices._data, p, sizeof(scalarType));
796 }
797
798 template <typename T>
799 inline void scatter(scalarType *out, const avx512Int16<T> &indices) const
800 {
801 _mm512_i32scatter_ps(out, indices._data, _data, sizeof(scalarType));
802 }
803
804 // fma
805 // this = this + a * b
806 inline void fma(const avx512Float16 &a, const avx512Float16 &b)
807 {
808 _data = _mm512_fmadd_ps(a._data, b._data, _data);
809 }
810
811 // subscript
812 // subscript operators are convenient but expensive
813 // should not be used in optimized kernels
814 inline scalarType operator[](size_t i) const
815 {
816 alignas(alignment) scalarArray tmp;
817 store(tmp, is_aligned);
818 return tmp[i];
819 }
820
821 inline scalarType &operator[](size_t i)
822 {
823 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
824 return tmp[i];
825 }
826
827 inline void operator+=(avx512Float16 rhs)
828 {
829 _data = _mm512_add_ps(_data, rhs._data);
830 }
831
832 inline void operator-=(avx512Float16 rhs)
833 {
834 _data = _mm512_sub_ps(_data, rhs._data);
835 }
836
837 inline void operator*=(avx512Float16 rhs)
838 {
839 _data = _mm512_mul_ps(_data, rhs._data);
840 }
841
842 inline void operator/=(avx512Float16 rhs)
843 {
844 _data = _mm512_div_ps(_data, rhs._data);
845 }
846};
847
848inline avx512Float16 operator+(avx512Float16 lhs, avx512Float16 rhs)
849{
850 return _mm512_add_ps(lhs._data, rhs._data);
851}
852
853inline avx512Float16 operator-(avx512Float16 lhs, avx512Float16 rhs)
854{
855 return _mm512_sub_ps(lhs._data, rhs._data);
856}
857
858inline avx512Float16 operator-(avx512Float16 in)
859{
860 return _mm512_sub_ps(_mm512_set1_ps(-0.0), in._data);
861 // return _mm512_xor_ps(in._data, _mm512_set1_ps(-0.0));
862}
863
864inline avx512Float16 operator*(avx512Float16 lhs, avx512Float16 rhs)
865{
866 return _mm512_mul_ps(lhs._data, rhs._data);
867}
868
869inline avx512Float16 operator/(avx512Float16 lhs, avx512Float16 rhs)
870{
871 return _mm512_div_ps(lhs._data, rhs._data);
872}
873
874inline avx512Float16 sqrt(avx512Float16 in)
875{
876 return _mm512_sqrt_ps(in._data);
877}
878
879inline avx512Float16 abs(avx512Float16 in)
880{
881 return _mm512_abs_ps(in._data);
882}
883
884inline avx512Float16 min(avx512Float16 lhs, avx512Float16 rhs)
885{
886 return _mm512_min_ps(lhs._data, rhs._data);
887}
888
889inline avx512Float16 max(avx512Float16 lhs, avx512Float16 rhs)
890{
891 return _mm512_max_ps(lhs._data, rhs._data);
892}
893
894inline avx512Float16 log(avx512Float16 in)
895{
896#if defined(TINYSIMD_HAS_SVML)
897 return _mm512_log_ps(in._data);
898#else
899 // there is no avx512 log intrinsic
900 // this is a dreadful implementation and is simply a stop gap measure
901 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
902 in.store(tmp);
903 tmp[0] = std::log(tmp[0]);
904 tmp[1] = std::log(tmp[1]);
905 tmp[2] = std::log(tmp[2]);
906 tmp[3] = std::log(tmp[3]);
907 tmp[4] = std::log(tmp[4]);
908 tmp[5] = std::log(tmp[5]);
909 tmp[6] = std::log(tmp[6]);
910 tmp[7] = std::log(tmp[7]);
911 tmp[8] = std::log(tmp[8]);
912 tmp[9] = std::log(tmp[9]);
913 tmp[10] = std::log(tmp[10]);
914 tmp[11] = std::log(tmp[11]);
915 tmp[12] = std::log(tmp[12]);
916 tmp[13] = std::log(tmp[13]);
917 tmp[14] = std::log(tmp[14]);
918 tmp[15] = std::log(tmp[15]);
919 avx512Float16 ret;
920 ret.load(tmp);
921 return ret;
922#endif
923}
924
925inline void load_unalign_interleave(
926 const float *in, const std::uint32_t dataLen,
927 std::vector<avx512Float16, allocator<avx512Float16>> &out)
928{
929 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
930 for (size_t i = 0; i < dataLen; ++i)
931 {
932 tmp[0] = in[i];
933 tmp[1] = in[i + dataLen];
934 tmp[2] = in[i + 2 * dataLen];
935 tmp[3] = in[i + 3 * dataLen];
936 tmp[4] = in[i + 4 * dataLen];
937 tmp[5] = in[i + 5 * dataLen];
938 tmp[6] = in[i + 6 * dataLen];
939 tmp[7] = in[i + 7 * dataLen];
940 tmp[8] = in[i + 8 * dataLen];
941 tmp[9] = in[i + 9 * dataLen];
942 tmp[10] = in[i + 10 * dataLen];
943 tmp[11] = in[i + 11 * dataLen];
944 tmp[12] = in[i + 12 * dataLen];
945 tmp[13] = in[i + 13 * dataLen];
946 tmp[14] = in[i + 14 * dataLen];
947 tmp[15] = in[i + 15 * dataLen];
948 out[i].load(tmp);
949 }
950}
951
952inline void load_interleave(
953 const float *in, std::uint32_t dataLen,
954 std::vector<avx512Float16, allocator<avx512Float16>> &out)
955{
956
957 alignas(avx512Float16::alignment)
958 avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
959 0,
960 dataLen,
961 2 * dataLen,
962 3 * dataLen,
963 4 * dataLen,
964 5 * dataLen,
965 6 * dataLen,
966 7 * dataLen,
967 8 * dataLen,
968 9 * dataLen,
969 10 * dataLen,
970 11 * dataLen,
971 12 * dataLen,
972 13 * dataLen,
973 14 * dataLen,
974 15 * dataLen};
975
976 using index_t = avx512Int16<avx512Float16::scalarIndexType>;
977 index_t index0(tmp);
978 index_t index1 = index0 + 1;
979 index_t index2 = index0 + 2;
980 index_t index3 = index0 + 3;
981
982 // 4x unrolled loop
983 constexpr uint16_t unrl = 4;
984 size_t nBlocks = dataLen / unrl;
985 for (size_t i = 0; i < nBlocks; ++i)
986 {
987 out[unrl * i + 0].gather(in, index0);
988 out[unrl * i + 1].gather(in, index1);
989 out[unrl * i + 2].gather(in, index2);
990 out[unrl * i + 3].gather(in, index3);
991 index0 = index0 + unrl;
992 index1 = index1 + unrl;
993 index2 = index2 + unrl;
994 index3 = index3 + unrl;
995 }
996
997 // spillover loop
998 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
999 {
1000 out[i].gather(in, index0);
1001 index0 = index0 + 1;
1002 }
1003}
1004
1005inline void deinterleave_unalign_store(
1006 const std::vector<avx512Float16, allocator<avx512Float16>> &in,
1007 const std::uint32_t dataLen, float *out)
1008{
1009 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
1010 for (size_t i = 0; i < dataLen; ++i)
1011 {
1012 in[i].store(tmp);
1013 out[i] = tmp[0];
1014 out[i + dataLen] = tmp[1];
1015 out[i + 2 * dataLen] = tmp[2];
1016 out[i + 3 * dataLen] = tmp[3];
1017 out[i + 4 * dataLen] = tmp[4];
1018 out[i + 5 * dataLen] = tmp[5];
1019 out[i + 6 * dataLen] = tmp[6];
1020 out[i + 7 * dataLen] = tmp[7];
1021 out[i + 8 * dataLen] = tmp[8];
1022 out[i + 9 * dataLen] = tmp[9];
1023 out[i + 10 * dataLen] = tmp[10];
1024 out[i + 11 * dataLen] = tmp[11];
1025 out[i + 12 * dataLen] = tmp[12];
1026 out[i + 13 * dataLen] = tmp[13];
1027 out[i + 14 * dataLen] = tmp[14];
1028 out[i + 15 * dataLen] = tmp[15];
1029 }
1030}
1031
1032inline void deinterleave_store(
1033 const std::vector<avx512Float16, allocator<avx512Float16>> &in,
1034 std::uint32_t dataLen, float *out)
1035{
1036 // size_t nBlocks = dataLen / 4;
1037
1038 alignas(avx512Float16::alignment)
1039 avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
1040 0,
1041 dataLen,
1042 2 * dataLen,
1043 3 * dataLen,
1044 4 * dataLen,
1045 5 * dataLen,
1046 6 * dataLen,
1047 7 * dataLen,
1048 8 * dataLen,
1049 9 * dataLen,
1050 10 * dataLen,
1051 11 * dataLen,
1052 12 * dataLen,
1053 13 * dataLen,
1054 14 * dataLen,
1055 15 * dataLen};
1056 using index_t = avx512Int16<avx512Float16::scalarIndexType>;
1057
1058 index_t index0(tmp);
1059 for (size_t i = 0; i < dataLen; ++i)
1060 {
1061 in[i].scatter(out, index0);
1062 index0 = index0 + 1;
1063 }
1064}
1065
1066////////////////////////////////////////////////////////////////////////////////
1067
1068// mask type
1069// a mask is an integer type with special properties (broad boolean vector)
1070// the only defined and allowed values of a broad boolean vector are:
1071// false = 0x0 and true = all bits set (e.g. 0xFFFFFFFF for 32-bit lanes)
1072//
1073// VERY LIMITED SUPPORT...just enough to make cubic eos work...
1074//
1075struct avx512Mask8 : avx512Long8<std::uint64_t>
1076{
1077 // bring in ctors
1078 using avx512Long8::avx512Long8;
1079
1080 static constexpr scalarType true_v = -1;
1081 static constexpr scalarType false_v = 0;
1082};
1083
1084inline avx512Mask8 operator>(avx512Double8 lhs, avx512Double8 rhs)
1085{
1086 __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
1087 return _mm512_maskz_set1_epi64(mask, avx512Mask8::true_v);
1088}
1089
1090inline bool operator&&(avx512Mask8 lhs, bool rhs)
1091{
1092 __m512i val_true = _mm512_set1_epi64(avx512Mask8::true_v);
1093 __mmask8 mask = _mm512_test_epi64_mask(lhs._data, val_true);
1094 unsigned int tmp = _cvtmask16_u32(mask);
1095 return tmp && rhs;
1096}
1097
1098struct avx512Mask16 : avx512Int16<std::uint32_t>
1099{
1100 // bring in ctors
1101 using avx512Int16::avx512Int16;
1102
1103 static constexpr scalarType true_v = -1;
1104 static constexpr scalarType false_v = 0;
1105};
1106
1107inline avx512Mask16 operator>(avx512Float16 lhs, avx512Float16 rhs)
1108{
1109 __mmask16 mask = _mm512_cmp_ps_mask(lhs._data, rhs._data, _CMP_GT_OQ);
1110 return _mm512_maskz_set1_epi32(mask, avx512Mask16::true_v);
1111}
1112
1113inline bool operator&&(avx512Mask16 lhs, bool rhs)
1114{
1115 __m512i val_true = _mm512_set1_epi32(avx512Mask16::true_v);
1116 __mmask16 mask = _mm512_test_epi32_mask(lhs._data, val_true);
1117 unsigned int tmp = _cvtmask16_u32(mask);
1118 return tmp && rhs;
1119}
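// Illustrative only (not part of the original header): the masks are just
// enough to express "does any lane satisfy a predicate", e.g.
//
//   avx512Double8 x, threshold(1.0);
//   // ... fill x ...
//   avx512Mask8 m = x > threshold;  // per-lane comparison, all bits set if true
//   if (m && true)                  // true if any lane compared greater
//   {
//       // take the scalar / limiting branch
//   }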
1120
1121} // namespace tinysimd
1122
1123#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
1124
1125#endif