Nektar++
avx512.hpp
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: avx512.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Vector type using avx512 extension.
32//
33///////////////////////////////////////////////////////////////////////////////
34
35#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
36#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
37
38#if defined(__x86_64__)
39#include <immintrin.h>
40#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41#define TINYSIMD_HAS_SVML
42#endif
43#endif
44#include "allocator.hpp"
45#include "avx2.hpp"
46#include "traits.hpp"
47#include <vector>
48
49namespace tinysimd::abi
50{
51
52template <typename scalarType, int width = 0> struct avx512
53{
54 using type = void;
55};
56
57} // namespace tinysimd::abi
58
59#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
60
61namespace tinysimd
62{
63
64// forward declaration of concrete types
65template <typename T> struct avx512Long8;
66template <typename T> struct avx512Int16;
67struct avx512Double8;
68struct avx512Float16;
69struct avx512Mask8;
70struct avx512Mask16;
71
72namespace abi
73{
74
75// mapping between abstract types and concrete floating point types
76template <> struct avx512<double>
77{
78 using type = avx512Double8;
79};
80template <> struct avx512<float>
81{
82 using type = avx512Float16;
83};
84// generic index mapping
85// assumes index type width same as floating point type
86template <> struct avx512<std::int64_t>
87{
88 using type = avx512Long8<std::int64_t>;
89};
90template <> struct avx512<std::uint64_t>
91{
92 using type = avx512Long8<std::uint64_t>;
93};
94#if defined(__APPLE__)
95template <> struct avx512<std::size_t>
96{
97 using type = avx512Long8<std::size_t>;
98};
99#endif
100template <> struct avx512<std::int32_t>
101{
102 using type = avx512Int16<std::int32_t>;
103};
104template <> struct avx512<std::uint32_t>
105{
106 using type = avx512Int16<std::uint32_t>;
107};
108// specialized index mapping
109template <> struct avx512<std::int64_t, 8>
110{
111 using type = avx512Long8<std::int64_t>;
112};
113template <> struct avx512<std::uint64_t, 8>
114{
115 using type = avx512Long8<std::uint64_t>;
116};
117#if defined(__APPLE__)
118template <> struct avx512<std::size_t, 8>
119{
120 using type = avx512Long8<std::size_t>;
121};
122#endif
123template <> struct avx512<std::int32_t, 8>
124{
125 using type = avx2Int8<std::int32_t>;
126};
127template <> struct avx512<std::uint32_t, 8>
128{
129 using type = avx2Int8<std::uint32_t>;
130};
131template <> struct avx512<std::int32_t, 16>
132{
133 using type = avx512Int16<std::int32_t>;
134};
135template <> struct avx512<std::uint32_t, 16>
136{
137 using type = avx512Int16<std::uint32_t>;
138};
139// bool mapping
140template <> struct avx512<bool, 8>
141{
142 using type = avx512Mask8;
143};
144template <> struct avx512<bool, 16>
145{
146 using type = avx512Mask16;
147};
148
149} // namespace abi
150
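// Usage sketch (illustration only, not part of the original header): kernels
// are expected to pick the concrete vector type through the abi trait rather
// than naming avx512Double8 directly, so the same kernel code can be
// retargeted (via traits.hpp) to the AVX2 or scalar backends. The example_*
// alias names below are hypothetical.
using example_vec_t = abi::avx512<double>::type;        // avx512Double8
using example_idx_t = abi::avx512<std::uint64_t>::type; // avx512Long8<std::uint64_t>
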
151// concrete types
152
153// could add enable_if to allow only unsigned int and int...
154template <typename T> struct avx512Int16
155{
156 static_assert(std::is_integral<T>::value && sizeof(T) == 4,
157 "4 bytes Integral required.");
158
159 static constexpr unsigned int width = 16;
160 static constexpr unsigned int alignment = 64;
161
162 using scalarType = T;
163 using vectorType = __m512i;
164 using scalarArray = scalarType[width];
165
166 // storage
167 vectorType _data;
168
169 // ctors
170 inline avx512Int16() = default;
171 inline avx512Int16(const avx512Int16 &rhs) = default;
172 inline avx512Int16(const vectorType &rhs) : _data(rhs)
173 {
174 }
175 inline avx512Int16(const scalarType rhs)
176 {
177 _data = _mm512_set1_epi32(rhs);
178 }
179 explicit inline avx512Int16(scalarArray &rhs)
180 {
181 _data = _mm512_load_epi32(rhs);
182 }
183
184 // copy assignment
185 inline avx512Int16 &operator=(const avx512Int16 &) = default;
186
187 // store
188 inline void store(scalarType *p) const
189 {
190 _mm512_store_epi32(p, _data);
191 }
192
193 template <class flag,
194 typename std::enable_if<is_requiring_alignment<flag>::value &&
195 !is_streaming<flag>::value,
196 bool>::type = 0>
197 inline void store(scalarType *p, flag) const
198 {
199 _mm512_store_epi32(p, _data);
200 }
201
202 template <class flag,
203 typename std::enable_if<!is_requiring_alignment<flag>::value,
204 bool>::type = 0>
205 inline void store(scalarType *p, flag) const
206 {
207 _mm512_storeu_epi32(p, _data);
208 }
209
210 inline void load(const scalarType *p)
211 {
212 _data = _mm512_load_epi32(p);
213 }
214
215 template <class flag,
216 typename std::enable_if<is_requiring_alignment<flag>::value &&
217 !is_streaming<flag>::value,
218 bool>::type = 0>
219 inline void load(const scalarType *p, flag)
220 {
221 _data = _mm512_load_epi32(p);
222 }
223
224 template <class flag,
225 typename std::enable_if<!is_requiring_alignment<flag>::value,
226 bool>::type = 0>
227 inline void load(const scalarType *p, flag)
228 {
229        // even though the intel intrinsic manual lists
230        // __m512i _mm512_loadu_epi32 (void const* mem_addr)
231        // it is not implemented in some compilers (gcc)
232        // since this is a bitwise load with no extension
233        // the following instruction is equivalent
234 _data = _mm512_loadu_si512(p);
235 }
236
237 inline void broadcast(const scalarType rhs)
238 {
239 _data = _mm512_set1_epi32(rhs);
240 }
241
242 // subscript
243    // subscript operators are convenient but expensive
244 // should not be used in optimized kernels
245 inline scalarType operator[](size_t i) const
246 {
247 alignas(alignment) scalarArray tmp;
248 store(tmp, is_aligned);
249 return tmp[i];
250 }
251
252 inline scalarType &operator[](size_t i)
253 {
254 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
255 return tmp[i];
256 }
257};
258
259template <typename T>
260inline avx512Int16<T> operator+(avx512Int16<T> lhs, avx512Int16<T> rhs)
261{
262 return _mm512_add_epi32(lhs._data, rhs._data);
263}
264
265template <
266 typename T, typename U,
267 typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
268inline avx512Int16<T> operator+(avx512Int16<T> lhs, U rhs)
269{
270 return _mm512_add_epi32(lhs._data, _mm512_set1_epi32(rhs));
271}
272
273////////////////////////////////////////////////////////////////////////////////
274
275template <typename T> struct avx512Long8
276{
277 static_assert(std::is_integral<T>::value && sizeof(T) == 8,
278 "8 bytes Integral required.");
279
280 static constexpr unsigned int width = 8;
281 static constexpr unsigned int alignment = 64;
282
283 using scalarType = T;
284 using vectorType = __m512i;
285 using scalarArray = scalarType[width];
286
287 // storage
288 vectorType _data;
289
290 // ctors
291 inline avx512Long8() = default;
292 inline avx512Long8(const avx512Long8 &rhs) = default;
293 inline avx512Long8(const vectorType &rhs) : _data(rhs)
294 {
295 }
296 inline avx512Long8(const scalarType rhs)
297 {
298 _data = _mm512_set1_epi64(rhs);
299 }
300 explicit inline avx512Long8(scalarArray &rhs)
301 {
302 _data = _mm512_load_epi64(rhs);
303 }
304
305 // copy assignment
306 inline avx512Long8 &operator=(const avx512Long8 &) = default;
307
308 // store
309 inline void store(scalarType *p) const
310 {
311 _mm512_store_epi64(p, _data);
312 }
313
314 template <class flag,
315 typename std::enable_if<is_requiring_alignment<flag>::value &&
316 !is_streaming<flag>::value,
317 bool>::type = 0>
318 inline void store(scalarType *p, flag) const
319 {
320 _mm512_store_epi64(p, _data);
321 }
322
323 template <class flag,
324 typename std::enable_if<!is_requiring_alignment<flag>::value,
325 bool>::type = 0>
326 inline void store(scalarType *p, flag) const
327 {
328 _mm512_storeu_epi64(p, _data);
329 }
330
331 inline void load(const scalarType *p)
332 {
333 _data = _mm512_load_epi64(p);
334 }
335
336 template <class flag,
337 typename std::enable_if<is_requiring_alignment<flag>::value &&
338 !is_streaming<flag>::value,
339 bool>::type = 0>
340 inline void load(const scalarType *p, flag)
341 {
342 _data = _mm512_load_epi64(p);
343 }
344
345 template <class flag,
346 typename std::enable_if<!is_requiring_alignment<flag>::value,
347 bool>::type = 0>
348 inline void load(const scalarType *p, flag)
349 {
350        // even though the intel intrinsic manual lists
351 // __m512i _mm512_loadu_epi64 (void const* mem_addr)
352 // it is not implemented in some compilers (gcc)
353 // since this is a bitwise load with no extension
354 // the following instruction is equivalent
355 _data = _mm512_loadu_si512(p);
356 }
357
358 inline void broadcast(const scalarType rhs)
359 {
360 _data = _mm512_set1_epi64(rhs);
361 }
362
363 // subscript
364    // subscript operators are convenient but expensive
365 // should not be used in optimized kernels
366 inline scalarType operator[](size_t i) const
367 {
368 alignas(alignment) scalarArray tmp;
369 store(tmp, is_aligned);
370 return tmp[i];
371 }
372
373 inline scalarType &operator[](size_t i)
374 {
375 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
376 return tmp[i];
377 }
378};
379
380template <typename T>
381inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
382{
383 return _mm512_add_epi64(lhs._data, rhs._data);
384}
385
386template <
387 typename T, typename U,
388 typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
389inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
390{
391 return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
392}
393
394////////////////////////////////////////////////////////////////////////////////
395
396struct avx512Double8
397{
398 static constexpr unsigned int width = 8;
399 static constexpr unsigned int alignment = 64;
400
401 using scalarType = double;
402 using scalarIndexType = std::uint64_t;
403 using vectorType = __m512d;
404 using scalarArray = scalarType[width];
405
406 // storage
407 vectorType _data;
408
409 // ctors
410 inline avx512Double8() = default;
411 inline avx512Double8(const avx512Double8 &rhs) = default;
412 inline avx512Double8(const vectorType &rhs) : _data(rhs)
413 {
414 }
415 inline avx512Double8(const scalarType rhs)
416 {
417 _data = _mm512_set1_pd(rhs);
418 }
419
420 // copy assignment
421 inline avx512Double8 &operator=(const avx512Double8 &) = default;
422
423 // store
424 inline void store(scalarType *p) const
425 {
426 _mm512_store_pd(p, _data);
427 }
428
429 template <class flag,
430 typename std::enable_if<is_requiring_alignment<flag>::value &&
431 !is_streaming<flag>::value,
432 bool>::type = 0>
433 inline void store(scalarType *p, flag) const
434 {
435 _mm512_store_pd(p, _data);
436 }
437
438 template <class flag,
439 typename std::enable_if<!is_requiring_alignment<flag>::value,
440 bool>::type = 0>
441 inline void store(scalarType *p, flag) const
442 {
443 _mm512_storeu_pd(p, _data);
444 }
445
446 template <class flag, typename std::enable_if<is_streaming<flag>::value,
447 bool>::type = 0>
448 inline void store(scalarType *p, flag) const
449 {
450 _mm512_stream_pd(p, _data);
451 }
452
453 // load packed
454 inline void load(const scalarType *p)
455 {
456 _data = _mm512_load_pd(p);
457 }
458
459 template <class flag,
460 typename std::enable_if<is_requiring_alignment<flag>::value,
461 bool>::type = 0>
462 inline void load(const scalarType *p, flag)
463 {
464 _data = _mm512_load_pd(p);
465 }
466
467 template <class flag,
468 typename std::enable_if<!is_requiring_alignment<flag>::value,
469 bool>::type = 0>
470 inline void load(const scalarType *p, flag)
471 {
472 _data = _mm512_loadu_pd(p);
473 }
474
475 // broadcast
476 inline void broadcast(const scalarType rhs)
477 {
478 _data = _mm512_set1_pd(rhs);
479 }
480
481 // gather/scatter
482 template <typename T>
483 inline void gather(scalarType const *p, const avx2Int8<T> &indices)
484 {
485 _data = _mm512_i32gather_pd(indices._data, p, 8);
486 }
487
488 template <typename T>
489 inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
490 {
491 _mm512_i32scatter_pd(out, indices._data, _data, 8);
492 }
493
494 template <typename T>
495 inline void gather(scalarType const *p, const avx512Long8<T> &indices)
496 {
497 _data = _mm512_i64gather_pd(indices._data, p, 8);
498 }
499
500 template <typename T>
501 inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
502 {
503 _mm512_i64scatter_pd(out, indices._data, _data, 8);
504 }
505
506 // fma
507 // this = this + a * b
508 inline void fma(const avx512Double8 &a, const avx512Double8 &b)
509 {
510 _data = _mm512_fmadd_pd(a._data, b._data, _data);
511 }
512
513 // subscript
514    // subscript operators are convenient but expensive
515 // should not be used in optimized kernels
516 inline scalarType operator[](size_t i) const
517 {
518 alignas(alignment) scalarArray tmp;
519 store(tmp, is_aligned);
520 return tmp[i];
521 }
522
523 inline scalarType &operator[](size_t i)
524 {
525 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
526 return tmp[i];
527 }
528
529 // unary ops
530 inline void operator+=(avx512Double8 rhs)
531 {
532 _data = _mm512_add_pd(_data, rhs._data);
533 }
534
535 inline void operator-=(avx512Double8 rhs)
536 {
537 _data = _mm512_sub_pd(_data, rhs._data);
538 }
539
540 inline void operator*=(avx512Double8 rhs)
541 {
542 _data = _mm512_mul_pd(_data, rhs._data);
543 }
544
545 inline void operator/=(avx512Double8 rhs)
546 {
547 _data = _mm512_div_pd(_data, rhs._data);
548 }
549};
550
551inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
552{
553 return _mm512_add_pd(lhs._data, rhs._data);
554}
555
556inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
557{
558 return _mm512_sub_pd(lhs._data, rhs._data);
559}
560
561inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
562{
563 return _mm512_mul_pd(lhs._data, rhs._data);
564}
565
566inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
567{
568 return _mm512_div_pd(lhs._data, rhs._data);
569}
570
571inline avx512Double8 sqrt(avx512Double8 in)
572{
573 return _mm512_sqrt_pd(in._data);
574}
575
576inline avx512Double8 abs(avx512Double8 in)
577{
578 return _mm512_abs_pd(in._data);
579}
580
581inline avx512Double8 log(avx512Double8 in)
582{
583#if defined(TINYSIMD_HAS_SVML)
584 return _mm512_log_pd(in._data);
585#else
586 // there is no avx512 log intrinsic
587 // this is a dreadful implementation and is simply a stop gap measure
588 alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
589 in.store(tmp);
590 tmp[0] = std::log(tmp[0]);
591 tmp[1] = std::log(tmp[1]);
592 tmp[2] = std::log(tmp[2]);
593 tmp[3] = std::log(tmp[3]);
594 tmp[4] = std::log(tmp[4]);
595 tmp[5] = std::log(tmp[5]);
596 tmp[6] = std::log(tmp[6]);
597 tmp[7] = std::log(tmp[7]);
598 avx512Double8 ret;
599 ret.load(tmp);
600 return ret;
601#endif
602}
603
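// Usage sketch (illustration only, not part of the original header). The
// function name example_fma_block and its arguments are hypothetical; it
// computes out[i] = alpha + in0[i] * in1[i] for one vector of width 8,
// assuming all three pointers are 64-byte aligned.
inline void example_fma_block(const double *in0, const double *in1, double *out,
                              double alpha)
{
    avx512Double8 a, b;
    a.load(in0, is_aligned);  // aligned packed load (_mm512_load_pd)
    b.load(in1, is_aligned);
    avx512Double8 acc(alpha); // broadcast constructor (_mm512_set1_pd)
    acc.fma(a, b);            // acc = acc + a * b (_mm512_fmadd_pd)
    acc.store(out, is_aligned);
}
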
604inline void load_interleave(
605 const double *in, std::uint32_t dataLen,
606 std::vector<avx512Double8, allocator<avx512Double8>> &out)
607{
608
609 alignas(avx512Double8::alignment)
610 avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
611 0, dataLen, 2 * dataLen, 3 * dataLen,
612 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
613
614 using index_t = avx512Long8<avx512Double8::scalarIndexType>;
615 index_t index0(tmp);
616 index_t index1 = index0 + 1;
617 index_t index2 = index0 + 2;
618 index_t index3 = index0 + 3;
619
620 // 4x unrolled loop
621 constexpr uint16_t unrl = 4;
622 size_t nBlocks = dataLen / unrl;
623 for (size_t i = 0; i < nBlocks; ++i)
624 {
625 out[unrl * i + 0].gather(in, index0);
626 out[unrl * i + 1].gather(in, index1);
627 out[unrl * i + 2].gather(in, index2);
628 out[unrl * i + 3].gather(in, index3);
629 index0 = index0 + unrl;
630 index1 = index1 + unrl;
631 index2 = index2 + unrl;
632 index3 = index3 + unrl;
633 }
634
635 // spillover loop
636 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
637 {
638 out[i].gather(in, index0);
639 index0 = index0 + 1;
640 }
641}
642
643inline void deinterleave_store(
644 const std::vector<avx512Double8, allocator<avx512Double8>> &in,
645 std::uint32_t dataLen, double *out)
646{
647 // size_t nBlocks = dataLen / 4;
648
649 alignas(avx512Double8::alignment)
650 avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
651 0, dataLen, 2 * dataLen, 3 * dataLen,
652 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
653 using index_t = avx512Long8<avx512Double8::scalarIndexType>;
654 index_t index0(tmp);
655 for (size_t i = 0; i < dataLen; ++i)
656 {
657 in[i].scatter(out, index0);
658 index0 = index0 + 1;
659 }
660}
661
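// Usage sketch (illustration only, not part of the original header). The
// function name example_interleave_roundtrip is hypothetical: "in" is assumed
// to hold 8 contiguous fields of dataLen doubles each, so every avx512Double8
// in "vec" gathers one entry from each field, the block is processed
// vector-wise, and the result is scattered back to the same layout in "out".
inline void example_interleave_roundtrip(const double *in, std::uint32_t dataLen,
                                         double *out)
{
    std::vector<avx512Double8, allocator<avx512Double8>> vec(dataLen);
    load_interleave(in, dataLen, vec); // strided gathers
    for (auto &v : vec)
    {
        v = sqrt(abs(v)); // operate on whole vectors at a time
    }
    deinterleave_store(vec, dataLen, out); // strided scatters
}
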
662////////////////////////////////////////////////////////////////////////////////
663
664struct avx512Float16
665{
666 static constexpr unsigned int width = 16;
667 static constexpr unsigned int alignment = 64;
668
669 using scalarType = float;
670 using scalarIndexType = std::uint32_t;
671 using vectorType = __m512;
672 using scalarArray = scalarType[width];
673
674 // storage
675 vectorType _data;
676
677 // ctors
678 inline avx512Float16() = default;
679 inline avx512Float16(const avx512Float16 &rhs) = default;
680 inline avx512Float16(const vectorType &rhs) : _data(rhs)
681 {
682 }
683 inline avx512Float16(const scalarType rhs)
684 {
685 _data = _mm512_set1_ps(rhs);
686 }
687
688 // copy assignment
689 inline avx512Float16 &operator=(const avx512Float16 &) = default;
690
691 // store
692 inline void store(scalarType *p) const
693 {
694 _mm512_store_ps(p, _data);
695 }
696
697 template <class flag,
698 typename std::enable_if<is_requiring_alignment<flag>::value &&
699 !is_streaming<flag>::value,
700 bool>::type = 0>
701 inline void store(scalarType *p, flag) const
702 {
703 _mm512_store_ps(p, _data);
704 }
705
706 template <class flag,
707 typename std::enable_if<!is_requiring_alignment<flag>::value,
708 bool>::type = 0>
709 inline void store(scalarType *p, flag) const
710 {
711 _mm512_storeu_ps(p, _data);
712 }
713
714 template <class flag, typename std::enable_if<is_streaming<flag>::value,
715 bool>::type = 0>
716 inline void store(scalarType *p, flag) const
717 {
718 _mm512_stream_ps(p, _data);
719 }
720
721 // load packed
722 inline void load(const scalarType *p)
723 {
724 _data = _mm512_load_ps(p);
725 }
726
727 template <class flag,
728 typename std::enable_if<is_requiring_alignment<flag>::value,
729 bool>::type = 0>
730 inline void load(const scalarType *p, flag)
731 {
732 _data = _mm512_load_ps(p);
733 }
734
735 template <class flag,
736 typename std::enable_if<!is_requiring_alignment<flag>::value,
737 bool>::type = 0>
738 inline void load(const scalarType *p, flag)
739 {
740 _data = _mm512_loadu_ps(p);
741 }
742
743 // broadcast
744 inline void broadcast(const scalarType rhs)
745 {
746 _data = _mm512_set1_ps(rhs);
747 }
748
749 // gather/scatter
750 template <typename T>
751 inline void gather(scalarType const *p, const avx512Int16<T> &indices)
752 {
753 _data = _mm512_i32gather_ps(indices._data, p, sizeof(scalarType));
754 }
755
756 template <typename T>
757 inline void scatter(scalarType *out, const avx512Int16<T> &indices) const
758 {
759 _mm512_i32scatter_ps(out, indices._data, _data, sizeof(scalarType));
760 }
761
762 // fma
763 // this = this + a * b
764 inline void fma(const avx512Float16 &a, const avx512Float16 &b)
765 {
766 _data = _mm512_fmadd_ps(a._data, b._data, _data);
767 }
768
769 // subscript
770    // subscript operators are convenient but expensive
771 // should not be used in optimized kernels
772 inline scalarType operator[](size_t i) const
773 {
774 alignas(alignment) scalarArray tmp;
775 store(tmp, is_aligned);
776 return tmp[i];
777 }
778
779 inline scalarType &operator[](size_t i)
780 {
781 scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
782 return tmp[i];
783 }
784
785 inline void operator+=(avx512Float16 rhs)
786 {
787 _data = _mm512_add_ps(_data, rhs._data);
788 }
789
790 inline void operator-=(avx512Float16 rhs)
791 {
792 _data = _mm512_sub_ps(_data, rhs._data);
793 }
794
795 inline void operator*=(avx512Float16 rhs)
796 {
797 _data = _mm512_mul_ps(_data, rhs._data);
798 }
799
800 inline void operator/=(avx512Float16 rhs)
801 {
802 _data = _mm512_div_ps(_data, rhs._data);
803 }
804};
805
806inline avx512Float16 operator+(avx512Float16 lhs, avx512Float16 rhs)
807{
808 return _mm512_add_ps(lhs._data, rhs._data);
809}
810
811inline avx512Float16 operator-(avx512Float16 lhs, avx512Float16 rhs)
812{
813 return _mm512_sub_ps(lhs._data, rhs._data);
814}
815
816inline avx512Float16 operator*(avx512Float16 lhs, avx512Float16 rhs)
817{
818 return _mm512_mul_ps(lhs._data, rhs._data);
819}
820
821inline avx512Float16 operator/(avx512Float16 lhs, avx512Float16 rhs)
822{
823 return _mm512_div_ps(lhs._data, rhs._data);
824}
825
826inline avx512Float16 sqrt(avx512Float16 in)
827{
828 return _mm512_sqrt_ps(in._data);
829}
830
831inline avx512Float16 abs(avx512Float16 in)
832{
833 return _mm512_abs_ps(in._data);
834}
835
836inline avx512Float16 log(avx512Float16 in)
837{
838#if defined(TINYSIMD_HAS_SVML)
839 return _mm512_log_ps(in._data);
840#else
841 // there is no avx512 log intrinsic
842 // this is a dreadful implementation and is simply a stop gap measure
843 alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
844 in.store(tmp);
845 tmp[0] = std::log(tmp[0]);
846 tmp[1] = std::log(tmp[1]);
847 tmp[2] = std::log(tmp[2]);
848 tmp[3] = std::log(tmp[3]);
849 tmp[4] = std::log(tmp[4]);
850 tmp[5] = std::log(tmp[5]);
851 tmp[6] = std::log(tmp[6]);
852 tmp[7] = std::log(tmp[7]);
853 tmp[8] = std::log(tmp[8]);
854 tmp[9] = std::log(tmp[9]);
855 tmp[10] = std::log(tmp[10]);
856 tmp[11] = std::log(tmp[11]);
857 tmp[12] = std::log(tmp[12]);
858 tmp[13] = std::log(tmp[13]);
859 tmp[14] = std::log(tmp[14]);
860 tmp[15] = std::log(tmp[15]);
861 avx512Float16 ret;
862 ret.load(tmp);
863 return ret;
864#endif
865}
866
867inline void load_interleave(
868 const float *in, std::uint32_t dataLen,
869 std::vector<avx512Float16, allocator<avx512Float16>> &out)
870{
871
872 alignas(avx512Float16::alignment)
873 avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
874            0, dataLen, 2 * dataLen, 3 * dataLen, 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen,
875            8 * dataLen, 9 * dataLen, 10 * dataLen, 11 * dataLen, 12 * dataLen, 13 * dataLen, 14 * dataLen, 15 * dataLen};
876
877 using index_t = avx512Int16<avx512Float16::scalarIndexType>;
878 index_t index0(tmp);
879 index_t index1 = index0 + 1;
880 index_t index2 = index0 + 2;
881 index_t index3 = index0 + 3;
882
883 // 4x unrolled loop
884 constexpr uint16_t unrl = 4;
885 size_t nBlocks = dataLen / unrl;
886 for (size_t i = 0; i < nBlocks; ++i)
887 {
888 out[unrl * i + 0].gather(in, index0);
889 out[unrl * i + 1].gather(in, index1);
890 out[unrl * i + 2].gather(in, index2);
891 out[unrl * i + 3].gather(in, index3);
892 index0 = index0 + unrl;
893 index1 = index1 + unrl;
894 index2 = index2 + unrl;
895 index3 = index3 + unrl;
896 }
897
898 // spillover loop
899 for (size_t i = unrl * nBlocks; i < dataLen; ++i)
900 {
901 out[i].gather(in, index0);
902 index0 = index0 + 1;
903 }
904}
905
906inline void deinterleave_store(
907 const std::vector<avx512Float16, allocator<avx512Float16>> &in,
908 std::uint32_t dataLen, float *out)
909{
910 // size_t nBlocks = dataLen / 4;
911
912 alignas(avx512Float16::alignment)
913 avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
914            0, dataLen, 2 * dataLen, 3 * dataLen, 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen,
915            8 * dataLen, 9 * dataLen, 10 * dataLen, 11 * dataLen, 12 * dataLen, 13 * dataLen, 14 * dataLen, 15 * dataLen};
916 using index_t = avx512Int16<avx512Float16::scalarIndexType>;
917
918 index_t index0(tmp);
919 for (size_t i = 0; i < dataLen; ++i)
920 {
921 in[i].scatter(out, index0);
922 index0 = index0 + 1;
923 }
924}
925
926////////////////////////////////////////////////////////////////////////////////
927
928// mask type
929// mask is an integer type with special properties (broad boolean vector)
930// broad boolean vectors are defined so that the only allowed values are:
931// false = all bits 0 and true = all bits 1 (e.g. 0xFFFFFFFF for 32-bit lanes)
932//
933// VERY LIMITED SUPPORT...just enough to make cubic eos work...
934//
935struct avx512Mask8 : avx512Long8<std::uint64_t>
936{
937 // bring in ctors
938 using avx512Long8::avx512Long8;
939
940 static constexpr scalarType true_v = -1;
941 static constexpr scalarType false_v = 0;
942};
943
944inline avx512Mask8 operator>(avx512Double8 lhs, avx512Double8 rhs)
945{
946 __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
947 return _mm512_maskz_set1_epi64(mask, avx512Mask8::true_v);
948}
949
950inline bool operator&&(avx512Mask8 lhs, bool rhs)
951{
952 __m512i val_true = _mm512_set1_epi64(avx512Mask8::true_v);
953 __mmask8 mask = _mm512_test_epi64_mask(lhs._data, val_true);
954 unsigned int tmp = _cvtmask16_u32(mask);
955 return tmp && rhs;
956}
957
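// Usage sketch (illustration only, not part of the original header): the broad
// mask produced by operator> collapses to a scalar bool through operator&&.
// The function name example_any_negative is hypothetical.
inline bool example_any_negative(avx512Double8 x)
{
    avx512Mask8 m = avx512Double8(0.0) > x; // lane-wise compare, all-ones per true lane
    return m && true;                       // true if any lane compared true
}
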
958struct avx512Mask16 : avx512Int16<std::uint32_t>
959{
960 // bring in ctors
961 using avx512Int16::avx512Int16;
962
963 static constexpr scalarType true_v = -1;
964 static constexpr scalarType false_v = 0;
965};
966
967inline avx512Mask16 operator>(avx512Float16 lhs, avx512Float16 rhs)
968{
969 __mmask16 mask = _mm512_cmp_ps_mask(lhs._data, rhs._data, _CMP_GT_OQ);
970 return _mm512_maskz_set1_epi32(mask, avx512Mask16::true_v);
971}
972
973inline bool operator&&(avx512Mask16 lhs, bool rhs)
974{
975 __m512i val_true = _mm512_set1_epi32(avx512Mask16::true_v);
976 __mmask16 mask = _mm512_test_epi32_mask(lhs._data, val_true);
977 unsigned int tmp = _cvtmask16_u32(mask);
978 return tmp && rhs;
979}
980
981} // namespace tinysimd
982
983#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
984
985#endif