Nektar++
avx512.hpp
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: avx512.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Vector type using avx512 extension.
32 //
33 ///////////////////////////////////////////////////////////////////////////////
34 
35 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
36 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
37 
38 #if defined(__x86_64__)
39 #include <immintrin.h>
40 #if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41 #define TINYSIMD_HAS_SVML
42 #endif
43 #endif
44 #include "allocator.hpp"
45 #include "avx2.hpp"
46 #include "traits.hpp"
47 #include <vector>
48 
49 namespace tinysimd
50 {
51 
52 namespace abi
53 {
54 
55 template <typename scalarType, int width = 0> struct avx512
56 {
57  using type = void;
58 };
59 
60 } // namespace abi
61 
62 #if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
63 
64 // forward declaration of concrete types
65 template <typename T> struct avx512Long8;
66 template <typename T> struct avx512Int16;
67 struct avx512Double8;
68 struct avx512Float16;
69 struct avx512Mask8;
70 struct avx512Mask16;
71 
72 namespace abi
73 {
74 
75 // mapping between abstract types and concrete floating point types
76 template <> struct avx512<double>
77 {
78  using type = avx512Double8;
79 };
80 template <> struct avx512<float>
81 {
82  using type = avx512Float16;
83 };
84 // generic index mapping
85 // assumes the index type has the same width as the floating-point type
86 template <> struct avx512<std::int64_t>
87 {
88  using type = avx512Long8<std::int64_t>;
89 };
90 template <> struct avx512<std::uint64_t>
91 {
92  using type = avx512Long8<std::uint64_t>;
93 };
94 #if defined(__APPLE__)
95 template <> struct avx512<std::size_t>
96 {
97  using type = avx512Long8<std::size_t>;
98 };
99 #endif
100 template <> struct avx512<std::int32_t>
101 {
102  using type = avx512Int16<std::int32_t>;
103 };
104 template <> struct avx512<std::uint32_t>
105 {
106  using type = avx512Int16<std::uint32_t>;
107 };
108 // specialized index mapping
109 template <> struct avx512<std::int64_t, 8>
110 {
111  using type = avx512Long8<std::int64_t>;
112 };
113 template <> struct avx512<std::uint64_t, 8>
114 {
115  using type = avx512Long8<std::uint64_t>;
116 };
117 #if defined(__APPLE__)
118 template <> struct avx512<std::size_t, 8>
119 {
120  using type = avx512Long8<std::size_t>;
121 };
122 #endif
123 template <> struct avx512<std::int32_t, 8>
124 {
125  using type = avx2Int8<std::int32_t>;
126 };
127 template <> struct avx512<std::uint32_t, 8>
128 {
129  using type = avx2Int8<std::uint32_t>;
130 };
131 template <> struct avx512<std::int32_t, 16>
132 {
133  using type = avx512Int16<std::int32_t>;
134 };
135 template <> struct avx512<std::uint32_t, 16>
136 {
137  using type = avx512Int16<std::uint32_t>;
138 };
139 // bool mapping
140 template <> struct avx512<bool, 8>
141 {
142  using type = avx512Mask8;
143 };
144 template <> struct avx512<bool, 16>
145 {
146  using type = avx512Mask16;
147 };
148 
149 } // namespace abi
150 
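As a quick orientation, the abi::avx512 trait above is the lookup the rest of tinysimd uses to turn a scalar type (and optionally a lane count) into a concrete register type. A minimal sketch of such a lookup, assuming this header is included with AVX-512 and NEKTAR_ENABLE_SIMD_AVX512 enabled so the specialisations below are active; the alias names are illustrative only, not part of the library:

    #include <cstdint>
    #include <type_traits>

    // illustrative aliases, not part of the library
    using vec_t   = tinysimd::abi::avx512<double>::type;           // avx512Double8
    using index_t = tinysimd::abi::avx512<std::uint64_t, 8>::type; // avx512Long8<std::uint64_t>

    static_assert(std::is_same<vec_t, tinysimd::avx512Double8>::value,
                  "double maps to the 8-lane AVX-512 type");
    static_assert(vec_t::width == 8 && vec_t::alignment == 64,
                  "8 lanes of 64-bit doubles in a 64-byte register");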
151 // concrete types
152 
153 // could add enable_if to allow only unsigned int and int...
154 template <typename T> struct avx512Int16
155 {
156  static_assert(std::is_integral<T>::value && sizeof(T) == 4,
157  "4 bytes Integral required.");
158 
159  static constexpr unsigned int width = 16;
160  static constexpr unsigned int alignment = 64;
161 
162  using scalarType = T;
163  using vectorType = __m512i;
164  using scalarArray = scalarType[width];
165 
166  // storage
167  vectorType _data;
168 
169  // ctors
170  inline avx512Int16() = default;
171  inline avx512Int16(const avx512Int16 &rhs) = default;
172  inline avx512Int16(const vectorType &rhs) : _data(rhs)
173  {
174  }
175  inline avx512Int16(const scalarType rhs)
176  {
177  _data = _mm512_set1_epi32(rhs);
178  }
179  explicit inline avx512Int16(scalarArray &rhs)
180  {
181  _data = _mm512_load_epi32(rhs);
182  }
183 
184  // copy assignment
185  inline avx512Int16 &operator=(const avx512Int16 &) = default;
186 
187  // store
188  inline void store(scalarType *p) const
189  {
190  _mm512_store_epi32(p, _data);
191  }
192 
193  template <class flag,
194  typename std::enable_if<is_requiring_alignment<flag>::value &&
195  !is_streaming<flag>::value,
196  bool>::type = 0>
197  inline void store(scalarType *p, flag) const
198  {
199  _mm512_store_epi32(p, _data);
200  }
201 
202  template <class flag,
203  typename std::enable_if<!is_requiring_alignment<flag>::value,
204  bool>::type = 0>
205  inline void store(scalarType *p, flag) const
206  {
207  _mm512_storeu_epi32(p, _data);
208  }
209 
210  inline void load(const scalarType *p)
211  {
212  _data = _mm512_load_epi32(p);
213  }
214 
215  template <class flag,
216  typename std::enable_if<is_requiring_alignment<flag>::value &&
217  !is_streaming<flag>::value,
218  bool>::type = 0>
219  inline void load(const scalarType *p, flag)
220  {
221  _data = _mm512_load_epi32(p);
222  }
223 
224  template <class flag,
225  typename std::enable_if<!is_requiring_alignment<flag>::value,
226  bool>::type = 0>
227  inline void load(const scalarType *p, flag)
228  {
229  // even though the intel intrinsic manual lists
230  // __m512i _mm512_loadu_epi32 (void const* mem_addr)
231  // it is not implemented in some compilers (gcc)
232  // since this is a bitwise load with no sign/zero extension
233  // the following instruction is equivalent
234  _data = _mm512_loadu_si512(p);
235  }
236 
237  inline void broadcast(const scalarType rhs)
238  {
239  _data = _mm512_set1_epi32(rhs);
240  }
241 
242  // subscript
243  // subscript operators are convenient but expensive
244  // should not be used in optimized kernels
245  inline scalarType operator[](size_t i) const
246  {
247  alignas(alignment) scalarArray tmp;
248  store(tmp, is_aligned);
249  return tmp[i];
250  }
251 
252  inline scalarType &operator[](size_t i)
253  {
254  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
255  return tmp[i];
256  }
257 };
258 
259 template <typename T>
260 inline avx512Int16<T> operator+(avx512Int16<T> lhs, avx512Int16<T> rhs)
261 {
262  return _mm512_add_epi32(lhs._data, rhs._data);
263 }
264 
265 template <
266  typename T, typename U,
267  typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
268 inline avx512Int16<T> operator+(avx512Int16<T> lhs, U rhs)
269 {
270  return _mm512_add_epi32(lhs._data, _mm512_set1_epi32(rhs));
271 }
272 
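The mixed vector/scalar overload above is what allows an index vector to be advanced by a plain integer, as the interleaving helpers further down do with index0 + 1. A small hedged usage sketch, illustrative only, assuming this header and <cstdint> are included inside a function body:

    alignas(64) std::uint32_t idx[16] = {0, 1, 2,  3,  4,  5,  6,  7,
                                         8, 9, 10, 11, 12, 13, 14, 15};
    tinysimd::avx512Int16<std::uint32_t> index0(idx); // aligned load of 16 indices
    auto index1 = index0 + 4u;                        // every lane shifted by 4
    // index1[0] == 4, index1[15] == 19 (subscripting is for checks only; it is slow)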
273 ////////////////////////////////////////////////////////////////////////////////
274 
275 template <typename T> struct avx512Long8
276 {
277  static_assert(std::is_integral<T>::value && sizeof(T) == 8,
278  "8 bytes Integral required.");
279 
280  static constexpr unsigned int width = 8;
281  static constexpr unsigned int alignment = 64;
282 
283  using scalarType = T;
284  using vectorType = __m512i;
285  using scalarArray = scalarType[width];
286 
287  // storage
288  vectorType _data;
289 
290  // ctors
291  inline avx512Long8() = default;
292  inline avx512Long8(const avx512Long8 &rhs) = default;
293  inline avx512Long8(const vectorType &rhs) : _data(rhs)
294  {
295  }
296  inline avx512Long8(const scalarType rhs)
297  {
298  _data = _mm512_set1_epi64(rhs);
299  }
300  explicit inline avx512Long8(scalarArray &rhs)
301  {
302  _data = _mm512_load_epi64(rhs);
303  }
304 
305  // copy assignment
306  inline avx512Long8 &operator=(const avx512Long8 &) = default;
307 
308  // store
309  inline void store(scalarType *p) const
310  {
311  _mm512_store_epi64(p, _data);
312  }
313 
314  template <class flag,
315  typename std::enable_if<is_requiring_alignment<flag>::value &&
316  !is_streaming<flag>::value,
317  bool>::type = 0>
318  inline void store(scalarType *p, flag) const
319  {
320  _mm512_store_epi64(p, _data);
321  }
322 
323  template <class flag,
324  typename std::enable_if<!is_requiring_alignment<flag>::value,
325  bool>::type = 0>
326  inline void store(scalarType *p, flag) const
327  {
328  _mm512_storeu_epi64(p, _data);
329  }
330 
331  inline void load(const scalarType *p)
332  {
333  _data = _mm512_load_epi64(p);
334  }
335 
336  template <class flag,
337  typename std::enable_if<is_requiring_alignment<flag>::value &&
338  !is_streaming<flag>::value,
339  bool>::type = 0>
340  inline void load(const scalarType *p, flag)
341  {
342  _data = _mm512_load_epi64(p);
343  }
344 
345  template <class flag,
346  typename std::enable_if<!is_requiring_alignment<flag>::value,
347  bool>::type = 0>
348  inline void load(const scalarType *p, flag)
349  {
350  // even though the intel intrinsic manual lists
351  // __m512i _mm512_loadu_epi64 (void const* mem_addr)
352  // it is not implemented in some compilers (gcc)
353  // since this is a bitwise load with no sign/zero extension
354  // the following instruction is equivalent
355  _data = _mm512_loadu_si512(p);
356  }
357 
358  inline void broadcast(const scalarType rhs)
359  {
360  _data = _mm512_set1_epi64(rhs);
361  }
362 
363  // subscript
364  // subscript operators are convenient but expensive
365  // should not be used in optimized kernels
366  inline scalarType operator[](size_t i) const
367  {
368  alignas(alignment) scalarArray tmp;
369  store(tmp, is_aligned);
370  return tmp[i];
371  }
372 
373  inline scalarType &operator[](size_t i)
374  {
375  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
376  return tmp[i];
377  }
378 };
379 
380 template <typename T>
381 inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
382 {
383  return _mm512_add_epi64(lhs._data, rhs._data);
384 }
385 
386 template <
387  typename T, typename U,
388  typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
389 inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
390 {
391  return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
392 }
393 
394 ////////////////////////////////////////////////////////////////////////////////
395 
396 struct avx512Double8
397 {
398  static constexpr unsigned int width = 8;
399  static constexpr unsigned int alignment = 64;
400 
401  using scalarType = double;
402  using scalarIndexType = std::uint64_t;
403  using vectorType = __m512d;
404  using scalarArray = scalarType[width];
405 
406  // storage
407  vectorType _data;
408 
409  // ctors
410  inline avx512Double8() = default;
411  inline avx512Double8(const avx512Double8 &rhs) = default;
412  inline avx512Double8(const vectorType &rhs) : _data(rhs)
413  {
414  }
415  inline avx512Double8(const scalarType rhs)
416  {
417  _data = _mm512_set1_pd(rhs);
418  }
419 
420  // copy assignment
421  inline avx512Double8 &operator=(const avx512Double8 &) = default;
422 
423  // store
424  inline void store(scalarType *p) const
425  {
426  _mm512_store_pd(p, _data);
427  }
428 
429  template <class flag,
430  typename std::enable_if<is_requiring_alignment<flag>::value &&
431  !is_streaming<flag>::value,
432  bool>::type = 0>
433  inline void store(scalarType *p, flag) const
434  {
435  _mm512_store_pd(p, _data);
436  }
437 
438  template <class flag,
439  typename std::enable_if<!is_requiring_alignment<flag>::value,
440  bool>::type = 0>
441  inline void store(scalarType *p, flag) const
442  {
443  _mm512_storeu_pd(p, _data);
444  }
445 
446  template <class flag, typename std::enable_if<is_streaming<flag>::value,
447  bool>::type = 0>
448  inline void store(scalarType *p, flag) const
449  {
450  _mm512_stream_pd(p, _data);
451  }
452 
453  // load packed
454  inline void load(const scalarType *p)
455  {
456  _data = _mm512_load_pd(p);
457  }
458 
459  template <class flag,
460  typename std::enable_if<is_requiring_alignment<flag>::value,
461  bool>::type = 0>
462  inline void load(const scalarType *p, flag)
463  {
464  _data = _mm512_load_pd(p);
465  }
466 
467  template <class flag,
468  typename std::enable_if<!is_requiring_alignment<flag>::value,
469  bool>::type = 0>
470  inline void load(const scalarType *p, flag)
471  {
472  _data = _mm512_loadu_pd(p);
473  }
474 
475  // broadcast
476  inline void broadcast(const scalarType rhs)
477  {
478  _data = _mm512_set1_pd(rhs);
479  }
480 
481  // gather/scatter
482  template <typename T>
483  inline void gather(scalarType const *p, const avx2Int8<T> &indices)
484  {
485  _data = _mm512_i32gather_pd(indices._data, p, 8);
486  }
487 
488  template <typename T>
489  inline void scatter(scalarType *out, const avx2Int8<T> &indices) const
490  {
491  _mm512_i32scatter_pd(out, indices._data, _data, 8);
492  }
493 
494  template <typename T>
495  inline void gather(scalarType const *p, const avx512Long8<T> &indices)
496  {
497  _data = _mm512_i64gather_pd(indices._data, p, 8);
498  }
499 
500  template <typename T>
501  inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
502  {
503  _mm512_i64scatter_pd(out, indices._data, _data, 8);
504  }
505 
506  // fma
507  // this = this + a * b
508  inline void fma(const avx512Double8 &a, const avx512Double8 &b)
509  {
510  _data = _mm512_fmadd_pd(a._data, b._data, _data);
511  }
512 
513  // subscript
514  // subscript operators are convenient but expensive
515  // should not be used in optimized kernels
516  inline scalarType operator[](size_t i) const
517  {
518  alignas(alignment) scalarArray tmp;
519  store(tmp, is_aligned);
520  return tmp[i];
521  }
522 
523  inline scalarType &operator[](size_t i)
524  {
525  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
526  return tmp[i];
527  }
528 
529  // unary ops
530  inline void operator+=(avx512Double8 rhs)
531  {
532  _data = _mm512_add_pd(_data, rhs._data);
533  }
534 
535  inline void operator-=(avx512Double8 rhs)
536  {
537  _data = _mm512_sub_pd(_data, rhs._data);
538  }
539 
540  inline void operator*=(avx512Double8 rhs)
541  {
542  _data = _mm512_mul_pd(_data, rhs._data);
543  }
544 
545  inline void operator/=(avx512Double8 rhs)
546  {
547  _data = _mm512_div_pd(_data, rhs._data);
548  }
549 };
550 
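As a usage note for the fma member above: it accumulates in place (this = this + a * b), which is how inner kernels typically chain multiply-adds without extra temporaries. A minimal hedged sketch with made-up values, assuming this header is included:

    tinysimd::avx512Double8 acc(0.0), a(2.0), b(3.0);
    acc.fma(a, b);                        // every lane now holds 0 + 2*3 = 6
    acc += tinysimd::avx512Double8(1.0);  // every lane now holds 7
    alignas(tinysimd::avx512Double8::alignment)
        double out[tinysimd::avx512Double8::width];
    acc.store(out);                       // aligned packed store of all 8 lanes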
551 inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
552 {
553  return _mm512_add_pd(lhs._data, rhs._data);
554 }
555 
556 inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
557 {
558  return _mm512_sub_pd(lhs._data, rhs._data);
559 }
560 
561 inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
562 {
563  return _mm512_mul_pd(lhs._data, rhs._data);
564 }
565 
566 inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
567 {
568  return _mm512_div_pd(lhs._data, rhs._data);
569 }
570 
571 inline avx512Double8 sqrt(avx512Double8 in)
572 {
573  return _mm512_sqrt_pd(in._data);
574 }
575 
576 inline avx512Double8 abs(avx512Double8 in)
577 {
578  return _mm512_abs_pd(in._data);
579 }
580 
581 inline avx512Double8 log(avx512Double8 in)
582 {
583 #if defined(TINYSIMD_HAS_SVML)
584  return _mm512_log_pd(in._data);
585 #else
586  // there is no avx512 log intrinsic
587  // this is a dreadful implementation and is simply a stopgap measure
588  alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
589  in.store(tmp);
590  tmp[0] = std::log(tmp[0]);
591  tmp[1] = std::log(tmp[1]);
592  tmp[2] = std::log(tmp[2]);
593  tmp[3] = std::log(tmp[3]);
594  tmp[4] = std::log(tmp[4]);
595  tmp[5] = std::log(tmp[5]);
596  tmp[6] = std::log(tmp[6]);
597  tmp[7] = std::log(tmp[7]);
598  avx512Double8 ret;
599  ret.load(tmp);
600  return ret;
601 #endif
602 }
603 
604 inline void load_interleave(
605  const double *in, std::uint32_t dataLen,
606  std::vector<avx512Double8, allocator<avx512Double8>> &out)
607 {
608 
609  alignas(avx512Double8::alignment)
610  avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
611  0, dataLen, 2 * dataLen, 3 * dataLen,
612  4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
613 
614  using index_t = avx512Long8<avx512Double8::scalarIndexType>;
615  index_t index0(tmp);
616  index_t index1 = index0 + 1;
617  index_t index2 = index0 + 2;
618  index_t index3 = index0 + 3;
619 
620  // 4x unrolled loop
621  constexpr uint16_t unrl = 4;
622  size_t nBlocks = dataLen / unrl;
623  for (size_t i = 0; i < nBlocks; ++i)
624  {
625  out[unrl * i + 0].gather(in, index0);
626  out[unrl * i + 1].gather(in, index1);
627  out[unrl * i + 2].gather(in, index2);
628  out[unrl * i + 3].gather(in, index3);
629  index0 = index0 + unrl;
630  index1 = index1 + unrl;
631  index2 = index2 + unrl;
632  index3 = index3 + unrl;
633  }
634 
635  // spillover loop
636  for (size_t i = unrl * nBlocks; i < dataLen; ++i)
637  {
638  out[i].gather(in, index0);
639  index0 = index0 + 1;
640  }
641 }
642 
643 inline void deinterleave_store(
644  const std::vector<avx512Double8, allocator<avx512Double8>> &in,
645  std::uint32_t dataLen, double *out)
646 {
647  // size_t nBlocks = dataLen / 4;
648 
649  alignas(avx512Double8::alignment)
650  avx512Double8::scalarIndexType tmp[avx512Double8::width] = {
651  0, dataLen, 2 * dataLen, 3 * dataLen,
652  4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
653  using index_t = avx512Long8<avx512Double8::scalarIndexType>;
654  index_t index0(tmp);
655  for (size_t i = 0; i < dataLen; ++i)
656  {
657  in[i].scatter(out, index0);
658  index0 = index0 + 1;
659  }
660 }
661 
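Taken together, load_interleave and deinterleave_store transpose between the scalar layout (width = 8 sequences of length dataLen stored back to back) and the vector layout (dataLen vectors whose lane k comes from sequence k). A hedged round-trip sketch, assuming tinysimd::allocator is the aligned allocator from allocator.hpp:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void roundtrip_example()
    {
        constexpr std::uint32_t dataLen = 6;
        std::vector<double> in(8 * dataLen), back(8 * dataLen);
        for (std::size_t i = 0; i < in.size(); ++i)
            in[i] = static_cast<double>(i);

        // dataLen vectors of 8 lanes; lane k of work[j] is in[k * dataLen + j]
        std::vector<tinysimd::avx512Double8,
                    tinysimd::allocator<tinysimd::avx512Double8>> work(dataLen);
        tinysimd::load_interleave(in.data(), dataLen, work);
        tinysimd::deinterleave_store(work, dataLen, back.data()); // back now equals in
    }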
662 ////////////////////////////////////////////////////////////////////////////////
663 
664 struct avx512Float16
665 {
666  static constexpr unsigned int width = 16;
667  static constexpr unsigned int alignment = 64;
668 
669  using scalarType = float;
670  using scalarIndexType = std::uint32_t;
671  using vectorType = __m512;
672  using scalarArray = scalarType[width];
673 
674  // storage
675  vectorType _data;
676 
677  // ctors
678  inline avx512Float16() = default;
679  inline avx512Float16(const avx512Float16 &rhs) = default;
680  inline avx512Float16(const vectorType &rhs) : _data(rhs)
681  {
682  }
683  inline avx512Float16(const scalarType rhs)
684  {
685  _data = _mm512_set1_ps(rhs);
686  }
687 
688  // copy assignment
689  inline avx512Float16 &operator=(const avx512Float16 &) = default;
690 
691  // store
692  inline void store(scalarType *p) const
693  {
694  _mm512_store_ps(p, _data);
695  }
696 
697  template <class flag,
698  typename std::enable_if<is_requiring_alignment<flag>::value &&
699  !is_streaming<flag>::value,
700  bool>::type = 0>
701  inline void store(scalarType *p, flag) const
702  {
703  _mm512_store_ps(p, _data);
704  }
705 
706  template <class flag,
707  typename std::enable_if<!is_requiring_alignment<flag>::value,
708  bool>::type = 0>
709  inline void store(scalarType *p, flag) const
710  {
711  _mm512_storeu_ps(p, _data);
712  }
713 
714  template <class flag, typename std::enable_if<is_streaming<flag>::value,
715  bool>::type = 0>
716  inline void store(scalarType *p, flag) const
717  {
718  _mm512_stream_ps(p, _data);
719  }
720 
721  // load packed
722  inline void load(const scalarType *p)
723  {
724  _data = _mm512_load_ps(p);
725  }
726 
727  template <class flag,
728  typename std::enable_if<is_requiring_alignment<flag>::value,
729  bool>::type = 0>
730  inline void load(const scalarType *p, flag)
731  {
732  _data = _mm512_load_ps(p);
733  }
734 
735  template <class flag,
736  typename std::enable_if<!is_requiring_alignment<flag>::value,
737  bool>::type = 0>
738  inline void load(const scalarType *p, flag)
739  {
740  _data = _mm512_loadu_ps(p);
741  }
742 
743  // broadcast
744  inline void broadcast(const scalarType rhs)
745  {
746  _data = _mm512_set1_ps(rhs);
747  }
748 
749  // gather/scatter
750  template <typename T>
751  inline void gather(scalarType const *p, const avx512Int16<T> &indices)
752  {
753  _data = _mm512_i32gather_ps(indices._data, p, sizeof(scalarType));
754  }
755 
756  template <typename T>
757  inline void scatter(scalarType *out, const avx512Int16<T> &indices) const
758  {
759  _mm512_i32scatter_ps(out, indices._data, _data, sizeof(scalarType));
760  }
761 
762  // fma
763  // this = this + a * b
764  inline void fma(const avx512Float16 &a, const avx512Float16 &b)
765  {
766  _data = _mm512_fmadd_ps(a._data, b._data, _data);
767  }
768 
769  // subscript
770  // subscript operators are convenient but expensive
771  // should not be used in optimized kernels
772  inline scalarType operator[](size_t i) const
773  {
774  alignas(alignment) scalarArray tmp;
775  store(tmp, is_aligned);
776  return tmp[i];
777  }
778 
779  inline scalarType &operator[](size_t i)
780  {
781  scalarType *tmp = reinterpret_cast<scalarType *>(&_data);
782  return tmp[i];
783  }
784 
785  inline void operator+=(avx512Float16 rhs)
786  {
787  _data = _mm512_add_ps(_data, rhs._data);
788  }
789 
790  inline void operator-=(avx512Float16 rhs)
791  {
792  _data = _mm512_sub_ps(_data, rhs._data);
793  }
794 
795  inline void operator*=(avx512Float16 rhs)
796  {
797  _data = _mm512_mul_ps(_data, rhs._data);
798  }
799 
800  inline void operator/=(avx512Float16 rhs)
801  {
802  _data = _mm512_div_ps(_data, rhs._data);
803  }
804 };
805 
806 inline avx512Float16 operator+(avx512Float16 lhs, avx512Float16 rhs)
807 {
808  return _mm512_add_ps(lhs._data, rhs._data);
809 }
810 
811 inline avx512Float16 operator-(avx512Float16 lhs, avx512Float16 rhs)
812 {
813  return _mm512_sub_ps(lhs._data, rhs._data);
814 }
815 
816 inline avx512Float16 operator*(avx512Float16 lhs, avx512Float16 rhs)
817 {
818  return _mm512_mul_ps(lhs._data, rhs._data);
819 }
820 
821 inline avx512Float16 operator/(avx512Float16 lhs, avx512Float16 rhs)
822 {
823  return _mm512_div_ps(lhs._data, rhs._data);
824 }
825 
826 inline avx512Float16 sqrt(avx512Float16 in)
827 {
828  return _mm512_sqrt_ps(in._data);
829 }
830 
831 inline avx512Float16 abs(avx512Float16 in)
832 {
833  return _mm512_abs_ps(in._data);
834 }
835 
836 inline avx512Float16 log(avx512Float16 in)
837 {
838 #if defined(TINYSIMD_HAS_SVML)
839  return _mm512_log_ps(in._data);
840 #else
841  // there is no avx512 log intrinsic
842  // this is a dreadful implementation and is simply a stopgap measure
843  alignas(avx512Float16::alignment) avx512Float16::scalarArray tmp;
844  in.store(tmp);
845  tmp[0] = std::log(tmp[0]);
846  tmp[1] = std::log(tmp[1]);
847  tmp[2] = std::log(tmp[2]);
848  tmp[3] = std::log(tmp[3]);
849  tmp[4] = std::log(tmp[4]);
850  tmp[5] = std::log(tmp[5]);
851  tmp[6] = std::log(tmp[6]);
852  tmp[7] = std::log(tmp[7]);
853  tmp[8] = std::log(tmp[8]);
854  tmp[9] = std::log(tmp[9]);
855  tmp[10] = std::log(tmp[10]);
856  tmp[11] = std::log(tmp[11]);
857  tmp[12] = std::log(tmp[12]);
858  tmp[13] = std::log(tmp[13]);
859  tmp[14] = std::log(tmp[14]);
860  tmp[15] = std::log(tmp[15]);
861  avx512Float16 ret;
862  ret.load(tmp);
863  return ret;
864 #endif
865 }
866 
867 inline void load_interleave(
868  const float *in, std::uint32_t dataLen,
869  std::vector<avx512Float16, allocator<avx512Float16>> &out)
870 {
871 
872  alignas(avx512Float16::alignment)
873  avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
874  0, dataLen, 2 * dataLen, 3 * dataLen, 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen,
875  8 * dataLen, 9 * dataLen, 10 * dataLen, 11 * dataLen, 12 * dataLen, 13 * dataLen, 14 * dataLen, 15 * dataLen};
876 
877  using index_t = avx512Int16<avx512Float16::scalarIndexType>;
878  index_t index0(tmp);
879  index_t index1 = index0 + 1;
880  index_t index2 = index0 + 2;
881  index_t index3 = index0 + 3;
882 
883  // 4x unrolled loop
884  constexpr uint16_t unrl = 4;
885  size_t nBlocks = dataLen / unrl;
886  for (size_t i = 0; i < nBlocks; ++i)
887  {
888  out[unrl * i + 0].gather(in, index0);
889  out[unrl * i + 1].gather(in, index1);
890  out[unrl * i + 2].gather(in, index2);
891  out[unrl * i + 3].gather(in, index3);
892  index0 = index0 + unrl;
893  index1 = index1 + unrl;
894  index2 = index2 + unrl;
895  index3 = index3 + unrl;
896  }
897 
898  // spillover loop
899  for (size_t i = unrl * nBlocks; i < dataLen; ++i)
900  {
901  out[i].gather(in, index0);
902  index0 = index0 + 1;
903  }
904 }
905 
906 inline void deinterleave_store(
907  const std::vector<avx512Float16, allocator<avx512Float16>> &in,
908  std::uint32_t dataLen, float *out)
909 {
910  // size_t nBlocks = dataLen / 4;
911 
912  alignas(avx512Float16::alignment)
913  avx512Float16::scalarIndexType tmp[avx512Float16::width] = {
914  0, dataLen, 2 * dataLen, 3 * dataLen, 4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen,
915  8 * dataLen, 9 * dataLen, 10 * dataLen, 11 * dataLen, 12 * dataLen, 13 * dataLen, 14 * dataLen, 15 * dataLen};
916  using index_t = avx512Int16<avx512Float16::scalarIndexType>;
917 
918  index_t index0(tmp);
919  for (size_t i = 0; i < dataLen; ++i)
920  {
921  in[i].scatter(out, index0);
922  index0 = index0 + 1;
923  }
924 }
925 
926 ////////////////////////////////////////////////////////////////////////////////
927 
928 // mask type
929 // mask is an integer type with special properties (broad boolean vector)
930 // for broad boolean vectors the defined and allowed values are:
931 // false=0x0 and true=0xFFFFFFFF
932 //
933 // VERY LIMITED SUPPORT...just enough to make cubic eos work...
934 //
935 struct avx512Mask8 : avx512Long8<std::uint64_t>
936 {
937  // bring in ctors
938  using avx512Long8::avx512Long8;
939 
940  static constexpr scalarType true_v = -1;
941  static constexpr scalarType false_v = 0;
942 };
943 
944 inline avx512Mask8 operator>(avx512Double8 lhs, avx512Double8 rhs)
945 {
946  __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
947  return _mm512_maskz_set1_epi64(mask, avx512Mask8::true_v);
948 }
949 
950 inline bool operator&&(avx512Mask8 lhs, bool rhs)
951 {
952  __m512i val_true = _mm512_set1_epi64(avx512Mask8::true_v);
953  __mmask8 mask = _mm512_test_epi64_mask(lhs._data, val_true);
954  unsigned int tmp = _cvtmask16_u32(mask);
955  return tmp && rhs;
956 }
957 
958 struct avx512Mask16 : avx512Int16<std::uint32_t>
959 {
960  // bring in ctors
961  using avx512Int16::avx512Int16;
962 
963  static constexpr scalarType true_v = -1;
964  static constexpr scalarType false_v = 0;
965 };
966 
967 inline avx512Mask16 operator>(avx512Float16 lhs, avx512Float16 rhs)
968 {
969  __mmask16 mask = _mm512_cmp_ps_mask(lhs._data, rhs._data, _CMP_GT_OQ);
970  return _mm512_maskz_set1_epi32(mask, avx512Mask16::true_v);
971 }
972 
973 inline bool operator&&(avx512Mask16 lhs, bool rhs)
974 {
975  __m512i val_true = _mm512_set1_epi32(avx512Mask16::true_v);
976  __mmask16 mask = _mm512_test_epi32_mask(lhs._data, val_true);
977  unsigned int tmp = _cvtmask16_u32(mask);
978  return tmp && rhs;
979 }
980 
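The two overloads above (operator> producing a broad mask and operator&& collapsing it against a scalar bool) are just enough to write an "any lane" test, as in the hedged sketch below; the values are hypothetical, in the spirit of the cubic EOS use mentioned above:

    void any_lane_example()
    {
        tinysimd::avx512Double8 pressure(101.3), threshold(90.0);
        tinysimd::avx512Mask8 gt = pressure > threshold; // all lanes true here
        if (gt && true) // true if at least one lane compared greater
        {
            // take the special-case path for this batch of 8 values
        }
    }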
981 #endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)
982 
983 } // namespace tinysimd
984 #endif