#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif

#include "allocator.hpp"
#include "traits.hpp"
#include <cmath>
#include <cstdint>
#include <vector>

namespace tinysimd
{

namespace abi
{
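// Note: SVML here refers to Intel's Short Vector Math Library, which supplies
// vectorised transcendental intrinsics such as _mm512_log_pd; when it is not
// available, the lane-wise scalar fallback further down is used instead.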
// primary template: scalar types without a concrete AVX-512 wrapper map to void
template <typename scalarType> struct avx512
{
    using type = void;
};

} // namespace abi
#if defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

// forward declaration of concrete types
template <typename T> struct avx512Long8;
struct avx512Double8;
struct avx512Mask;

namespace abi
{
// mapping between abstract types and concrete types
template <> struct avx512<double>
{
    using type = avx512Double8;
};
template <> struct avx512<std::int64_t>
{
    using type = avx512Long8<std::int64_t>;
};
template <> struct avx512<std::uint64_t>
{
    using type = avx512Long8<std::uint64_t>;
};
template <> struct avx512<bool>
{
    using type = avx512Mask;
};

} // namespace abi
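// The specialisations above are what the rest of tinysimd queries: for
// example, abi::avx512<double>::type resolves to avx512Double8 and
// abi::avx512<std::uint64_t>::type to avx512Long8<std::uint64_t>, so generic
// code never needs to name the concrete wrapper types directly.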
// concrete types
template <typename T> struct avx512Long8
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 8,
                  "8 bytes Integral required.");

    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType  = T;
    using vectorType  = __m512i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;
    // ctors
    inline avx512Long8()                       = default;
    inline avx512Long8(const avx512Long8 &rhs) = default;
    inline avx512Long8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Long8(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }
    explicit inline avx512Long8(scalarArray &rhs)
    {
        _data = _mm512_load_epi64(rhs);
    }
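    // Note: the scalarArray constructor (and the plain load/store overloads
    // below) use aligned intrinsics, so the array or pointer passed in must
    // satisfy the 64-byte alignment requirement declared above.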
    // store packed (default: aligned)
    inline void store(scalarType *p) const
    {
        _mm512_store_epi64(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_epi64(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_epi64(p, _data);
    }
    // load packed (default: aligned)
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_epi64(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_epi64(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        // _mm512_loadu_epi64 is not implemented by all compilers (e.g. gcc);
        // since this is a bitwise load with no extension, the si512 variant
        // is equivalent for the unaligned case
        _data = _mm512_loadu_si512(p);
    }
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_epi64(rhs);
    }

    // subscript operators are convenient but expensive and
    // should not be used in optimised kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }
};
template <typename T>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, avx512Long8<T> rhs)
{
    return _mm512_add_epi64(lhs._data, rhs._data);
}

template <
    typename T, typename U,
    typename = typename std::enable_if<std::is_arithmetic<U>::value>::type>
inline avx512Long8<T> operator+(avx512Long8<T> lhs, U rhs)
{
    return _mm512_add_epi64(lhs._data, _mm512_set1_epi64(rhs));
}
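// Usage sketch (illustrative only, not part of the header): building a lane
// index vector and offsetting it by a scalar; buf is a hypothetical 64-byte
// aligned array supplied by the caller.
//
//   alignas(64) std::uint64_t buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   avx512Long8<std::uint64_t> idx(buf);   // explicit scalarArray ctor
//   idx = idx + std::uint64_t(8);          // scalar operator+ overload above
//   idx.store(buf);                        // aligned store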
struct avx512Double8
{
    static constexpr unsigned int width     = 8;
    static constexpr unsigned int alignment = 64;

    using scalarType  = double;
    using vectorType  = __m512d;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;
    // ctors
    inline avx512Double8()                         = default;
    inline avx512Double8(const avx512Double8 &rhs) = default;
    inline avx512Double8(const vectorType &rhs) : _data(rhs)
    {
    }
    inline avx512Double8(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }
    // store packed (default: aligned)
    inline void store(scalarType *p) const
    {
        _mm512_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_store_pd(p, _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_storeu_pd(p, _data);
    }

    template <class flag, typename std::enable_if<is_streaming<flag>::value,
                                                  bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm512_stream_pd(p, _data);
    }
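    // Note: the is_streaming overload issues a non-temporal store
    // (_mm512_stream_pd), which writes around the cache; it is meant for large
    // write-only outputs that will not be re-read soon.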
    // load packed (default: aligned)
    inline void load(const scalarType *p)
    {
        _data = _mm512_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_load_pd(p);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm512_loadu_pd(p);
    }
    inline void broadcast(const scalarType rhs)
    {
        _data = _mm512_set1_pd(rhs);
    }
    // gather/scatter with 64-bit indices; the trailing 8 is the byte scale
    // applied to each index (one double = 8 bytes)
    template <typename T>
    inline void gather(scalarType const *p, const avx512Long8<T> &indices)
    {
        _data = _mm512_i64gather_pd(indices._data, p, 8);
    }

    template <typename T>
    inline void scatter(scalarType *out, const avx512Long8<T> &indices) const
    {
        _mm512_i64scatter_pd(out, indices._data, _data, 8);
    }
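    // Usage sketch (illustrative only): gathering every dataLen-th double from
    // a scalar array, with idx holding {0, dataLen, 2*dataLen, ...} as built
    // in load_interleave further down; v, in, out and idx are hypothetical
    // caller-side names.
    //
    //   avx512Double8 v;
    //   v.gather(in, idx);       // v[k] = in[idx[k]]
    //   v.scatter(out, idx);     // out[idx[k]] = v[k]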
    // fused multiply-add: this = a * b + this
    inline void fma(const avx512Double8 &a, const avx512Double8 &b)
    {
        _data = _mm512_fmadd_pd(a._data, b._data, _data);
    }
    // subscript operators are convenient but expensive and
    // should not be used in optimised kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }
    // compound assignment operators
    inline void operator+=(avx512Double8 rhs)
    {
        _data = _mm512_add_pd(_data, rhs._data);
    }

    inline void operator-=(avx512Double8 rhs)
    {
        _data = _mm512_sub_pd(_data, rhs._data);
    }

    inline void operator*=(avx512Double8 rhs)
    {
        _data = _mm512_mul_pd(_data, rhs._data);
    }

    inline void operator/=(avx512Double8 rhs)
    {
        _data = _mm512_div_pd(_data, rhs._data);
    }
};
inline avx512Double8 operator+(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_add_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator-(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_sub_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator*(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_mul_pd(lhs._data, rhs._data);
}

inline avx512Double8 operator/(avx512Double8 lhs, avx512Double8 rhs)
{
    return _mm512_div_pd(lhs._data, rhs._data);
}
inline avx512Double8 sqrt(avx512Double8 in)
{
    return _mm512_sqrt_pd(in._data);
}

inline avx512Double8 abs(avx512Double8 in)
{
    return _mm512_abs_pd(in._data);
}
inline avx512Double8 log(avx512Double8 in)
{
#if defined(TINYSIMD_HAS_SVML)
    return _mm512_log_pd(in._data);
#else
    // no SVML: apply std::log lane-wise through an aligned temporary
    // (sketch of the scalar fallback)
    alignas(avx512Double8::alignment) avx512Double8::scalarArray tmp;
    in.store(tmp, is_aligned);
    for (unsigned int i = 0; i < avx512Double8::width; ++i)
    {
        tmp[i] = std::log(tmp[i]);
    }
    avx512Double8 ret;
    ret.load(tmp, is_aligned);
    return ret;
#endif
}
// interleaved (transpose) load: reads 8 contiguous blocks of length dataLen
// from in and interleaves them lane-wise, so out[i] holds element i of each
// block
inline void load_interleave(
    const double *in, size_t dataLen,
    std::vector<avx512Double8, allocator<avx512Double8>> &out)
{
    alignas(avx512Double8::alignment) size_t tmp[avx512Double8::width] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};

    using index_t = avx512Long8<size_t>;
    index_t index0(tmp);
    index_t index1 = index0 + 1;
    index_t index2 = index0 + 2;
    index_t index3 = index0 + 3;

    // 4x unrolled loop over the gathers
    constexpr uint16_t unrl = 4;
    size_t nBlocks          = dataLen / unrl;
    for (size_t i = 0; i < nBlocks; ++i)
    {
        out[unrl * i + 0].gather(in, index0);
        out[unrl * i + 1].gather(in, index1);
        out[unrl * i + 2].gather(in, index2);
        out[unrl * i + 3].gather(in, index3);
        index0 = index0 + unrl;
        index1 = index1 + unrl;
        index2 = index2 + unrl;
        index3 = index3 + unrl;
    }

    // remainder loop
    for (size_t i = unrl * nBlocks; i < dataLen; ++i)
    {
        out[i].gather(in, index0);
        index0 = index0 + 1;
    }
}
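// Usage sketch (illustrative only): transposing 8 blocks of dataLen contiguous
// doubles, laid out back-to-back in a hypothetical array in, into dataLen
// vectors of width 8 and back again.
//
//   std::vector<avx512Double8, allocator<avx512Double8>> vec(dataLen);
//   load_interleave(in, dataLen, vec);
//   // ... operate on vec[i], one SIMD lane per original block ...
//   deinterleave_store(vec, dataLen, out);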
// de-interleaved (transpose) store: the inverse of load_interleave
inline void deinterleave_store(
    const std::vector<avx512Double8, allocator<avx512Double8>> &in,
    size_t dataLen, double *out)
{
    alignas(avx512Double8::alignment) size_t tmp[avx512Double8::width] = {
        0,           dataLen,     2 * dataLen, 3 * dataLen,
        4 * dataLen, 5 * dataLen, 6 * dataLen, 7 * dataLen};
    using index_t = avx512Long8<size_t>;
    index_t index0(tmp);

    for (size_t i = 0; i < dataLen; ++i)
    {
        in[i].scatter(out, index0);
        index0 = index0 + 1;
    }
}
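// Round-trip note: vector element i, lane k corresponds to scalar element
// k * dataLen + i, so load_interleave followed by deinterleave_store
// reproduces the original scalar array.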
// mask type: an integer vector where each lane is either all ones (true)
// or all zeros (false)
struct avx512Mask : avx512Long8<std::uint64_t>
{
    // bring in ctors
    using avx512Long8::avx512Long8;

    static constexpr scalarType true_v  = -1;
    static constexpr scalarType false_v = 0;
};
inline avx512Mask operator>(avx512Double8 lhs, avx512Double8 rhs)
{
    __mmask8 mask = _mm512_cmp_pd_mask(lhs._data, rhs._data, _CMP_GT_OQ);
    return _mm512_maskz_set1_epi64(mask, avx512Mask::true_v);
}
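// The comparison yields an 8-bit kmask; _mm512_maskz_set1_epi64 expands it
// back into a full 512-bit vector with true_v in the selected lanes and zero
// elsewhere, so the result can be stored and inspected like any other vector.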
inline bool operator&&(avx512Mask lhs, bool rhs)
{
    __m512i val_true = _mm512_set1_epi64(avx512Mask::true_v);
    __mmask8 mask    = _mm512_test_epi64_mask(lhs._data, val_true);
    unsigned int tmp = _cvtmask16_u32(mask);
    // reduce the lane mask to a scalar boolean and combine with rhs
    return tmp && rhs;
}

#endif // defined(__AVX512F__) && defined(NEKTAR_ENABLE_SIMD_AVX512)

} // namespace tinysimd

#endif // NEKTAR_LIB_LIBUTILITES_SIMDLIB_AVX512_H