37#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
38#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
40#if defined(__x86_64__)
42#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
43#define TINYSIMD_HAS_SVML
52template <
typename scalarType>
struct sse2
59#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
// forward declaration of the concrete 4-lane integer vector type
template <typename T> struct sse2Int4;
71template <>
struct sse2<
std::int32_t>
73 using type = sse2Int4<std::int32_t>;
75template <>
struct sse2<
std::uint32_t>
77 using type = sse2Int4<std::uint32_t>;
83template <
typename T>
struct sse2Int4
85 static_assert(std::is_integral<T>::value &&
sizeof(T) == 4,
86 "4 bytes Integral required.");
88 static constexpr unsigned int width = 4;
89 static constexpr unsigned int alignment = 16;
92 using vectorType = __m128i;
93 using scalarArray = scalarType[width];
99 inline sse2Int4() =
default;
100 inline sse2Int4(
const sse2Int4 &rhs) =
default;
101 inline sse2Int4(
const vectorType &rhs) : _data(rhs)
104 inline sse2Int4(
const scalarType rhs)
106 _data = _mm_set1_epi32(rhs);
110 inline void store(scalarType *
p)
const
112 _mm_store_si128(
reinterpret_cast<vectorType *
>(
p), _data);
115 template <
class flag,
116 typename std::enable_if<is_requiring_alignment<flag>::value &&
117 !is_streaming<flag>::value,
119 inline void store(scalarType *
p, flag)
const
121 _mm_store_si128(
reinterpret_cast<vectorType *
>(
p), _data);
124 template <
class flag,
125 typename std::enable_if<!is_requiring_alignment<flag>::value,
127 inline void store(scalarType *
p, flag)
const
129 _mm_storeu_si128(
reinterpret_cast<vectorType *
>(
p), _data);
132 inline void load(
const scalarType *
p)
134 _data = _mm_load_si128(
reinterpret_cast<const vectorType *
>(
p));
137 template <
class flag,
138 typename std::enable_if<is_requiring_alignment<flag>::value &&
139 !is_streaming<flag>::value,
141 inline void load(
const scalarType *
p, flag)
143 _data = _mm_load_si128(
reinterpret_cast<const vectorType *
>(
p));
146 template <
class flag,
147 typename std::enable_if<!is_requiring_alignment<flag>::value,
149 inline void load(
const scalarType *
p, flag)
151 _data = _mm_loadu_si128(
reinterpret_cast<const vectorType *
>(
p));
155 inline void gather(scalarType
const *
p,
const sse2Int4<T> &indices)
157 _data = _mm_i32gather_epi32(
p, indices._data, 8);
160 inline void scatter(scalarType *out,
const sse2Int4<T> &indices)
const
163 alignas(alignment) scalarArray tmp;
164 _mm_store_epi32(tmp, _data);
166 out[_mm_extract_epi32(indices._data, 0)] = tmp[0];
167 out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
170 inline void broadcast(
const scalarType rhs)
172 _data = _mm_set1_epi32(rhs);
178 inline scalarType operator[](
size_t i)
const
180 alignas(alignment) scalarArray tmp;
static constexpr struct tinysimd::is_aligned_t is_aligned