37#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
38#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
40#if defined(__x86_64__)
42#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
43#define TINYSIMD_HAS_SVML
52template <
typename scalarType>
struct sse2
59#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
// Forward declaration of the 4-lane integer vector wrapper; it is named by
// the `type` aliases that follow before its full definition appears.
65template <
typename T>
struct sse2Int4;
73 using type = sse2Int4<std::int32_t>;
77 using type = sse2Int4<std::uint32_t>;
83template <
typename T>
struct sse2Int4
// Compile-time guard: the element type must be an integral type of exactly
// four bytes.
85 static_assert(std::is_integral_v<T> &&
sizeof(T) == 4,
86 "4 bytes Integral required.");
// Number of lanes held by one vector.
88 static constexpr unsigned int width = 4;
// Alignment (in bytes) applied via alignas() to temporaries in this struct.
89 static constexpr unsigned int alignment = 16;
// Underlying 128-bit integer register type.
92 using vectorType = __m128i;
// Plain array with one element per lane.
// NOTE(review): scalarType is not declared in the visible lines --
// presumably an alias of T; confirm against the full header.
93 using scalarArray = scalarType[width];
99 inline sse2Int4() =
default;
100 inline sse2Int4(
const sse2Int4 &rhs) =
default;
101 inline sse2Int4(
const vectorType &rhs) : _data(rhs)
104 inline sse2Int4(
const scalarType rhs)
106 _data = _mm_set1_epi32(rhs);
110 inline void store(scalarType *
p)
const
112 _mm_store_si128(
reinterpret_cast<vectorType *
>(
p), _data);
115 template <
class flag,
116 typename std::enable_if<is_requiring_alignment_v<flag> &&
117 !is_streaming_v<flag>,
119 inline void store(scalarType *
p, flag)
const
121 _mm_store_si128(
reinterpret_cast<vectorType *
>(
p), _data);
124 template <
class flag,
typename std::enable_if<
125 !is_requiring_alignment_v<flag>,
bool>::type = 0>
126 inline void store(scalarType *
p, flag)
const
128 _mm_storeu_si128(
reinterpret_cast<vectorType *
>(
p), _data);
131 inline void load(
const scalarType *
p)
133 _data = _mm_load_si128(
reinterpret_cast<const vectorType *
>(
p));
136 template <
class flag,
137 typename std::enable_if<is_requiring_alignment_v<flag> &&
138 !is_streaming_v<flag>,
140 inline void load(
const scalarType *
p, flag)
142 _data = _mm_load_si128(
reinterpret_cast<const vectorType *
>(
p));
145 template <
class flag,
typename std::enable_if<
146 !is_requiring_alignment_v<flag>,
bool>::type = 0>
147 inline void load(
const scalarType *
p, flag)
149 _data = _mm_loadu_si128(
reinterpret_cast<const vectorType *
>(
p));
153 inline void gather(scalarType
const *
p,
const sse2Int4<T> &indices)
155 _data = _mm_i32gather_epi32(
p, indices._data, 8);
158 inline void scatter(scalarType *out,
const sse2Int4<T> &indices)
const
161 alignas(alignment) scalarArray tmp;
162 _mm_store_epi32(tmp, _data);
164 out[_mm_extract_epi32(indices._data, 0)] = tmp[0];
165 out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
168 inline void broadcast(
const scalarType rhs)
170 _data = _mm_set1_epi32(rhs);
176 inline scalarType operator[](
size_t i)
const
178 alignas(alignment) scalarArray tmp;
static constexpr struct tinysimd::is_aligned_t is_aligned