35 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
36 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
38 #if defined(__x86_64__)
39 #include <immintrin.h>
40 #if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
41 #define TINYSIMD_HAS_SVML
53 template <
typename scalarType>
struct sse2
60 #if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
// Forward declaration of the 4-lane 32-bit integer vector wrapper so the
// sse2<> specialisations that follow can name it before it is defined.
template <typename T> struct sse2Int4;
69 template <>
struct sse2<std::int32_t>
71 using type = sse2Int4<std::int32_t>;
73 template <>
struct sse2<std::uint32_t>
75 using type = sse2Int4<std::uint32_t>;
81 template <
typename T>
struct sse2Int4
83 static_assert(std::is_integral<T>::value &&
sizeof(T) == 4,
84 "4 bytes Integral required.");
86 static constexpr
unsigned int width = 4;
87 static constexpr
unsigned int alignment = 16;
90 using vectorType = __m128i;
91 using scalarArray = scalarType[width];
97 inline sse2Int4() =
default;
98 inline sse2Int4(
const sse2Int4 &rhs) =
default;
99 inline sse2Int4(
const vectorType &rhs) : _data(rhs)
102 inline sse2Int4(
const scalarType rhs)
104 _data = _mm_set1_epi32(rhs);
108 inline void store(scalarType *
p)
const
110 _mm_store_si128(
reinterpret_cast<vectorType *
>(
p), _data);
113 template <
class flag,
114 typename std::enable_if<is_requiring_alignment<flag>::value &&
115 !is_streaming<flag>::value,
117 inline void store(scalarType *
p, flag)
const
119 _mm_store_si128(
reinterpret_cast<vectorType *
>(
p), _data);
122 template <
class flag,
123 typename std::enable_if<!is_requiring_alignment<flag>::value,
125 inline void store(scalarType *
p, flag)
const
127 _mm_storeu_si128(
reinterpret_cast<vectorType *
>(
p), _data);
130 inline void load(
const scalarType *
p)
132 _data = _mm_load_si128(
reinterpret_cast<const vectorType *
>(
p));
135 template <
class flag,
136 typename std::enable_if<is_requiring_alignment<flag>::value &&
137 !is_streaming<flag>::value,
139 inline void load(
const scalarType *
p, flag)
141 _data = _mm_load_si128(
reinterpret_cast<const vectorType *
>(
p));
144 template <
class flag,
145 typename std::enable_if<!is_requiring_alignment<flag>::value,
147 inline void load(
const scalarType *
p, flag)
149 _data = _mm_loadu_si128(
reinterpret_cast<const vectorType *
>(
p));
153 inline void gather(scalarType
const *
p,
const sse2Int4<T> &indices)
155 _data = _mm_i32gather_pd(
p, indices._data, 8);
158 inline void scatter(scalarType *out,
const sse2Int4<T> &indices)
const
161 alignas(alignment) scalarArray tmp;
162 _mm_store_pd(tmp, _data);
164 out[_mm_extract_epi32(indices._data, 0)] = tmp[0];
165 out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
168 inline void broadcast(
const scalarType rhs)
170 _data = _mm_set1_epi32(rhs);
176 inline scalarType operator[](
size_t i)
const
178 alignas(alignment) scalarArray tmp;
static constexpr struct tinysimd::is_aligned_t is_aligned