35 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
36 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
38 #include <immintrin.h>
48 template <
typename scalarType>
57 #if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
67 template <>
struct sse2<std::int32_t> {
using type = sse2Int4<std::int32_t>; };
68 template <>
struct sse2<std::uint32_t> {
using type = sse2Int4<std::uint32_t>; };
76 static_assert(std::is_integral<T>::value &&
sizeof(T) == 4,
77 "4 bytes Integral required.");
79 static constexpr
unsigned int width = 4;
80 static constexpr
unsigned int alignment = 16;
83 using vectorType = __m128i;
84 using scalarArray = scalarType[width];
90 inline sse2Int4() =
default;
91 inline sse2Int4(
const sse2Int4& rhs) =
default;
92 inline sse2Int4(
const vectorType& rhs) : _data(rhs){}
93 inline sse2Int4(
const scalarType rhs)
95 _data = _mm_set1_epi32(rhs);
99 inline void store(scalarType*
p)
const
101 _mm_store_si128(
reinterpret_cast<vectorType*
>(
p), _data);
107 typename std::enable_if<
108 is_requiring_alignment<flag>::value &&
109 !is_streaming<flag>::value,
bool
112 inline void store(scalarType*
p, flag)
const
114 _mm_store_si128(
reinterpret_cast<vectorType*
>(
p), _data);
120 typename std::enable_if<
121 !is_requiring_alignment<flag>::value,
bool
124 inline void store(scalarType*
p, flag)
const
126 _mm_storeu_si128(
reinterpret_cast<vectorType*
>(
p), _data);
129 inline void load(
const scalarType*
p)
131 _data = _mm_load_si128(
reinterpret_cast<const vectorType*
>(
p));
137 typename std::enable_if<
138 is_requiring_alignment<flag>::value &&
139 !is_streaming<flag>::value,
bool
142 inline void load(
const scalarType*
p, flag)
144 _data = _mm_load_si128(
reinterpret_cast<const vectorType*
>(
p));
150 typename std::enable_if<
151 !is_requiring_alignment<flag>::value,
bool
154 inline void load(
const scalarType*
p, flag)
156 _data = _mm_loadu_si128(
reinterpret_cast<const vectorType*
>(
p));
161 inline void gather(scalarType
const*
p,
const sse2Int4<T>& indices)
163 _data = _mm256_i32gather_pd(
p, indices._data, 8);
166 inline void scatter(scalarType* out,
const sse2Int4<T>& indices)
const
169 alignas(alignment) scalarArray tmp;
170 _mm256_store_pd(tmp, _data);
172 out[_mm_extract_epi32(indices._data, 0)] = tmp[0];
173 out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
174 out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
175 out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
178 inline void broadcast(
const scalarType rhs)
180 _data = _mm_set1_epi32(rhs);
186 inline scalarType operator[](
size_t i)
const
188 alignas(alignment) scalarArray tmp;
193 inline scalarType& operator[](
size_t i)
195 scalarType* tmp =
reinterpret_cast<scalarType*
>(&_data);
static constexpr struct tinysimd::is_aligned_t is_aligned