37 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H 
   38 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H 
   40 #if defined(__x86_64__) 
   41 #include <immintrin.h> 
   42 #if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML) 
   43 #define TINYSIMD_HAS_SVML 
   55 template <
typename scalarType> 
struct sse2 
   62 #if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2) 
   // Forward declaration of the 4-lane 32-bit integer vector; it is needed by
   // the sse2<> trait specialisations that name it before it is defined.
   template <class T> struct sse2Int4;
 
   71 template <> 
struct sse2<std::int32_t>
 
   73     using type = sse2Int4<std::int32_t>;
 
   75 template <> 
struct sse2<std::uint32_t>
 
   77     using type = sse2Int4<std::uint32_t>;
 
   83 template <
typename T> 
struct sse2Int4
 
   85     static_assert(std::is_integral<T>::value && 
sizeof(T) == 4,
 
   86                   "4 bytes Integral required.");
 
   88     static constexpr 
unsigned int width     = 4;
 
   89     static constexpr 
unsigned int alignment = 16;
 
   92     using vectorType  = __m128i;
 
   93     using scalarArray = scalarType[width];
 
   99     inline sse2Int4()                    = 
default;
 
  100     inline sse2Int4(
const sse2Int4 &rhs) = 
default;
 
  101     inline sse2Int4(
const vectorType &rhs) : _data(rhs)
 
  104     inline sse2Int4(
const scalarType rhs)
 
  106         _data = _mm_set1_epi32(rhs);
 
  110     inline void store(scalarType *
p)
 const 
  112         _mm_store_si128(
reinterpret_cast<vectorType *
>(
p), _data);
 
  // Tag-dispatched store, enabled when `flag` requires alignment and is not a
  // streaming tag: writes _data to p with _mm_store_si128 (aligned store).
  // NOTE(review): the tail of the enable_if template parameter is not visible
  // in this view - confirm the template header is complete.
  115     template <
class flag,
 
  116               typename std::enable_if<is_requiring_alignment<flag>::value &&
 
  117                                           !is_streaming<flag>::value,
 
  119     inline void store(scalarType *
p, flag)
 const 
  121         _mm_store_si128(
reinterpret_cast<vectorType *
>(
p), _data);
 
  // Tag-dispatched store, enabled when `flag` does not require alignment:
  // writes _data to p with _mm_storeu_si128 (unaligned store).
  // NOTE(review): the tail of the enable_if template parameter is not visible
  // in this view - confirm the template header is complete.
  124     template <
class flag,
 
  125               typename std::enable_if<!is_requiring_alignment<flag>::value,
 
  127     inline void store(scalarType *
p, flag)
 const 
  129         _mm_storeu_si128(
reinterpret_cast<vectorType *
>(
p), _data);
 
  132     inline void load(
const scalarType *
p)
 
  134         _data = _mm_load_si128(
reinterpret_cast<const vectorType *
>(
p));
 
  // Tag-dispatched load, enabled when `flag` requires alignment and is not a
  // streaming tag: reads _data from p with _mm_load_si128 (aligned load).
  // NOTE(review): the tail of the enable_if template parameter is not visible
  // in this view - confirm the template header is complete.
  137     template <
class flag,
 
  138               typename std::enable_if<is_requiring_alignment<flag>::value &&
 
  139                                           !is_streaming<flag>::value,
 
  141     inline void load(
const scalarType *
p, flag)
 
  143         _data = _mm_load_si128(
reinterpret_cast<const vectorType *
>(
p));
 
  // Tag-dispatched load, enabled when `flag` does not require alignment:
  // reads _data from p with _mm_loadu_si128 (unaligned load).
  // NOTE(review): the tail of the enable_if template parameter is not visible
  // in this view - confirm the template header is complete.
  146     template <
class flag,
 
  147               typename std::enable_if<!is_requiring_alignment<flag>::value,
 
  149     inline void load(
const scalarType *
p, flag)
 
  151         _data = _mm_loadu_si128(
reinterpret_cast<const vectorType *
>(
p));
 
  155     inline void gather(scalarType 
const *
p, 
const sse2Int4<T> &indices)
 
  157         _data = _mm_i32gather_pd(
p, indices._data, 8);
 
  160     inline void scatter(scalarType *out, 
const sse2Int4<T> &indices)
 const 
  163         alignas(alignment) scalarArray tmp;
 
  164         _mm_store_pd(tmp, _data);
 
  166         out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; 
 
  167         out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
 
  170     inline void broadcast(
const scalarType rhs)
 
  172         _data = _mm_set1_epi32(rhs);
 
  178     inline scalarType operator[](
size_t i)
 const 
  180         alignas(alignment) scalarArray tmp;
 
static constexpr struct tinysimd::is_aligned_t is_aligned