Nektar++
sse2.hpp
///////////////////////////////////////////////////////////////////////////////
//
// File: sse2.hpp
//
// For more information, please see: http://www.nektar.info
//
// The MIT License
//
// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
// Department of Aeronautics, Imperial College London (UK), and Scientific
// Computing and Imaging Institute, University of Utah (USA).
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Description: Vector type using sse2 extension.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif
#include "traits.hpp"
#include <cstdint>

namespace tinysimd
{

namespace abi
{

template <typename scalarType> struct sse2
{
    using type = void;
};

} // namespace abi

#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)

// forward declaration of concrete types
template <typename T> struct sse2Int4;

namespace abi
{

// mapping between abstract types and concrete types
template <> struct sse2<std::int32_t>
{
    using type = sse2Int4<std::int32_t>;
};
template <> struct sse2<std::uint32_t>
{
    using type = sse2Int4<std::uint32_t>;
};

} // namespace abi

// concrete types
template <typename T> struct sse2Int4
{
    static_assert(std::is_integral<T>::value && sizeof(T) == 4,
                  "4 bytes Integral required.");

    static constexpr unsigned int width = 4;
    static constexpr unsigned int alignment = 16;

    using scalarType = T;
    using vectorType = __m128i;
    using scalarArray = scalarType[width];

    // storage
    vectorType _data;

    // ctors
    inline sse2Int4() = default;
    inline sse2Int4(const sse2Int4 &rhs) = default;
    inline sse2Int4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline sse2Int4(const scalarType rhs)
    {
        _data = _mm_set1_epi32(rhs);
    }

    // store
    inline void store(scalarType *p) const
    {
        _mm_store_si128(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm_store_si128(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm_storeu_si128(reinterpret_cast<vectorType *>(p), _data);
    }

    inline void load(const scalarType *p)
    {
        _data = _mm_load_si128(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm_load_si128(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm_loadu_si128(reinterpret_cast<const vectorType *>(p));
    }

    // gather/scatter with sse2
    // (SSE2 has no gather/scatter intrinsics, so both go element by element)
    inline void gather(scalarType const *p, const sse2Int4<T> &indices)
    {
        alignas(alignment) scalarArray idx;
        _mm_store_si128(reinterpret_cast<vectorType *>(idx), indices._data);

        alignas(alignment) scalarArray tmp = {p[idx[0]], p[idx[1]],
                                              p[idx[2]], p[idx[3]]};
        _data = _mm_load_si128(reinterpret_cast<const vectorType *>(tmp));
    }

    inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
    {
        // spill data and indices to aligned temporaries, then scatter
        // one element at a time
        alignas(alignment) scalarArray tmp;
        alignas(alignment) scalarArray idx;
        _mm_store_si128(reinterpret_cast<vectorType *>(tmp), _data);
        _mm_store_si128(reinterpret_cast<vectorType *>(idx), indices._data);

        out[idx[0]] = tmp[0];
        out[idx[1]] = tmp[1];
        out[idx[2]] = tmp[2];
        out[idx[3]] = tmp[3];
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm_set1_epi32(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }
};

#endif // defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)

} // namespace tinysimd
#endif
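
Below is a minimal usage sketch, not part of sse2.hpp itself. It assumes a translation unit compiled for x86-64 with SSE2 available and NEKTAR_ENABLE_SIMD_SSE2 defined (so that abi::sse2 maps to sse2Int4 rather than void), and that the is_aligned flag used by operator[] above is provided by traits.hpp. The file name and printed values are illustrative only.

// usage_sketch.cpp (hypothetical file, not part of the library)
#include "sse2.hpp"

#include <cstdint>
#include <iostream>

int main()
{
    // resolve the abstract abi tag to the concrete vector type
    using vec_t = tinysimd::abi::sse2<std::int32_t>::type; // sse2Int4<std::int32_t>

    alignas(vec_t::alignment) std::int32_t in[vec_t::width] = {10, 20, 30, 40};
    alignas(vec_t::alignment) std::int32_t out[vec_t::width] = {};

    vec_t v;
    v.load(in, tinysimd::is_aligned);   // aligned load overload
    v.store(out, tinysimd::is_aligned); // aligned store overload

    vec_t c(7);     // broadcast constructor: all four lanes hold 7
    c.broadcast(3); // re-fill all lanes with 3

    // gather in[3], in[2], in[1], in[0] using a vector of indices
    alignas(vec_t::alignment) std::int32_t idx[vec_t::width] = {3, 2, 1, 0};
    vec_t indices;
    indices.load(idx, tinysimd::is_aligned);
    vec_t g;
    g.gather(in, indices);

    // subscripting spills the register to memory; convenient, but avoid in hot loops
    std::cout << g[0] << " " << g[3] << std::endl; // prints "40 10"
    std::cout << out[2] << std::endl;              // prints "30"
    return 0;
}

The flag-tagged load/store overloads are selected at compile time through the is_requiring_alignment and is_streaming traits, so the choice between aligned and unaligned moves costs nothing at run time.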