Nektar++
sse2.hpp
Go to the documentation of this file.
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: sse2.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Vector type using sse2 extension.
32 //
33 ///////////////////////////////////////////////////////////////////////////////
34 
35 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
36 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
37 
38 #include <immintrin.h>
39 #include <cstdint>
40 #include "traits.hpp"
41 
42 namespace tinysimd
43 {
44 
namespace abi
{

/// @brief Compile-time map from a scalar type to the SSE2 vector type that
/// packs it.
///
/// The primary template sets `type` to `void`; explicit specialisations
/// replace `type` with a concrete vector struct for the scalar types they
/// cover.
///
/// @tparam scalarType scalar element type to look up.
template <typename scalarType>
struct sse2
{
    using type = void;
};

} // namespace abi
55 
56 
57 #if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
58 
// Forward declaration of the concrete vector type, so it can be named
// before its full definition appears.
template <typename T>
struct sse2Int4;
62 
63 namespace abi
64 {
65 
66 // mapping between abstract types and concrete types
67 template <> struct sse2<std::int32_t> { using type = sse2Int4<std::int32_t>; };
68 template <> struct sse2<std::uint32_t> { using type = sse2Int4<std::uint32_t>; };
69 
70 } // namespace abi
71 
72 // concrete types
73 template <typename T>
74 struct sse2Int4
75 {
76  static_assert(std::is_integral<T>::value && sizeof(T) == 4,
77  "4 bytes Integral required.");
78 
79  static constexpr unsigned int width = 4;
80  static constexpr unsigned int alignment = 16;
81 
82  using scalarType = T;
83  using vectorType = __m128i;
84  using scalarArray = scalarType[width];
85 
86  // storage
87  vectorType _data;
88 
89  // ctors
90  inline sse2Int4() = default;
91  inline sse2Int4(const sse2Int4& rhs) = default;
92  inline sse2Int4(const vectorType& rhs) : _data(rhs){}
93  inline sse2Int4(const scalarType rhs)
94  {
95  _data = _mm_set1_epi32(rhs);
96  }
97 
98  // store
99  inline void store(scalarType* p) const
100  {
101  _mm_store_si128(reinterpret_cast<vectorType*>(p), _data);
102  }
103 
104  template
105  <
106  class flag,
107  typename std::enable_if<
108  is_requiring_alignment<flag>::value &&
109  !is_streaming<flag>::value, bool
110  >::type = 0
111  >
112  inline void store(scalarType* p, flag) const
113  {
114  _mm_store_si128(reinterpret_cast<vectorType*>(p), _data);
115  }
116 
117  template
118  <
119  class flag,
120  typename std::enable_if<
121  !is_requiring_alignment<flag>::value, bool
122  >::type = 0
123  >
124  inline void store(scalarType* p, flag) const
125  {
126  _mm_storeu_si128(reinterpret_cast<vectorType*>(p), _data);
127  }
128 
129  inline void load(const scalarType* p)
130  {
131  _data = _mm_load_si128(reinterpret_cast<const vectorType*>(p));
132  }
133 
134  template
135  <
136  class flag,
137  typename std::enable_if<
138  is_requiring_alignment<flag>::value &&
139  !is_streaming<flag>::value, bool
140  >::type = 0
141  >
142  inline void load(const scalarType* p, flag)
143  {
144  _data = _mm_load_si128(reinterpret_cast<const vectorType*>(p));
145  }
146 
147  template
148  <
149  class flag,
150  typename std::enable_if<
151  !is_requiring_alignment<flag>::value, bool
152  >::type = 0
153  >
154  inline void load(const scalarType* p, flag)
155  {
156  _data = _mm_loadu_si128(reinterpret_cast<const vectorType*>(p));
157  }
158 
159 
160  // gather/scatter with sse2
161  inline void gather(scalarType const* p, const sse2Int4<T>& indices)
162  {
163  _data = _mm256_i32gather_pd(p, indices._data, 8);
164  }
165 
166  inline void scatter(scalarType* out, const sse2Int4<T>& indices) const
167  {
168  // no scatter intrinsics for AVX2
169  alignas(alignment) scalarArray tmp;
170  _mm256_store_pd(tmp, _data);
171 
172  out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
173  out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
174  out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
175  out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
176  }
177 
178  inline void broadcast(const scalarType rhs)
179  {
180  _data = _mm_set1_epi32(rhs);
181  }
182 
183  // subscript
184  // subscript operators are convienient but expensive
185  // should not be used in optimized kernels
186  inline scalarType operator[](size_t i) const
187  {
188  alignas(alignment) scalarArray tmp;
189  store(tmp, is_aligned);
190  return tmp[i];
191  }
192 
193  inline scalarType& operator[](size_t i)
194  {
195  scalarType* tmp = reinterpret_cast<scalarType*>(&_data);
196  return tmp[i];
197  }
198 
199 
200 };
201 
202 #endif // defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
203 
204 } // namespace tinysimd
205 #endif
static constexpr struct tinysimd::is_aligned_t is_aligned