Nektar++
sse2.hpp
Go to the documentation of this file.
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: sse2.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
// Description: Vector type using the SSE2 extension.
// Note that this is not a full implementation: only 32-bit integer types
// are implemented, serving as the supporting index type for AVX2.
34 //
35 ///////////////////////////////////////////////////////////////////////////////
36 
37 #ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
38 #define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
39 
40 #if defined(__x86_64__)
41 #include <immintrin.h>
42 #if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
43 #define TINYSIMD_HAS_SVML
44 #endif
45 #endif
46 #include "traits.hpp"
47 #include <cstdint>
48 
49 namespace tinysimd
50 {
51 
namespace abi
{

// Generic fallback of the abi trait: scalar types that have no SSE2
// vector mapping expose `void`, which callers use to detect the
// absence of SIMD support for that type.
template <typename scalarT> struct sse2
{
    using type = void;
};

} // namespace abi
61 
62 #if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
63 
// Forward declaration of the concrete vector wrapper so the abi trait
// specialisations below can name it before its full definition.
template <typename T> struct sse2Int4;
66 
67 namespace abi
68 {
69 
70 // mapping between abstract types and concrete types
71 template <> struct sse2<std::int32_t>
72 {
73  using type = sse2Int4<std::int32_t>;
74 };
75 template <> struct sse2<std::uint32_t>
76 {
77  using type = sse2Int4<std::uint32_t>;
78 };
79 
80 } // namespace abi
81 
82 // concrete types
83 template <typename T> struct sse2Int4
84 {
85  static_assert(std::is_integral<T>::value && sizeof(T) == 4,
86  "4 bytes Integral required.");
87 
88  static constexpr unsigned int width = 4;
89  static constexpr unsigned int alignment = 16;
90 
91  using scalarType = T;
92  using vectorType = __m128i;
93  using scalarArray = scalarType[width];
94 
95  // storage
96  vectorType _data;
97 
98  // ctors
99  inline sse2Int4() = default;
100  inline sse2Int4(const sse2Int4 &rhs) = default;
101  inline sse2Int4(const vectorType &rhs) : _data(rhs)
102  {
103  }
104  inline sse2Int4(const scalarType rhs)
105  {
106  _data = _mm_set1_epi32(rhs);
107  }
108 
109  // store
110  inline void store(scalarType *p) const
111  {
112  _mm_store_si128(reinterpret_cast<vectorType *>(p), _data);
113  }
114 
115  template <class flag,
116  typename std::enable_if<is_requiring_alignment<flag>::value &&
117  !is_streaming<flag>::value,
118  bool>::type = 0>
119  inline void store(scalarType *p, flag) const
120  {
121  _mm_store_si128(reinterpret_cast<vectorType *>(p), _data);
122  }
123 
124  template <class flag,
125  typename std::enable_if<!is_requiring_alignment<flag>::value,
126  bool>::type = 0>
127  inline void store(scalarType *p, flag) const
128  {
129  _mm_storeu_si128(reinterpret_cast<vectorType *>(p), _data);
130  }
131 
132  inline void load(const scalarType *p)
133  {
134  _data = _mm_load_si128(reinterpret_cast<const vectorType *>(p));
135  }
136 
137  template <class flag,
138  typename std::enable_if<is_requiring_alignment<flag>::value &&
139  !is_streaming<flag>::value,
140  bool>::type = 0>
141  inline void load(const scalarType *p, flag)
142  {
143  _data = _mm_load_si128(reinterpret_cast<const vectorType *>(p));
144  }
145 
146  template <class flag,
147  typename std::enable_if<!is_requiring_alignment<flag>::value,
148  bool>::type = 0>
149  inline void load(const scalarType *p, flag)
150  {
151  _data = _mm_loadu_si128(reinterpret_cast<const vectorType *>(p));
152  }
153 
154  // gather/scatter with sse2
155  inline void gather(scalarType const *p, const sse2Int4<T> &indices)
156  {
157  _data = _mm_i32gather_pd(p, indices._data, 8);
158  }
159 
160  inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
161  {
162  // no scatter intrinsics for AVX2
163  alignas(alignment) scalarArray tmp;
164  _mm_store_pd(tmp, _data);
165 
166  out[_mm_extract_epi32(indices._data, 0)] = tmp[0]; // SSE4.1
167  out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
168  }
169 
170  inline void broadcast(const scalarType rhs)
171  {
172  _data = _mm_set1_epi32(rhs);
173  }
174 
175  // subscript
176  // subscript operators are convienient but expensive
177  // should not be used in optimized kernels
178  inline scalarType operator[](size_t i) const
179  {
180  alignas(alignment) scalarArray tmp;
181  store(tmp, is_aligned);
182  return tmp[i];
183  }
184 };
185 
186 #endif // defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)
187 
188 } // namespace tinysimd
189 #endif
static constexpr struct tinysimd::is_aligned_t is_aligned