Nektar++
sse2.hpp
///////////////////////////////////////////////////////////////////////////////
//
// File: sse2.hpp
//
// For more information, please see: http://www.nektar.info
//
// The MIT License
//
// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
// Department of Aeronautics, Imperial College London (UK), and Scientific
// Computing and Imaging Institute, University of Utah (USA).
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Description: Vector type using the SSE2 extension.
// Note that this is not a full implementation: only the 32-bit integer
// types are implemented, to serve as the index type for the AVX2 back-end.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H
#define NEKTAR_LIB_LIBUTILITES_SIMDLIB_SSE2_H

#if defined(__x86_64__)
#include <immintrin.h>
#if defined(__INTEL_COMPILER) && !defined(TINYSIMD_HAS_SVML)
#define TINYSIMD_HAS_SVML
#endif
#endif
#include "traits.hpp"
#include <cstdint>

namespace tinysimd::abi
{

template <typename scalarType> struct sse2
{
    using type = void;
};

} // namespace tinysimd::abi

#if defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)

namespace tinysimd
{

// forward declaration of concrete types
template <typename T> struct sse2Int4;

namespace abi
{

// mapping between abstract types and concrete types
template <> struct sse2<std::int32_t>
{
    using type = sse2Int4<std::int32_t>;
};
template <> struct sse2<std::uint32_t>
{
    using type = sse2Int4<std::uint32_t>;
};

} // namespace abi

82// concrete types
83template <typename T> struct sse2Int4
84{
85 static_assert(std::is_integral<T>::value && sizeof(T) == 4,
86 "4 bytes Integral required.");
87
88 static constexpr unsigned int width = 4;
89 static constexpr unsigned int alignment = 16;
90
91 using scalarType = T;
92 using vectorType = __m128i;
93 using scalarArray = scalarType[width];
94
95 // storage
96 vectorType _data;
97
    // ctors
    inline sse2Int4() = default;
    inline sse2Int4(const sse2Int4 &rhs) = default;
    inline sse2Int4(const vectorType &rhs) : _data(rhs)
    {
    }
    inline sse2Int4(const scalarType rhs)
    {
        _data = _mm_set1_epi32(rhs);
    }

    // store
    inline void store(scalarType *p) const
    {
        _mm_store_si128(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm_store_si128(reinterpret_cast<vectorType *>(p), _data);
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void store(scalarType *p, flag) const
    {
        _mm_storeu_si128(reinterpret_cast<vectorType *>(p), _data);
    }

    inline void load(const scalarType *p)
    {
        _data = _mm_load_si128(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<is_requiring_alignment<flag>::value &&
                                          !is_streaming<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm_load_si128(reinterpret_cast<const vectorType *>(p));
    }

    template <class flag,
              typename std::enable_if<!is_requiring_alignment<flag>::value,
                                      bool>::type = 0>
    inline void load(const scalarType *p, flag)
    {
        _data = _mm_loadu_si128(reinterpret_cast<const vectorType *>(p));
    }

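    // The flag-tagged load/store overloads above dispatch at compile time
    // between the aligned (_mm_load_si128 / _mm_store_si128) and unaligned
    // (_mm_loadu_si128 / _mm_storeu_si128) intrinsics, for example:
    //
    //   vec.load(ptr, is_aligned); // ptr must be 16-byte aligned
    //   vec.load(ptr);             // default overload, also assumes alignment
    //
    // (is_aligned is the dispatch tag provided by the tinysimd traits.)
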
    // gather/scatter with sse2
    inline void gather(scalarType const *p, const sse2Int4<T> &indices)
    {
        // _mm_i32gather_epi32 is an AVX2 intrinsic; the scale must match the
        // 4-byte element size so the indices address whole 32-bit lanes.
        _data = _mm_i32gather_epi32(reinterpret_cast<const int *>(p),
                                    indices._data, 4);
    }

    inline void scatter(scalarType *out, const sse2Int4<T> &indices) const
    {
        // no scatter intrinsics for sse2
        alignas(alignment) scalarArray tmp;
        _mm_store_si128(reinterpret_cast<vectorType *>(tmp), _data);

        // _mm_extract_epi32 is SSE4.1
        out[_mm_extract_epi32(indices._data, 0)] = tmp[0];
        out[_mm_extract_epi32(indices._data, 1)] = tmp[1];
        out[_mm_extract_epi32(indices._data, 2)] = tmp[2];
        out[_mm_extract_epi32(indices._data, 3)] = tmp[3];
    }

    inline void broadcast(const scalarType rhs)
    {
        _data = _mm_set1_epi32(rhs);
    }

    // subscript
    // subscript operators are convenient but expensive
    // should not be used in optimized kernels
    inline scalarType operator[](size_t i) const
    {
        alignas(alignment) scalarArray tmp;
        store(tmp, is_aligned);
        return tmp[i];
    }
};

} // namespace tinysimd

#endif // defined(__SSE2__) && defined(NEKTAR_ENABLE_SIMD_SSE2)

#endif
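
A minimal usage sketch (not part of the header) showing how the concrete type
is reached through the abi mapping and exercised via load/store, the broadcast
constructor, and the subscript operator. It assumes a build with
NEKTAR_ENABLE_SIMD_SSE2 defined on an SSE2-capable x86-64 target; the array
contents and printed values are illustrative only.

    #include "sse2.hpp"

    #include <cstdint>
    #include <iostream>

    int main()
    {
        // abi::sse2 maps the scalar type to the concrete SIMD type
        using vec_t = tinysimd::abi::sse2<std::int32_t>::type;

        alignas(vec_t::alignment) std::int32_t in[vec_t::width] = {10, 20, 30, 40};
        alignas(vec_t::alignment) std::int32_t out[vec_t::width];

        vec_t v;
        v.load(in);   // aligned load (default overload)
        v.store(out); // aligned store

        vec_t b(7);   // broadcast constructor: all four lanes set to 7
        std::cout << b[2] << ' ' << v[3] << '\n'; // prints "7 40"
        return 0;
    }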