Nektar++
VmathSIMD.hpp
Go to the documentation of this file.
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: VmathSIMD.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Collection of templated functions for vector mathematics using
32 // SIMD types, some are functions have optimization tricks such as manual loop
33 // unrolling.
34 //
35 ///////////////////////////////////////////////////////////////////////////////
36 
37 #ifndef NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
38 #define NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
39 
43 
44 namespace Vmath
45 {
46 namespace SIMD
47 {
48 /// \brief Multiply vector z = x + y
49 template <class T, typename = typename std::enable_if<
50  std::is_floating_point<T>::value>::type>
51 void Vadd(const size_t n, const T *x, const T *y, T *z)
52 {
53  using namespace tinysimd;
54  using vec_t = simd<T>;
55 
56  size_t cnt = n;
57  // Vectorized loop unroll 4x
58  while (cnt >= 4 * vec_t::width)
59  {
60  // load
61  vec_t yChunk0, yChunk1, yChunk2, yChunk3;
62  yChunk0.load(y, is_not_aligned);
63  yChunk1.load(y + vec_t::width, is_not_aligned);
64  yChunk2.load(y + 2 * vec_t::width, is_not_aligned);
65  yChunk3.load(y + 3 * vec_t::width, is_not_aligned);
66 
67  vec_t xChunk0, xChunk1, xChunk2, xChunk3;
68  xChunk0.load(x, is_not_aligned);
69  xChunk1.load(x + vec_t::width, is_not_aligned);
70  xChunk2.load(x + 2 * vec_t::width, is_not_aligned);
71  xChunk3.load(x + 3 * vec_t::width, is_not_aligned);
72 
73  // z = x + y
74  vec_t zChunk0 = xChunk0 + yChunk0;
75  vec_t zChunk1 = xChunk1 + yChunk1;
76  vec_t zChunk2 = xChunk2 + yChunk2;
77  vec_t zChunk3 = xChunk3 + yChunk3;
78 
79  // store
80  zChunk0.store(z, is_not_aligned);
81  zChunk1.store(z + vec_t::width, is_not_aligned);
82  zChunk2.store(z + 2 * vec_t::width, is_not_aligned);
83  zChunk3.store(z + 3 * vec_t::width, is_not_aligned);
84 
85  // update pointers
86  x += 4 * vec_t::width;
87  y += 4 * vec_t::width;
88  z += 4 * vec_t::width;
89  cnt -= 4 * vec_t::width;
90  }
91 
92  // Vectorized loop unroll 2x
93  while (cnt >= 2 * vec_t::width)
94  {
95  // load
96  vec_t yChunk0, yChunk1;
97  yChunk0.load(y, is_not_aligned);
98  yChunk1.load(y + vec_t::width, is_not_aligned);
99 
100  vec_t xChunk0, xChunk1;
101  xChunk0.load(x, is_not_aligned);
102  xChunk1.load(x + vec_t::width, is_not_aligned);
103 
104  // z = x + y
105  vec_t zChunk0 = xChunk0 + yChunk0;
106  vec_t zChunk1 = xChunk1 + yChunk1;
107 
108  // store
109  zChunk0.store(z, is_not_aligned);
110  zChunk1.store(z + vec_t::width, is_not_aligned);
111 
112  // update pointers
113  x += 2 * vec_t::width;
114  y += 2 * vec_t::width;
115  z += 2 * vec_t::width;
116  cnt -= 2 * vec_t::width;
117  }
118 
119  // Vectorized loop
120  while (cnt >= vec_t::width)
121  {
122  // load
123  vec_t yChunk;
124  yChunk.load(y, is_not_aligned);
125  vec_t xChunk;
126  xChunk.load(x, is_not_aligned);
127 
128  // z = x + y
129  vec_t zChunk = xChunk + yChunk;
130 
131  // store
132  zChunk.store(z, is_not_aligned);
133 
134  // update pointers
135  x += vec_t::width;
136  y += vec_t::width;
137  z += vec_t::width;
138  cnt -= vec_t::width;
139  }
140 
141  // spillover loop
142  while (cnt)
143  {
144  // z = x + y;
145  *z = (*x) + (*y);
146  // update pointers
147  ++x;
148  ++y;
149  ++z;
150  --cnt;
151  }
152 }
153 
154 /// \brief Multiply vector z = x * y
155 template <class T, typename = typename std::enable_if<
156  std::is_floating_point<T>::value>::type>
157 void Vmul(const size_t n, const T *x, const T *y, T *z)
158 {
159  using namespace tinysimd;
160  using vec_t = simd<T>;
161 
162  size_t cnt = n;
163  // Vectorized loop unroll 4x
164  while (cnt >= 4 * vec_t::width)
165  {
166  // load
167  vec_t yChunk0, yChunk1, yChunk2, yChunk3;
168  yChunk0.load(y, is_not_aligned);
169  yChunk1.load(y + vec_t::width, is_not_aligned);
170  yChunk2.load(y + 2 * vec_t::width, is_not_aligned);
171  yChunk3.load(y + 3 * vec_t::width, is_not_aligned);
172 
173  vec_t xChunk0, xChunk1, xChunk2, xChunk3;
174  xChunk0.load(x, is_not_aligned);
175  xChunk1.load(x + vec_t::width, is_not_aligned);
176  xChunk2.load(x + 2 * vec_t::width, is_not_aligned);
177  xChunk3.load(x + 3 * vec_t::width, is_not_aligned);
178 
179  // z = x * y
180  vec_t zChunk0 = xChunk0 * yChunk0;
181  vec_t zChunk1 = xChunk1 * yChunk1;
182  vec_t zChunk2 = xChunk2 * yChunk2;
183  vec_t zChunk3 = xChunk3 * yChunk3;
184 
185  // store
186  zChunk0.store(z, is_not_aligned);
187  zChunk1.store(z + vec_t::width, is_not_aligned);
188  zChunk2.store(z + 2 * vec_t::width, is_not_aligned);
189  zChunk3.store(z + 3 * vec_t::width, is_not_aligned);
190 
191  // update pointers
192  x += 4 * vec_t::width;
193  y += 4 * vec_t::width;
194  z += 4 * vec_t::width;
195  cnt -= 4 * vec_t::width;
196  }
197 
198  // Vectorized loop unroll 2x
199  while (cnt >= 2 * vec_t::width)
200  {
201  // load
202  vec_t yChunk0, yChunk1;
203  yChunk0.load(y, is_not_aligned);
204  yChunk1.load(y + vec_t::width, is_not_aligned);
205 
206  vec_t xChunk0, xChunk1;
207  xChunk0.load(x, is_not_aligned);
208  xChunk1.load(x + vec_t::width, is_not_aligned);
209 
210  // z = x * y
211  vec_t zChunk0 = xChunk0 * yChunk0;
212  vec_t zChunk1 = xChunk1 * yChunk1;
213 
214  // store
215  zChunk0.store(z, is_not_aligned);
216  zChunk1.store(z + vec_t::width, is_not_aligned);
217 
218  // update pointers
219  x += 2 * vec_t::width;
220  y += 2 * vec_t::width;
221  z += 2 * vec_t::width;
222  cnt -= 2 * vec_t::width;
223  }
224 
225  // Vectorized loop
226  while (cnt >= vec_t::width)
227  {
228  // load
229  vec_t yChunk;
230  yChunk.load(y, is_not_aligned);
231  vec_t xChunk;
232  xChunk.load(x, is_not_aligned);
233 
234  // z = x * y
235  vec_t zChunk = xChunk * yChunk;
236 
237  // store
238  zChunk.store(z, is_not_aligned);
239 
240  // update pointers
241  x += vec_t::width;
242  y += vec_t::width;
243  z += vec_t::width;
244  cnt -= vec_t::width;
245  }
246 
247  // spillover loop
248  while (cnt)
249  {
250  // z = x * y;
251  *z = (*x) * (*y);
252  // update pointers
253  ++x;
254  ++y;
255  ++z;
256  --cnt;
257  }
258 }
259 
260 /// \brief vvtvp (vector times vector plus vector): z = w*x + y
261 template <class T, typename = typename std::enable_if<
262  std::is_floating_point<T>::value>::type>
263 void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
264 {
265  using namespace tinysimd;
266  using vec_t = simd<T>;
267 
268  size_t cnt = n;
269  // Vectorized loop
270  while (cnt >= vec_t::width)
271  {
272  // load
273  vec_t wChunk;
274  wChunk.load(w, is_not_aligned);
275  vec_t yChunk;
276  yChunk.load(y, is_not_aligned);
277  vec_t xChunk;
278  xChunk.load(x, is_not_aligned);
279 
280  // z = w * x + y
281  vec_t zChunk = wChunk * xChunk + yChunk;
282 
283  // store
284  zChunk.store(z, is_not_aligned);
285 
286  // update pointers
287  w += vec_t::width;
288  x += vec_t::width;
289  y += vec_t::width;
290  z += vec_t::width;
291  cnt -= vec_t::width;
292  }
293 
294  // spillover loop
295  while (cnt)
296  {
297  // z = w * x + y;
298  *z = (*w) * (*x) + (*y);
299  // update pointers
300  ++w;
301  ++x;
302  ++y;
303  ++z;
304  --cnt;
305  }
306 }
307 
308 /// \brief vvtvm (vector times vector plus vector): z = w*x - y
309 template <class T, typename = typename std::enable_if<
310  std::is_floating_point<T>::value>::type>
311 void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
312 {
313  using namespace tinysimd;
314  using vec_t = simd<T>;
315 
316  size_t cnt = n;
317  // Vectorized loop
318  while (cnt >= vec_t::width)
319  {
320  // load
321  vec_t wChunk;
322  wChunk.load(w, is_not_aligned);
323  vec_t yChunk;
324  yChunk.load(y, is_not_aligned);
325  vec_t xChunk;
326  xChunk.load(x, is_not_aligned);
327 
328  // z = w * x - y
329  vec_t zChunk = wChunk * xChunk - yChunk;
330 
331  // store
332  zChunk.store(z, is_not_aligned);
333 
334  // update pointers
335  w += vec_t::width;
336  x += vec_t::width;
337  y += vec_t::width;
338  z += vec_t::width;
339  cnt -= vec_t::width;
340  }
341 
342  // spillover loop
343  while (cnt)
344  {
345  // z = w * x - y;
346  *z = (*w) * (*x) - (*y);
347  // update pointers
348  ++w;
349  ++x;
350  ++y;
351  ++z;
352  --cnt;
353  }
354 }
355 
356 /// \brief vvtvvtp (vector times vector plus vector times vector):
357 // z = v*w + x*y
358 template <class T, typename = typename std::enable_if<
359  std::is_floating_point<T>::value>::type>
360 inline void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x,
361  const T *y, T *z)
362 {
363  using namespace tinysimd;
364  using vec_t = simd<T>;
365 
366  size_t cnt = n;
367  // Vectorized loop
368  while (cnt >= vec_t::width)
369  {
370  // load
371  vec_t vChunk;
372  vChunk.load(v, is_not_aligned);
373  vec_t wChunk;
374  wChunk.load(w, is_not_aligned);
375  vec_t yChunk;
376  yChunk.load(y, is_not_aligned);
377  vec_t xChunk;
378  xChunk.load(x, is_not_aligned);
379 
380  // z = v * w + x * y;
381  vec_t z1Chunk = vChunk * wChunk;
382  vec_t z2Chunk = xChunk * yChunk;
383  vec_t zChunk = z1Chunk + z2Chunk;
384 
385  // store
386  zChunk.store(z, is_not_aligned);
387 
388  // update pointers
389  v += vec_t::width;
390  w += vec_t::width;
391  x += vec_t::width;
392  y += vec_t::width;
393  z += vec_t::width;
394  cnt -= vec_t::width;
395  }
396 
397  // spillover loop
398  while (cnt)
399  {
400  // z = v * w + x * y;
401  T z1 = (*v) * (*w);
402  T z2 = (*x) * (*y);
403  *z = z1 + z2;
404  // update pointers
405  ++v;
406  ++w;
407  ++x;
408  ++y;
409  ++z;
410  --cnt;
411  }
412 }
413 
414 /// \brief Gather vector z[i] = x[y[i]]
415 template <class T, class I,
416  typename = typename std::enable_if<std::is_floating_point<T>::value &&
417  std::is_integral<I>::value>::type>
418 void Gathr(const I n, const T *x, const I *y, T *z)
419 {
420  using namespace tinysimd;
421  using vec_t = simd<T>;
422  using vec_t_i = simd<I>;
423 
424  I cnt = n;
425  // Unroll 4x Vectorized loop
426  while (cnt >= 4 * vec_t::width)
427  {
428  // load index
429  vec_t_i yChunk0, yChunk1, yChunk2, yChunk3;
430  yChunk0.load(y, is_not_aligned);
431  yChunk1.load(y + vec_t_i::width, is_not_aligned);
432  yChunk2.load(y + 2 * vec_t_i::width, is_not_aligned);
433  yChunk3.load(y + 3 * vec_t_i::width, is_not_aligned);
434 
435  // z = x[y[i]]
436  vec_t zChunk0, zChunk1, zChunk2, zChunk3;
437  zChunk0.gather(x, yChunk0);
438  zChunk1.gather(x, yChunk1);
439  zChunk2.gather(x, yChunk2);
440  zChunk3.gather(x, yChunk3);
441 
442  // store
443  zChunk0.store(z, is_not_aligned);
444  zChunk1.store(z + vec_t_i::width, is_not_aligned);
445  zChunk2.store(z + 2 * vec_t_i::width, is_not_aligned);
446  zChunk3.store(z + 3 * vec_t_i::width, is_not_aligned);
447 
448  // update pointers
449  y += 4 * vec_t_i::width;
450  z += 4 * vec_t::width;
451  cnt -= 4 * vec_t::width;
452  }
453 
454  // Unroll 2x Vectorized loop
455  while (cnt >= 2 * vec_t::width)
456  {
457  // load index
458  vec_t_i yChunk0, yChunk1;
459  yChunk0.load(y, is_not_aligned);
460  yChunk1.load(y + vec_t_i::width, is_not_aligned);
461 
462  // z = x[y[i]]
463  vec_t zChunk0, zChunk1;
464  zChunk0.gather(x, yChunk0);
465  zChunk1.gather(x, yChunk1);
466 
467  // store
468  zChunk0.store(z, is_not_aligned);
469  zChunk1.store(z + vec_t_i::width, is_not_aligned);
470 
471  // update pointers
472  y += 2 * vec_t_i::width;
473  z += 2 * vec_t::width;
474  cnt -= 2 * vec_t::width;
475  }
476 
477  // Vectorized loop
478  while (cnt >= vec_t::width)
479  {
480  // load index
481  vec_t_i yChunk;
482  yChunk.load(y, is_not_aligned);
483 
484  // z = x[y[i]]
485  vec_t zChunk;
486  zChunk.gather(x, yChunk);
487 
488  // store
489  zChunk.store(z, is_not_aligned);
490 
491  // update pointers
492  y += vec_t_i::width;
493  z += vec_t::width;
494  cnt -= vec_t::width;
495  }
496 
497  // spillover loop
498  while (cnt)
499  {
500  // z = x[y[i]]
501  *z = *(x + *y);
502  // update pointers
503  ++y;
504  ++z;
505  --cnt;
506  }
507 }
508 
509 } // namespace SIMD
510 } // namespace Vmath
511 #endif
tinysimd::simd< NekDouble > vec_t
void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvp (vector times vector plus vector): z = w*x + y
Definition: VmathSIMD.hpp:263
void Vadd(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x + y.
Definition: VmathSIMD.hpp:51
void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvm (vector times vector plus vector): z = w*x - y
Definition: VmathSIMD.hpp:311
void Gathr(const I n, const T *x, const I *y, T *z)
Gather vector z[i] = x[y[i]].
Definition: VmathSIMD.hpp:418
void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtp (vector times vector plus vector times vector):
Definition: VmathSIMD.hpp:360
void Vmul(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x * y.
Definition: VmathSIMD.hpp:157
Definition: Vmath.cpp:40
static constexpr struct tinysimd::is_not_aligned_t is_not_aligned
typename abi< ScalarType, width >::type simd
Definition: tinysimd.hpp:80