Nektar++
VmathSIMD.hpp
Go to the documentation of this file.
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: VmathSIMD.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Collection of templated functions for vector mathematics using
32 // SIMD types, some are functions have optimization tricks such as manual loop
33 // unrolling.
34 //
35 ///////////////////////////////////////////////////////////////////////////////
36 
37 #ifndef NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
38 #define NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
39 
43 
44 namespace Vmath
45 {
46 namespace SIMD
47 {
48 /// \brief Multiply vector z = x + y
49 template <class T, typename = typename std::enable_if<
50  std::is_floating_point<T>::value>::type>
51 void Vadd(const size_t n, const T *x, const T *y, T *z)
52 {
53  using namespace tinysimd;
54  using vec_t = simd<T>;
55 
56  size_t cnt = n;
57  // Vectorized loop unroll 4x
58  while (cnt >= 4 * vec_t::width)
59  {
60  // load
61  vec_t yChunk0, yChunk1, yChunk2, yChunk3;
62  yChunk0.load(y, is_not_aligned);
63  yChunk1.load(y + vec_t::width, is_not_aligned);
64  yChunk2.load(y + 2 * vec_t::width, is_not_aligned);
65  yChunk3.load(y + 3 * vec_t::width, is_not_aligned);
66 
67  vec_t xChunk0, xChunk1, xChunk2, xChunk3;
68  xChunk0.load(x, is_not_aligned);
69  xChunk1.load(x + vec_t::width, is_not_aligned);
70  xChunk2.load(x + 2 * vec_t::width, is_not_aligned);
71  xChunk3.load(x + 3 * vec_t::width, is_not_aligned);
72 
73  // z = x + y
74  vec_t zChunk0 = xChunk0 + yChunk0;
75  vec_t zChunk1 = xChunk1 + yChunk1;
76  vec_t zChunk2 = xChunk2 + yChunk2;
77  vec_t zChunk3 = xChunk3 + yChunk3;
78 
79  // store
80  zChunk0.store(z, is_not_aligned);
81  zChunk1.store(z + vec_t::width, is_not_aligned);
82  zChunk2.store(z + 2 * vec_t::width, is_not_aligned);
83  zChunk3.store(z + 3 * vec_t::width, is_not_aligned);
84 
85  // update pointers
86  x += 4 * vec_t::width;
87  y += 4 * vec_t::width;
88  z += 4 * vec_t::width;
89  cnt -= 4 * vec_t::width;
90  }
91 
92  // Vectorized loop unroll 2x
93  while (cnt >= 2 * vec_t::width)
94  {
95  // load
96  vec_t yChunk0, yChunk1;
97  yChunk0.load(y, is_not_aligned);
98  yChunk1.load(y + vec_t::width, is_not_aligned);
99 
100  vec_t xChunk0, xChunk1;
101  xChunk0.load(x, is_not_aligned);
102  xChunk1.load(x + vec_t::width, is_not_aligned);
103 
104  // z = x + y
105  vec_t zChunk0 = xChunk0 + yChunk0;
106  vec_t zChunk1 = xChunk1 + yChunk1;
107 
108  // store
109  zChunk0.store(z, is_not_aligned);
110  zChunk1.store(z + vec_t::width, is_not_aligned);
111 
112  // update pointers
113  x += 2 * vec_t::width;
114  y += 2 * vec_t::width;
115  z += 2 * vec_t::width;
116  cnt -= 2 * vec_t::width;
117  }
118 
119  // Vectorized loop
120  while (cnt >= vec_t::width)
121  {
122  // load
123  vec_t yChunk;
124  yChunk.load(y, is_not_aligned);
125  vec_t xChunk;
126  xChunk.load(x, is_not_aligned);
127 
128  // z = x + y
129  vec_t zChunk = xChunk + yChunk;
130 
131  // store
132  zChunk.store(z, is_not_aligned);
133 
134  // update pointers
135  x += vec_t::width;
136  y += vec_t::width;
137  z += vec_t::width;
138  cnt -= vec_t::width;
139  }
140 
141  // spillover loop
142  while (cnt)
143  {
144  // z = x + y;
145  *z = (*x) + (*y);
146  // update pointers
147  ++x;
148  ++y;
149  ++z;
150  --cnt;
151  }
152 }
153 
154 /// \brief Multiply vector z = x * y
155 template <class T, typename = typename std::enable_if<
156  std::is_floating_point<T>::value>::type>
157 void Vmul(const size_t n, const T *x, const T *y, T *z)
158 {
159  using namespace tinysimd;
160  using vec_t = simd<T>;
161 
162  size_t cnt = n;
163  // Vectorized loop unroll 4x
164  while (cnt >= 4 * vec_t::width)
165  {
166  // load
167  vec_t yChunk0, yChunk1, yChunk2, yChunk3;
168  yChunk0.load(y, is_not_aligned);
169  yChunk1.load(y + vec_t::width, is_not_aligned);
170  yChunk2.load(y + 2 * vec_t::width, is_not_aligned);
171  yChunk3.load(y + 3 * vec_t::width, is_not_aligned);
172 
173  vec_t xChunk0, xChunk1, xChunk2, xChunk3;
174  xChunk0.load(x, is_not_aligned);
175  xChunk1.load(x + vec_t::width, is_not_aligned);
176  xChunk2.load(x + 2 * vec_t::width, is_not_aligned);
177  xChunk3.load(x + 3 * vec_t::width, is_not_aligned);
178 
179  // z = x * y
180  vec_t zChunk0 = xChunk0 * yChunk0;
181  vec_t zChunk1 = xChunk1 * yChunk1;
182  vec_t zChunk2 = xChunk2 * yChunk2;
183  vec_t zChunk3 = xChunk3 * yChunk3;
184 
185  // store
186  zChunk0.store(z, is_not_aligned);
187  zChunk1.store(z + vec_t::width, is_not_aligned);
188  zChunk2.store(z + 2 * vec_t::width, is_not_aligned);
189  zChunk3.store(z + 3 * vec_t::width, is_not_aligned);
190 
191  // update pointers
192  x += 4 * vec_t::width;
193  y += 4 * vec_t::width;
194  z += 4 * vec_t::width;
195  cnt -= 4 * vec_t::width;
196  }
197 
198  // Vectorized loop unroll 2x
199  while (cnt >= 2 * vec_t::width)
200  {
201  // load
202  vec_t yChunk0, yChunk1;
203  yChunk0.load(y, is_not_aligned);
204  yChunk1.load(y + vec_t::width, is_not_aligned);
205 
206  vec_t xChunk0, xChunk1;
207  xChunk0.load(x, is_not_aligned);
208  xChunk1.load(x + vec_t::width, is_not_aligned);
209 
210  // z = x * y
211  vec_t zChunk0 = xChunk0 * yChunk0;
212  vec_t zChunk1 = xChunk1 * yChunk1;
213 
214  // store
215  zChunk0.store(z, is_not_aligned);
216  zChunk1.store(z + vec_t::width, is_not_aligned);
217 
218  // update pointers
219  x += 2 * vec_t::width;
220  y += 2 * vec_t::width;
221  z += 2 * vec_t::width;
222  cnt -= 2 * vec_t::width;
223  }
224 
225  // Vectorized loop
226  while (cnt >= vec_t::width)
227  {
228  // load
229  vec_t yChunk;
230  yChunk.load(y, is_not_aligned);
231  vec_t xChunk;
232  xChunk.load(x, is_not_aligned);
233 
234  // z = x * y
235  vec_t zChunk = xChunk * yChunk;
236 
237  // store
238  zChunk.store(z, is_not_aligned);
239 
240  // update pointers
241  x += vec_t::width;
242  y += vec_t::width;
243  z += vec_t::width;
244  cnt -= vec_t::width;
245  }
246 
247  // spillover loop
248  while (cnt)
249  {
250  // z = x * y;
251  *z = (*x) * (*y);
252  // update pointers
253  ++x;
254  ++y;
255  ++z;
256  --cnt;
257  }
258 }
259 
260 /// \brief vvtvp (vector times vector plus vector): z = w*x + y
261 template <class T, typename = typename std::enable_if<
262  std::is_floating_point<T>::value>::type>
263 void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
264 {
265  using namespace tinysimd;
266  using vec_t = simd<T>;
267 
268  size_t cnt = n;
269  // Vectorized loop
270  while (cnt >= vec_t::width)
271  {
272  // load
273  vec_t wChunk;
274  wChunk.load(w, is_not_aligned);
275  vec_t yChunk;
276  yChunk.load(y, is_not_aligned);
277  vec_t xChunk;
278  xChunk.load(x, is_not_aligned);
279 
280  // z = w * x + y
281  vec_t zChunk = wChunk * xChunk + yChunk;
282 
283  // store
284  zChunk.store(z, is_not_aligned);
285 
286  // update pointers
287  w += vec_t::width;
288  x += vec_t::width;
289  y += vec_t::width;
290  z += vec_t::width;
291  cnt -= vec_t::width;
292  }
293 
294  // spillover loop
295  while (cnt)
296  {
297  // z = w * x + y;
298  *z = (*w) * (*x) + (*y);
299  // update pointers
300  ++w;
301  ++x;
302  ++y;
303  ++z;
304  --cnt;
305  }
306 }
307 
308 /// \brief vvtvm (vector times vector plus vector): z = w*x - y
309 template <class T, typename = typename std::enable_if<
310  std::is_floating_point<T>::value>::type>
311 void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
312 {
313  using namespace tinysimd;
314  using vec_t = simd<T>;
315 
316  size_t cnt = n;
317  // Vectorized loop
318  while (cnt >= vec_t::width)
319  {
320  // load
321  vec_t wChunk;
322  wChunk.load(w, is_not_aligned);
323  vec_t yChunk;
324  yChunk.load(y, is_not_aligned);
325  vec_t xChunk;
326  xChunk.load(x, is_not_aligned);
327 
328  // z = w * x - y
329  vec_t zChunk = wChunk * xChunk - yChunk;
330 
331  // store
332  zChunk.store(z, is_not_aligned);
333 
334  // update pointers
335  w += vec_t::width;
336  x += vec_t::width;
337  y += vec_t::width;
338  z += vec_t::width;
339  cnt -= vec_t::width;
340  }
341 
342  // spillover loop
343  while (cnt)
344  {
345  // z = w * x - y;
346  *z = (*w) * (*x) - (*y);
347  // update pointers
348  ++w;
349  ++x;
350  ++y;
351  ++z;
352  --cnt;
353  }
354 }
355 
356 /// \brief vvtvvtp (vector times vector plus vector times vector):
357 // z = v*w + x*y
358 template <class T, typename = typename std::enable_if<
359  std::is_floating_point<T>::value>::type>
360 inline void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x,
361  const T *y, T *z)
362 {
363  using namespace tinysimd;
364  using vec_t = simd<T>;
365 
366  size_t cnt = n;
367  // Vectorized loop
368  while (cnt >= vec_t::width)
369  {
370  // load
371  vec_t vChunk;
372  vChunk.load(v, is_not_aligned);
373  vec_t wChunk;
374  wChunk.load(w, is_not_aligned);
375  vec_t yChunk;
376  yChunk.load(y, is_not_aligned);
377  vec_t xChunk;
378  xChunk.load(x, is_not_aligned);
379 
380  // z = v * w + x * y;
381  vec_t z1Chunk = vChunk * wChunk;
382  vec_t z2Chunk = xChunk * yChunk;
383  vec_t zChunk = z1Chunk + z2Chunk;
384 
385  // store
386  zChunk.store(z, is_not_aligned);
387 
388  // update pointers
389  v += vec_t::width;
390  w += vec_t::width;
391  x += vec_t::width;
392  y += vec_t::width;
393  z += vec_t::width;
394  cnt -= vec_t::width;
395  }
396 
397  // spillover loop
398  while (cnt)
399  {
400  // z = v * w + x * y;
401  T z1 = (*v) * (*w);
402  T z2 = (*x) * (*y);
403  *z = z1 + z2;
404  // update pointers
405  ++v;
406  ++w;
407  ++x;
408  ++y;
409  ++z;
410  --cnt;
411  }
412 }
413 
414 /// \brief vvtvvtm (vector times vector minus vector times vector):
415 // z = v*w - x*y
416 template <class T, typename = typename std::enable_if<
417  std::is_floating_point<T>::value>::type>
418 inline void Vvtvvtm(const size_t n, const T *v, const T *w, const T *x,
419  const T *y, T *z)
420 {
421  using namespace tinysimd;
422  using vec_t = simd<T>;
423 
424  size_t cnt = n;
425  // Vectorized loop
426  while (cnt >= vec_t::width)
427  {
428  // load
429  vec_t vChunk;
430  vChunk.load(v, is_not_aligned);
431  vec_t wChunk;
432  wChunk.load(w, is_not_aligned);
433  vec_t yChunk;
434  yChunk.load(y, is_not_aligned);
435  vec_t xChunk;
436  xChunk.load(x, is_not_aligned);
437 
438  // z = v * w + x * y;
439  vec_t z1Chunk = vChunk * wChunk;
440  vec_t z2Chunk = xChunk * yChunk;
441  vec_t zChunk = z1Chunk - z2Chunk;
442 
443  // store
444  zChunk.store(z, is_not_aligned);
445 
446  // update pointers
447  v += vec_t::width;
448  w += vec_t::width;
449  x += vec_t::width;
450  y += vec_t::width;
451  z += vec_t::width;
452  cnt -= vec_t::width;
453  }
454 
455  // spillover loop
456  while (cnt)
457  {
458  // z = v * w + x * y;
459  T z1 = (*v) * (*w);
460  T z2 = (*x) * (*y);
461  *z = z1 - z2;
462  // update pointers
463  ++v;
464  ++w;
465  ++x;
466  ++y;
467  ++z;
468  --cnt;
469  }
470 }
471 
472 /// \brief Gather vector z[i] = x[y[i]]
473 template <class T, class I,
474  typename = typename std::enable_if<std::is_floating_point<T>::value &&
475  std::is_integral<I>::value>::type>
476 void Gathr(const I n, const T *x, const I *y, T *z)
477 {
478  using namespace tinysimd;
479  using vec_t = simd<T>;
480  using vec_t_i = simd<I, vec_t::width>;
481 
482  I cnt = n;
483  // Unroll 4x Vectorized loop
484  while (cnt >= 4 * vec_t::width)
485  {
486  // load index
487  vec_t_i yChunk0, yChunk1, yChunk2, yChunk3;
488  yChunk0.load(y, is_not_aligned);
489  yChunk1.load(y + vec_t_i::width, is_not_aligned);
490  yChunk2.load(y + 2 * vec_t_i::width, is_not_aligned);
491  yChunk3.load(y + 3 * vec_t_i::width, is_not_aligned);
492 
493  // z = x[y[i]]
494  vec_t zChunk0, zChunk1, zChunk2, zChunk3;
495  zChunk0.gather(x, yChunk0);
496  zChunk1.gather(x, yChunk1);
497  zChunk2.gather(x, yChunk2);
498  zChunk3.gather(x, yChunk3);
499 
500  // store
501  zChunk0.store(z, is_not_aligned);
502  zChunk1.store(z + vec_t_i::width, is_not_aligned);
503  zChunk2.store(z + 2 * vec_t_i::width, is_not_aligned);
504  zChunk3.store(z + 3 * vec_t_i::width, is_not_aligned);
505 
506  // update pointers
507  y += 4 * vec_t_i::width;
508  z += 4 * vec_t::width;
509  cnt -= 4 * vec_t::width;
510  }
511 
512  // Unroll 2x Vectorized loop
513  while (cnt >= 2 * vec_t::width)
514  {
515  // load index
516  vec_t_i yChunk0, yChunk1;
517  yChunk0.load(y, is_not_aligned);
518  yChunk1.load(y + vec_t_i::width, is_not_aligned);
519 
520  // z = x[y[i]]
521  vec_t zChunk0, zChunk1;
522  zChunk0.gather(x, yChunk0);
523  zChunk1.gather(x, yChunk1);
524 
525  // store
526  zChunk0.store(z, is_not_aligned);
527  zChunk1.store(z + vec_t_i::width, is_not_aligned);
528 
529  // update pointers
530  y += 2 * vec_t_i::width;
531  z += 2 * vec_t::width;
532  cnt -= 2 * vec_t::width;
533  }
534 
535  // Vectorized loop
536  while (cnt >= vec_t::width)
537  {
538  // load index
539  vec_t_i yChunk;
540  yChunk.load(y, is_not_aligned);
541 
542  // z = x[y[i]]
543  vec_t zChunk;
544  zChunk.gather(x, yChunk);
545 
546  // store
547  zChunk.store(z, is_not_aligned);
548 
549  // update pointers
550  y += vec_t_i::width;
551  z += vec_t::width;
552  cnt -= vec_t::width;
553  }
554 
555  // spillover loop
556  while (cnt)
557  {
558  // z = x[y[i]]
559  *z = *(x + *y);
560  // update pointers
561  ++y;
562  ++z;
563  --cnt;
564  }
565 }
566 
567 } // namespace SIMD
568 } // namespace Vmath
569 #endif
tinysimd::simd< NekDouble > vec_t
void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvp (vector times vector plus vector): z = w*x + y
Definition: VmathSIMD.hpp:263
void Vadd(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x + y.
Definition: VmathSIMD.hpp:51
void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvm (vector times vector plus vector): z = w*x - y
Definition: VmathSIMD.hpp:311
void Vvtvvtm(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtm (vector times vector minus vector times vector):
Definition: VmathSIMD.hpp:418
void Gathr(const I n, const T *x, const I *y, T *z)
Gather vector z[i] = x[y[i]].
Definition: VmathSIMD.hpp:476
void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtp (vector times vector plus vector times vector):
Definition: VmathSIMD.hpp:360
void Vmul(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x * y.
Definition: VmathSIMD.hpp:157
Definition: Vmath.cpp:40
static constexpr struct tinysimd::is_not_aligned_t is_not_aligned
typename abi< ScalarType, width >::type simd
Definition: tinysimd.hpp:80