Nektar++
VmathSIMD.hpp
Go to the documentation of this file.
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 // File: VmathSIMD.hpp
4 //
5 // For more information, please see: http://www.nektar.info
6 //
7 // The MIT License
8 //
9 // Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10 // Department of Aeronautics, Imperial College London (UK), and Scientific
11 // Computing and Imaging Institute, University of Utah (USA).
12 //
13 // Permission is hereby granted, free of charge, to any person obtaining a
14 // copy of this software and associated documentation files (the "Software"),
15 // to deal in the Software without restriction, including without limitation
16 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 // and/or sell copies of the Software, and to permit persons to whom the
18 // Software is furnished to do so, subject to the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be included
21 // in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 // DEALINGS IN THE SOFTWARE.
30 //
31 // Description: Collection of templated functions for vector mathematics using
32 // SIMD types, some are functions have optimization tricks such as manual loop
33 // unrolling.
34 //
35 ///////////////////////////////////////////////////////////////////////////////
36 
37 
38 #ifndef NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
39 #define NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
40 
44 
45 namespace Vmath
46 {
47 namespace SIMD
48 {
49  /// \brief Multiply vector z = x + y
50  template<class T,typename = typename std::enable_if
51  <std::is_floating_point<T>::value>::type>
52  void Vadd(const size_t n, const T *x, const T *y, T *z)
53  {
54  using namespace tinysimd;
55  using vec_t = simd<T>;
56 
57  size_t cnt = n;
58  // Vectorized loop unroll 4x
59  while (cnt >= 4*vec_t::width)
60  {
61  // load
62  vec_t yChunk0, yChunk1, yChunk2, yChunk3;
63  yChunk0.load(y, is_not_aligned);
64  yChunk1.load(y + vec_t::width, is_not_aligned);
65  yChunk2.load(y + 2*vec_t::width, is_not_aligned);
66  yChunk3.load(y + 3*vec_t::width, is_not_aligned);
67 
68  vec_t xChunk0, xChunk1, xChunk2, xChunk3;
69  xChunk0.load(x, is_not_aligned);
70  xChunk1.load(x + vec_t::width, is_not_aligned);
71  xChunk2.load(x + 2*vec_t::width, is_not_aligned);
72  xChunk3.load(x + 3*vec_t::width, is_not_aligned);
73 
74  // z = x + y
75  vec_t zChunk0 = xChunk0 + yChunk0;
76  vec_t zChunk1 = xChunk1 + yChunk1;
77  vec_t zChunk2 = xChunk2 + yChunk2;
78  vec_t zChunk3 = xChunk3 + yChunk3;
79 
80  // store
81  zChunk0.store(z, is_not_aligned);
82  zChunk1.store(z + vec_t::width, is_not_aligned);
83  zChunk2.store(z + 2*vec_t::width, is_not_aligned);
84  zChunk3.store(z + 3*vec_t::width, is_not_aligned);
85 
86  // update pointers
87  x += 4*vec_t::width;
88  y += 4*vec_t::width;
89  z += 4*vec_t::width;
90  cnt-= 4*vec_t::width;
91  }
92 
93  // Vectorized loop unroll 2x
94  while (cnt >= 2*vec_t::width)
95  {
96  // load
97  vec_t yChunk0, yChunk1;
98  yChunk0.load(y, is_not_aligned);
99  yChunk1.load(y + vec_t::width, is_not_aligned);
100 
101  vec_t xChunk0, xChunk1;
102  xChunk0.load(x, is_not_aligned);
103  xChunk1.load(x + vec_t::width, is_not_aligned);
104 
105  // z = x + y
106  vec_t zChunk0 = xChunk0 + yChunk0;
107  vec_t zChunk1 = xChunk1 + yChunk1;
108 
109  // store
110  zChunk0.store(z, is_not_aligned);
111  zChunk1.store(z + vec_t::width, is_not_aligned);
112 
113  // update pointers
114  x += 2*vec_t::width;
115  y += 2*vec_t::width;
116  z += 2*vec_t::width;
117  cnt-= 2*vec_t::width;
118  }
119 
120  // Vectorized loop
121  while (cnt >= vec_t::width)
122  {
123  // load
124  vec_t yChunk;
125  yChunk.load(y, is_not_aligned);
126  vec_t xChunk;
127  xChunk.load(x, is_not_aligned);
128 
129  // z = x + y
130  vec_t zChunk = xChunk + yChunk;
131 
132  // store
133  zChunk.store(z, is_not_aligned);
134 
135  // update pointers
136  x += vec_t::width;
137  y += vec_t::width;
138  z += vec_t::width;
139  cnt-= vec_t::width;
140  }
141 
142  // spillover loop
143  while(cnt)
144  {
145  // z = x + y;
146  *z = (*x) + (*y);
147  // update pointers
148  ++x;
149  ++y;
150  ++z;
151  --cnt;
152  }
153  }
154 
155  /// \brief Multiply vector z = x * y
156  template<class T,typename = typename std::enable_if
157  <std::is_floating_point<T>::value>::type >
158  void Vmul(const size_t n, const T *x, const T *y, T *z)
159  {
160  using namespace tinysimd;
161  using vec_t = simd<T>;
162 
163  size_t cnt = n;
164  // Vectorized loop unroll 4x
165  while (cnt >= 4*vec_t::width)
166  {
167  // load
168  vec_t yChunk0, yChunk1, yChunk2, yChunk3;
169  yChunk0.load(y, is_not_aligned);
170  yChunk1.load(y + vec_t::width, is_not_aligned);
171  yChunk2.load(y + 2*vec_t::width, is_not_aligned);
172  yChunk3.load(y + 3*vec_t::width, is_not_aligned);
173 
174  vec_t xChunk0, xChunk1, xChunk2, xChunk3;
175  xChunk0.load(x, is_not_aligned);
176  xChunk1.load(x + vec_t::width, is_not_aligned);
177  xChunk2.load(x + 2*vec_t::width, is_not_aligned);
178  xChunk3.load(x + 3*vec_t::width, is_not_aligned);
179 
180  // z = x * y
181  vec_t zChunk0 = xChunk0 * yChunk0;
182  vec_t zChunk1 = xChunk1 * yChunk1;
183  vec_t zChunk2 = xChunk2 * yChunk2;
184  vec_t zChunk3 = xChunk3 * yChunk3;
185 
186  // store
187  zChunk0.store(z, is_not_aligned);
188  zChunk1.store(z + vec_t::width, is_not_aligned);
189  zChunk2.store(z + 2*vec_t::width, is_not_aligned);
190  zChunk3.store(z + 3*vec_t::width, is_not_aligned);
191 
192  // update pointers
193  x += 4*vec_t::width;
194  y += 4*vec_t::width;
195  z += 4*vec_t::width;
196  cnt-= 4*vec_t::width;
197  }
198 
199  // Vectorized loop unroll 2x
200  while (cnt >= 2*vec_t::width)
201  {
202  // load
203  vec_t yChunk0, yChunk1;
204  yChunk0.load(y, is_not_aligned);
205  yChunk1.load(y + vec_t::width, is_not_aligned);
206 
207  vec_t xChunk0, xChunk1;
208  xChunk0.load(x, is_not_aligned);
209  xChunk1.load(x + vec_t::width, is_not_aligned);
210 
211  // z = x * y
212  vec_t zChunk0 = xChunk0 * yChunk0;
213  vec_t zChunk1 = xChunk1 * yChunk1;
214 
215  // store
216  zChunk0.store(z, is_not_aligned);
217  zChunk1.store(z + vec_t::width, is_not_aligned);
218 
219  // update pointers
220  x += 2*vec_t::width;
221  y += 2*vec_t::width;
222  z += 2*vec_t::width;
223  cnt-= 2*vec_t::width;
224  }
225 
226  // Vectorized loop
227  while (cnt >= vec_t::width)
228  {
229  // load
230  vec_t yChunk;
231  yChunk.load(y, is_not_aligned);
232  vec_t xChunk;
233  xChunk.load(x, is_not_aligned);
234 
235  // z = x * y
236  vec_t zChunk = xChunk * yChunk;
237 
238  // store
239  zChunk.store(z, is_not_aligned);
240 
241  // update pointers
242  x += vec_t::width;
243  y += vec_t::width;
244  z += vec_t::width;
245  cnt-= vec_t::width;
246  }
247 
248  // spillover loop
249  while(cnt)
250  {
251  // z = x * y;
252  *z = (*x) * (*y);
253  // update pointers
254  ++x;
255  ++y;
256  ++z;
257  --cnt;
258  }
259  }
260 
261  /// \brief vvtvp (vector times vector plus vector): z = w*x + y
262  template<class T,typename = typename std::enable_if
263  < std::is_floating_point<T>::value >::type >
264  void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
265  {
266  using namespace tinysimd;
267  using vec_t = simd<T>;
268 
269  size_t cnt = n;
270  // Vectorized loop
271  while (cnt >= vec_t::width)
272  {
273  // load
274  vec_t wChunk;
275  wChunk.load(w, is_not_aligned);
276  vec_t yChunk;
277  yChunk.load(y, is_not_aligned);
278  vec_t xChunk;
279  xChunk.load(x, is_not_aligned);
280 
281  // z = w * x + y
282  vec_t zChunk = wChunk * xChunk + yChunk;
283 
284  // store
285  zChunk.store(z, is_not_aligned);
286 
287  // update pointers
288  w += vec_t::width;
289  x += vec_t::width;
290  y += vec_t::width;
291  z += vec_t::width;
292  cnt-= vec_t::width;
293  }
294 
295  // spillover loop
296  while(cnt)
297  {
298  // z = w * x + y;
299  *z = (*w) * (*x) + (*y);
300  // update pointers
301  ++w;
302  ++x;
303  ++y;
304  ++z;
305  --cnt;
306  }
307  }
308 
309  /// \brief vvtvm (vector times vector plus vector): z = w*x - y
310  template<class T, typename = typename std::enable_if
311  <std::is_floating_point<T>::value>::type >
312  void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
313  {
314  using namespace tinysimd;
315  using vec_t = simd<T>;
316 
317  size_t cnt = n;
318  // Vectorized loop
319  while (cnt >= vec_t::width)
320  {
321  // load
322  vec_t wChunk;
323  wChunk.load(w, is_not_aligned);
324  vec_t yChunk;
325  yChunk.load(y, is_not_aligned);
326  vec_t xChunk;
327  xChunk.load(x, is_not_aligned);
328 
329  // z = w * x - y
330  vec_t zChunk = wChunk * xChunk - yChunk;
331 
332  // store
333  zChunk.store(z, is_not_aligned);
334 
335  // update pointers
336  w += vec_t::width;
337  x += vec_t::width;
338  y += vec_t::width;
339  z += vec_t::width;
340  cnt-= vec_t::width;
341  }
342 
343  // spillover loop
344  while(cnt)
345  {
346  // z = w * x - y;
347  *z = (*w) * (*x) - (*y);
348  // update pointers
349  ++w;
350  ++x;
351  ++y;
352  ++z;
353  --cnt;
354  }
355  }
356 
357  /// \brief vvtvvtp (vector times vector plus vector times vector):
358  // z = v*w + x*y
359  template<class T, typename = typename std::enable_if
360  <std::is_floating_point<T>::value>::type >
361  inline void Vvtvvtp (const size_t n, const T* v, const T* w, const T* x,
362  const T* y, T* z)
363  {
364  using namespace tinysimd;
365  using vec_t = simd<T>;
366 
367  size_t cnt = n;
368  // Vectorized loop
369  while (cnt >= vec_t::width)
370  {
371  // load
372  vec_t vChunk;
373  vChunk.load(v, is_not_aligned);
374  vec_t wChunk;
375  wChunk.load(w, is_not_aligned);
376  vec_t yChunk;
377  yChunk.load(y, is_not_aligned);
378  vec_t xChunk;
379  xChunk.load(x, is_not_aligned);
380 
381  // z = v * w + x * y;
382  vec_t z1Chunk = vChunk * wChunk;
383  vec_t z2Chunk = xChunk * yChunk;
384  vec_t zChunk = z1Chunk + z2Chunk;
385 
386  // store
387  zChunk.store(z, is_not_aligned);
388 
389  // update pointers
390  v += vec_t::width;
391  w += vec_t::width;
392  x += vec_t::width;
393  y += vec_t::width;
394  z += vec_t::width;
395  cnt-= vec_t::width;
396  }
397 
398  // spillover loop
399  while(cnt)
400  {
401  // z = v * w + x * y;
402  T z1 = (*v) * (*w);
403  T z2 = (*x) * (*y);
404  *z = z1 + z2;
405  // update pointers
406  ++v;
407  ++w;
408  ++x;
409  ++y;
410  ++z;
411  --cnt;
412  }
413  }
414 
/// \brief Gather vector z[i] = x[y[i]]
///
/// \param n  number of entries to gather
/// \param x  source array, indexed indirectly (read-only)
/// \param y  index array; y[i] selects x[y[i]] (read-only)
/// \param z  output vector receiving the gathered values
///
/// NOTE(review): the loop bounds and the z/cnt updates use vec_t::width while
/// the index pointer y advances by vec_t_i::width. This is only consistent if
/// simd<T> and simd<I> expose the same number of lanes — presumably guaranteed
/// by tinysimd's choice of index type; confirm against tinysimd.hpp.
template <class T, class I,
          typename = typename std::enable_if<std::is_floating_point<T>::value &&
                                             std::is_integral<I>::value>::type>
void Gathr(const I n, const T *x, const I *y, T *z)
{
    using namespace tinysimd;
    using vec_t   = simd<T>;   // value chunks
    using vec_t_i = simd<I>;   // index chunks

    I cnt = n;
    // Unroll 4x Vectorized loop
    while (cnt >= 4 * vec_t::width)
    {
        // load index
        vec_t_i yChunk0, yChunk1, yChunk2, yChunk3;
        yChunk0.load(y, is_not_aligned);
        yChunk1.load(y + vec_t_i::width, is_not_aligned);
        yChunk2.load(y + 2 * vec_t_i::width, is_not_aligned);
        yChunk3.load(y + 3 * vec_t_i::width, is_not_aligned);

        // z = x[y[i]]
        vec_t zChunk0, zChunk1, zChunk2, zChunk3;
        zChunk0.gather(x, yChunk0);
        zChunk1.gather(x, yChunk1);
        zChunk2.gather(x, yChunk2);
        zChunk3.gather(x, yChunk3);

        // store
        zChunk0.store(z, is_not_aligned);
        zChunk1.store(z + vec_t_i::width, is_not_aligned);
        zChunk2.store(z + 2 * vec_t_i::width, is_not_aligned);
        zChunk3.store(z + 3 * vec_t_i::width, is_not_aligned);

        // update pointers (x stays fixed: it is indexed indirectly)
        y += 4 * vec_t_i::width;
        z += 4 * vec_t::width;
        cnt -= 4 * vec_t::width;
    }

    // Unroll 2x Vectorized loop
    while (cnt >= 2 * vec_t::width)
    {
        // load index
        vec_t_i yChunk0, yChunk1;
        yChunk0.load(y, is_not_aligned);
        yChunk1.load(y + vec_t_i::width, is_not_aligned);

        // z = x[y[i]]
        vec_t zChunk0, zChunk1;
        zChunk0.gather(x, yChunk0);
        zChunk1.gather(x, yChunk1);

        // store
        zChunk0.store(z, is_not_aligned);
        zChunk1.store(z + vec_t_i::width, is_not_aligned);

        // update pointers
        y += 2 * vec_t_i::width;
        z += 2 * vec_t::width;
        cnt -= 2 * vec_t::width;
    }

    // Vectorized loop
    while (cnt >= vec_t::width)
    {
        // load index
        vec_t_i yChunk;
        yChunk.load(y, is_not_aligned);

        // z = x[y[i]]
        vec_t zChunk;
        zChunk.gather(x, yChunk);

        // store
        zChunk.store(z, is_not_aligned);

        // update pointers
        y += vec_t_i::width;
        z += vec_t::width;
        cnt -= vec_t::width;
    }

    // spillover loop: scalar gather for the remainder
    while (cnt)
    {
        // z = x[y[i]]
        *z = *(x + *y);
        // update pointers
        ++y;
        ++z;
        --cnt;
    }
}
509 
510 
511 } // namespace SIMD
512 } // namespace Vmath
513 #endif
tinysimd::simd< NekDouble > vec_t
void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvp (vector times vector plus vector): z = w*x + y
Definition: VmathSIMD.hpp:264
void Vadd(const size_t n, const T *x, const T *y, T *z)
Add vector z = x + y.
Definition: VmathSIMD.hpp:52
void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvm (vector times vector minus vector): z = w*x - y
Definition: VmathSIMD.hpp:312
void Gathr(const I n, const T *x, const I *y, T *z)
Gather vector z[i] = x[y[i]].
Definition: VmathSIMD.hpp:419
void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtp (vector times vector plus vector times vector):
Definition: VmathSIMD.hpp:361
void Vmul(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x * y.
Definition: VmathSIMD.hpp:158
Definition: Vmath.cpp:40
static constexpr struct tinysimd::is_not_aligned_t is_not_aligned
typename abi< ScalarType >::type simd
Definition: tinysimd.hpp:83