Nektar++
VmathSIMD.hpp
Go to the documentation of this file.
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: VmathSIMD.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Collection of templated functions for vector mathematics using
32// SIMD types, some are functions have optimization tricks such as manual loop
33// unrolling.
34//
35///////////////////////////////////////////////////////////////////////////////
36
37#ifndef NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
38#define NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
39
41
42namespace Vmath
43{
44namespace SIMD
45{
46/// \brief Add vector z = x + y
47template <class T, typename = typename std::enable_if<
48 std::is_floating_point<T>::value>::type>
49void Vadd(const size_t n, const T *x, const T *y, T *z)
50{
51 using namespace tinysimd;
52 using vec_t = simd<T>;
53
54 size_t cnt = n;
55 // Vectorized loop unroll 4x
56 while (cnt >= 4 * vec_t::width)
57 {
58 // load
59 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
60 yChunk0.load(y, is_not_aligned);
61 yChunk1.load(y + vec_t::width, is_not_aligned);
62 yChunk2.load(y + 2 * vec_t::width, is_not_aligned);
63 yChunk3.load(y + 3 * vec_t::width, is_not_aligned);
64
65 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
66 xChunk0.load(x, is_not_aligned);
67 xChunk1.load(x + vec_t::width, is_not_aligned);
68 xChunk2.load(x + 2 * vec_t::width, is_not_aligned);
69 xChunk3.load(x + 3 * vec_t::width, is_not_aligned);
70
71 // z = x + y
72 vec_t zChunk0 = xChunk0 + yChunk0;
73 vec_t zChunk1 = xChunk1 + yChunk1;
74 vec_t zChunk2 = xChunk2 + yChunk2;
75 vec_t zChunk3 = xChunk3 + yChunk3;
76
77 // store
78 zChunk0.store(z, is_not_aligned);
79 zChunk1.store(z + vec_t::width, is_not_aligned);
80 zChunk2.store(z + 2 * vec_t::width, is_not_aligned);
81 zChunk3.store(z + 3 * vec_t::width, is_not_aligned);
82
83 // update pointers
84 x += 4 * vec_t::width;
85 y += 4 * vec_t::width;
86 z += 4 * vec_t::width;
87 cnt -= 4 * vec_t::width;
88 }
89
90 // Vectorized loop unroll 2x
91 while (cnt >= 2 * vec_t::width)
92 {
93 // load
94 vec_t yChunk0, yChunk1;
95 yChunk0.load(y, is_not_aligned);
96 yChunk1.load(y + vec_t::width, is_not_aligned);
97
98 vec_t xChunk0, xChunk1;
99 xChunk0.load(x, is_not_aligned);
100 xChunk1.load(x + vec_t::width, is_not_aligned);
101
102 // z = x + y
103 vec_t zChunk0 = xChunk0 + yChunk0;
104 vec_t zChunk1 = xChunk1 + yChunk1;
105
106 // store
107 zChunk0.store(z, is_not_aligned);
108 zChunk1.store(z + vec_t::width, is_not_aligned);
109
110 // update pointers
111 x += 2 * vec_t::width;
112 y += 2 * vec_t::width;
113 z += 2 * vec_t::width;
114 cnt -= 2 * vec_t::width;
115 }
116
117 // Vectorized loop
118 while (cnt >= vec_t::width)
119 {
120 // load
121 vec_t yChunk;
122 yChunk.load(y, is_not_aligned);
123 vec_t xChunk;
124 xChunk.load(x, is_not_aligned);
125
126 // z = x + y
127 vec_t zChunk = xChunk + yChunk;
128
129 // store
130 zChunk.store(z, is_not_aligned);
131
132 // update pointers
133 x += vec_t::width;
134 y += vec_t::width;
135 z += vec_t::width;
136 cnt -= vec_t::width;
137 }
138
139 // spillover loop
140 while (cnt)
141 {
142 // z = x + y;
143 *z = (*x) + (*y);
144 // update pointers
145 ++x;
146 ++y;
147 ++z;
148 --cnt;
149 }
150}
151
152/// \brief Multiply vector z = x * y
153template <class T, typename = typename std::enable_if<
154 std::is_floating_point<T>::value>::type>
155void Vmul(const size_t n, const T *x, const T *y, T *z)
156{
157 using namespace tinysimd;
158 using vec_t = simd<T>;
159
160 size_t cnt = n;
161 // Vectorized loop unroll 4x
162 while (cnt >= 4 * vec_t::width)
163 {
164 // load
165 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
166 yChunk0.load(y, is_not_aligned);
167 yChunk1.load(y + vec_t::width, is_not_aligned);
168 yChunk2.load(y + 2 * vec_t::width, is_not_aligned);
169 yChunk3.load(y + 3 * vec_t::width, is_not_aligned);
170
171 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
172 xChunk0.load(x, is_not_aligned);
173 xChunk1.load(x + vec_t::width, is_not_aligned);
174 xChunk2.load(x + 2 * vec_t::width, is_not_aligned);
175 xChunk3.load(x + 3 * vec_t::width, is_not_aligned);
176
177 // z = x * y
178 vec_t zChunk0 = xChunk0 * yChunk0;
179 vec_t zChunk1 = xChunk1 * yChunk1;
180 vec_t zChunk2 = xChunk2 * yChunk2;
181 vec_t zChunk3 = xChunk3 * yChunk3;
182
183 // store
184 zChunk0.store(z, is_not_aligned);
185 zChunk1.store(z + vec_t::width, is_not_aligned);
186 zChunk2.store(z + 2 * vec_t::width, is_not_aligned);
187 zChunk3.store(z + 3 * vec_t::width, is_not_aligned);
188
189 // update pointers
190 x += 4 * vec_t::width;
191 y += 4 * vec_t::width;
192 z += 4 * vec_t::width;
193 cnt -= 4 * vec_t::width;
194 }
195
196 // Vectorized loop unroll 2x
197 while (cnt >= 2 * vec_t::width)
198 {
199 // load
200 vec_t yChunk0, yChunk1;
201 yChunk0.load(y, is_not_aligned);
202 yChunk1.load(y + vec_t::width, is_not_aligned);
203
204 vec_t xChunk0, xChunk1;
205 xChunk0.load(x, is_not_aligned);
206 xChunk1.load(x + vec_t::width, is_not_aligned);
207
208 // z = x * y
209 vec_t zChunk0 = xChunk0 * yChunk0;
210 vec_t zChunk1 = xChunk1 * yChunk1;
211
212 // store
213 zChunk0.store(z, is_not_aligned);
214 zChunk1.store(z + vec_t::width, is_not_aligned);
215
216 // update pointers
217 x += 2 * vec_t::width;
218 y += 2 * vec_t::width;
219 z += 2 * vec_t::width;
220 cnt -= 2 * vec_t::width;
221 }
222
223 // Vectorized loop
224 while (cnt >= vec_t::width)
225 {
226 // load
227 vec_t yChunk;
228 yChunk.load(y, is_not_aligned);
229 vec_t xChunk;
230 xChunk.load(x, is_not_aligned);
231
232 // z = x * y
233 vec_t zChunk = xChunk * yChunk;
234
235 // store
236 zChunk.store(z, is_not_aligned);
237
238 // update pointers
239 x += vec_t::width;
240 y += vec_t::width;
241 z += vec_t::width;
242 cnt -= vec_t::width;
243 }
244
245 // spillover loop
246 while (cnt)
247 {
248 // z = x * y;
249 *z = (*x) * (*y);
250 // update pointers
251 ++x;
252 ++y;
253 ++z;
254 --cnt;
255 }
256}
257
258/// \brief vvtvp (vector times vector plus vector): z = w*x + y
259template <class T, typename = typename std::enable_if<
260 std::is_floating_point<T>::value>::type>
261void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
262{
263 using namespace tinysimd;
264 using vec_t = simd<T>;
265
266 size_t cnt = n;
267 // Vectorized loop
268 while (cnt >= vec_t::width)
269 {
270 // load
271 vec_t wChunk;
272 wChunk.load(w, is_not_aligned);
273 vec_t yChunk;
274 yChunk.load(y, is_not_aligned);
275 vec_t xChunk;
276 xChunk.load(x, is_not_aligned);
277
278 // z = w * x + y
279 vec_t zChunk = wChunk * xChunk + yChunk;
280
281 // store
282 zChunk.store(z, is_not_aligned);
283
284 // update pointers
285 w += vec_t::width;
286 x += vec_t::width;
287 y += vec_t::width;
288 z += vec_t::width;
289 cnt -= vec_t::width;
290 }
291
292 // spillover loop
293 while (cnt)
294 {
295 // z = w * x + y;
296 *z = (*w) * (*x) + (*y);
297 // update pointers
298 ++w;
299 ++x;
300 ++y;
301 ++z;
302 --cnt;
303 }
304}
305
306/// \brief vvtvm (vector times vector minus vector): z = w*x - y
307template <class T, typename = typename std::enable_if<
308 std::is_floating_point<T>::value>::type>
309void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
310{
311 using namespace tinysimd;
312 using vec_t = simd<T>;
313
314 size_t cnt = n;
315 // Vectorized loop
316 while (cnt >= vec_t::width)
317 {
318 // load
319 vec_t wChunk;
320 wChunk.load(w, is_not_aligned);
321 vec_t yChunk;
322 yChunk.load(y, is_not_aligned);
323 vec_t xChunk;
324 xChunk.load(x, is_not_aligned);
325
326 // z = w * x - y
327 vec_t zChunk = wChunk * xChunk - yChunk;
328
329 // store
330 zChunk.store(z, is_not_aligned);
331
332 // update pointers
333 w += vec_t::width;
334 x += vec_t::width;
335 y += vec_t::width;
336 z += vec_t::width;
337 cnt -= vec_t::width;
338 }
339
340 // spillover loop
341 while (cnt)
342 {
343 // z = w * x - y;
344 *z = (*w) * (*x) - (*y);
345 // update pointers
346 ++w;
347 ++x;
348 ++y;
349 ++z;
350 --cnt;
351 }
352}
353
354/// \brief vvtvvtp (vector times vector plus vector times vector):
355// z = v*w + x*y
356template <class T, typename = typename std::enable_if<
357 std::is_floating_point<T>::value>::type>
358inline void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x,
359 const T *y, T *z)
360{
361 using namespace tinysimd;
362 using vec_t = simd<T>;
363
364 size_t cnt = n;
365 // Vectorized loop
366 while (cnt >= vec_t::width)
367 {
368 // load
369 vec_t vChunk;
370 vChunk.load(v, is_not_aligned);
371 vec_t wChunk;
372 wChunk.load(w, is_not_aligned);
373 vec_t yChunk;
374 yChunk.load(y, is_not_aligned);
375 vec_t xChunk;
376 xChunk.load(x, is_not_aligned);
377
378 // z = v * w + x * y;
379 vec_t z1Chunk = vChunk * wChunk;
380 vec_t z2Chunk = xChunk * yChunk;
381 vec_t zChunk = z1Chunk + z2Chunk;
382
383 // store
384 zChunk.store(z, is_not_aligned);
385
386 // update pointers
387 v += vec_t::width;
388 w += vec_t::width;
389 x += vec_t::width;
390 y += vec_t::width;
391 z += vec_t::width;
392 cnt -= vec_t::width;
393 }
394
395 // spillover loop
396 while (cnt)
397 {
398 // z = v * w + x * y;
399 T z1 = (*v) * (*w);
400 T z2 = (*x) * (*y);
401 *z = z1 + z2;
402 // update pointers
403 ++v;
404 ++w;
405 ++x;
406 ++y;
407 ++z;
408 --cnt;
409 }
410}
411
412/// \brief vvtvvtm (vector times vector minus vector times vector):
413// z = v*w - x*y
414template <class T, typename = typename std::enable_if<
415 std::is_floating_point<T>::value>::type>
416inline void Vvtvvtm(const size_t n, const T *v, const T *w, const T *x,
417 const T *y, T *z)
418{
419 using namespace tinysimd;
420 using vec_t = simd<T>;
421
422 size_t cnt = n;
423 // Vectorized loop
424 while (cnt >= vec_t::width)
425 {
426 // load
427 vec_t vChunk;
428 vChunk.load(v, is_not_aligned);
429 vec_t wChunk;
430 wChunk.load(w, is_not_aligned);
431 vec_t yChunk;
432 yChunk.load(y, is_not_aligned);
433 vec_t xChunk;
434 xChunk.load(x, is_not_aligned);
435
436 // z = v * w + x * y;
437 vec_t z1Chunk = vChunk * wChunk;
438 vec_t z2Chunk = xChunk * yChunk;
439 vec_t zChunk = z1Chunk - z2Chunk;
440
441 // store
442 zChunk.store(z, is_not_aligned);
443
444 // update pointers
445 v += vec_t::width;
446 w += vec_t::width;
447 x += vec_t::width;
448 y += vec_t::width;
449 z += vec_t::width;
450 cnt -= vec_t::width;
451 }
452
453 // spillover loop
454 while (cnt)
455 {
456 // z = v * w + x * y;
457 T z1 = (*v) * (*w);
458 T z2 = (*x) * (*y);
459 *z = z1 - z2;
460 // update pointers
461 ++v;
462 ++w;
463 ++x;
464 ++y;
465 ++z;
466 --cnt;
467 }
468}
469
470/// \brief Gather vector z[i] = x[y[i]]
471template <class T, class I,
472 typename = typename std::enable_if<std::is_floating_point<T>::value &&
473 std::is_integral<I>::value>::type>
474void Gathr(const I n, const T *x, const I *y, T *z)
475{
476 using namespace tinysimd;
477 using vec_t = simd<T>;
478 using vec_t_i = simd<I, vec_t::width>;
479
480 I cnt = n;
481 // Unroll 4x Vectorized loop
482 while (cnt >= 4 * vec_t::width)
483 {
484 // load index
485 vec_t_i yChunk0, yChunk1, yChunk2, yChunk3;
486 yChunk0.load(y, is_not_aligned);
487 yChunk1.load(y + vec_t_i::width, is_not_aligned);
488 yChunk2.load(y + 2 * vec_t_i::width, is_not_aligned);
489 yChunk3.load(y + 3 * vec_t_i::width, is_not_aligned);
490
491 // z = x[y[i]]
492 vec_t zChunk0, zChunk1, zChunk2, zChunk3;
493 zChunk0.gather(x, yChunk0);
494 zChunk1.gather(x, yChunk1);
495 zChunk2.gather(x, yChunk2);
496 zChunk3.gather(x, yChunk3);
497
498 // store
499 zChunk0.store(z, is_not_aligned);
500 zChunk1.store(z + vec_t_i::width, is_not_aligned);
501 zChunk2.store(z + 2 * vec_t_i::width, is_not_aligned);
502 zChunk3.store(z + 3 * vec_t_i::width, is_not_aligned);
503
504 // update pointers
505 y += 4 * vec_t_i::width;
506 z += 4 * vec_t::width;
507 cnt -= 4 * vec_t::width;
508 }
509
510 // Unroll 2x Vectorized loop
511 while (cnt >= 2 * vec_t::width)
512 {
513 // load index
514 vec_t_i yChunk0, yChunk1;
515 yChunk0.load(y, is_not_aligned);
516 yChunk1.load(y + vec_t_i::width, is_not_aligned);
517
518 // z = x[y[i]]
519 vec_t zChunk0, zChunk1;
520 zChunk0.gather(x, yChunk0);
521 zChunk1.gather(x, yChunk1);
522
523 // store
524 zChunk0.store(z, is_not_aligned);
525 zChunk1.store(z + vec_t_i::width, is_not_aligned);
526
527 // update pointers
528 y += 2 * vec_t_i::width;
529 z += 2 * vec_t::width;
530 cnt -= 2 * vec_t::width;
531 }
532
533 // Vectorized loop
534 while (cnt >= vec_t::width)
535 {
536 // load index
537 vec_t_i yChunk;
538 yChunk.load(y, is_not_aligned);
539
540 // z = x[y[i]]
541 vec_t zChunk;
542 zChunk.gather(x, yChunk);
543
544 // store
545 zChunk.store(z, is_not_aligned);
546
547 // update pointers
548 y += vec_t_i::width;
549 z += vec_t::width;
550 cnt -= vec_t::width;
551 }
552
553 // spillover loop
554 while (cnt)
555 {
556 // z = x[y[i]]
557 *z = *(x + *y);
558 // update pointers
559 ++y;
560 ++z;
561 --cnt;
562 }
563}
564
565} // namespace SIMD
566} // namespace Vmath
567#endif
std::vector< double > w(NPUPPER)
std::vector< double > z(NPUPPER)
tinysimd::simd< NekDouble > vec_t
void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvp (vector times vector plus vector): z = w*x + y
Definition: VmathSIMD.hpp:261
void Vadd(const size_t n, const T *x, const T *y, T *z)
Add vector z = x + y.
Definition: VmathSIMD.hpp:49
void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvm (vector times vector minus vector): z = w*x - y
Definition: VmathSIMD.hpp:309
void Vvtvvtm(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtm (vector times vector minus vector times vector):
Definition: VmathSIMD.hpp:416
void Gathr(const I n, const T *x, const I *y, T *z)
Gather vector z[i] = x[y[i]].
Definition: VmathSIMD.hpp:474
void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtp (vector times vector plus vector times vector):
Definition: VmathSIMD.hpp:358
void Vmul(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x * y.
Definition: VmathSIMD.hpp:155
Definition: Vmath.cpp:38
static constexpr struct tinysimd::is_not_aligned_t is_not_aligned
typename abi< ScalarType, width >::type simd
Definition: tinysimd.hpp:80