Nektar++
VmathSIMD.hpp
Go to the documentation of this file.
1///////////////////////////////////////////////////////////////////////////////
2//
3// File: VmathSIMD.hpp
4//
5// For more information, please see: http://www.nektar.info
6//
7// The MIT License
8//
9// Copyright (c) 2006 Division of Applied Mathematics, Brown University (USA),
10// Department of Aeronautics, Imperial College London (UK), and Scientific
11// Computing and Imaging Institute, University of Utah (USA).
12//
13// Permission is hereby granted, free of charge, to any person obtaining a
14// copy of this software and associated documentation files (the "Software"),
15// to deal in the Software without restriction, including without limitation
16// the rights to use, copy, modify, merge, publish, distribute, sublicense,
17// and/or sell copies of the Software, and to permit persons to whom the
18// Software is furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included
21// in all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29// DEALINGS IN THE SOFTWARE.
30//
31// Description: Collection of templated functions for vector mathematics using
32// SIMD types, some are functions have optimization tricks such as manual loop
33// unrolling.
34//
35///////////////////////////////////////////////////////////////////////////////
36
37#ifndef NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
38#define NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
39
41
42namespace Vmath::SIMD
43{
44/// \brief Add vector z = x + y
45template <class T, typename = typename std::enable_if<
46 std::is_floating_point<T>::value>::type>
47void Vadd(const size_t n, const T *x, const T *y, T *z)
48{
49 using namespace tinysimd;
50 using vec_t = simd<T>;
51
52 size_t cnt = n;
53 // Vectorized loop unroll 4x
54 while (cnt >= 4 * vec_t::width)
55 {
56 // load
57 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
58 yChunk0.load(y, is_not_aligned);
59 yChunk1.load(y + vec_t::width, is_not_aligned);
60 yChunk2.load(y + 2 * vec_t::width, is_not_aligned);
61 yChunk3.load(y + 3 * vec_t::width, is_not_aligned);
62
63 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
64 xChunk0.load(x, is_not_aligned);
65 xChunk1.load(x + vec_t::width, is_not_aligned);
66 xChunk2.load(x + 2 * vec_t::width, is_not_aligned);
67 xChunk3.load(x + 3 * vec_t::width, is_not_aligned);
68
69 // z = x + y
70 vec_t zChunk0 = xChunk0 + yChunk0;
71 vec_t zChunk1 = xChunk1 + yChunk1;
72 vec_t zChunk2 = xChunk2 + yChunk2;
73 vec_t zChunk3 = xChunk3 + yChunk3;
74
75 // store
76 zChunk0.store(z, is_not_aligned);
77 zChunk1.store(z + vec_t::width, is_not_aligned);
78 zChunk2.store(z + 2 * vec_t::width, is_not_aligned);
79 zChunk3.store(z + 3 * vec_t::width, is_not_aligned);
80
81 // update pointers
82 x += 4 * vec_t::width;
83 y += 4 * vec_t::width;
84 z += 4 * vec_t::width;
85 cnt -= 4 * vec_t::width;
86 }
87
88 // Vectorized loop unroll 2x
89 while (cnt >= 2 * vec_t::width)
90 {
91 // load
92 vec_t yChunk0, yChunk1;
93 yChunk0.load(y, is_not_aligned);
94 yChunk1.load(y + vec_t::width, is_not_aligned);
95
96 vec_t xChunk0, xChunk1;
97 xChunk0.load(x, is_not_aligned);
98 xChunk1.load(x + vec_t::width, is_not_aligned);
99
100 // z = x + y
101 vec_t zChunk0 = xChunk0 + yChunk0;
102 vec_t zChunk1 = xChunk1 + yChunk1;
103
104 // store
105 zChunk0.store(z, is_not_aligned);
106 zChunk1.store(z + vec_t::width, is_not_aligned);
107
108 // update pointers
109 x += 2 * vec_t::width;
110 y += 2 * vec_t::width;
111 z += 2 * vec_t::width;
112 cnt -= 2 * vec_t::width;
113 }
114
115 // Vectorized loop
116 while (cnt >= vec_t::width)
117 {
118 // load
119 vec_t yChunk;
120 yChunk.load(y, is_not_aligned);
121 vec_t xChunk;
122 xChunk.load(x, is_not_aligned);
123
124 // z = x + y
125 vec_t zChunk = xChunk + yChunk;
126
127 // store
128 zChunk.store(z, is_not_aligned);
129
130 // update pointers
131 x += vec_t::width;
132 y += vec_t::width;
133 z += vec_t::width;
134 cnt -= vec_t::width;
135 }
136
137 // spillover loop
138 while (cnt)
139 {
140 // z = x + y;
141 *z = (*x) + (*y);
142 // update pointers
143 ++x;
144 ++y;
145 ++z;
146 --cnt;
147 }
148}
149
150/// \brief Multiply vector z = x * y
151template <class T, typename = typename std::enable_if<
152 std::is_floating_point<T>::value>::type>
153void Vmul(const size_t n, const T *x, const T *y, T *z)
154{
155 using namespace tinysimd;
156 using vec_t = simd<T>;
157
158 size_t cnt = n;
159 // Vectorized loop unroll 4x
160 while (cnt >= 4 * vec_t::width)
161 {
162 // load
163 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
164 yChunk0.load(y, is_not_aligned);
165 yChunk1.load(y + vec_t::width, is_not_aligned);
166 yChunk2.load(y + 2 * vec_t::width, is_not_aligned);
167 yChunk3.load(y + 3 * vec_t::width, is_not_aligned);
168
169 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
170 xChunk0.load(x, is_not_aligned);
171 xChunk1.load(x + vec_t::width, is_not_aligned);
172 xChunk2.load(x + 2 * vec_t::width, is_not_aligned);
173 xChunk3.load(x + 3 * vec_t::width, is_not_aligned);
174
175 // z = x * y
176 vec_t zChunk0 = xChunk0 * yChunk0;
177 vec_t zChunk1 = xChunk1 * yChunk1;
178 vec_t zChunk2 = xChunk2 * yChunk2;
179 vec_t zChunk3 = xChunk3 * yChunk3;
180
181 // store
182 zChunk0.store(z, is_not_aligned);
183 zChunk1.store(z + vec_t::width, is_not_aligned);
184 zChunk2.store(z + 2 * vec_t::width, is_not_aligned);
185 zChunk3.store(z + 3 * vec_t::width, is_not_aligned);
186
187 // update pointers
188 x += 4 * vec_t::width;
189 y += 4 * vec_t::width;
190 z += 4 * vec_t::width;
191 cnt -= 4 * vec_t::width;
192 }
193
194 // Vectorized loop unroll 2x
195 while (cnt >= 2 * vec_t::width)
196 {
197 // load
198 vec_t yChunk0, yChunk1;
199 yChunk0.load(y, is_not_aligned);
200 yChunk1.load(y + vec_t::width, is_not_aligned);
201
202 vec_t xChunk0, xChunk1;
203 xChunk0.load(x, is_not_aligned);
204 xChunk1.load(x + vec_t::width, is_not_aligned);
205
206 // z = x * y
207 vec_t zChunk0 = xChunk0 * yChunk0;
208 vec_t zChunk1 = xChunk1 * yChunk1;
209
210 // store
211 zChunk0.store(z, is_not_aligned);
212 zChunk1.store(z + vec_t::width, is_not_aligned);
213
214 // update pointers
215 x += 2 * vec_t::width;
216 y += 2 * vec_t::width;
217 z += 2 * vec_t::width;
218 cnt -= 2 * vec_t::width;
219 }
220
221 // Vectorized loop
222 while (cnt >= vec_t::width)
223 {
224 // load
225 vec_t yChunk;
226 yChunk.load(y, is_not_aligned);
227 vec_t xChunk;
228 xChunk.load(x, is_not_aligned);
229
230 // z = x * y
231 vec_t zChunk = xChunk * yChunk;
232
233 // store
234 zChunk.store(z, is_not_aligned);
235
236 // update pointers
237 x += vec_t::width;
238 y += vec_t::width;
239 z += vec_t::width;
240 cnt -= vec_t::width;
241 }
242
243 // spillover loop
244 while (cnt)
245 {
246 // z = x * y;
247 *z = (*x) * (*y);
248 // update pointers
249 ++x;
250 ++y;
251 ++z;
252 --cnt;
253 }
254}
255
256/// \brief vvtvp (vector times vector plus vector): z = w*x + y
257template <class T, typename = typename std::enable_if<
258 std::is_floating_point<T>::value>::type>
259void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
260{
261 using namespace tinysimd;
262 using vec_t = simd<T>;
263
264 size_t cnt = n;
265 // Vectorized loop
266 while (cnt >= vec_t::width)
267 {
268 // load
269 vec_t wChunk;
270 wChunk.load(w, is_not_aligned);
271 vec_t yChunk;
272 yChunk.load(y, is_not_aligned);
273 vec_t xChunk;
274 xChunk.load(x, is_not_aligned);
275
276 // z = w * x + y
277 vec_t zChunk = wChunk * xChunk + yChunk;
278
279 // store
280 zChunk.store(z, is_not_aligned);
281
282 // update pointers
283 w += vec_t::width;
284 x += vec_t::width;
285 y += vec_t::width;
286 z += vec_t::width;
287 cnt -= vec_t::width;
288 }
289
290 // spillover loop
291 while (cnt)
292 {
293 // z = w * x + y;
294 *z = (*w) * (*x) + (*y);
295 // update pointers
296 ++w;
297 ++x;
298 ++y;
299 ++z;
300 --cnt;
301 }
302}
303
304/// \brief vvtvm (vector times vector minus vector): z = w*x - y
305template <class T, typename = typename std::enable_if<
306 std::is_floating_point<T>::value>::type>
307void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
308{
309 using namespace tinysimd;
310 using vec_t = simd<T>;
311
312 size_t cnt = n;
313 // Vectorized loop
314 while (cnt >= vec_t::width)
315 {
316 // load
317 vec_t wChunk;
318 wChunk.load(w, is_not_aligned);
319 vec_t yChunk;
320 yChunk.load(y, is_not_aligned);
321 vec_t xChunk;
322 xChunk.load(x, is_not_aligned);
323
324 // z = w * x - y
325 vec_t zChunk = wChunk * xChunk - yChunk;
326
327 // store
328 zChunk.store(z, is_not_aligned);
329
330 // update pointers
331 w += vec_t::width;
332 x += vec_t::width;
333 y += vec_t::width;
334 z += vec_t::width;
335 cnt -= vec_t::width;
336 }
337
338 // spillover loop
339 while (cnt)
340 {
341 // z = w * x - y;
342 *z = (*w) * (*x) - (*y);
343 // update pointers
344 ++w;
345 ++x;
346 ++y;
347 ++z;
348 --cnt;
349 }
350}
351
352/// \brief vvtvvtp (vector times vector plus vector times vector):
353// z = v*w + x*y
354template <class T, typename = typename std::enable_if<
355 std::is_floating_point<T>::value>::type>
356inline void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x,
357 const T *y, T *z)
358{
359 using namespace tinysimd;
360 using vec_t = simd<T>;
361
362 size_t cnt = n;
363 // Vectorized loop
364 while (cnt >= vec_t::width)
365 {
366 // load
367 vec_t vChunk;
368 vChunk.load(v, is_not_aligned);
369 vec_t wChunk;
370 wChunk.load(w, is_not_aligned);
371 vec_t yChunk;
372 yChunk.load(y, is_not_aligned);
373 vec_t xChunk;
374 xChunk.load(x, is_not_aligned);
375
376 // z = v * w + x * y;
377 vec_t z1Chunk = vChunk * wChunk;
378 vec_t z2Chunk = xChunk * yChunk;
379 vec_t zChunk = z1Chunk + z2Chunk;
380
381 // store
382 zChunk.store(z, is_not_aligned);
383
384 // update pointers
385 v += vec_t::width;
386 w += vec_t::width;
387 x += vec_t::width;
388 y += vec_t::width;
389 z += vec_t::width;
390 cnt -= vec_t::width;
391 }
392
393 // spillover loop
394 while (cnt)
395 {
396 // z = v * w + x * y;
397 T z1 = (*v) * (*w);
398 T z2 = (*x) * (*y);
399 *z = z1 + z2;
400 // update pointers
401 ++v;
402 ++w;
403 ++x;
404 ++y;
405 ++z;
406 --cnt;
407 }
408}
409
410/// \brief vvtvvtm (vector times vector minus vector times vector):
411// z = v*w - x*y
412template <class T, typename = typename std::enable_if<
413 std::is_floating_point<T>::value>::type>
414inline void Vvtvvtm(const size_t n, const T *v, const T *w, const T *x,
415 const T *y, T *z)
416{
417 using namespace tinysimd;
418 using vec_t = simd<T>;
419
420 size_t cnt = n;
421 // Vectorized loop
422 while (cnt >= vec_t::width)
423 {
424 // load
425 vec_t vChunk;
426 vChunk.load(v, is_not_aligned);
427 vec_t wChunk;
428 wChunk.load(w, is_not_aligned);
429 vec_t yChunk;
430 yChunk.load(y, is_not_aligned);
431 vec_t xChunk;
432 xChunk.load(x, is_not_aligned);
433
434 // z = v * w + x * y;
435 vec_t z1Chunk = vChunk * wChunk;
436 vec_t z2Chunk = xChunk * yChunk;
437 vec_t zChunk = z1Chunk - z2Chunk;
438
439 // store
440 zChunk.store(z, is_not_aligned);
441
442 // update pointers
443 v += vec_t::width;
444 w += vec_t::width;
445 x += vec_t::width;
446 y += vec_t::width;
447 z += vec_t::width;
448 cnt -= vec_t::width;
449 }
450
451 // spillover loop
452 while (cnt)
453 {
454 // z = v * w + x * y;
455 T z1 = (*v) * (*w);
456 T z2 = (*x) * (*y);
457 *z = z1 - z2;
458 // update pointers
459 ++v;
460 ++w;
461 ++x;
462 ++y;
463 ++z;
464 --cnt;
465 }
466}
467
468/// \brief Gather vector z[i] = x[y[i]]
469template <class T, class I,
470 typename = typename std::enable_if<std::is_floating_point<T>::value &&
471 std::is_integral<I>::value>::type>
472void Gathr(const I n, const T *x, const I *y, T *z)
473{
474 using namespace tinysimd;
475 using vec_t = simd<T>;
476 using vec_t_i = simd<I, vec_t::width>;
477
478 I cnt = n;
479 // Unroll 4x Vectorized loop
480 while (cnt >= 4 * vec_t::width)
481 {
482 // load index
483 vec_t_i yChunk0, yChunk1, yChunk2, yChunk3;
484 yChunk0.load(y, is_not_aligned);
485 yChunk1.load(y + vec_t_i::width, is_not_aligned);
486 yChunk2.load(y + 2 * vec_t_i::width, is_not_aligned);
487 yChunk3.load(y + 3 * vec_t_i::width, is_not_aligned);
488
489 // z = x[y[i]]
490 vec_t zChunk0, zChunk1, zChunk2, zChunk3;
491 zChunk0.gather(x, yChunk0);
492 zChunk1.gather(x, yChunk1);
493 zChunk2.gather(x, yChunk2);
494 zChunk3.gather(x, yChunk3);
495
496 // store
497 zChunk0.store(z, is_not_aligned);
498 zChunk1.store(z + vec_t_i::width, is_not_aligned);
499 zChunk2.store(z + 2 * vec_t_i::width, is_not_aligned);
500 zChunk3.store(z + 3 * vec_t_i::width, is_not_aligned);
501
502 // update pointers
503 y += 4 * vec_t_i::width;
504 z += 4 * vec_t::width;
505 cnt -= 4 * vec_t::width;
506 }
507
508 // Unroll 2x Vectorized loop
509 while (cnt >= 2 * vec_t::width)
510 {
511 // load index
512 vec_t_i yChunk0, yChunk1;
513 yChunk0.load(y, is_not_aligned);
514 yChunk1.load(y + vec_t_i::width, is_not_aligned);
515
516 // z = x[y[i]]
517 vec_t zChunk0, zChunk1;
518 zChunk0.gather(x, yChunk0);
519 zChunk1.gather(x, yChunk1);
520
521 // store
522 zChunk0.store(z, is_not_aligned);
523 zChunk1.store(z + vec_t_i::width, is_not_aligned);
524
525 // update pointers
526 y += 2 * vec_t_i::width;
527 z += 2 * vec_t::width;
528 cnt -= 2 * vec_t::width;
529 }
530
531 // Vectorized loop
532 while (cnt >= vec_t::width)
533 {
534 // load index
535 vec_t_i yChunk;
536 yChunk.load(y, is_not_aligned);
537
538 // z = x[y[i]]
539 vec_t zChunk;
540 zChunk.gather(x, yChunk);
541
542 // store
543 zChunk.store(z, is_not_aligned);
544
545 // update pointers
546 y += vec_t_i::width;
547 z += vec_t::width;
548 cnt -= vec_t::width;
549 }
550
551 // spillover loop
552 while (cnt)
553 {
554 // z = x[y[i]]
555 *z = *(x + *y);
556 // update pointers
557 ++y;
558 ++z;
559 --cnt;
560 }
561}
562
563} // namespace Vmath::SIMD
564#endif
std::vector< double > w(NPUPPER)
std::vector< double > z(NPUPPER)
tinysimd::simd< NekDouble > vec_t
void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvp (vector times vector plus vector): z = w*x + y
Definition: VmathSIMD.hpp:259
void Vadd(const size_t n, const T *x, const T *y, T *z)
Add vector z = x + y.
Definition: VmathSIMD.hpp:47
void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvm (vector times vector minus vector): z = w*x - y
Definition: VmathSIMD.hpp:307
void Vvtvvtm(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtm (vector times vector minus vector times vector):
Definition: VmathSIMD.hpp:414
void Gathr(const I n, const T *x, const I *y, T *z)
Gather vector z[i] = x[y[i]].
Definition: VmathSIMD.hpp:472
void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtp (vector times vector plus vector times vector):
Definition: VmathSIMD.hpp:356
void Vmul(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x * y.
Definition: VmathSIMD.hpp:153
static constexpr struct tinysimd::is_not_aligned_t is_not_aligned
typename abi< ScalarType, width >::type simd
Definition: tinysimd.hpp:80