37 #ifndef NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
38 #define NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
/// \brief Element-wise vector addition: z = x + y.
///
/// Only participates in overload resolution when T is a floating-point type
/// (the defaulted template parameter is an enable_if on
/// std::is_floating_point<T>).
///
/// \param n  element count (NOTE(review): presumably the length of x, y and
///           z -- confirm against callers)
/// \param x  first input array
/// \param y  second input array
/// \param z  output array
49 template <
class T,
typename =
typename std::enable_if<
50 std::is_floating_point<T>::value>::type>
51 void Vadd(
const size_t n,
const T *x,
const T *y, T *z)
// NOTE(review): cnt and vec_t are declared on lines not shown here; cnt
// appears to be the number of elements still to process -- confirm.
// 4x-unrolled pass: taken while at least four SIMD vectors' worth remain.
58 while (cnt >= 4 * vec_t::width)
61 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
67 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
// Four independent SIMD additions per iteration.
74 vec_t zChunk0 = xChunk0 + yChunk0;
75 vec_t zChunk1 = xChunk1 + yChunk1;
76 vec_t zChunk2 = xChunk2 + yChunk2;
77 vec_t zChunk3 = xChunk3 + yChunk3;
// Advance all three pointers, and shrink the remaining count, by the four
// vector widths just handled.
86 x += 4 * vec_t::width;
87 y += 4 * vec_t::width;
88 z += 4 * vec_t::width;
89 cnt -= 4 * vec_t::width;
// 2x-unrolled pass for what the 4x pass left over.
93 while (cnt >= 2 * vec_t::width)
96 vec_t yChunk0, yChunk1;
100 vec_t xChunk0, xChunk1;
105 vec_t zChunk0 = xChunk0 + yChunk0;
106 vec_t zChunk1 = xChunk1 + yChunk1;
113 x += 2 * vec_t::width;
114 y += 2 * vec_t::width;
115 z += 2 * vec_t::width;
116 cnt -= 2 * vec_t::width;
// Single-vector pass: one SIMD addition per iteration.
120 while (cnt >= vec_t::width)
129 vec_t zChunk = xChunk + yChunk;
/// \brief Element-wise vector multiplication: z = x * y.
///
/// Only participates in overload resolution when T is a floating-point type
/// (the defaulted template parameter is an enable_if on
/// std::is_floating_point<T>).
///
/// \param n  element count (NOTE(review): presumably the length of x, y and
///           z -- confirm against callers)
/// \param x  first input array
/// \param y  second input array
/// \param z  output array
155 template <
class T,
typename =
typename std::enable_if<
156 std::is_floating_point<T>::value>::type>
157 void Vmul(
const size_t n,
const T *x,
const T *y, T *z)
// NOTE(review): cnt and vec_t are declared on lines not shown here; cnt
// appears to be the number of elements still to process -- confirm.
// 4x-unrolled pass: taken while at least four SIMD vectors' worth remain.
164 while (cnt >= 4 * vec_t::width)
167 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
173 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
// Four independent SIMD products per iteration.
180 vec_t zChunk0 = xChunk0 * yChunk0;
181 vec_t zChunk1 = xChunk1 * yChunk1;
182 vec_t zChunk2 = xChunk2 * yChunk2;
183 vec_t zChunk3 = xChunk3 * yChunk3;
// Advance all three pointers, and shrink the remaining count, by the four
// vector widths just handled.
192 x += 4 * vec_t::width;
193 y += 4 * vec_t::width;
194 z += 4 * vec_t::width;
195 cnt -= 4 * vec_t::width;
// 2x-unrolled pass for what the 4x pass left over.
199 while (cnt >= 2 * vec_t::width)
202 vec_t yChunk0, yChunk1;
206 vec_t xChunk0, xChunk1;
211 vec_t zChunk0 = xChunk0 * yChunk0;
212 vec_t zChunk1 = xChunk1 * yChunk1;
219 x += 2 * vec_t::width;
220 y += 2 * vec_t::width;
221 z += 2 * vec_t::width;
222 cnt -= 2 * vec_t::width;
// Single-vector pass: one SIMD product per iteration.
226 while (cnt >= vec_t::width)
235 vec_t zChunk = xChunk * yChunk;
/// \brief vvtvp (vector times vector plus vector): z = w * x + y.
///
/// Only participates in overload resolution when T is a floating-point type.
///
/// \param n  element count (NOTE(review): presumably the length of every
///           array -- confirm against callers)
/// \param w  first factor
/// \param x  second factor
/// \param y  addend
/// \param z  output array
261 template <
class T,
typename =
typename std::enable_if<
262 std::is_floating_point<T>::value>::type>
263 void Vvtvp(
const size_t n,
const T *w,
const T *x,
const T *y, T *z)
// SIMD pass: one vector-width of elements per iteration (no unrolled variant
// is visible for this routine).
270 while (cnt >= vec_t::width)
281 vec_t zChunk = wChunk * xChunk + yChunk;
// Scalar form of the same expression; NOTE(review): presumably the body of a
// remainder loop for elements that do not fill a whole vector -- the
// enclosing loop is not shown here.
298 *z = (*w) * (*x) + (*y);
/// \brief vvtvm (vector times vector minus vector): z = w * x - y.
///
/// Only participates in overload resolution when T is a floating-point type.
///
/// \param n  element count (NOTE(review): presumably the length of every
///           array -- confirm against callers)
/// \param w  first factor
/// \param x  second factor
/// \param y  subtrahend
/// \param z  output array
309 template <
class T,
typename =
typename std::enable_if<
310 std::is_floating_point<T>::value>::type>
311 void Vvtvm(
const size_t n,
const T *w,
const T *x,
const T *y, T *z)
// SIMD pass: one vector-width of elements per iteration.
318 while (cnt >= vec_t::width)
329 vec_t zChunk = wChunk * xChunk - yChunk;
// Scalar form of the same expression; NOTE(review): presumably the body of a
// remainder loop for elements that do not fill a whole vector -- the
// enclosing loop is not shown here.
346 *z = (*w) * (*x) - (*y);
/// \brief vvtvvtp (vector times vector plus vector times vector):
///        z = v * w + x * y.
///
/// Only participates in overload resolution when T is a floating-point type.
///
/// \param n  element count (NOTE(review): presumably the length of every
///           array -- confirm against callers)
/// \param v  first factor of the first product
/// \param w  second factor of the first product
/// \param x  first factor of the second product
///
/// NOTE(review): the parameter list is cut off after x here; the body also
/// uses a y operand (second factor of the second product) and produces a z
/// result.
358 template <
class T,
typename =
typename std::enable_if<
359 std::is_floating_point<T>::value>::type>
360 inline void Vvtvvtp(
const size_t n,
const T *v,
const T *w,
const T *x,
// SIMD pass: one vector-width of elements per iteration.
368 while (cnt >= vec_t::width)
// The two products are formed separately and then summed.
381 vec_t z1Chunk = vChunk * wChunk;
382 vec_t z2Chunk = xChunk * yChunk;
383 vec_t zChunk = z1Chunk + z2Chunk;
/// \brief vvtvvtm (vector times vector minus vector times vector):
///        z = v * w - x * y.
///
/// Only participates in overload resolution when T is a floating-point type.
///
/// \param n  element count (NOTE(review): presumably the length of every
///           array -- confirm against callers)
/// \param v  first factor of the first product
/// \param w  second factor of the first product
/// \param x  first factor of the second (subtracted) product
///
/// NOTE(review): the parameter list is cut off after x here; the body also
/// uses a y operand (second factor of the subtracted product) and produces a
/// z result.
416 template <
class T,
typename =
typename std::enable_if<
417 std::is_floating_point<T>::value>::type>
418 inline void Vvtvvtm(
const size_t n,
const T *v,
const T *w,
const T *x,
// SIMD pass: one vector-width of elements per iteration.
426 while (cnt >= vec_t::width)
// The two products are formed separately; the second is subtracted from the
// first.
439 vec_t z1Chunk = vChunk * wChunk;
440 vec_t z2Chunk = xChunk * yChunk;
441 vec_t zChunk = z1Chunk - z2Chunk;
/// \brief Gather: z[i] = x[y[i]].
///
/// Only participates in overload resolution when T is a floating-point type
/// and I is an integral type (both conditions appear in the enable_if).
///
/// \param n  element count, passed as the index type I (NOTE(review):
///           presumably the length of y and z -- confirm against callers)
/// \param x  source array that the indices refer into
/// \param y  index array
/// \param z  output array
473 template <
class T,
class I,
474 typename =
typename std::enable_if<std::is_floating_point<T>::value &&
475 std::is_integral<I>::value>::type>
476 void Gathr(
const I n,
const T *x,
const I *y, T *z)
// NOTE(review): cnt, vec_t and vec_t_i are declared on lines not shown here;
// vec_t_i is used as the index-vector type and vec_t as the value-vector type.
// 4x-unrolled pass: taken while at least four value vectors' worth remain.
484 while (cnt >= 4 * vec_t::width)
487 vec_t_i yChunk0, yChunk1, yChunk2, yChunk3;
494 vec_t zChunk0, zChunk1, zChunk2, zChunk3;
// Each gather is given the base pointer x and one vector of indices.
495 zChunk0.gather(x, yChunk0);
496 zChunk1.gather(x, yChunk1);
497 zChunk2.gather(x, yChunk2);
498 zChunk3.gather(x, yChunk3);
// Only the index and output pointers move; x is not advanced in the visible
// lines. y steps by the index-vector width, z by the value-vector width.
// NOTE(review): this presumes vec_t_i::width == vec_t::width -- confirm.
507 y += 4 * vec_t_i::width;
508 z += 4 * vec_t::width;
509 cnt -= 4 * vec_t::width;
// 2x-unrolled pass for what the 4x pass left over.
513 while (cnt >= 2 * vec_t::width)
516 vec_t_i yChunk0, yChunk1;
521 vec_t zChunk0, zChunk1;
522 zChunk0.gather(x, yChunk0);
523 zChunk1.gather(x, yChunk1);
530 y += 2 * vec_t_i::width;
531 z += 2 * vec_t::width;
532 cnt -= 2 * vec_t::width;
// Single-vector pass: one gather per iteration.
536 while (cnt >= vec_t::width)
544 zChunk.gather(x, yChunk);
tinysimd::simd< NekDouble > vec_t
void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvp (vector times vector plus vector): z = w*x + y
void Vadd(const size_t n, const T *x, const T *y, T *z)
Add vector z = x + y.
void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvm (vector times vector minus vector): z = w*x - y
void Vvtvvtm(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtm (vector times vector minus vector times vector): z = v*w - x*y
void Gathr(const I n, const T *x, const I *y, T *z)
Gather vector z[i] = x[y[i]].
void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtp (vector times vector plus vector times vector): z = v*w + x*y
void Vmul(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x * y.
static constexpr struct tinysimd::is_not_aligned_t is_not_aligned
typename abi< ScalarType, width >::type simd