37 #ifndef NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
38 #define NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
/// Vadd: element-wise vector addition, z = x + y.
///
/// The defaulted template parameter restricts this overload to
/// floating-point T via std::enable_if / std::is_floating_point.
///
/// @param n  number of entries (NOTE(review): presumably the initial value
///           of cnt; the initialisation is not visible in this listing)
/// @param x  first input array
/// @param y  second input array
/// @param z  output array
///
/// NOTE(review): this listing omits lines (braces, the chunk loads/stores,
/// and the definitions of cnt and vec_t). The comments below describe only
/// the statements that are visible.
49 template <
class T,
typename =
typename std::enable_if<
50 std::is_floating_point<T>::value>::type>
51 void Vadd(
const size_t n,
const T *x,
const T *y, T *z)
// 4x-unrolled loop: runs while at least four SIMD vectors' worth of entries
// remain in cnt.
58 while (cnt >= 4 * vec_t::width)
61 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
67 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
// Four independent vector sums per iteration.
74 vec_t zChunk0 = xChunk0 + yChunk0;
75 vec_t zChunk1 = xChunk1 + yChunk1;
76 vec_t zChunk2 = xChunk2 + yChunk2;
77 vec_t zChunk3 = xChunk3 + yChunk3;
// Step all three pointers past the 4 * vec_t::width entries just handled
// and reduce the remaining count by the same amount.
86 x += 4 * vec_t::width;
87 y += 4 * vec_t::width;
88 z += 4 * vec_t::width;
89 cnt -= 4 * vec_t::width;
// 2x-unrolled loop: same pattern with two vectors per iteration.
93 while (cnt >= 2 * vec_t::width)
96 vec_t yChunk0, yChunk1;
100 vec_t xChunk0, xChunk1;
105 vec_t zChunk0 = xChunk0 + yChunk0;
106 vec_t zChunk1 = xChunk1 + yChunk1;
113 x += 2 * vec_t::width;
114 y += 2 * vec_t::width;
115 z += 2 * vec_t::width;
116 cnt -= 2 * vec_t::width;
// Single-vector loop: one SIMD vector per iteration.
// NOTE(review): handling of a remainder smaller than vec_t::width is not
// visible in this listing.
120 while (cnt >= vec_t::width)
129 vec_t zChunk = xChunk + yChunk;
/// Vmul: element-wise vector multiplication, z = x * y.
///
/// The defaulted template parameter restricts this overload to
/// floating-point T via std::enable_if / std::is_floating_point.
///
/// @param n  number of entries (NOTE(review): presumably the initial value
///           of cnt; the initialisation is not visible in this listing)
/// @param x  first input array
/// @param y  second input array
/// @param z  output array
///
/// NOTE(review): this listing omits lines (braces, the chunk loads/stores,
/// and the definitions of cnt and vec_t). The comments below describe only
/// the statements that are visible.
155 template <
class T,
typename =
typename std::enable_if<
156 std::is_floating_point<T>::value>::type>
157 void Vmul(
const size_t n,
const T *x,
const T *y, T *z)
// 4x-unrolled loop: runs while at least four SIMD vectors' worth of entries
// remain in cnt.
164 while (cnt >= 4 * vec_t::width)
167 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
173 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
// Four independent vector products per iteration.
180 vec_t zChunk0 = xChunk0 * yChunk0;
181 vec_t zChunk1 = xChunk1 * yChunk1;
182 vec_t zChunk2 = xChunk2 * yChunk2;
183 vec_t zChunk3 = xChunk3 * yChunk3;
// Step all three pointers past the 4 * vec_t::width entries just handled
// and reduce the remaining count by the same amount.
192 x += 4 * vec_t::width;
193 y += 4 * vec_t::width;
194 z += 4 * vec_t::width;
195 cnt -= 4 * vec_t::width;
// 2x-unrolled loop: same pattern with two vectors per iteration.
199 while (cnt >= 2 * vec_t::width)
202 vec_t yChunk0, yChunk1;
206 vec_t xChunk0, xChunk1;
211 vec_t zChunk0 = xChunk0 * yChunk0;
212 vec_t zChunk1 = xChunk1 * yChunk1;
219 x += 2 * vec_t::width;
220 y += 2 * vec_t::width;
221 z += 2 * vec_t::width;
222 cnt -= 2 * vec_t::width;
// Single-vector loop: one SIMD vector per iteration.
// NOTE(review): handling of a remainder smaller than vec_t::width is not
// visible in this listing.
226 while (cnt >= vec_t::width)
235 vec_t zChunk = xChunk * yChunk;
/// Vvtvp (vector times vector plus vector): z = w * x + y, element-wise.
///
/// The defaulted template parameter restricts this overload to
/// floating-point T via std::enable_if / std::is_floating_point.
///
/// @param n  number of entries (NOTE(review): presumably the initial value
///           of cnt; the initialisation is not visible in this listing)
/// @param w  first factor array
/// @param x  second factor array
/// @param y  addend array
/// @param z  output array
///
/// NOTE(review): this listing omits lines (braces, the chunk loads/stores,
/// and the definitions of cnt and vec_t).
261 template <
class T,
typename =
typename std::enable_if<
262 std::is_floating_point<T>::value>::type>
263 void Vvtvp(
const size_t n,
const T *w,
const T *x,
const T *y, T *z)
// SIMD loop: one vector per iteration while a full vector of entries remains.
270 while (cnt >= vec_t::width)
281 vec_t zChunk = wChunk * xChunk + yChunk;
// Scalar form of the same expression on single elements.
// NOTE(review): presumably the body of a remainder loop for the entries left
// after the SIMD loop; the enclosing loop is not visible here.
298 *z = (*w) * (*x) + (*y);
/// Vvtvm (vector times vector minus vector): z = w * x - y, element-wise.
///
/// The defaulted template parameter restricts this overload to
/// floating-point T via std::enable_if / std::is_floating_point.
///
/// @param n  number of entries (NOTE(review): presumably the initial value
///           of cnt; the initialisation is not visible in this listing)
/// @param w  first factor array
/// @param x  second factor array
/// @param y  array that is subtracted from the product
/// @param z  output array
///
/// NOTE(review): this listing omits lines (braces, the chunk loads/stores,
/// and the definitions of cnt and vec_t).
309 template <
class T,
typename =
typename std::enable_if<
310 std::is_floating_point<T>::value>::type>
311 void Vvtvm(
const size_t n,
const T *w,
const T *x,
const T *y, T *z)
// SIMD loop: one vector per iteration while a full vector of entries remains.
318 while (cnt >= vec_t::width)
329 vec_t zChunk = wChunk * xChunk - yChunk;
// Scalar form of the same expression on single elements.
// NOTE(review): presumably the body of a remainder loop for the entries left
// after the SIMD loop; the enclosing loop is not visible here.
346 *z = (*w) * (*x) - (*y);
/// Vvtvvtp (vector times vector plus vector times vector):
/// z = v * w + x * y, element-wise.
///
/// The defaulted template parameter restricts this overload to
/// floating-point T via std::enable_if / std::is_floating_point.
///
/// NOTE(review): the parameter list is cut off in this listing after x; the
/// remaining parameters, braces, chunk loads/stores and the definitions of
/// cnt and vec_t are not visible here.
358 template <
class T,
typename =
typename std::enable_if<
359 std::is_floating_point<T>::value>::type>
360 inline void Vvtvvtp(
const size_t n,
const T *v,
const T *w,
const T *x,
// SIMD loop: one vector per iteration while a full vector of entries remains.
368 while (cnt >= vec_t::width)
// The two products are formed separately and then summed.
381 vec_t z1Chunk = vChunk * wChunk;
382 vec_t z2Chunk = xChunk * yChunk;
383 vec_t zChunk = z1Chunk + z2Chunk;
/// Gathr: gather values from x at the positions listed in y into z.
///
/// The defaulted template parameter restricts this overload to a
/// floating-point value type T combined with an integral index type I.
///
/// @param n  number of entries (NOTE(review): presumably the initial value
///           of cnt; the initialisation is not visible in this listing)
/// @param x  source array, passed unchanged as the base of every gather call
/// @param y  index array
/// @param z  output array
///
/// NOTE(review): this listing omits lines (braces, the index-chunk loads, the
/// result stores, and the definitions of cnt, vec_t and vec_t_i).
415 template <
class T,
class I,
416 typename =
typename std::enable_if<std::is_floating_point<T>::value &&
417 std::is_integral<I>::value>::type>
418 void Gathr(
const I n,
const T *x,
const I *y, T *z)
// 4x-unrolled loop: runs while at least four SIMD vectors' worth of entries
// remain in cnt.
426 while (cnt >= 4 * vec_t::width)
429 vec_t_i yChunk0, yChunk1, yChunk2, yChunk3;
436 vec_t zChunk0, zChunk1, zChunk2, zChunk3;
// Each result vector is filled by a gather from x using one index vector.
437 zChunk0.gather(x, yChunk0);
438 zChunk1.gather(x, yChunk1);
439 zChunk2.gather(x, yChunk2);
440 zChunk3.gather(x, yChunk3);
// Only the index and output pointers move; x is not advanced in the visible
// lines. y steps by vec_t_i::width while z and cnt step by vec_t::width.
// NOTE(review): this presumably relies on both widths being equal - confirm.
449 y += 4 * vec_t_i::width;
450 z += 4 * vec_t::width;
451 cnt -= 4 * vec_t::width;
// 2x-unrolled loop: same pattern with two vectors per iteration.
455 while (cnt >= 2 * vec_t::width)
458 vec_t_i yChunk0, yChunk1;
463 vec_t zChunk0, zChunk1;
464 zChunk0.gather(x, yChunk0);
465 zChunk1.gather(x, yChunk1);
472 y += 2 * vec_t_i::width;
473 z += 2 * vec_t::width;
474 cnt -= 2 * vec_t::width;
// Single-vector loop: one gather per iteration.
// NOTE(review): handling of a remainder smaller than vec_t::width is not
// visible in this listing.
478 while (cnt >= vec_t::width)
486 zChunk.gather(x, yChunk);
tinysimd::simd< NekDouble > vec_t
void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvp (vector times vector plus vector): z = w*x + y
void Vadd(const size_t n, const T *x, const T *y, T *z)
Add vector z = x + y.
void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvm (vector times vector minus vector): z = w*x - y
void Gathr(const I n, const T *x, const I *y, T *z)
Gather vector z[i] = x[y[i]].
void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtp (vector times vector plus vector times vector): z = v*w + x*y
void Vmul(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x * y.
static constexpr struct tinysimd::is_not_aligned_t is_not_aligned
typename abi< ScalarType, width >::type simd