38 #ifndef NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
39 #define NEKTAR_LIB_LIBUTILITIES_BASSICUTILS_VMATHSIMD_HPP
// Vadd: element-wise vector addition, z = x + y.
// Enabled only when T is a floating-point type (std::enable_if on
// std::is_floating_point<T>).
// NOTE(review): this listing is an excerpt. The loads into the x/y chunks, the
// stores of the z chunks and most pointer/counter updates are not visible here.
50 template<
class T,
typename =
typename std::enable_if
51 <std::is_floating_point<T>::value>::type>
52 void Vadd(
const size_t n,
const T *x,
const T *y, T *z)
// Widest loop: taken while cnt covers at least four SIMD vectors.
// cnt is presumably the number of elements still to process; its
// initialisation is not visible in this excerpt.
59 while (cnt >= 4*vec_t::width)
62 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
68 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
// Four independent vector additions per iteration.
75 vec_t zChunk0 = xChunk0 + yChunk0;
76 vec_t zChunk1 = xChunk1 + yChunk1;
77 vec_t zChunk2 = xChunk2 + yChunk2;
78 vec_t zChunk3 = xChunk3 + yChunk3;
// Narrower loop: same operation, two SIMD vectors per iteration.
94 while (cnt >= 2*vec_t::width)
97 vec_t yChunk0, yChunk1;
101 vec_t xChunk0, xChunk1;
106 vec_t zChunk0 = xChunk0 + yChunk0;
107 vec_t zChunk1 = xChunk1 + yChunk1;
// Two vectors' worth of elements consumed by this iteration.
117 cnt-= 2*vec_t::width;
// Single-vector loop: one SIMD vector per iteration.
121 while (cnt >= vec_t::width)
130 vec_t zChunk = xChunk + yChunk;
// Vmul: element-wise vector multiplication, z = x * y.
// Enabled only when T is a floating-point type (std::enable_if on
// std::is_floating_point<T>).
// NOTE(review): this listing is an excerpt. The loads into the x/y chunks, the
// stores of the z chunks and the pointer updates are not visible here.
156 template<
class T,
typename =
typename std::enable_if
157 <std::is_floating_point<T>::value>::type >
158 void Vmul(
const size_t n,
const T *x,
const T *y, T *z)
// Widest loop: taken while cnt covers at least four SIMD vectors.
// cnt is presumably the number of elements still to process; its
// initialisation is not visible in this excerpt.
165 while (cnt >= 4*vec_t::width)
168 vec_t yChunk0, yChunk1, yChunk2, yChunk3;
174 vec_t xChunk0, xChunk1, xChunk2, xChunk3;
// Four independent vector products per iteration.
181 vec_t zChunk0 = xChunk0 * yChunk0;
182 vec_t zChunk1 = xChunk1 * yChunk1;
183 vec_t zChunk2 = xChunk2 * yChunk2;
184 vec_t zChunk3 = xChunk3 * yChunk3;
// Four vectors' worth of elements consumed by this iteration.
196 cnt-= 4*vec_t::width;
// Narrower loop: same operation, two SIMD vectors per iteration.
200 while (cnt >= 2*vec_t::width)
203 vec_t yChunk0, yChunk1;
207 vec_t xChunk0, xChunk1;
212 vec_t zChunk0 = xChunk0 * yChunk0;
213 vec_t zChunk1 = xChunk1 * yChunk1;
// Two vectors' worth of elements consumed by this iteration.
223 cnt-= 2*vec_t::width;
// Single-vector loop: one SIMD vector per iteration.
227 while (cnt >= vec_t::width)
236 vec_t zChunk = xChunk * yChunk;
// Vvtvp (vector times vector plus vector): z = w * x + y, element-wise.
// Enabled only when T is a floating-point type (std::enable_if on
// std::is_floating_point<T>).
// NOTE(review): this listing is an excerpt. Loads, stores, pointer updates and
// the loop enclosing the scalar statement are not visible here.
262 template<
class T,
typename =
typename std::enable_if
263 < std::is_floating_point<T>::value >::type >
264 void Vvtvp(
const size_t n,
const T *w,
const T *x,
const T *y, T *z)
// Vector loop: one SIMD vector per iteration (no unrolled variant is visible).
271 while (cnt >= vec_t::width)
282 vec_t zChunk = wChunk * xChunk + yChunk;
// Scalar form of the same update. Presumably this handles the elements left
// over after the vector loop; confirm against the full source.
299 *z = (*w) * (*x) + (*y);
// Vvtvm (vector times vector minus vector): z = w * x - y, element-wise.
// Enabled only when T is a floating-point type (std::enable_if on
// std::is_floating_point<T>).
// NOTE(review): this listing is an excerpt. Loads, stores, pointer updates and
// the loop enclosing the scalar statement are not visible here.
310 template<
class T,
typename =
typename std::enable_if
311 <std::is_floating_point<T>::value>::type >
312 void Vvtvm(
const size_t n,
const T *w,
const T *x,
const T *y, T *z)
// Vector loop: one SIMD vector per iteration (no unrolled variant is visible).
319 while (cnt >= vec_t::width)
330 vec_t zChunk = wChunk * xChunk - yChunk;
// Scalar form of the same update. Presumably this handles the elements left
// over after the vector loop; confirm against the full source.
347 *z = (*w) * (*x) - (*y);
// Vvtvvtp (vector times vector plus vector times vector): z = v * w + x * y,
// element-wise.
// Enabled only when T is a floating-point type (std::enable_if on
// std::is_floating_point<T>).
// NOTE(review): this listing is an excerpt. The tail of the parameter list,
// the loads, the stores and any remainder handling are not visible here.
359 template<
class T,
typename =
typename std::enable_if
360 <std::is_floating_point<T>::value>::type >
361 inline void Vvtvvtp (
const size_t n,
const T* v,
const T* w,
const T* x,
// Vector loop: one SIMD vector per iteration.
369 while (cnt >= vec_t::width)
// The two products are formed separately and then summed.
382 vec_t z1Chunk = vChunk * wChunk;
383 vec_t z2Chunk = xChunk * yChunk;
384 vec_t zChunk = z1Chunk + z2Chunk;
// Gathr: indexed gather, z[i] = x[y[i]].
// Enabled only when T is a floating-point type and I is an integral type
// (std::enable_if on std::is_floating_point<T> && std::is_integral<I>).
// Unlike the arithmetic routines in this file, the length n has type I here.
// NOTE(review): this listing is an excerpt. The loads of the index chunks, the
// stores of the z chunks and the definition of vec_t_i are not visible here.
416 template<
class T,
class I,
typename =
typename std::enable_if
417 < std::is_floating_point<T>::value &&
418 std::is_integral<I>::value >::type >
419 void Gathr(
const I n,
const T* x,
const I* y, T* z)
// Widest loop: taken while cnt covers at least four SIMD vectors.
// cnt is presumably the number of elements still to process; its
// initialisation is not visible in this excerpt.
427 while (cnt >= 4*vec_t::width)
// vec_t_i holds the indices; presumably the integer counterpart of vec_t.
430 vec_t_i yChunk0, yChunk1, yChunk2, yChunk3;
437 vec_t zChunk0, zChunk1, zChunk2, zChunk3;
// Each gather fills a value vector from x using the indices in one y chunk.
438 zChunk0.gather(x, yChunk0);
439 zChunk1.gather(x, yChunk1);
440 zChunk2.gather(x, yChunk2);
441 zChunk3.gather(x, yChunk3);
// Advance the index pointer past the four chunks just used. x is not advanced
// in the visible lines; every gather above takes x itself as the base.
450 y += 4*vec_t_i::width;
452 cnt-= 4*vec_t::width;
// Narrower loop: same operation, two SIMD vectors per iteration.
456 while (cnt >= 2*vec_t::width)
459 vec_t_i yChunk0, yChunk1;
464 vec_t zChunk0, zChunk1;
465 zChunk0.gather(x, yChunk0);
466 zChunk1.gather(x, yChunk1);
473 y += 2*vec_t_i::width;
475 cnt-= 2*vec_t::width;
// Single-vector loop: one SIMD vector per iteration.
479 while (cnt >= vec_t::width)
487 zChunk.gather(x, yChunk);
tinysimd::simd< NekDouble > vec_t
void Vvtvp(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvp (vector times vector plus vector): z = w*x + y
void Vadd(const size_t n, const T *x, const T *y, T *z)
Add vector z = x + y.
void Vvtvm(const size_t n, const T *w, const T *x, const T *y, T *z)
vvtvm (vector times vector minus vector): z = w*x - y
void Gathr(const I n, const T *x, const I *y, T *z)
Gather vector z[i] = x[y[i]].
void Vvtvvtp(const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
vvtvvtp (vector times vector plus vector times vector): z = v*w + x*y
void Vmul(const size_t n, const T *x, const T *y, T *z)
Multiply vector z = x * y.
static constexpr struct tinysimd::is_not_aligned_t is_not_aligned
typename abi< ScalarType >::type simd