Functions
template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>
void	Vadd (const size_t n, const T *x, const T *y, T *z)
	Add vector z = x + y. More...

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>
void	Vmul (const size_t n, const T *x, const T *y, T *z)
	Multiply vector z = x * y. More...

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>
void	Vvtvp (const size_t n, const T *w, const T *x, const T *y, T *z)
	vvtvp (vector times vector plus vector): z = w*x + y More...

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>
void	Vvtvm (const size_t n, const T *w, const T *x, const T *y, T *z)
	vvtvm (vector times vector minus vector): z = w*x - y More...

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>
void	Vvtvvtp (const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
	vvtvvtp (vector times vector plus vector times vector): z = v*w + x*y More...

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>
void	Vvtvvtm (const size_t n, const T *v, const T *w, const T *x, const T *y, T *z)
	vvtvvtm (vector times vector minus vector times vector): z = v*w - x*y More...

template<class T , class I , typename = typename std::enable_if<std::is_floating_point<T>::value && std::is_integral<I>::value>::type>
void	Gathr (const I n, const T *x, const I *y, T *z)
	Gather vector z[i] = x[y[i]]. More...

Function Documentation

◆ Gathr()

template<class T , class I , typename = typename std::enable_if<std::is_floating_point<T>::value && std::is_integral<I>::value>::type>

void Vmath::SIMD::Gathr	(	const I	n,
		const T *	x,
		const I *	y,
		T *	z
	)

Gather vector z[i] = x[y[i]].

Definition at line 472 of file VmathSIMD.hpp.

{
    using namespace tinysimd;
    using vec_t   = simd<T>;
    using vec_t_i = simd<I, vec_t::width>;

    // Gather: z[i] = x[y[i]] for the n indices stored in y.
    //
    // Lane counts of the value and index vectors. `auto` keeps whatever
    // integer type the SIMD types declare, so the comparisons against the
    // I-typed counter below behave exactly as if ::width were used directly.
    // NOTE(review): the two widths are presumably equal (vec_t_i is built
    // from vec_t::width) -- confirm against the tinysimd definitions.
    constexpr auto lanes    = vec_t::width;
    constexpr auto idxLanes = vec_t_i::width;

    I remaining = n;

    // Four vectors per pass
    for (; remaining >= 4 * lanes; remaining -= 4 * lanes)
    {
        // fetch four vectors of indices
        vec_t_i idx0, idx1, idx2, idx3;
        idx0.load(y, is_not_aligned);
        idx1.load(y + idxLanes, is_not_aligned);
        idx2.load(y + 2 * idxLanes, is_not_aligned);
        idx3.load(y + 3 * idxLanes, is_not_aligned);

        // pull x[idx] into the output vectors
        vec_t out0, out1, out2, out3;
        out0.gather(x, idx0);
        out1.gather(x, idx1);
        out2.gather(x, idx2);
        out3.gather(x, idx3);

        // write the results
        out0.store(z, is_not_aligned);
        out1.store(z + idxLanes, is_not_aligned);
        out2.store(z + 2 * idxLanes, is_not_aligned);
        out3.store(z + 3 * idxLanes, is_not_aligned);

        // advance (x is the lookup table and never moves)
        y += 4 * idxLanes;
        z += 4 * lanes;
    }

    // Two vectors per pass
    for (; remaining >= 2 * lanes; remaining -= 2 * lanes)
    {
        vec_t_i idx0, idx1;
        idx0.load(y, is_not_aligned);
        idx1.load(y + idxLanes, is_not_aligned);

        vec_t out0, out1;
        out0.gather(x, idx0);
        out1.gather(x, idx1);

        out0.store(z, is_not_aligned);
        out1.store(z + idxLanes, is_not_aligned);

        y += 2 * idxLanes;
        z += 2 * lanes;
    }

    // One vector per pass
    for (; remaining >= lanes; remaining -= lanes)
    {
        vec_t_i idx;
        idx.load(y, is_not_aligned);

        vec_t out;
        out.gather(x, idx);

        out.store(z, is_not_aligned);

        y += idxLanes;
        z += lanes;
    }

    // Scalar tail: fewer than one full vector left
    for (; remaining; --remaining)
    {
        *z = x[*y];
        ++y;
        ++z;
    }
}

References tinysimd::is_not_aligned, and Nektar::UnitTests::z().

Referenced by Nektar::VmathSIMDUnitTests::BOOST_AUTO_TEST_CASE(), Vmath::Gathr(), and main().

◆ Vadd()

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>

void Vmath::SIMD::Vadd	(	const size_t	n,
		const T *	x,
		const T *	y,
		T *	z
	)

Add vector z = x + y.

Definition at line 47 of file VmathSIMD.hpp.

{
    using namespace tinysimd;
    using vec_t = simd<T>;

    // Element-wise sum: z[i] = x[i] + y[i] for i in [0, n).
    // All loads and stores pass the is_not_aligned flag.
    constexpr auto lanes = vec_t::width;

    size_t remaining = n;

    // Four vectors per pass
    for (; remaining >= 4 * lanes; remaining -= 4 * lanes)
    {
        // read both operands
        vec_t lhs0, lhs1, lhs2, lhs3;
        lhs0.load(x, is_not_aligned);
        lhs1.load(x + lanes, is_not_aligned);
        lhs2.load(x + 2 * lanes, is_not_aligned);
        lhs3.load(x + 3 * lanes, is_not_aligned);

        vec_t rhs0, rhs1, rhs2, rhs3;
        rhs0.load(y, is_not_aligned);
        rhs1.load(y + lanes, is_not_aligned);
        rhs2.load(y + 2 * lanes, is_not_aligned);
        rhs3.load(y + 3 * lanes, is_not_aligned);

        // add
        vec_t sum0 = lhs0 + rhs0;
        vec_t sum1 = lhs1 + rhs1;
        vec_t sum2 = lhs2 + rhs2;
        vec_t sum3 = lhs3 + rhs3;

        // write the results
        sum0.store(z, is_not_aligned);
        sum1.store(z + lanes, is_not_aligned);
        sum2.store(z + 2 * lanes, is_not_aligned);
        sum3.store(z + 3 * lanes, is_not_aligned);

        // advance
        x += 4 * lanes;
        y += 4 * lanes;
        z += 4 * lanes;
    }

    // Two vectors per pass
    for (; remaining >= 2 * lanes; remaining -= 2 * lanes)
    {
        vec_t lhs0, lhs1;
        lhs0.load(x, is_not_aligned);
        lhs1.load(x + lanes, is_not_aligned);

        vec_t rhs0, rhs1;
        rhs0.load(y, is_not_aligned);
        rhs1.load(y + lanes, is_not_aligned);

        vec_t sum0 = lhs0 + rhs0;
        vec_t sum1 = lhs1 + rhs1;

        sum0.store(z, is_not_aligned);
        sum1.store(z + lanes, is_not_aligned);

        x += 2 * lanes;
        y += 2 * lanes;
        z += 2 * lanes;
    }

    // One vector per pass
    for (; remaining >= lanes; remaining -= lanes)
    {
        vec_t lhs;
        lhs.load(x, is_not_aligned);
        vec_t rhs;
        rhs.load(y, is_not_aligned);

        vec_t sum = lhs + rhs;

        sum.store(z, is_not_aligned);

        x += lanes;
        y += lanes;
        z += lanes;
    }

    // Scalar tail: fewer than one full vector left
    for (; remaining; --remaining)
    {
        *z = (*x) + (*y);
        ++x;
        ++y;
        ++z;
    }
}

References tinysimd::is_not_aligned, and Nektar::UnitTests::z().

Referenced by Nektar::VmathSIMDUnitTests::BOOST_AUTO_TEST_CASE(), and Vmath::Vadd().

◆ Vmul()

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>

void Vmath::SIMD::Vmul	(	const size_t	n,
		const T *	x,
		const T *	y,
		T *	z
	)

Multiply vector z = x * y.

Definition at line 153 of file VmathSIMD.hpp.

{
    using namespace tinysimd;
    using vec_t = simd<T>;

    // Element-wise product: z[i] = x[i] * y[i] for i in [0, n).
    // All loads and stores pass the is_not_aligned flag.
    constexpr auto lanes = vec_t::width;

    size_t remaining = n;

    // Four vectors per pass
    for (; remaining >= 4 * lanes; remaining -= 4 * lanes)
    {
        // read both operands
        vec_t lhs0, lhs1, lhs2, lhs3;
        lhs0.load(x, is_not_aligned);
        lhs1.load(x + lanes, is_not_aligned);
        lhs2.load(x + 2 * lanes, is_not_aligned);
        lhs3.load(x + 3 * lanes, is_not_aligned);

        vec_t rhs0, rhs1, rhs2, rhs3;
        rhs0.load(y, is_not_aligned);
        rhs1.load(y + lanes, is_not_aligned);
        rhs2.load(y + 2 * lanes, is_not_aligned);
        rhs3.load(y + 3 * lanes, is_not_aligned);

        // multiply
        vec_t prod0 = lhs0 * rhs0;
        vec_t prod1 = lhs1 * rhs1;
        vec_t prod2 = lhs2 * rhs2;
        vec_t prod3 = lhs3 * rhs3;

        // write the results
        prod0.store(z, is_not_aligned);
        prod1.store(z + lanes, is_not_aligned);
        prod2.store(z + 2 * lanes, is_not_aligned);
        prod3.store(z + 3 * lanes, is_not_aligned);

        // advance
        x += 4 * lanes;
        y += 4 * lanes;
        z += 4 * lanes;
    }

    // Two vectors per pass
    for (; remaining >= 2 * lanes; remaining -= 2 * lanes)
    {
        vec_t lhs0, lhs1;
        lhs0.load(x, is_not_aligned);
        lhs1.load(x + lanes, is_not_aligned);

        vec_t rhs0, rhs1;
        rhs0.load(y, is_not_aligned);
        rhs1.load(y + lanes, is_not_aligned);

        vec_t prod0 = lhs0 * rhs0;
        vec_t prod1 = lhs1 * rhs1;

        prod0.store(z, is_not_aligned);
        prod1.store(z + lanes, is_not_aligned);

        x += 2 * lanes;
        y += 2 * lanes;
        z += 2 * lanes;
    }

    // One vector per pass
    for (; remaining >= lanes; remaining -= lanes)
    {
        vec_t lhs;
        lhs.load(x, is_not_aligned);
        vec_t rhs;
        rhs.load(y, is_not_aligned);

        vec_t prod = lhs * rhs;

        prod.store(z, is_not_aligned);

        x += lanes;
        y += lanes;
        z += lanes;
    }

    // Scalar tail: fewer than one full vector left
    for (; remaining; --remaining)
    {
        *z = (*x) * (*y);
        ++x;
        ++y;
        ++z;
    }
}

References tinysimd::is_not_aligned, and Nektar::UnitTests::z().

Referenced by Nektar::VmathSIMDUnitTests::BOOST_AUTO_TEST_CASE(), and Vmath::Vmul().

◆ Vvtvm()

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>

void Vmath::SIMD::Vvtvm	(	const size_t	n,
		const T *	w,
		const T *	x,
		const T *	y,
		T *	z
	)

vvtvm (vector times vector minus vector): z = w*x - y

Definition at line 307 of file VmathSIMD.hpp.

{
    using namespace tinysimd;
    using vec_t = simd<T>;

    // Multiply-subtract: z[i] = w[i] * x[i] - y[i] for i in [0, n).
    // All loads and stores pass the is_not_aligned flag.
    constexpr auto lanes = vec_t::width;

    size_t remaining = n;

    // One vector per pass
    for (; remaining >= lanes; remaining -= lanes)
    {
        // read the three operands
        vec_t wVec, xVec, yVec;
        wVec.load(w, is_not_aligned);
        xVec.load(x, is_not_aligned);
        yVec.load(y, is_not_aligned);

        // w * x - y
        vec_t result = wVec * xVec - yVec;

        // write the result
        result.store(z, is_not_aligned);

        // advance
        w += lanes;
        x += lanes;
        y += lanes;
        z += lanes;
    }

    // Scalar tail: fewer than one full vector left
    for (; remaining; --remaining)
    {
        *z = (*w) * (*x) - (*y);
        ++w;
        ++x;
        ++y;
        ++z;
    }
}

References tinysimd::is_not_aligned, Nektar::UnitTests::w(), and Nektar::UnitTests::z().

Referenced by Vmath::Vvtvm().

◆ Vvtvp()

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>

void Vmath::SIMD::Vvtvp	(	const size_t	n,
		const T *	w,
		const T *	x,
		const T *	y,
		T *	z
	)

vvtvp (vector times vector plus vector): z = w*x + y

Definition at line 259 of file VmathSIMD.hpp.

{
    using namespace tinysimd;
    using vec_t = simd<T>;

    // Multiply-add: z[i] = w[i] * x[i] + y[i] for i in [0, n).
    // All loads and stores pass the is_not_aligned flag.
    constexpr auto lanes = vec_t::width;

    size_t remaining = n;

    // One vector per pass
    for (; remaining >= lanes; remaining -= lanes)
    {
        // read the three operands
        vec_t wVec, xVec, yVec;
        wVec.load(w, is_not_aligned);
        xVec.load(x, is_not_aligned);
        yVec.load(y, is_not_aligned);

        // w * x + y
        vec_t result = wVec * xVec + yVec;

        // write the result
        result.store(z, is_not_aligned);

        // advance
        w += lanes;
        x += lanes;
        y += lanes;
        z += lanes;
    }

    // Scalar tail: fewer than one full vector left
    for (; remaining; --remaining)
    {
        *z = (*w) * (*x) + (*y);
        ++w;
        ++x;
        ++y;
        ++z;
    }
}

References tinysimd::is_not_aligned, Nektar::UnitTests::w(), and Nektar::UnitTests::z().

Referenced by Nektar::VmathSIMDUnitTests::BOOST_AUTO_TEST_CASE(), and Vmath::Vvtvp().

◆ Vvtvvtm()

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>

void Vmath::SIMD::Vvtvvtm	(	const size_t	n,
		const T *	v,
		const T *	w,
		const T *	x,
		const T *	y,
		T *	z
	)

inline

vvtvvtm (vector times vector minus vector times vector): z = v*w - x*y

Definition at line 414 of file VmathSIMD.hpp.

{
    using namespace tinysimd;
    using vec_t = simd<T>;

    // Difference of products: z[i] = v[i] * w[i] - x[i] * y[i] for i in [0, n).
    // All loads and stores pass the is_not_aligned flag.
    constexpr auto lanes = vec_t::width;

    size_t remaining = n;

    // One vector per pass
    for (; remaining >= lanes; remaining -= lanes)
    {
        // read the four operands
        vec_t vVec, wVec, xVec, yVec;
        vVec.load(v, is_not_aligned);
        wVec.load(w, is_not_aligned);
        xVec.load(x, is_not_aligned);
        yVec.load(y, is_not_aligned);

        // z = v * w - x * y
        vec_t firstProd  = vVec * wVec;
        vec_t secondProd = xVec * yVec;
        vec_t result     = firstProd - secondProd;

        // write the result
        result.store(z, is_not_aligned);

        // advance
        v += lanes;
        w += lanes;
        x += lanes;
        y += lanes;
        z += lanes;
    }

    // Scalar tail: fewer than one full vector left
    for (; remaining; --remaining)
    {
        // z = v * w - x * y
        T firstProd  = (*v) * (*w);
        T secondProd = (*x) * (*y);
        *z           = firstProd - secondProd;
        ++v;
        ++w;
        ++x;
        ++y;
        ++z;
    }
}

References tinysimd::is_not_aligned, Nektar::UnitTests::w(), and Nektar::UnitTests::z().

Referenced by Vmath::Vvtvvtm().

◆ Vvtvvtp()

template<class T , typename = typename std::enable_if< std::is_floating_point<T>::value>::type>

void Vmath::SIMD::Vvtvvtp	(	const size_t	n,
		const T *	v,
		const T *	w,
		const T *	x,
		const T *	y,
		T *	z
	)

inline

vvtvvtp (vector times vector plus vector times vector): z = v*w + x*y

Definition at line 356 of file VmathSIMD.hpp.

{
    using namespace tinysimd;
    using vec_t = simd<T>;

    // Sum of products: z[i] = v[i] * w[i] + x[i] * y[i] for i in [0, n).
    // All loads and stores pass the is_not_aligned flag.
    constexpr auto lanes = vec_t::width;

    size_t remaining = n;

    // One vector per pass
    for (; remaining >= lanes; remaining -= lanes)
    {
        // read the four operands
        vec_t vVec, wVec, xVec, yVec;
        vVec.load(v, is_not_aligned);
        wVec.load(w, is_not_aligned);
        xVec.load(x, is_not_aligned);
        yVec.load(y, is_not_aligned);

        // z = v * w + x * y
        vec_t firstProd  = vVec * wVec;
        vec_t secondProd = xVec * yVec;
        vec_t result     = firstProd + secondProd;

        // write the result
        result.store(z, is_not_aligned);

        // advance
        v += lanes;
        w += lanes;
        x += lanes;
        y += lanes;
        z += lanes;
    }

    // Scalar tail: fewer than one full vector left
    for (; remaining; --remaining)
    {
        // z = v * w + x * y
        T firstProd  = (*v) * (*w);
        T secondProd = (*x) * (*y);
        *z           = firstProd + secondProd;
        ++v;
        ++w;
        ++x;
        ++y;
        ++z;
    }
}

References tinysimd::is_not_aligned, Nektar::UnitTests::w(), and Nektar::UnitTests::z().

Referenced by Nektar::VmathSIMDUnitTests::BOOST_AUTO_TEST_CASE(), and Vmath::Vvtvvtp().

Functions

Function Documentation

◆ Gathr()

◆ Vadd()

◆ Vmul()

◆ Vvtvm()

◆ Vvtvp()

◆ Vvtvvtm()

◆ Vvtvvtp()