46void QuadIProduct(
bool colldir0,
bool colldir1,
int numElmt,
int nquad0,
47 int nquad1,
int nmodes0,
int nmodes1,
54 int totpoints = nquad0 * nquad1;
55 int totmodes = nmodes0 * nmodes1;
57 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
59 if (colldir0 && colldir1)
61 Vmath::Vcopy(numElmt * totmodes, wsp.get(), 1, output.get(), 1);
68 for (
int i = 0; i < nquad0; ++i)
71 &wsp1[i * nquad1 * numElmt], 1);
76 Blas::Dgemm(
'T',
'N', nquad1 * numElmt, nmodes0, nquad0, 1.0,
77 &wsp[0], nquad0, base0.get(), nquad0, 0.0, &wsp1[0],
85 for (
int i = 0; i < nquad1; ++i)
88 &wsp[i * numElmt * nmodes0], 1);
94 Blas::Dgemm(
'T',
'N', numElmt * nmodes0, nmodes1, nquad1, 1.0,
95 &wsp1[0], nquad1, base1.get(), nquad1, 0.0, &wsp[0],
99 for (
int i = 0; i < totmodes; ++i)
109 for (
int i = 0; i < nquad1; ++i)
112 &output[i * numElmt * nmodes0], 1);
117 Blas::Dgemm(
'T',
'N', nmodes0, nmodes1, nquad1, 1.0, &wsp1[0],
118 nquad1, base1.get(), nquad1, 0.0, &output[0],
128void TriIProduct(
bool sortTopVertex,
int numElmt,
int nquad0,
int nquad1,
129 int nmodes0,
int nmodes1,
138 int totpoints = nquad0 * nquad1;
140 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
144 Blas::Dgemm(
'T',
'N', nquad1 * numElmt, nmodes0, nquad0, 1.0, &wsp[0],
145 nquad0, base0.get(), nquad0, 0.0, &wsp1[0], nquad1 * numElmt);
149 for (mode = i = 0; i < nmodes0; ++i)
151 Blas::Dgemm(
'T',
'N', nmodes1 - i, numElmt, nquad1, 1.0,
152 base1.get() + mode * nquad1, nquad1,
153 wsp1.get() + i * nquad1 * numElmt, nquad1, 0.0,
154 &output[mode], totmodes);
162 Blas::Dgemv(
'T', nquad1, numElmt, 1.0, wsp1.get() + nquad1 * numElmt,
163 nquad1, base1.get() + nquad1, 1, 1.0, &output[1], totmodes);
170void HexIProduct(
bool colldir0,
bool colldir1,
bool colldir2,
int numElmt,
171 int nquad0,
int nquad1,
int nquad2,
int nmodes0,
int nmodes1,
179 int totmodes = nmodes0 * nmodes1 * nmodes2;
180 int totpoints = nquad0 * nquad1 * nquad2;
182 if (colldir0 && colldir1 && colldir2)
185 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, output, 1);
189 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
196 if (numElmt < nmodes0 || 1)
201 for (
int n = 0; n < numElmt; ++n)
205 for (
int i = 0; i < nmodes0; ++i)
208 nquad0, wsp1.get() + nquad1 * nquad2 * i,
214 Blas::Dgemm(
'T',
'N', nquad1 * nquad2, nmodes0, nquad0, 1.0,
215 &wsp[n * totpoints], nquad0, base0.get(),
216 nquad0, 0.0, wsp1.get(), nquad1 * nquad2);
222 for (
int i = 0; i < nmodes1; ++i)
225 wsp2.get() + nquad2 * nmodes0 * i, 1);
230 Blas::Dgemm(
'T',
'N', nquad2 * nmodes0, nmodes1, nquad1,
231 1.0, wsp1.get(), nquad1, base1.get(), nquad1,
232 0.0, wsp2.get(), nquad2 * nmodes0);
238 for (
int i = 0; i < nmodes2; ++i)
241 nmodes0 * nmodes1, wsp2.get() + i, nquad2,
242 &output[n * totmodes] + nmodes0 * nmodes1 * i, 1);
247 Blas::Dgemm(
'T',
'N', nmodes0 * nmodes1, nmodes2, nquad2,
248 1.0, wsp2.get(), nquad2, base2.get(), nquad2,
249 0.0, &output[n * totmodes], nmodes0 * nmodes1);
256 wsp1 + numElmt * (max(totpoints, totmodes));
260 for (
int i = 0; i < nquad0; ++i)
262 Vmath::Vcopy(nquad1 * nquad2 * numElmt, &wsp[i], nquad0,
263 &wsp1[i * nquad1 * nquad2 * numElmt], 1);
269 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0,
270 nquad0, 1.0, &wsp[0], nquad0, base0.get(), nquad0,
271 0.0, &wsp1[0], nquad1 * nquad2 * numElmt);
276 for (
int i = 0; i < nquad1; ++i)
278 Vmath::Vcopy(nquad2 * numElmt * nmodes0, &wsp1[i], nquad1,
279 &wsp2[i * nquad2 * numElmt * nmodes0], 1);
284 Blas::Dgemm(
'T',
'N', nquad2 * numElmt * nmodes0, nmodes1,
285 nquad1, 1.0, &wsp1[0], nquad1, base1.get(), nquad1,
286 0.0, &wsp2[0], nquad2 * numElmt * nmodes0);
293 for (
int i = 0; i < nquad2; ++i)
296 &output[i * nmodes0 * nmodes1], 1);
301 Blas::Dgemm(
'T',
'N', numElmt * nmodes0 * nmodes1, nmodes2,
302 nquad2, 1.0, &wsp2[0], nquad2, base2.get(),
303 nquad2, 0.0, &wsp1[0],
304 numElmt * nmodes0 * nmodes1);
307 for (
int i = 0; i < totmodes; ++i)
309 Vmath::Vcopy(numElmt, &wsp1[i * numElmt], 1, &output[i],
317 for (
int i = 0; i < nquad2; ++i)
320 &output[i * nmodes0 * nmodes1], 1);
325 Blas::Dgemm(
'T',
'N', numElmt * nmodes0 * nmodes1, nmodes2,
326 nquad2, 1.0, &wsp2[0], nquad2, base2.get(),
327 nquad2, 0.0, &output[0],
328 numElmt * nmodes0 * nmodes1);
339 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
348 nmodes0, nmodes1, nmodes2);
349 int totpoints = nquad0 * nquad1 * nquad2;
353 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
356 wsp + numElmt * nquad2 * (max(nquad0 * nquad1, nmodes0 * nmodes1));
359 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
360 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
361 nquad1 * nquad2 * numElmt);
364 Blas::Dgemm(
'T',
'N', nquad2 * numElmt * nmodes0, nmodes1, nquad1, 1.0,
365 wsp1.get(), nquad1, base1.get(), nquad1, 0.0, wsp.get(),
366 nquad2 * numElmt * nmodes0);
370 mode = mode1 = cnt = 0;
371 for (
int i = 0; i < nmodes0; ++i)
373 cnt = i * nquad2 * numElmt;
374 for (
int j = 0; j < nmodes1; ++j)
376 Blas::Dgemm(
'T',
'N', nmodes2 - i, numElmt, nquad2, 1.0,
377 base2.get() + mode * nquad2, nquad2,
378 wsp.get() + j * nquad2 * numElmt * nmodes0 + cnt,
379 nquad2, 0.0, output.get() + mode1, totmodes);
380 mode1 += nmodes2 - i;
391 for (
int j = 0; j < nmodes1; ++j)
394 wsp.get() + j * nquad2 * numElmt * nmodes0 +
396 nquad2, base2.get() + nquad2, 1, 1.0,
397 &output[j * nmodes2 + 1], totmodes);
405void PyrIProduct(
bool sortTopVertex,
int numElmt,
int nquad0,
int nquad1,
406 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
415 nmodes0, nmodes1, nmodes2);
416 int totpoints = nquad0 * nquad1 * nquad2;
421 numElmt * (nquad1 * nquad2 * nmodes0 +
422 nquad2 * max(nquad0 * nquad1, nmodes0 * nmodes1)),
423 "Insufficient workspace size");
425 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
428 wsp + numElmt * nquad2 * (max(nquad0 * nquad1, nmodes0 * nmodes1));
431 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
432 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
433 nquad1 * nquad2 * numElmt);
437 for (
int i = 0; i < nmodes0; ++i)
439 Blas::Dgemm(
'T',
'N', nquad2 * numElmt, nmodes1, nquad1, 1.0,
440 wsp1.get() + i * nquad1 * nquad2 * numElmt, nquad1,
441 base1.get(), nquad1, 0.0,
442 wsp.get() + mode * nquad2 * numElmt, nquad2 * numElmt);
447 mode = mode1 = cnt = 0;
448 for (
int i = 0; i < nmodes0; ++i)
450 for (
int j = 0; j < nmodes1; ++j, ++cnt)
452 int ijmax = max(i, j);
453 Blas::Dgemm(
'T',
'N', nmodes2 - ijmax, numElmt, nquad2, 1.0,
454 base2.get() + mode * nquad2, nquad2,
455 wsp.get() + cnt * nquad2 * numElmt, nquad2, 0.0,
456 output.get() + mode1, totmodes);
457 mode += nmodes2 - ijmax;
458 mode1 += nmodes2 - ijmax;
462 for (
int j = nmodes1; j < nmodes2; ++j)
464 int ijmax = max(i, j);
465 mode += nmodes2 - ijmax;
473 for (
int n = 0; n < numElmt; ++n)
476 output[1 + n * totmodes] +=
478 &wsp[nquad2 * numElmt + n * nquad2], 1);
481 output[1 + n * totmodes] +=
483 &wsp[nquad2 * nmodes1 * numElmt + n * nquad2], 1);
487 nquad2, base2.get() + nquad2, 1,
488 &wsp[nquad2 * (nmodes1 + 1) * numElmt + n * nquad2], 1);
496void TetIProduct(
bool sortTopEdge,
int numElmt,
int nquad0,
int nquad1,
497 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
506 nmodes0, nmodes1, nmodes2);
507 int totpoints = nquad0 * nquad1 * nquad2;
511 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
516 (max(nquad0 * nquad1, nmodes0 * (2 * nmodes1 - nmodes0 + 1) / 2));
519 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
520 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
521 nquad1 * nquad2 * numElmt);
525 for (
int i = 0; i < nmodes0; ++i)
527 Blas::Dgemm(
'T',
'N', nquad2 * numElmt, nmodes1 - i, nquad1, 1.0,
528 wsp1.get() + i * nquad1 * nquad2 * numElmt, nquad1,
529 base1.get() + mode * nquad1, nquad1, 0.0,
530 wsp.get() + mode * nquad2 * numElmt, nquad2 * numElmt);
540 for (
int n = 0; n < numElmt; ++n)
543 wsp1.get() + numElmt * nquad1 * nquad2 +
545 nquad1, base1.get() + nquad1, 1, 1.0,
546 wsp.get() + nquad2 * numElmt + n * nquad2, 1);
551 mode = mode1 = cnt = 0;
552 for (
int i = 0; i < nmodes0; ++i)
554 for (
int j = 0; j < nmodes1 - i; ++j, ++cnt)
556 Blas::Dgemm(
'T',
'N', nmodes2 - i - j, numElmt, nquad2, 1.0,
557 base2.get() + mode * nquad2, nquad2,
558 wsp.get() + cnt * nquad2 * numElmt, nquad2, 0.0,
559 output.get() + mode1, totmodes);
560 mode += nmodes2 - i - j;
561 mode1 += nmodes2 - i - j;
565 mode += (nmodes2 - nmodes1) * (nmodes2 - nmodes1 + 1) / 2;
572 for (
int n = 0; n < numElmt; ++n)
575 output[1 + n * totmodes] +=
577 &wsp[nquad2 * numElmt + n * nquad2], 1);
580 output[1 + n * totmodes] +=
582 &wsp[nquad2 * nmodes1 * numElmt + n * nquad2], 1);
#define ASSERTL1(condition, msg)
Assert Level 1 – Debugging which is used whether in FULLDEBUG or DEBUG compilation mode....
static void Dgemv(const char &trans, const int &m, const int &n, const double &alpha, const double *a, const int &lda, const double *x, const int &incx, const double &beta, double *y, const int &incy)
BLAS level 2: Matrix vector multiply y = alpha A x plus beta y where A[m x n].
static double Ddot(const int &n, const double *x, const int &incx, const double *y, const int &incy)
BLAS level 1: output = .
static void Dgemm(const char &transa, const char &transb, const int &m, const int &n, const int &k, const double &alpha, const double *a, const int &lda, const double *b, const int &ldb, const double &beta, double *c, const int &ldc)
BLAS level 3: Matrix-matrix multiply C = A x B where op(A)[m x k], op(B)[k x n], C[m x n] DGEMM perfo...
void TetIProduct(bool sortTopEdge, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void PyrIProduct(bool sortTopVert, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void TriIProduct(bool sortTopVertex, int numElmt, int nquad0, int nquad1, int nmodes0, int nmodes1, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void QuadIProduct(bool colldir0, bool colldir1, int numElmt, int nquad0, int nquad1, int nmodes0, int nmodes1, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void PrismIProduct(bool sortTopVert, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void HexIProduct(bool colldir0, bool colldir1, bool colldir2, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
int getNumberOfCoefficients(int Na, int Nb, int Nc)
int getNumberOfCoefficients(int Na, int Nb, int Nc)
int getNumberOfCoefficients(int Na, int Nb, int Nc)
int getNumberOfCoefficients(int Na, int Nb)
void Vmul(int n, const T *x, const int incx, const T *y, const int incy, T *z, const int incz)
Multiply vector z = x*y.
void Vcopy(int n, const T *x, const int incx, T *y, const int incy)