48void QuadIProduct(
bool colldir0,
bool colldir1,
int numElmt,
int nquad0,
49 int nquad1,
int nmodes0,
int nmodes1,
56 int totpoints = nquad0 * nquad1;
57 int totmodes = nmodes0 * nmodes1;
59 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
61 if (colldir0 && colldir1)
63 Vmath::Vcopy(numElmt * totmodes, wsp.get(), 1, output.get(), 1);
70 for (
int i = 0; i < nquad0; ++i)
73 &wsp1[i * nquad1 * numElmt], 1);
78 Blas::Dgemm(
'T',
'N', nquad1 * numElmt, nmodes0, nquad0, 1.0,
79 &wsp[0], nquad0, base0.get(), nquad0, 0.0, &wsp1[0],
88 for (
int i = 0; i < nquad1; ++i)
91 &wsp[i * numElmt * nmodes0], 1);
97 Blas::Dgemm(
'T',
'N', numElmt * nmodes0, nmodes1, nquad1, 1.0,
98 &wsp1[0], nquad1, base1.get(), nquad1, 0.0, &wsp[0],
102 for (
int i = 0; i < totmodes; ++i)
112 for (
int i = 0; i < nquad1; ++i)
115 &output[i * numElmt * nmodes0], 1);
120 Blas::Dgemm(
'T',
'N', nmodes0, nmodes1, nquad1, 1.0, &wsp1[0],
121 nquad1, base1.get(), nquad1, 0.0, &output[0],
131void TriIProduct(
bool sortTopVertex,
int numElmt,
int nquad0,
int nquad1,
132 int nmodes0,
int nmodes1,
141 int totpoints = nquad0 * nquad1;
143 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
147 Blas::Dgemm(
'T',
'N', nquad1 * numElmt, nmodes0, nquad0, 1.0, &wsp[0],
148 nquad0, base0.get(), nquad0, 0.0, &wsp1[0], nquad1 * numElmt);
152 for (mode = i = 0; i < nmodes0; ++i)
154 Blas::Dgemm(
'T',
'N', nmodes1 - i, numElmt, nquad1, 1.0,
155 base1.get() + mode * nquad1, nquad1,
156 wsp1.get() + i * nquad1 * numElmt, nquad1, 0.0,
157 &output[mode], totmodes);
165 Blas::Dgemv(
'T', nquad1, numElmt, 1.0, wsp1.get() + nquad1 * numElmt,
166 nquad1, base1.get() + nquad1, 1, 1.0, &output[1], totmodes);
173void HexIProduct(
bool colldir0,
bool colldir1,
bool colldir2,
int numElmt,
174 int nquad0,
int nquad1,
int nquad2,
int nmodes0,
int nmodes1,
182 int totmodes = nmodes0 * nmodes1 * nmodes2;
183 int totpoints = nquad0 * nquad1 * nquad2;
185 if (colldir0 && colldir1 && colldir2)
188 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, output, 1);
192 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
199 if (numElmt < nmodes0 || 1)
204 for (
int n = 0; n < numElmt; ++n)
209 for (
int i = 0; i < nmodes0; ++i)
212 nquad0, wsp1.get() + nquad1 * nquad2 * i,
218 Blas::Dgemm(
'T',
'N', nquad1 * nquad2, nmodes0, nquad0, 1.0,
219 &wsp[n * totpoints], nquad0, base0.get(),
220 nquad0, 0.0, wsp1.get(), nquad1 * nquad2);
226 for (
int i = 0; i < nmodes1; ++i)
229 wsp2.get() + nquad2 * nmodes0 * i, 1);
234 Blas::Dgemm(
'T',
'N', nquad2 * nmodes0, nmodes1, nquad1,
235 1.0, wsp1.get(), nquad1, base1.get(), nquad1,
236 0.0, wsp2.get(), nquad2 * nmodes0);
242 for (
int i = 0; i < nmodes2; ++i)
245 nmodes0 * nmodes1, wsp2.get() + i, nquad2,
246 &output[n * totmodes] + nmodes0 * nmodes1 * i, 1);
251 Blas::Dgemm(
'T',
'N', nmodes0 * nmodes1, nmodes2, nquad2,
252 1.0, wsp2.get(), nquad2, base2.get(), nquad2,
253 0.0, &output[n * totmodes], nmodes0 * nmodes1);
260 wsp1 + numElmt * (max(totpoints, totmodes));
264 for (
int i = 0; i < nquad0; ++i)
266 Vmath::Vcopy(nquad1 * nquad2 * numElmt, &wsp[i], nquad0,
267 &wsp1[i * nquad1 * nquad2 * numElmt], 1);
273 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0,
274 nquad0, 1.0, &wsp[0], nquad0, base0.get(), nquad0,
275 0.0, &wsp1[0], nquad1 * nquad2 * numElmt);
280 for (
int i = 0; i < nquad1; ++i)
282 Vmath::Vcopy(nquad2 * numElmt * nmodes0, &wsp1[i], nquad1,
283 &wsp2[i * nquad2 * numElmt * nmodes0], 1);
288 Blas::Dgemm(
'T',
'N', nquad2 * numElmt * nmodes0, nmodes1,
289 nquad1, 1.0, &wsp1[0], nquad1, base1.get(), nquad1,
290 0.0, &wsp2[0], nquad2 * numElmt * nmodes0);
297 for (
int i = 0; i < nquad2; ++i)
300 &output[i * nmodes0 * nmodes1], 1);
305 Blas::Dgemm(
'T',
'N', numElmt * nmodes0 * nmodes1, nmodes2,
306 nquad2, 1.0, &wsp2[0], nquad2, base2.get(),
307 nquad2, 0.0, &wsp1[0],
308 numElmt * nmodes0 * nmodes1);
311 for (
int i = 0; i < totmodes; ++i)
313 Vmath::Vcopy(numElmt, &wsp1[i * numElmt], 1, &output[i],
321 for (
int i = 0; i < nquad2; ++i)
324 &output[i * nmodes0 * nmodes1], 1);
329 Blas::Dgemm(
'T',
'N', numElmt * nmodes0 * nmodes1, nmodes2,
330 nquad2, 1.0, &wsp2[0], nquad2, base2.get(),
331 nquad2, 0.0, &output[0],
332 numElmt * nmodes0 * nmodes1);
343 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
352 nmodes0, nmodes1, nmodes2);
353 int totpoints = nquad0 * nquad1 * nquad2;
357 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
360 wsp + numElmt * nquad2 * (max(nquad0 * nquad1, nmodes0 * nmodes1));
363 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
364 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
365 nquad1 * nquad2 * numElmt);
368 Blas::Dgemm(
'T',
'N', nquad2 * numElmt * nmodes0, nmodes1, nquad1, 1.0,
369 wsp1.get(), nquad1, base1.get(), nquad1, 0.0, wsp.get(),
370 nquad2 * numElmt * nmodes0);
374 mode = mode1 = cnt = 0;
375 for (
int i = 0; i < nmodes0; ++i)
377 cnt = i * nquad2 * numElmt;
378 for (
int j = 0; j < nmodes1; ++j)
380 Blas::Dgemm(
'T',
'N', nmodes2 - i, numElmt, nquad2, 1.0,
381 base2.get() + mode * nquad2, nquad2,
382 wsp.get() + j * nquad2 * numElmt * nmodes0 + cnt,
383 nquad2, 0.0, output.get() + mode1, totmodes);
384 mode1 += nmodes2 - i;
395 for (
int j = 0; j < nmodes1; ++j)
398 wsp.get() + j * nquad2 * numElmt * nmodes0 +
400 nquad2, base2.get() + nquad2, 1, 1.0,
401 &output[j * nmodes2 + 1], totmodes);
409void PyrIProduct(
bool sortTopVertex,
int numElmt,
int nquad0,
int nquad1,
410 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
419 nmodes0, nmodes1, nmodes2);
420 int totpoints = nquad0 * nquad1 * nquad2;
425 numElmt * (nquad1 * nquad2 * nmodes0 +
426 nquad2 * max(nquad0 * nquad1, nmodes0 * nmodes1)),
427 "Insufficient workspace size");
429 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
432 wsp + numElmt * nquad2 * (max(nquad0 * nquad1, nmodes0 * nmodes1));
435 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
436 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
437 nquad1 * nquad2 * numElmt);
441 for (
int i = 0; i < nmodes0; ++i)
443 Blas::Dgemm(
'T',
'N', nquad2 * numElmt, nmodes1, nquad1, 1.0,
444 wsp1.get() + i * nquad1 * nquad2 * numElmt, nquad1,
445 base1.get(), nquad1, 0.0,
446 wsp.get() + mode * nquad2 * numElmt, nquad2 * numElmt);
451 mode = mode1 = cnt = 0;
452 for (
int i = 0; i < nmodes0; ++i)
454 for (
int j = 0; j < nmodes1; ++j, ++cnt)
456 int ijmax = max(i, j);
457 Blas::Dgemm(
'T',
'N', nmodes2 - ijmax, numElmt, nquad2, 1.0,
458 base2.get() + mode * nquad2, nquad2,
459 wsp.get() + cnt * nquad2 * numElmt, nquad2, 0.0,
460 output.get() + mode1, totmodes);
461 mode += nmodes2 - ijmax;
462 mode1 += nmodes2 - ijmax;
466 for (
int j = nmodes1; j < nmodes2; ++j)
468 int ijmax = max(i, j);
469 mode += nmodes2 - ijmax;
477 for (
int n = 0; n < numElmt; ++n)
480 output[1 + n * totmodes] +=
482 &wsp[nquad2 * numElmt + n * nquad2], 1);
485 output[1 + n * totmodes] +=
487 &wsp[nquad2 * nmodes1 * numElmt + n * nquad2], 1);
491 nquad2, base2.get() + nquad2, 1,
492 &wsp[nquad2 * (nmodes1 + 1) * numElmt + n * nquad2], 1);
500void TetIProduct(
bool sortTopEdge,
int numElmt,
int nquad0,
int nquad1,
501 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
510 nmodes0, nmodes1, nmodes2);
511 int totpoints = nquad0 * nquad1 * nquad2;
515 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
520 (max(nquad0 * nquad1, nmodes0 * (2 * nmodes1 - nmodes0 + 1) / 2));
523 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
524 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
525 nquad1 * nquad2 * numElmt);
529 for (
int i = 0; i < nmodes0; ++i)
531 Blas::Dgemm(
'T',
'N', nquad2 * numElmt, nmodes1 - i, nquad1, 1.0,
532 wsp1.get() + i * nquad1 * nquad2 * numElmt, nquad1,
533 base1.get() + mode * nquad1, nquad1, 0.0,
534 wsp.get() + mode * nquad2 * numElmt, nquad2 * numElmt);
544 for (
int n = 0; n < numElmt; ++n)
547 wsp1.get() + numElmt * nquad1 * nquad2 +
549 nquad1, base1.get() + nquad1, 1, 1.0,
550 wsp.get() + nquad2 * numElmt + n * nquad2, 1);
555 mode = mode1 = cnt = 0;
556 for (
int i = 0; i < nmodes0; ++i)
558 for (
int j = 0; j < nmodes1 - i; ++j, ++cnt)
560 Blas::Dgemm(
'T',
'N', nmodes2 - i - j, numElmt, nquad2, 1.0,
561 base2.get() + mode * nquad2, nquad2,
562 wsp.get() + cnt * nquad2 * numElmt, nquad2, 0.0,
563 output.get() + mode1, totmodes);
564 mode += nmodes2 - i - j;
565 mode1 += nmodes2 - i - j;
569 mode += (nmodes2 - nmodes1) * (nmodes2 - nmodes1 + 1) / 2;
576 for (
int n = 0; n < numElmt; ++n)
579 output[1 + n * totmodes] +=
581 &wsp[nquad2 * numElmt + n * nquad2], 1);
584 output[1 + n * totmodes] +=
586 &wsp[nquad2 * nmodes1 * numElmt + n * nquad2], 1);
#define ASSERTL1(condition, msg)
Assert Level 1 – Debugging which is used whether in FULLDEBUG or DEBUG compilation mode....
static void Dgemv(const char &trans, const int &m, const int &n, const double &alpha, const double *a, const int &lda, const double *x, const int &incx, const double &beta, double *y, const int &incy)
BLAS level 2: Matrix vector multiply y = alpha A x plus beta y where A[m x n].
static double Ddot(const int &n, const double *x, const int &incx, const double *y, const int &incy)
BLAS level 1: output = .
static void Dgemm(const char &transa, const char &transb, const int &m, const int &n, const int &k, const double &alpha, const double *a, const int &lda, const double *b, const int &ldb, const double &beta, double *c, const int &ldc)
BLAS level 3: Matrix-matrix multiply C = A x B where op(A)[m x k], op(B)[k x n], C[m x n] DGEMM perfo...
void TetIProduct(bool sortTopEdge, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void PyrIProduct(bool sortTopVert, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void TriIProduct(bool sortTopVertex, int numElmt, int nquad0, int nquad1, int nmodes0, int nmodes1, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void QuadIProduct(bool colldir0, bool colldir1, int numElmt, int nquad0, int nquad1, int nmodes0, int nmodes1, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void PrismIProduct(bool sortTopVert, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void HexIProduct(bool colldir0, bool colldir1, bool colldir2, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
int getNumberOfCoefficients(int Na, int Nb, int Nc)
int getNumberOfCoefficients(int Na, int Nb, int Nc)
int getNumberOfCoefficients(int Na, int Nb, int Nc)
int getNumberOfCoefficients(int Na, int Nb)
The above copyright notice and this permission notice shall be included.
void Vmul(int n, const T *x, const int incx, const T *y, const int incy, T *z, const int incz)
Multiply vector z = x*y.
void Vcopy(int n, const T *x, const int incx, T *y, const int incy)