48 void QuadIProduct(
bool colldir0,
bool colldir1,
int numElmt,
int nquad0,
49 int nquad1,
int nmodes0,
int nmodes1,
56 int totpoints = nquad0 * nquad1;
57 int totmodes = nmodes0 * nmodes1;
59 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
61 if (colldir0 && colldir1)
63 Vmath::Vcopy(numElmt * totmodes, wsp.get(), 1, output.get(), 1);
70 for (
int i = 0; i < nquad0; ++i)
73 &wsp1[i * nquad1 * numElmt], 1);
78 Blas::Dgemm(
'T',
'N', nquad1 * numElmt, nmodes0, nquad0, 1.0,
79 &wsp[0], nquad0, base0.get(), nquad0, 0.0, &wsp1[0],
88 for (
int i = 0; i < nquad1; ++i)
91 &wsp[i * numElmt * nmodes0], 1);
97 Blas::Dgemm(
'T',
'N', numElmt * nmodes0, nmodes1, nquad1, 1.0,
98 &wsp1[0], nquad1, base1.get(), nquad1, 0.0, &wsp[0],
102 for (
int i = 0; i < totmodes; ++i)
112 for (
int i = 0; i < nquad1; ++i)
115 &output[i * numElmt * nmodes0], 1);
120 Blas::Dgemm(
'T',
'N', nmodes0, nmodes1, nquad1, 1.0, &wsp1[0],
121 nquad1, base1.get(), nquad1, 0.0, &output[0],
131 void TriIProduct(
bool sortTopVertex,
int numElmt,
int nquad0,
int nquad1,
132 int nmodes0,
int nmodes1,
141 int totpoints = nquad0 * nquad1;
143 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
147 Blas::Dgemm(
'T',
'N', nquad1 * numElmt, nmodes0, nquad0, 1.0, &wsp[0],
148 nquad0, base0.get(), nquad0, 0.0, &wsp1[0], nquad1 * numElmt);
152 for (mode = i = 0; i < nmodes0; ++i)
154 Blas::Dgemm(
'T',
'N', nmodes1 - i, numElmt, nquad1, 1.0,
155 base1.get() + mode * nquad1, nquad1,
156 wsp1.get() + i * nquad1 * numElmt, nquad1, 0.0,
157 &output[mode], totmodes);
165 Blas::Dgemv(
'T', nquad1, numElmt, 1.0, wsp1.get() + nquad1 * numElmt,
166 nquad1, base1.get() + nquad1, 1, 1.0, &output[1], totmodes);
173 void HexIProduct(
bool colldir0,
bool colldir1,
bool colldir2,
int numElmt,
174 int nquad0,
int nquad1,
int nquad2,
int nmodes0,
int nmodes1,
182 int totmodes = nmodes0 * nmodes1 * nmodes2;
183 int totpoints = nquad0 * nquad1 * nquad2;
185 if (colldir0 && colldir1 && colldir2)
188 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, output, 1);
192 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
199 if (numElmt < nmodes0 || 1)
204 for (
int n = 0; n < numElmt; ++n)
209 for (
int i = 0; i < nmodes0; ++i)
212 nquad0, wsp1.get() + nquad1 * nquad2 * i,
218 Blas::Dgemm(
'T',
'N', nquad1 * nquad2, nmodes0, nquad0, 1.0,
219 &wsp[n * totpoints], nquad0, base0.get(),
220 nquad0, 0.0, wsp1.get(), nquad1 * nquad2);
226 for (
int i = 0; i < nmodes1; ++i)
229 wsp2.get() + nquad2 * nmodes0 * i, 1);
234 Blas::Dgemm(
'T',
'N', nquad2 * nmodes0, nmodes1, nquad1,
235 1.0, wsp1.get(), nquad1, base1.get(), nquad1,
236 0.0, wsp2.get(), nquad2 * nmodes0);
242 for (
int i = 0; i < nmodes2; ++i)
245 nmodes0 * nmodes1, wsp2.get() + i, nquad2,
246 &output[n * totmodes] + nmodes0 * nmodes1 * i, 1);
251 Blas::Dgemm(
'T',
'N', nmodes0 * nmodes1, nmodes2, nquad2,
252 1.0, wsp2.get(), nquad2, base2.get(), nquad2,
253 0.0, &output[n * totmodes], nmodes0 * nmodes1);
260 wsp1 + numElmt * (max(totpoints, totmodes));
264 for (
int i = 0; i < nquad0; ++i)
266 Vmath::Vcopy(nquad1 * nquad2 * numElmt, &wsp[i], nquad0,
267 &wsp1[i * nquad1 * nquad2 * numElmt], 1);
273 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0,
274 nquad0, 1.0, &wsp[0], nquad0, base0.get(), nquad0,
275 0.0, &wsp1[0], nquad1 * nquad2 * numElmt);
280 for (
int i = 0; i < nquad1; ++i)
282 Vmath::Vcopy(nquad2 * numElmt * nmodes0, &wsp1[i], nquad1,
283 &wsp2[i * nquad2 * numElmt * nmodes0], 1);
288 Blas::Dgemm(
'T',
'N', nquad2 * numElmt * nmodes0, nmodes1,
289 nquad1, 1.0, &wsp1[0], nquad1, base1.get(), nquad1,
290 0.0, &wsp2[0], nquad2 * numElmt * nmodes0);
297 for (
int i = 0; i < nquad2; ++i)
300 &output[i * nmodes0 * nmodes1], 1);
305 Blas::Dgemm(
'T',
'N', numElmt * nmodes0 * nmodes1, nmodes2,
306 nquad2, 1.0, &wsp2[0], nquad2, base2.get(),
307 nquad2, 0.0, &wsp1[0],
308 numElmt * nmodes0 * nmodes1);
311 for (
int i = 0; i < totmodes; ++i)
313 Vmath::Vcopy(numElmt, &wsp1[i * numElmt], 1, &output[i],
321 for (
int i = 0; i < nquad2; ++i)
324 &output[i * nmodes0 * nmodes1], 1);
329 Blas::Dgemm(
'T',
'N', numElmt * nmodes0 * nmodes1, nmodes2,
330 nquad2, 1.0, &wsp2[0], nquad2, base2.get(),
331 nquad2, 0.0, &output[0],
332 numElmt * nmodes0 * nmodes1);
343 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
352 nmodes0, nmodes1, nmodes2);
353 int totpoints = nquad0 * nquad1 * nquad2;
357 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
360 wsp + numElmt * nquad2 * (max(nquad0 * nquad1, nmodes0 * nmodes1));
363 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
364 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
365 nquad1 * nquad2 * numElmt);
368 Blas::Dgemm(
'T',
'N', nquad2 * numElmt * nmodes0, nmodes1, nquad1, 1.0,
369 wsp1.get(), nquad1, base1.get(), nquad1, 0.0, wsp.get(),
370 nquad2 * numElmt * nmodes0);
374 mode = mode1 = cnt = 0;
375 for (
int i = 0; i < nmodes0; ++i)
377 cnt = i * nquad2 * numElmt;
378 for (
int j = 0; j < nmodes1; ++j)
380 Blas::Dgemm(
'T',
'N', nmodes2 - i, numElmt, nquad2, 1.0,
381 base2.get() + mode * nquad2, nquad2,
382 wsp.get() + j * nquad2 * numElmt * nmodes0 + cnt,
383 nquad2, 0.0, output.get() + mode1, totmodes);
384 mode1 += nmodes2 - i;
395 for (
int j = 0; j < nmodes1; ++j)
398 wsp.get() + j * nquad2 * numElmt * nmodes0 +
400 nquad2, base2.get() + nquad2, 1, 1.0,
401 &output[j * nmodes2 + 1], totmodes);
409 void PyrIProduct(
bool sortTopVertex,
int numElmt,
int nquad0,
int nquad1,
410 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
419 nmodes0, nmodes1, nmodes2);
420 int totpoints = nquad0 * nquad1 * nquad2;
425 numElmt * (nquad1 * nquad2 * nmodes0 +
426 nquad2 * max(nquad0 * nquad1, nmodes0 * nmodes1)),
427 "Insufficient workspace size");
429 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
432 wsp + numElmt * nquad2 * (max(nquad0 * nquad1, nmodes0 * nmodes1));
435 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
436 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
437 nquad1 * nquad2 * numElmt);
441 for (
int i = 0; i < nmodes0; ++i)
443 Blas::Dgemm(
'T',
'N', nquad2 * numElmt, nmodes1, nquad1, 1.0,
444 wsp1.get() + i * nquad1 * nquad2 * numElmt, nquad1,
445 base1.get(), nquad1, 0.0,
446 wsp.get() + mode * nquad2 * numElmt, nquad2 * numElmt);
451 mode = mode1 = cnt = 0;
452 for (
int i = 0; i < nmodes0; ++i)
454 for (
int j = 0; j < nmodes1; ++j, ++cnt)
456 int ijmax = max(i, j);
457 Blas::Dgemm(
'T',
'N', nmodes2 - ijmax, numElmt, nquad2, 1.0,
458 base2.get() + mode * nquad2, nquad2,
459 wsp.get() + cnt * nquad2 * numElmt, nquad2, 0.0,
460 output.get() + mode1, totmodes);
461 mode += nmodes2 - ijmax;
462 mode1 += nmodes2 - ijmax;
466 for (
int j = nmodes1; j < nmodes2; ++j)
468 int ijmax = max(i, j);
469 mode += nmodes2 - ijmax;
477 for (
int n = 0; n < numElmt; ++n)
480 output[1 + n * totmodes] +=
482 &wsp[nquad2 * numElmt + n * nquad2], 1);
485 output[1 + n * totmodes] +=
487 &wsp[nquad2 * nmodes1 * numElmt + n * nquad2], 1);
491 nquad2, base2.get() + nquad2, 1,
492 &wsp[nquad2 * (nmodes1 + 1) * numElmt + n * nquad2], 1);
500 void TetIProduct(
bool sortTopEdge,
int numElmt,
int nquad0,
int nquad1,
501 int nquad2,
int nmodes0,
int nmodes1,
int nmodes2,
510 nmodes0, nmodes1, nmodes2);
511 int totpoints = nquad0 * nquad1 * nquad2;
515 Vmath::Vmul(numElmt * totpoints, jac, 1, input, 1, wsp, 1);
520 (max(nquad0 * nquad1, nmodes0 * (2 * nmodes1 - nmodes0 + 1) / 2));
523 Blas::Dgemm(
'T',
'N', nquad1 * nquad2 * numElmt, nmodes0, nquad0, 1.0,
524 wsp.get(), nquad0, base0.get(), nquad0, 0.0, wsp1.get(),
525 nquad1 * nquad2 * numElmt);
529 for (
int i = 0; i < nmodes0; ++i)
531 Blas::Dgemm(
'T',
'N', nquad2 * numElmt, nmodes1 - i, nquad1, 1.0,
532 wsp1.get() + i * nquad1 * nquad2 * numElmt, nquad1,
533 base1.get() + mode * nquad1, nquad1, 0.0,
534 wsp.get() + mode * nquad2 * numElmt, nquad2 * numElmt);
544 for (
int n = 0; n < numElmt; ++n)
547 wsp1.get() + numElmt * nquad1 * nquad2 +
549 nquad1, base1.get() + nquad1, 1, 1.0,
550 wsp.get() + nquad2 * numElmt + n * nquad2, 1);
555 mode = mode1 = cnt = 0;
556 for (
int i = 0; i < nmodes0; ++i)
558 for (
int j = 0; j < nmodes1 - i; ++j, ++cnt)
560 Blas::Dgemm(
'T',
'N', nmodes2 - i - j, numElmt, nquad2, 1.0,
561 base2.get() + mode * nquad2, nquad2,
562 wsp.get() + cnt * nquad2 * numElmt, nquad2, 0.0,
563 output.get() + mode1, totmodes);
564 mode += nmodes2 - i - j;
565 mode1 += nmodes2 - i - j;
569 mode += (nmodes2 - nmodes1) * (nmodes2 - nmodes1 + 1) / 2;
576 for (
int n = 0; n < numElmt; ++n)
579 output[1 + n * totmodes] +=
581 &wsp[nquad2 * numElmt + n * nquad2], 1);
584 output[1 + n * totmodes] +=
586 &wsp[nquad2 * nmodes1 * numElmt + n * nquad2], 1);
#define ASSERTL1(condition, msg)
Assert Level 1 – Debugging which is used whether in FULLDEBUG or DEBUG compilation mode....
static void Dgemv(const char &trans, const int &m, const int &n, const double &alpha, const double *a, const int &lda, const double *x, const int &incx, const double &beta, double *y, const int &incy)
BLAS level 2: Matrix vector multiply y = A x where A[m x n].
static double Ddot(const int &n, const double *x, const int &incx, const double *y, const int &incy)
BLAS level 1: output = .
static void Dgemm(const char &transa, const char &transb, const int &m, const int &n, const int &k, const double &alpha, const double *a, const int &lda, const double *b, const int &ldb, const double &beta, double *c, const int &ldc)
BLAS level 3: Matrix-matrix multiply C = A x B where op(A)[m x k], op(B)[k x n], C[m x n] DGEMM perfo...
void TetIProduct(bool sortTopEdge, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void PyrIProduct(bool sortTopVert, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void TriIProduct(bool sortTopVertex, int numElmt, int nquad0, int nquad1, int nmodes0, int nmodes1, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void QuadIProduct(bool colldir0, bool colldir1, int numElmt, int nquad0, int nquad1, int nmodes0, int nmodes1, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void PrismIProduct(bool sortTopVert, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void HexIProduct(bool colldir0, bool colldir1, bool colldir2, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
int getNumberOfCoefficients(int Na)
The above copyright notice and this permission notice shall be included.
void Vmul(int n, const T *x, const int incx, const T *y, const int incy, T *z, const int incz)
Multiply vector z = x*y.
void Vcopy(int n, const T *x, const int incx, T *y, const int incy)