49 int nquad0,
int nquad1,
50 int nmodes0,
int nmodes1,
58 int totpoints = nquad0*nquad1;
59 int totmodes = nmodes0*nmodes1;
63 if(colldir0 && colldir1)
65 Vmath::Vcopy(numElmt*totmodes,wsp.get(),1,output.get(),1);
72 for(
int i = 0; i < nquad0; ++i)
75 &wsp1[i*nquad1*numElmt],1);
80 Blas::Dgemm(
'T',
'N', nquad1*numElmt,nmodes0,nquad0,1.0,
81 &wsp[0],nquad0, base0.get(), nquad0,
82 0.0,&wsp1[0], nquad1*numElmt);
91 for(
int i = 0; i < nquad1; ++i)
94 &wsp[i*numElmt*nmodes0],1);
100 Blas::Dgemm(
'T',
'N', numElmt*nmodes0, nmodes1, nquad1,
101 1.0, &wsp1[0], nquad1, base1.get(), nquad1,
102 0.0, &wsp[0], numElmt*nmodes0);
105 for(
int i = 0; i < totmodes; ++i)
107 Vmath::Vcopy(numElmt,&wsp[i*numElmt],1,&output[i],totmodes);
114 for(
int i = 0; i < nquad1; ++i)
117 &output[i*numElmt*nmodes0],1);
123 1.0, &wsp1[0], nquad1, base1.get(), nquad1,
124 0.0, &output[0], nmodes0);
135 int nquad1,
int nmodes0,
int nmodes1,
145 int totpoints = nquad0*nquad1;
147 Vmath::Vmul(numElmt*totpoints,jac,1,input,1,wsp,1);
151 Blas::Dgemm(
'T',
'N', nquad1*numElmt,nmodes0,nquad0,1.0,&wsp[0],nquad0,
152 base0.get(), nquad0, 0.0, &wsp1[0], nquad1*numElmt);
156 for (mode=i=0; i < nmodes0; ++i)
159 1.0, base1.get()+mode*nquad1, nquad1,
160 wsp1.get() + i*nquad1*numElmt, nquad1, 0.0,
161 &output[mode], totmodes);
169 Blas::Dgemv(
'T', nquad1,numElmt,1.0,wsp1.get()+nquad1*numElmt,nquad1,
170 base1.get()+nquad1,1,1.0, &output[1],totmodes);
178 void HexIProduct(
bool colldir0,
bool colldir1,
bool colldir2,
int numElmt,
179 int nquad0,
int nquad1,
int nquad2,
180 int nmodes0,
int nmodes1,
int nmodes2,
189 int totmodes = nmodes0*nmodes1*nmodes2;
190 int totpoints = nquad0 *nquad1 *nquad2;
193 if(colldir0 && colldir1 && colldir2)
196 Vmath::Vmul(numElmt*totpoints,jac,1,input,1,output,1);
200 Vmath::Vmul(numElmt*totpoints,jac,1,input,1,wsp,1);
207 if(numElmt < nmodes0 || 1)
212 for(
int n = 0; n < numElmt; ++n)
217 for(
int i = 0; i < nmodes0; ++i)
219 Vmath::Vcopy(nquad1*nquad2,&wsp[n*totpoints] + i,nquad0,
220 wsp1.get()+nquad1*nquad2*i,1);
225 Blas::Dgemm(
'T',
'N', nquad1*nquad2, nmodes0, nquad0,
226 1.0, &wsp[n*totpoints], nquad0,
228 0.0, wsp1.get(), nquad1*nquad2);
235 for(
int i = 0; i < nmodes1; ++i)
238 wsp2.get()+nquad2*nmodes0*i,1);
243 Blas::Dgemm(
'T',
'N', nquad2*nmodes0, nmodes1, nquad1,
244 1.0, wsp1.get(), nquad1,
246 0.0, wsp2.get(), nquad2*nmodes0);
252 for(
int i = 0; i < nmodes2; ++i)
255 &output[n*totmodes]+nmodes0*nmodes1*i,1);
260 Blas::Dgemm(
'T',
'N', nmodes0*nmodes1, nmodes2, nquad2,
261 1.0, wsp2.get(), nquad2,
263 0.0, &output[n*totmodes], nmodes0*nmodes1);
274 for(
int i = 0; i < nquad0; ++i)
277 &wsp1[i*nquad1*nquad2*numElmt],1);
283 Blas::Dgemm(
'T',
'N', nquad1*nquad2*numElmt, nmodes0, nquad0,
284 1.0, &wsp[0], nquad0, base0.get(), nquad0,
285 0.0, &wsp1[0], nquad1*nquad2*numElmt);
290 for(
int i = 0; i < nquad1; ++i)
293 &wsp2[i*nquad2*numElmt*nmodes0],1);
298 Blas::Dgemm(
'T',
'N', nquad2*numElmt*nmodes0, nmodes1, nquad1,
299 1.0, &wsp1[0], nquad1, base1.get(), nquad1,
300 0.0, &wsp2[0], nquad2*numElmt*nmodes0);
308 for(
int i = 0; i < nquad2; ++i)
311 &output[i*nmodes0*nmodes1],1);
316 Blas::Dgemm(
'T',
'N', numElmt*nmodes0*nmodes1, nmodes2,
317 nquad2, 1.0, &wsp2[0], nquad2,
318 base2.get(), nquad2, 0.0,
319 &wsp1[0], numElmt*nmodes0*nmodes1);
322 for(
int i = 0; i < totmodes; ++i)
325 &output[i], totmodes);
333 for(
int i = 0; i < nquad2; ++i)
336 &output[i*nmodes0*nmodes1],1);
341 Blas::Dgemm(
'T',
'N', numElmt*nmodes0*nmodes1, nmodes2,
342 nquad2, 1.0, &wsp2[0], nquad2,
343 base2.get(), nquad2, 0.0,
344 &output[0], numElmt*nmodes0*nmodes1);
356 int nquad0,
int nquad1,
int nquad2,
357 int nmodes0,
int nmodes1,
int nmodes2,
367 nmodes0,nmodes1,nmodes2);
368 int totpoints = nquad0*nquad1*nquad2;
372 Vmath::Vmul(numElmt*totpoints,jac,1,input,1,wsp,1);
375 * (max(nquad0*nquad1,
379 Blas::Dgemm(
'T',
'N', nquad1*nquad2*numElmt, nmodes0, nquad0,
380 1.0, wsp.get(), nquad0, base0.get(),
381 nquad0, 0.0, wsp1.get(), nquad1*nquad2*numElmt);
385 Blas::Dgemm(
'T',
'N', nquad2*numElmt*nmodes0, nmodes1, nquad1,
386 1.0, wsp1.get(), nquad1, base1.get(),
387 nquad1, 0.0, wsp.get(), nquad2*numElmt*nmodes0);
392 mode = mode1 = cnt = 0;
393 for(
int i = 0; i < nmodes0; ++i)
395 cnt = i*nquad2*numElmt;
396 for(
int j = 0; j < nmodes1; ++j)
399 1.0, base2.get()+mode*nquad2, nquad2,
400 wsp.get()+j*nquad2*numElmt*nmodes0 + cnt, nquad2,
401 0.0, output.get()+mode1, totmodes);
413 for(
int j =0; j < nmodes1; ++j)
416 wsp.get()+j*nquad2*numElmt*nmodes0+nquad2*numElmt,
417 nquad2, base2.get()+nquad2,1,1.0,
418 &output[j*nmodes2+1], totmodes);
428 int nquad0,
int nquad1,
int nquad2,
429 int nmodes0,
int nmodes1,
int nmodes2,
439 nmodes0,nmodes1,nmodes2);
440 int totpoints = nquad0*nquad1*nquad2;
444 ASSERTL1(wsp.size() >= numElmt*(nquad1*nquad2*nmodes0 +
445 nquad2*max(nquad0*nquad1,nmodes0*nmodes1)),
446 "Insufficient workspace size");
448 Vmath::Vmul(numElmt*totpoints,jac,1,input,1,wsp,1);
451 * (max(nquad0*nquad1,
455 Blas::Dgemm(
'T',
'N', nquad1*nquad2*numElmt, nmodes0, nquad0,
456 1.0, wsp.get(), nquad0, base0.get(),
457 nquad0, 0.0, wsp1.get(), nquad1*nquad2*numElmt);
461 for(
int i=0; i < nmodes0; ++i)
463 Blas::Dgemm(
'T',
'N', nquad2*numElmt, nmodes1, nquad1,
464 1.0, wsp1.get()+ i*nquad1*nquad2*numElmt, nquad1,
466 0.0, wsp.get() + mode*nquad2*numElmt,nquad2*numElmt);
471 mode = mode1 = cnt = 0;
472 for(
int i = 0; i < nmodes0; ++i)
474 for(
int j = 0; j < nmodes1; ++j, ++cnt)
476 int ijmax = max(i,j);
477 Blas::Dgemm(
'T',
'N', nmodes2-ijmax, numElmt, nquad2,
478 1.0, base2.get()+mode*nquad2, nquad2,
479 wsp.get()+cnt*nquad2*numElmt, nquad2,
480 0.0, output.get()+mode1, totmodes);
481 mode += nmodes2-ijmax;
482 mode1 += nmodes2-ijmax;
486 for(
int j = nmodes1; j < nmodes2; ++j)
488 int ijmax = max(i,j);
489 mode += nmodes2-ijmax;
497 for(
int n = 0; n < numElmt; ++n)
501 base2.get()+nquad2,1,
502 &wsp[nquad2*numElmt + n*nquad2],1);
506 base2.get()+nquad2,1,
507 &wsp[nquad2*nmodes1*numElmt+n*nquad2],1);
511 base2.get()+nquad2,1,
512 &wsp[nquad2*(nmodes1+1)*numElmt+n*nquad2],1);
523 int nquad0,
int nquad1,
int nquad2,
524 int nmodes0,
int nmodes1,
int nmodes2,
534 nmodes0,nmodes1,nmodes2);
535 int totpoints = nquad0*nquad1*nquad2;
539 Vmath::Vmul(numElmt*totpoints,jac,1,input,1,wsp,1);
542 nquad2*numElmt*(max(nquad0*nquad1,nmodes0*(2*nmodes1-nmodes0+1)/2));
546 Blas::Dgemm(
'T',
'N', nquad1*nquad2*numElmt, nmodes0, nquad0,
547 1.0, wsp.get(), nquad0, base0.get(),
548 nquad0, 0.0, wsp1.get(), nquad1*nquad2*numElmt);
552 for(
int i=0; i < nmodes0; ++i)
554 Blas::Dgemm(
'T',
'N', nquad2*numElmt, nmodes1-i, nquad1,
555 1.0, wsp1.get()+ i*nquad1*nquad2*numElmt, nquad1,
556 base1.get() + mode*nquad1, nquad1,
557 0.0, wsp.get() + mode*nquad2*numElmt,nquad2*numElmt);
568 for(
int n = 0; n < numElmt; ++n)
571 1.0, wsp1.get()+numElmt*nquad1*nquad2 +
572 n*nquad1*nquad2, nquad1,
573 base1.get()+nquad1, 1, 1.0,
574 wsp.get()+nquad2*numElmt + n*nquad2, 1);
579 mode = mode1 = cnt = 0;
580 for(
int i = 0; i < nmodes0; ++i)
582 for(
int j = 0; j < nmodes1-i; ++j, ++cnt)
584 Blas::Dgemm(
'T',
'N', nmodes2-i-j, numElmt, nquad2,
585 1.0, base2.get()+mode*nquad2, nquad2,
586 wsp.get()+cnt*nquad2*numElmt, nquad2,
587 0.0, output.get()+mode1, totmodes);
589 mode1 += nmodes2-i-j;
593 mode += (nmodes2-nmodes1)*(nmodes2-nmodes1+1)/2;
600 for(
int n = 0; n < numElmt; ++n)
604 base2.get()+nquad2,1,
605 &wsp[nquad2*numElmt + n*nquad2],1);
609 base2.get()+nquad2,1,
610 &wsp[nquad2*nmodes1*numElmt+n*nquad2],1);
#define ASSERTL1(condition, msg)
Assert Level 1 – Debugging which is used whether in FULLDEBUG or DEBUG compilation mode....
static void Dgemv(const char &trans, const int &m, const int &n, const double &alpha, const double *a, const int &lda, const double *x, const int &incx, const double &beta, double *y, const int &incy)
BLAS level 2: Matrix vector multiply y = A x where A[m x n].
static double Ddot(const int &n, const double *x, const int &incx, const double *y, const int &incy)
BLAS level 1: output = .
static void Dgemm(const char &transa, const char &transb, const int &m, const int &n, const int &k, const double &alpha, const double *a, const int &lda, const double *b, const int &ldb, const double &beta, double *c, const int &ldc)
BLAS level 3: Matrix-matrix multiply C = A x B where op(A)[m x k], op(B)[k x n], C[m x n] DGEMM perfo...
void TetIProduct(bool sortTopEdge, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void PyrIProduct(bool sortTopVert, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void TriIProduct(bool sortTopVertex, int numElmt, int nquad0, int nquad1, int nmodes0, int nmodes1, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void QuadIProduct(bool colldir0, bool colldir1, int numElmt, int nquad0, int nquad1, int nmodes0, int nmodes1, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void PrismIProduct(bool sortTopVert, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
void HexIProduct(bool colldir0, bool colldir1, bool colldir2, int numElmt, int nquad0, int nquad1, int nquad2, int nmodes0, int nmodes1, int nmodes2, const Array< OneD, const NekDouble > &base0, const Array< OneD, const NekDouble > &base1, const Array< OneD, const NekDouble > &base2, const Array< OneD, const NekDouble > &jac, const Array< OneD, const NekDouble > &input, Array< OneD, NekDouble > &output, Array< OneD, NekDouble > &wsp)
int getNumberOfCoefficients(int Na)
The above copyright notice and this permission notice shall be included.
void Vmul(int n, const T *x, const int incx, const T *y, const int incy, T *z, const int incz)
Multiply vector z = x*y.
void Vcopy(int n, const T *x, const int incx, T *y, const int incy)