Skip to content

Commit

Permalink
enable coalesced access in fusedFDM (#119)
Browse files: browse the repository at this point in the history
  • Loading branch information
pengwang234 authored Jul 24, 2020
1 parent a567894 commit 256cdca
Showing 1 changed file with 12 additions and 17 deletions.
29 changes: 12 additions & 17 deletions src/libP/solvers/elliptic/okl/ellipticSchwarzSolverHex3D.okl
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,9 @@
@shared pfloat S_z_eT[p_Nq_e][p_Nq_e];
@shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e];
@shared pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e];
for(int k = 0; k < p_Nq_e; ++k; @inner) {
for(int k = 0; k < p_Nq_e; ++k) {
for(int j = 0; j < p_Nq_e; ++j; @inner) {
#pragma unroll
for(int i = 0; i < p_Nq_e; ++i) {
for(int i = 0; i < p_Nq_e; ++i; @inner) {
const dlong elem_offset = e * p_Nq_e * p_Nq_e * p_Nq_e;
const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset;
work1[k][j][i] = u[idx];
Expand Down Expand Up @@ -268,10 +267,9 @@
}
}
@barrier("local");
for (int k = 0; k < p_Nq_e; k++; @inner) {
for (int k = 0; k < p_Nq_e; k++) {
for (int j = 0; j < p_Nq_e; j++; @inner) {
#pragma unroll
for (int i = 0; i < p_Nq_e; i++) {
for (int i = 0; i < p_Nq_e; i++; @inner) {
const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e;
pfloat value = 0.0;
#pragma unroll
Expand Down Expand Up @@ -308,10 +306,9 @@
}
}
@barrier("local");
for (int k = 0; k < p_Nq_e; k++; @inner) {
for (int k = 0; k < p_Nq_e; k++) {
for (int j = 0; j < p_Nq_e; j++; @inner) {
#pragma unroll
for (int i = 0; i < p_Nq_e; i++) {
for (int i = 0; i < p_Nq_e; i++; @inner) {
pfloat value = 0.0;
#pragma unroll
for (int l = 0; l < p_Nq_e; l++)
Expand Down Expand Up @@ -355,10 +352,9 @@
work2[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l2 - 1];
}
@barrier("local");
for(int k = 0; k < p_Nq_e; ++k; @inner) {
for(int k = 0; k < p_Nq_e; ++k) {
for(int j = 0; j < p_Nq_e; ++j; @inner) {
#pragma unroll
for(int i = 0; i < p_Nq_e; ++i) {
for(int i = 0; i < p_Nq_e; ++i; @inner) {
const dlong elem_offset = e * p_Nq_e * p_Nq_e * p_Nq_e;
const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset;
u[idx] = work2[k][j][i];
Expand All @@ -367,11 +363,10 @@
}
#else /* if (!p_restrict) */
@barrier("local");
for(int k = 0; k < p_Nq_e; ++k; @inner) {
for(int k = 0; k < p_Nq_e; ++k) {
for(int j = 0; j < p_Nq_e; ++j; @inner) {
if(k < p_Nq && j < p_Nq) {
#pragma unroll
for(int i = 0; i < p_Nq; ++i) {
for(int i = 0; i < p_Nq; ++i; @inner) {
if(k < p_Nq && j < p_Nq) {
const dlong elem_offset = e * p_Nq * p_Nq * p_Nq;
const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset;
Su[idx] = work1[k + 1][j + 1][i + 1];
Expand All @@ -381,4 +376,4 @@
}
#endif
}
}
}

0 comments on commit 256cdca

Please sign in to comment.