I have the following serial code that I would like to make parallel. I understand that when using the collapse clause on nested loops, there must be no code between the collapsed loops (the inner loop has to be the only statement inside the outer one). How, then, do I parallelize a nested for loop with if statements like this:
void makeFourierMesh2D(double Lx, double Ly, double KX[], double KY[], double ksqu[], double ninvksqu[], int FourierMeshType){
    // Make a variable to account for the way wavenumbers are set in FFTW3.
    int k_counter = 0;
    // kx corresponds to modes [0:n/2-1 , -n/2:-1]. This is why there is an additional step, just due to FFTW3's structuring.
    #pragma omp parallel for collapse(2)
    for(int i = 0; i < nx ; i++){
        for(int j = 0; j < nyk; j++){
            if (i < nx/2){ // i < nx/2 --> 0 to 127
                KX[j + nyk*i] = 2.*M_PI*i/Lx; // K = 2pi/L * [0:nx/2-1 , -n/2:-1]': a column vector with nx elements (0 to 127, then -128 to -1); 256/2 is 128
            }
            if( i >= nx/2){ // i >= nx/2 --> from -128 to -1
                KX[j + nyk*i] = 2.* M_PI * (-i + 2.*k_counter) / Lx;
            }
        }
        if( i >= nx/2){ // error here
            k_counter++;
        }
    }
}
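For reference, here is a small self-contained serial sketch of just the KX logic (nx = 8 and Lx = 1.0 are illustrative values, not from the original code), printing the wavenumber and the k_counter each row sees:

#include <cstdio>
#include <cmath>

int main(){
    const int nx = 8;        // illustrative size, stands in for the real grid dimension
    const double Lx = 1.0;   // illustrative domain length
    int k_counter = 0;
    for(int i = 0; i < nx; i++){
        double kx;
        if (i < nx/2){
            kx = 2.*M_PI*i/Lx;                    // modes 0 .. nx/2-1
        } else {
            kx = 2.*M_PI*(-i + 2.*k_counter)/Lx;  // here k_counter == i - nx/2, so kx == 2*M_PI*(i - nx)/Lx
        }
        std::printf("i=%d  k_counter=%d  kx=%g\n", i, k_counter, kx);
        if (i >= nx/2){
            k_counter++;                          // incremented once per i, only after row i is done
        }
    }
    return 0;
}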
The loop-carried dependency on k_counter is what blocks the collapse. It is incremented exactly once per i in the upper half, so its value for a given row is simply i - nx/2 and can be computed directly instead of accumulated. That removes the dependency and the work splits into two independent collapse(2) loops:
#pragma omp parallel for collapse(2)
for(int i = 0; i < nx/2 ; i++){          // positive modes: rows 0 .. nx/2-1
    for(int j = 0; j < nyk; j++){
        KX[j + nyk*i] = 2.*M_PI*i/Lx;
    }
}
#pragma omp parallel for collapse(2)
for(int i = nx/2; i < nx ; i++){         // negative modes: rows nx/2 .. nx-1
    for(int j = 0; j < nyk; j++){
        int k_counter = i - nx/2;        // what the serial loop accumulated
        KX[j + nyk*i] = 2.* M_PI * (-i + 2.*k_counter) / Lx;
    }
}
You can also flatten each half into a single loop over the array indices:
auto n = nx / 2 * nyk;
#pragma omp parallel for
for (int x = 0; x < n; x++){
    auto i = x / nyk;
    KX[x] = 2. * M_PI * i / Lx;
}
auto m = nx * nyk;                       // one past the last index of the second half
#pragma omp parallel for
for(int x = n; x < m; x++){
    auto i = x / nyk;
    int k_counter2 = (i - nx/2) * 2;     // == 2*k_counter from the serial version
    KX[x] = 2. * M_PI * (-i + k_counter2) / Lx;
}
Or, provided nx is even (hence the assert), handle both halves in one pass, with each iteration writing one element from each half:
assert(nx % 2 == 0);                         // needs <cassert>
auto n = nx / 2 * nyk;
#pragma omp parallel for
for (int x = 0; x < n; x++){
    auto i = x / nyk;
    KX[x] = 2. * M_PI * i / Lx;              // first half
    auto x2 = x + n;                         // matching index in the second half
    auto i2 = x2 / nyk;                      // i2 == i + nx/2
    auto k_counter2 = (i2 - nx / 2) * 2;     // == 2*k_counter for row i2
    KX[x2] = 2. * M_PI * (-i2 + k_counter2) / Lx;
}
Using a shallower loop is easier for the branch predictor. Actually this would be even better unless you have a crazy core count:
#pragma omp parallel for
for(int i = 0; i < nx/2 ; i++){
    auto x = 2. * M_PI * i / Lx;
    std::fill(KX + (nyk * i), KX + (nyk * (i + 1)), x);     // whole row i at once (std::fill is in <algorithm>)
    auto k_counter2 = i * 2;
    auto i2 = i + nx/2;
    auto y = 2.* M_PI * (-i2 + k_counter2) / Lx;
    std::fill(KX + (nyk * i2), KX + (nyk * (i2 + 1)), y);   // and the matching row in the second half
}
Having each thread fill some number of contiguous cells is much more cache-friendly than having each thread jump around in memory with no discernible pattern.
As for why I used the variable x in the flattened versions earlier: it was to avoid confusing the flattened index with the original row index i.
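To tie it together, here is a minimal self-contained check that the contiguous std::fill version produces the same KX as the original serial loop. The sizes nx = 8, nyk = 5 and Lx = 2*M_PI are hypothetical values chosen only for the test; in the real code nx and nyk are globals.

#include <algorithm>   // std::fill
#include <cmath>
#include <cstdio>
#include <vector>

int main(){
    const int nx = 8, nyk = 5;      // hypothetical grid sizes (globals in the real code)
    const double Lx = 2.*M_PI;      // hypothetical domain length
    std::vector<double> kx_serial(nx*nyk), kx_par(nx*nyk);

    // Original serial fill with its running k_counter.
    int k_counter = 0;
    for(int i = 0; i < nx; i++){
        for(int j = 0; j < nyk; j++){
            if (i < nx/2)  kx_serial[j + nyk*i] = 2.*M_PI*i/Lx;
            if (i >= nx/2) kx_serial[j + nyk*i] = 2.*M_PI*(-i + 2.*k_counter)/Lx;
        }
        if (i >= nx/2) k_counter++;
    }

    // Contiguous-fill version from above.
    double* KX = kx_par.data();
    #pragma omp parallel for
    for(int i = 0; i < nx/2; i++){
        auto x = 2.*M_PI*i/Lx;
        std::fill(KX + (nyk*i), KX + (nyk*(i + 1)), x);
        auto k_counter2 = i*2;
        auto i2 = i + nx/2;
        auto y = 2.*M_PI*(-i2 + k_counter2)/Lx;
        std::fill(KX + (nyk*i2), KX + (nyk*(i2 + 1)), y);
    }

    bool ok = true;
    for(int t = 0; t < nx*nyk; t++){
        ok = ok && std::fabs(kx_serial[t] - kx_par[t]) < 1e-12;
    }
    std::printf("match: %s\n", ok ? "yes" : "no");
    return ok ? 0 : 1;
}

Compile with -fopenmp to actually run the fill in parallel; without OpenMP the pragma is ignored and the loop runs serially, so the check still passes either way.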