I have implemented the Jacobi algorithm based on the routine described in the book Numerical Recipes but since I plan to work with very large matrices I am trying to parallelize it using openmp.
void ROTATE(MatrixXd &a, int i, int j, int k, int l, double s, double tau)
{
double g,h;
g=a(i,j);
h=a(k,l);
a(i,j)=g-s*(h+g*tau);
a(k,l)=h+s*(g-h*tau);
}
void jacobi(int n, MatrixXd &a, MatrixXd &v, VectorXd &d )
{
int j,iq,ip,i;
double tresh,theta,tau,t,sm,s,h,g,c;
VectorXd b(n);
VectorXd z(n);
v.setIdentity();
z.setZero();
#pragma omp parallel for
for (ip=0;ip<n;ip++)
{
d(ip)=a(ip,ip);
b(ip)=d(ip);
}
for (i=0;i<50;i++)
{
sm=0.0;
for (ip=0;ip<n-1;ip++)
{
#pragma omp parallel for reduction (+:sm)
for (iq=ip+1;iq<n;iq++)
sm += fabs(a(ip,iq));
}
if (sm == 0.0) {
break;
}
if (i < 3)
tresh=0.2*sm/(n*n);
else
tresh=0.0;
#pragma omp parallel for private (ip,g,h,t,theta,c,s,tau)
for (ip=0;ip<n-1;ip++)
{
//#pragma omp parallel for private (g,h,t,theta,c,s,tau)
for (iq=ip+1;iq<n;iq++)
{
g=100.0*fabs(a(ip,iq));
if (i > 3 && (fabs(d(ip))+g) == fabs(d[ip]) && (fabs(d[iq])+g) == fabs(d[iq]))
a(ip,iq)=0.0;
else if (fabs(a(ip,iq)) > tresh)
{
h=d(iq)-d(ip);
if ((fabs(h)+g) == fabs(h))
{
t=(a(ip,iq))/h;
}
else
{
theta=0.5*h/(a(ip,iq));
t=1.0/(fabs(theta)+sqrt(1.0+theta*theta));
if (theta < 0.0)
{
t = -t;
}
c=1.0/sqrt(1+t*t);
s=t*c;
tau=s/(1.0+c);
h=t*a(ip,iq);
#pragma omp critical
{
z(ip)=z(ip)-h;
z(iq)=z(iq)+h;
d(ip)=d(ip)-h;
d(iq)=d(iq)+h;
a(ip,iq)=0.0;
for (j=0;j<ip;j++)
ROTATE(a,j,ip,j,iq,s,tau);
for (j=ip+1;j<iq;j++)
ROTATE(a,ip,j,j,iq,s,tau);
for (j=iq+1;j<n;j++)
ROTATE(a,ip,j,iq,j,s,tau);
for (j=0;j<n;j++)
ROTATE(v,j,ip,j,iq,s,tau);
}
}
}
}
}
}
}
I wanted to parallelize the loop that does most of the calculations and both comments inserted in the code:
//#pragma omp parallel for private (ip,g,h,t,theta,c,s,tau)
//#pragma omp parallel for private (g,h,t,theta,c,s,tau)
are my attempts at it. Unfortunately both of them end up producing incorrect results. I suspect the problem may be in this block:
z(ip)=z(ip)-h;
z(iq)=z(iq)+h;
d(ip)=d(ip)-h;
d(iq)=d(iq)+h;
because usually this sort of accumulation would need a reduction, but since each thread accesses a different part of the array, I am not certain of this.
I am not really sure if I am doing the parallelization in a correct manner because I have only recently started working with openmp, so any suggestion or recommendation would also be welcomed.
Sidenote: I know there are faster algorithms for eigenvalue and eigenvector determination including the SelfAdjointEigenSolver in Eigen, but those are not giving me the precision I need in the eigenvectors and this algorithm is.
My thanks in advance.
Edit: I considered to correct answer to be the one provided by The Quantum Physicist because what I did does not reduce the computation time for system of size up to 4096x4096. In any case I corrected the code in order to make it work and maybe for big enough systems it could be of some use. I would advise the use of timers to test if the
#pragma omp for
actually decrease the computation time.