24 #include "common/setmemory.h" 25 #include "wrapperMPI.h" 27 void zgemm_(
char *TRANSA,
char *TRANSB,
int *M,
int *N,
int *K,
double complex *ALPHA,
double complex *matJL,
int *LDA,
double complex *arrayz,
int *LDB,
double complex *BETA,
double complex *arrayx,
int *LDC);
38 double complex *tmp_v0 ,
39 double complex *tmp_v1 ,
40 double complex *tmp_v2 ,
41 double complex *tmp_v3
52 int M, N, K, LDA, LDB, LDC;
53 double complex ALPHA, BETA;
54 long unsigned int i_max;
55 long unsigned int j, k, ell, iloop;
56 long unsigned int i1, i2;
57 long unsigned int iomp;
58 long unsigned int ell4, ell5, ell6, m0, Ipart1;
59 long unsigned int mi, mj, mri, mrj, mrk, mrl;
61 long unsigned int ellrl, ellrk, ellrj, ellri, elli1, elli2, ellj1, ellj2;
62 long unsigned int iSS1, iSS2, iSSL1, iSSL2;
63 double complex **vecJ;
64 double complex **matJ, **matJ2;
65 double complex *matJL;
67 double complex **matB;
68 double complex *arrayz;
69 double complex *arrayx;
70 double complex *arrayw;
71 long unsigned int ishift1, ishift2, ishift3, ishift4, ishift5, pivot_flag, num_J_star;
72 long unsigned int pow4, pow5, pow41, pow51;
86 vecJ = cd_2d_allocate( 3, 3);
87 matJ = cd_2d_allocate(4, 4);
88 matJ2 = cd_2d_allocate(4, 4);
89 matB = cd_2d_allocate(2,2);
90 matJL = cd_1d_allocate(64*64);
91 matI = cd_1d_allocate(64*64);
95 for(iloop=0; iloop < X->
Boost.
R0; iloop++){
110 pow4 = (
int)pow(2.0,ishift1+ishift2+ishift3+ishift4);
111 pow5 = (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5);
115 pow41= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+1);
116 pow51= (int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+1);
118 for(k=0; k < (64*64); k++){
119 matJL[k] = 0.0 + 0.0*I;
120 matI[k] = 0.0 + 0.0*I;
122 for(k=0; k < 64; k++){
126 for(ell=0; ell < num_J_star; ell++){
134 for(i1 = 0; i1 < 3; i1++){
135 for(i2 = 0; i2 < 3; i2++){
140 matJ[0][0] = vecJ[2][2];
142 matJ[0][1] = vecJ[0][0]-vecJ[1][1]-I*vecJ[0][1]-I*vecJ[1][0];
144 matJ[0][2] = vecJ[2][0]-I*vecJ[2][1];
146 matJ[0][3] = vecJ[0][2]-I*vecJ[1][2];
148 matJ[1][0] = vecJ[0][0]-vecJ[1][1]+I*vecJ[0][1]+I*vecJ[1][0];
150 matJ[1][1] = vecJ[2][2];
152 matJ[1][2] =(-1.0)*vecJ[0][2]-I*vecJ[1][2];
154 matJ[1][3] =(-1.0)*vecJ[2][0]-I*vecJ[2][1];
156 matJ[2][0] = vecJ[2][0]+I*vecJ[2][1];
158 matJ[2][1] =(-1.0)*vecJ[0][2]+I*vecJ[1][2];
160 matJ[2][2] =(-1.0)*vecJ[2][2];
162 matJ[2][3] = vecJ[0][0]+vecJ[1][1]+I*vecJ[0][1]-I*vecJ[1][0];
164 matJ[3][0] = vecJ[0][2]+I*vecJ[1][2];
166 matJ[3][1] =(-1.0)*vecJ[2][0]+I*vecJ[2][1];
168 matJ[3][2] = vecJ[0][0]+vecJ[1][1]-I*vecJ[0][1]+I*vecJ[1][0];
170 matJ[3][3] =(-1.0)*vecJ[2][2];
172 matJ2[3][3] = matJ[0][0];
173 matJ2[3][0] = matJ[0][1];
174 matJ2[3][1] = matJ[0][2];
175 matJ2[3][2] = matJ[0][3];
176 matJ2[0][3] = matJ[1][0];
177 matJ2[0][0] = matJ[1][1];
178 matJ2[0][1] = matJ[1][2];
179 matJ2[0][2] = matJ[1][3];
180 matJ2[1][3] = matJ[2][0];
181 matJ2[1][0] = matJ[2][1];
182 matJ2[1][1] = matJ[2][2];
183 matJ2[1][2] = matJ[2][3];
184 matJ2[2][3] = matJ[3][0];
185 matJ2[2][0] = matJ[3][1];
186 matJ2[2][1] = matJ[3][2];
187 matJ2[2][2] = matJ[3][3];
189 for(ellri=0; ellri<2; ellri++){
190 for(ellrj=0; ellrj<2; ellrj++){
191 for(ellrk=0; ellrk<2; ellrk++){
192 for(ellrl=0; ellrl<2; ellrl++){
193 for(elli1=0; elli1<2; elli1++){
194 for(ellj1=0; ellj1<2; ellj1++){
195 for(elli2=0; elli2<2; elli2++){
196 for(ellj2=0; ellj2<2; ellj2++){
198 iSSL1 = elli1*(int)pow(2,mi) + ellj1*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl);
199 iSSL2 = elli2*(int)pow(2,mi) + ellj2*(int)pow(2,mj) + ellri*(int)pow(2,mri) + ellrj*(int)pow(2,mrj) + ellrk*(int)pow(2,mrk) + ellrl*(int)pow(2,mrl);
200 iSS1 = elli1 + 2*ellj1;
201 iSS2 = elli2 + 2*ellj2;
202 matJL[iSSL1+64*iSSL2] += matJ2[iSS1][iSS2];
223 for(ellri=0; ellri<2; ellri++){
224 for(ellrj=0; ellrj<2; ellrj++){
225 for(ellrk=0; ellrk<2; ellrk++){
226 for(ellrl=0; ellrl<2; ellrl++){
227 for(ellj1=0; ellj1<2; ellj1++){
228 for(elli1=0; elli1<2; elli1++){
229 for(elli2=0; elli2<2; elli2++){
231 iSSL1 = elli1*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6));
232 iSSL2 = elli2*(int)pow(2,ellj2) + ellj1*(int)pow(2,((ellj2+1)%6)) + ellri*(int)pow(2,((ellj2+2)%6)) + ellrj*(int)pow(2,((ellj2+3)%6)) + ellrk*(int)pow(2,((ellj2+4)%6)) + ellrl*(int)pow(2,((ellj2+5)%6));
233 matJL[iSSL1+64*iSSL2] += matB[elli1][elli2];
245 iomp=i_max/(int)pow(2.0,ishift1+ishift2+ishift3+ishift4+ishift5+2);
247 #pragma omp parallel default(none) private(arrayx,arrayz,arrayw,ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \ 248 shared(matJL,matI,iomp,i_max,myrank,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,tmp_v0,tmp_v1,tmp_v3) 251 arrayx = cd_1d_allocate(64*((
int)pow(2.0,ishift4+ishift5-1)));
252 arrayz = cd_1d_allocate(64*((
int)pow(2.0,ishift4+ishift5-1)));
253 arrayw = cd_1d_allocate(64*((
int)pow(2.0,ishift4+ishift5-1)));
256 for(ell6 = 0; ell6 < iomp; ell6++){
258 for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){
259 for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){
260 for(m0 = 0; m0 < 16; m0++){
261 arrayz[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
262 arrayz[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
263 arrayz[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
264 arrayz[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
265 tmp_v3[(1 + m0+16*ell4 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
266 tmp_v3[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
267 tmp_v3[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
268 tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)]=tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
269 arrayx[(0 + m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4 +pow41*ell5+Ipart1)];
270 arrayx[(16+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)];
271 arrayx[(32+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)];
272 arrayx[(48+ m0 +64*(ell4+ell5*(int)pow(2.0,ishift4-1)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)];
278 for(ell5 = 0; ell5 < (int)pow(2.0, ishift5-1); ell5++){
279 for(ell4 = 0; ell4 < (int)pow(2.0, ishift4-1); ell4++){
280 for(m0 = 0; m0 < 16; m0++){
281 arrayz[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
282 arrayz[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
283 arrayz[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
284 arrayz[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
285 tmp_v3[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
286 tmp_v3[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
287 tmp_v3[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
288 tmp_v3[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
289 arrayx[(0 + m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)];
290 arrayx[(16+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)];
291 arrayx[(32+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)];
292 arrayx[(48+ m0+64*(ell4+ell5*(int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))] = tmp_v0[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)];
301 N = (int)pow(2.0, ishift4+ishift5-1);
309 zgemm_(&TRANSA,&TRANSB,&M,&N,&K,&ALPHA,matJL,&LDA,arrayz,&LDB,&BETA,arrayx,&LDC);
329 for(ell5 = 0; ell5 < (int)pow(2.0,ishift5-1); ell5++){
330 for(ell4 = 0; ell4 < (int)pow(2.0,ishift4-1); ell4++){
331 for(m0 = 0; m0 < 16; m0++){
332 tmp_v1[(1 + m0+16*ell4 +pow41*ell5+Ipart1)] = arrayx[(0 + m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
333 tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+Ipart1)] = arrayx[(16+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
334 tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+Ipart1)] = arrayx[(32+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
335 tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+Ipart1)] = arrayx[(48+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)))];
339 for(ell5 = 0; ell5 < (int)pow(2.0,ishift5-1); ell5++){
340 for(ell4 = 0; ell4 < (int)pow(2.0,ishift4-1); ell4++){
341 for(m0 = 0; m0 < 16; m0++){
342 tmp_v1[(1 + m0+16*ell4 +pow41*ell5+pow51+Ipart1)] = arrayx[(0 + m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
343 tmp_v1[(1 + m0+16*ell4+pow4 +pow41*ell5+pow51+Ipart1)] = arrayx[(16+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
344 tmp_v1[(1 + m0+16*ell4+pow5 +pow41*ell5+pow51+Ipart1)] = arrayx[(32+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
345 tmp_v1[(1 + m0+16*ell4+pow4+pow5+pow41*ell5+pow51+Ipart1)] = arrayx[(48+ m0+64*(ell4+ell5*(
int)pow(2.0,ishift4-1)+(int)pow(2.0,ishift4+ishift5-2)))];
351 free_cd_1d_allocate(arrayz);
352 free_cd_1d_allocate(arrayx);
353 free_cd_1d_allocate(arrayw);
358 #pragma omp parallel for default(none) private(ell4,ell5,ell6,m0,Ipart1,TRANSA,TRANSB,M,N,K,LDA,LDB,LDC,ALPHA,BETA) \ 359 firstprivate(iomp) shared(i_max,ishift1,ishift2,ishift3,ishift4,ishift5,pow4,pow5,pow41,pow51,X,tmp_v0,tmp_v1) 360 for(ell5 = 0; ell5 < iomp; ell5++ ){
366 #pragma omp parallel for default(none) private(ell4,ell5) \ 367 firstprivate(iomp) shared(i_max,X,tmp_v1,tmp_v3) 368 for(ell5 = 0; ell5 < iomp; ell5++ ){
375 #pragma omp parallel for default(none) private(ell4) \ 376 shared(i_max,tmp_v0,tmp_v1,tmp_v3) 377 for(ell4 = 0; ell4 < i_max; ell4++ ){
378 tmp_v0[1 + ell4] = tmp_v1[1 + ell4];
379 tmp_v1[1 + ell4] = tmp_v3[1 + ell4];
389 MPI_Alltoall(&tmp_v1[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,&tmp_v3[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
390 MPI_Alltoall(&tmp_v0[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,&tmp_v2[1],(
int)(i_max/
nproc),MPI_DOUBLE_COMPLEX,MPI_COMM_WORLD);
394 #pragma omp parallel for default(none) private(ell4,ell5,ell6) \ 395 firstprivate(iomp) shared(i_max,X,nproc,tmp_v0,tmp_v1,tmp_v2,tmp_v3) 397 for(ell4 = 0; ell4 < iomp; ell4++ ){
398 for(ell5 = 0; ell5 <
nproc; ell5++ ){
399 for(ell6 = 0; ell6 < (int)(i_max/(
int)pow(2.0,X->
Boost.
W0)); ell6++ ){
400 tmp_v1[(1 + ell6+ell5*i_max/(int)pow(2.0,X->
Boost.
W0)+ell4*i_max/((int)pow(2.0,X->
Boost.
W0)/
nproc))] = tmp_v3[(1 + ell6+ell4*i_max/(int)pow(2.0,X->
Boost.
W0)+ell5*i_max/
nproc)];
401 tmp_v0[(1 + ell6+ell5*i_max/(int)pow(2.0,X->
Boost.
W0)+ell4*i_max/((int)pow(2.0,X->
Boost.
W0)/
nproc))] = tmp_v2[(1 + ell6+ell4*i_max/(int)pow(2.0,X->
Boost.
W0)+ell5*i_max/
nproc)];
420 free_cd_2d_allocate(vecJ);
421 free_cd_2d_allocate(matJ);
422 free_cd_2d_allocate(matJ2);
423 free_cd_2d_allocate(matB);
424 free_cd_1d_allocate(matJL);
425 free_cd_1d_allocate(matI);
unsigned long int idim_max
The dimension of the Hilbert space of this process.
long unsigned int num_pivot
void child_general_int_spin_MPIBoost(struct BindStruct *X, double complex *tmp_v0, double complex *tmp_v1, double complex *tmp_v2, double complex *tmp_v3)
int nproc
Number of processors, defined in InitializeMPI()
struct BoostList Boost
For Boost.
double complex *** arrayJ
void zgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double complex *ALPHA, double complex *matJL, int *LDA, double complex *arrayz, int *LDB, double complex *BETA, double complex *arrayx, int *LDC)
long unsigned int ishift_nspin
struct CheckList Check
Size of the Hilbert space.