Hi
I have a small program in which I compute several arrays such as device_J1 and device_J2. Later I copy all of these values into a single array using a kernel.
This is the sequence of my kernel launches:
// Zero the four Jacobian sub-blocks before they are filled.
// Sizes: J1 is (i_bus-1) x (i_bus-1), J2 is (i_bus-1) x npq,
//        J3 is npq x (i_bus-1),       J4 is npq x npq.
gpuErrchk(cudaMemset(device_J1,0,(i_bus-1)*(i_bus-1)*sizeof(double)));
gpuErrchk(cudaMemset(device_J2,0,npq*(i_bus-1)*sizeof(double)));
gpuErrchk(cudaMemset(device_J3,0,(i_bus-1)*npq*sizeof(double)));
gpuErrchk(cudaMemset(device_J4,0,npq*npq*sizeof(double)));

// Fill each sub-block. ComputeJacobienne1 indexes with both the .x and .y
// thread/block components, so Block_Jaco / threadsperBlocksJack must describe
// a 2D launch that covers every (row, column) pair.
// NOTE(review): the definitions of Block_Jaco, block_jaco4 and
// threadsperBlocksJack are not shown here - confirm they are dim3 values
// large enough in both dimensions.
ComputeJacobienne1<<<Block_Jaco,threadsperBlocksJack>>>(device_J1, device_del, device_G, device_B,device_V, i_bus);
gpuErrchk(cudaGetLastError());
ComputeJacobienne2<<<Block_Jaco,threadsperBlocksJack>>>(device_J2, device_del, device_G, device_B,device_V, i_bus, npq,device_Pq);
gpuErrchk(cudaGetLastError());
ComputeJacobienne3<<<Block_Jaco,threadsperBlocksJack>>>(device_J3, device_del, device_G, device_B,device_V, i_bus, npq,device_Pq);
gpuErrchk(cudaGetLastError());
ComputeJacobienne4<<<block_jaco4,threadsperBlocksJack>>>(device_J4, device_del, device_G, device_B,device_V, i_bus,npq,device_Pq);
gpuErrchk(cudaGetLastError());

// Wait for the four kernels above; any fault raised while they executed is
// reported here rather than at some later, unrelated call.
gpuErrchk(cudaDeviceSynchronize());

// Assemble the full matrix: one thread per row, 1024 threads per block.
// 1+((i_bus-1)/1024) always launches at least one block; surplus threads are
// rejected by the bounds checks inside ComputeJ.
ComputeJ<<<1+((i_bus-1)/1024),1024>>>(device_J, device_J1, device_J2, device_J3, device_J4, i_bus, npq);
// Check the launch like every other kernel above; without this an invalid
// launch configuration for ComputeJ would pass silently.
gpuErrchk(cudaGetLastError());
And this is an example of one of the ComputeJacobienne kernels:
// Fills J1, a row-major (i_bus-1) x (i_bus-1) matrix, one element per thread.
//
// Launch layout: 2D. The global x index selects the row of J1 and the global
// y index selects the column; threads outside the matrix do nothing.
//
// Parameters:
//   J1    - output, (i_bus-1)*(i_bus-1) doubles.
//   del   - i_bus values; only differences del[m]-del[k] are used.
//   G, B  - row-major i_bus x i_bus matrices.
//   V     - i_bus values.
//   i_bus - dimension of G/B and length of V/del.
//
// Row/column r of J1 corresponds to index r+1 of V/del/G/B, i.e. index 0 is
// skipped (presumably the slack/reference bus - confirm against the caller).
__global__ void ComputeJacobienne1(double* J1,double* del,double* G, double* B,double* V, int i_bus ){
    int x = blockDim.x*blockIdx.x+threadIdx.x;
    int y = blockDim.y*blockIdx.y+threadIdx.y;
    int m = x+1;
    int n = y+1;

    // Guard: the grid rarely matches the matrix size exactly.
    if (m >= i_bus || n >= i_bus) {
        return;
    }

    const int out = x*(i_bus-1)+y;

    if (n == m) {
        // Diagonal element: sum over every index k (including k == m), then
        // subtract V[m]^2 * B[m][m]. Accumulate in a register and store once
        // instead of doing a read-modify-write on global memory per term;
        // this also removes the dependence on J1 having been zeroed first.
        double acc = 0.0;
        for (int k = 0; k < i_bus; k++) {
            const double diff = del[m]-del[k];
            acc += V[m]*V[k]*(-G[m*i_bus+k]*sin(diff)+B[m*i_bus+k]*cos(diff));
        }
        acc += -V[m]*V[m]*B[m*i_bus+m];
        J1[out] = acc;
    } else {
        // Off-diagonal element. Both trigonometric terms take the angle
        // DIFFERENCE del[m]-del[n]; the previous code passed the product
        // del[m]*del[n] to sin(), which produced wrong values.
        const double diff = del[m]-del[n];
        J1[out] = V[m]*V[n]*(G[m*i_bus+n]*sin(diff)-B[m*i_bus+n]*cos(diff));
    }
}
And the ComputeJ kernel:
// Assembles the four sub-blocks into one row-major matrix J with
// (i_bus-1+npq) columns:
//
//        [ J1 | J2 ]   J1: (i_bus-1) x (i_bus-1)   J2: (i_bus-1) x npq
//    J = [----+----]
//        [ J3 | J4 ]   J3: npq x (i_bus-1)         J4: npq x npq
//
// Launch layout: 1D, one thread per row index. A thread whose index is below
// i_bus-1 copies one row of the top half; a thread whose index is below npq
// also copies one row of the bottom half.
//
// The right-hand blocks (J2, J4) are copied inside the loop over the
// i_bus-1 left-hand columns, so only their first min(npq, i_bus-1) columns
// are written.
__global__ void ComputeJ(double* J,double* J1, double* J2, double* J3, double* J4, int i_bus, int npq){
    const int row    = blockDim.x*blockIdx.x+threadIdx.x;
    const int nLeft  = i_bus-1;        // column count of J1 and J3
    const int stride = nLeft+npq;      // row length of J

    // Top half: [ J1 | J2 ]
    if (row < nLeft) {
        double* dst        = J  + row*stride;
        double* leftBlock  = J1 + row*nLeft;
        double* rightBlock = J2 + row*npq;
        for (int col = 0; col < nLeft; ++col) {
            dst[col] = leftBlock[col];
            if (col < npq) {
                dst[col+nLeft] = rightBlock[col];
            }
        }
    }

    // Bottom half: [ J3 | J4 ], placed nLeft rows further down in J.
    if (row < npq) {
        double* dst        = J  + (row+nLeft)*stride;
        double* leftBlock  = J3 + row*nLeft;
        double* rightBlock = J4 + row*npq;
        for (int col = 0; col < nLeft; ++col) {
            dst[col] = leftBlock[col];
            if (col < npq) {
                dst[col+nLeft] = rightBlock[col];
            }
        }
    }
}
My main issue is that I get the correct values for device_J1, device_J2, etc. (I’ve checked them), but when I use Nsight to inspect the values of J1 inside ComputeJ, the values are completely different… and I have no clue why.
I’ve considered a possible synchronization issue, but even with a cudaDeviceSynchronize after each kernel, it doesn’t work.
It’s complicated for me to provide a complete working example, but I can if I must. So if you have any ideas, you’re welcome to share them.
Thanks