CPU function and GPU kernel overlap

There was a problem when I tried to improve my program in this way:
cudaMemcpyAsync(H2D);
kernel<<<>>>();
cudaMemcpyAsync(D2H);
cpufunction();
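Spelled out, what I am doing looks roughly like this (a minimal sketch, assuming pinned host buffers and a non-default stream; the names h_in, h_out, d_buf, n, grid and block are placeholders, not my real variable names):

cudaStream_t stream;
cudaStreamCreate(&stream);

float *h_in, *h_out, *d_buf;
cudaMallocHost((void**)&h_in,  n * sizeof(float));   // pinned host memory is needed
cudaMallocHost((void**)&h_out, n * sizeof(float));   // for the copies to be truly async
cudaMalloc((void**)&d_buf, n * sizeof(float));

cudaMemcpyAsync(d_buf, h_in, n * sizeof(float), cudaMemcpyHostToDevice, stream);
kernel<<<grid, block, 0, stream>>>(d_buf, n);
cudaMemcpyAsync(h_out, d_buf, n * sizeof(float), cudaMemcpyDeviceToHost, stream);

cpufunction();                   // host work, intended to overlap the GPU stream
cudaStreamSynchronize(stream);   // wait for the GPU work before using h_out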

When I use nvvp to analyze the program’s execution on Windows, I find that the CPU function only overlaps the first H2D data transfer, and the kernel is launched only after the CPU function has completed.

In contrast, the run on Linux is fine: the CPU function overlaps the time consumed by the H2D transfer, the kernel, and the D2H transfer.

I want to know what causes this difference and how to solve it on Windows.

What are the GPU models in both cases? Earlier/low-end GPUs have only one copy engine, so D2H and H2D copies can’t be overlapped.

A Titan Black, with one copy engine. The kernel should still be overlapped, because it works on Linux.

Windows WDDM command batching.
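On WDDM the driver batches up GPU commands and may not submit the kernel launch to the hardware right away, which is why the launch appears to wait for the CPU function in your profile. A commonly used workaround is to nudge the driver to flush its queue after issuing the work. A rough sketch, not taken from your code (stream, d_buf, h_in, h_out, bytes, grid and block are placeholders):

cudaMemcpyAsync(d_buf, h_in, bytes, cudaMemcpyHostToDevice, stream);
kernel<<<grid, block, 0, stream>>>(d_buf);
cudaMemcpyAsync(h_out, d_buf, bytes, cudaMemcpyDeviceToHost, stream);

// On WDDM, querying the stream prompts the driver to flush its command batch,
// so the GPU actually starts working before cpufunction() begins.
cudaStreamQuery(stream);

cpufunction();
cudaStreamSynchronize(stream);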

I’m sorry.
What is command batching?

Did you try googling “wddm command batching” and taking the first hit and reading it?

for me, it will lead to recursion :))))

I’m sorry for asking without searching first.
Please forgive me; I knew nothing about it at first glance.
I have googled it and understand it now. Thank you very much.

Another question: when the data transfer and the kernel are overlapped, the data transfer becomes slower. Why? Is there any method to increase the data transfer speed?

The data transfer requires a certain amount of memory bandwidth. If the kernel is also using up memory bandwidth, this can slow the data transfer down (and slow the kernel down) as they compete for available bandwidth.

PCIe can normally use 6-12 GB/s of bandwidth during a single-direction data transfer. If you have a Titan X with ~300 GB/s of memory bandwidth, this probably won’t have a noticeable effect. If you have a tiny little GPU with ~30 GB/s of memory bandwidth, then the PCIe usage could be a noticeable percentage of what is available.
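If you want to see what rate you are actually getting, you can time a transfer with CUDA events. A minimal sketch (the 256 MB buffer size is arbitrary; pinned host memory is used because it generally gives the highest PCIe rates):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const size_t bytes = 256 << 20;              // arbitrary 256 MB test buffer
    float *h_buf, *d_buf;
    cudaMallocHost((void**)&h_buf, bytes);       // pinned host memory
    cudaMalloc((void**)&d_buf, bytes);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    cudaMemcpy(d_buf, h_buf, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("H2D: %.2f GB/s\n", (bytes / 1.0e9) / (ms / 1.0e3));

    cudaFree(d_buf);
    cudaFreeHost(h_buf);
    return 0;
}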

Thank you. I’m sorry that I didn’t make my question clear.
My question is actually this: when the cpu_function overlaps the data transfer and the kernel on the GPU, the data transfer between CPU and GPU becomes slow. Is that because my CPU function competes for PCIe bandwidth?

Hello, I am new to CUDA programming. I am writing a program for the Bessel function in CUDA, but when I compile and execute it, it gives me a segmentation fault every time. I am unable to debug it. Please help!

Here is my code (main file):

#include <stdio.h>
#include <bits/stdc++.h>
#include <cuComplex.h>
#include <thrust/complex.h>
#include "bessel.h"

const int N = 5; 
const int blocksize = 5; 


__global__ void add(cuDoubleComplex *a) 
{
	//a[threadIdx.x] =cuCadd(a[threadIdx.x], a[threadIdx.x]);
	cuDoubleComplex c = a[threadIdx.x];
	cuDoubleComplex b = bessel(c);
	a[threadIdx.x]=b;
	printf(" The value is %lf + %lfi \n",cuCreal(b),cuCimag(b)); 
	
}

int main()
{
	double a[N] = {97.24, 80.60, 54.13, -60.05, 99.96};
	double b[N] = {-67.42, -10.54, -5.83, 85.21, -98.33};
	cuDoubleComplex *c,*d;
	for(int i=0;i<N;i++)
	{
	c[i]=make_cuDoubleComplex(a[i],b[i]);
	}
	printf("Working fine");	

	cudaMalloc( (void**)&d, N * sizeof(cuDoubleComplex) ); 
	 
	cudaMemcpy( d, c, N * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice ); 
	 
	printf("\nhere too");
	dim3 dimBlock( blocksize, 1 );
	dim3 dimGrid( 1, 1 );
	add<<<dimGrid, dimBlock>>>(d);
	cudaMemcpy( c, d, N * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost ); 
	printf("here also\n");
	cudaFree( d );

	return EXIT_SUCCESS;
}

Here is my other file (the Bessel function):

#include <cuda.h>
#include <cuComplex.h>
#include <thrust/complex.h>
#include <cuda_runtime.h>
#include <bits/stdc++.h>
#include <math.h>

using namespace std;

__device__ cuDoubleComplex cuClog(cuDoubleComplex a)
{

cuDoubleComplex b;
double x = cuCreal(a);
double y = cuCimag(a);
double r = sqrt((pow(x,2)+pow(y,2)));
double theta = atan(y/x);

b=make_cuDoubleComplex(log(r),theta);

return b; 
}

__device__ cuDoubleComplex cuCexp(cuDoubleComplex x)
{
  double factor = exp(x.x);
  return make_cuDoubleComplex(factor * cos(x.y), factor * sin(x.y));
}

__device__ cuDoubleComplex cuCsqrt(cuDoubleComplex x)
{
  double radius = cuCabs(x);
  double cosA = x.x / radius;
  cuDoubleComplex out;
  out.x = sqrt(radius * (cosA + 1.0) / 2.0);
  out.y = sqrt(radius * (1.0 - cosA) / 2.0);
  // signbit should be false if x.y is negative
  if (signbit(x.y))
    out.y *= -1.0;

  return out;
}
 
__device__ cuDoubleComplex cuCpow(const cuDoubleComplex& z, const int &n) {
  return make_cuDoubleComplex(pow((double)cuCabs(z), n)*cos(n*(double)atan2(cuCimag(z),cuCreal(z))), (pow(cuCabs(z), n)*sin(n*(double)atan2(cuCimag(z),cuCreal(z)))));
}

__device__ cuDoubleComplex cumul(cuDoubleComplex a,float b)
{
cuDoubleComplex c;
c.x=cuCreal(a)*(double(b));
c.y=cuCimag(a)*(double(b));
 return c;
}

__device__ cuDoubleComplex cudiv(cuDoubleComplex a,float b)
{
cuDoubleComplex d;
d.x=cuCreal(a)/(double(b));
d.x=cuCimag(a)/(double(b));
return d;
}


__device__ cuDoubleComplex bessel(cuDoubleComplex z)
  {

float a[12] = {0.125e+00, 7.03125e-02, 7.32421875e-02,1.1215209960938e-01, 2.2710800170898e-01, 5.7250142097473e-01,
    1.7277275025845e+00, 6.0740420012735e+00,
    2.4380529699556e+01, 1.1001714026925e+02,
    5.5133589612202e+02, 3.0380905109224e+03};
  float a0;
  float a1[10] = {0.125e+00, 0.2109375e+00,
    1.0986328125e+00,     1.1775970458984e+01,
    2.1461706161499e+002, 5.9511522710323e+03,
    2.3347645606175e+05,  1.2312234987631e+07,
    8.401390346421e+08,   7.2031420482627e+10}; 
  float b[12] = {
   -0.375e+00,           -1.171875e-01,
   -1.025390625e-01,     -1.4419555664063e-01,
   -2.7757644653320e-01, -6.7659258842468e-01,
   -1.9935317337513e+00, -6.8839142681099e+00,
   -2.7248827311269e+01, -1.2159789187654e+02,
   -6.0384407670507e+02, -3.3022722944809e+03 };
  cuDoubleComplex ca, cb, cbi0, cbi1, cbk0, cbk1, cdi0, cdi1, cdk0, cdk1, ci, cr, cs, ct, cw;
  int k0;
  float pi,w0;
  cuDoubleComplex z1,z2,zr,zr2;

  pi = 3.141592653589793e+00;
  ci = make_cuDoubleComplex(0.0e+00, 1.0e+00);
  a0 = cuCabs(z);
  z2 = cuCmul(z, z);
  z1 = z;

  if ( a0 == 0.0e+00 ){
    cbi0 = make_cuDoubleComplex(1.0e+00, 0.0e+00);
    cbi1 = make_cuDoubleComplex( 0.0e+00, 0.0e+00);
    cdi0 = make_cuDoubleComplex( 0.0e+00, 0.0e+00);
    cdi1 = make_cuDoubleComplex( 0.5e+00, 0.0e+00);
    cbk0 = make_cuDoubleComplex( 1.0e+30, 0.0e+00);
    cbk1 = make_cuDoubleComplex( 1.0e+30, 0.0e+00);
    cdk0 = make_cuDoubleComplex( -1.0e+30, -0.0e+00);
    cdk1 = make_cuDoubleComplex( -1.0e+30, -0.0e+00);
    }
printf("abc\n");
  if (cuCreal(z) < 0.0e+00)
    z1 = make_cuDoubleComplex(-cuCreal(z), -cuCimag(z));

  if ( a0 <= 18.0e+00 ){
    cbi0 = make_cuDoubleComplex(1.0e+00, 0.0e+00);
    cr = make_cuDoubleComplex(1.0e+00, 0.0e+00);
    for(int k=1;k<=50;k++){
    printf("def\n");
	cr=cumul(cr,a[k]);
	cr=cuCmul(cr,z2);
	cr=cudiv(cr,float(k*(k+1)));
	cbi1=cuCadd(cbi1,cr);
      if ( cuCabs(cuCdiv(cr,cbi1)) < 1.0e-15 )
        break;
   }
   cbi1=cumul(cuCmul(z1,cbi1),0.5e+00);
}
else if ( a0 < 35.0e+00 )
      k0 = 12;
    else if ( a0 < 50.0e+00 )
      k0 = 9;
    else
      k0 = 7;
 ca = cuCdiv(cuCexp(z1), cuCsqrt(cumul(z1,pi*2.0e+00)));
 cbi0 = make_cuDoubleComplex( 1.0e+00, 0.0e+00);
 zr = cudiv(z1,1.0e+00);

for(int k = 1;k<=k0;k++)
    {
      cuDoubleComplex temp2=make_cuDoubleComplex(double(a[k]),0.0e+00);
      cbi0 = cuCpow(cuCadd(cbi0,cumul(zr,a[k])),k);
    }
    cbi0 = cuCmul(ca, cbi0);
    cbi1 = make_cuDoubleComplex(1.0e+00, 0.0e+00);
    
for(int k = 1;k <= k0;k++){
      //cuDoubleComplex temp3=make_cuDoubleComplex(double(k),0.0e+00);
      //cuDoubleComplex temp4=make_cuDoubleComplex(double(b[k]),0.0e+00);
      cbi1 = cuCpow(cuCadd(cbi1,cumul(zr,b[k])),k);
    }
    cbi1 = cuCmul(ca, cbi1);
if ( a0 <= 9.0e+00 ) //then
	{
    //cuDoubleComplex temp5=make_cuDoubleComplex(0.5e+00,0.0e+00);
    cs = make_cuDoubleComplex( 0.0e+00, 0.0e+00);
    ct = cuClog(cumul(z1,0.5e+00));
    ct = make_cuDoubleComplex(-cuCreal(ct),-cuCimag(ct));
    cuDoubleComplex temp14=make_cuDoubleComplex(-0.5772156649015329e+00,0.0e+00);
    ct = cuCsub(ct,temp14); 					//-0.5772156649015329e+00;
    w0 = 0.0e+00;
    cr = make_cuDoubleComplex( 1.0e+00, 0.0e+00);
    for(int k=1;k<=50;k++)
	{
      w0 = w0 + 1.0e+00 / k;
      //cuDoubleComplex temp6=make_cuDoubleComplex(0.25e+00,0.0e+00);
      //cuDoubleComplex temp7=make_cuDoubleComplex(double(k*k),0.0e+00);
      cr = cumul(cudiv(cr,float(k*k)),0.25e+00);              //* z2;
      cr = cuCmul(cr,z2);
      cuDoubleComplex temp8=make_cuDoubleComplex(double(w0),0.0e+00);
      cs = cuCadd(cs,cuCmul(cr,cuCadd(temp8,ct)));
      if (cuCabs(cuCdiv(cuCsub(cs,cw),cs)) < 1.0e-15 ) //then
        break;
      //end if
      cw = cs;
    }

    cbk0 = cuCadd(ct,cs);
}
else
	{
    //cuDoubleComplex temp9=make_cuDoubleComplex(0.5e+00,0.0e+00);
    //cuDoubleComplex temp10=make_cuDoubleComplex(1.0e+00,0.0e+00);
    cb = cudiv(z1,0.5e+00);
    zr2 = cudiv(z2,1.0e+00);
    cbk0 = make_cuDoubleComplex( 1.0e+00, 0.0e+00);
    for(int k=1;k<=10;k++)
	 {
      //cuDoubleComplex temp11=make_cuDoubleComplex(double(a1[k]),0.0e+00);
      cbk0 = cuCpow(cuCadd(cbk0,cumul(zr2,a1[k])),k);
    }
    cbk0 = cuCdiv(cuCmul(cb,cbk0),cbi0);

  }//end if

//cuDoubleComplex temp12=make_cuDoubleComplex(1.0e+00,0.0e+00);
  cbk1 = cuCdiv(cuCsub(cudiv(z1,1.0e+00),cuCmul(cbi1,cbk0)),cbi0);

  if ( cuCreal(z)< 0.0e+00 ) 		//then
	{
    if ( cuCimag(z) < 0.0e+00 ) 		//then
      {
      cbk0 = cuCadd(cbk0,cuCmul(cumul(ci,pi),cbi0));
      cbk1 = cuCadd(cbk1,cuCmul(cumul(ci,pi),cbi1));
      cbk1 = make_cuDoubleComplex(-cuCreal(cbk1),-cuCimag(cbk1));                       //cbk1 = - cbk1 + ci * piC * cbi1;
      }
    else
	{
      cbk0 = cuCsub(cbk0,cuCmul(cumul(ci,pi),cbi0));
      cbk1 = cuCsub(cbk1,cuCmul(cumul(ci,pi),cbi1));
      cbk1 = make_cuDoubleComplex(-cuCreal(cbk1),-cuCimag(cbk1));
      /*cbk0 = cbk0 - ci * piC * cbi0;
      cbk1 = - cbk1 - ci * piC * cbi1;*/
	}
    //end if

    cbi1 = make_cuDoubleComplex(-cuCreal(cbi1),-cuCimag(cbi1));

  }//end if

  cdi0 = cbi1;
  cuDoubleComplex temp13=make_cuDoubleComplex(1.0e+00,0.0e+00);
  cdi1 = cuCsub(cbi0,cuCmul(cudiv(z,1.0e+00),cbi1));
  cdk0 = make_cuDoubleComplex(-cuCreal(cbk1),-cuCimag(cbk1));
  cdk1 = cuCsub(cbk0,cuCmul(cuCdiv(temp13,z),cbk1));
  cdk1 = make_cuDoubleComplex(-cuCreal(cdk1),-cuCimag(cdk1));

return cbk0;

}

This is not valid:

cuDoubleComplex *c,*d;
	for(int i=0;i<N;i++)
	{
	c[i]=make_cuDoubleComplex(a[i],b[i]);
	}

That’s a problem with basic C programming skills, not CUDA. You have not allocated any space for the c variable.

Try changing this:

cuDoubleComplex *c,*d;

To this:

cuDoubleComplex c[N];
cuDoubleComplex *d;
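Putting that change in, the host side would look roughly like this (a sketch only; bessel.h and the kernel body are taken unchanged from your post, and the error-check printfs after the launch are something I would add for debugging, not part of your original):

#include <cstdio>
#include <cuComplex.h>
#include "bessel.h"

const int N = 5;
const int blocksize = 5;

__global__ void add(cuDoubleComplex *a)
{
    cuDoubleComplex b = bessel(a[threadIdx.x]);
    a[threadIdx.x] = b;
    printf(" The value is %lf + %lfi \n", cuCreal(b), cuCimag(b));
}

int main()
{
    double a[N] = {97.24, 80.60, 54.13, -60.05, 99.96};
    double b[N] = {-67.42, -10.54, -5.83, 85.21, -98.33};

    cuDoubleComplex c[N];     // host array now has real storage
    cuDoubleComplex *d;

    for (int i = 0; i < N; i++)
        c[i] = make_cuDoubleComplex(a[i], b[i]);

    cudaMalloc((void**)&d, N * sizeof(cuDoubleComplex));
    cudaMemcpy(d, c, N * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

    add<<<dim3(1, 1), dim3(blocksize, 1)>>>(d);

    // Check for launch/runtime errors so the next failure is easier to locate.
    printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
    printf("sync:   %s\n", cudaGetErrorString(cudaDeviceSynchronize()));

    cudaMemcpy(c, d, N * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
    cudaFree(d);
    return 0;
}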