Hello all,
I want to increase the register usage of a kernel, so I compiled the code below with the flag --maxrregcount=16:
/*
 * Transforms one vertex per thread.
 *
 * Each thread computes the index blockIdx.x * blockDim.x + threadIdx.x (only
 * the x components of the launch configuration are used) and, when that index
 * is below NumVertices, reads (xIN, yIN, zIN) at that index and writes
 * (xOUT, yOUT, zOUT) at the same index. Threads whose index is at or beyond
 * NumVertices do nothing.
 *
 * tmIN supplies the coefficients: entries 0/4/8/12 produce the x output,
 * 1/5/9/13 the y output and 2/6/10/14 the z output. Entries 3, 7, 11 and 15
 * are never read. (This looks like the first three rows of a column-major
 * 4x4 matrix -- confirm against the caller.)
 */
__global__ void kernel(float* xIN, float* yIN, float* zIN, float* tmIN, int NumVertices,
                       float* xOUT, float* yOUT, float* zOUT)
{
    const int vertex = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the vertex arrays have nothing to do.
    if (vertex >= NumVertices) {
        return;
    }

    // Fetch the input position once; it is reused by all three output rows.
    const float px = xIN[vertex];
    const float py = yIN[vertex];
    const float pz = zIN[vertex];

    // The three stores are kept in x, y, z order, and each row's coefficients
    // are loaded inside its own expression, exactly as in the original listing.
    xOUT[vertex] = tmIN[0] * px + tmIN[4] * py + tmIN[8]  * pz + tmIN[12];
    yOUT[vertex] = tmIN[1] * px + tmIN[5] * py + tmIN[9]  * pz + tmIN[13];
    zOUT[vertex] = tmIN[2] * px + tmIN[6] * py + tmIN[10] * pz + tmIN[14];
}
The --ptxas-options=-v option shows,
ptxas info : Compiling entry function '_Z9transformPfS_S_S_iS_S_S_' for 'sm_13'
ptxas info : Used 8 registers, 64+16 bytes smem
But when I set --maxrregcount=32 and also declare some extra variables in the kernel, as shown below, the register usage stays the same:
/*
 * Variant of the vertex-transform kernel with extra local variables added to
 * probe register usage.
 *
 * Each thread computes the index blockIdx.x * blockDim.x + threadIdx.x (only
 * the x components of the launch configuration are used) and, when that index
 * is below NumVertices, reads (xIN, yIN, zIN) at that index and writes
 * (xOUT, yOUT, zOUT) at the same index. Threads whose index is at or beyond
 * NumVertices write nothing.
 *
 * tmIN supplies the coefficients: entries 0/4/8/12 produce the x output,
 * 1/5/9/13 the y output and 2/6/10/14 the z output. Entries 3, 7, 11 and 15
 * are never read.
 */
__global__ void transform(float* xIN, float* yIN, float* zIN, float* tmIN, int NumVertices,
                          float* xOUT, float* yOUT, float* zOUT)
{
    const int vertex = blockIdx.x * blockDim.x + threadIdx.x;

    // Extra locals added for the register-usage experiment. The final value of
    // `a` is never read again and never stored to memory, so these statements
    // have no effect on anything the kernel writes.
    register float a, b, c, d, f, r, w, q, s, h, n;
    a = b = c = d = f = r = w = q = s = h = n = 4;
    a = b + c + d + f + r + w + q + s + h + n;
    a = b + a;

    // Threads past the end of the vertex arrays have nothing to write.
    if (vertex >= NumVertices) {
        return;
    }

    // Fetch the input position once; it is reused by all three output rows.
    const float px = xIN[vertex];
    const float py = yIN[vertex];
    const float pz = zIN[vertex];

    // The three stores are kept in x, y, z order, and each row's coefficients
    // are loaded inside its own expression, exactly as in the original listing.
    xOUT[vertex] = tmIN[0] * px + tmIN[4] * py + tmIN[8]  * pz + tmIN[12];
    yOUT[vertex] = tmIN[1] * px + tmIN[5] * py + tmIN[9]  * pz + tmIN[13];
    zOUT[vertex] = tmIN[2] * px + tmIN[6] * py + tmIN[10] * pz + tmIN[14];
}
ptxas info : Compiling entry function '_Z9transformPfS_S_S_iS_S_S_' for 'sm_13'
ptxas info : Used 8 registers, 64+16 bytes smem
Note: I am compiling for compute capability 1.3 (sm_13).
Thanks for any help in this regard.