Function template specialization
Hello
I'm trying to "specialize" (overload would be a more adequate term) a template kernel function. My real code is quite long, but I made a simpler version to show you the problem.
I've got a generic virtual template class called [b]generalClass[/b]
I've got a derived class called [b]derivedClass[/b] which implements [b]generalClass[/b]'s virtual functions.
Then I have a template kernel which should work for any class derived from [b]generalClass[/b]
But I would like to overload it for [b]derivedClass[/b] in order to implement an optimized version.
And lastly, I have a template class [b]callingClass[/b] which calls the kernel.
Here is the code:
[code]
#include <cuda.h>
#include <cutil_inline.h>

/// Generic base class template.
/// A: element type; D and Q: compile-time size parameters (the derived class
/// below fixes them to D=2, Q=9 — presumably a D2Q9-style layout; TODO confirm).
/// NOTE(review): the post describes this as a "virtual" class, but no virtual
/// functions are visible in this reduced example.
template < typename A, unsigned int D, unsigned int Q >
class generalClass
{
private:
public:
// Trivial default constructor (host-only: no __host__ __device__ qualifier).
generalClass() {}
};


/// Concrete derived class fixing the base's non-type parameters at D=2, Q=9.
/// This is the type for which the overloaded testKernel provides an
/// "optimized" path.
template < typename A >
class derivedClass : public generalClass<A,2,9>
{
private:
public:
// Trivial default constructor (host-only).
derivedClass() {}
};


// General test kernel: accepts any class template matching
// template<typename> class. The template-template parameter deliberately
// shadows the ::generalClass name declared above.
// NOTE(review): V appears only in the template parameter list, never in the
// signature, so it is not deducible — callers must always spell it explicitly.
template< class A, template<typename U> class generalClass, typename V >
__global__ void testKernel(
generalClass<A> d,
generalClass<A> d_tmp)
{ }

/// Overload of testKernel intended as the optimized path for derivedClass.
/// Its template parameter list is identical to the general version's; C++
/// partial ordering should prefer this overload for derivedClass arguments
/// (g++ accepts the equivalent host-only code), but nvcc's generated device
/// stub reports the call as ambiguous — the compiler bug this repro shows.
/// NOTE(review): the template-template parameter is unused in this overload.
template< class A, template<class U> class generalClass, typename V >
__global__ void testKernel(
derivedClass<A> d,
derivedClass<A> d_tmp)
{ }

/// Host-side wrapper: owns two generalClass<float> instances and launches
/// testKernel on them.
/// T is unused in this reduced example; V is forwarded as the kernel's
/// explicit (non-deducible) third template argument.
template < typename T, template <typename A> class generalClass, typename V >
class callingClass
{
private:
public:
callingClass() { }

generalClass<float> d;
generalClass<float> d_tmp;

/// Launches a single-thread, single-block instance of testKernel.
void testkernelWrapper()
{
// BUG FIX: dim3 components must be >= 1. The original set the .y and .z
// members to 0, which is an invalid execution configuration and would make
// the launch fail at runtime (cudaErrorInvalidConfiguration).
dim3 block_size, grid_size;
block_size.x = 1; block_size.y = 1; block_size.z = 1;
grid_size.x = 1; grid_size.y = 1; grid_size.z = 1;
// V cannot be deduced from the arguments, so all three template arguments
// are given explicitly.
testKernel< float, generalClass, V> <<< grid_size, block_size>>>(d, d_tmp);
}
};

/// Instantiates callingClass with derivedClass so the kernel call should pick
/// the optimized testKernel overload, then launches it once.
int main( void )
{
callingClass<float, derivedClass, int> *test = new callingClass<float, derivedClass, int> ();
test->testkernelWrapper();
delete test; // BUG FIX: the original allocation was never freed.
return 0;
}
[/code]

It doesn't compile, the compiler (nvcc 4.0) doesn't know which kernel to choose.
Error message:
nvcc test.cu -arch=sm_20 -I/usr/not-backed-up/NVIDIA_SDK_4_0/CUDALibraries/common/inc -L/usr/lib64/nvidia -lcuda
In file included from /tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:2:
/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c: In function ‘void __device_stub__Z10testKernelIf12derivedClassiEvS0_IT_ES2_(_Z12derivedClassIfE&, _Z12derivedClassIfE&)’:
/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:6: error: insufficient contextual information to determine type
/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:6: error: insufficient contextual information to determine type
In file included from /tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:2:
/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c: At global scope:
/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:7: error: ambiguous template specialization ‘__wrapper__device_stub_testKernel<float, template<class A> class derivedClass, int>’ for ‘void __wrapper__device_stub_testKernel(_Z12derivedClassIfE&, _Z12derivedClassIfE&)’
In file included from /tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:2:
/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c: In function ‘void __sti____cudaRegisterAll_40_tmpxft_00001e6e_00000000_4_test2_cpp1_ii_main()’:
/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:8: error: insufficient contextual information to determine type

What did I do wrong?
Thank you for your help.

Note: if I get rid of the typename V parameter, the code compiles; that means that in this case the compiler knows how to choose (but I'm not sure which kernel it chose).
Hello

I'm trying to "specialize" (overload would be a more adequate term) a template kernel function. My real code is quite long, but I made a simpler version to show you the problem.

I've got a generic virtual template class called generalClass

I've got a derived class called derivedClass which implements generalClass's virtual functions.

Then I have a template kernel which should work for any class derived from generalClass

But I would like to overload it for derivedClass in order to implement an optimized version.

And lastly, I have a template class callingClass which calls the kernel.

Here is the code:



#include <cuda.h>

#include <cutil_inline.h>



template < typename A, unsigned int D, unsigned int Q >

class generalClass

{

private:

public:

generalClass() {}

};





template < typename A >

class derivedClass : public generalClass<A,2,9>

{

private:

public:

derivedClass() {}

};





//general test kernel

template< class A, template<typename U> class generalClass, typename V >

__global__ void testKernel(

generalClass<A> d,

generalClass<A> d_tmp)

{ }



///specialisation of testKernel

template< class A, template<class U> class generalClass, typename V >

__global__ void testKernel(

derivedClass<A> d,

derivedClass<A> d_tmp)

{ }



/// general template callingClass

template < typename T, template <typename A> class generalClass, typename V >

class callingClass

{

private:

public:

callingClass() { }



generalClass<float> d;

generalClass<float> d_tmp;



void testkernelWrapper()

{

dim3 block_size, grid_size;

block_size.x = 1; block_size.y = 0; block_size.z = 0;

grid_size.x = 1; grid_size.y = 0; grid_size.z = 0;

testKernel< float, generalClass, V> <<< grid_size, block_size>>>(d, d_tmp);

}

};



int main( void )

{

callingClass<float, derivedClass, int> *test = new callingClass<float, derivedClass, int> ();

test->testkernelWrapper();

return 0;

}




It doesn't compile, the compiler (nvcc 4.0) doesn't know which kernel to choose.

Error message:

nvcc test.cu -arch=sm_20 -I/usr/not-backed-up/NVIDIA_SDK_4_0/CUDALibraries/common/inc -L/usr/lib64/nvidia -lcuda

In file included from /tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:2:

/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c: In function ‘void __device_stub__Z10testKernelIf12derivedClassiEvS0_IT_ES2_(_Z12derivedClassIfE&, _Z12derivedClassIfE&)’:

/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:6: error: insufficient contextual information to determine type

/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:6: error: insufficient contextual information to determine type

In file included from /tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:2:

/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c: At global scope:

/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:7: error: ambiguous template specialization ‘__wrapper__device_stub_testKernel<float, template<class A> class derivedClass, int>’ for ‘void __wrapper__device_stub_testKernel(_Z12derivedClassIfE&, _Z12derivedClassIfE&)’

In file included from /tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:2:

/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c: In function ‘void __sti____cudaRegisterAll_40_tmpxft_00001e6e_00000000_4_test2_cpp1_ii_main()’:

/tmp/tmpxft_00001e6e_00000000-1_test2.cudafe1.stub.c:8: error: insufficient contextual information to determine type



What did I do wrong?

Thank you for your help.



Note: if I get rid of the typename V parameter, the code compiles; that means that in this case the compiler knows how to choose (but I'm not sure which kernel it chose).

#1
Posted 11/24/2011 03:34 PM   
If possible, could you try with CUDA 4.1 RC1? Based on information I got from the compiler team, your posted example code should work with RC1.
If possible, could you try with CUDA 4.1 RC1? Based on information I got from the compiler team, your posted example code should work with RC1.

#2
Posted 11/28/2011 05:47 PM   
Now that CUDA 4.1 has officially been released, I tried the same program with the new cuda compiler. But I still get the same errors. What is wrong with my code?
Now that CUDA 4.1 has officially been released, I tried the same program with the new cuda compiler. But I still get the same errors. What is wrong with my code?

#3
Posted 02/27/2012 10:58 AM   
I do not know enough details about C++ to determine whether this code should work or not. Given that I was previously advised by the compiler team that it should work, I would suggest filing a bug against the compiler.
I do not know enough details about C++ to determine whether this code should work or not. Given that I was previously advised by the compiler team that it should work, I would suggest filing a bug against the compiler.

#4
Posted 02/27/2012 04:29 PM   
The same code in C++ (replacing the kernel functions with plain C++ functions) compiles and runs fine with g++. I think there is a problem in the way nvcc treats template kernel calls. Where and how can I file a bug report?
Thanks
The same code in C++ (replacing the kernel functions with plain C++ functions) compiles and runs fine with g++. I think there is a problem in the way nvcc treats template kernel calls. Where and how can I file a bug report?

Thanks

#5
Posted 02/28/2012 04:21 PM   
There is a link to the bug reporting form on the starting page of the registered developer site. If you are not a registered developer yet, sign up today and your request will likely be approved within one business day (the process was streamlined in the past year, and I have gone through it myself to exercise exactly what registered developers are seeing). If you experience undue delay in the sign-up process, send me a PM through the forum and I will follow up internally.
There is a link to the bug reporting form on the starting page of the registered developer site. If you are not a registered developer yet, sign up today and your request will likely be approved within one business day (the process was streamlined in the past year, and I have gone through it myself to exercise exactly what registered developers are seeing). If you experience undue delay in the sign-up process, send me a PM through the forum and I will follow up internally.

#6
Posted 02/28/2012 10:19 PM   
Hi, I have filed a bug report. I hope it will be corrected for the next version of CUDA.
Hi, I have filed a bug report. I hope it will be corrected for the next version of CUDA.

#7
Posted 03/02/2012 01:16 PM   
[quote name='nicolas.delbosc' date='02 March 2012 - 01:16 PM' timestamp='1330694211' post='1377583']
Hi, I have filed a bug report. I hope it will be corrected for the next version of CUDA.
[/quote]

I have a similar problem in 4.1 (and older) calling a templated kernel with an inner templated type. Calling a regular templated function works fine though.

[code]template
<
template<typename> class C1
>
// Plain host function template taking a template-template parameter; this
// compiles and runs fine with both namespace-level and class-nested arguments.
void regular_function() {}

template
<
template<typename> class C1
>
// Kernel analogue of regular_function. Launching it works with a
// namespace-level template argument, but nvcc's generated host stub fails
// when the argument is a class-nested template (see main below).
__global__ void kernel_function() {}

template<typename> struct C1 {};

// Holder whose nested class template triggers the nvcc stub-generation failure.
struct C
{
template <typename> struct C1_inside_class {};
};

int main()
{
// A host function template accepts both argument forms, and the kernel
// launch works with the namespace-level template...
regular_function <C1> ();
kernel_function <C1> <<< 1,1 >>> ();
regular_function <C::C1_inside_class> ();
// works until here

// ...but a kernel launch with a class-nested template-template argument makes
// nvcc emit a stub in which the name is unqualified, so the host compiler
// sees an undeclared 'C1_inside_class' (errors reproduced below).
kernel_function <C::C1_inside_class> <<< 1,1 >>> ();
/*
nvcc fails with:
In file included from tmpxft_0000244c_00000000-1_nvcc_bind_problem.cudafe1.stub.c:2:
/tmp/tmpxft_0000244c_00000000-1_nvcc_bind_problem.cudafe1.stub.c:9: error: ‘C1_inside_class’ was not declared in this scope
/tmp/tmpxft_0000244c_00000000-1_nvcc_bind_problem.cudafe1.stub.c:9: error: template-id ‘__wrapper__device_stub_kernel_function<<expression error> >’ for ‘void __wrapper__device_stub_kernel_function()’ does not match any template declaration
*/
}
[/code]
I did not find a way to file a bug though. Can't find the link mentioned on the registered developer starting page.
[quote name='nicolas.delbosc' date='02 March 2012 - 01:16 PM' timestamp='1330694211' post='1377583']

Hi, I have filed a bug report. I hope it will be corrected for the next version of CUDA.





I have a similar problem in 4.1 (and older) calling a templated kernel with an inner templated type. Calling a regular templated function works fine though.



template

<

template<typename> class C1

>

void regular_function() {}



template

<

template<typename> class C1

>

__global__ void kernel_function() {}



template<typename> struct C1 {};



struct C

{

template <typename> struct C1_inside_class {};

};



int main()

{

regular_function <C1> ();

kernel_function <C1> <<< 1,1 >>> ();

regular_function <C::C1_inside_class> ();

// works until here



kernel_function <C::C1_inside_class> <<< 1,1 >>> ();

/*

nvcc fails with:

In file included from tmpxft_0000244c_00000000-1_nvcc_bind_problem.cudafe1.stub.c:2:

/tmp/tmpxft_0000244c_00000000-1_nvcc_bind_problem.cudafe1.stub.c:9: error: ‘C1_inside_class’ was not declared in this scope

/tmp/tmpxft_0000244c_00000000-1_nvcc_bind_problem.cudafe1.stub.c:9: error: template-id ‘__wrapper__device_stub_kernel_function<<expression error> >’ for ‘void __wrapper__device_stub_kernel_function()’ does not match any template declaration

*/

}


I did not find a way to file a bug though. Can't find the link mentioned on the registered developer starting page.

#8
Posted 04/12/2012 08:29 PM   
(1) Goto http://developer.nvidia.com/
(2) Click on "Registered Developer Website" (green link in upper right corner)
(3) Log in or create new account as appropriate
(4) On personal start page, click green link "CUDA/GPU Computing Registered Developer Program"
(5) Scroll to the bottom of the page that comes up, section "Report An Issue"
(6) Click on green link "The Submit a Bug Form"
(1) Goto http://developer.nvidia.com/

(2) Click on "Registered Developer Website" (green link in upper right corner)

(3) Log in or create new account as appropriate

(4) On personal start page, click green link "CUDA/GPU Computing Registered Developer Program"

(5) Scroll to the bottom of the page that comes up, section "Report An Issue"

(6) Click on green link "The Submit a Bug Form"

#9
Posted 04/12/2012 08:56 PM   
Scroll To Top