I was able to prove it to myself using this hacked version of Norbert’s example:
#include <stdio.h>
#include <stdlib.h>
#define N 5
__device__ float add_func (float x, float y)
{
return x + y;
}
__device__ float mul_func (float x, float y)
{
return x * y;
}
__device__ float div_func (float x, float y)
{
return x / y;
}
typedef float (*op_func) (float, float);
__device__ op_func action;
__device__ op_func funcs[3] = { add_func, mul_func, div_func };
__device__ void op_array (const float *a, const float *b, float *res, op_func f, int n)
{
for (int i = 0; i < N; i++) {
res[i] = f(a[i], b[i]);
}
}
__global__ void kernel (void)
{
float x[N];
float y[N];
float res[N];
for (int i = 0; i < N; i++) {
x[i] = (float)(10 + i);
}
for (int i = 0; i < N; i++) {
y[i] = (float)(100 + i);
}
op_array (x, y, res, action, N);
for (int i = 0; i < N; i++) {
printf ("res = % 16.9e\n", res[i]);
}
}
#include <stdio.h>
#include <cuda.h>
int main(void)
{
CUmodule mhandle;
CUcontext chandle;
CUfunction khandle;
CUdeviceptr funcshandle, actionhandle;
size_t op_funcsz;
cuInit(0);
cuCtxCreate( &chandle, 0, 0 );
cuModuleLoad( &mhandle, "funcpointer.cubin");
cuModuleGetFunction( &khandle, mhandle, "_Z6kernelv");
cuModuleGetGlobal( &funcshandle, NULL, mhandle, "funcs");
cuModuleGetGlobal( &actionhandle, &op_funcsz, mhandle, "action");
for(int i=0; i<3; i++) {
size_t offset = size_t(i) * op_funcsz;
cuMemcpyDtoD(actionhandle, funcshandle+offset, op_funcsz);
cuFuncSetBlockShape( khandle, 1, 1, 1 );
cuLaunchGrid( khandle, 1, 1);
cuCtxSynchronize();
}
cuCtxDestroy(chandle);
return 0;
}
avidday@cuda:~$ nvcc -cubin -arch=sm_20 -Xptxas="-v" funcpointer.cu
ptxas info : Compiling entry function '_Z6kernelv' for 'sm_20'
ptxas info : Used 22 registers, 32 bytes cmem[0], 28 bytes cmem[14]
avidday@cuda:~$ g++ funcpointmain.c -I $CUDA_INSTALL_PATH/include -lcuda -o funcpointer.exe
avidday@cuda:~$ ./funcpointer.exe
res = 1.100000000e+02
res = 1.120000000e+02
res = 1.140000000e+02
res = 1.160000000e+02
res = 1.180000000e+02
res = 1.000000000e+03
res = 1.111000000e+03
res = 1.224000000e+03
res = 1.339000000e+03
res = 1.456000000e+03
res = 1.000000015e-01
res = 1.089108884e-01
res = 1.176470593e-01
res = 1.262135953e-01
res = 1.346153915e-01