Hi,
I’m trying to fix a much more complicated problem in a real project related to separating device functions into multiple files, and I’m trying to understand how separable compilation works right now. Unfortunately, I can’t even get my trivial example to link properly.
kernel1.h:
#ifndef _KERNEL1_H_
#define _KERNEL1_H_
__device__ __host__ void adder(int *a, int *b, int *x);
#endif
kernel2.h:
#ifndef _KERNEL2_H_
#define _KERNEL2_H_
__device__ __host__ void doubler(int *a, int *x);
#endif
kernel1.cu:
#include "kernel1.h"
__device__ __host__ void adder(int *a, int *b, int *x) {
*x = *a + *b;
}
kernel2.cu:
#include "kernel1.h"
#include "kernel2.h"
__device__ __host__ void doubler(int *a, int *x) {
adder(a, a, x);
}
main.cu:
#include <cuda.h>
#include <cstdio>
#include "kernel2.h"
__global__ void launch_doubler(int *a, int *x) {
doubler(a, x);
}
int main(void) {
int *a;
int *x;
int a_host = 5;
cudaMalloc((void **)&a, sizeof(int));
cudaMalloc((void **)&x, sizeof(int));
cudaMemcpy(a, &a_host, sizeof(int), cudaMemcpyHostToDevice);
launch_doubler<<<1, 1>>>(a, x);
cudaMemcpy(&a_host, a, sizeof(int), cudaMemcpyDeviceToHost);
printf("%i\n", a_host);
cudaFree(a);
cudaFree(x);
}
build.sh:
#!/bin/sh
set -o verbose
nvcc -std c++11 -Xcompiler -Wall -gencode arch=compute_35,code=sm_35 -dc kernel1.cu -o kernel1.o
nvcc -std c++11 -Xcompiler -Wall -gencode arch=compute_35,code=sm_35 -dc kernel2.cu -o kernel2.o
nvcc -gencode arch=compute_35,code=sm_35 -dlink kernel1.o kernel2.o -o kernels.o
nvcc -std c++11 -Xcompiler -Wall -gencode arch=compute_35,code=sm_35 main.cu kernels.o
Unfortunately, when I run build.sh, I get the following error:
ptxas fatal : Unresolved extern function '_Z7doublerPiS_'
and yet, the symbol does exist in the object file:
$ strings kernels.o | grep _Z7doublerPiS_
.text._Z7doublerPiS_
.nv.info._Z7doublerPiS_
.rel.text._Z7doublerPiS_
_Z7doublerPiS_
.text._Z7doublerPiS_
.nv.info._Z7doublerPiS_
.rel.text._Z7doublerPiS_
I also get the same pxtas error if I run
$ nvcc -std c++11 -Xcompiler -Wall -gencode arch=compute_35,code=sm_35 main.cu kernel1.cu kernel2.cu
ptxas fatal : Unresolved extern function '_Z7doublerPiS_'
Can someone please help me understand what is happening here? If it helps, my version info is
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17