Hi,
I’ve broken it down to an easily reproducible test case:
In test_main.cu
/** Include common code. */
#include <stdlib.h>
#include <stdio.h>
/** Include CUDA libraries. */
#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include "test_kernel.cuh"
#define cudaErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define GRID (10)
#define BLOCK (128)
//////////////////////////////////////////////////////////////////////////////
/** Main function
*
*
*
* \param[in] argc command line argument count
* \param[in] argv command line argument vector
*/
int main (int argc, char *argv[])
{
double h_host[13 * GRID * BLOCK] = {0};
double* h_device;
cudaErrorCheck( cudaMalloc(&h_device, 13 * GRID * BLOCK * sizeof(double)) );
cudaErrorCheck( cudaMemcpy(h_device, h_host, 13 * GRID * BLOCK * sizeof(double), cudaMemcpyHostToDevice) );
test_kernel<<<GRID, BLOCK>>>(h_device);
cudaErrorCheck( cudaMemcpy(h_host, h_device, 13 * GRID * BLOCK * sizeof(double), cudaMemcpyDeviceToHost) );
for (int i = 0; i < 13; i++)
{
printf("%.15e\n", h_host[i * GRID * BLOCK]);
}
cudaErrorCheck( cudaDeviceReset() );
return 0;
}
test_kernel.cuh:
#ifndef TEST_KERNEL_CUH
#define TEST_KERNEL_CUH
__global__ void test_kernel(double* h);
#endif
test_kernel.cu:
#include "large_device_fn.cuh"
__global__ void
__launch_bounds__(128, 8)
test_kernel(double* h)
{
large_device_fn(1000, h);
for (int i = 0; i < 13; i++)
{
h[i] += 1.0;
}
}
large_device_fn.cuh:
#ifndef LARGE_DEVICE_CUH
#define LARGE_DEVICE_CUH
__device__ void large_device_fn(double T, double* h);
#endif
large_device_fn.cu:
#define TID (threadIdx.x + blockDim.x * blockIdx.x)
#define GRID_SIZE (blockDim.x * gridDim.x)
__device__ void large_device_fn(double T, double* h)
{
if (T <= 1000.0) {
h[TID + GRID_SIZE * 0] = 8.24876732e+07 * (2.54716300e+04 + T * (2.50000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + 0.00000000e+00 * T)))));
} else {
h[TID + GRID_SIZE * 0] = 8.24876732e+07 * (2.54716300e+04 + T * (2.50000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + 0.00000000e+00 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 1] = 4.12438366e+07 * (-1.01252100e+03 + T * (3.29812400e+00 + T * (4.12472100e-04 + T * (-2.71433833e-07 + T * (-2.36885850e-11 + 8.26974400e-14 * T)))));
} else {
h[TID + GRID_SIZE * 1] = 4.12438366e+07 * (-8.35034000e+02 + T * (2.99142300e+00 + T * (3.50032200e-04 + T * (-1.87794300e-08 + T * (-2.30789450e-12 + 3.16550400e-16 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 2] = 5.19676363e+06 * (2.91476400e+04 + T * (2.94642900e+00 + T * (-8.19083000e-04 + T * (8.07010667e-07 + T * (-4.00710750e-10 + 7.78139200e-14 * T)))));
} else {
h[TID + GRID_SIZE * 2] = 5.19676363e+06 * (2.92308000e+04 + T * (2.54206000e+00 + T * (-1.37753100e-05 + T * (-1.03426767e-09 + T * (1.13776675e-12 + -8.73610400e-17 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 3] = 4.88876881e+06 * (3.34630913e+03 + T * (4.12530561e+00 + T * (-1.61272470e-03 + T * (2.17588230e-06 + T * (-1.44963411e-09 + 4.12474758e-13 * T)))));
} else {
h[TID + GRID_SIZE * 3] = 4.88876881e+06 * (3.68362875e+03 + T * (2.86472886e+00 + T * (5.28252240e-04 + T * (-8.63609193e-08 + T * (7.63046685e-12 + -2.66391752e-16 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 4] = 4.61523901e+06 * (-3.02081100e+04 + T * (3.38684200e+00 + T * (1.73749100e-03 + T * (-2.11823200e-06 + T * (1.74214525e-09 + -5.01317600e-13 * T)))));
} else {
h[TID + GRID_SIZE * 4] = 4.61523901e+06 * (-2.98992100e+04 + T * (2.67214600e+00 + T * (1.52814650e-03 + T * (-2.91008667e-07 + T * (3.00249000e-11 + -1.27832360e-15 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 5] = 2.59838181e+06 * (-1.00524900e+03 + T * (3.21293600e+00 + T * (5.63743000e-04 + T * (-1.91871667e-07 + T * (3.28469250e-10 + -1.75371080e-13 * T)))));
} else {
h[TID + GRID_SIZE * 5] = 2.59838181e+06 * (-1.23393000e+03 + T * (3.69757800e+00 + T * (3.06759850e-04 + T * (-4.19614000e-08 + T * (4.43820250e-12 + -2.27287000e-16 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 6] = 2.51903170e+06 * (2.94808040e+02 + T * (4.30179801e+00 + T * (-2.37456025e-03 + T * (7.05276303e-06 + T * (-6.06909735e-09 + 1.85845025e-12 * T)))));
} else {
h[TID + GRID_SIZE * 6] = 2.51903170e+06 * (1.11856713e+02 + T * (4.01721090e+00 + T * (1.11991006e-03 + T * (-2.11219383e-07 + T * (2.85615925e-11 + -2.15817070e-15 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 7] = 2.44438441e+06 * (-1.76631500e+04 + T * (3.38875400e+00 + T * (3.28461300e-03 + T * (-4.95004333e-08 + T * (-1.15645150e-09 + 4.94303000e-13 * T)))));
} else {
h[TID + GRID_SIZE * 7] = 2.44438441e+06 * (-1.80069600e+04 + T * (4.57316700e+00 + T * (2.16806800e-03 + T * (-4.91563000e-07 + T * (5.87226000e-11 + -2.86330800e-15 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 8] = 2.96804743e+06 * (-1.02090000e+03 + T * (3.29867700e+00 + T * (7.04120000e-04 + T * (-1.32107400e-06 + T * (1.41037875e-09 + -4.88971000e-13 * T)))));
} else {
h[TID + GRID_SIZE * 8] = 2.96804743e+06 * (-9.22797700e+02 + T * (2.92664000e+00 + T * (7.43988500e-04 + T * (-1.89492033e-07 + T * (2.52426000e-11 + -1.35067020e-15 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 9] = 2.08133323e+06 * (-7.45375000e+02 + T * (2.50000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + 0.00000000e+00 * T)))));
} else {
h[TID + GRID_SIZE * 9] = 2.08133323e+06 * (-7.45375000e+02 + T * (2.50000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + 0.00000000e+00 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 10] = 2.07727727e+07 * (-7.45375000e+02 + T * (2.50000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + 0.00000000e+00 * T)))));
} else {
h[TID + GRID_SIZE * 10] = 2.07727727e+07 * (-7.45375000e+02 + T * (2.50000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + T * (0.00000000e+00 + 0.00000000e+00 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 11] = 2.96834943e+06 * (-1.43105400e+04 + T * (3.26245200e+00 + T * (7.55970500e-04 + T * (-1.29391833e-06 + T * (1.39548600e-09 + -4.94990200e-13 * T)))));
} else {
h[TID + GRID_SIZE * 11] = 2.96834943e+06 * (-1.42683500e+04 + T * (3.02507800e+00 + T * (7.21344500e-04 + T * (-1.87694267e-07 + T * (2.54645250e-11 + -1.38219040e-15 * T)))));
}
if (T <= 1000.0) {
h[TID + GRID_SIZE * 12] = 1.88923414e+06 * (-4.83731400e+04 + T * (2.27572500e+00 + T * (4.96103600e-03 + T * (-3.46970333e-06 + T * (1.71667175e-09 + -4.23456000e-13 * T)))));
} else {
h[TID + GRID_SIZE * 12] = 1.88923414e+06 * (-4.89669600e+04 + T * (4.45362300e+00 + T * (1.57008450e-03 + T * (-4.26137000e-07 + T * (5.98499250e-11 + -3.33806600e-15 * T)))));
}
}
compiling as:
nvcc -arch=sm_20 -m64 -O3 -dc -o obj/test_kernel.cu.o src/test_kernel.cu
nvcc -arch=sm_20 -m64 -O3 -dc -o obj/large_device_fn.cu.o src/large_device_fn.cu
nvcc -arch=sm_20 -m64 -O3 -I/usr/local/cuda/include/
-I/usr/local/cuda/samples/common/inc -dc -o obj/test_main.cu.o src/test_main.cu
nvcc obj/test_kernel.cu.o obj/large_device_fn.cu.o o
bj/test_main.cu.o -lm -L/usr/local/cuda/lib64 -L/usr/local/lib -lcuda -lcudart
-lstdc++ -dlink -o obj/dlink.o
The link line breaks with the following error:
nvlink error : entry function ‘_Z11test_kernelPd’ with max regcount of 32 calls function ‘_Z15large_device_fndPd’ with regcount of 41
RHEL 6.6, CUDA runtime = 7.0, Tesla C2075
Nick