error: identifier "texKernel"/"texData" is undefined?

Hello,

I am trying to implement the SDK convolutionFFT2D into a project i am working on to check its performance in comparison to my naive convolution kernel. I created a “ConvFFT2DPerformer.cu” file with a header that does exactly the same as convolutionFFT2D.cu as (except of the gold computation and random data filling) and included the

kernel.cu, but when i try to make it I get:

ConvFFT2DPerformer.cu(125): error: identifier "texKernel" is undefined

ConvFFT2DPerformer.cu(125): error: no instance of overloaded function "cudaBindTextureToArray" matches the argument list

			argument types are: (<error-type>, cudaArray *)

ConvFFT2DPerformer.cu(126): error: identifier "texData" is undefined

ConvFFT2DPerformer.cu(126): error: no instance of overloaded function "cudaBindTextureToArray" matches the argument list

			argument types are: (<error-type>, cudaArray *)

I checked the Makefile of both mine and the sdk project and I also checked the code of the sdk project with what ConvFFT2DPerformer does, and I cant find a difference, but the convolutionFFT2D sdk project compiles just fine.

the code is at the bottom. (the licensing part is also in the code files, but I thought it shouldnt be posted here for length of post issues…)

Thanks in advance and I hope its just a misunderstanding from my side.


ConvFFT2DPerformer.cuh:

#ifndef CONVFFT2DPERFORMER_CUH_

#define CONVFFT2DPERFORMER_CUH_

#include "Matrixd.cuh"

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <cufft.h>

#include <cutil_inline.h>

#include "Complex.h"

#include "convolutionFFT2D_kernel.cuh"

void fft2dConvolve(Matrixd &m, Matrixd &filter, Matrixd &result);

int iDivUp(int, int);

int iAlignUp(int, int);

int calculateFFTsize(int);

#endif

ConvFFT2DPerformer.cu:

#include "ConvFFT2DPerformer.cuh"

void fft2dConvolve(Matrixd &image, Matrixd &filter, Matrixd &result) {

	//Kernel dimensions

	const int KERNEL_W = 5;

	const int KERNEL_H = 5;

	//Kernel center position

	const int KERNEL_X = 2;

	const int KERNEL_Y = 2;

	//Width and height of padding for "clamp to border" addressing mode

	const int PADDING_W = KERNEL_W - 1;

	const int PADDING_H = KERNEL_H - 1;

	//Input data dimension

	const int DATA_W = image.width;

	const int DATA_H = image.height;

	//Derive FFT size from data and kernel dimensions

	const int FFT_W = calculateFFTsize(DATA_W + PADDING_W);

	const int FFT_H = calculateFFTsize(DATA_H + PADDING_H);

	const int FFT_SIZE = FFT_W * FFT_H * sizeof(Complex);

	const int KERNEL_SIZE = KERNEL_W * KERNEL_H * sizeof(Complex);

	const int DATA_SIZE = DATA_W * DATA_H * sizeof(Complex);

	Complex *h_Kernel, *h_Data, *h_ResultCPU, *h_ResultGPU;

	cudaArray *a_Kernel, *a_Data;

	cudaChannelFormatDesc float2tex = cudaCreateChannelDesc<float2> ();

	Complex *d_PaddedKernel, *d_PaddedData;

	cufftHandle FFTplan;

	//needed for gold and checking

	//Complex rCPU, rGPU;

	//double max_delta_ref, delta, ref, sum_delta2, sum_ref2, L2norm;

	//int x,y;

	int i;

	unsigned int hTimer;

	cutilCheckError(cutCreateTimer(&hTimer));

	printf("Input data size		   : %i x %i\n", DATA_W, DATA_H);

	printf("Convolution kernel size   : %i x %i\n", KERNEL_W, KERNEL_H);

	printf("Padded image size		 : %i x %i\n", DATA_W + PADDING_W, DATA_H

			+ PADDING_H);

	printf("Aligned padded image size : %i x %i\n", FFT_W, FFT_H);

	printf("Allocating memory...\n");

	h_Kernel = (Complex *) malloc(KERNEL_SIZE);

	h_Data = (Complex *) malloc(DATA_SIZE);

	h_ResultCPU = (Complex *) malloc(DATA_SIZE);

	h_ResultGPU = (Complex *) malloc(FFT_SIZE);

	cutilSafeCall(cudaMallocArray(&a_Kernel, &float2tex, KERNEL_W, KERNEL_H));

	cutilSafeCall(cudaMallocArray(&a_Data, &float2tex, DATA_W, DATA_H));

	cutilSafeCall(cudaMalloc((void **) &d_PaddedKernel, FFT_SIZE));

	cutilSafeCall(cudaMalloc((void **) &d_PaddedData, FFT_SIZE));

	for (i = 0; i < (KERNEL_W * KERNEL_H); i++) {

		h_Kernel[i].x = filter.elements[i];

		h_Kernel[i].y = 0;

	}

	for (int y = 0; y < DATA_H; y++)

		for (int x = 0; x < DATA_W; x++) {

			h_Data[y * DATA_W + x].x = image.elements[y * image.pitch + x];

			h_Data[y * DATA_W + x].y = 0;

		}

	printf("Creating FFT plan for %i x %i...\n", FFT_W, FFT_H);

	cufftSafeCall(cufftPlan2d(&FFTplan, FFT_H, FFT_W, CUFFT_C2C));

	printf(

			"Uploading to GPU and padding convolution kernel and input data...\n");

	printf("...initializing padded kernel and data storage with zeroes...\n");

	cutilSafeCall(cudaMemset(d_PaddedKernel, 0, FFT_SIZE));

	cutilSafeCall(cudaMemset(d_PaddedData, 0, FFT_SIZE));

	printf(

			"...copying input data and convolution kernel from host to CUDA arrays\n");

	cutilSafeCall(cudaMemcpyToArray(a_Kernel, 0, 0, h_Kernel, KERNEL_SIZE,

			cudaMemcpyHostToDevice));

	cutilSafeCall(cudaMemcpyToArray(a_Data, 0, 0, h_Data, DATA_SIZE,

			cudaMemcpyHostToDevice));

	printf("...binding CUDA arrays to texture references\n");

	cutilSafeCall(cudaBindTextureToArray(texKernel, a_Kernel));

	cutilSafeCall(cudaBindTextureToArray(texData, a_Data));

	//Block width should be a multiple of maximum coalesced write size

	//for coalesced memory writes in padKernel() and padData()

	dim3 threadBlock(16, 12);

	dim3 kernelBlockGrid(iDivUp(KERNEL_W, threadBlock.x), iDivUp(KERNEL_H,

			threadBlock.y));

	dim3 dataBlockGrid(iDivUp(FFT_W, threadBlock.x), iDivUp(FFT_H,

			threadBlock.y));

	printf("...padding convolution kernel\n");

	padKernel<<<kernelBlockGrid, threadBlock>>>(

			d_PaddedKernel,

			FFT_W,

			FFT_H,

			KERNEL_W,

			KERNEL_H,

			KERNEL_X,

			KERNEL_Y

	);

	cutilCheckMsg("padKernel() execution failed\n");

	printf("...padding input data array\n");

	padData<<<dataBlockGrid, threadBlock>>>(

			d_PaddedData,

			FFT_W,

			FFT_H,

			DATA_W,

			DATA_H,

			KERNEL_W,

			KERNEL_H,

			KERNEL_X,

			KERNEL_Y

	);

	cutilCheckMsg("padData() execution failed\n");

	//Not including kernel transformation into time measurement,

	//since convolution kernel is not changed very frequently

	printf("Transforming convolution kernel...\n");

	cufftSafeCall(cufftExecC2C(FFTplan, (cufftComplex *) d_PaddedKernel,

			(cufftComplex *) d_PaddedKernel, CUFFT_FORWARD));

	printf("Running GPU FFT convolution...\n");

	cutilSafeCall(cudaThreadSynchronize());

	cutilCheckError(cutResetTimer(hTimer));

	cutilCheckError(cutStartTimer(hTimer));

	cufftSafeCall(cufftExecC2C(FFTplan, (cufftComplex *) d_PaddedData,

			(cufftComplex *) d_PaddedData, CUFFT_FORWARD));

	modulateAndNormalize<<<16, 128>>>(

			d_PaddedData,

			d_PaddedKernel,

			FFT_W * FFT_H

	);

	cutilCheckMsg("modulateAndNormalize() execution failed\n");

	cufftSafeCall(cufftExecC2C(FFTplan, (cufftComplex *) d_PaddedData,

			(cufftComplex *) d_PaddedData, CUFFT_INVERSE));

	cutilSafeCall(cudaThreadSynchronize());

	cutilCheckError(cutStopTimer(hTimer));

	double gpuTime = cutGetTimerValue(hTimer);

	printf("GPU time: %f msecs. //%f MPix/s\n", gpuTime, DATA_W * DATA_H * 1e-6

			/ (gpuTime * 0.001));

	printf("Reading back GPU FFT results...\n");

	cutilSafeCall(cudaMemcpy(h_ResultGPU, d_PaddedData, FFT_SIZE,

			cudaMemcpyDeviceToHost));

	for (int y = 0; y < result.height; y++)

		for (int x = 0; x < result.width; x++)

			result.elements[y * result.pitch + x]

					= h_ResultGPU[y * DATA_W + x].x;

	printf("Shutting down...\n");

	cutilSafeCall(cudaUnbindTexture(texData));

	cutilSafeCall(cudaUnbindTexture(texKernel));

	cufftSafeCall(cufftDestroy(FFTplan));

	cutilSafeCall(cudaFree(d_PaddedData));

	cutilSafeCall(cudaFree(d_PaddedKernel));

	cutilSafeCall(cudaFreeArray(a_Data));

	cutilSafeCall(cudaFreeArray(a_Kernel));

	free(h_ResultGPU);

	free(h_ResultCPU);

	free(h_Data);

	free(h_Kernel);

	//	cudaThreadExit();

}

int iDivUp(int a, int b) {

	return (a % b != 0) ? (a / b + 1) : (a / b);

}

//Align a to nearest higher multiple of b

int iAlignUp(int a, int b) {

	return (a % b != 0) ? (a - a % b + b) : a;

}

int calculateFFTsize(int dataSize) {

	//Highest non-zero bit position of dataSize

	int hiBit;

	//Neares lower and higher powers of two numbers for dataSize

	unsigned int lowPOT, hiPOT;

	//Align data size to a multiple of half-warp

	//in order to have each line starting at properly aligned addresses

	//for coalesced global memory writes in padKernel() and padData()

	dataSize = iAlignUp(dataSize, 16);

	//Find highest non-zero bit

	for (hiBit = 31; hiBit >= 0; hiBit--)

		if (dataSize & (1U << hiBit))

			break;

	//No need to align, if already power of two

	lowPOT = 1U << hiBit;

	if (lowPOT == dataSize)

		return dataSize;

	//Align to a nearest higher power of two, if the size is small enough,

	//else align only to a nearest higher multiple of 512,

	//in order to save computation and memory bandwidth

	hiPOT = 1U << (hiBit + 1);

	if (hiPOT <= 1024)

		return hiPOT;

	else

		return iAlignUp(dataSize, 512);

}

Sorry I have found it. arhhgh.

I thought all the time that texKernel and texData ought to be some internal variable names and theres some linking problem or something, but theyre defined in the kernel.cu and since i only included a header file for the kernel.cu the ConvFFTPerformer didnt know about it.

…so nevermind.