Hello,
I am trying to implement the SDK convolutionFFT2D into a project i am working on to check its performance in comparison to my naive convolution kernel. I created a “ConvFFT2DPerformer.cu” file with a header that does exactly the same as convolutionFFT2D.cu as (except of the gold computation and random data filling) and included the
kernel.cu, but when i try to make it I get:
ConvFFT2DPerformer.cu(125): error: identifier "texKernel" is undefined
ConvFFT2DPerformer.cu(125): error: no instance of overloaded function "cudaBindTextureToArray" matches the argument list
argument types are: (<error-type>, cudaArray *)
ConvFFT2DPerformer.cu(126): error: identifier "texData" is undefined
ConvFFT2DPerformer.cu(126): error: no instance of overloaded function "cudaBindTextureToArray" matches the argument list
argument types are: (<error-type>, cudaArray *)
I checked the Makefile of both mine and the sdk project and I also checked the code of the sdk project with what ConvFFT2DPerformer does, and I cant find a difference, but the convolutionFFT2D sdk project compiles just fine.
the code is at the bottom. (the licensing part is also in the code files, but I thought it shouldnt be posted here for length of post issues…)
Thanks in advance and I hope its just a misunderstanding from my side.
ConvFFT2DPerformer.cuh:
#ifndef CONVFFT2DPERFORMER_CUH_
#define CONVFFT2DPERFORMER_CUH_
#include "Matrixd.cuh"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cufft.h>
#include <cutil_inline.h>
#include "Complex.h"
#include "convolutionFFT2D_kernel.cuh"
void fft2dConvolve(Matrixd &m, Matrixd &filter, Matrixd &result);
int iDivUp(int, int);
int iAlignUp(int, int);
int calculateFFTsize(int);
#endif
ConvFFT2DPerformer.cu:
#include "ConvFFT2DPerformer.cuh"
void fft2dConvolve(Matrixd &image, Matrixd &filter, Matrixd &result) {
//Kernel dimensions
const int KERNEL_W = 5;
const int KERNEL_H = 5;
//Kernel center position
const int KERNEL_X = 2;
const int KERNEL_Y = 2;
//Width and height of padding for "clamp to border" addressing mode
const int PADDING_W = KERNEL_W - 1;
const int PADDING_H = KERNEL_H - 1;
//Input data dimension
const int DATA_W = image.width;
const int DATA_H = image.height;
//Derive FFT size from data and kernel dimensions
const int FFT_W = calculateFFTsize(DATA_W + PADDING_W);
const int FFT_H = calculateFFTsize(DATA_H + PADDING_H);
const int FFT_SIZE = FFT_W * FFT_H * sizeof(Complex);
const int KERNEL_SIZE = KERNEL_W * KERNEL_H * sizeof(Complex);
const int DATA_SIZE = DATA_W * DATA_H * sizeof(Complex);
Complex *h_Kernel, *h_Data, *h_ResultCPU, *h_ResultGPU;
cudaArray *a_Kernel, *a_Data;
cudaChannelFormatDesc float2tex = cudaCreateChannelDesc<float2> ();
Complex *d_PaddedKernel, *d_PaddedData;
cufftHandle FFTplan;
//needed for gold and checking
//Complex rCPU, rGPU;
//double max_delta_ref, delta, ref, sum_delta2, sum_ref2, L2norm;
//int x,y;
int i;
unsigned int hTimer;
cutilCheckError(cutCreateTimer(&hTimer));
printf("Input data size : %i x %i\n", DATA_W, DATA_H);
printf("Convolution kernel size : %i x %i\n", KERNEL_W, KERNEL_H);
printf("Padded image size : %i x %i\n", DATA_W + PADDING_W, DATA_H
+ PADDING_H);
printf("Aligned padded image size : %i x %i\n", FFT_W, FFT_H);
printf("Allocating memory...\n");
h_Kernel = (Complex *) malloc(KERNEL_SIZE);
h_Data = (Complex *) malloc(DATA_SIZE);
h_ResultCPU = (Complex *) malloc(DATA_SIZE);
h_ResultGPU = (Complex *) malloc(FFT_SIZE);
cutilSafeCall(cudaMallocArray(&a_Kernel, &float2tex, KERNEL_W, KERNEL_H));
cutilSafeCall(cudaMallocArray(&a_Data, &float2tex, DATA_W, DATA_H));
cutilSafeCall(cudaMalloc((void **) &d_PaddedKernel, FFT_SIZE));
cutilSafeCall(cudaMalloc((void **) &d_PaddedData, FFT_SIZE));
for (i = 0; i < (KERNEL_W * KERNEL_H); i++) {
h_Kernel[i].x = filter.elements[i];
h_Kernel[i].y = 0;
}
for (int y = 0; y < DATA_H; y++)
for (int x = 0; x < DATA_W; x++) {
h_Data[y * DATA_W + x].x = image.elements[y * image.pitch + x];
h_Data[y * DATA_W + x].y = 0;
}
printf("Creating FFT plan for %i x %i...\n", FFT_W, FFT_H);
cufftSafeCall(cufftPlan2d(&FFTplan, FFT_H, FFT_W, CUFFT_C2C));
printf(
"Uploading to GPU and padding convolution kernel and input data...\n");
printf("...initializing padded kernel and data storage with zeroes...\n");
cutilSafeCall(cudaMemset(d_PaddedKernel, 0, FFT_SIZE));
cutilSafeCall(cudaMemset(d_PaddedData, 0, FFT_SIZE));
printf(
"...copying input data and convolution kernel from host to CUDA arrays\n");
cutilSafeCall(cudaMemcpyToArray(a_Kernel, 0, 0, h_Kernel, KERNEL_SIZE,
cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpyToArray(a_Data, 0, 0, h_Data, DATA_SIZE,
cudaMemcpyHostToDevice));
printf("...binding CUDA arrays to texture references\n");
cutilSafeCall(cudaBindTextureToArray(texKernel, a_Kernel));
cutilSafeCall(cudaBindTextureToArray(texData, a_Data));
//Block width should be a multiple of maximum coalesced write size
//for coalesced memory writes in padKernel() and padData()
dim3 threadBlock(16, 12);
dim3 kernelBlockGrid(iDivUp(KERNEL_W, threadBlock.x), iDivUp(KERNEL_H,
threadBlock.y));
dim3 dataBlockGrid(iDivUp(FFT_W, threadBlock.x), iDivUp(FFT_H,
threadBlock.y));
printf("...padding convolution kernel\n");
padKernel<<<kernelBlockGrid, threadBlock>>>(
d_PaddedKernel,
FFT_W,
FFT_H,
KERNEL_W,
KERNEL_H,
KERNEL_X,
KERNEL_Y
);
cutilCheckMsg("padKernel() execution failed\n");
printf("...padding input data array\n");
padData<<<dataBlockGrid, threadBlock>>>(
d_PaddedData,
FFT_W,
FFT_H,
DATA_W,
DATA_H,
KERNEL_W,
KERNEL_H,
KERNEL_X,
KERNEL_Y
);
cutilCheckMsg("padData() execution failed\n");
//Not including kernel transformation into time measurement,
//since convolution kernel is not changed very frequently
printf("Transforming convolution kernel...\n");
cufftSafeCall(cufftExecC2C(FFTplan, (cufftComplex *) d_PaddedKernel,
(cufftComplex *) d_PaddedKernel, CUFFT_FORWARD));
printf("Running GPU FFT convolution...\n");
cutilSafeCall(cudaThreadSynchronize());
cutilCheckError(cutResetTimer(hTimer));
cutilCheckError(cutStartTimer(hTimer));
cufftSafeCall(cufftExecC2C(FFTplan, (cufftComplex *) d_PaddedData,
(cufftComplex *) d_PaddedData, CUFFT_FORWARD));
modulateAndNormalize<<<16, 128>>>(
d_PaddedData,
d_PaddedKernel,
FFT_W * FFT_H
);
cutilCheckMsg("modulateAndNormalize() execution failed\n");
cufftSafeCall(cufftExecC2C(FFTplan, (cufftComplex *) d_PaddedData,
(cufftComplex *) d_PaddedData, CUFFT_INVERSE));
cutilSafeCall(cudaThreadSynchronize());
cutilCheckError(cutStopTimer(hTimer));
double gpuTime = cutGetTimerValue(hTimer);
printf("GPU time: %f msecs. //%f MPix/s\n", gpuTime, DATA_W * DATA_H * 1e-6
/ (gpuTime * 0.001));
printf("Reading back GPU FFT results...\n");
cutilSafeCall(cudaMemcpy(h_ResultGPU, d_PaddedData, FFT_SIZE,
cudaMemcpyDeviceToHost));
for (int y = 0; y < result.height; y++)
for (int x = 0; x < result.width; x++)
result.elements[y * result.pitch + x]
= h_ResultGPU[y * DATA_W + x].x;
printf("Shutting down...\n");
cutilSafeCall(cudaUnbindTexture(texData));
cutilSafeCall(cudaUnbindTexture(texKernel));
cufftSafeCall(cufftDestroy(FFTplan));
cutilSafeCall(cudaFree(d_PaddedData));
cutilSafeCall(cudaFree(d_PaddedKernel));
cutilSafeCall(cudaFreeArray(a_Data));
cutilSafeCall(cudaFreeArray(a_Kernel));
free(h_ResultGPU);
free(h_ResultCPU);
free(h_Data);
free(h_Kernel);
// cudaThreadExit();
}
int iDivUp(int a, int b) {
return (a % b != 0) ? (a / b + 1) : (a / b);
}
//Align a to nearest higher multiple of b
int iAlignUp(int a, int b) {
return (a % b != 0) ? (a - a % b + b) : a;
}
int calculateFFTsize(int dataSize) {
//Highest non-zero bit position of dataSize
int hiBit;
//Neares lower and higher powers of two numbers for dataSize
unsigned int lowPOT, hiPOT;
//Align data size to a multiple of half-warp
//in order to have each line starting at properly aligned addresses
//for coalesced global memory writes in padKernel() and padData()
dataSize = iAlignUp(dataSize, 16);
//Find highest non-zero bit
for (hiBit = 31; hiBit >= 0; hiBit--)
if (dataSize & (1U << hiBit))
break;
//No need to align, if already power of two
lowPOT = 1U << hiBit;
if (lowPOT == dataSize)
return dataSize;
//Align to a nearest higher power of two, if the size is small enough,
//else align only to a nearest higher multiple of 512,
//in order to save computation and memory bandwidth
hiPOT = 1U << (hiBit + 1);
if (hiPOT <= 1024)
return hiPOT;
else
return iAlignUp(dataSize, 512);
}