Hi!!
I am not good at writing kernel functions…
Now I'm trying to write a kernel function for the element-wise product of a vector x and a vector y.
However, the problem is that x's type is cufftComplex, so each element of x has a real and an imaginary part,
while each element of y is only an integer.
For example, x → (3+2i),(2+4i), (10+7i)
y → 2, 5, 3
result: 6+4i, 10+20i, 30+21i
I know that it is really simple, but I get confused every time about how the thread index should be computed…
Also, the data is pretty big:
x and y each contain 1024*1024 elements.
(Additionally, I think my program dies
on checkCudaErrors(cudaMemcpy(x_w, x_t, _size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(h_w, h_t, _size, cudaMemcpyHostToDevice));
I assume that the size is too big…? is that right??? Is 1024*1024 too big?)
I attach the relevant part of my code below (it doesn't work well…),
and I've marked the 3 main problems.
If anyone wants to see the full code, please tell me!
//////////////////////////////////////////////////////////////
//////////////this part is the problem 1//////////////
// Number of elements in each signal. Defined before the kernel that uses it
// and parenthesized so it expands safely inside larger expressions.
#define SIGNAL_SIZE (1024*1024)

/**
 * In-place element-wise product of a complex vector and an integer vector:
 *
 *     a[i].x = a[i].x * b[i]   (real part)
 *     a[i].y = a[i].y * b[i]   (imaginary part)
 *
 * for every i in [0, SIGNAL_SIZE).
 *
 * @param a  Device pointer to SIGNAL_SIZE cufftComplex values; overwritten
 *           with the product.
 * @param b  Device pointer to SIGNAL_SIZE integer scale factors (read-only).
 *
 * Launch layout: any 1D/2D/3D grid and block shape is accepted. Each thread
 * linearizes its block and thread coordinates into one flat global index and
 * then walks the array with a grid-stride loop, so the result is correct
 * whether the grid has fewer, exactly as many, or more threads than elements.
 * No shared memory is used.
 */
__global__ void point_wise_product(cufftComplex *a, const int *b)
{
    // size_t arithmetic: block count * block size can exceed 32 bits.
    const size_t threadsPerBlock =
        (size_t)blockDim.x * blockDim.y * blockDim.z;
    const size_t blockId =
        blockIdx.x + (size_t)gridDim.x * (blockIdx.y + (size_t)gridDim.y * blockIdx.z);
    const size_t threadInBlock =
        threadIdx.x + (size_t)blockDim.x * (threadIdx.y + (size_t)blockDim.y * threadIdx.z);

    const size_t totalThreads =
        threadsPerBlock * gridDim.x * gridDim.y * gridDim.z;
    const size_t n = SIGNAL_SIZE;

    // The loop condition is the bounds check: threads whose first index is
    // already past the end simply do nothing.
    for (size_t i = blockId * threadsPerBlock + threadInBlock; i < n; i += totalThreads)
    {
        const float scale = (float)b[i];
        a[i].x = a[i].x * scale;
        a[i].y = a[i].y * scale;
    }
}
// Host driver: builds a random real-valued signal x(t), uploads it together
// with the integer vector h(t), and launches point_wise_product on the device.
// Regions marked with "…" / ".....fft....." are elided in the original post
// and are kept as-is.
int main()
{
    …
    //skip for creation of h_t values
    cufftComplex *x_t;
    //Allocate host memory for the x(t)
    x_t = (cufftComplex *)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
    //Allocate host memory for the result(t)
    cufftComplex *result = (cufftComplex *)malloc(_size);
    cufftComplex *final_result = (cufftComplex *)malloc(_size);
    //Initialize the memory for the signal: real part in [0, 100], imaginary part 0
    for(unsigned int i = 0; i < SIGNAL_SIZE; i++)
    {
        x_t[i].x = rand()/(float)RAND_MAX * 100;
        x_t[i].y = 0;
        //printf("host signal: %f\n", x_t[i].x);
    }
    //Allocate device memory for signal
    cufftComplex *x_w;
    checkCudaErrors(cudaMalloc((void **)&x_w, _size));
    //Allocate device memory for h.
    //h is an int vector, so its byte size is sizeof(int) * SIGNAL_SIZE, not the
    //complex-buffer size used for x.
    //NOTE(review): assumes h_t is a host array of SIGNAL_SIZE ints and that
    //_size is the byte size of the complex buffers - confirm against the
    //elided code.
    const size_t h_bytes = sizeof(int) * SIGNAL_SIZE;
    int *h_w;
    checkCudaErrors(cudaMalloc((void **)&h_w, h_bytes));
    //////////////////////////////////////////////////////////////////////////////
    //////////////**this part is the problem 2**//////////////////////////////
    //Copy host memory to device.
    //1024*1024 elements is not too big for cudaMemcpy. Copying more bytes than
    //the host buffer holds is what reads past the end of h_t, so each copy uses
    //the byte size of its own element type.
    checkCudaErrors(cudaMemcpy(x_w, x_t, _size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(h_w, h_t, h_bytes, cudaMemcpyHostToDevice));
    .....fft.....
    //////////////////////////////////////////////////////////////////////////////
    //////////////**this part is the problem 3**//////////////////////////////
    //The data is a flat vector, so use a 1D launch that covers it once: the
    //number of blocks is SIGNAL_SIZE / threadsPerBlock, rounded up. The kernel
    //must combine blockIdx and threadIdx into a global element index.
    //(A 2D grid of SIGNAL_SIZE/TILE_WIDTH blocks per axis would request about
    //SIGNAL_SIZE*SIGNAL_SIZE threads and, depending on TILE_WIDTH, can exceed
    //the 65535 limit on gridDim.y, making the launch fail.)
    const unsigned int threadsPerBlock = 256;
    const unsigned int numBlocks = (SIGNAL_SIZE + threadsPerBlock - 1) / threadsPerBlock;
    dim3 dimGrids(numBlocks);
    dim3 dimBlocks(threadsPerBlock);
    //Multiply the coefficients together
    printf("Launching ComplexPointwiseAndScale<<< >>>\n");
    point_wise_product<<<dimGrids, dimBlocks>>>(x_w, h_w);
    //A kernel launch returns no status; an invalid configuration is only
    //reported through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());
    …
}