a problem complex array add with cuda ?????

xuanyz · August 16, 2017, 11:04pm

I created a my complex Structures and writed the codes about complex array add with cuda,but the codes can not work well , can anyone help me?
the codes are as followed:

#include <math.h>
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>

//-----------------------------about the complex number------------------------------------------------------------
typedef struct FCOMPLEX {
float r;
float i;} fcomplex;

host device
fcomplex Complex(float re, float im)
{
fcomplex c;
c.r=re;
c.i=im;
return c;
}

host device
fcomplex Cadd(fcomplex a, fcomplex b)
{
fcomplex c;
c.r=a.r+b.r;
c.i=a.i+b.i;
return c;
}

//---------------------------------------------------------------------------------------------------

cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size);

global void addKernel(fcomplex *c, const fcomplex *a, const fcomplex *b)
{
int i = threadIdx.x;
c[i]=Cadd( a[i], b[i]);
}

int main()
{
const int arraySize = 5;
const fcomplex a[arraySize] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5)};
const fcomplex b[arraySize] = { Complex(1, 1), Complex(2, 2), Complex(3, 3), Complex(4, 4), Complex(5, 5) };
fcomplex c[arraySize] = { Complex(1, 1),Complex(1, 1) ,Complex(1, 1) ,Complex(1, 1) ,Complex(1, 1) };

// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addWithCuda failed!");
    return 1;
}

printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.r = {%f,%f,%f,%f,%f}\n",
    c[0].r, c[1].r, c[2].r, c[3].r, c[4].r);
printf("{ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)} + (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)}.i = {%f,%f,%f,%f,%f}\n",
    c[0].i, c[1].i, c[2].i, c[3].i, c[4].i);

// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceReset failed!");
    return 1;
}

return 0;

}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(fcomplex *c, const fcomplex *a, const fcomplex *b, unsigned int size)
{
fcomplex *dev_a ;
fcomplex *dev_b ;
fcomplex *dev_c ;
cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}

// Allocate GPU buffers for three vectors (two input, one output)    .
cudaStatus = cudaMalloc((void**)&dev_c->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_c->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

 cudaStatus = cudaMalloc((void**)&dev_a->r, size * sizeof(float));
 cudaStatus = cudaMalloc((void**)&dev_a->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_b->r, size * sizeof(float));
cudaStatus = cudaMalloc((void**)&dev_b->i, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(&dev_a->r, &a->r, size * sizeof(float), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(&dev_a->i, &a->i, size * sizeof(float), cudaMemcpyHostToDevice);

if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(&dev_b->r, &b->r, size * sizeof(float), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(&dev_b->i, &b->i, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}

// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(&c->r, &dev_c->r, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaStatus = cudaMemcpy(&c->i, &dev_c->i, size * sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);

return cudaStatus;

}