cudaMemcpy don't work

QuangAnh94 · July 3, 2015, 10:07am

Hi all,
I’m newbie in programming CUDA.
The first programming in CUDA, i write same as video: " CUDACast #2 - Your First CUDA C Program " in youtube
But my result is not correct. Please help me to fix it.
Sorry because my english not good.
My code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

using namespace std;
#define SIZE 1024

__global__ void VectorAdd(int *a, int *b, int *c,int n) {
	
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	if(i<n) c[i] = a[i] + b[i];
	
}

int main(void) {
int *a, *b, *c, *d; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size;
size=SIZE*sizeof(int);
// Alloc space for host copies of a, b, c and setup input values
a = (int *) malloc(size); 
b = (int *) malloc(size); 
c = (int *) malloc(size);
d = (int *) malloc(size);
for(int i=0;i<SIZE;++i)
{
	a[i]=i;
	b[i]=i;
	c[i]=5;
}
// Alloc space for device copies of a, b, c
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);

// Launch add() kernel on GPU with N blocks
VectorAdd <<< 1, 1 >>>(d_a, d_b, d_c, SIZE);
// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
for(int i=0;i<10;++i){printf("c[%d] = %d\n",i,c[i]);}
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
system("PAUSE");
return 0;
}

all of my result equal value 5.
Please help me, thanks.

little_jimmy · July 3, 2015, 11:11am

VectorAdd <<< 32, 32 >>>(d_a, d_b, d_c, SIZE);
VectorAdd <<< 16, 64 >>>(d_a, d_b, d_c, SIZE);
VectorAdd <<< 8, 128 >>>(d_a, d_b, d_c, SIZE);
VectorAdd <<< a, b >>>(d_a, d_b, d_c, SIZE);

a * b >= size

QuangAnh94 · July 3, 2015, 11:29am

I fix

VectorAdd <<< 1, 1 >>>(d_a, d_b, d_c, SIZE);

to

VectorAdd <<< 32, 32 >>>(d_a, d_b, d_c, SIZE);

from you but result doesn’t change

http://i.imgur.com/9dz46st.png

little_jimmy · July 3, 2015, 12:15pm

VectorAdd <<< x, y >>>(d_a, d_b, d_c, SIZE);

cudaDeviceSynchronize();

// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

if that does not solve the matter, either use error checking and/ or a breakpoint (on the 1st kernel line) and the debugger to verify that the kernel indeed runs

QuangAnh94 · July 3, 2015, 1:24pm

thank you so much :D, I think CUDA doesn’t support my Win 7 x86
I will reinstall win x64