Hello people!
Little bit of background first: I have to optimize the speed of an SPH (= Smoothed Particle Hydrodynamics) Code. So, I want to parallelize the calculation of the hydrodynamic forces for every particle and instead of having loops that are iterating over every particle, we had the idea to put that on the GPU.
After some time, I kind of implemented the GPU parallelization with CUDA, but now my program is just crashing without saying what went wrong. I just know, that it is crashing, when it comes to copying the data to the device.
In order to analyze the problem, I created a small example (with the structure of my SPH-Code), that is crashing too, when I augment the size of N …
Here it is:
It includes the following headers:
proto.h (for the functions):
extern void main_GPU(int N, int Ntask);
allvars.h (data structure on the host):
struct particle_data
{
float a;
float b;
}
* P;
struct sph_particle_data
{
float acc;
float vel;
}
* SphP;
Allvars_gpu.h (data structure on the device):
__device__ struct d_particle_data
{
float a;
float b;
}
* d_P;
__device__ struct d_sph_particle_data
{
float acc;
float vel;
}
* d_SphP;
main.cpp (from where the .cu-file is called):
#include <stdio.h>
#include <cuda_runtime.h>
extern "C" {
#include "proto.h"
}
int main(void)
{
int N_gas = 100000000000; // Number of particles
int NTask = 1; // Number of CPUs (Code has MPI-stuff included)
main_GPU(N_gas, NTask);
return 0;
}
and finally, hydro_gpu.cu (with the computation):
#include <cuda_runtime.h>
#include <stdio.h>
extern "C"
{
#include "Allvars_gpu.h"
#include "allvars.h"
#include "proto.h"
}
__device__ void hydro_evaluate(int target, int mode, struct d_particle_data *P, struct d_sph_particle_data *SphP)
{
int c = 5;
float a,b;
float acc, vel;
a = P[target].a;
b = P[target].b;
P[target].a = a+c;
P[target].b = b+c;
acc = SphP[target].acc;
vel = SphP[target].vel;
SphP[target].acc = acc * c;
SphP[target].vel = vel * c;
}
__global__ void hydro_particle_gpu(struct d_particle_data *P, struct d_sph_particle_data *SphP)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
hydro_evaluate(i, 0, P, SphP);
}
void hydro_particle_cpu(struct particle_data *P, int N) {
int i;
float a,b;
float acc, vel;
float c = 5;
for(i=0; i < N; i++) {
a = P[i].a;
b = P[i].b;
P[i].a = a+c;
P[i].b = b+c;
acc = SphP[i].acc;
vel = SphP[i].vel;
SphP[i].acc = acc * c;
SphP[i].vel = vel * c;
}
}
void main_GPU(int N, int Ntask)
{
int Blocks;
cudaMalloc((void**)&d_P, N * sizeof(d_particle_data));
cudaMalloc((void**)&d_SphP, N * sizeof(d_sph_particle_data));
P = (struct particle_data*)malloc(N * sizeof(d_particle_data));
SphP = (struct sph_particle_data*)malloc(N * sizeof(d_sph_particle_data));
// produce some initial test data for the array of N elements
for(int i = 0; i < N; ++i)
{
P[i].a = i;
P[i].b = i + 1;
SphP[i].acc = i * i;
SphP[i].vel = 2 * i;
}
cudaMemcpy(d_P, P, N * sizeof(d_particle_data), cudaMemcpyHostToDevice);
cudaMemcpy(d_SphP, SphP, N * sizeof(d_sph_particle_data), cudaMemcpyHostToDevice);
Blocks = (N + N - 1) / N;
hydro_particle_gpu<<<Blocks,N>>>(d_P, d_SphP);
cudaMemcpy(P, d_P, N * sizeof(d_particle_data), cudaMemcpyDeviceToHost);
cudaMemcpy(SphP, d_SphP, N * sizeof(d_sph_particle_data), cudaMemcpyDeviceToHost);
for(int i = 0; i < N; ++i)
{
printf("P[%d].a = %f, P[%d].b = %f\n", i, P[i].a, i, P[i].b);
printf("SphP[%d].acc = %f, SphP[%d].vel = %f\n", i, SphP[i].acc, i, SphP[i].vel);
}
free(P);
free(SphP);
cudaFree(d_P);
cudaFree(d_SphP);
}
Is there a way to optimize it, that it doesn’t keep crashing anymore? :(
For information: I’m working with a Nvidia Quadro 2000D.
Thanks for any kind of help!! :)
Cheerio!