Hi,
I am trying to achieve parallelism using MPI and CUDA.
I distribute the array elements to several processes with MPI, and each process computes the sum of its portion of the array on the GPU.
I have this kernel.cu file:
#include <stdio.h>

__global__ void add(int *devarray, int *devsum)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    devsum = devsum + devarray[index];
}

extern "C"
void run_kernel(int *array, int nelements)
{
    int *devarray, *sum, *devsum;

    cudaMalloc((void**) &devarray, sizeof(int)*nelements);
    cudaMalloc((void**) &devsum, sizeof(int));
    cudaMemcpy(devarray, array, sizeof(int)*nelements, cudaMemcpyHostToDevice);

    add<<<2, 3>>>(devarray, devsum);

    cudaMemcpy(sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d", sum);

    cudaFree(devarray);
    cudaFree(sum);
}
When I compile this program I get the warning
"variable sum is used before its value is set". What is wrong here? Can someone tell me?
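My own guess is that the warning comes from sum being an uninitialized host pointer, and that the kernel is also wrong because it reassigns the pointer devsum instead of adding into the value it points to. Is something like the following the right way to do it? This is only a sketch I have not tested yet: it uses atomicAdd (which I believe needs a card with compute capability 1.1 or higher for global int adds), keeps the result in a plain host int, and passes nelements into the kernel so the extra threads do not read past the end of the array.

#include <stdio.h>

__global__ void add(int *devarray, int nelements, int *devsum)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < nelements)                  /* guard the threads beyond the array */
        atomicAdd(devsum, devarray[index]); /* add into the value, not the pointer */
}

extern "C"
void run_kernel(int *array, int nelements)
{
    int *devarray, *devsum;
    int sum = 0;                            /* host result is a plain int now */

    cudaMalloc((void**) &devarray, sizeof(int)*nelements);
    cudaMalloc((void**) &devsum, sizeof(int));
    cudaMemcpy(devarray, array, sizeof(int)*nelements, cudaMemcpyHostToDevice);
    cudaMemcpy(devsum, &sum, sizeof(int), cudaMemcpyHostToDevice); /* zero the accumulator */

    add<<<(nelements + 255) / 256, 256>>>(devarray, nelements, devsum);

    cudaMemcpy(&sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);
    printf("partial sum = %d\n", sum);

    cudaFree(devarray);
    cudaFree(devsum);
}

I suppose run_kernel would also have to return sum (or write it into a variable supplied by the caller) so that the MPI_Reduce in the code below can actually see the partial sums, instead of only printing them.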
And my MPI code is:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <mpi.h>
#include <string.h>

#define ARRAYSIZE 2000
#define MASTER 0

int *data;

int main (int argc, char *argv[])
{
    int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;
    int mysum;
    long sum;
    char myname[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;
    double start, stop, time;
    double totaltime;
    FILE *fp;
    char line[128];
    char element;
    int n;
    int k = 0;

    /***** Initializations *****/
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf("MPI task %d has started on host %s...\n", taskid, myname);

    chunksize = (ARRAYSIZE / numtasks);
    tag2 = 1;
    tag1 = 2;
    data = malloc(ARRAYSIZE * sizeof(int));

    /***** Master task only ******/
    if (taskid == MASTER) {
        // read the integers from file
        fp = fopen("integers.dat", "r");
        if (fp != NULL) {
            sum = 0;
            while (fgets(line, sizeof line, fp) != NULL) {
                fscanf(fp, "%d", &data[k]);
                sum = sum + data[k];
                k++;
            }
        }
        printf("Initialized array sum = %d\n", sum);

        /* Send each task its portion of the array - master keeps 1st part */
        offset = chunksize;
        for (dest = 1; dest < numtasks; dest++) {
            MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
            MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);
            printf("Sent %d elements to task %d offset= %d\n", chunksize, dest, offset);
            offset = offset + chunksize;
        }

        /* Master does its part of the work */
        offset = 0;
        run_kernel(data[offset + chunksize], chunksize); // perform operation on GPU

        /* Wait to receive results from each task */
        for (i = 1; i < numtasks; i++) {
            source = i;
            MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
            MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2, MPI_COMM_WORLD, &status);
        }

        /* Get final sum and print sample results */
        MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
        printf("\n*** Final sum= %d ***\n", sum);
    }

    if (taskid > MASTER) {
        /* Receive my portion of array from the master task */
        source = MASTER;
        MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
        MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2, MPI_COMM_WORLD, &status);
        run_kernel(data[offset + chunksize], chunksize);

        /* Send my results back to the master task */
        dest = MASTER;
        MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
        MPI_Send(&data[offset], chunksize, MPI_INT, MASTER, tag2, MPI_COMM_WORLD);
        MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
    } /* end of non-master */
}
The file integers.dat contains 2000 integers. I distribute them across the processes (so with 4 processes each rank gets chunksize = 2000/4 = 500 elements), and each process then calculates the sum of its chunk on the GPU.
When I run the code I get this error:
mpirun -np 4 mpicudacomb
MPI task 0 has started on host node4
MPI task 1 has started on host node4
MPI task 3 has started on host node4
MPI task 2 has started on host node4
Initialized array sum = 9061
Sent 500 elements to task 1 offset= 500
Sent 500 elements to task 2 offset= 1000
Sent 500 elements to task 3 offset= 1500
147399408152775360
--------------------------------------------------------------------------
mpirun has exited due to process rank 1 with PID 4538 on
node node4 exiting without calling "finalize". This may
have caused other processes in the application to be
terminated by signals sent by mpirun (as reported here).
[node4:04537] *** Process received signal ***
[node4:04537] Signal: Segmentation fault (11)
[node4:04537] Signal code: Address not mapped (1)
[node4:04537] Failing at address: 0xc
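While re-reading my MPI code I also noticed a few things that might be related to the crash: run_kernel expects an int * but I pass it data[offset + chunksize], which is a single int value; mysum is never assigned before the MPI_Reduce; and I never call MPI_Finalize() at the end of main. Is the worker part supposed to look roughly like this instead? (Just a sketch of my guess, not tested; it assumes run_kernel is changed to return its partial sum as an int, and that sum is declared as an int so it matches MPI_INT in the reduce.)

/* Receive my portion of array from the master task */
source = MASTER;
MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2, MPI_COMM_WORLD, &status);

/* pass a pointer to the start of my chunk and keep the partial sum */
mysum = run_kernel(&data[offset], chunksize);

/* Send my results back to the master task */
dest = MASTER;
MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);
MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);

I imagine the master would similarly call mysum = run_kernel(&data[0], chunksize); for its own chunk, and every rank (master included) would call MPI_Finalize(); before returning from main. Does that sound right?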
Can anyone help me with this?
Thanks