Hi, I have MPI and CUDA code which calculates the sum of array elements using GPUs.
I also calculate the sum using CPU to compare the timing results.
My MPI code
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define ARRAYSIZE 200000000
#define MASTER 0
int *data;
int main(int argc, char* argv[])
{
int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;
long mysum, sum;
int update(int myoffset, int chunk, int myid);
char myname[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
double start = 0.0, stop = 0.0, time = 0.0;
double totaltime;
FILE *fp;
char line[128];
char element;
int n;
int k=0;
double start_file_read = 0.0, file_read_time=0.0;
double final_time = 0.0;
/***** Initializations *****/
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Get_processor_name(myname, &namelen);
printf ("MPI task %d has started on host %s...\n", taskid, myname);
chunksize = (ARRAYSIZE / numtasks);
tag2 = 1;
tag1 = 2;
data = malloc(ARRAYSIZE * sizeof(int));
/***** Master task only ******/
if (taskid == MASTER){
start_file_read = MPI_Wtime();
fp=fopen("integers.txt", "r");
if(fp != NULL){
printf("Master task is reading the data file.....\n");
sum = 0;
while(fgets(line, sizeof line, fp)!= NULL){
fscanf(fp,"%d",&data[k]);
k++;
}
}
/* Send each task its portion of the array - master keeps 1st part */
offset = chunksize;
for (dest=1; dest<numtasks; dest++) {
MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);
printf("Sent %d elements to task %d offset= %d\n",chunksize,dest,offset);
offset = offset + chunksize;
}
/* Master does its part of the work */
offset = 0;
mysum = run_kernel(&data[offset], chunksize, taskid);
} /* end of master section */
if (taskid > MASTER) {
/* Receive my portion of array from the master task */
// start= MPI_Wtime();
source = MASTER;
MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, &status);
mysum = run_kernel(&data[offset], chunksize, taskid);
/* end of non-master */
}
MPI_Finalize();
}
My CUDA code:
#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>
#define BLOCK_NUM 32
#define THREAD_NUM 512
__global__ static void sumOfSquares(int * num, int * result, clock_t * time,int DATA_SIZE)
{
extern __shared__ int shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
//if (tid == 0) time[bid] = clock();
shared[tid] = 0;
for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
shared[tid] += num[i];
}
__syncthreads();
int offset = THREAD_NUM / 2;
while (offset > 0) {
if (tid < offset) {
shared[tid] += shared[tid + offset];
}
offset >>= 1;
__syncthreads();
}
if (tid == 0) {
result[bid] = shared[0];
//time[bid + BLOCK_NUM] = clock();
}
}
extern "C"
int run_kernel(int array[],int nelements, int taskid)
{
int * gpudata, * result, i;
clock_t * time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMalloc((void **) &gpudata, sizeof(int) * nelements);
cudaMalloc((void **) &result, sizeof(int) * THREAD_NUM * BLOCK_NUM);
cudaMalloc((void **) &time, sizeof(clock_t) * BLOCK_NUM * 2);
cudaMemcpy(gpudata, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);
printf("\n MPI Task %d is executing Kernel function........", taskid);
int sum[BLOCK_NUM];
sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time, nelements);
cudaMemcpy(&sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
//calculate final sum from each block.
int final_sum = 0;
for (int i = 0; i < BLOCK_NUM; i++) {
final_sum += sum[i];
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaFree(gpudata);
cudaFree(result);
printf(" Task %d has sum (on GPU): %d Time for the kernel: %f ms \n", taskid, final_sum, elapsedTime);
printf("Task %d is calculating sum of %d elements using CPU......\n", taskid, nelements);
timespec time1, time2, temp_time;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
long cpu_sum = 0;
for (i = 0; i < nelements; i++) {
cpu_sum += array[i];
}
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
temp_time.tv_sec = time2.tv_sec - time1.tv_sec;
printf("Task %d calculated sum: %d using CPU in %lf ms \n",taskid, cpu_sum, temp_time.tv_sec);
return final_sum;
}
The timing results are always equal. Say to calculate GPU time 100 ms and same for the CPU time.
Anyone can help what is going wrong.
Thanks