GPU and CPU time comparison

Hi, I have an MPI + CUDA program that calculates the sum of the elements of an array on the GPU. The master task reads the integers from a file and sends each of the other tasks a contiguous chunk of the array.

Each task also calculates the sum of its chunk on the CPU, so that I can compare the GPU and CPU timing results.

My MPI code:

#include "mpi.h"

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#define  ARRAYSIZE	200000000

#define  MASTER		0

int  *data;

int main(int argc, char* argv[])

{

int   numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen; 

long mysum, sum;

int update(int myoffset, int chunk, int myid);

char myname[MPI_MAX_PROCESSOR_NAME];

MPI_Status status;

double start = 0.0, stop = 0.0, time = 0.0;

double totaltime;

FILE *fp;

char line[128];

char element;

int n;

int k=0;

double start_file_read = 0.0, file_read_time=0.0;

double final_time = 0.0;

/***** Initializations *****/

MPI_Init(&argc, &argv);

MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

MPI_Comm_rank(MPI_COMM_WORLD,&taskid); 

MPI_Get_processor_name(myname, &namelen);

printf ("MPI task %d has started on host %s...\n", taskid, myname);

chunksize = (ARRAYSIZE / numtasks);

tag2 = 1;

tag1 = 2;

data = malloc(ARRAYSIZE * sizeof(int));

/***** Master task only ******/

if (taskid == MASTER){

start_file_read = MPI_Wtime();

  fp=fopen("integers.txt", "r");

  if(fp != NULL){

   printf("Master task is reading the data file.....\n");

   sum = 0;

   while(fgets(line, sizeof line, fp)!= NULL){

    fscanf(fp,"%d",&data[k]);

    k++;

   }

  }

/* Send each task its portion of the array - master keeps 1st part */

offset = chunksize;

for (dest=1; dest<numtasks; dest++) {

MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);

MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);

printf("Sent %d elements to task %d offset= %d\n",chunksize,dest,offset);

offset = offset + chunksize;

}

/* Master does its part of the work */

offset = 0;

  mysum = run_kernel(&data[offset], chunksize, taskid);

 }  /* end of master section */

if (taskid > MASTER) {

/* Receive my portion of array from the master task */

 // start= MPI_Wtime();

source = MASTER;

MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);

MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, &status);

  mysum = run_kernel(&data[offset], chunksize, taskid);

 /* end of non-master */

 }

MPI_Finalize();

}
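
Just to be explicit about how the work is split: with ARRAYSIZE = 200000000 and, for example, 4 MPI tasks, chunksize is 50000000, so the master keeps elements 0 to 49999999, task 1 receives the chunk starting at offset 50000000, task 2 the chunk starting at 100000000, and so on. Each task then hands its chunk to run_kernel.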

My CUDA code:

#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#define BLOCK_NUM  32
#define THREAD_NUM 512

/* despite its name, this kernel sums the elements (no squaring):
   each block accumulates a grid-stride slice into shared memory,
   then does a tree reduction and writes one partial sum per block */
__global__ static void sumOfSquares(int *num, int *result, clock_t *time, int DATA_SIZE)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    //if (tid == 0) time[bid] = clock();
    shared[tid] = 0;
    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i];
    }
    __syncthreads();

    int offset = THREAD_NUM / 2;
    while (offset > 0) {
        if (tid < offset) {
            shared[tid] += shared[tid + offset];
        }
        offset >>= 1;
        __syncthreads();
    }

    if (tid == 0) {
        result[bid] = shared[0];
        //time[bid + BLOCK_NUM] = clock();
    }
}

extern "C"
int run_kernel(int array[], int nelements, int taskid)
{
    int *gpudata, *result, i;
    clock_t *time;

    /* note: the events bracket the allocations, both copies, the kernel
       launch and the host-side reduction, not just the kernel itself */
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cudaMalloc((void **)&gpudata, sizeof(int) * nelements);
    cudaMalloc((void **)&result, sizeof(int) * BLOCK_NUM);      /* one partial sum per block */
    cudaMalloc((void **)&time, sizeof(clock_t) * BLOCK_NUM * 2);
    cudaMemcpy(gpudata, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);

    printf("\n MPI Task %d is executing Kernel function........", taskid);

    int sum[BLOCK_NUM];
    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time, nelements);
    cudaMemcpy(sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);

    /* calculate the final sum from each block's partial result */
    int final_sum = 0;
    for (int i = 0; i < BLOCK_NUM; i++) {
        final_sum += sum[i];
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    printf(" Task %d has sum (on GPU): %d Time for the kernel: %f ms \n", taskid, final_sum, elapsedTime);

    /* CPU reference sum of the same chunk */
    printf("Task %d is calculating sum of %d elements using CPU......\n", taskid, nelements);
    timespec time1, time2, temp_time;
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
    long cpu_sum = 0;
    for (i = 0; i < nelements; i++) {
        cpu_sum += array[i];
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
    temp_time.tv_sec = time2.tv_sec - time1.tv_sec;
    printf("Task %d calculated sum: %d using CPU in %lf ms \n", taskid, cpu_sum, temp_time.tv_sec);

    return final_sum;
}
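
(Side note: the Thrust headers at the top of the .cu file are not actually used anywhere in run_kernel. For what it's worth, a thrust::reduce version of the same chunk sum would look roughly like the sketch below; thrust_sum is just a name I made up here, and it is not the code that produced the timings I describe next.)

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

// sketch only: sum one chunk with thrust::reduce instead of the hand-written kernel
extern "C" int thrust_sum(const int *array, int nelements)
{
    // copy the host chunk to the device
    thrust::device_vector<int> d(array, array + nelements);
    // reduce on the device and return the total
    return thrust::reduce(d.begin(), d.end(), 0, thrust::plus<int>());
}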

The problem is that the two timing results always come out identical: if the GPU time is reported as, say, 100 ms, the CPU time is reported as exactly the same value.
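
In case the way I convert the timespec values is part of the problem: my understanding is that the elapsed time in milliseconds should be derived from both the tv_sec and tv_nsec fields of the two clock_gettime() samples, roughly like the helper below (a minimal sketch; elapsed_ms is just a name I made up, it is not in my code above).

#include <time.h>

/* sketch: elapsed milliseconds between two clock_gettime() samples */
static double elapsed_ms(struct timespec start, struct timespec end)
{
    double sec  = (double)(end.tv_sec  - start.tv_sec);
    double nsec = (double)(end.tv_nsec - start.tv_nsec);
    return sec * 1000.0 + nsec / 1.0e6;   /* 1 ms = 1e6 ns */
}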

Can anyone help me figure out what is going wrong?

Thanks