GPU and CPU time comparison
Hi, I have MPI and CUDA code which calculates the sum of array elements using GPUs.

I also calculate the sum using CPU to compare the timing results.

My MPI code

[code]
#include "mpi.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ARRAYSIZE 200000000
#define MASTER 0



int *data;

int main(int argc, char* argv[])
{

int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;

long mysum, sum;

int update(int myoffset, int chunk, int myid);
char myname[MPI_MAX_PROCESSOR_NAME];

MPI_Status status;
double start = 0.0, stop = 0.0, time = 0.0;
double totaltime;
FILE *fp;
char line[128];
char element;
int n;
int k=0;
double start_file_read = 0.0, file_read_time=0.0;
double final_time = 0.0;

/***** Initializations *****/

MPI_Init(&argc, &argv);

MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Get_processor_name(myname, &namelen);

printf ("MPI task %d has started on host %s...\n", taskid, myname);

chunksize = (ARRAYSIZE / numtasks);

tag2 = 1;

tag1 = 2;
data = malloc(ARRAYSIZE * sizeof(int));

/***** Master task only ******/

if (taskid == MASTER){

start_file_read = MPI_Wtime();
fp=fopen("integers.txt", "r");
if(fp != NULL){
printf("Master task is reading the data file.....\n");
sum = 0;
while(fgets(line, sizeof line, fp)!= NULL){
fscanf(fp,"%d",&data[k]);
k++;
}
}

/* Send each task its portion of the array - master keeps 1st part */

offset = chunksize;

for (dest=1; dest<numtasks; dest++) {

MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);

MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);

printf("Sent %d elements to task %d offset= %d\n",chunksize,dest,offset);

offset = offset + chunksize;

}

/* Master does its part of the work */

offset = 0;
mysum = run_kernel(&data[offset], chunksize, taskid);
} /* end of master section */

if (taskid > MASTER) {



/* Receive my portion of array from the master task */
// start= MPI_Wtime();

source = MASTER;

MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);

MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, &status);
mysum = run_kernel(&data[offset], chunksize, taskid);
/* end of non-master */
}



MPI_Finalize();



}
[/code]

My CUDA code:
[code]
#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>



#include <thrust/version.h>

#include <thrust/generate.h>

#include <thrust/host_vector.h>

#include <thrust/device_vector.h>

#include <thrust/functional.h>

#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>


#define BLOCK_NUM 32

#define THREAD_NUM 512


__global__ static void sumOfSquares(int * num, int * result, clock_t * time,int DATA_SIZE)

{

extern __shared__ int shared[];

const int tid = threadIdx.x;

const int bid = blockIdx.x;



//if (tid == 0) time[bid] = clock();



shared[tid] = 0;

for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {

shared[tid] += num[i];

}



__syncthreads();

int offset = THREAD_NUM / 2;

while (offset > 0) {

if (tid < offset) {

shared[tid] += shared[tid + offset];

}

offset >>= 1;

__syncthreads();

}



if (tid == 0) {

result[bid] = shared[0];

//time[bid + BLOCK_NUM] = clock();

}

}

extern "C"
int run_kernel(int array[],int nelements, int taskid)
{

int * gpudata, * result, i;

clock_t * time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);


cudaMalloc((void **) &gpudata, sizeof(int) * nelements);

cudaMalloc((void **) &result, sizeof(int) * THREAD_NUM * BLOCK_NUM);

cudaMalloc((void **) &time, sizeof(clock_t) * BLOCK_NUM * 2);

cudaMemcpy(gpudata, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);

printf("\n MPI Task %d is executing Kernel function........", taskid);



int sum[BLOCK_NUM];


sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time, nelements);


cudaMemcpy(&sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);

//calculate final sum from each block.

int final_sum = 0;

for (int i = 0; i < BLOCK_NUM; i++) {

final_sum += sum[i];

}

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);

cudaFree(gpudata);

cudaFree(result);





printf(" Task %d has sum (on GPU): %d Time for the kernel: %f ms \n", taskid, final_sum, elapsedTime);

printf("Task %d is calculating sum of %d elements using CPU......\n", taskid, nelements);

timespec time1, time2, temp_time;

clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
long cpu_sum = 0;


for (i = 0; i < nelements; i++) {

cpu_sum += array[i];

}
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
temp_time.tv_sec = time2.tv_sec - time1.tv_sec;
printf("Task %d calculated sum: %d using CPU in %lf ms \n",taskid, cpu_sum, temp_time.tv_sec);
return final_sum;

}

[/code]

The timing results are always equal. Say to calculate GPU time 100 ms and same for the CPU time.

Anyone can help what is going wrong.

Thanks
Hi, I have MPI and CUDA code which calculates the sum of array elements using GPUs.



I also calculate the sum using CPU to compare the timing results.



My MPI code





#include "mpi.h"



#include <stdio.h>

#include <stdlib.h>

#include <string.h>



#define ARRAYSIZE 200000000

#define MASTER 0







int *data;



int main(int argc, char* argv[])

{



int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;



long mysum, sum;



int update(int myoffset, int chunk, int myid);

char myname[MPI_MAX_PROCESSOR_NAME];



MPI_Status status;

double start = 0.0, stop = 0.0, time = 0.0;

double totaltime;

FILE *fp;

char line[128];

char element;

int n;

int k=0;

double start_file_read = 0.0, file_read_time=0.0;

double final_time = 0.0;



/***** Initializations *****/



MPI_Init(&argc, &argv);



MPI_Comm_size(MPI_COMM_WORLD, &numtasks);



MPI_Comm_rank(MPI_COMM_WORLD,&taskid);

MPI_Get_processor_name(myname, &namelen);



printf ("MPI task %d has started on host %s...\n", taskid, myname);



chunksize = (ARRAYSIZE / numtasks);



tag2 = 1;



tag1 = 2;

data = malloc(ARRAYSIZE * sizeof(int));



/***** Master task only ******/



if (taskid == MASTER){



start_file_read = MPI_Wtime();

fp=fopen("integers.txt", "r");

if(fp != NULL){

printf("Master task is reading the data file.....\n");

sum = 0;

while(fgets(line, sizeof line, fp)!= NULL){

fscanf(fp,"%d",&data[k]);

k++;

}

}



/* Send each task its portion of the array - master keeps 1st part */



offset = chunksize;



for (dest=1; dest<numtasks; dest++) {



MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);



MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);



printf("Sent %d elements to task %d offset= %d\n",chunksize,dest,offset);



offset = offset + chunksize;



}



/* Master does its part of the work */



offset = 0;

mysum = run_kernel(&data[offset], chunksize, taskid);

} /* end of master section */



if (taskid > MASTER) {







/* Receive my portion of array from the master task */

// start= MPI_Wtime();



source = MASTER;



MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);



MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, &status);

mysum = run_kernel(&data[offset], chunksize, taskid);

/* end of non-master */

}







MPI_Finalize();







}




My CUDA code:



#include <stdio.h>

#include <cutil_inline.h>

#include <cutil.h>







#include <thrust/version.h>



#include <thrust/generate.h>



#include <thrust/host_vector.h>



#include <thrust/device_vector.h>



#include <thrust/functional.h>



#include <thrust/transform_reduce.h>

#include <time.h>

#include <sys/time.h>

#include <sys/resource.h>





#define BLOCK_NUM 32



#define THREAD_NUM 512





__global__ static void sumOfSquares(int * num, int * result, clock_t * time,int DATA_SIZE)



{



extern __shared__ int shared[];



const int tid = threadIdx.x;



const int bid = blockIdx.x;







//if (tid == 0) time[bid] = clock();







shared[tid] = 0;



for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {



shared[tid] += num[i];



}







__syncthreads();



int offset = THREAD_NUM / 2;



while (offset > 0) {



if (tid < offset) {



shared[tid] += shared[tid + offset];



}



offset >>= 1;



__syncthreads();



}







if (tid == 0) {



result[bid] = shared[0];



//time[bid + BLOCK_NUM] = clock();



}



}



extern "C"

int run_kernel(int array[],int nelements, int taskid)

{



int * gpudata, * result, i;



clock_t * time;

cudaEvent_t start, stop;

cudaEventCreate(&start);

cudaEventCreate(&stop);

cudaEventRecord(start, 0);





cudaMalloc((void **) &gpudata, sizeof(int) * nelements);



cudaMalloc((void **) &result, sizeof(int) * THREAD_NUM * BLOCK_NUM);



cudaMalloc((void **) &time, sizeof(clock_t) * BLOCK_NUM * 2);



cudaMemcpy(gpudata, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);



printf("\n MPI Task %d is executing Kernel function........", taskid);







int sum[BLOCK_NUM];





sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time, nelements);





cudaMemcpy(&sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);



//calculate final sum from each block.



int final_sum = 0;



for (int i = 0; i < BLOCK_NUM; i++) {



final_sum += sum[i];



}



cudaEventRecord(stop, 0);

cudaEventSynchronize(stop);

float elapsedTime;

cudaEventElapsedTime(&elapsedTime, start, stop);

cudaEventDestroy(start);

cudaEventDestroy(stop);



cudaFree(gpudata);



cudaFree(result);











printf(" Task %d has sum (on GPU): %d Time for the kernel: %f ms \n", taskid, final_sum, elapsedTime);



printf("Task %d is calculating sum of %d elements using CPU......\n", taskid, nelements);



timespec time1, time2, temp_time;



clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);

long cpu_sum = 0;





for (i = 0; i < nelements; i++) {



cpu_sum += array[i];



}

clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);

temp_time.tv_sec = time2.tv_sec - time1.tv_sec;

printf("Task %d calculated sum: %d using CPU in %lf ms \n",taskid, cpu_sum, temp_time.tv_sec);

return final_sum;



}






The timing results are always equal. Say to calculate GPU time 100 ms and same for the CPU time.



Anyone can help what is going wrong.



Thanks

#1
Posted 05/01/2012 03:28 AM   
Scroll To Top