Dear all
I had developed a serial merge sort and tested it on Intel core 2 Due, then I parallelized it on GPU using cuda c run time, but what unexpextd for me that the serial program is faster than the parallel program. why does this happened and how can I make the parallel version faster than serial?
this is serial code
// Merge6Serial.cpp : Defines the entry point for the console application.
//
#include <windows.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
//#include <cmath>
#define PP 1
#define P (PP*2048)
#define N P*8*1024
void threadsMerge1(int *a, int *temp, int rec_num);
void threadsMerge0(int *a);
int main(){
int *a, *temp;
FILE *fp = fopen("E:\\data.txt", "w");// file for data
a = (int *)malloc(N*sizeof(int));
temp= (int *)malloc(N*sizeof(int));
//FILE *fp = fopen("C:\\output_merge_serial.txt", "w");// file for data
srand(time(NULL));
for (int i = 0; i < N; i++)
{
int num = rand() % 100;
a[i] = num;
}
printf("\n ******************** \n");
int x = log2(P);
for (int i = 0; i < N; i = i + (8*1024)){
qsort(a + i,8*1024, sizeof(int), compare);
}
//start execution
//*********************************************************************************
//----------------------------------------
LARGE_INTEGER freq;
LARGE_INTEGER t0, tF, tDiff;
double elapsedTime;
double resolution;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&t0);
//-----------------------------------------
//sort array of size up to 2048*1024. up to N=8*2048*1024
//------------------------------------------------
threadsMerge1(a, temp, 16*1024, x);
//*********************************************************************************
printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");
printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");
QueryPerformanceCounter(&tF);
tDiff.QuadPart = tF.QuadPart - t0.QuadPart;
elapsedTime = tDiff.QuadPart / (double)freq.QuadPart;
printf("\n Code under test took %lf sec\n", elapsedTime);
printf("\n ******************** \n");
free(a);
free(temp);
//------------------------------------------
//------------------------------------------
fclose(fp);
return 0;
}
void threadsMerge1(int *a, int *temp,int rec_num,int u)
{
int index1, endIndex1, index2, endIndex2, targetIndex, sortedSize;
for (int i = 0; i <N/rec_num; i++){
index1 = (i*rec_num);
endIndex1 = index1 + (rec_num/2)-1;
index2 = endIndex1 + 1;
endIndex2 = index2 + (rec_num/2)-1;
targetIndex = index1;
sortedSize = 0;
while (index1 <= endIndex1 && index2 <= endIndex2){
if (a[index1] <= a[index2]){
temp[targetIndex] = a[index1];
++sortedSize;
++index1;
++targetIndex;
}
else{
temp[targetIndex] = a[index2];
++index2;
++sortedSize;
++targetIndex;
}
}
if (index1 < endIndex1 &&index2 == endIndex2){
if (sortedSize < rec_num){
int x = 0;
while (x < (rec_num / 2)){
temp[targetIndex] = a[index1];
++sortedSize;
++targetIndex;
++index1;
++x;
}
}
}
if (index2 > endIndex2 &&index1 <= endIndex1){
if (sortedSize < rec_num){
int x = 0;
while (x < (rec_num / 2)){
temp[targetIndex] = a[index1];
++sortedSize;
++targetIndex;
++index1;
++x;
if (index1 > endIndex1)
break;
}
}
}
if (index1 > endIndex1 &&index2 <= endIndex2){
if (sortedSize < rec_num){
int x = 0;
while (x < rec_num / 2){
temp[targetIndex] = a[index2];
++sortedSize;
++targetIndex;
++index2;
++x;
if (index2 > endIndex2)
break;
}
}
}
}
if (u > 1){
threadsMerge1(temp, a, 2 * rec_num, --u);
}
}
this is the parallel code
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cmath>
#define P (PP*2048)
#define N P*8*1024 // size of array to be sorted must be power of 2 multipled of 1024. Minimum size of array is 8*1024
// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
//****************************************************************************************
// functions for cuda error checking
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
// More careful checking. However, this will affect performance.
// Comment away if needed.
err = cudaDeviceSynchronize();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
//****************************************************************************************
//sort until maximum size of array = 1*1024*1024
//***************************************************************************************
// (kernel) function that merge each 2048 of the subbarys of size 1024 (2048*1024) in the origin array
__device__ void FinalMergePart1(int *a, int *temp, int rec_num, int d, int x,int s,int b)
{
int threadId = threadIdx.x;
if (0 <= threadId && threadId < (s / (2 * d))){
int BlockStart = blockIdx.x*(b *8* 1024);
int index1 = BlockStart + rec_num * (threadId);
int endIndex1 = index1 + (rec_num / 2) - 1;
int index2 = endIndex1 + 1;
int endIndex2 = index2 + (rec_num / 2) - 1;
int targetIndex = index1;
int sortedSize = 0;
//printf("th= %d ", threadId);
while (index1 <= endIndex1 && index2 <= endIndex2){
if (a[index1] <= a[index2]){
temp[targetIndex] = a[index1];
++sortedSize;
++index1;
++targetIndex;
}
else{
temp[targetIndex] = a[index2];
++index2;
++sortedSize;
++targetIndex;
}
}
if (index1 < endIndex1 &&index2 == endIndex2){
if (sortedSize < rec_num){
int bb = 0;
while (bb < (rec_num / 2)){
temp[targetIndex] = a[index1];
++sortedSize;
++targetIndex;
++index1;
++bb;
}
}
}
if (index2 > endIndex2 &&index1 <= endIndex1){
if (sortedSize < rec_num){
int bb = 0;
while (bb < (rec_num / 2)){
temp[targetIndex] = a[index1];
++sortedSize;
++targetIndex;
++index1;
++bb;
if (index1 > endIndex1)
break;
}
}
}
if (index1 > endIndex1 &&index2 <= endIndex2){
if (sortedSize < rec_num){
int bb = 0;
while (bb < rec_num / 2){
temp[targetIndex] = a[index2];
++sortedSize;
++targetIndex;
++index2;
++bb;
if (index2 > endIndex2)
break;
}
}
}
}
__syncthreads();
if (x > 1){
FinalMergePart1(temp, a, 2 * rec_num, 2 * d, --x,s,b);
}
}
// (kernel) functions that sort each subarray of the origin array, each sorted subarray is of size 1024
__global__ void endphase1(int *a, int *temp, int x, int s,int b){
FinalMergePart1(a, temp, 2048*8, 1, x,s,b);
}
//*********************************************************************************************
//*********************************************************************************************
int compare(const void* left, const void* right)
{
int *lt = (int*)left,
*rt = (int*)right,
diff = *lt - *rt;
if (diff < 0) return -1;
if (diff > 0) return +1;
return 0;
}
//****************************************************************************************************************************************
int main()
{
int *a; // the main array to be sorted
int *dev_a, *dev_temp1; // array on the device
int sum1 = 0;// variables used in error checking
int sum2 = 0;
//FILE *fp = fopen("C:\\data.txt", "w");// file for data
//*********************************************
cudaMalloc((void **)&dev_a, N*sizeof(int));//allocate memory on device
cudaMalloc((void **)&dev_temp1, N*sizeof(int));//allocate memory on device
a = (int *)malloc(N*sizeof(int)); //allocate memory on host
//*******1*************************************
//--------------------------
//genetate random data
for (int i = 0; i < N; i++)
{
int num = rand() % 100;
a[i] = num;
}
//-------------------------
for (int i = 0; i < N; i = i + (8*1024)){
qsort(a + i,8*1024, sizeof(int), compare);
}
printf("\n 1 way \n");
printf("P= %d\n",P);
//*********************************************************
//transfer data from host memory to device memory, where dev_a is the destination
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
//*********************************************************
int x = log2(P);
int z = log2(PP);
//*************************************************************
// calculating elapsed time, phase1
cudaEvent_t start, stop; float time;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//**************************************************************
//start execution
//*********************************************************************************
//sort up to N=1024*1024
if (1 < P&&P <= 2048){
if (P == 2048){
endphase1 << <1, 1024 >> >(dev_a, dev_temp1, x, P,2048);
}
else {
endphase1 << <1, P >> >(dev_a, dev_temp1, x, P,2048);
}
}
//***********************************************************************
// calculating elapsed time, phase2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("kernel time in ms:\t%.7f\n", time);
//**********************************************************************
//------------------------------------------------------------
// cuda error checking
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
{
printf("CUDA Error: %s\n", cudaGetErrorString(error));
// we can't recover from the error -- exit the program
return 1;
}
//-----------------------------------------------------------
cudaDeviceSynchronize();
if (x%2==0){
cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost); // return sorted array to host
}
else{
cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost); // return sorted array to host
}
//***********************************************************************************
for (int i = 0; i < N; i++)
{
if (a[i] == 0){
++sum2;
}
}
cudaFree(dev_a);//free memory on device
cudaFree(dev_temp1);
free(a);//free memory on host
}