execution time on CPU and GPU

Dear all
I developed a serial merge sort and tested it on an Intel Core 2 Duo, then I parallelized it on the GPU using the CUDA C runtime. What was unexpected for me is that the serial program is faster than the parallel program. Why does this happen, and how can I make the parallel version faster than the serial one?

This is the serial code:

// Merge6Serial.cpp : Defines the entry point for the console application.
//
#include <windows.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
//#include <cmath>

#define PP  1
#define  P (PP*2048)
#define N  P*8*1024 

void threadsMerge1(int *a, int *temp, int rec_num, int u);
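
// comparison callback used by qsort() below (this is the same compare()
// function that appears in the CUDA version later in the thread)
int compare(const void* left, const void* right)
{
	int   *lt = (int*)left,
		*rt = (int*)right,
		diff = *lt - *rt;

	if (diff < 0) return -1;
	if (diff > 0) return +1;
	return 0;
}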

int main(){

	 int *a, *temp;
	 FILE *fp = fopen("E:\\data.txt", "w");// file for data

	a = (int *)malloc(N*sizeof(int));
	temp= (int *)malloc(N*sizeof(int));

	//FILE *fp = fopen("C:\\output_merge_serial.txt", "w");// file for data
	srand(time(NULL));
	for (int i = 0; i < N; i++)
	{
		int num = rand() % 100;
		a[i] = num;
	}

	printf("\n ******************** \n");

	int x = (int)log2((double)P); // number of merge passes needed

	// pre-sort each 8*1024-element chunk with qsort before merging
	for (int i = 0; i < N; i = i + (8*1024)){
		qsort(a + i, 8*1024, sizeof(int), compare);
	}
	

	
	//start execution
	//*********************************************************************************
	//----------------------------------------

	LARGE_INTEGER freq;
	LARGE_INTEGER t0, tF, tDiff;
	double elapsedTime;
	double resolution;
	QueryPerformanceFrequency(&freq);
	QueryPerformanceCounter(&t0);
	//-----------------------------------------

	// merge the pre-sorted 8*1024-element runs pairwise, doubling the run
	// size on every pass until all N elements are sorted
	//------------------------------------------------

	threadsMerge1(a, temp, 16*1024, x);

	//*********************************************************************************

	printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");

	printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");

	QueryPerformanceCounter(&tF);
	tDiff.QuadPart = tF.QuadPart - t0.QuadPart;
	elapsedTime = tDiff.QuadPart / (double)freq.QuadPart;

	printf("\n Code under test took %lf sec\n", elapsedTime);
	

	printf("\n ******************** \n");

	free(a);
	free(temp);
	//------------------------------------------

	//------------------------------------------
	fclose(fp);
	return 0;
}

// one bottom-up merge pass: merges consecutive pairs of sorted runs of length
// rec_num/2 from a into temp, then recurses with the buffers swapped until u
// passes have been performed
void threadsMerge1(int *a, int *temp, int rec_num, int u)
{
	 int index1, endIndex1, index2, endIndex2, targetIndex, sortedSize;
	 for (int i = 0; i < N/rec_num; i++){
		 
		 index1 =  (i*rec_num);
		 endIndex1 = index1 + (rec_num/2)-1;
	     index2 = endIndex1 + 1;
		 endIndex2 = index2 + (rec_num/2)-1;
		 targetIndex = index1;
		 sortedSize = 0;
	
		while (index1 <= endIndex1 && index2 <= endIndex2){

			if (a[index1] <= a[index2]){
				temp[targetIndex] = a[index1];
				++sortedSize;
				++index1;
				++targetIndex;
			}
			else{
				temp[targetIndex] = a[index2];
				++index2;
				++sortedSize;
				++targetIndex;
			}
		}

		if (index1 < endIndex1 &&index2 == endIndex2){

			if (sortedSize < rec_num){
				int x = 0;
				while (x < (rec_num / 2)){

					temp[targetIndex] = a[index1];
					++sortedSize;
					++targetIndex;
					++index1;
					++x;
				}
			}

		}

		if (index2 > endIndex2 &&index1 <= endIndex1){
			if (sortedSize < rec_num){
				int x = 0;
				while (x < (rec_num / 2)){

					temp[targetIndex] = a[index1];
					++sortedSize;
					++targetIndex;
					++index1;
					++x;
					if (index1 > endIndex1)
						break;
				}

			}
		}

		if (index1 > endIndex1 &&index2 <= endIndex2){
			if (sortedSize < rec_num){
				int x = 0;
				while (x < rec_num / 2){

					temp[targetIndex] = a[index2];
					++sortedSize;
					++targetIndex;
					++index2;
					++x;
					if (index2 > endIndex2)
						break;
				}
			}
		}
	}

	 if (u > 1){

		 threadsMerge1(temp, a, 2 * rec_num, --u);

	 }
	
}

This is the parallel code:

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cmath>

#define PP  1
#define  P (PP*2048)
#define N  P*8*1024 // size of the array to be sorted; must be a power-of-two multiple of 1024. Minimum array size is 8*1024

// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
 
//****************************************************************************************
// functions for cuda error checking
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	cudaError err = cudaGetLastError();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}

	// More careful checking. However, this will affect performance.
	// Comment away if needed.
	err = cudaDeviceSynchronize();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
//****************************************************************************************

//***************************************************************************************
// device function called from the endphase1 kernel: each thread merges one pair of
// sorted runs of length rec_num/2 from a into temp, then the function recurses with
// the buffers swapped and the run size doubled
__device__ void FinalMergePart1(int *a, int *temp, int rec_num, int d, int x,int s,int b)
{

	int threadId = threadIdx.x;

	if (0 <= threadId && threadId < (s / (2 * d))){

		int BlockStart = blockIdx.x*(b *8* 1024);
		

		int index1 = BlockStart + rec_num * (threadId);
		int endIndex1 = index1 + (rec_num / 2) - 1;
		int index2 = endIndex1 + 1;
		int endIndex2 = index2 + (rec_num / 2) - 1;
		int targetIndex = index1;
		int sortedSize = 0;
		//printf("th= %d ", threadId);
		while (index1 <= endIndex1 && index2 <= endIndex2){

			if (a[index1] <= a[index2]){
				temp[targetIndex] = a[index1];
				++sortedSize;
				++index1;
				++targetIndex;
			}
			else{
				temp[targetIndex] = a[index2];
				++index2;
				++sortedSize;
				++targetIndex;
			}
		}

		if (index1 < endIndex1 &&index2 == endIndex2){

			if (sortedSize < rec_num){
				int bb = 0;
				while (bb < (rec_num / 2)){

					temp[targetIndex] = a[index1];
					++sortedSize;
					++targetIndex;
					++index1;
					++bb;
				}
			}

		}

		if (index2 > endIndex2 &&index1 <= endIndex1){
			if (sortedSize < rec_num){
				int bb = 0;
				while (bb < (rec_num / 2)){

					temp[targetIndex] = a[index1];
					++sortedSize;
					++targetIndex;
					++index1;
					++bb;
					if (index1 > endIndex1)
						break;
				}

			}
		}

		if (index1 > endIndex1 &&index2 <= endIndex2){
			if (sortedSize < rec_num){
				int bb = 0;
				while (bb < rec_num / 2){

					temp[targetIndex] = a[index2];
					++sortedSize;
					++targetIndex;
					++index2;
					++bb;
					if (index2 > endIndex2)
						break;
				}
			}
		}
	}
	__syncthreads();
	if (x > 1){
	
		FinalMergePart1(temp, a, 2 * rec_num, 2 * d, --x,s,b);
		
	}
	

}

//  kernel that drives the merge passes over the pre-sorted 8*1024-element runs

__global__ void endphase1(int *a, int *temp, int x, int s,int b){

	FinalMergePart1(a, temp, 2048*8, 1, x,s,b);

}

//*********************************************************************************************
//*********************************************************************************************
int compare(const void* left, const void* right)
{
	int   *lt = (int*)left,
		*rt = (int*)right,
		diff = *lt - *rt;

	if (diff < 0) return -1;
	if (diff > 0) return +1;
	return 0;
}

//****************************************************************************************************************************************
int main()
{
	int *a; // the main array to be sorted
	int *dev_a, *dev_temp1;   // array on the device

	int sum1 = 0;// variables used in error checking
	int sum2 = 0;

	//FILE *fp = fopen("C:\\data.txt", "w");// file for data

	//*********************************************
	CudaSafeCall(cudaMalloc((void **)&dev_a, N*sizeof(int)));//allocate memory on device
	CudaSafeCall(cudaMalloc((void **)&dev_temp1, N*sizeof(int)));//allocate memory on device
	a = (int *)malloc(N*sizeof(int)); //allocate memory on host

	//*******1*************************************

	//--------------------------
	//generate random data
	for (int i = 0; i < N; i++)
	{
		int num = rand() % 100;
		a[i] = num;
	}
	//-------------------------

	// pre-sort each 8*1024-element chunk on the CPU before transferring
	for (int i = 0; i < N; i = i + (8*1024)){
		qsort(a + i, 8*1024, sizeof(int), compare);
	}
	

	printf("\n 1 way \n");
	printf("P= %d\n",P);

	//*********************************************************
	//transfer data from host memory to device memory, where dev_a is the destination
	cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
	//*********************************************************
	int x = (int)log2((double)P); // number of merge passes needed

	//*************************************************************
	// calculating elapsed time, phase1
	cudaEvent_t start, stop; float time;
	cudaEventCreate(&start); cudaEventCreate(&stop);
	cudaEventRecord(start, 0);

	//**************************************************************

	//start execution
	//*********************************************************************************

	// launch the merge kernel; everything runs in a single block, so the
	// thread count is capped at the 1024-threads-per-block limit when P == 2048
	if (1 < P && P <= 2048){

		if (P == 2048){
			endphase1<<<1, 1024>>>(dev_a, dev_temp1, x, P, 2048);
		}
		else {
			endphase1<<<1, P>>>(dev_a, dev_temp1, x, P, 2048);
		}
	}

	//***********************************************************************
	// calculating elapsed time, phase2
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	printf("kernel time in ms:\t%.7f\n", time);
	//**********************************************************************
	//------------------------------------------------------------
	// cuda error checking
	cudaError_t error = cudaGetLastError();
	if (error != cudaSuccess)
	{
		printf("CUDA Error: %s\n", cudaGetErrorString(error));

		// we can't recover from the error -- exit the program
		return 1;
	}
	//-----------------------------------------------------------
	cudaDeviceSynchronize();
	if (x%2==0){
		cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost); // return sorted array to host
	}
	
	else{
		cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost); // return sorted array to host
	}

	//***********************************************************************************

	for (int i = 0; i < N; i++)
	{
		if (a[i] == 0){
			++sum2;
		}

	}

	
	cudaFree(dev_a);//free memory on device
	cudaFree(dev_temp1);
	free(a);//free memory on host

	return 0;
}

Writing a custom merge sort application in CUDA is not a beginner task.

Instead, compare against thrust::sort() using device pointers. That sort should be about 50-70x faster than std::sort() for large N.
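
For reference, the basic pattern looks like this (a minimal sketch assuming dev_a and N are the device array and size from your code, not a drop-in replacement for your program):

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

// wrap the raw device pointer so Thrust knows the data already lives on the GPU,
// then sort it in place on the device
thrust::device_ptr<int> d_begin(dev_a);
thrust::sort(d_begin, d_begin + N);

For primitive types with the default comparator, thrust::sort typically falls back to a radix sort internally, which is a big part of why it is so fast.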

Also, your CUDA implementation looks wonky, so take clues from the CUDA SDK mergeSort sample. The code is in there for examination. For N = 4,194,304, this was my output:

Initializing GPU merge sort...
Running GPU merge sort...
Time: 9.452534 ms
Reading back GPU merge sort results...
Inspecting the results...
...inspecting keys array: OK
...inspecting keys and values array: OK
...stability property: stable!
Shutting down...

Also keep in mind that the fastest sort implementations on GPUs for primitive types are usually radix sorts.
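
If you want to see what a radix sort call looks like without writing one yourself, here is a minimal sketch using CUB's DeviceRadixSort; d_keys_in, d_keys_out and num_items are placeholder names, not variables from your code:

#include <cub/cub.cuh>

// d_keys_in and d_keys_out are device arrays of num_items ints (placeholders)
void  *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;

// first call with d_temp_storage == NULL only computes the scratch space needed
cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                               d_keys_in, d_keys_out, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);

// second call performs the actual radix sort
cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                               d_keys_in, d_keys_out, num_items);
cudaFree(d_temp_storage);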

Thank you for your contribution.

I think that thrust::sort sorts the array as a black box, which means you don't have to worry about how each thread works.
I'm very interested in which element of the array each thread works on.
What can I do?

After a quick search I came across this paper, which in a general sense goes over different approaches to sorting in parallel:

[url]https://docs.google.com/viewer?url=http://nets.cis.fiu.edu/images/gpuSort.pdf[/url]

While those authors do not contribute any detailed new techniques, it is a good general overview of others' previous work on the topic. Most books on CUDA have sections on sorting, so take a look at those as well.

Also take a look at the CUDA SDK sample code for simple quicksort, but that implementation (I believe) requires compute capability 3.5 or higher.
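
The reason for that requirement is dynamic parallelism (launching a kernel from inside another kernel), which first appeared in compute capability 3.5. A minimal sketch of the feature itself, with made-up kernel names rather than the actual sample code; it needs to be built with relocatable device code, e.g. nvcc -arch=sm_35 -rdc=true:

__global__ void childKernel(int *data, int n)
{
	// ... sort or partition a sub-range here ...
}

__global__ void parentKernel(int *data, int n)
{
	if (threadIdx.x == 0 && n > 1) {
		// a kernel launched from inside another kernel: this device-side
		// launch is what requires compute capability 3.5 or higher
		childKernel<<<1, 256>>>(data, n);
	}
}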

I get the quicksort sample, but what is its benefit for merge sort?

They are both ‘Divide-and-Conquer’ algorithms.

If you are truly interested in this topic, you are going to need to do your own research.

Good luck!

I will, thank you.