Problem with performance running parallel blocks

Rayjunx · March 24, 2018, 2:02pm

Hi!

im a beginner and try to realize a simple “game of life” programm running parallel on blocks.
So I take the AD-LOOP demo out of the CUDA BY EXAMPLE book and added my game of life and some performance counting code brackets inside. But I guess I did something wrong.

The code runs, but theres two mayor problems I cannot fix:

The performance is quite bad, with a GTX1080 I get just about a single Gigaflop :-(
(I know its not a exact benchmark, its just a absolute rough estimation)
If I increase the blocks/N or the rounds of game of the game of life simulation just a bit I get
programm errors and no results. I defined that values direct on top so you can play with it too.

About memory: The blocks can keep the Game Of Life Array-Map alone, no need for sharing them, I guess if this array is defined inside the block it may performing better?

Im just sure its my fault using something wrong…?
Can you maybe help me out a bit?

Cheers
Rayjunx

Heres the code:

// Just a test of how to use CUDA to simulate Game of Life by Rayjunx 2018 //
#include "book.h"
#include <ctime>

#define N 200 // Number of Blocks( 1 Block 1 Game of life world with 200x200 element array )
#define GOLROUNDS 20 // How many rounds in GOL gets simulated
#define REPEATS 2 // Repeat all simulations x times

__global__ void gol(int *a, int *b, int *c) {
	int tid = blockIdx.x; // tid = Block Number

	//printf("Handle block nr: %d\n", tid);

	//c[tid] = a[tid] + b[tid];

	int gol[200][200]; // game of life array
	int golnew[200][200]; // game of life array
	int i1, i2, i3, i4, i5;
	int ld1;
		
	// Clean Game of life grid
	for (i1 = 0; i1 < 200; i1++)
		for (i2 = 0; i2 < 200; i2++)
		{
			gol[i1][i2]=0;
			golnew[i1][i2] = 0;
		}

	// Now fill array depending onto 3 setup variables
	i3 = a[tid];
	i4 = b[tid];
	i5 = c[tid];
	for (i1 = 1; i1 < 199; i1++)
		for (i2 = 1; i2 < 199; i2++)
		{
			gol[i1][i2] = 0;

			i3--;
			if (i3 <= 0)
			{
				//printf("a(tid)=%d", a[tid]);
				i3 = a[tid];
				gol[i1][i2] = 1;
			}

			i4--;
			if (i4 <= 0)
			{
				i4 = b[tid];
				gol[i1][i2] = 1;
			}

			//printf("%d", gol[i1][i2]);
		}

	// Now simulate some rounds of game of life:
	for (i5 = 0; i5 < GOLROUNDS; i5++) // Turns of game of life
	{
		for (i1 = 1; i1 < 199; i1++)
			for (i2 = 1; i2 < 199; i2++)
			{
				i3 = 0; // Count value of libing cells nearby
				i3 += gol[i1 - 1][i2 - 1];
				i3 += gol[i1][i2 - 1];
				i3 += gol[i1 + 1][i2 - 1];
				i3 += gol[i1 - 1][i2];
				i3 += gol[i1 + 1][i2];
				i3 += gol[i1 - 1][i2 + 1];
				i3 += gol[i1][i2 + 1];
				i3 += gol[i1 + 1][i2 + 1];

				if (i3 < 2) golnew[i1][i2] = 0;
				if ((i3 == 2) || (i3 == 3))
					if (gol[i1][i2] == 1) golnew[i1][i2] = 1;
				if (i3 > 3) golnew[i1][i2] = 0;
				if (i3 == 3) golnew[i1][i2] = 1;
			}
		for (i1 = 1; i1 < 199; i1++)
			for (i2 = 1; i2 < 199; i2++)
			{
				gol[i1][i2] = golnew[i1][i2];
				golnew[i1][i2] = 0;
			}
	}

	// Count all living cells
	ld1 = 0;
	for (i1 = 1; i1 < 199; i1++)
		for (i2 = 1; i2 < 199; i2++)
		{
			ld1 += gol[i1][i2];
		}
		
	a[tid] = ld1; // Give this value back as 'a'
	b[tid] = 0; // unused
	b[tid] = 0; // unused

	//printf("Block Nr: %d of %d finished and have had a totol of %d living cells \n", tid, N - 1, ld1);
	
}

int main(void) {
	printf("Starting GAME OF LIFE simulation: \n\n");
	printf("Setuped with: \n");
	printf("N: %d - total blocks/worlds of game of life with 200x200 grid each\n", N);
	printf("GOLROUNDS: %d - simulate rounds of GOL\n", GOLROUNDS);
	printf("REPEATS: %d - repeat all\n\n", REPEATS);

	srand(1);

	clock_t t;
	t = clock();

	int *a, *b, *c;
	int *dev_a, *dev_b, *dev_c;
	int alltogether;
	int i1, i2, i3, i4;
	float f1, f2;

	// allocate the memory on the CPU
	a = (int*)malloc(N * sizeof(int));
	b = (int*)malloc(N * sizeof(int));
	c = (int*)malloc(N * sizeof(int));

	// allocate the memory on the GPU
	HANDLE_ERROR(cudaMalloc((void**)&dev_a, N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_b, N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_c, N * sizeof(int)));


	for (int iloop = 0; iloop < REPEATS; iloop++)
	{

		// fill the arrays 'a' and 'b' and c on the CPU
		for (int i = 0; i < N; i++) 
		{
			a[i] = rand() % 20 + 1;
			b[i] = rand() % 1000 + 1; // Random value 1-1000
			c[i] = 0;
		}

		// copy the arrays 'a' and 'b' and c to the GPU
		HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(int),  cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(int),	cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(dev_c, c, N * sizeof(int),	cudaMemcpyHostToDevice));

		//printf("Starte ADD Nr.%d \n", iloop);

		gol <<<N, 1 >>> (dev_a, dev_b, dev_c); 

		// copy the array a b and 'c' back from the GPU to the CPU
		HANDLE_ERROR(cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(b, dev_b, N * sizeof(int), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));

		alltogether = 0;
		for (int i = 0; i < N; i++) // Count all results together
		{
			//printf("a: %d  and together: %d \n", a[i], alltogether);
			alltogether += (a[i]);
		}
		printf("%d run. All Living Cells in all Blocks: %d \n", iloop+1, alltogether);

	}


	t = clock() - t;
	printf("Clocktime: %d ms \n\n",  t);

	printf("Per Block/GOL World: Arround 200x200x12 operations for creation and adding living cells at the end\n");
	printf("Per Block/GOL World: Arround 200x200x17 operations for simulationg a single round of GOL\n");
	i1 = 200 * 200 * 12 + 200*200*17*GOLROUNDS;
	printf("Per Block/GOL World: Creating + n simulated rounds are arround %d operations \n", i1 );
	i2 = N * (i1 / 1000/1000);
	printf("Operations for all Blocks of one run together in megaflops: %d \n", i2 );
	i3 = i2 * REPEATS;
	printf("Operations of all runs together in megaflops: %d \n", i3);
	
	f2 = t;
	f1 = 1000 / f2;
	printf("Time factor to get work capacity of one second: %.3f \n\n", f1);
	i4 = float(i3)*f1;
	printf("megaflops per second: %d \n", i4);
	printf("gigaflops per second: %d \n", i4/1000);
	printf("terraflops per second: %d \n", i4/1000000);

	// free the memory we allocated on the GPU
	HANDLE_ERROR(cudaFree(dev_a));
	HANDLE_ERROR(cudaFree(dev_b));
	HANDLE_ERROR(cudaFree(dev_c));

	// free the memory we allocated on the CPU
	free(a);
	free(b);
	free(c);

	return 0;
}

and heres the book.h source file:

/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


#ifndef __BOOK_H__
#define __BOOK_H__
#include <stdio.h>

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


#define HANDLE_NULL( a ) {if (a == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );}}

template< typename T >
void swap( T& a, T& b ) {
    T t = a;
    a = b;
    b = t;
}


void* big_random_block( int size ) {
    unsigned char *data = (unsigned char*)malloc( size );
    HANDLE_NULL( data );
    for (int i=0; i<size; i++)
        data[i] = rand();

    return data;
}

int* big_random_block_int( int size ) {
    int *data = (int*)malloc( size * sizeof(int) );
    HANDLE_NULL( data );
    for (int i=0; i<size; i++)
        data[i] = rand();

    return data;
}


// a place for common kernels - starts here

__device__ unsigned char value( float n1, float n2, int hue ) {
    if (hue > 360)      hue -= 360;
    else if (hue < 0)   hue += 360;

    if (hue < 60)
        return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
    if (hue < 180)
        return (unsigned char)(255 * n2);
    if (hue < 240)
        return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
    return (unsigned char)(255 * n1);
}

__global__ void float_to_color( unsigned char *optr,
                              const float *outSrc ) {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float l = outSrc[offset];
    float s = 1;
    int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
    float m1, m2;

    if (l <= 0.5f)
        m2 = l * (1 + s);
    else
        m2 = l + s - l * s;
    m1 = 2 * l - m2;

    optr[offset*4 + 0] = value( m1, m2, h+120 );
    optr[offset*4 + 1] = value( m1, m2, h );
    optr[offset*4 + 2] = value( m1, m2, h -120 );
    optr[offset*4 + 3] = 255;
}

__global__ void float_to_color( uchar4 *optr,
                              const float *outSrc ) {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float l = outSrc[offset];
    float s = 1;
    int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
    float m1, m2;

    if (l <= 0.5f)
        m2 = l * (1 + s);
    else
        m2 = l + s - l * s;
    m1 = 2 * l - m2;

    optr[offset].x = value( m1, m2, h+120 );
    optr[offset].y = value( m1, m2, h );
    optr[offset].z = value( m1, m2, h -120 );
    optr[offset].w = 255;
}


#if _WIN32
    //Windows threads.
    #include <windows.h>

    typedef HANDLE CUTThread;
    typedef unsigned (WINAPI *CUT_THREADROUTINE)(void *);

    #define CUT_THREADPROC unsigned WINAPI
    #define  CUT_THREADEND return 0

#else
    //POSIX threads.
    #include <pthread.h>

    typedef pthread_t CUTThread;
    typedef void *(*CUT_THREADROUTINE)(void *);

    #define CUT_THREADPROC void
    #define  CUT_THREADEND
#endif

//Create thread.
CUTThread start_thread( CUT_THREADROUTINE, void *data );

//Wait for thread to finish.
void end_thread( CUTThread thread );

//Destroy thread.
void destroy_thread( CUTThread thread );

//Wait for multiple threads.
void wait_for_threads( const CUTThread *threads, int num );

#if _WIN32
    //Create thread
    CUTThread start_thread(CUT_THREADROUTINE func, void *data){
        return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
    }

    //Wait for thread to finish
    void end_thread(CUTThread thread){
        WaitForSingleObject(thread, INFINITE);
        CloseHandle(thread);
    }

    //Destroy thread
    void destroy_thread( CUTThread thread ){
        TerminateThread(thread, 0);
        CloseHandle(thread);
    }

    //Wait for multiple threads
    void wait_for_threads(const CUTThread * threads, int num){
        WaitForMultipleObjects(num, threads, true, INFINITE);

        for(int i = 0; i < num; i++)
            CloseHandle(threads[i]);
    }

#else
    //Create thread
    CUTThread start_thread(CUT_THREADROUTINE func, void * data){
        pthread_t thread;
        pthread_create(&thread, NULL, func, data);
        return thread;
    }

    //Wait for thread to finish
    void end_thread(CUTThread thread){
        pthread_join(thread, NULL);
    }

    //Destroy thread
    void destroy_thread( CUTThread thread ){
        pthread_cancel(thread);
    }

    //Wait for multiple threads
    void wait_for_threads(const CUTThread * threads, int num){
        for(int i = 0; i < num; i++)
            end_thread( threads[i] );
    }

#endif




#endif  // __BOOK_H__

BulatZiganshin · March 25, 2018, 9:18am

you need to continue learning CUDA and NVidia GPUs. Just now you are trying to use it as CPU, and it doesn’t work

You instance kernel with <N,1> config, but CUDA warp is 32-wide. This means that you are using only single thread of each warp, wasting 97% of GPY power. You should use <N/32,32> instead
Then, each thread in your program simulates its own world. But you need a lot of threads to fill up entire GPU, it’s in range from thousands to dozens of thousands, depending on how large it. With 320 KB per each thread, you obviously have too much data to keep it in GPU caches. So, each access to arrays will go into RAM, that is pretty slow, and your program speed is limited by RAM access rather than computation speed

The right way to implement efficient simulation is to run one simulation per thread block, but use as much as possible threads in each block, i.e. <N, 1024>. Put your arrays into shared memory, and make sure that they are as small as possible (10 KB is enough for 200*200 field). You will need syncthreads() call to share work between threads in the block.

Yes, this means some optimization, and it’s usual way to make full use of GPU

Rayjunx · March 25, 2018, 8:06pm

Hi BulatZiganshin,

thank you a lot for your help, I understand and try to fix it based on your advices.

I now use 128 blocks with 128 threads each. Coming to about 380gigaflops.
I reduced the array to bool and 50x50 so just about 5 kilobit for the 2 maps.
The tip about using syncthreads sounds good, already read a bit about it, going to use it,
for this test I do not share information between the threadds/blocks.

What I still not understand is why the programm causes errors if for example I just higher
the turns of game of life simulated. Have each thread / block a max. runtime? It just looks
crazy what happens somethimes, for example if I higher the block number a bit more, the
programm finish very quick, theoretical with 100+ terraflops but I find out it NOT handle the
code I writen for the blocks.

And shouldnt it be already perfect using 128 blocks with 128 threads so about 16384 parallel processes for a GPU with just minimum cache consumption, about 0,7kb per thread. So I guess I should get something out of the 8gigaflops a gtx1080 have. 380gigaflops are still just 5% of this potential :(

I guess theres still a problem to solve?

// Just a test of how to use CUDA ^-
#include "book.h"
#include <ctime>

#define N 128 // Number of Blocks( 1 Block 1 Game of life world with 50x50 element array )
#define GOLROUNDS 2000 // How many rounds in GOL gets simulated
#define REPEATS 1 // Repeat all simulations x times

__global__ void gol(int *a, int *b, int *c) {
	int tid = threadIdx.x; // tid = Block Number

	//printf("Handle thread nr: %d\n", tid);

	//c[tid] = a[tid] + b[tid];

    bool gol[50][50]; // game of life array
	bool golnew[50][50]; // game of life array
	int i1, i2, i3, i4, i5;
	int ld1;
		
	// Clean Game of life grid
	for (i1 = 0; i1 < 50; i1++)
		for (i2 = 0; i2 < 50; i2++)
		{
			gol[i1][i2]=0;
			golnew[i1][i2] = 0;
		}

	// Now fill array depending onto 3 setup variables
	i3 = a[tid];
	i4 = b[tid];
	for (i1 = 1; i1 < 49; i1++)
		for (i2 = 1; i2 < 49; i2++)
		{
			gol[i1][i2] = 0;

			i3--;
			if (i3 <= 0)
			{
				//printf("a(tid)=%d", a[tid]);
				i3 = a[tid];
				gol[i1][i2] = 1;
			}

			i4--;
			if (i4 <= 0)
			{
				i4 = b[tid];
				gol[i1][i2] = 1;
			}
		}

	// Now simulate some rounds of game of life:
	for (i5 = 0; i5 < GOLROUNDS; i5++) // Turns of game of life
	{
		for (i1 = 1; i1 < 49; i1++)
			for (i2 = 1; i2 < 49; i2++)
			{
				i3 = 0; // Count value of libing cells nearby
				i3 += gol[i1 - 1][i2 - 1];
				i3 += gol[i1][i2 - 1];
				i3 += gol[i1 + 1][i2 - 1];
				i3 += gol[i1 - 1][i2];
				i3 += gol[i1 + 1][i2];
				i3 += gol[i1 - 1][i2 + 1];
				i3 += gol[i1][i2 + 1];
				i3 += gol[i1 + 1][i2 + 1];

				if (i3 < 2) golnew[i1][i2] = 0;
				if ((i3 == 2) || (i3 == 3))
					if (gol[i1][i2] == 1) golnew[i1][i2] = 1;
				if (i3 > 3) golnew[i1][i2] = 0;
				if (i3 == 3) golnew[i1][i2] = 1;
			}
		for (i1 = 1; i1 < 49; i1++)
			for (i2 = 1; i2 < 49; i2++)
			{
				gol[i1][i2] = golnew[i1][i2];
				golnew[i1][i2] = 0;
			}
	}

	// Count all living cells
	ld1 = 0;
	for (i1 = 1; i1 < 49; i1++)
		for (i2 = 1; i2 < 49; i2++)
		{
			ld1 += gol[i1][i2];
		}
		
	a[tid] = ld1; // Give this value back as 'a'
	b[tid] = 0; // unused
	c[tid] = -1; // successfully handled

	//printf("Block Nr: %d of %d finished and have had a totol of %d living cells \n", tid, N - 1, ld1);
	
}

int main(void) {
	printf("Starting GAME OF LIFE simulation: \n\n");
	printf("Setuped with: \n");
	printf("N: %d - total blocks/worlds of game of life with 50x50 grid each\n", N);
	printf("GOLROUNDS: %d - simulate rounds of GOL\n", GOLROUNDS);
	printf("REPEATS: %d - repeat all\n\n", REPEATS);

	srand(1);

	clock_t t;
	t = clock();

	int *a, *b, *c;
	int *dev_a, *dev_b, *dev_c;
	int alltogether;
	int i1, i2, i3, i4;
	float f1, f2;

	// allocate the memory on the CPU
	a = (int*)malloc(N * sizeof(int));
	b = (int*)malloc(N * sizeof(int));
	c = (int*)malloc(N * sizeof(int));

	// allocate the memory on the GPU
	HANDLE_ERROR(cudaMalloc((void**)&dev_a, N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_b, N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_c, N * sizeof(int)));

	for (int iloop = 0; iloop < REPEATS; iloop++)
	{

		// fill the arrays 'a' and 'b' and c on the CPU
		for (int i = 0; i < N; i++) 
		{
			a[i] = rand() % 20 + 1;
			b[i] = rand() % 1000 + 1; // Random value 1-1000
			c[i] = 0;
		}

		// copy the arrays 'a' and 'b' and c to the GPU
		HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(int),  cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(int),	cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(dev_c, c, N * sizeof(int),	cudaMemcpyHostToDevice));

		//printf("Starte ADD Nr.%d \n", iloop);

		gol <<<128, N>>> (dev_a, dev_b, dev_c); 

		// copy the array a b and 'c' back from the GPU to the CPU
		HANDLE_ERROR(cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(b, dev_b, N * sizeof(int), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));

		alltogether = 0;
		for (int i = 0; i < N; i++) // Count all results together
		{
			//printf("a: %d  and together: %d \n", a[i], alltogether);
			alltogether += (a[i]);
		}
		printf("%d run. All Living Cells in all Blocks: %d \n", iloop+1, alltogether);

		// Check if something not processed well
		for (int i = 0; i < N; i++) 
		{
			if (c[i] > 0)
				printf("\n W T F !!! \n\n");
		}

	}

	t = clock() - t;
	printf("Clocktime: %d ms \n\n",  t);

	printf("Per Block/GOL World: Arround 50x50x12 operations for creation and adding living cells at the end\n");
	printf("Per Block/GOL World: Arround 50x50x17 operations for simulationg a single round of GOL\n");
	i1 = 50 * 50 * 12 + 50*50*17*GOLROUNDS;
	printf("Per Block/GOL World: Creating + n simulated rounds are arround %d operations \n", i1 );
	i2 = 128*N * (i1 / 1000/1000);
	printf("Operations for all Blocks of one run together in kiloflops: %d \n", i2 );
	i3 = i2 * REPEATS;
	printf("Operations of all runs together in kiloflops: %d \n", i3);
	
	f2 = t;
	f1 = 1000 / f2;
	printf("Time factor to get work capacity of one second: %.3f \n\n", f1);
	i4 = float(i3)*f1;
	printf("megaflops per second: %d \n", i4);
	printf("gigaflops per second: %d \n", i4/1000);
	printf("terraflops per second: %d \n", i4/1000000);

	// free the memory we allocated on the GPU
	HANDLE_ERROR(cudaFree(dev_a));
	HANDLE_ERROR(cudaFree(dev_b));
	HANDLE_ERROR(cudaFree(dev_c));

	// free the memory we allocated on the CPU
	free(a);
	free(b);
	free(c);

	return 0;
}

Rayjunx · March 26, 2018, 5:44pm

Optimized in Block / Thread ratio and minimize the array map again.
Now we come to about 1,9 gigaflops.

Its just a bit strange why the code acts so sensitive, even a few bits less of a 50x50 or 30x30 bool array, and it performs different. And i have to use 16 threads per block, not 32 to get best results.

And most important, I have no Idea why it still get unstabele, increasing simulation time of the Game Of Life, or further increase threads or blocks, and programm crashes… That shouldnt be… :(

Do somebody have some idea?

// Just a test of how to use CUDA ^-
#include "book.h"
#include <ctime>

#define N 4096 // Blocks n ( 1 Block 1 Game of life world with 50x50 element array ) - max. 65535 blocks
#define T 16 // Threads n - at least 32 threads per block to get good performance
#define GOLROUNDS 500 // How many rounds in GOL gets simulated
#define REPEATS 5 // Repeat all simulations x times

__global__ void gol(int *a, int *b, int *c) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;

	//printf("Handle Block: %d - Thread: %d - Tid: %d \n", blockIdx.x, threadIdx.x, tid);

    bool gol[30][30]; // game of life array
	bool golnew[30][30]; // game of life array
	int i1, i2, i3, i4, i5;
	int ld1;
		
	// Clean Game of life grid
	for (i1 = 0; i1 < 30; i1++)
		for (i2 = 0; i2 < 30; i2++)
		{
			gol[i1][i2]=0;
			golnew[i1][i2] = 0;
		}

	// Now fill array depending onto 3 setup variables
	i3 = a[tid];
	i4 = b[tid];
	for (i1 = 1; i1 < 29; i1++)
		for (i2 = 1; i2 < 29; i2++)
		{
			gol[i1][i2] = 0;

			i3--;
			if (i3 <= 0)
			{
				//printf("a(tid)=%d", a[tid]);
				i3 = a[tid];
				gol[i1][i2] = 1;
			}

			i4--;
			if (i4 <= 0)
			{
				i4 = b[tid];
				gol[i1][i2] = 1;
			}
		}

	// Now simulate some rounds of game of life:
	for (i5 = 0; i5 < GOLROUNDS; i5++) // Turns of game of life
	{
		for (i1 = 1; i1 < 29; i1++)
			for (i2 = 1; i2 < 29; i2++)
			{
				i3 = 0; // Count value of libing cells nearby
				i3 += gol[i1 - 1][i2 - 1];
				i3 += gol[i1][i2 - 1];
				i3 += gol[i1 + 1][i2 - 1];
				i3 += gol[i1 - 1][i2];
				i3 += gol[i1 + 1][i2];
				i3 += gol[i1 - 1][i2 + 1];
				i3 += gol[i1][i2 + 1];
				i3 += gol[i1 + 1][i2 + 1];

				if (i3 < 2) golnew[i1][i2] = 0;
				if ((i3 == 2) || (i3 == 3))
					if (gol[i1][i2] == 1) golnew[i1][i2] = 1;
				if (i3 > 3) golnew[i1][i2] = 0;
				if (i3 == 3) golnew[i1][i2] = 1;
			}
		// gol old to new map
		for (i1 = 1; i1 < 29; i1++)
			for (i2 = 1; i2 < 29; i2++)
			{
				gol[i1][i2] = golnew[i1][i2];
				golnew[i1][i2] = 0;
			}
	}

	// Count all living cells
	ld1 = 0;
	for (i1 = 1; i1 < 29; i1++)
		for (i2 = 1; i2 < 29; i2++)
		{
			ld1 += gol[i1][i2];
		}
		
	a[tid] = ld1; // Give this value back as 'a'
	//b[tid] = 0; // unused
	c[tid] = 1; // successfully handled

	//printf("Block Nr: %d of %d finished and have had a totol of %d living cells \n", tid, N - 1, ld1);
	
}

int main(void) {
	printf("Starting GAME OF LIFE simulation: \n\n");
	printf("Setuped with: \n");
	printf("N: %d - total blocks with T: %d Threads each, of game of life with 30x30 grid each\n", N, T);
	printf("GOLROUNDS: %d - simulate rounds of GOL\n", GOLROUNDS);
	printf("REPEATS: %d - repeat all\n\n", REPEATS);

	srand(1);

	clock_t t;
	t = clock();

	int *a, *b, *c;
	int *dev_a, *dev_b, *dev_c;
	int alltogether;
	int i1, i2, i3, i4;
	float f1, f2;

	// allocate the memory on the CPU
	a = (int*)malloc(N * T * sizeof(int));
	b = (int*)malloc(N * T * sizeof(int));
	c = (int*)malloc(N * T * sizeof(int));

	// allocate the memory on the GPU
	HANDLE_ERROR(cudaMalloc((void**)&dev_a, N * T * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_b, N * T * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_c, N * T * sizeof(int)));

	for (int iloop = 0; iloop < REPEATS; iloop++)
	{

		// fill the arrays 'a' and 'b' and c on the CPU
		for (int i = 0; i < N * T; i++) 
		{
			a[i] = rand() % 20 + 1;
			b[i] = rand() % 1000 + 1; // Random value 1-1000
			c[i] = 0;
		}

		// copy the arrays 'a' and 'b' and c to the GPU
		HANDLE_ERROR(cudaMemcpy(dev_a, a, N * T * sizeof(int),  cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(dev_b, b, N * T * sizeof(int),	cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(dev_c, c, N * T * sizeof(int),	cudaMemcpyHostToDevice));

		//printf("Starte ADD Nr.%d \n", iloop);

		gol <<<N, T>>> (dev_a, dev_b, dev_c); 

		// copy the array a b and 'c' back from the GPU to the CPU
		HANDLE_ERROR(cudaMemcpy(a, dev_a, N * T * sizeof(int), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(b, dev_b, N * T * sizeof(int), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(c, dev_c, N * T * sizeof(int), cudaMemcpyDeviceToHost));

		alltogether = 0;
		for (int i = 0; i < N * T; i++) // Count all results together
		{
			//printf("a: %d  and together: %d \n", a[i], alltogether);
			alltogether += (a[i]);
		}
		//printf("%d run. All Living Cells in all Blocks: %d \n", iloop+1, alltogether);

		// Check if something not processed well
		for (int i = 0; i < N * T; i++)
		{
			if (c[i] == 0)
				printf("\n W T F !!! \n\n");
		}

	}

	t = clock() - t;
	printf("Clocktime: %d ms \n\n",  t);

	printf("Per Block/GOL World: Arround 30x30x12 operations for creation and adding living cells at the end\n");
	printf("Per Block/GOL World: Arround 30x30x17 operations for simulationg a single round of GOL\n");
	i1 = 30 * 30 * 12 + 30*30*17*GOLROUNDS;
	printf("Per Block/GOL World: Creating + n simulated rounds are arround %d operations \n", i1 );
	i2 = 128*N * (i1 / 1000/1000);
	printf("Operations for all Blocks of one run together in kiloflops: %d \n", i2 );
	i3 = i2 * REPEATS;
	printf("Operations of all runs together in kiloflops: %d \n", i3);
	
	f2 = t;
	f1 = 1000 / f2;
	printf("Time factor to get work capacity of one second: %.3f \n\n", f1);
	i4 = float(i3)*f1;
	printf("megaflops per second: %d \n", i4);
	printf("gigaflops per second: %d \n", i4/1000);
	printf("terraflops per second: %d \n", i4/1000000);

	// free the memory we allocated on the GPU
	HANDLE_ERROR(cudaFree(dev_a));
	HANDLE_ERROR(cudaFree(dev_b));
	HANDLE_ERROR(cudaFree(dev_c));

	// free the memory we allocated on the CPU
	free(a);
	free(b);
	free(c);

	return 0;
}