Use cudaGetDeviceCount properly, Floating Point Exceptions

I am able to make multi-GPU Cuda-C++ code.
https://github.com/Newbie-Coder-1105/NoviceChannel/blob/master/Cuda-Practice/multi_GPU_practice/ex1/sample_multigpu.cu
Here I know the number of GPUs. So I am able to perform the task as I want.

When I am using

cudaGetDeviceCount

to make it generic code, I am getting errors.

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda.h>

#define NB 32
#define NT 500
#define N NB*NT

__global__ void add( double *a, double *b, double *c, const int Ns );

//===========================================
__global__ void add( double *a, double *b, double *c, const int Ns){

    int tid = threadIdx.x + blockIdx.x * blockDim.x; 

    while(tid < Ns){
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }

}

//============================================
//BEGIN
//===========================================
int main( void ) {

    double *a, *b, *c;

    // allocate the memory on the CPU
    a=(double *)malloc(N*sizeof(double));
    b=(double *)malloc(N*sizeof(double));
    c=(double *)malloc(N*sizeof(double));
	
    int gpu_n;
    cudaGetDeviceCount(&gpu_n);
    printf("CUDA-capable device count: %i\n", gpu_n);

	int m_size = gpu_n + 1 ;
	double *dev_a[m_size], *dev_b[m_size], *dev_c[m_size];
	int Ns[m_size] ;
	for (int i = 0 ; i < m_size ; i++ )
	{
		Ns[i] = (i * N / gpu_n) + 1 ;
	}

	// allocate the memory on the GPUs
	for(int dev=0; dev<m_size; dev++) 
	{
		cudaSetDevice(dev);
		cudaMalloc( (void**)&dev_a[dev], Ns[dev] * sizeof(double) );
		cudaMalloc( (void**)&dev_b[dev], Ns[dev] * sizeof(double) );
		cudaMalloc( (void**)&dev_c[dev], Ns[dev] * sizeof(double) );
	}

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = (double)i;
        b[i] = (double)i*2;
    }

 
	// copy the arrays 'a' and 'b' to the GPUs
	for(int dev=0,pos=0; dev<m_size; pos+=Ns[dev], dev++) 
	{
		cudaSetDevice(dev);
		cudaMemcpy( dev_a[dev], a+pos, Ns[dev] * sizeof(double), cudaMemcpyHostToDevice);
		cudaMemcpy( dev_b[dev], b+pos, Ns[dev] * sizeof(double), cudaMemcpyHostToDevice);
	}


	for(int i=0;i<10000;++i) 
	{
		for(int dev=0; dev<m_size; dev++) 
		{
			cudaSetDevice(dev);
			add<<<NB,NT>>>( dev_a[dev], dev_b[dev], dev_c[dev], Ns[dev] );
		//	std::cout<<"next Kernel started for dev "<< dev<< '\n' ;
		}
	}
	
	// copy the arrays 'c' from the GPUs
	for(int dev=0,pos=0; dev<m_size; pos+=Ns[dev], dev++) 
	{
		cudaSetDevice(dev);
		cudaMemcpy( c+pos,dev_c[dev], Ns[dev] * sizeof(double), cudaMemcpyDeviceToHost);
	}

    // display the results
    // for (int i=0; i<N; i++) {
    //      printf( "%g + %g = %g\n", a[i], b[i], c[i] );
    //  }
    printf("\nGPU done\n");

    // free the memory allocated on the GPUS
	for(int dev=0; dev<m_size; dev++) 
	{
		cudaFree( dev_a[dev] );
		cudaFree( dev_b[dev] );
		cudaFree( dev_c[dev] );
	}
//	for (int i = 0 ; i < N ; i++ )
//	{
//		std::cout<<c[i]<<"    "<< i <<'\n' ;
//	}

    // free the memory allocated on the CPU
    free( a );
    free( b );
    free( c );

    return 0;
}

Where am I going wrong ?

My suggestion is that any time you are having trouble with a CUDA code, you should be using proper CUDA error checking (google that, and apply it to your code) and run your code with cuda-memcheck, before asking others for help.

Regarding the problem, you have a logical error here:

int gpu_n;
    cudaGetDeviceCount(&gpu_n);
        ...
	int m_size = gpu_n + 1 ;  // ******now m_size is one larger than the number of GPUs you have
        ...
	for(int dev=0; dev<m_size; dev++) 
	{
		cudaSetDevice(dev);  //when dev is m_size-1, this will be an illegal function call

suppose cudaGetDeviceCount returns 4, meaning you have 4 GPUs.
Your code will set m_size to 5 (right???)
The for-loop iterates up to 5, i.e. 0,1,2,3,4
but you can’t do a cudaSetDevice(4) when you have 4 GPUs. Only 0,1,2,3 are legal.

Yes, I got that error but still, the issue is there.

int gpu_n;
    cudaGetDeviceCount(&gpu_n);
    printf("CUDA-capable device count: %i\n", gpu_n);

	int m_size = gpu_n   ;
	double *dev_a[m_size], *dev_b[m_size], *dev_c[m_size];
	int Ns[m_size] ;
	for (int i = 0 ; i < m_size ; i++ )
	{
		Ns[i] = ((i+1) * N / gpu_n)  ;
	}
	std::cout<<"Ns is created"<<'\n' ;

After changes, still, there is an issue, i.e., SEGFAULT, at

// copy the arrays 'c' from the GPUs
	for(int dev=0,pos=0; dev<m_size; pos+=Ns[dev], dev++) 
	{
		cudaSetDevice(dev);
		cudaMemcpy( c+pos,dev_c[dev], Ns[dev] * sizeof(double), cudaMemcpyDeviceToHost);
		std::cout<<"cudaMemcpyDeviceToHost     "<<dev<<'\n' ;
	}

Throwing issue

cudaMemcpyDeviceToHost     0
cudaMemcpyDeviceToHost     1
Segmentation fault

Is there any issue with cudaMemcpyDeviceToHost for communicating between GPUs?

Once again, the problem is in your code. I’m working off the code you posted with the modification of this line:

int m_size = gpu_n   ;

In your code, sum up all the values of Ns[…]

Then compare that to N.

(don’t try to do this in your head. Actually write the code to sum up the values and print it out, and print out N also, for comparison)

If Ns sum is larger than N (by even one) that is a logical error in your code.

Since you are only allocating on the host for arrays of size N, but on the device side the sum of all Ns values is much larger than N, you have a logical error in your code.

Thanks

int m_size = gpu_n   ;
	double *dev_a[m_size], *dev_b[m_size], *dev_c[m_size];
	int Ns[m_size] ;

	for (int i = 0 ; i < m_size+1 ; i++ )
	{ 
		Ns[i]  =  N / gpu_n  ;
		
	}

It worked.

Now you are indexing out-of-bounds in your for-loop.

The problems here have nothing to do with CUDA.

Yes, Now I got the proper way to perform this task.

int m_size = gpu_n   ;
	double *dev_a[m_size], *dev_b[m_size], *dev_c[m_size];
	int Ns[m_size] ;

	for (int i = 0 ; i < m_size ; i++ )
	{ 
		if(i<(N%gpu_n))
		{
			Ns[i] = N/gpu_n + 1 ;
		}
		else 
		{
			Ns[i] = N/gpu_n ;
		}
		
	}

I think it will be generic for all kind of array

N

It is working fine. I checked it with

cuda-memcheck

I got this

========= CUDA-MEMCHECK
CUDA-capable device count: 4
Ns is created
data created
4000
4000
4000
4000
cudamemcpy done
kernel done
cudaMemcpyDeviceToHost     0
cudaMemcpyDeviceToHost     1
cudaMemcpyDeviceToHost     2
cudaMemcpyDeviceToHost     3
cudaMemcpyDeviceToHost done

GPU done
cudafree done
success!========= ERROR SUMMARY: 0 errors