I am using MPI and CUDA thrust.
I have MPI code as follows:
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>
#define MASTER 0
#define ARRAYSIZE 40000000
int *masterarray;
int *onearray;
int *twoarray;
int *threearray;
int *fourarray;
int *fivearray;
int *sixarray;
int *sevenarray;
int *eightarray;
int *ninearray;
int main(int argc, char* argv[])
{
int numtasks, taskid,chunksize, namelen;
int mysum,one,two,three,four,five,six,seven,eight,nine;
char myname[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
int a,b,c,d,e,f,g,h,i,j;
/***** Initializations *****/
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Get_processor_name(myname, &namelen);
printf ("MPI task %d has started on host %s...\n", taskid, myname);
//chunksize = 20000000;
masterarray= malloc(ARRAYSIZE * sizeof(int));
onearray= malloc(ARRAYSIZE * sizeof(int));
twoarray= malloc(ARRAYSIZE * sizeof(int));
threearray= malloc(ARRAYSIZE * sizeof(int));
fourarray= malloc(ARRAYSIZE * sizeof(int));
fivearray= malloc(ARRAYSIZE * sizeof(int));
sixarray= malloc(ARRAYSIZE * sizeof(int));
sevenarray= malloc(ARRAYSIZE * sizeof(int));
eightarray= malloc(ARRAYSIZE * sizeof(int));
ninearray= malloc(ARRAYSIZE * sizeof(int));
/***** Master task only ******/
if (taskid == MASTER){
for(a=0;a<ARRAYSIZE;a++){
masterarray[a] = 1;
}
mysum = run_kernel0(masterarray,ARRAYSIZE,taskid, myname);
} /* end of master section */
if (taskid > MASTER) {
if(taskid == 1){
for(b=0;b<ARRAYSIZE;b++){
onearray[b] = 1;
}
one = run_kernel0(onearray,ARRAYSIZE,taskid, myname);
}
if(taskid == 2){
for(c=0;c<ARRAYSIZE;c++){
twoarray[c] = 1;
}
two = run_kernel0(twoarray,ARRAYSIZE,taskid, myname);
}
if(taskid == 3){
for(d=0;d<ARRAYSIZE;d++){
threearray[d] = 1;
}
three = run_kernel0(threearray,ARRAYSIZE,taskid, myname);
}
if(taskid == 4){
for(e=0;e<ARRAYSIZE;e++){
fourarray[e] = 1;
}
four = run_kernel0(fourarray,ARRAYSIZE,taskid, myname);
}
if(taskid == 5){
for(f=0;f<ARRAYSIZE;f++){
fivearray[f] = 1;
}
five = run_kernel0(fivearray,ARRAYSIZE,taskid, myname);
}
if(taskid == 6){
for(g=0;g<ARRAYSIZE;g++){
sixarray[g] = 1;
}
six = run_kernel0(sixarray,ARRAYSIZE,taskid, myname);
}
if(taskid == 7){
for(h=0;h<ARRAYSIZE;h++){
sevenarray[h] = 1;
}
seven = run_kernel0(sevenarray,ARRAYSIZE,taskid, myname);
}
if(taskid == 8){
for(i=0;i<ARRAYSIZE;i++){
eightarray[i] = 1;
}
eight = run_kernel0(eightarray,ARRAYSIZE,taskid, myname);
}
if(taskid == 9){
for(j=0;j<ARRAYSIZE;j++){
ninearray[j] = 1;
}
nine = run_kernel0(ninearray,ARRAYSIZE,taskid, myname);
}
}
MPI_Finalize();
}
And my cuda thrust code:
#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>
extern "C"
int run_kernel0( int array[], int nelements, int taskid, char hostname[])
{
float elapsedTime;int d_sum;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
thrust::device_vector<int> gpuarray(data, data + nelements);
d_sum = thrust::reduce(gpuarray.begin(),gpuarray.end());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf(" Task %d has sum (on GPU): %ld Time for the kernel: %f ms \n", taskid, d_sum, elapsedTime);
return d_sum;
The code works when ARRAYSIZE is 20000000 but fails when I increase it.
Not sure what is causing the problem.
Here is the trace -
terminate called after throwing an instance of ‘thrust::system::detail::bad_alloc’
what(): std::bad_alloc: out of memory
*** Process received signal ***
Signal: Aborted (6)
Signal code: (-6)
[ 0] [0xe1640c]
[ 1] /lib/libc.so.6(abort+0x17a) [0x5b43ca]
[ 2] /usr/lib/libstdc++.so.6(_ZN9__gnu_cxx27__verbose_terminate_handlerEv+0x167) [0x254327]
[ 3] /usr/lib/libstdc++.so.6(-0xff5b0e7a) [0x252186]
[ 4] /usr/lib/libstdc++.so.6(-0xff5b0e3d) [0x2521c3]
[ 5] /usr/lib/libstdc++.so.6(-0xff5b0cfe) [0x252302]
[ 6] mpi_array_distributed(_ZN6thrust6detail7backend4cuda6mallocILj0EEENS_10device_ptrIvEEj+0x17b) [0x805ff27]
[ 7] mpi_array_distributed(_ZN6thrust6detail7backend8dispatch6mallocILj0EEENS_10device_ptrIvEEjNS0_21cuda_device_space_tagE+0x19) [0x805fa12]
[ 8] mpi_array_distributed(_ZN6thrust13device_mallocEj+0x1d) [0x805f563]
[ 9] mpi_array_distributed(_ZN6thrust13device_mallocIiEENS_10device_ptrIT_EEj+0x23) [0x8061ef3]
[10] mpi_array_distributed(_ZN6thrust23device_malloc_allocatorIiE8allocateEjNS_10device_ptrIKiEE+0x5e) [0x8061886]
[11] mpi_array_distributed(_ZN6thrust6detail18contiguous_storageIiNS_23device_malloc_allocatorIiEEE8allocateEj+0x3f) [0x806136f]
[12] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE17allocate_and_copyIPiEEvjT_S7_RNS0_18contiguous_storageIiS3_EE+0x158) [0x8060fb8]
[13] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE10range_initIPiEEvT_S7_NS0_17integral_constantIbLb0EEE+0x42) [0x8060a0e]
[14] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE13init_dispatchIPiEEvT_S7_NS0_17integral_constantIbLb0EEE+0x23) [0x80602cd]
[15] mpi_array_distributed(ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEEC2IPiEET_S7+0x3a) [0x806004a]
[16] mpi_array_distributed(ZN6thrust13device_vectorIiNS_23device_malloc_allocatorIiEEEC2IPiEET_S6+0x1f) [0x805fa63]
[17] mpi_array_distributed(run_kernel0+0x27) [0x805d407]
[18] mpi_array_distributed(main+0x54a) [0x805d35e]
[19] /lib/libc.so.6(__libc_start_main+0xe6) [0x59ece6]
[20] mpi_array_distributed() [0x805cd81]
*** End of error message ***
I am executing the program on cluster.
Can anyone please help ?
Thanks