//~ nvcc -g -G -rdc=true -arch=sm_35 -o t123 t123.cu -lcublas -lcublas_device -lcudadevrt
//~ sudo optirun --no-xorg ./t123
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <cublas_v2.h>
#define _size 256
#define _GENES 7
#define _MAXGB 536870912
//~ #define _MAXGB 1073741824
const int MINDEGREE = 2;
const int MAXDEGREE = 3;
const int MINSIZE = 5;
#define cudaCheckErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: Error(%d) %s %s in line %d\n", (int)code, cudaGetErrorString(code), file, line);
if (abort)
exit(code);
}
}
struct typebar{
int inh;
int non;
int act;
};
typedef struct typebar typebar;
enum IRT{
IRT_EQ = 0,
IRT_LE,//<
IRT_LQ,//<=
IRT_GR,//>
IRT_GQ //>=
};
__host__ __device__ float* dec2bin(int num, int sizeColunmArray){
float *arr, aux;
int i=0;
int r;
arr = new float;
if(arr == NULL){
printf("Error to allocate arr in dec2bin().\n");
assert(arr);
}
memset(arr, 0, sizeColunmArray*sizeof(float));
while(num != 0){
r = num%2;
arr[i++] = static_cast<float> (r);
num /= 2;
}
int j;
for(j=0;j<i/2; j++){
aux = arr[j];
arr[j] = arr[i-j-1];
arr[i-j-1] = aux;
}
//realloc vector
if(i<sizeColunmArray){
float *ptr;
ptr = new float;
if(ptr == NULL){
printf("Error to allocate ptr in dec2bin().\n");
assert(ptr);
}
memset(ptr, 0, sizeColunmArray*sizeof(float));
for(j=0; j<i; j++){
ptr = arr[j];
}
delete [] arr;
arr = NULL;
return ptr;
}
return arr;
}
__host__ __device__ unsigned int nextPow2(unsigned int x){
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
__device__ void retifica(float *gene_value, char *retify, char *invalid_all, int i, float value){
if(retify[i] == '0' && *invalid_all == '0'){
gene_value[i] = value;
retify[i] = '1';
}
else if(gene_value[i] != value && retify[i] == '1'){
*invalid_all = '1';
}
}
__global__ void kernel_getArrayFill(float *gene_value, char *retify, float *ind, float *array, int line){
int tid = threadIdx.x;
int size = blockDim.x;
if(retify[tid] == '1')
array[line*size + tid] = gene_value[tid];
else
array[line*size + tid] = ind[tid]-1;
}
__device__ void getNumBlocksAndThreads(int n, int maxThreads, int &blocks, int &threads){
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
blocks = (n + threads - 1) / threads;
}
__device__ float* get_array(float **gene_value, char **retify, char invalid_all, int size_columns, int *sizeArray){
int i, line, quant_indices=1;
float *ind, first_ind;
float *array = NULL;
int size_total = 0;
if(invalid_all == '0'){
for(i=0; i<size_columns; i++){
if((*retify)[i] == '0'){
quant_indices *= 3;
}
}
size_total = quant_indices * size_columns;
array = new float;
assert(array);
ind = new float;
assert(ind);
memset(ind, 0, size_columns*sizeof(float));
((*retify)[0] == '0') ? first_ind = 3.f : first_ind = 1.f;
line = 0;
while(ind[0] < first_ind){
#if (__CUDA_ARCH__ >= 350 )
kernel_getArrayFill<<<1, size_columns>>>(*gene_value, *retify, ind, array, line);
cudaDeviceSynchronize();
#else
for(i=0; i<size_columns; i++){
if((*retify)[i] == '1')
array[line*size_columns+i] = (*gene_value)[i];
else
array[line*size_columns+i] = ind[i]-1;
}
#endif
for(i = size_columns-1; i > -1; i--){
ind[i]++;
if((*retify)[i] == '0'){
if(ind[i] < 3 || i==0){
break;
}
else{
ind[i] = 0;
}
}
else{
if(ind[i] < 1 || i == 0){
break;
}
else{
ind[i] = 0;
}
}
}
line++;
}
delete [] ind;
ind = NULL;
}
delete [] *gene_value;
*gene_value = NULL;
delete [] *retify;
*retify = NULL;
*sizeArray = size_total;
return array;
}
__global__ void kernel_executa_linear_checking_sem_remover(float *result, char *invalids, int rel, int size){
int tid = (blockIdx.x + gridDim.x * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
if(tid < size){
if( invalids[tid] == '0' ){
if(rel == IRT_LE){
if( result[tid] >= 0 ){
invalids[tid]='1';
}
}
else if( rel == IRT_LQ ){
if( result[tid] > 0 ){
invalids[tid]='1';
}
}
else if( rel == IRT_GR ){
if( result[tid] <= 0 ){
invalids[tid]='1';
}
}
else if( rel == IRT_GQ ){
if( result[tid] < 0 ){
invalids[tid]='1';
}
}
else if( rel == IRT_EQ ){
if( result[tid] != 0 ){
invalids[tid]='1';
}
}
}
}
}
__device__ void linear_handle(cublasHandle_t cublashandle, cublasStatus_t cublasstat, float *matrix, int lines, int columns, char *invalids, IRT *_rel, int *num, int tamanho_rel, float *ptrAlpha, float *ptrBeta){
//~ __device__ void linear_handle(float *matrix, int lines, int columns, char *invalids, IRT *_rel, int *num, int tamanho_rel){
float *vetorA = NULL, *result;
result = new float[lines];
assert(result);
int i;
for(i=0; i<tamanho_rel; i++){
vetorA = dec2bin(num[i], columns);
memset(result, 0, lines*sizeof(float));
//chamada ao cublas com vetorA
//cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
if(lines >= columns){
cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, columns, lines, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
}
else{
cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
}
cudaDeviceSynchronize();
if (cublasstat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover() %d\n", (int)cublasstat);
return;
}
#if (__CUDA_ARCH__ >= 350 )
int blocos=0, threads=0;
getNumBlocksAndThreads(lines, 16, blocos, threads);
kernel_executa_linear_checking_sem_remover<<<blocos, threads>>>(result, invalids, _rel[i], lines);
cudaDeviceSynchronize();
#endif
delete [] vetorA;
vetorA = NULL;
}
delete [] result;
result = NULL;
}
__device__ void linear_handle2d(cublasHandle_t cublashandle, cublasStatus_t cublasstat, float *matrix, int lines, int columns, char *invalids, float *vetorA, IRT _rel, float *ptrAlpha, float *ptrBeta){
float *result;
result = new float[lines];
assert(result);
memset(result, 0, lines*sizeof(float));
//~ chamada ao cublas com vetorA
//~ cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
if(lines >= columns){
cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, columns, lines, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
cudaDeviceSynchronize();
if (cublasstat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover2d first half lines >= columns\n");
return;
}
cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, columns, lines, ptrAlpha, matrix, columns, vetorA+columns, 1, ptrBeta, result, 1);
cudaDeviceSynchronize();
if (cublasstat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover2d second half lines >= columns\n");
return;
}
}
else{
cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
cudaDeviceSynchronize();
if (cublasstat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover2d first half lines < columns\n");
return;
}
cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA+columns, 1, ptrBeta, result, 1);
cudaDeviceSynchronize();
if (cublasstat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover2d second half lines < columns\n");
return;
}
}
#if (__CUDA_ARCH__ >= 350 )
int blocos=0, threads=0;
getNumBlocksAndThreads(lines, 16, blocos, threads);
kernel_executa_linear_checking_sem_remover<<<blocos, threads>>>(result, invalids, _rel, lines);
cudaDeviceSynchronize();
#endif
//~ printf("%c %c %c %c %c\n", invalids[215], invalids[216], invalids[217], invalids[218], invalids[219]);
delete [] result;
result = NULL;
//~ cublasDestroy(cublashandle);
}
__global__ void kernel_count_invalids(char *invalids, int *lineInvalids, int size){
int tid = (blockIdx.x + gridDim.x * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
if(tid < size){
if(invalids[tid] == '1'){
atomicAdd(lineInvalids, 1);
}
}
}
__host__ __device__ void remove_invalids(float **matrix, int *lines, int columns, char *invalids, int sizeInvalids){
int newline = *lines - sizeInvalids;
int newsizearray = newline*columns;
float *newarray = new float[newsizearray];
assert(newarray);
int inicio = 0;
int i;
for(i=0; i<(*lines); i++){
if(invalids[i] == '0'){
memcpy(newarray+(inicio), (*matrix)+(i*columns), columns*sizeof(float));
inicio = inicio + columns;
}
}
if(*matrix){
delete [] (*matrix);
*matrix = NULL;
}
*matrix = newarray;
*lines = newline;
}
__host__ __device__ void removendo_invalidos(float **matrix, int *lines, int columns, char **invalids){
int *sizeInvalids = new int(0);
#if (__CUDA_ARCH__ >= 350 )
int blocos=0, threads=0;
getNumBlocksAndThreads(*lines, 16, blocos, threads);
kernel_count_invalids<<<blocos, threads>>>(*invalids, sizeInvalids, *lines);
cudaDeviceSynchronize();
#endif
remove_invalids(&(*matrix), lines, columns, *invalids, *sizeInvalids);
//~ for(int i=0; i<(*lines); i++){
//~ for(int j=0; j<_GENES; j++){
//~ printf("%.f ", matrix[i*_GENES+j]);
//~ }
//~ printf("\n");
//~ }
delete sizeInvalids;
sizeInvalids = NULL;
delete [] *invalids;
*invalids = NULL;
}
__device__ void teste_dd(cublasHandle_t cublashandle, cublasStatus_t cublasstat, float **meu_vetor, int *meu_lines){
float *gene_value = new float[_GENES];
assert(gene_value);
char *retify = new char[_GENES], invalid_all;
assert(retify);
memset(gene_value, 0, _GENES*sizeof(float));
memset(retify, '0', _GENES*sizeof(char));
invalid_all = '0';
retifica(gene_value, retify, &invalid_all, 0, -1);
retifica(gene_value, retify, &invalid_all, 1, 1);
if(threadIdx.x == 2){
retifica(gene_value, retify, &invalid_all, 2, 1);
}
int meu_size = 0;
*meu_vetor = get_array(&gene_value, &retify, invalid_all, _GENES, &meu_size);
*meu_lines = meu_size/_GENES;
char *invalids = new char[*meu_lines];
assert(invalids);
memset(invalids, '0', (*meu_lines)*sizeof(char));
float alpha = 1.f, beta = 1.f;
float *ptr_alpha = &alpha, *ptr_beta = β
float *num2 = new float[14];
assert(num2);
num2[0] = 0;
num2[1] = 0;
num2[2] = 0;
num2[3] = 0;
num2[4] = 0;
num2[5] = 0;
num2[6] = 0;
num2[7] = 1;
num2[8] = 1;
num2[9] = 0;
num2[10] = 0;
num2[11] = 0;
num2[12] = 1;
num2[13] = 1;
//~ IRT rel[1]={IRT_GR};
//~ int num1[1]={9};
//~ linear_handle(cublashandle, cublasstat, *meu_vetor, *meu_lines, _GENES, invalids, rel, num1, 1, ptr_alpha, ptr_beta);
linear_handle2d(cublashandle, cublasstat, *meu_vetor, *meu_lines, _GENES, invalids, num2, IRT_GQ, ptr_alpha, ptr_beta);
removendo_invalidos(&(*meu_vetor), &(*meu_lines), _GENES, &invalids);
delete [] num2;
num2 = NULL;
}
__device__ bool CheckDegree(float *row, int size){
int degree, i;
degree = 0;
for (i = 0; i < size; i++) {
if (row[i] != 0) {
degree++;
}
}
return (degree >= MINDEGREE && degree <= MAXDEGREE)? true : false;
}
__device__ void BarChart_per_block_atomic(float *row, typebar *A, int size){
int j;
for (j = 0; j < size; j++) {
if (row[j] == -1) {
atomicAdd(&A[j].inh, 1);
} else if (row[j] == 0) {
atomicAdd(&A[j].non, 1);
} else{
atomicAdd(&A[j].act, 1);
}
}
}
__global__ void kernel_conta_atividade(float *matrix, int lines, int columns, typebar *bar, int *rows_gi){
int tid = (blockIdx.x + gridDim.x * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
if(tid < lines){
float *solucao = matrix+(tid*columns);
if(columns <= MINSIZE || CheckDegree(solucao, columns)){
BarChart_per_block_atomic(solucao, bar, columns);
atomicAdd(rows_gi, 1);
}
}
}
__global__ void kerneldd(){
cublasHandle_t cublashandle;
cublasStatus_t cublasstat = cublasCreate(&cublashandle);
if (cublasstat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS initialization failed in %s()\n", __FUNCTION__);
return;
}
cublasstat = cublasSetPointerMode(cublashandle, CUBLAS_POINTER_MODE_DEVICE);
if (cublasstat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS initialization of cublasSetPointerMode failed in %s\n", __FUNCTION__);
return;
}
float *meu_vetor = NULL;
int meu_lines = 0;
teste_dd(cublashandle, cublasstat, &meu_vetor, &meu_lines);
__shared__ typebar *bar;
if(threadIdx.x == 0){
bar = new typebar[_GENES*_GENES];
assert(bar);
memset(bar, 0, _GENES*_GENES*sizeof(typebar));
}
int blocos=0, threads=0;
int *ptr_rows_gi = new int(0);
getNumBlocksAndThreads(meu_lines, 16, blocos, threads);
kernel_conta_atividade<<<blocos, threads>>>(meu_vetor, meu_lines, _GENES, bar+(threadIdx.x*_GENES), ptr_rows_gi);
cudaDeviceSynchronize();
printf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", threadIdx.x, *ptr_rows_gi, bar[threadIdx.x*_GENES].inh, bar[threadIdx.x*_GENES].non, bar[threadIdx.x*_GENES].act, bar[threadIdx.x*_GENES+1].inh, bar[threadIdx.x*_GENES+1].non, bar[threadIdx.x*_GENES+1].act, bar[threadIdx.x*_GENES+2].inh, bar[threadIdx.x*_GENES+2].non, bar[threadIdx.x*_GENES+2].act, bar[threadIdx.x*_GENES+3].inh, bar[threadIdx.x*_GENES+3].non, bar[threadIdx.x*_GENES+3].act, bar[threadIdx.x*_GENES+4].inh, bar[threadIdx.x*_GENES+4].non, bar[threadIdx.x*_GENES+4].act, bar[threadIdx.x*_GENES+5].inh, bar[threadIdx.x*_GENES+5].non, bar[threadIdx.x*_GENES+5].act, bar[threadIdx.x*_GENES+6].inh, bar[threadIdx.x*_GENES+6].non, bar[threadIdx.x*_GENES+6].act);
if(threadIdx.x == 0){
delete [] bar;
bar = NULL;
}
delete [] meu_vetor;
meu_vetor = NULL;
delete ptr_rows_gi;
ptr_rows_gi = NULL;
cublasDestroy(cublashandle);
}
void teste(){
cudaDeviceSetLimit(cudaLimitMallocHeapSize, _MAXGB);
//allocate array of _GENES positions, retifica() sets pos X to value Y,
//get_array() combines all possibilities of array (generating matrix),
//linear_handle() and line_handle2d() manipulates with
//cublas (cublasSgemv), remove_invalids() removes all lines that is invalids of matrix
//kernel_conta_atividade() counts numbers of -1, .., 1
//prints number of -1,..,1 end program
kerneldd<<<1, _GENES>>>();
cudaCheckErrors(cudaDeviceSynchronize());
}
int main(int argc, char **argv){
teste();
return 0;
}
In teste() I talk what program do. I noticed that if takeoff cudaDeviceSetLimit the memory not increase, this implies on memory increase?