Hi, I want to call the cuBLAS API from inside a kernel. My launch configuration uses 2 blocks (this is only an example) and runs a kernel like the one below (it computes a matrix-vector multiplication, and each thread computes one dot product):
/**
 * Computes two matrix-vector products, one per block, with each thread
 * evaluating a single dot product.
 *
 * Block 0 multiplies dev_A0 by dev_a0; block 1 multiplies dev_A1 by dev_a1.
 * Thread t of a block reads the `max` consecutive matrix elements starting at
 * offset n * t and stores its dot product in result[t + n * blockIdx.x], so
 * every block fills its own column of `result`.
 *
 * Launch layout: a 1D grid and 1D blocks. Blocks with an index other than 0 or
 * 1 have no operands and store 0.0. Threads with threadIdx.x >= n do nothing,
 * because a column of `result` only has room for n entries.
 *
 * @param dev_a0  vector operand used by block 0 (at least `max` elements)
 * @param dev_a1  vector operand used by block 1 (at least `max` elements)
 * @param dev_A0  matrix operand used by block 0, stride n between thread rows
 * @param dev_A1  matrix operand used by block 1, stride n between thread rows
 * @param result  output buffer, one column of n entries per block
 * @param max     number of terms in each dot product
 * @param n       stride between consecutive rows of the matrices and between
 *                consecutive columns of `result`
 */
__global__ void product(double *dev_a0, double *dev_a1, double *dev_A0, double *dev_A1, double *result, int max, int n){
    // A thread index at or beyond n would land in another block's column of
    // `result` (or past the end of the buffer), so such threads must not run.
    if (n <= 0 || threadIdx.x >= static_cast<unsigned int>(n)) {
        return;
    }

    // Widen before multiplying so large strides cannot wrap in 32 bits.
    const size_t stride = static_cast<size_t>(n);

    double prod = 0.0;

    // The operand pair depends only on the block index, so it is selected once
    // instead of being re-tested on every loop iteration.
    if (blockIdx.x <= 1) {
        const double *vec = (blockIdx.x == 0) ? dev_a0 : dev_a1;
        const double *row = ((blockIdx.x == 0) ? dev_A0 : dev_A1) + stride * threadIdx.x;

        // Hand-written dot product; this is the spot where a cuBLAS dot-product
        // call was wanted.
        for (int i = 0; i < max; ++i) {
            prod = prod + vec[i] * row[i];
        }
    }

    // No barrier is needed: threads never exchange data with one another.
    // Each block writes its results into its own column.
    result[threadIdx.x + stride * blockIdx.x] = prod;
}
Is it possible to call the cuBLAS API for the dot product from inside the kernel?