I would like to compute sum of matrices A and matrice B and store result to C:
C = α op ( A ) + β op ( B )
I found that exactly for this purpose there is cublasSgeam function in CUDA. Below is a code in JCUDA:
import static jcuda.runtime.JCuda.cudaFree;
import static jcuda.runtime.JCuda.cudaMalloc;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
import java.util.Arrays;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasOperation;
public class SumMatricesExample {
public static void main(String[] args) {
JCuda.setExceptionsEnabled(true);
JCublas2.setExceptionsEnabled(true);
int rows = 2;
int columns = 3;
///////////////////////////////////////////////////////////
// Init matrixces
// Matrix A
float[] dataA = new float[] { 3, 1, 2, 1, 2, 3 };
Pointer deviceDataA = new Pointer();
cudaMalloc(deviceDataA, rows * columns * Sizeof.FLOAT);
cudaMemcpy(deviceDataA, Pointer.to(dataA), rows * columns * Sizeof.FLOAT, cudaMemcpyHostToDevice);
System.out.println(Arrays.toString(dataA));
// Matrix B
Pointer deviceDataB = new Pointer();
float[] dataB = new float[] { 0, 1, 2, 1, 2, 3 };
cudaMalloc(deviceDataB, rows * columns * Sizeof.FLOAT);
cudaMemcpy(deviceDataB, Pointer.to(dataB), rows * columns * Sizeof.FLOAT, cudaMemcpyHostToDevice);
System.out.println("+");
System.out.println(Arrays.toString(dataB));
// Matrix C
float[] dataC = new float[] { 0, 0, 0, 0, 0, 0 };
Pointer deviceDataC = new Pointer();
cudaMalloc(deviceDataC, rows * columns * Sizeof.FLOAT);
cudaMemcpy(deviceDataC, Pointer.to(dataC), rows * columns * Sizeof.FLOAT, cudaMemcpyHostToDevice);
///////////////////////////////////////////////////////////
// SUM of matrices
float alpha = 1.0f;
float beta = 1.0f;
// init
int m = rows;
int n = columns;
int k = rows;
// init
int lda = m;
int ldb = k;
int ldc = m;
// C = α op ( A ) + β op ( B )
cublasHandle handle = new cublasHandle();
JCublas2.cublasSgeam(handle, cublasOperation.CUBLAS_OP_N, cublasOperation.CUBLAS_OP_N, m, n,
Pointer.to(new float[] { alpha }), deviceDataA, lda, Pointer.to(new float[] { beta }), deviceDataB, ldb,
deviceDataC, ldc);
///////////////////////////////////////////////////////////
// show results
float data[] = new float[rows * columns];
cudaMemcpy(Pointer.to(data), deviceDataC, columns * rows * Sizeof.FLOAT, cudaMemcpyDeviceToHost);
System.out.println("=");
System.out.println(Arrays.toString(data));
///////////////////////////////////////////////////////////
// Clean memory
cudaFree(deviceDataA);
cudaFree(deviceDataB);
cudaFree(deviceDataC);
}
}
Maven:
<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcuda</artifactId>
<version>0.8.0</version>
</dependency>
<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcublas</artifactId>
<version>0.8.0</version>
</dependency>
<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcufft</artifactId>
<version>0.8.0</version>
</dependency>
<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcusparse</artifactId>
<version>0.8.0</version>
</dependency>
<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcusolver</artifactId>
<version>0.8.0</version>
</dependency>
<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jcurand</artifactId>
<version>0.8.0</version>
</dependency>
<dependency>
<groupId>org.jcuda</groupId>
<artifactId>jnvgraph</artifactId>
<version>0.8.0</version>
</dependency>
Unfortunately result of this addition is vector of zeros:
Exception in thread "main" jcuda.CudaException: CUBLAS_STATUS_NOT_INITIALIZED
at jcuda.jcublas.JCublas2.checkResult(JCublas2.java:137)
at jcuda.jcublas.JCublas2.cublasSgeam(JCublas2.java:5616)
at snippet.SumMatricesExample.main(SumMatricesExample.java:66)
How can I initialize CuBlas properly?