JCUDA, matrices addition

I would like to compute sum of matrices A and matrice B and store result to C:

C = α op ( A ) + β op ( B )

I found that exactly for this purpose there is cublasSgeam function in CUDA. Below is a code in JCUDA:

import static jcuda.runtime.JCuda.cudaFree;
import static jcuda.runtime.JCuda.cudaMalloc;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;

import java.util.Arrays;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasOperation;

public class SumMatricesExample {
	public static void main(String[] args) {
		JCuda.setExceptionsEnabled(true);
		JCublas2.setExceptionsEnabled(true);

		int rows = 2;
		int columns = 3;

		///////////////////////////////////////////////////////////
		// Init matrixces
		// Matrix A
		float[] dataA = new float[] { 3, 1, 2, 1, 2, 3 };
		Pointer deviceDataA = new Pointer();
		cudaMalloc(deviceDataA, rows * columns * Sizeof.FLOAT);
		cudaMemcpy(deviceDataA, Pointer.to(dataA), rows * columns * Sizeof.FLOAT, cudaMemcpyHostToDevice);
		System.out.println(Arrays.toString(dataA));
		
		// Matrix B
		Pointer deviceDataB = new Pointer();
		float[] dataB = new float[] { 0, 1, 2, 1, 2, 3 };
		cudaMalloc(deviceDataB, rows * columns * Sizeof.FLOAT);
		cudaMemcpy(deviceDataB, Pointer.to(dataB), rows * columns * Sizeof.FLOAT, cudaMemcpyHostToDevice);
		System.out.println("+");
		System.out.println(Arrays.toString(dataB));
		
		// Matrix C
		float[] dataC = new float[] { 0, 0, 0, 0, 0, 0 };
		Pointer deviceDataC = new Pointer();
		cudaMalloc(deviceDataC, rows * columns * Sizeof.FLOAT);
		cudaMemcpy(deviceDataC, Pointer.to(dataC), rows * columns * Sizeof.FLOAT, cudaMemcpyHostToDevice);

		///////////////////////////////////////////////////////////
		// SUM of matrices
		float alpha = 1.0f;
		float beta = 1.0f;

		// init
		int m = rows;
		int n = columns;
		int k = rows;

		// init
		int lda = m;
		int ldb = k;
		int ldc = m;

		// C = α op ( A ) + β op ( B )
		cublasHandle handle = new cublasHandle();
		JCublas2.cublasSgeam(handle, cublasOperation.CUBLAS_OP_N, cublasOperation.CUBLAS_OP_N, m, n,
				Pointer.to(new float[] { alpha }), deviceDataA, lda, Pointer.to(new float[] { beta }), deviceDataB, ldb,
				deviceDataC, ldc);

		///////////////////////////////////////////////////////////
		// show results
		float data[] = new float[rows * columns];
		cudaMemcpy(Pointer.to(data), deviceDataC, columns * rows * Sizeof.FLOAT, cudaMemcpyDeviceToHost);
		System.out.println("=");
		System.out.println(Arrays.toString(data));

		///////////////////////////////////////////////////////////
		// Clean memory
		cudaFree(deviceDataA);
		cudaFree(deviceDataB);
		cudaFree(deviceDataC);
	}
}

Maven:

<dependency>
    <groupId>org.jcuda</groupId>
    <artifactId>jcuda</artifactId>
    <version>0.8.0</version>
</dependency>
<dependency>
    <groupId>org.jcuda</groupId>
    <artifactId>jcublas</artifactId>
    <version>0.8.0</version>
</dependency>
<dependency>
    <groupId>org.jcuda</groupId>
    <artifactId>jcufft</artifactId>
    <version>0.8.0</version>
</dependency>
<dependency>
    <groupId>org.jcuda</groupId>
    <artifactId>jcusparse</artifactId>
    <version>0.8.0</version>
</dependency>
<dependency>
    <groupId>org.jcuda</groupId>
    <artifactId>jcusolver</artifactId>
    <version>0.8.0</version>
</dependency>
<dependency>
    <groupId>org.jcuda</groupId>
    <artifactId>jcurand</artifactId>
    <version>0.8.0</version>
</dependency>
<dependency>
    <groupId>org.jcuda</groupId>
    <artifactId>jnvgraph</artifactId>
    <version>0.8.0</version>
</dependency>

Unfortunately result of this addition is vector of zeros:

Exception in thread "main" jcuda.CudaException: CUBLAS_STATUS_NOT_INITIALIZED
	at jcuda.jcublas.JCublas2.checkResult(JCublas2.java:137)
	at jcuda.jcublas.JCublas2.cublasSgeam(JCublas2.java:5616)
	at snippet.SumMatricesExample.main(SumMatricesExample.java:66)

How can I initialize CuBlas properly?

Answer:

JCublas2.cublasCreate(handle);

must be called before cublasSgeam is executed.