Difficulties Copying Nested Structs to Device Mem lots of cudaMalloc and cudaMemcpy

jwilson75503 · October 8, 2011, 1:34am

I’m trying to copy some nested structs to device memory for kernel use in a CUDA-accelerated neural network simulator. This code links and runs, but it throws some exceptions and CUDA errors:

typedef struct rdLayer

{

	long NeuronQty ;

	long DendriteQty ;

	

	cuDoubleComplex *gpuWeights ;

	cuDoubleComplex *gpuZOutputs ;

	cuDoubleComplex *gpuDeltas ;

	cuDoubleComplex *gpuUnWeights ;

} rdLayer;

typedef struct rdNetwork

{

	long SectorQty;

	double K_DIV_TWO_PI;

	double two_pi_div_sect_qty;

	cuDoubleComplex *gpuSectorBdry;

	long LayerQty;

	rdLayer *rLayer;

} rdNetwork;

struct rdLearningSet 

{

	long EvalMode ;

	long SampleQty ;

	long InputQty ;

	long OutputQty ;

	long ContOutputs ;

	long SampleIdxReq ;

	cuDoubleComplex *gpuXInputs ;

	cuDoubleComplex *gpuDOutputs ;

	cuDoubleComplex *gpuYOutputs ;

	double *gpudSE1024 ;

	cuDoubleComplex *gpuOutScalar ;

};

[...]

	struct rdLearningSet * rdLearn;

	struct rdNetwork * rdNet;

[...]

	cudaMalloc(&rdNet, sizeof(rdNetwork));

	cudaMalloc(&rdLearn, sizeof(rdLearningSet));

[...]

	cuDoubleComplex * dummy;

	struct rdLayer rdlSource, * rdldummy;

[...]

	//rdLayer *rLayer;

	cudaMalloc(&rdldummy, sizeof(rdLayer)*rSes.rNet->LayerQty);

	cudaMemcpy( &rdNet->rLayer, &rdldummy, sizeof(rdLayer*), cudaMemcpyHostToDevice);

	for (int L=1; L<rSes.rNet->LayerQty; L++){

		// construct layer to be copied

		rdlSource.NeuronQty=rSes.rNet->rLayer[L].iNeuronQty 

		rdlSource.DendriteQty=rSes.rNet->rLayer[L].iDendriteQty 

		cudaMalloc( &rdlSource.gpuWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) ) 

			mCheckCudaWorked

		cudaMalloc( &rdlSource.gpuZOutputs, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) ) 

			mCheckCudaWorked

		cudaMalloc( &rdlSource.gpuDeltas, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) ) 

			mCheckCudaWorked

		cudaMalloc( &rdlSource.gpuUnWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) ) 

			mCheckCudaWorked

		//copy layer sructure to Device mem

		cudaMemcpyToSymbol( "rdNet->rLayer", &rdlSource, sizeof(rdLayer), sizeof(rdLayer) * L, cudaMemcpyHostToDevice );/*! 2D neuron cx weight matrix on GPU */

			mCheckCudaWorked

	}

[...]	

	cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1) ); /*! 2D complex input tuples in GPU. */

		cudaMemcpy( &rdLearn->gpuXInputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );

			cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1), cudaMemcpyHostToDevice); 

			mCheckCudaWorked	

	cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1) ); /*! 2D desired complex outputs in GPU. */

		cudaMemcpy( &rdLearn->gpuDOutputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );

			cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1), cudaMemcpyHostToDevice); 

			mCheckCudaWorked

[...]

Unfortunately, the cudaMemcpyToSymbol call returns an error that the mCheckCudaWorked macro says is “invalid device symbol”, while the last (cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs…) and third-from-last (cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs…) cudaMemcpy calls return “invalid argument”.

I am at a loss as to how to proceed to get these items copied to device memory and addressable from kernel code. &dummy and &rdldummy are positively being returned as the pointers to the device memory addresses where the allocated memory awaits, and I can write those pointers to the device memory, but I cannot coax the bulk of the member values into being copied to the pointed-at allocations. Help?