Difficulties Copying Nested Structs to Device Mem: lots of cudaMalloc and cudaMemcpy
I'm trying to copy some nested structs to device memory for kernel use in a CUDA-accelerated neural network simulator. This code links and runs, but it throws some exceptions and CUDA errors:

[code]
// Per-layer descriptor. The snippet later in this post cudaMalloc's an array of
// these (sizeof(rdLayer) * LayerQty) and fills a host-side staging copy (rdlSource).
typedef struct rdLayer
{
long NeuronQty ; // assigned from the host layer's iNeuronQty in the copy loop
long DendriteQty ; // assigned from the host layer's iDendriteQty in the copy loop

// Each pointer below receives a device address from cudaMalloc in the copy loop;
// every allocation there is sized (dendrite count + 1) * (neuron count + 1)
// cuDoubleComplex elements.
cuDoubleComplex *gpuWeights ;
cuDoubleComplex *gpuZOutputs ;
cuDoubleComplex *gpuDeltas ;
cuDoubleComplex *gpuUnWeights ;
} rdLayer;

// Network-level descriptor. The snippet later in this post cudaMalloc's one
// instance of this struct (rdNet) and then patches its rLayer member.
typedef struct rdNetwork
{
long SectorQty;
double K_DIV_TWO_PI; // NOTE(review): meaning of K is not shown in this post — confirm
double two_pi_div_sect_qty; // NOTE(review): presumably (2*pi) / SectorQty — confirm
cuDoubleComplex *gpuSectorBdry; // allocation not shown in this post; presumably a device pointer — TODO confirm
long LayerQty; // the host-side LayerQty sizes the rdLayer array allocated in the snippet
rdLayer *rLayer; // receives the device address of the rdLayer array via a pointer-sized cudaMemcpy
} rdNetwork;

// Learning-set descriptor. The snippet later in this post cudaMalloc's one
// instance of this struct (rdLearn) and then patches its pointer members.
struct rdLearningSet
{
long EvalMode ;
long SampleQty ;
long InputQty ;
long OutputQty ;
long ContOutputs ;
long SampleIdxReq ;

// gpuXInputs: buffer allocated in the snippet as SampleQty * (InputQty+1) cuDoubleComplex elements.
cuDoubleComplex *gpuXInputs ;
// gpuDOutputs: buffer allocated in the snippet as SampleQty * (OutputQty+1) cuDoubleComplex elements.
cuDoubleComplex *gpuDOutputs ;
// The remaining pointers are not allocated in the visible excerpt — sizes unknown from here.
cuDoubleComplex *gpuYOutputs ;
double *gpudSE1024 ;
cuDoubleComplex *gpuOutScalar ;
};

[...]
// Host variables that hold DEVICE addresses once the cudaMalloc calls below fill them in.
struct rdLearningSet * rdLearn;
struct rdNetwork * rdNet;
[...]
// Reserve device memory for one rdNetwork and one rdLearningSet.
// The return codes of these two calls are not checked in this excerpt.
cudaMalloc(&rdNet, sizeof(rdNetwork));
cudaMalloc(&rdLearn, sizeof(rdLearningSet));
[...]
// dummy / rdldummy: host variables that receive device addresses from cudaMalloc.
// rdlSource: host-side staging copy of a single rdLayer.
cuDoubleComplex * dummy;
struct rdLayer rdlSource, * rdldummy;
[...]
// Allocate a device array of rdLayer and try to populate it one layer at a time.
// rSes is the host-side session object; its declaration is not shown in this post.
//rdLayer *rLayer;
// rdldummy receives the device address of an array of LayerQty rdLayer structs.
cudaMalloc(&rdldummy, sizeof(rdLayer)*rSes.rNet->LayerQty);
// Store that device address into the rLayer member of the device-resident rdNet.
// Only the pointer value (sizeof(rdLayer*)) is copied here.
cudaMemcpy( &rdNet->rLayer, &rdldummy, sizeof(rdLayer*), cudaMemcpyHostToDevice);
// NOTE(review): the loop starts at L=1, so element 0 of the device array is never written — confirm this is intended.
for (int L=1; L<rSes.rNet->LayerQty; L++){
// construct layer to be copied
// NOTE(review): the next two statements have no terminating ';' as posted.
// NOTE(review): the host layer is read via iNeuronQty/iDendriteQty here but via NeuronQty/DendriteQty
// in the first two cudaMalloc size expressions below — one spelling is presumably a typo; confirm.
rdlSource.NeuronQty=rSes.rNet->rLayer[L].iNeuronQty
rdlSource.DendriteQty=rSes.rNet->rLayer[L].iDendriteQty
// Four device buffers per layer; their addresses are written into the host staging struct rdlSource.
cudaMalloc( &rdlSource.gpuWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) )
mCheckCudaWorked
cudaMalloc( &rdlSource.gpuZOutputs, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) )
mCheckCudaWorked
cudaMalloc( &rdlSource.gpuDeltas, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) )
mCheckCudaWorked
cudaMalloc( &rdlSource.gpuUnWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) )
mCheckCudaWorked
//copy layer structure to Device mem
// NOTE(review): this is the call the post reports as "invalid device symbol". The first argument is a
// string holding a C expression, not the name of a __device__/__constant__ variable, and rdNet is a
// cudaMalloc'd allocation rather than a symbol. Presumably a plain cudaMemcpy to (rdldummy + L) was
// intended — confirm before changing.
cudaMemcpyToSymbol( "rdNet->rLayer", &rdlSource, sizeof(rdLayer), sizeof(rdLayer) * L, cudaMemcpyHostToDevice );/*! 2D neuron cx weight matrix on GPU */
mCheckCudaWorked
}
[...]
// Allocate the input buffer on the device; dummy (a host variable) receives its device address.
cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1) ); /*! 2D complex input tuples in GPU. */
// Write that device address into the gpuXInputs member of the device-resident rdLearn (pointer value only).
cudaMemcpy( &rdLearn->gpuXInputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );
// NOTE(review): this is one of the calls the post reports as "invalid argument". The destination
// &dummy is the host address of the pointer variable, not the device buffer it points to, and the
// source is the address of the host pointer member rather than the data it points to. Presumably
// `dummy` and `rSes.rLearn->gpuXInputs` were intended — confirm the latter is a host buffer.
cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1), cudaMemcpyHostToDevice);
mCheckCudaWorked
// Same three-step pattern for the desired-output buffer; dummy is reused, so the previous
// device address now lives only in rdLearn->gpuXInputs on the device.
cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1) ); /*! 2D desired complex outputs in GPU. */
cudaMemcpy( &rdLearn->gpuDOutputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );
// NOTE(review): same &dummy / &member issue as above; this is the other call reported as "invalid argument".
cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1), cudaMemcpyHostToDevice);
mCheckCudaWorked
[...]
[/code]

Unfortunately, the cudaMemcpyToSymbol call returns an error that the mCheckCudaWorked macro reports as "invalid device symbol", while the last cudaMemcpy call (cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs, ...)) and the third-from-last one (cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs, ...)) both return "invalid argument".

I am at a loss as to how to proceed to get these items copied to device memory and addressable from kernel code. &dummy and &rdldummy are positively being returned as the pointers to the device memory addresses where the allocated memory awaits, and I can write those pointers to the device memory, but I cannot coax the bulk of the member values into being copied to the pointed-at allocations. Help?
I'm trying to copy some nested structs to device memory for kernel use in a CUDA-accelerated neural network simulator. This code links and runs, but it throws some exceptions and CUDA errors:





// Per-layer descriptor. The snippet later in this post cudaMalloc's an array of
// these (sizeof(rdLayer) * LayerQty) and fills a host-side staging copy (rdlSource).
typedef struct rdLayer

{

long NeuronQty ; // assigned from the host layer's iNeuronQty in the copy loop

long DendriteQty ; // assigned from the host layer's iDendriteQty in the copy loop



// Each pointer below receives a device address from cudaMalloc in the copy loop;
// every allocation there is sized (dendrite count + 1) * (neuron count + 1)
// cuDoubleComplex elements.
cuDoubleComplex *gpuWeights ;

cuDoubleComplex *gpuZOutputs ;

cuDoubleComplex *gpuDeltas ;

cuDoubleComplex *gpuUnWeights ;

} rdLayer;



// Network-level descriptor. The snippet later in this post cudaMalloc's one
// instance of this struct (rdNet) and then patches its rLayer member.
typedef struct rdNetwork

{

long SectorQty;

double K_DIV_TWO_PI; // NOTE(review): meaning of K is not shown in this post — confirm

double two_pi_div_sect_qty; // NOTE(review): presumably (2*pi) / SectorQty — confirm

cuDoubleComplex *gpuSectorBdry; // allocation not shown in this post; presumably a device pointer — TODO confirm

long LayerQty; // the host-side LayerQty sizes the rdLayer array allocated in the snippet

rdLayer *rLayer; // receives the device address of the rdLayer array via a pointer-sized cudaMemcpy

} rdNetwork;



// Learning-set descriptor. The snippet later in this post cudaMalloc's one
// instance of this struct (rdLearn) and then patches its pointer members.
struct rdLearningSet

{

long EvalMode ;

long SampleQty ;

long InputQty ;

long OutputQty ;

long ContOutputs ;

long SampleIdxReq ;



// gpuXInputs: buffer allocated in the snippet as SampleQty * (InputQty+1) cuDoubleComplex elements.
cuDoubleComplex *gpuXInputs ;

// gpuDOutputs: buffer allocated in the snippet as SampleQty * (OutputQty+1) cuDoubleComplex elements.
cuDoubleComplex *gpuDOutputs ;

// The remaining pointers are not allocated in the visible excerpt — sizes unknown from here.
cuDoubleComplex *gpuYOutputs ;

double *gpudSE1024 ;

cuDoubleComplex *gpuOutScalar ;

};



[...]

// Host variables that hold DEVICE addresses once the cudaMalloc calls below fill them in.
struct rdLearningSet * rdLearn;

struct rdNetwork * rdNet;

[...]

// Reserve device memory for one rdNetwork and one rdLearningSet.
// The return codes of these two calls are not checked in this excerpt.
cudaMalloc(&rdNet, sizeof(rdNetwork));

cudaMalloc(&rdLearn, sizeof(rdLearningSet));

[...]

// dummy / rdldummy: host variables that receive device addresses from cudaMalloc.
// rdlSource: host-side staging copy of a single rdLayer.
cuDoubleComplex * dummy;

struct rdLayer rdlSource, * rdldummy;

[...]

// Allocate a device array of rdLayer and try to populate it one layer at a time.
// rSes is the host-side session object; its declaration is not shown in this post.
//rdLayer *rLayer;

// rdldummy receives the device address of an array of LayerQty rdLayer structs.
cudaMalloc(&rdldummy, sizeof(rdLayer)*rSes.rNet->LayerQty);

// Store that device address into the rLayer member of the device-resident rdNet.
// Only the pointer value (sizeof(rdLayer*)) is copied here.
cudaMemcpy( &rdNet->rLayer, &rdldummy, sizeof(rdLayer*), cudaMemcpyHostToDevice);

// NOTE(review): the loop starts at L=1, so element 0 of the device array is never written — confirm this is intended.
for (int L=1; L<rSes.rNet->LayerQty; L++){

// construct layer to be copied

// NOTE(review): the next two statements have no terminating ';' as posted.
// NOTE(review): the host layer is read via iNeuronQty/iDendriteQty here but via NeuronQty/DendriteQty
// in the first two cudaMalloc size expressions below — one spelling is presumably a typo; confirm.
rdlSource.NeuronQty=rSes.rNet->rLayer[L].iNeuronQty

rdlSource.DendriteQty=rSes.rNet->rLayer[L].iDendriteQty

// Four device buffers per layer; their addresses are written into the host staging struct rdlSource.
cudaMalloc( &rdlSource.gpuWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) )

mCheckCudaWorked

cudaMalloc( &rdlSource.gpuZOutputs, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].DendriteQty+1) * (rSes.rNet->rLayer[L].NeuronQty+1) )

mCheckCudaWorked

cudaMalloc( &rdlSource.gpuDeltas, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) )

mCheckCudaWorked

cudaMalloc( &rdlSource.gpuUnWeights, sizeof(cuDoubleComplex) * (rSes.rNet->rLayer[L].iDendriteQty+1) * (rSes.rNet->rLayer[L].iNeuronQty+1) )

mCheckCudaWorked

//copy layer structure to Device mem

// NOTE(review): this is the call the post reports as "invalid device symbol". The first argument is a
// string holding a C expression, not the name of a __device__/__constant__ variable, and rdNet is a
// cudaMalloc'd allocation rather than a symbol. Presumably a plain cudaMemcpy to (rdldummy + L) was
// intended — confirm before changing.
cudaMemcpyToSymbol( "rdNet->rLayer", &rdlSource, sizeof(rdLayer), sizeof(rdLayer) * L, cudaMemcpyHostToDevice );/*! 2D neuron cx weight matrix on GPU */

mCheckCudaWorked

}

[...]

// Allocate the input buffer on the device; dummy (a host variable) receives its device address.
cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1) ); /*! 2D complex input tuples in GPU. */

// Write that device address into the gpuXInputs member of the device-resident rdLearn (pointer value only).
cudaMemcpy( &rdLearn->gpuXInputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );

// NOTE(review): this is one of the calls the post reports as "invalid argument". The destination
// &dummy is the host address of the pointer variable, not the device buffer it points to, and the
// source is the address of the host pointer member rather than the data it points to. Presumably
// `dummy` and `rSes.rLearn->gpuXInputs` were intended — confirm the latter is a host buffer.
cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->InputQty+1), cudaMemcpyHostToDevice);

mCheckCudaWorked

// Same three-step pattern for the desired-output buffer; dummy is reused, so the previous
// device address now lives only in rdLearn->gpuXInputs on the device.
cudaMalloc(&dummy, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1) ); /*! 2D desired complex outputs in GPU. */

cudaMemcpy( &rdLearn->gpuDOutputs, &dummy, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice );

// NOTE(review): same &dummy / &member issue as above; this is the other call reported as "invalid argument".
cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs, sizeof(cuDoubleComplex) * (rSes.rLearn->SampleQty) * (rSes.rLearn->OutputQty+1), cudaMemcpyHostToDevice);

mCheckCudaWorked

[...]




Unfortunately, the cudaMemcpyToSymbol call returns an error that the mCheckCudaWorked macro reports as "invalid device symbol", while the last cudaMemcpy call (cudaMemcpy( &dummy, &rSes.rLearn->gpuDOutputs, ...)) and the third-from-last one (cudaMemcpy( &dummy, &rSes.rLearn->gpuXInputs, ...)) both return "invalid argument".



I am at a loss as to how to proceed to get these items copied to device memory and addressable from kernel code. &dummy and &rdldummy are positively being returned as the pointers to the device memory addresses where the allocated memory awaits, and I can write those pointers to the device memory, but I cannot coax the bulk of the member values into being copied to the pointed-at allocations. Help?

Home: Asus mobo Pentium 4 w/ GTS550 Ti

Work: Asus mobo Core2Duo w/ GTS450 x2

Work: HP Pavilion Elite w/ Tesla C2075

64-bit Windows 7 Professional

CUDA 4.2, NSight 2.2.0.12132, VS 2008 SP1

#1
Posted 10/08/2011 01:34 AM   
Scroll To Top