Hello,
(I am not ashamed to ask this question again.)
Is it possible to allocate global memory dynamically inside the CUDA kernel?
If it is possible, to what extent is it good practice? (Is it efficient?)
Thank you.
Hi — yes, it is possible. We use this to perform a reduction inside the kernel, like this:
// Generated DVMH reduction kernel.
//
// Each block allocates one heap buffer `s` (thread 0 calls in-kernel `new[]`,
// then publishes the pointer to the whole block through the __shared__ slot
// __addr_s), accumulates partial sums of `a` into it, reduces s/s1/s2/mm
// across the block with the __dvmh_blockReduce* helpers, and writes one
// partial result per warp into the *_grid arrays for a later combining pass.
//
// NOTE(review): in-kernel new[]/delete[] requires compute capability 2.0+
// and a sufficiently large device heap (cudaDeviceSetLimit with
// cudaLimitMallocHeapSize); `new[]` returning NULL on heap exhaustion is
// not checked here — confirm the host configures the heap accordingly.
// Fix applied: the buffer is created with `new float[...]`, so it must be
// released with `delete [] s;` — the original scalar `delete s;` was
// undefined behavior.
__global__ void loop_red_arn_47_cuda_kernel(float a[], CudaIndexType a0004, CudaIndexType a0003, CudaIndexType a0002, DvmType dim1_s, DvmType dim2_s, DvmType dim3_s, float s_grid[], float *s_init, float s1__1, float s1__2, float s1__3, float s1__4, float s1__5, float s1__6, float s1__7, float s1__8, float s1__9, float s1_grid[], DvmType dim1_s2, float s2_grid[], float *s2_init, float mm, float mm_grid[], CudaIndexType blocks[], int red_count, DvmType overall_blocks, long int n2, long int n1){
    // Private variables
    long int k2;
    long int k1;
    long int cond_1;
    long int _k1;
    long int cond_0;
    long int _k2;
    // Local needs
    CudaIndexType j, i;
    int ibof;
    float s2[17];
    float s1[9];
    int k_k3;
    int k_k2;
    int k_k1;
    float *s;
    // Shared slot used to broadcast the heap pointer from thread 0 to the
    // rest of the block (pointer is smuggled through a DvmType integer).
    __shared__ DvmType __addr_s;
    // Allocate memory for reduction: one buffer per block, sized for
    // dim1_s*dim2_s*dim3_s elements per thread, allocated by thread 0 only
    // so the device-heap overhead is paid once per block.
    i = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
    if (i == 0)
    {
        s = new float [dim1_s * dim2_s * dim3_s * blockDim.x * blockDim.y * blockDim.z];
        __addr_s = (DvmType &)s;
    }
    // Barrier so every thread sees the published pointer before using it.
    __syncthreads();
    s = (float *)__addr_s;
    // Each thread owns a strided slice of the block buffer: element k of this
    // thread lives at s[k * blockDim.x*blockDim.y*blockDim.z] after this bias.
    s = s + i;
    // Fill local variable with passed values.
    // NOTE(review): s_init is read with the same per-thread strided index as
    // s — confirm s_init's layout really is per-thread-strided and not a
    // plain dim1_s*dim2_s*dim3_s vector.
    for (k_k3 = 0 ; k_k3 < dim3_s ; k_k3 = k_k3 + 1)
    {
        for (k_k2 = 0 ; k_k2 < dim2_s ; k_k2 = k_k2 + 1)
        {
            for (k_k1 = 0 ; k_k1 < dim1_s ; k_k1 = k_k1 + 1)
            {
                s[(k_k1 + k_k2 * dim1_s + k_k3 * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)] = s_init[(k_k1 + k_k2 * dim1_s + k_k3 * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)];
            }
        }
    }
    // Seed the fixed-size reduction arrays from the scalar arguments.
    s1[0] = s1__1;
    s1[1] = s1__2;
    s1[2] = s1__3;
    s1[3] = s1__4;
    s1[4] = s1__5;
    s1[5] = s1__6;
    s1[6] = s1__7;
    s1[7] = s1__8;
    s1[8] = s1__9;
    for (k_k1 = 0 ; k_k1 < dim1_s2 ; k_k1 = k_k1 + 1)
    {
        s2[k_k1] = s2_init[k_k1];
    }
    // Calculate each thread's loop variables' values: `blocks` holds
    // [j_lo, j_hi, i_lo, i_hi] quadruples, one per CUDA block.
    ibof = blockIdx.x * 4;
    j = blocks[ibof + 0] + threadIdx.y;
    if (j <= blocks[ibof + 1])
    {
        i = blocks[ibof + 2] + threadIdx.x;
        if (i <= blocks[ibof + 3])
        {
            // Loop body: accumulate a[k1, k2, i, j] into this thread's slice.
            // NOTE(review): the (0 - 1) * (dim1_s * dim2_s) term biases the
            // index by a negative plane offset — presumably matches the
            // generator's 1-based k3 convention; verify against the source loop.
            for (k2 = 1, cond_0 = abs(1 - n2) + abs(1), _k2 = 0 ; _k2 < cond_0 ; k2 = k2 + 1, _k2 = _k2 + 1)
            {
                for (k1 = 1, cond_1 = abs(1 - n1) + abs(1), _k1 = 0 ; _k1 < cond_1 ; k1 = k1 + 1, _k1 = _k1 + 1)
                {
                    s[(k1 - 1 + (k2 - 1) * dim1_s + (0 - 1) * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)] = s[(k1 - 1 + (k2 - 1) * dim1_s + (0 - 1) * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)] + a[k1 + a0004 * k2 + a0003 * i + a0002 * j];
                }
            }
        }
    }
    // Reduction: block-wide sum/max/min via project helpers, then lane 0 of
    // each warp stores its warp's partial result to the *_grid arrays.
    i = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
    __dvmh_blockReduceSumN(s, blockDim.x * blockDim.y * blockDim.z, dim1_s * dim2_s * dim3_s);
    __dvmh_blockReduceMaxN<float, 9 >(s1);
    __dvmh_blockReduceMinN<float, 17 >(s2);
    mm = __dvmh_blockReduceSum(mm);
    if (i % warpSize == 0)
    {
        mm_grid[blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = mm;
        for (k_k3 = 0 ; k_k3 < dim3_s ; k_k3 = k_k3 + 1)
        {
            for (k_k2 = 0 ; k_k2 < dim2_s ; k_k2 = k_k2 + 1)
            {
                for (k_k1 = 0 ; k_k1 < dim1_s ; k_k1 = k_k1 + 1)
                {
                    s_grid[overall_blocks * (k_k1 + k_k2 * dim1_s + k_k3 * (dim1_s * dim2_s)) + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s[(k_k1 + k_k2 * dim1_s + k_k3 * (dim1_s * dim2_s)) * (blockDim.x * blockDim.y * blockDim.z)];
                }
            }
        }
        s1_grid[overall_blocks * 0 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[0];
        s1_grid[overall_blocks * 1 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[1];
        s1_grid[overall_blocks * 2 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[2];
        s1_grid[overall_blocks * 3 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[3];
        s1_grid[overall_blocks * 4 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[4];
        s1_grid[overall_blocks * 5 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[5];
        s1_grid[overall_blocks * 6 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[6];
        s1_grid[overall_blocks * 7 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[7];
        s1_grid[overall_blocks * 8 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s1[8];
        s2_grid[overall_blocks * 0 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[0];
        s2_grid[overall_blocks * 1 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[1];
        s2_grid[overall_blocks * 2 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[2];
        s2_grid[overall_blocks * 3 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[3];
        s2_grid[overall_blocks * 4 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[4];
        s2_grid[overall_blocks * 5 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[5];
        s2_grid[overall_blocks * 6 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[6];
        s2_grid[overall_blocks * 7 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[7];
        s2_grid[overall_blocks * 8 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[8];
        s2_grid[overall_blocks * 9 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[9];
        s2_grid[overall_blocks * 10 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[10];
        s2_grid[overall_blocks * 11 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[11];
        s2_grid[overall_blocks * 12 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[12];
        s2_grid[overall_blocks * 13 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[13];
        s2_grid[overall_blocks * 14 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[14];
        s2_grid[overall_blocks * 15 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[15];
        s2_grid[overall_blocks * 16 + blockIdx.x * (blockDim.x * blockDim.y * blockDim.z / warpSize) + i / warpSize] = s2[16];
    }
    // Deallocate memory for reduction: barrier first so no thread is still
    // reading the buffer, then thread 0 frees it. Must be delete[] to match
    // the array new[] above (the original scalar delete was UB).
    __syncthreads();
    if (i == 0)
    {
        delete [] s;
    }
}
Allocating the memory once per block gives the best performance, but you must understand that it still incurs overhead.
Dynamic memory allocation is also covered in the CUDA C Programming Guide:
Both malloc/free and new/delete are natively supported on devices of compute capability 2.0 and greater.