Does my kernel use coalesced memory access?
Hi,
I'm trying to get the best performance when using DX11 Dispatch.
I have 100x100 mesh data and I'm dispatching 4x4 thread groups
Dispatch(4,4,1);
with 32x32 threads per group.
I do not use groupshared memory

How can I ensure that I get coalesced memory access? This is a very simple compute kernel.
I'm guessing around how to ensure that memory access is aligned and coalesced.
All the GPU tools I found (Intel, Nvidia, AMD) present me a GPU time in microseconds. That's nice, but I have no idea if the kernel has aligned and coalesced memory access.
The AMD Shaderanalyzer presents me a Throughput value in GB/s but no matter how I access the memory the throughput from the
Shaderanalyzer stays the same.

I know your CUDA and OpenCL examples where each shader gets tested separately and the commandline tool tells things like bandwidth and coalesced access.
Is there something similar for DX11?

Do I need
GroupMemoryBarrierWithGroupSync();
here to ensure that all threads access the memory at the same time to get coalesced memory access, or is this only required to sync groupshared memory?

GroupMemoryBarrierWithGroupSync();
BufferOut.Store4(
offsetout * 4, // destination offset in bytes
write4);

GroupMemoryBarrierWithGroupSync();
// index 4
BufferOut.Store2(
(offsetout+4) * 4, // destination offset in bytes
write2);


// Thread-group dimensions. groupthreads_x and groupthreads_y feed the
// [numthreads] attribute of main; groupthreads_z is defined for completeness,
// but the attribute hard-codes its third component to 1.
#define groupthreads_x 32
#define groupthreads_y 32
#define groupthreads_z 1
// The same three dimensions packed into a vector for use in index arithmetic.
static const int3 groupthreads = { groupthreads_x,groupthreads_y,groupthreads_z };

// Writes the six triangle-list indices (two triangles) for one quad of a
// g_nMeshWidth x g_nMeshHeight vertex grid. Each thread handles one quad.
//
// Gid  : thread-group id within the dispatch (not needed; kept for interface)
// Gtid : thread id within its group (not needed; kept for interface)
// dtid : global thread id; dtid.x / dtid.y select the quad's column / row
// GI   : flattened thread index within the group (not needed; kept for interface)
[numthreads(groupthreads_x, groupthreads_y, 1)]
void main( uint3 Gid : SV_GroupID,
uint3 Gtid : SV_GroupThreadID,
uint3 dtid : SV_DispatchThreadID,
uint GI : SV_GroupIndex )
{
    // SV_DispatchThreadID is defined as SV_GroupID * numthreads + SV_GroupThreadID,
    // so it is the global xy coordinate without rebuilding it from GI.
    const uint2 tid = dtid.xy;

    // A W x H vertex grid contains (W-1) x (H-1) quads. The test must be done
    // per axis: a flattened "index < W*H" test lets threads with tid.x >= W-1
    // through on the low rows, where they write wrapped indices into slots that
    // belong to other quads, and lets the last row reference vertices past the
    // end of the grid. "+ 1 <" avoids unsigned wrap-around if a dimension is 0.
    if (tid.x + 1 < g_nMeshWidth && tid.y + 1 < g_nMeshHeight)
    {
        // Quad index in the (W-1)-wide quad grid.
        const uint nFace = tid.x + tid.y * (g_nMeshWidth - 1);

        // Vertex at this quad's (x, y) corner and the vertex one row further.
        const uint xi  = tid.x + tid.y * g_nMeshWidth;
        const uint xmi = xi + g_nMeshWidth;

        // Six indices per quad, offset by g_nOffsetOutput (counted in uints).
        const uint offsetout = g_nOffsetOutput + nFace * 6;

        // Write 6 uints by writing 4 + 2:
        //   triangle 0 = (xi, xi+1, xmi), triangle 1 = (xmi, xi+1, xmi+1)
        const uint4 write4 = uint4(xi, xi + 1, xmi, xmi);
        const uint2 write2 = uint2(xi + 1, xmi + 1);

        // Indices 0..3. Store4 takes a byte offset, hence the * 4.
        BufferOut.Store4(offsetout * 4, write4);

        // Indices 4..5.
        BufferOut.Store2((offsetout + 4) * 4, write2);
    }
}
Hi,

I'm trying to get the best performance when using DX11 Dispatch.

I have 100x100 mesh data and I'm dispatching 4x4 thread groups

Dispatch(4,4,1);

with 32x32 threads per group.

I do not use groupshared memory



How can I ensure that I get coalesced memory access? This is a very simple compute kernel.

I'm guessing around how to ensure that memory access is aligned and coalesced.

All the GPU tools I found (Intel, Nvidia, AMD) present me a GPU time in microseconds. That's nice, but I have no idea if the kernel has aligned and coalesced memory access.

The AMD Shaderanalyzer presents me a Throughput value in GB/s but no matter how I access the memory the throughput from the

Shaderanalyzer stays the same.



I know your CUDA and OpenCL examples where each shader gets tested separately and the commandline tool tells things like bandwidth and coalesced access.

Is there something similar for DX11?



Do I need

GroupMemoryBarrierWithGroupSync();

here to ensure that all threads access the memory at the same time to get coalesced memory access, or is this only required to sync groupshared memory?



GroupMemoryBarrierWithGroupSync();

BufferOut.Store4(

offsetout * 4, // destination offset in bytes

write4);



GroupMemoryBarrierWithGroupSync();

// index 4

BufferOut.Store2(

(offsetout+4) * 4, // destination offset in bytes

write2);





// Thread-group dimensions. groupthreads_x and groupthreads_y feed the
// [numthreads] attribute of main; groupthreads_z is defined for completeness,
// but the attribute hard-codes its third component to 1.
#define groupthreads_x 32

#define groupthreads_y 32

#define groupthreads_z 1

// The same three dimensions packed into a vector for use in index arithmetic.
static const int3 groupthreads = { groupthreads_x,groupthreads_y,groupthreads_z };



// Writes the six triangle-list indices (two triangles) for one quad of a
// g_nMeshWidth x g_nMeshHeight vertex grid. Each thread handles one quad.
//
// Gid  : thread-group id within the dispatch (not needed; kept for interface)
// Gtid : thread id within its group (not needed; kept for interface)
// dtid : global thread id; dtid.x / dtid.y select the quad's column / row
// GI   : flattened thread index within the group (not needed; kept for interface)
[numthreads(groupthreads_x, groupthreads_y, 1)]
void main( uint3 Gid : SV_GroupID,
uint3 Gtid : SV_GroupThreadID,
uint3 dtid : SV_DispatchThreadID,
uint GI : SV_GroupIndex )
{
    // SV_DispatchThreadID is defined as SV_GroupID * numthreads + SV_GroupThreadID,
    // so it is the global xy coordinate without rebuilding it from GI.
    const uint2 tid = dtid.xy;

    // A W x H vertex grid contains (W-1) x (H-1) quads. The test must be done
    // per axis: a flattened "index < W*H" test lets threads with tid.x >= W-1
    // through on the low rows, where they write wrapped indices into slots that
    // belong to other quads, and lets the last row reference vertices past the
    // end of the grid. "+ 1 <" avoids unsigned wrap-around if a dimension is 0.
    if (tid.x + 1 < g_nMeshWidth && tid.y + 1 < g_nMeshHeight)
    {
        // Quad index in the (W-1)-wide quad grid.
        const uint nFace = tid.x + tid.y * (g_nMeshWidth - 1);

        // Vertex at this quad's (x, y) corner and the vertex one row further.
        const uint xi  = tid.x + tid.y * g_nMeshWidth;
        const uint xmi = xi + g_nMeshWidth;

        // Six indices per quad, offset by g_nOffsetOutput (counted in uints).
        const uint offsetout = g_nOffsetOutput + nFace * 6;

        // Write 6 uints by writing 4 + 2:
        //   triangle 0 = (xi, xi+1, xmi), triangle 1 = (xmi, xi+1, xmi+1)
        const uint4 write4 = uint4(xi, xi + 1, xmi, xmi);
        const uint2 write2 = uint2(xi + 1, xmi + 1);

        // Indices 0..3. Store4 takes a byte offset, hence the * 4.
        BufferOut.Store4(offsetout * 4, write4);

        // Indices 4..5.
        BufferOut.Store2((offsetout + 4) * 4, write2);
    }
}

#1
Posted 04/11/2011 01:01 PM   
Scroll To Top