Hello,
NvInterlockedAddFp32 has no effect. Here is a bare-bones reproduction (I can send the vcxproj and sln by email if desired).
In this repro, bufferContent SHOULD be equal to 320.0 (10 thread groups of 32 threads, each adding 1.0), but it is 0.
I’m on a 64-bit Windows 10 laptop with a GTX 980M.
Can you tell me what’s wrong, please?
Thanks
#include <SDKDDKVer.h>
#include <d3d11.h>
#include <d3dcompiler.h>
#include <stdio.h>
#include "nvapi.h"
#include "nvShaderExtnEnums.h"
#if 1
// Exported symbol read by the NVIDIA Optimus driver: a value of 1 asks it to run this
// process on the high-performance GPU on laptops with both integrated and dedicated GPUs.
extern "C"
{
    __declspec( dllexport ) DWORD NvOptimusEnablement = 0x00000001;
}
#endif
// Global D3D11 objects shared by the init functions and main(). All start out null.
ID3D11Device*              g_pDevice        = nullptr; // created by InitD3D()
ID3D11DeviceContext*       g_pDevCtx        = nullptr; // immediate context, created by InitD3D()
ID3D11ComputeShader*       g_pTestCS        = nullptr; // created by LoadTestShader()
ID3D11Buffer*              g_pBuffer        = nullptr; // GPU buffer the shader writes to
ID3D11Buffer*              g_pStagingBuffer = nullptr; // CPU-readable copy target for g_pBuffer
ID3D11UnorderedAccessView* g_pUAV           = nullptr; // raw UAV over g_pBuffer
// HLSL source of the test compute shader, kept as a single raw string literal.
// Each thread issues one NvInterlockedAddFp32 of 1.0f at byte offset 0 of g_TestBuffer (u0);
// NV_SHADER_EXTN_SLOT is defined as u2 before nvHLSLExtns.h is included.
const char* g_pTestCSSource = R"hlsl(#define NV_SHADER_EXTN_SLOT u2
#include "nvHLSLExtns.h"

RWByteAddressBuffer g_TestBuffer : register(u0);

[numthreads( 32, 1, 1 )]
void MainCS( uint _id : SV_DispatchThreadID )
{
 NvInterlockedAddFp32( g_TestBuffer, 0, 1.0f );
}
)hlsl";
// Include handler passed to D3DCompile. It resolves an include request by reading the
// named file from disk into a heap buffer, and frees that buffer again in Close().
class MyIncludeHandler : public ID3DInclude
{
public:
    // Reads the whole file pFileName (opened as given, in binary mode) into memory.
    // On success stores the buffer in *ppData and its size in *pBytes and returns S_OK;
    // the buffer must later be handed back to Close(). Returns E_FAIL if the file cannot
    // be opened, its size cannot be determined, or it cannot be read completely.
    // IncludeType and pParentData are not used.
    STDMETHOD( Open )(THIS_ D3D_INCLUDE_TYPE IncludeType, LPCSTR pFileName, LPCVOID pParentData, LPCVOID *ppData, UINT *pBytes)
    {
        FILE* pFile = fopen( pFileName, "rb" );
        if( !pFile )
            return E_FAIL;

        // Determine the file size; ftell() returns -1 on failure, which must never
        // reach the allocation below.
        long fileSize = -1;
        if( fseek( pFile, 0, SEEK_END ) == 0 )
            fileSize = ftell( pFile );
        if( fileSize < 0 || fseek( pFile, 0, SEEK_SET ) != 0 )
        {
            fclose( pFile );
            return E_FAIL;
        }

        const size_t byteCount = static_cast<size_t>( fileSize );
        char* data = new char[ byteCount ];
        const size_t bytesRead = fread( data, 1, byteCount, pFile );
        fclose( pFile );
        if( bytesRead != byteCount )
        {
            delete[] data;
            return E_FAIL;
        }

        *ppData = data;
        *pBytes = static_cast<UINT>( byteCount );
        return S_OK;
    }

    // Frees a buffer previously returned through Open()'s ppData.
    STDMETHOD( Close )(THIS_ LPCVOID pData)
    {
        // The buffer was allocated as char[]; deleting through a void pointer is
        // undefined behaviour, so restore the element type first.
        delete[] static_cast<const char*>( pData );
        return S_OK;
    }
};
bool InitD3D()
{
if( NvAPI_Initialize() != NVAPI_OK )
return false;
UINT flags = D3D11_CREATE_DEVICE_SINGLETHREADED;
#ifdef _DEBUG
flags |= D3D11_CREATE_DEVICE_DEBUG;
//flags |= D3D11_CREATE_DEVICE_DEBUGGABLE;
#endif
if( FAILED( D3D11CreateDevice( NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, flags, NULL, 0, D3D11_SDK_VERSION, &g_pDevice, NULL, &g_pDevCtx ) ) )
return false;
bool bAtomicAddSupported = false;
NvAPI_Status NvapiStatus = NvAPI_D3D11_IsNvShaderExtnOpCodeSupported( g_pDevice, NV_EXTN_OP_FP32_ATOMIC, &bAtomicAddSupported );
if( (NvapiStatus != NVAPI_OK) || !bAtomicAddSupported )
return false;
return true;
}
bool InitUAV()
{
D3D11_BUFFER_DESC bufferDesc;
bufferDesc.ByteWidth = sizeof( float );
bufferDesc.Usage = D3D11_USAGE_DEFAULT;
bufferDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
bufferDesc.CPUAccessFlags = 0;
bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
bufferDesc.StructureByteStride = 0;
if( FAILED( g_pDevice->CreateBuffer( &bufferDesc, nullptr, &g_pBuffer ) ) )
return false;
bufferDesc.ByteWidth = sizeof( float );
bufferDesc.Usage = D3D11_USAGE_STAGING;
bufferDesc.BindFlags = 0;
bufferDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
bufferDesc.MiscFlags = 0;
bufferDesc.StructureByteStride = 0;
if( FAILED( g_pDevice->CreateBuffer( &bufferDesc, nullptr, &g_pStagingBuffer ) ) )
return false;
D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc;
uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
uavDesc.Buffer.FirstElement = 0;
uavDesc.Buffer.NumElements = 1;
uavDesc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
if( FAILED( g_pDevice->CreateUnorderedAccessView( g_pBuffer, &uavDesc, &g_pUAV ) ) )
return false;
return true;
}
bool LoadTestShader()
{
DWORD shaderFlags = D3DCOMPILE_ENABLE_STRICTNESS;
#if defined( DEBUG ) || defined( _DEBUG )
shaderFlags |= D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION;
#endif
MyIncludeHandler includeHandler;
ID3DBlob* pCompiledShader;
ID3DBlob* pErrorBlob;
HRESULT hr = D3DCompile( g_pTestCSSource, strlen( g_pTestCSSource ) + 1, "testCS", NULL, &includeHandler, "MainCS", "cs_5_0",
shaderFlags, 0, &pCompiledShader, &pErrorBlob );
if( pErrorBlob )
{
OutputDebugStringA( (char*)pErrorBlob->GetBufferPointer() );
pErrorBlob->Release();
}
if( FAILED( hr ) )
return false;
if( NvAPI_D3D11_SetNvShaderExtnSlot( g_pDevice, 2 ) != NVAPI_OK )
return false;
if( FAILED( g_pDevice->CreateComputeShader( pCompiledShader->GetBufferPointer(), pCompiledShader->GetBufferSize(), NULL, &g_pTestCS ) ) )
return false;
NvAPI_D3D11_SetNvShaderExtnSlot( g_pDevice, ~0u );
pCompiledShader->Release();
return true;
}
int main()
{
if( !InitD3D() )
return -1;
if( !InitUAV() )
return -1;
if( !LoadTestShader() )
return -1;
// while( true )
{
g_pDevCtx->CSSetShader( g_pTestCS, NULL, 0 );
ID3D11UnorderedAccessView* uavs[] = { g_pUAV };
g_pDevCtx->CSSetUnorderedAccessViews( 0, 1, uavs, NULL );
g_pDevCtx->Dispatch( 10, 1, 1 );
g_pDevCtx->Flush();
}
//Retrieve buffer content from GPU
g_pDevCtx->CopyResource( g_pStagingBuffer, g_pBuffer );
float bufferContent;
D3D11_MAPPED_SUBRESOURCE mappedResource;
g_pDevCtx->Map( g_pStagingBuffer, 0, D3D11_MAP_READ, 0, &mappedResource );
bufferContent = *(float*)mappedResource.pData;
g_pDevCtx->Unmap( g_pStagingBuffer, 0 );
return 0;
}