Failing to use NvInterlockedAddFp32

Hello,

NvInterlockedAddFp32 has no effect. Here’s a bare-bones reproduction (I can send the vcxproj and sln by email if desired).
In this repro, bufferContent SHOULD be equal to 320.0 (10 thread groups × 32 threads, each adding 1.0), but it is 0.

I’m on a 64-bit Windows 10 laptop with a GTX 980M.

Can you tell me what’s wrong, please?

Thanks

#include <SDKDDKVer.h>
#include <d3d11.h>
#include <d3dcompiler.h>
#include <stdio.h>

#include "nvapi.h"
#include "nvShaderExtnEnums.h"

#if 1
// Directs the NVIDIA Optimus driver to select the high-performance GPU on laptops
// with both integrated and dedicated GPUs. The symbol is exported with C linkage.
// __declspec is the documented spelling; the single-underscore _declspec form is
// only a legacy synonym and is rejected when language extensions are disabled.
extern "C" {
	__declspec(dllexport) DWORD NvOptimusEnablement = 0x00000001;
}
#endif

// Global D3D11 objects shared by the setup functions and main().
// Device and immediate context: both filled in by D3D11CreateDevice in InitD3D().
ID3D11Device*              g_pDevice        = nullptr;
ID3D11DeviceContext*       g_pDevCtx        = nullptr;
// Compute shader built from g_pTestCSSource in LoadTestShader().
ID3D11ComputeShader*       g_pTestCS        = nullptr;
// Raw buffer holding one float (written through g_pUAV), and the CPU-readable
// staging buffer it is copied into for read-back. Both created in InitUAV().
ID3D11Buffer*              g_pBuffer        = nullptr;
ID3D11Buffer*              g_pStagingBuffer = nullptr;
// Raw unordered-access view over g_pBuffer, created in InitUAV().
ID3D11UnorderedAccessView* g_pUAV           = nullptr;

// HLSL source of the test compute shader, compiled at runtime by LoadTestShader()
// with entry point "MainCS" and target "cs_5_0".
//  - NV_SHADER_EXTN_SLOT is defined as u2 before including nvHLSLExtns.h; the host
//    side passes the same slot number (2) to NvAPI_D3D11_SetNvShaderExtnSlot.
//  - g_TestBuffer is a raw (byte-address) UAV bound at u0.
//  - Thread groups are 32x1x1; every thread issues NvInterlockedAddFp32 with
//    value 1.0f at byte offset 0 of g_TestBuffer.
const char* g_pTestCSSource =
"#define NV_SHADER_EXTN_SLOT u2\n"
"#include \"nvHLSLExtns.h\"\n"
"\n"
"RWByteAddressBuffer g_TestBuffer : register(u0);\n"
"\n"
"[numthreads( 32, 1, 1 )]\n"
"void MainCS( uint _id : SV_DispatchThreadID )\n"
"{\n"
"	NvInterlockedAddFp32( g_TestBuffer, 0, 1.0f );\n"
"}\n";

// Include handler passed to D3DCompile. Each #include is resolved by opening the
// requested file name with fopen() (so relative to the process working directory)
// and handing the whole file content back to the compiler.
class MyIncludeHandler : public ID3DInclude
{
public:
	// Loads the file named pFileName into a heap buffer.
	//   IncludeType, pParentData : ignored.
	//   ppData : receives the buffer (allocated with new char[]; freed by Close()).
	//   pBytes : receives the buffer size in bytes.
	// Returns S_OK on success, E_FAIL if the file cannot be opened, sized or fully read.
	STDMETHOD( Open )(THIS_ D3D_INCLUDE_TYPE IncludeType, LPCSTR pFileName, LPCVOID pParentData, LPCVOID *ppData, UINT *pBytes)
	{
		(void)IncludeType;
		(void)pParentData;

		FILE* pFile = fopen( pFileName, "rb" );
		if( !pFile )
			return E_FAIL;

		// Measure the file by seeking to its end; ftell() reports -1 on error,
		// which must not reach the allocation below.
		long fileSize = -1;
		if( fseek( pFile, 0, SEEK_END ) == 0 )
			fileSize = ftell( pFile );

		if( fileSize < 0 || fseek( pFile, 0, SEEK_SET ) != 0 )
		{
			fclose( pFile );
			return E_FAIL;
		}

		// Work in size_t from here on so the fread() result is compared like-for-like.
		const size_t byteCount = (size_t)fileSize;
		char* data = new char[ byteCount ];
		const size_t bytesRead = fread( data, 1, byteCount, pFile );
		fclose( pFile );

		if( bytesRead != byteCount )
		{
			delete[] data;
			return E_FAIL;
		}

		*ppData = data;
		*pBytes = (UINT)byteCount;

		return S_OK;
	}

	// Frees a buffer previously returned by Open(). Always returns S_OK.
	STDMETHOD( Close )(THIS_ LPCVOID pData)
	{
		// delete[] through a void pointer is undefined behaviour; cast back to the
		// element type Open() allocated with.
		delete[] static_cast<const char*>( pData );
		return S_OK;
	}
};


bool InitD3D()
{
	if( NvAPI_Initialize() != NVAPI_OK )
		return false;

	UINT flags = D3D11_CREATE_DEVICE_SINGLETHREADED;
#ifdef _DEBUG
	flags |= D3D11_CREATE_DEVICE_DEBUG;
	//flags |= D3D11_CREATE_DEVICE_DEBUGGABLE;
#endif

	if( FAILED( D3D11CreateDevice( NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, flags, NULL, 0, D3D11_SDK_VERSION, &g_pDevice, NULL, &g_pDevCtx ) ) )
		return false;

	bool bAtomicAddSupported = false;
	NvAPI_Status NvapiStatus = NvAPI_D3D11_IsNvShaderExtnOpCodeSupported( g_pDevice, NV_EXTN_OP_FP32_ATOMIC, &bAtomicAddSupported );

	if( (NvapiStatus != NVAPI_OK) || !bAtomicAddSupported )
		return false;

	return true;
}

bool InitUAV()
{
	D3D11_BUFFER_DESC bufferDesc;
	bufferDesc.ByteWidth = sizeof( float );
	bufferDesc.Usage = D3D11_USAGE_DEFAULT;
	bufferDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
	bufferDesc.CPUAccessFlags = 0;
	bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
	bufferDesc.StructureByteStride = 0;

	if( FAILED( g_pDevice->CreateBuffer( &bufferDesc, nullptr, &g_pBuffer ) ) )
		return false;

	bufferDesc.ByteWidth = sizeof( float );
	bufferDesc.Usage = D3D11_USAGE_STAGING;
	bufferDesc.BindFlags = 0;
	bufferDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
	bufferDesc.MiscFlags = 0;
	bufferDesc.StructureByteStride = 0;

	if( FAILED( g_pDevice->CreateBuffer( &bufferDesc, nullptr, &g_pStagingBuffer ) ) )
		return false;


	D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc;
	uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
	uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
	uavDesc.Buffer.FirstElement = 0;
	uavDesc.Buffer.NumElements = 1;
	uavDesc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;

	if( FAILED( g_pDevice->CreateUnorderedAccessView( g_pBuffer, &uavDesc, &g_pUAV ) ) )
		return false;

	return true;
}

bool LoadTestShader()
{

	DWORD shaderFlags = D3DCOMPILE_ENABLE_STRICTNESS;
#if defined( DEBUG ) || defined( _DEBUG )
	shaderFlags |= D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION;
#endif

	MyIncludeHandler includeHandler;
	ID3DBlob* pCompiledShader;
	ID3DBlob* pErrorBlob;
	HRESULT hr = D3DCompile( g_pTestCSSource, strlen( g_pTestCSSource ) + 1, "testCS", NULL, &includeHandler, "MainCS", "cs_5_0",
							shaderFlags, 0, &pCompiledShader, &pErrorBlob );

	if( pErrorBlob )
	{
		OutputDebugStringA( (char*)pErrorBlob->GetBufferPointer() );
		pErrorBlob->Release();
	}

	if( FAILED( hr ) )
		return false;

	if( NvAPI_D3D11_SetNvShaderExtnSlot( g_pDevice, 2 ) != NVAPI_OK )
		return false;

	if( FAILED( g_pDevice->CreateComputeShader( pCompiledShader->GetBufferPointer(), pCompiledShader->GetBufferSize(), NULL, &g_pTestCS ) ) )
		return false;

	NvAPI_D3D11_SetNvShaderExtnSlot( g_pDevice, ~0u );

	pCompiledShader->Release();

	return true;
}

int main()
{
	if( !InitD3D() )
		return -1;

	if( !InitUAV() )
		return -1;

	if( !LoadTestShader() )
		return -1;

//	while( true )
	{
		g_pDevCtx->CSSetShader( g_pTestCS, NULL, 0 );

		ID3D11UnorderedAccessView* uavs[] = { g_pUAV };

		g_pDevCtx->CSSetUnorderedAccessViews( 0, 1, uavs, NULL );
		
		g_pDevCtx->Dispatch( 10, 1, 1 );

		g_pDevCtx->Flush();
	}

	//Retrieve buffer content from GPU

	g_pDevCtx->CopyResource( g_pStagingBuffer, g_pBuffer );

	float bufferContent;

	D3D11_MAPPED_SUBRESOURCE mappedResource;
	g_pDevCtx->Map( g_pStagingBuffer, 0, D3D11_MAP_READ, 0, &mappedResource );
	bufferContent = *(float*)mappedResource.pData;
	g_pDevCtx->Unmap( g_pStagingBuffer, 0 );

    return 0;
}

PS: I also tried with a RWTexture1D; it doesn’t work either.

removed