Mapped memory access causes BSOD with drivers newer than 378.92
On Windows, the following code causes a blue screen of death using drivers newer than 378.92. The problem might be related to the motherboard or HW in general, as it worked on some configurations with other CPU and motherboard (unfortunately I cannot reproduce that test anymore). The issue was reproduced on multiple devices (both PCs and GPUs) with the specs below. I am trying to find the root cause of this issue. Please, if you have similar hardware, help me and see if you can reproduce this issue. Any possible workarounds are welcome. Thank you! Software and hardware details: - Intel Core I7 4930K, ASUS P9X79-E WS, 32GB RAM - BIOS: Version 1704 - Windows 10, 1703 and 1709 - CUDA 8 and CUDA 9.1 - GPU: GTX 750Ti and GTX 980Ti (980Ti was selected using cudaSetDevice for the tests) - Tested with multiple drivers: 378.92 does not produce the issue, drivers after that do (e.g. 391.35) Error message: [code]Technical Information: *** STOP: 0x00000133 (0x0000000000000001, 0x0000000000001e00, 0xfffff8025e806370, 0x0000000000000000) *** ntoskrnl.exe - Address 0xfffff8025e57f6e0 base at 0xfffff8025e40a000 DateStamp 0x5a4a1659[/code] Code: [code] #include <cuda.h> #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <cstdio> #include <cstdlib> #include <malloc.h> static size_t DEVICE_ELEMENTS = 256*1024*1024; static size_t LAYER_ELEMENTS = 512*512; static size_t LAYERS = 16*1024; static size_t MEMORY_ALIGNMENT = 4096; #define CUDA_SAFE_CALL( call ) \ { \ const cudaError_t error = call; \ if( error != cudaSuccess ) \ { \ printf( "Error: %s: %d, ", __FILE__, __LINE__ ); \ printf( "code: %d, reason: %s\n", error, cudaGetErrorString( error ) ); \ exit( 1 ); \ } \ } __global__ void doStuff( float* data, size_t n ) { auto idx = blockIdx.x*blockDim.x+threadIdx.x; if ( idx < n ) data[idx] += idx; } int main(int argc, char **argv) { int iterations = 1; int gpuId = 0; if ( argc >= 2 ) iterations = atoi(argv[1]); if ( argc >= 3 ) gpuId = atoi(argv[2]); printf("CUDA 980Ti driver crash test.\nThe test should result in a BSOD with drivers newer than 378.92\n"); int devices; CUDA_SAFE_CALL( cudaGetDeviceCount(&devices) ); printf("Using device #%d (of %d)\n", gpuId, devices); CUDA_SAFE_CALL( cudaSetDevice( gpuId ) ); cudaDeviceProp prop; CUDA_SAFE_CALL( cudaGetDeviceProperties( &prop, gpuId ) ); printf("Device name: %s, total mem: %f GiB\n", prop.name, prop.totalGlobalMem/1024.0f/1024.0f/1024.0f); float** hostData = new float*[LAYERS]; printf("Allocating %f GiB on the host (%lu layers)...\n", LAYERS*LAYER_ELEMENTS*sizeof(float)/1024.0f/1024.0f/1024.0f, LAYERS ); for (size_t i = 0; i < LAYERS; i++) hostData[i] = reinterpret_cast<float*>( malloc( LAYER_ELEMENTS*sizeof(float) ) ); //hostData[i] = reinterpret_cast<float*>( _aligned_malloc( LAYER_ELEMENTS*sizeof(float), MEMORY_ALIGNMENT ) ); printf("Initializing data...\n" ); for (size_t i = 0; i < LAYERS; i++) for (size_t j = 0; j < LAYER_ELEMENTS; j++) hostData[i][j] = 42.0f; for ( int k = 0; k < iterations; k++ ) { printf("Running mapped memory access tests %d/%d...\n", (k+1), iterations ); for (size_t i = 0; i < LAYERS; i++) { float* devicePtr; CUDA_SAFE_CALL( cudaHostRegister( hostData[i], LAYER_ELEMENTS*sizeof(float), cudaHostRegisterMapped | cudaHostRegisterPortable ) ); CUDA_SAFE_CALL( cudaHostGetDevicePointer( (void **)&devicePtr, (void *)hostData[i], 0 ) ); doStuff<<< ceil(LAYER_ELEMENTS/1024.0) , 1024 >>>( devicePtr, LAYER_ELEMENTS ); CUDA_SAFE_CALL( cudaDeviceSynchronize() ); CUDA_SAFE_CALL( cudaHostUnregister( hostData[i] ) ); } } printf("Finished tests.\n" ); for (size_t i = 0; i < LAYERS; i++) //_aligned_free( hostData[i] ); free( hostData[i] ); delete[]( hostData ); return 0; } [/code]
On Windows, the following code causes a blue screen of death using drivers newer than 378.92.

The problem might be related to the motherboard or HW in general, as it worked on some configurations with other CPU and motherboard (unfortunately I cannot reproduce that test anymore). The issue was reproduced on multiple devices (both PCs and GPUs) with the specs below.

I am trying to find the root cause of this issue. Please, if you have similar hardware, help me and see if you can reproduce this issue. Any possible workarounds are welcome. Thank you!

Software and hardware details:
- Intel Core I7 4930K, ASUS P9X79-E WS, 32GB RAM
- BIOS: Version 1704
- Windows 10, 1703 and 1709
- CUDA 8 and CUDA 9.1
- GPU: GTX 750Ti and GTX 980Ti (980Ti was selected using cudaSetDevice for the tests)
- Tested with multiple drivers: 378.92 does not produce the issue, drivers after that do (e.g. 391.35)

Error message:
Technical Information:

*** STOP: 0x00000133 (0x0000000000000001, 0x0000000000001e00, 0xfffff8025e806370,
0x0000000000000000)

*** ntoskrnl.exe - Address 0xfffff8025e57f6e0 base at 0xfffff8025e40a000 DateStamp
0x5a4a1659


Code:
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <malloc.h>

static size_t DEVICE_ELEMENTS = 256*1024*1024;
static size_t LAYER_ELEMENTS = 512*512;
static size_t LAYERS = 16*1024;

static size_t MEMORY_ALIGNMENT = 4096;

#define CUDA_SAFE_CALL( call ) \
{ \
const cudaError_t error = call; \
if( error != cudaSuccess ) \
{ \
printf( "Error: %s: %d, ", __FILE__, __LINE__ ); \
printf( "code: %d, reason: %s\n", error, cudaGetErrorString( error ) ); \
exit( 1 ); \
} \
}

__global__ void doStuff( float* data, size_t n )
{
auto idx = blockIdx.x*blockDim.x+threadIdx.x;
if ( idx < n )
data[idx] += idx;
}

int main(int argc, char **argv)
{
int iterations = 1;
int gpuId = 0;

if ( argc >= 2 )
iterations = atoi(argv[1]);

if ( argc >= 3 )
gpuId = atoi(argv[2]);

printf("CUDA 980Ti driver crash test.\nThe test should result in a BSOD with drivers newer than 378.92\n");

int devices;
CUDA_SAFE_CALL( cudaGetDeviceCount(&devices) );

printf("Using device #%d (of %d)\n", gpuId, devices);
CUDA_SAFE_CALL( cudaSetDevice( gpuId ) );

cudaDeviceProp prop;
CUDA_SAFE_CALL( cudaGetDeviceProperties( &prop, gpuId ) );

printf("Device name: %s, total mem: %f GiB\n", prop.name, prop.totalGlobalMem/1024.0f/1024.0f/1024.0f);

float** hostData = new float*[LAYERS];

printf("Allocating %f GiB on the host (%lu layers)...\n", LAYERS*LAYER_ELEMENTS*sizeof(float)/1024.0f/1024.0f/1024.0f, LAYERS );
for (size_t i = 0; i < LAYERS; i++)
hostData[i] = reinterpret_cast<float*>( malloc( LAYER_ELEMENTS*sizeof(float) ) );
//hostData[i] = reinterpret_cast<float*>( _aligned_malloc( LAYER_ELEMENTS*sizeof(float), MEMORY_ALIGNMENT ) );

printf("Initializing data...\n" );
for (size_t i = 0; i < LAYERS; i++)
for (size_t j = 0; j < LAYER_ELEMENTS; j++)
hostData[i][j] = 42.0f;

for ( int k = 0; k < iterations; k++ )
{
printf("Running mapped memory access tests %d/%d...\n", (k+1), iterations );

for (size_t i = 0; i < LAYERS; i++)
{
float* devicePtr;

CUDA_SAFE_CALL( cudaHostRegister( hostData[i], LAYER_ELEMENTS*sizeof(float), cudaHostRegisterMapped | cudaHostRegisterPortable ) );

CUDA_SAFE_CALL( cudaHostGetDevicePointer( (void **)&devicePtr, (void *)hostData[i], 0 ) );

doStuff<<< ceil(LAYER_ELEMENTS/1024.0) , 1024 >>>( devicePtr, LAYER_ELEMENTS );

CUDA_SAFE_CALL( cudaDeviceSynchronize() );

CUDA_SAFE_CALL( cudaHostUnregister( hostData[i] ) );
}

}

printf("Finished tests.\n" );

for (size_t i = 0; i < LAYERS; i++)
//_aligned_free( hostData[i] );
free( hostData[i] );

delete[]( hostData );

return 0;
}
Attachments

code.zip

#1
Posted 04/16/2018 12:33 PM   
On Ubuntu 16.04 this code works fine with 980 Ti and newer than 378.92 drivers. (Though LAYERS had to be halved to fit into the memory.)
On Ubuntu 16.04 this code works fine with 980 Ti and newer than 378.92 drivers.

(Though LAYERS had to be halved to fit into the memory.)

#2
Posted 04/18/2018 07:27 AM   
Scroll To Top

Add Reply