This simplified example shows a massive ~4x slowdown in performance caused by modifying a single line of code!
The block size is set to 128, specifically tailored to the GTX 750 Ti (1st-gen Maxwell, 128 cores x 5 SMs).
See the entire code below, and in particular the two alternative lines in the inner loop:
int val = fabsf(ind); // 529msec
//int val = fabsf(i - ind); // 2084msec - ~4 times slower!
Note that the PTX of the two versions differs by more than a single instruction, which may provide a lead as to the culprit.
Configuration: GTX750Ti, Win7 x64, VS 2013, CUDA 7.0
The results are the same whether the code is compiled for cc2.0 or cc5.0.
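For anyone who wants to inspect the difference, the PTX and SASS of the two variants can be dumped with the standard toolchain and diffed (a sketch; "kernel.cu" and "kernel.exe" are placeholder names for this project's source file and built binary):

nvcc -arch=sm_50 -ptx kernel.cu -o kernel.ptx
cuobjdump -sass kernel.exe > kernel.sass

Generating one dump per variant and diffing them shows exactly which instructions change between the fast and the slow version.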
#include "cuda_runtime.h"
#include <vector>
#include <iostream>
using namespace std;
// Global device buffers, allocated and freed once
// input buffer
float *pVol_d = NULL;
// output buffer
float *pOut_d = NULL;
__global__ void kernel(const float *pVol, const int volSize, float *pOut)
{
    const int nCols = blockDim.x * gridDim.x;
    const int iCol = threadIdx.x + blockDim.x * blockIdx.x;
    const int ind = blockIdx.y * nCols + iCol;
    // All threads in the block join forces in reading the voxels into shared memory in a perfectly coalesced manner
    extern __shared__ float currVoxels[]; // dynamic shared memory (allocated at runtime to be the size of the CUDA block)
    float acc = 0;
    const int nVoxelsPerIter = volSize / blockDim.x;
    for (int j = 0; j < nVoxelsPerIter; ++j)
    {
        currVoxels[threadIdx.x] = pVol[threadIdx.x + j * blockDim.x];
        __syncthreads();
        for (int i = 0; i < blockDim.x; ++i)
        {
            int val = fabsf(ind); // 529msec
            //int val = fabsf(i - ind); // 2084msec - ~4 times slower!
            val = val >= 0 ? 1 : val; // effectively set to 1, regardless of the previous assignment (same result in both cases)
            acc += val * currVoxels[i]; // cc2.x: broadcast (no bank conflict)
        }
        __syncthreads(); // make sure all reads are done before the next iteration overwrites the shared buffer
    }
    pOut[ind] = acc;
}
cudaError_t CUDABuffersAllocate(int volSize, int nCols, int nRows)
{
    // return at once if the buffers have already been allocated
    if (pOut_d)
        return cudaSuccess;
    // Allocate device buffers
    cudaError_t cudaStatus;
    // input
    cudaStatus = cudaMalloc(&pVol_d, volSize * sizeof(float));
    if (cudaStatus != cudaSuccess)
        return cudaStatus;
    // output
    cudaStatus = cudaMalloc(&pOut_d, nCols * nRows * sizeof(float));
    return cudaStatus;
}
void CUDABuffersFree(void)
{
    // Free device buffers
    if (pOut_d)
    {
        cudaFree(pOut_d);
        cudaFree(pVol_d);
        pOut_d = NULL;
        pVol_d = NULL;
    }
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    if (cudaDeviceReset() != cudaSuccess)
    {
        cerr << "cudaDeviceReset failed!" << endl;
    }
}
void CallCUDA(const float *pVol, int volSize, float *pOut, int nCols, int nRows)
{
    // bail out if the buffers have not been allocated yet
    if (!pOut_d)
    {
        cerr << "Uninitialized buffers. CUDABuffersAllocate must be called once before calling CallCUDA()." << endl;
        return;
    }
    cudaError_t cudaStatus;
    // Copy data pertaining to the current execution to the device
    cudaStatus = cudaMemcpy(pVol_d, pVol, volSize * sizeof(float), cudaMemcpyHostToDevice);
    // Timing net kernel execution
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // Launch the CUDA kernel, with dynamic shared memory
    // hardcoded block/grid sizes
    dim3 blockSize(128, 1, 1); // 128 = 640 cores / 5 SMs
    dim3 gridSize(nCols / blockSize.x, nRows, 1); // gridSize.x = 16384 / 128 = 128
    kernel<<<gridSize, blockSize, blockSize.x * sizeof(float)>>>(pVol_d, volSize, pOut_d);
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        cerr << "kernel launch failed: " << cudaGetErrorString(cudaStatus) << endl;
        goto KernelError;
    }
    // cudaDeviceSynchronize waits for the kernel to finish, and returns any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching kernel!" << endl;
        goto KernelError;
    }
KernelError:
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cout << "Net runtime: " << time << " ms" << endl;
    // copy output to host
    cudaStatus = cudaMemcpy(pOut, pOut_d, nCols * nRows * sizeof(float), cudaMemcpyDeviceToHost);
    cout << "Done." << endl;
}
int main()
{
    const int volSize = 128 * 128 * 128;
    const int nCols = 128 * 128;
    const int nRows = 2;
    vector<float> vol(volSize, 1.f); // initialize to 1's
    vector<float> out(nCols * nRows); // output
    CUDABuffersAllocate(volSize, nCols, nRows);
    CallCUDA(&vol[0], volSize, &out[0], nCols, nRows);
    CUDABuffersFree();
    system("Pause"); // required for an empty project
    return 0;
}
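To back the claim that both variants compute the same values, a minimal host-side check can be dropped into main() right after the call to CallCUDA() (a sketch: the volume is all 1's and val always ends up as 1, so every output element should equal volSize, i.e. 2097152, which is still exactly representable in a float):

// verify the output: each element accumulates 1.f exactly volSize times
bool ok = true;
for (size_t k = 0; k < out.size(); ++k)
{
    if (out[k] != static_cast<float>(volSize))
    {
        ok = false;
        break;
    }
}
cout << (ok ? "Output verified." : "Output mismatch!") << endl;

Either variant of the fabsf line should pass this check, so the slowdown is not caused by a change in the computed values.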