NPP with OpenCV: code does not work with all image sizes?

I have written a small piece of code using NPP and OpenCV 3.1 on a GTX 745. It works with an 800x600 image, but it does not work with other image sizes.
Is there anything I missed?
I do not understand why it behaves like this.
Does NPP have any restriction on image size?

Below is the code I have written.

#include <iostream>
 #include <cuda_runtime.h>
 #include <npp.h>
 #include <helper_string.h>
 #include <helper_cuda.h>
 #include "opencv2/core/version.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/highgui/highgui.hpp"
 //#include "ImagesCPU.h"
 //#include <ImagesNPP.h>
 #include <chrono>
 
 // Wraps a CUDA runtime call and reports a failure together with the call site.
 // The do { } while (0) wrapper makes the macro expand to a single statement,
 // so `if (x) gpuErrchk(y); else ...` parses as the caller intended.
 #define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

 // Checks a CUDA runtime status code.
 //   code  - value returned by a CUDA runtime call
 //   file  - source file of the call site (supplied by gpuErrchk)
 //   line  - source line of the call site (supplied by gpuErrchk)
 //   abort - when true (the default) the process exits with `code` as its
 //           exit status after the message has been printed
 // Does nothing when `code` is cudaSuccess.
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
 {
     if (code == cudaSuccess)
         return;

     fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
     if (abort)
         exit(code);
 }


int main(int argc, const char *argv[])
{
 
     cv::Mat srcImg, dstImg, grayImg;
     double maxVal=0, minVal=0;
     cv::Mat blur;
     //load image using opencv
     srcImg = cv::imread(argv[1]);
     if (srcImg.channels() == 3) {
         cvtColor(srcImg, grayImg, CV_RGB2GRAY);
     }
     else {
         grayImg = srcImg;
     }
     cv::Mat_<float> grayImg32 = cv::Mat::zeros(cvSize(grayImg.cols, grayImg.rows), CV_32FC1);
 
     grayImg.convertTo(grayImg32, CV_32F);
 
     NppStatus eStatusNPP;
     int pitch = grayImg32.step;
 
     std::cout << "step width height : " <<grayImg32.step <<" "<<grayImg32.cols <<" "<<grayImg32.rows <<std::endl;
     std::cout << "step width height : " <<grayImg.step   <<" "<<grayImg.cols   <<" "<< grayImg.rows  <<std::endl;
 
 
     NppiSize sizeRoi = {grayImg.cols-5+1, grayImg.rows-5+1};
     NppiSize size    = {grayImg.cols, grayImg.rows};
 
     float *dstCuda, *srcCuda;
     auto start = std::chrono::high_resolution_clock::now();
     gpuErrchk(cudaMalloc((void**)&srcCuda, grayImg.cols*grayImg.rows*sizeof(float)));
     gpuErrchk(cudaMalloc((void**)&dstCuda, grayImg.cols*grayImg.rows*sizeof(float)));
     auto end   = std::chrono::high_resolution_clock::now();
     std::cout<< " cuda malloc : "<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count() <<std::endl;
 
     start = std::chrono::high_resolution_clock::now();
     //copy data to device
     gpuErrchk(cudaMemcpy(srcCuda, (float*)grayImg32.data, grayImg.cols*grayImg.rows*sizeof(float), cudaMemcpyHostToDevice));
     end   = std::chrono::high_resolution_clock::now();
     std::cout << " memcpy H2D : "<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count() <<std::endl;
 
 
     start = std::chrono::high_resolution_clock::now();
     eStatusNPP = nppiFilterGauss_32f_C1R(srcCuda, pitch, dstCuda, pitch, sizeRoi, NPP_MASK_SIZE_5_X_5);
     if(eStatusNPP != NPP_SUCCESS)
         std::cout << " err: nppiFilterGauss_32f_C1R : " << eStatusNPP << std::endl;

     //gpuErrchk(cudaMemcpy(dstCuda,  srcCuda, grayImg.cols*grayImg.rows*sizeof(float), cudaMemcpyDeviceToDevice));
     gpuErrchk(cudaDeviceSynchronize());
     gpuErrchk(cudaPeekAtLastError());
     end   = std::chrono::high_resolution_clock::now();
     std::cout<< " gauss : "<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count() <<std::endl;
 
 
     //save dst image
     cv::Mat_<float> dstImg32 = cv::Mat::zeros(cvSize(grayImg.cols, grayImg.rows), CV_32FC1);
     //copy data back to host
     start = std::chrono::high_resolution_clock::now();
     gpuErrchk(cudaMemcpy((float *)dstImg32.data,  dstCuda, grayImg.cols*grayImg.rows*sizeof(float), cudaMemcpyDeviceToHost));
     end   = std::chrono::high_resolution_clock::now();
     std::cout<< " memcpy D2H : "<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count() <<std::endl;
 
     //display
     minMaxLoc(dstImg32, &minVal, &maxVal);
     std::cout <<"min max val: " << minVal <<" "<< maxVal << std::endl;
     dstImg32.convertTo(blur, CV_8U, 255.0/(maxVal - minVal), -minVal * 255.0/(maxVal - minVal));
     cv::imshow("opencv_npp", blur);
     cv::waitKey(0);
     return 0;
 }