nppiFilter functions return NPP_CUDA_KERNEL_EXECUTION_ERROR and cause a CPU memory leak

// Reproduction snippet: a single nppiFilter_8u_C1R call with a kernelSize x kernelSize mask.
// d_img_1, d_img_2, kernelSize, SrcSize, MaskSize and start_location are declared outside
// this snippet, as are the KernelAllocate/KernelFree/setNppSize/setNppPoint helpers.
int ImgW = d_img_1->width(); // 128 = d_img_2->width();
int ImgH = d_img_1->height(); // 128 = d_img_2->height();

// NOTE(review): pKernel is handed straight to NPP below; presumably KernelAllocate returns
// device memory already filled with the coefficients -- confirm against its definition.
Npp32s *pKernel = KernelAllocate( kernelSize, kernelSize ); // kernelSize = 7

// Presumably setNppSize stores (width, height) into its first argument -- helper not shown.
// With the values quoted in the comments above this gives a 122 x 122 region (128 - 7 + 1).
setNppSize( SrcSize, ImgW-kernelSize+1, ImgH-kernelSize+1 );
setNppSize( MaskSize, kernelSize, kernelSize );
// Anchor is set to (kernelSize-1, kernelSize-1), i.e. (6, 6) for kernelSize = 7.
// NOTE(review): the data pointers passed below are the image origins (no offset applied);
// confirm that this anchor plus the reduced region keeps every read inside the 128 x 128 image.
setNppPoint( start_location, kernelSize-1, kernelSize-1 );

// d_img_1 is passed in the source position and d_img_2 in the destination position;
// the trailing 1 is passed in the divisor position of the NPP signature.
// NOTE(review): NPP_ASSERT is a macro defined elsewhere; presumably it aborts on a non-success status.
NPP_ASSERT( nppiFilter_8u_C1R( d_img_1->data(), d_img_1->pitch(), d_img_2->data(), d_img_2->pitch(), SrcSize,
 pKernel, MaskSize, start_location, 1 ) );

// NOTE(review): KernelFree takes no argument; presumably it releases the buffer obtained from
// KernelAllocate via state held elsewhere -- verify, since the post reports a host memory leak.
KernelFree();

When I use the nppiFilter functions (nppiFilter_8u_C1R, nppiFilterRow_8u_C1R, nppiFilterColumn_8u_C1R), I observe CPU (host) memory leaks. Is there anything wrong with the code above?

// Reproduction snippet: calls nppiFilterRow_8u_C1R repeatedly on two 248 x 248 images.
// CDeviceImage, NullPtr, DeletePtrItem, MaskAllocate, MaskFree and NPP_ASSERT are project
// helpers defined outside this snippet.
CDeviceImage *d_img_3 = NULL;
CDeviceImage *d_img_4 = NULL;
NppiSize SrcSize;
Npp32s *pCPU = NULL;
Npp32s *pGPU = NULL;
Npp32s tmpAnchor, tmpMaskSize;
int nDiv = 1;

// The region passed to NPP covers the full 248 x 248 image.
SrcSize.width = 248;
SrcSize.height = 248;
// The mask length equals the region width, and the anchor is the last mask index (248 - 1).
// NOTE(review): a 248-tap row mask applied across a 248-wide region of a 248-wide image
// presumably needs source pixels beyond the row bounds -- confirm against the NPP border
// requirements, as this may be related to the reported kernel execution error.
tmpMaskSize = 248;
tmpAnchor = 247;

// NOTE(review): both pointers are already NULL from their initializers; presumably NullPtr
// resets (or frees and resets) the pointer -- helper not shown.
NullPtr<CDeviceImage>(d_img_3);
NullPtr<CDeviceImage>(d_img_4);

d_img_3 = new CDeviceImage( SrcSize.width, SrcSize.height );
d_img_4 = new CDeviceImage( SrcSize.width, SrcSize.height );

// NOTE(review): presumably allocates tmpMaskSize coefficients on the host (pCPU) and on the
// device (pGPU), taking the pointers by reference -- verify; the meaning of the trailing 1
// is not visible here.
MaskAllocate(pCPU, pGPU, tmpMaskSize, 1);

// The same filter call is issued loopSize times with identical arguments.
// d_img_4 is passed in the source position and d_img_3 in the destination position;
// pGPU is the kernel pointer handed to NPP.
int loopSize = 1025;
for(int i=0; i<loopSize; i++){
	NPP_ASSERT( nppiFilterRow_8u_C1R(d_img_4->data(), d_img_4->pitch(), d_img_3->data(), d_img_3->pitch(), SrcSize, pGPU, tmpMaskSize, tmpAnchor, nDiv) );
}

// Cleanup of the mask buffers and both images.
// NOTE(review): if NPP_ASSERT aborts or throws on failure, these lines are presumably
// never reached on the error path -- confirm how the macro behaves.
MaskFree( pCPU, pGPU );
DeletePtrItem<CDeviceImage>(d_img_3);
DeletePtrItem<CDeviceImage>(d_img_4);

If loopSize >= 1025, nppiFilterRow_8u_C1R returns NPP_CUDA_KERNEL_EXECUTION_ERROR and causes a CPU memory leak.

What am I doing wrong?

I got past this problem by updating the graphics driver.