nppiFilterGauss_8u_C1R

Source code:

Npp8u* p1 = NULL;
    Npp8u* p2 = NULL;
    unsigned char* p3 = NULL;
    unsigned char* p4 = NULL;
    int step1 = 0;
    int step2 = 0;
    NppiSize roi;
    roi.width = 352*288;
    roi.height = 1;
    int ret = 0;

    p1 = nppiMalloc_8u_C1(352, 288, &step1);
    p2 = nppiMalloc_8u_C1(352, 288, &step2);
    cudaMalloc((void**)&p3, 352*288);
    cudaMalloc((void**)&p4, 352*288);

    printf("p1[%x],p2[%x],p3[%x],p4[%x]\n", p1, p2, p3, p4);
    printf("step1[%d]\n", step1);
    printf("step2[%d]\n", step2);

    int count = 1;
    while(count < 3) {
//      ret = nppiFilterGauss_8u_C1R(p1, step1, p2, step2, roi, NPP_MASK_SIZE_3_X_3);
      ret = nppiFilterGauss_8u_C1R(p3, 352*288, p4, 352*288, roi, NPP_MASK_SIZE_3_X_3);
      printf("count[%d],ret[%d]\n", count, ret);
      if(ret) {
        break;
      }
      count++;
    }

    nppiFree(p1);
    nppiFree(p2);
    cudaFree(p3);
    cudaFree(p4);

Error output (run under cuda-memcheck):

CUDA Clock sample
GPU Device 0: "GK20A" with compute capability 3.2

p1[ab84a000],p2[ab86e000],p3[ab892000],p4[ab8aac00]
step1[512]
step2[512]
count[1],ret[0]
count[2],ret[0]
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 1
=========     at 0x00000448 in void ForEachTupleByteQuad<unsigned char, int=1, TupleByteQuadFunctor<unsigned char, int=1, FilterGauss3x3QuadNew<unsigned char, int=1>>>(Tuple8<unsigned char, int=1>*, int, NppiSize, unsigned char)
=========     by thread (31,0,0) in block (395,0,0)
=========     Address 0xab8c3800 is out of bounds
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= ERROR SUMMARY: 5 errors

Could someone please explain the correct approach?