CUDA - unspecified launch failure on cudaMemcpy and cudaFree

Hello, everyone. When I ran my code on Cuda-memcheck, it gave me the following error

========= Program hit cudaErrorLaunchFailure (error 4) due to “unspecified launch failure” on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error

========= Program hit cudaErrorLaunchFailure (error 4) due to “unspecified launch failure” on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error

Those errors happens after the kernel launch…

This is the host code

void hostIntersection(float *start, float *dirs, float *p0, float *p1, float *p2, float *normal, float *pHit,
int *faceIndexList, int *bounding, int *boundsPerIdx, float *boundMax, float *boundMin, int numBounds,
int pixX, int pixY, int facesSize, float kaP, float kaT, float kdP, float kdT, float lampIp,
float ksP, float ksT, float lampIs, float KrP, float KrT, float KtP, float KtT, float RefrT, float *lampPos, float *planeColor, float *objColor, float *lampColor){

float *d_start, *d_dirs, *d_p0, *d_p1, *d_p2, *d_normal, *d_pHit, *d_boundMax, *d_boundMin, *d_lampPos, *d_planeColor, *d_objColor, *d_lampColor;
int *d_bounding, *d_faceIndexList, *d_boundsPerIdx;

int pixels = pixX * pixY;
size_t sizeVec = 3 * sizeof(float);
size_t sizeVecAll = pixels * sizeVec;
size_t sizeFaceAll = facesSize * sizeVec;
cudaError_t err;

//malloc
err = cudaMalloc((void**) &d_start, sizeVec);
err = cudaMalloc((void**) &d_dirs, sizeVecAll);
err = cudaMalloc((void**) &d_p0, sizeFaceAll);
err = cudaMalloc((void**) &d_p1, sizeFaceAll);
err = cudaMalloc((void**) &d_p2, sizeFaceAll);
err = cudaMalloc((void**) &d_normal, sizeFaceAll);
err = cudaMalloc((void**) &d_pHit, sizeVecAll);	//stores pHits
err = cudaMalloc((void**) &d_boundMax, numBounds * sizeVec);
err = cudaMalloc((void**) &d_boundMin, numBounds * sizeVec);
err = cudaMalloc((void**) &d_bounding, numBounds * facesSize * sizeof(int));
err = cudaMalloc((void**) &d_boundsPerIdx, numBounds * sizeof(int));
err = cudaMalloc((void**) &d_faceIndexList, pixels * sizeof(int));
err = cudaMalloc((void**) &d_lampPos, sizeVec);
err = cudaMalloc((void**) &d_planeColor, sizeVec);
err = cudaMalloc((void**) &d_objColor, sizeVec);
err = cudaMalloc((void**) &d_lampColor, sizeVec);

//copy
err = cudaMemcpy(d_start, start, sizeVec, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_dirs, dirs, sizeVecAll, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_p0, p0, sizeFaceAll, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_p1, p1, sizeFaceAll, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_p2, p2, sizeFaceAll, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_normal, normal, sizeFaceAll, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_boundMax, boundMax, numBounds * sizeVec, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_boundMin, boundMin, numBounds * sizeVec, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_bounding, bounding, numBounds * facesSize * sizeof(int), cudaMemcpyHostToDevice);
err = cudaMemcpy(d_boundsPerIdx, boundsPerIdx, numBounds * sizeof(int), cudaMemcpyHostToDevice);
err = cudaMemcpy(d_lampPos, lampPos, sizeVec, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_planeColor, planeColor, sizeVec, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_objColor, objColor, sizeVec, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_lampColor, lampColor, sizeVec, cudaMemcpyHostToDevice);

//call kernel
deviceIntersection<<<pixY, pixX>>>(d_start, d_dirs, d_p0, d_p1, d_p2, d_normal, d_pHit, d_faceIndexList, d_bounding, d_boundsPerIdx,
	d_boundMax, d_boundMin, numBounds, pixX, pixY, facesSize, kaP, kaT, kdP, kdT, lampIp, ksP, ksT, lampIs, KrP, KrT, KtP, KtT, RefrT, d_lampPos, d_planeColor, d_objColor, d_lampColor);

//copying result
err = cudaMemcpy(faceIndexList, d_faceIndexList, pixels * sizeof(int), cudaMemcpyDeviceToHost);
cout<<"Error code faceindex copy: "<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy(pHit, d_pHit, sizeVecAll, cudaMemcpyDeviceToHost);
cout<<"Error code phit copy: "<<cudaGetErrorString(err)<<endl;

cudaFree(d_start);	cudaFree(d_dirs);	cudaFree(d_p0);		cudaFree(d_p1);	cudaFree(d_p2);	cudaFree(d_normal);
cudaFree(d_bounding);	cudaFree(d_boundsPerIdx);	cudaFree(d_boundMax);	cudaFree(d_boundMin); cudaFree(d_lampPos);	
cudaFree(d_planeColor);	cudaFree(d_objColor);
cudaFree(d_pHit);	cudaFree(d_faceIndexList);		

}

Please help ._.
Thank you