Loading memory that was allocated in kernel.

Hello,
I have a std::vector-like class (limited to structures that can be allocated with a simple malloc) that can upload a copy of itself to a CUDA device and download the data back after any modification (on demand). My problem is that it sometimes needs to reallocate an array, and in that case cudaMemcpy fails to download the data back.

here’s the implementation of both functions:

template<typename Type, unsigned int sizeOnStack>
// Appends `value` to the CuVector, growing the heap buffer when it is full.
// Returns a reference to the stored element, or to the last existing element
// if the grow-allocation fails (preserving the original contract).
// NOTE(review): on the device, memory obtained with in-kernel `new` lives on
// the device-side heap and cannot later be accessed by host cudaMemcpy — see
// the CUDA Programming Guide on dynamic global memory allocation. On the
// host, plain `new` throws instead of returning NULL, so the NULL check only
// guards the device path — TODO confirm intended host behavior.
__device__ __host__ inline Type& CuVector<Type, sizeOnStack>::push(const Type& value){
	if (usedSize >= allocSize){
		// BUG FIX: the original doubled allocSize before checking the
		// allocation, so a failed `new` corrupted the vector's state.
		// Compute the new capacity locally and commit only on success.
		unsigned int newAlloc = allocSize * 2;
		Type *newData = new Type[newAlloc];
		if (newData == NULL) return(data[usedSize - 1]); // allocation failed; vector unchanged
		for (unsigned int i = 0; i < usedSize; i++)
			newData[i] = data[i];
		// Only free the old buffer if it was heap-allocated (not the
		// in-object stack buffer) and non-NULL.
		if ((data != stackData) && data != NULL) delete[] data;
		data = newData;
		allocSize = newAlloc; // commit the new capacity only after success
		printf("allocated\n"); // debug marker: confirms the kernel reached the realloc path
	}
	data[usedSize] = value;
	usedSize++;
	return(data[usedSize - 1]);
}

template<typename Type, unsigned int sizeOnStack>
// Downloads the state of the GPU clone into this host-side vector.
// Returns true on success; false if there is no clone or a copy fails.
inline bool CuVector<Type, sizeOnStack>::updateFromClone(){
	if (clone == NULL) return(false);
	// BUG FIX: the original declared `char cln;` and cast its *value* to a
	// pointer, so the struct copy below wrote sizeof(CuVector) bytes through
	// a garbage address (undefined behavior). Use properly sized and aligned
	// local storage for the host-side snapshot of the clone's header.
	alignas(CuVector<Type, sizeOnStack>) char cln[sizeof(CuVector<Type, sizeOnStack>)];
	CuVector *cpuClone = (CuVector*)cln;
	if (cudaMemcpy(cpuClone, clone, sizeof(CuVector<Type, sizeOnStack>), cudaMemcpyDeviceToHost) != cudaSuccess){
		std::cout << "Failed to load clone..." << std::endl;
		return(false);
	}
	Type *newData = NULL;
	// `clone->stackData` only computes a device address (array-member decay);
	// it is never dereferenced on the host, so this comparison is safe. It
	// detects whether the clone's data lives outside its in-object buffer.
	if (cpuClone->data != clone->stackData){
		newData = new Type[cpuClone->allocSize];
		if (newData == NULL) return(false);
		// NOTE(review): this copy fails whenever cpuClone->data was obtained
		// with in-kernel `new` — device-heap pointers are not valid source
		// arguments to host cudaMemcpy (CUDA Programming Guide, device
		// memory heap); that is the failure the post describes.
		if (cudaMemcpy(newData, cpuClone->data, sizeof(Type)*cpuClone->allocSize, cudaMemcpyDeviceToHost) != cudaSuccess){
			std::cout << "Failed to load clone data..." << std::endl;
			std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
			delete[] newData; // BUG FIX: the original leaked newData on this path
			return(false);
		}
	}
	// Free our own heap buffer (if any) before adopting the clone's state.
	if ((data != stackData) && data != NULL) delete[] data;
	usedSize = cpuClone->usedSize;
	allocSize = cpuClone->allocSize;
	if (newData == NULL){
		// The clone's data fit in its stack buffer; copy element-wise into ours.
		data = stackData;
		for (unsigned int i = 0; i < usedSize; i++)
			data[i] = cpuClone->stackData[i];
	}
	else data = newData;
	return(true);
}

What might be the reason?
Do kernel “new” and host “cudaMalloc” operate on different parts of the GPU memory?

Well, I looked it up a little and found out that I simply wasn’t allowed to access something allocated in a kernel using host APIs.
I found another solution though… And it works just fine.