Dealing with Vectors - cudaMemcpyDeviceToHost

It is not obvious how to use std::vector in CUDA, so I have designed my own Vector class:

#ifndef VECTORHEADERDEF
#define VECTORHEADERDEF

#include <cmath>
#include <iostream>
#include <cassert>

// Minimal host-side vector with heap storage. Note: mData is allocated with
// new[] on the host, so a shallow copy of this object to the device ships a
// host pointer that device code cannot dereference.
template <typename T>
class Vector
{
private:
   T* mData;   // heap-allocated element storage (host memory; see constructors)
   int mSize;  // number of elements
public:
        Vector(const Vector& otherVector);  // copy constructor (deep copy)
        Vector(int size);   // size constructor; zero-initialises entries
        ~Vector();   // destructor; releases mData

        // FIX: this assignment operator is defined out-of-class further down
        // but was never declared here, which is a compile error. Declare it
        // to match the existing definition.
        Vector& operator=(const Vector& otherVector);

        __host__ __device__ int GetSize() const; // get size of the vector

        T& operator[](int i);  // bounds-checked element access (host only)

        // change element i (no bounds check); callable from device code,
        // but only valid if mData points into the right address space
        __host__ __device__ void set(size_t i, T value) {
                mData[i] = value;
        }

        template <class S>    // output vector
        friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};

// Overridden copy constructor
// Allocates memory for new vector, and copies entries of other vector into it
// Copy constructor: allocates fresh storage and deep-copies every element
// of the other vector into it.
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
   mSize = otherVector.GetSize();
   mData = new T[mSize];
   for (int idx = 0; idx < mSize; ++idx)
      mData[idx] = otherVector.mData[idx];
}

// Constructor for vector of a given size
// Allocates memory, and initialises entries to zero
// Size constructor: allocates storage for `size` elements and
// zero-initialises them. Asserts that the requested size is positive.
template <typename T>
Vector<T>::Vector(int size)
{
   assert(size > 0);   // empty/negative sizes are not supported
   mSize = size;
   // Value-initialise the array (zero for arithmetic T). The original
   // assigned the double literal 0.0 per element, which silently converts
   // for arithmetic types and fails to compile for element types that are
   // not convertible from double.
   mData = new T [mSize]();
}

// Overridden destructor to correctly free memory
template <typename T>
Vector<T>::~Vector()
{
   delete[] mData;
}

// Method to get the size of a vector
template <typename T>
__host__ __device__ int Vector<T>::GetSize() const
{
   return mSize;
}

// Overloading square brackets
// Note that this uses `zero-based' indexing, and a check on the validity of the index
template <typename T>
T& Vector<T>::operator[](int i)
{
        assert(i > -1);
        assert(i < mSize);
        return mData[i];
}

// Overloading the assignment operator
template <typename T>
Vector<T>& Vector<T>::operator=(const Vector& otherVector)
{
   assert(mSize == otherVector.mSize);
   for (int i=0; i<mSize; i++)
   {
      mData[i] = otherVector.mData[i];
   }
   return *this;
}

// Overloading the insertion << operator
// Stream insertion: writes each element followed by three spaces.
// Uses the public accessors rather than the friend access to mSize.
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
   const int count = v.GetSize();
   for (int idx = 0; idx < count; ++idx) {
      output << v[idx] << "   ";
   }
   return output;
}

My main function - where I just pass a vector to the device, modify it and pass it back - is as follows (with the kernel designed just for testing purposes):

#include <iostream>

#include "Vector.hpp"

// Test kernel: writes two sentinel values (100 into slot 0, 11 into slot 2)
// of the pointed-to Vector. Expects d_num to be a device-resident
// Vector<int>* whose mData is a valid device pointer.
// Only thread 0 performs the writes: in the original, every launched thread
// wrote the same two addresses concurrently - a redundant (same-value) data
// race - and myId was computed but never used.
__global__ void alpha(Vector<int>* d_num)
{
        int myId = threadIdx.x + blockDim.x * blockIdx.x;

        if (myId == 0) {
                d_num->set(0,100);
                d_num->set(2,11);
        }
}

int main()
{
        Vector<int> num(10);
        
        for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9

        std::cout << "Size of vector: " << num.GetSize() << "\n";
        std::cout << num << "\n"; // print vector

        Vector<int>* d_num;

        // BUG: sizeof(Vector<int>) is the size of the {T*, int} object itself,
        // not of its elements, so this allocates the wrong number of bytes.
        // For the element data the size should be num.GetSize()*sizeof(int).
        cudaMalloc((void **) &d_num, num.GetSize()*sizeof(Vector<int>));

        // BUG: this copies raw int data (&num[0]) over memory typed as
        // Vector<int>, so the object's mData member on the device is never a
        // valid device pointer; the kernel's d_num->set() then writes through
        // garbage. Copying the host object instead would ship a host pointer
        // to the device - also invalid. The class needs a device-aware design
        // (or use a plain int* device buffer / thrust::device_vector).
        cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(Vector<int>), cudaMemcpyHostToDevice);

// launch the kernel (no cudaGetLastError()/sync afterwards, so a failed
// launch would go unnoticed)
        alpha<<<1,100>>>(d_num);

// BUG: does not compile as written - `num` is a Vector<int> object, not a
// pointer, and has no conversion to cudaMemcpy's void* destination; &num[0]
// was presumably intended.
        cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);

        std::cout << num << "\n";

// free GPU memory allocation and exit
        cudaFree(d_num);

        return 0;
}

The problem I encounter is with cudaMemcpyDeviceToHost. It does not really copy the device vector to the num vector as can be seen from the output.

How should I deal with that?

d_num is a pointer to a Vector. Vector is a structure consisting of a pointer and a size, so you are just copying data from the num array over this structure.

By the way, sizeof(Vector) is the size of the Vector structure itself, not of its elements, so that is also an error.

Bulat, does your comment mean that I cannot do something like this?
or, could you please indicate how can I make it work?

And, is it acceptable, if I put:

num.GetSize()*sizeof(int)

?

you can do it, of course. Your error is that you declared d_num as a pointer. It is not initialized, so it is just a dangling pointer to the structure — a well-known C++ error.

declare d_num without the “*” and use d_num.mData as the pointer to the data

num.GetSize()*sizeof(int)

yes

mData is private, I cannot access it

Or maybe, I don’t understand what you mean. I should have mentioned that I am new to CUDA.

this isn’t specific to CUDA — you need to access mData in a way that is not supported by your class implementation. Anyway, declaring it as a class will not make the data transfer automatic as you wish, and I fear you will have even more trouble with Vector than without it.

so I suggest you look into Thrust, which ships with CUDA and provides std::vector-like containers whose data is transferred automatically to/from the GPU; alternatively, use plain C-style arrays with CUDA — that will be simpler than dealing with your Vector class.