Hi All,
Attached below is a piece of code which demonstrates a problem faced with thrust::reduce_by_key.
To sum my observations up:
For GTX980 or Titanx (Maxwell devices):
- reduce_by_key doesn't work in case thrust::device_ptr are used to point on the input.
- reduce_by_key works fine in case thrust::device_vector is used instead to hold the input.
For GTX780Ti, (a Kepler device), reduce_by_key works fine in both listed above cases.
Demonstrating example codes follow.
Note: PC Platform runs cuda 7.5 under linux centos 7.
Example 1: reduce_by_key with thrust::device_ptr. (gives the incorrect results with Maxwell devices)
Here’s rbk.cu:
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
int main(int argc, char **argv)
{
if(cudaSuccess != cudaSetDevice((int)0)){
std::cout << "cudaSetDevice err devId\n";
}
if(cudaSuccess !=cudaDeviceReset()){
std::cout << "cudaDeviceReset err reset\n";
}
unsigned int hostBuf[] = {5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9};
unsigned int inLen = sizeof(hostBuf)/sizeof(unsigned int);
unsigned int* d_inKey;
// allocate device raw buffers:
if(cudaSuccess != cudaMalloc((void **) &d_inKey, inLen * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
unsigned int* d_outKey;
if(cudaSuccess != cudaMalloc((void **) &d_outKey, inLen * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
unsigned int* d_outVal;
if(cudaSuccess != cudaMalloc((void **) &d_outVal, inLen * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
// fill input key:
cudaMemcpy(d_inKey, hostBuf, inLen * sizeof(unsigned int), cudaMemcpyHostToDevice);
// set thrust device ptrs:
thrust::device_ptr<unsigned int> d_inKeyPtr(d_inKey);
thrust::device_ptr<unsigned int> d_outKeyPtr(d_outKey);
thrust::device_ptr<unsigned int> d_outValPtr(d_outVal);
thrust::pair<thrust::device_ptr<unsigned int>, thrust::device_ptr<unsigned int> > resultIterPair;
thrust::constant_iterator<unsigned int> const_iter(1);
unsigned int outLen = thrust::reduce_by_key(d_inKeyPtr, d_inKeyPtr + inLen, const_iter, d_outKeyPtr, d_outValPtr).first - d_outKeyPtr;
// copy results back to host"
unsigned int* outKey = new unsigned int[outLen];
cudaMemcpy(outKey, d_outKey, outLen * sizeof(unsigned int), cudaMemcpyDeviceToHost);
unsigned int* outVal = new unsigned int[outLen];
cudaMemcpy(outVal, d_outVal, outLen * sizeof(unsigned int), cudaMemcpyDeviceToHost);
// print reduced keys:
for(unsigned int idx =0; idx < outLen; idx++){
std::cout << "idx " << idx << " out key: " << outKey[idx] << " out val (=length): " << outVal[idx] << "\n";
}
return 0;
}
Some code orientation:
Line 23: Input sequence which is needed to be reduced(=keys).
Line 50: The input sequence related values are all 1s. Accordingly, results’ values will hold reduced sequence lengths.
Here are the execution’s prints, when gpu is gtx780ti:
idx 0 out key: 5 out val (=length): 2
idx 1 out key: 6 out val (=length): 3
idx 2 out key: 7 out val (=length): 4
idx 3 out key: 8 out val (=length): 5
idx 4 out key: 9 out val (=length): 3
Those are indeed the expected results.
Here are the execution’s prints, when gpu is gtx980 (same as with Titanx):
idx 0 out key: 5 out val (=length): 1
After modifying the above implementation by replacing device_prt with device_vector, the correct results were received on the Maxwell devices as well:
Here’s the modified code:
rbkVector.cu
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
int main(int argc, char **argv)
{
if(cudaSuccess != cudaSetDevice((int)0)){
std::cout << "cudaSetDevice err devId\n";
}
if(cudaSuccess !=cudaDeviceReset()){
std::cout << "cudaDeviceReset err reset\n";
}
unsigned int hostBuf[] = {5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9};
unsigned int inLen = sizeof(hostBuf)/sizeof(unsigned int);
unsigned int* d_inKey;
// allocate device raw buffers:
if(cudaSuccess != cudaMalloc((void **) &d_inKey, inLen * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
unsigned int* d_outKey;
if(cudaSuccess != cudaMalloc((void **) &d_outKey, inLen * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
unsigned int* d_outVal;
if(cudaSuccess != cudaMalloc((void **) &d_outVal, inLen * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
// fill input key:
cudaMemcpy(d_inKey, hostBuf, inLen * sizeof(unsigned int), cudaMemcpyHostToDevice);
thrust::device_vector<unsigned int> dinput(d_inKey, d_inKey + inLen);
thrust::device_vector<unsigned int> outKey(inLen);
thrust::device_vector<unsigned int> outVal(inLen);
unsigned int outLen = thrust::reduce_by_key(dinput.begin(), dinput.end(), thrust::constant_iterator<unsigned int>(1), outKey.begin(), outVal.begin()).first - outKey.begin();
// copy results back to raw pointers:
thrust::device_ptr<unsigned int> dd1(d_outKey);
thrust::copy( outKey.begin(), outKey.begin() + outLen, dd1);
thrust::device_ptr<unsigned int> dd2(d_outVal);
thrust::copy( outVal.begin(), outVal.begin() + outLen, dd2);
// copy results to host:
unsigned int* hostKey = new unsigned int[outLen];
cudaMemcpy(hostKey, d_outKey, outLen * sizeof(unsigned int), cudaMemcpyDeviceToHost);
unsigned int* hostVal = new unsigned int[outLen];
cudaMemcpy(hostVal, d_outVal, outLen * sizeof(unsigned int), cudaMemcpyDeviceToHost);
for(unsigned int idx = 0; idx < outLen; idx++){
std::cout << "idx " << idx << " out key: " << hostKey[idx] << " out val (=length): " << hostVal[idx] << "\n";
}
return 0;
}
I Would be grateful to have your advice about the reasons to the problem with the Maxwell devices.
Thanks,
Ronen Halevy.