thrust::reduce_by_key - issues with Maxwell devices

ronen567 · November 29, 2015, 2:34pm

Hi All,
Attached below is a piece of code which demonstrates a problem faced with thrust::reduce_by_key.
To sum my observations up:
For GTX980 or Titanx (Maxwell devices):

reduce_by_key doesn't work in case thrust::device_ptr are used to point on the input.
reduce_by_key works fine in case thrust::device_vector is used instead to hold the input.

For GTX780Ti, (a Kepler device), reduce_by_key works fine in both listed above cases.

Demonstrating example codes follow.

Note: PC Platform runs cuda 7.5 under linux centos 7.

Example 1: reduce_by_key with thrust::device_ptr. (gives the incorrect results with Maxwell devices)

Here’s rbk.cu:

#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>

int main(int argc, char **argv)
{

	if(cudaSuccess != cudaSetDevice((int)0)){
		std::cout << "cudaSetDevice err devId\n";
	}
	if(cudaSuccess !=cudaDeviceReset()){
		std::cout << "cudaDeviceReset err reset\n";
	}
	
	unsigned int hostBuf[] = {5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9};
	unsigned int inLen = sizeof(hostBuf)/sizeof(unsigned int);

 	unsigned int* d_inKey;

	// allocate device raw buffers:
	if(cudaSuccess !=  cudaMalloc((void **) &d_inKey, inLen * sizeof(unsigned int))){
		std::cout << "cudaMalloc err\n";
	}
	unsigned int* d_outKey;
	if(cudaSuccess !=  cudaMalloc((void **) &d_outKey, inLen * sizeof(unsigned int))){
		std::cout << "cudaMalloc err\n";
	}
	unsigned int* d_outVal;
	if(cudaSuccess !=  cudaMalloc((void **) &d_outVal, inLen * sizeof(unsigned int))){
		std::cout << "cudaMalloc err\n";
	}
	// fill input key:
	cudaMemcpy(d_inKey, hostBuf,  inLen * sizeof(unsigned int), cudaMemcpyHostToDevice);	

	// set thrust device ptrs: 
	thrust::device_ptr<unsigned int> d_inKeyPtr(d_inKey);
	thrust::device_ptr<unsigned int> d_outKeyPtr(d_outKey);
	thrust::device_ptr<unsigned int> d_outValPtr(d_outVal);
	thrust::pair<thrust::device_ptr<unsigned int>, thrust::device_ptr<unsigned int> > resultIterPair;
	
	thrust::constant_iterator<unsigned int> const_iter(1);
	unsigned int outLen = thrust::reduce_by_key(d_inKeyPtr, d_inKeyPtr + inLen, const_iter, d_outKeyPtr, d_outValPtr).first - d_outKeyPtr;
	
	// copy results back to host"
	unsigned int* outKey = new unsigned int[outLen];
	cudaMemcpy(outKey, d_outKey,  outLen * sizeof(unsigned int), cudaMemcpyDeviceToHost);
 	unsigned int* outVal = new unsigned int[outLen];
	cudaMemcpy(outVal, d_outVal,  outLen * sizeof(unsigned int), cudaMemcpyDeviceToHost);

	// print reduced keys:
 	for(unsigned int idx =0; idx < outLen; idx++){
	 	std::cout << "idx " << idx << " out key: " << outKey[idx] << " out val (=length): " << outVal[idx] << "\n";
	}
	
	return 0;
}

Some code orientation:
Line 23: Input sequence which is needed to be reduced(=keys).
Line 50: The input sequence related values are all 1s. Accordingly, results’ values will hold reduced sequence lengths.

Here are the execution’s prints, when gpu is gtx780ti:

idx 0 out key: 5 out val (=length): 2
idx 1 out key: 6 out val (=length): 3
idx 2 out key: 7 out val (=length): 4
idx 3 out key: 8 out val (=length): 5
idx 4 out key: 9 out val (=length): 3

Those are indeed the expected results.

Here are the execution’s prints, when gpu is gtx980 (same as with Titanx):

idx 0 out key: 5 out val (=length): 1

After modifying the above implementation by replacing device_prt with device_vector, the correct results were received on the Maxwell devices as well:

Here’s the modified code:
rbkVector.cu

#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>

int main(int argc, char **argv)
{

	if(cudaSuccess != cudaSetDevice((int)0)){
		std::cout << "cudaSetDevice err devId\n";
	}
	if(cudaSuccess !=cudaDeviceReset()){
		std::cout << "cudaDeviceReset err reset\n";
	}
	
	unsigned int hostBuf[] = {5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9};
	unsigned int inLen = sizeof(hostBuf)/sizeof(unsigned int);

 	unsigned int* d_inKey;

        // allocate device raw buffers:
	if(cudaSuccess !=  cudaMalloc((void **) &d_inKey, inLen * sizeof(unsigned int))){
		std::cout << "cudaMalloc err\n";
	}
	unsigned int* d_outKey;
	if(cudaSuccess !=  cudaMalloc((void **) &d_outKey, inLen * sizeof(unsigned int))){
		std::cout << "cudaMalloc err\n";
	}
	unsigned int* d_outVal;
	if(cudaSuccess !=  cudaMalloc((void **) &d_outVal, inLen * sizeof(unsigned int))){
		std::cout << "cudaMalloc err\n";
	}
	// fill input key:
	cudaMemcpy(d_inKey, hostBuf,  inLen * sizeof(unsigned int), cudaMemcpyHostToDevice);	

	thrust::device_vector<unsigned int> dinput(d_inKey, d_inKey + inLen);
	thrust::device_vector<unsigned int> outKey(inLen);
	thrust::device_vector<unsigned int> outVal(inLen);   
	unsigned int outLen =  thrust::reduce_by_key(dinput.begin(), dinput.end(),      thrust::constant_iterator<unsigned int>(1), outKey.begin(), outVal.begin()).first - outKey.begin();   
  
	// copy results back to raw pointers:
	thrust::device_ptr<unsigned int> dd1(d_outKey);
	thrust::copy( outKey.begin(),  outKey.begin() + outLen, dd1);
	
	thrust::device_ptr<unsigned int> dd2(d_outVal);
	thrust::copy( outVal.begin(),  outVal.begin() + outLen, dd2);

// copy results to host:
 	unsigned int* hostKey =  new  unsigned int[outLen];

	cudaMemcpy(hostKey, d_outKey,  outLen * sizeof(unsigned int), cudaMemcpyDeviceToHost);	
 	unsigned int* hostVal =  new  unsigned int[outLen];
	cudaMemcpy(hostVal, d_outVal,  outLen * sizeof(unsigned int), cudaMemcpyDeviceToHost);	

	for(unsigned int idx = 0; idx < outLen; idx++){
		std::cout << "idx " << idx << " out key: " << hostKey[idx] << " out val (=length): " << hostVal[idx] << "\n";

	}  
	
	return 0;
}

I Would be grateful to have your advice about the reasons to the problem with the Maxwell devices.
Thanks,
Ronen Halevy.

Robert_Crovella · November 30, 2015, 8:21pm

I would suggest filing a bug at developer.nvidia.com

You can also send a report to the thrust-users google group mailing list:

[url]Redirecting to Google Groups

ronen567 · December 3, 2015, 10:07am

I have posted a bug and will update the forum about the resolution.

Vectorizer · February 11, 2016, 5:34pm

Hi @ronen567,

Did you get a resolution for this from Nvidia?
I have the same problem.
Is there a Nvidia problem ticket number for this?
Thanks

Robert_Crovella · February 11, 2016, 7:01pm

This is a confirmed bug and should be fixed in the CUDA 8.0 release. nv bug is 1709293

ronen567 · February 16, 2016, 4:21pm

Hi Vectorizer,
I actually received an email from nvidia by today, stating that “This problem could be reproduced in the CUDA 7.5 Production release, however, it has been fixed in our development versions for the next CUDA release.”
ronen halevy.
+
txbob, thanks for your post.

cysin · May 18, 2016, 9:51am

I had the same issue and wondering when a newer version of cuda will be released?

Robert_Crovella · May 18, 2016, 12:32pm

NVIDIA announced CUDA 8 at GTC, and CUDA 8 has completed the EA (early access) phase. The next phase for CUDA 8 should be RC (release candidate) at which point you could download and test it to see if the issue is fixed.

slstuart3 · June 30, 2016, 7:24pm

I had this same issue and using CUDA 8.0 RC resolved it.