I asked a similar question in another part of the forum but I think it is more suited in this part of the forum, so please forgive the repost.
I want to implement a reduction using CUDA CUB library. The problem is that the reduction is a multiplication rather than a summation as well as the datatype being employed is a complex value and not a simple double, float or integer. The HOST code that I want to convert to CUDA CUB follows:
#include <stdio.h>
#include <cuComplex.h>
int main(int argc, char **argv){
cuComplex result;
result.x = result.y = 1.0f;
cuComplex *array;
int numElems = 128;
array = (cuComplex*)malloc(numElems*sizeof(cuComplex));
// initialize with sample data
for(int i = 0; i < numElems; ++i){
array[i].x = i * 1.5f;
array[i].y = 1.0f;
}
// -----------------------------------
// how to convert this to CUDA CUB ??
// -----------------------------------
for(int i = 0; i < numElems; ++i){
result.x = result.x * array[i].x;
result.y = result.y * array[i].y;
}
// -----------------------------------
// -----------------------------------
free(array);
return 0;
}
I just want to use CUB to execute the above for-loop. Any help is greatly appreciated.
I got this working with a great deal of help from Robert_Corvella in another area of this forum. THANK you a million times to Robert.
Below is the final working code employing CUB, not exactly like what original code post was but hopefully it will be of some help to someone else out there in same boat.
#include <stdio.h>
#include <cuComplex.h>
#include "cub/cub.cuh" // assumes CUB is in current directory
struct ComplexProd{
__device__ __forceinline__
cuComplex operator()(cuComplex &a, cuComplex &b){
a.x = a.x * b.x;
a.y = a.y * b.y;
return a;
}
};
void computeReduction(const int N, cuComplex *d_a, cuComplex *d_b){
// temporary variable employed for CUDA CUB
ComplexProd prod;
cuComplex init;
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
// set initial value
init.x = init.y = 1.0f;
// determine temporary device storage requirements
cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_b, N, prod, init);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// run actual reduction via CUB - result is stored in d_b
cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_b, N, prod, init);
cudaFree(&d_temp_storage);
}
int main(){
const int numElems = 128;
cuComplex *d_a, *d_b;
cuComplex *h_a, h_b;
cuComplex chk;
h_a = (cuComplex*)malloc(numElems*sizeof(cuComplex));
cudaMalloc((void**)&d_a, numElems*sizeof(cuComplex));
// initialize
h_b.x = h_b.y = 1.0f;
for(int i = 0; i < numElems; ++i){
h_a[i].x = (float)(i + 1);
h_a[i].y = 1.0f;
}
// copy to DEVICE
cudaMemcpy(d_a, h_a, numElems*sizeof(cuComplex), cudaMemcpyHostToDevice);
// call using CUB
computeReduction(const int N, cuComplex *d_a, cuComplex *d_b);
// get result from DEVICE
cudaMemcpy(&chk, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);
// compute on HOST
for(int i = 0; i < numElems; ++i){
h_b.x *= h_a[i].x;
h_b.y *= h_a[i].y;
}
print("HOST: %0.4f, DEVICE: %0.4f\n", h_b.x, chk.x);
return 0;
}
Looks like I posted the solution too soon. I messed up on the complex multiplication but made the changes that I think I need ? Can anyone tell me if the following modified code will execute a CUDA CUB reduction properly using complex numbers?
#include <stdio.h>
#include <cuComplex.h>
#include "cub/cub.cuh" // assumes CUB is in current directory
struct ComplexProd{
__device__ __forceinline__
cuComplex operator()(cuComplex &a, cuComplex &b){
// -------------------------------------------
// properly defined complex multiplication ?
a.x = a.x * b.x - a.y * b.y;
a.y = a.y * b.x + a.x * b.y;
// -------------------------------------------
//a.x = a.x * b.x;
//a.y = a.y * b.y;
return a;
}
};
void computeReduction(const int N, cuComplex *d_a, cuComplex *d_b){
// temporary variable employed for CUDA CUB
ComplexProd prod;
cuComplex init;
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
// set initial value
init.x = init.y = 1.0f;
// determine temporary device storage requirements
cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_b, N, prod, init);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// run actual reduction via CUB - result is stored in d_b
cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_b, N, prod, init);
cudaFree(&d_temp_storage);
}
int main(){
const int numElems = 128;
cuComplex *d_a, *d_b;
cuComplex *h_a, h_b;
cuComplex chk;
h_a = (cuComplex*)malloc(numElems*sizeof(cuComplex));
cudaMalloc((void**)&d_a, numElems*sizeof(cuComplex));
// initialize
h_b.x = h_b.y = 1.0f;
for(int i = 0; i < numElems; ++i){
h_a[i].x = (float)(i + 1);
h_a[i].y = 1.0f;
}
// copy to DEVICE
cudaMemcpy(d_a, h_a, numElems*sizeof(cuComplex), cudaMemcpyHostToDevice);
// call using CUB
computeReduction(const int N, cuComplex *d_a, cuComplex *d_b);
// get result from DEVICE
cudaMemcpy(&chk, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);
// compute on HOST
for(int i = 0; i < numElems; ++i){
// ------------------------------------------
// proper multiplication of complex numbers ?
h_b.x = h_b.x * h_a[i].x - h_b.y * h_a[i].y;
h_b.y = h_b.y * h_a[i].x + h_b.x * h_a[i].y;
// ------------------------------------------
//h_b.x *= h_a[i].x;
//h_b.y *= h_a[i].y;
}
print("HOST: %0.4f, DEVICE: %0.4f\n", h_b.x, chk.x);
return 0;
}
no that won’t work. you need a temporary variable to hold your results in while you are calculating. Once you overwrite a.x in the first line of calculation, you will comptue the wrong thing in the second line of calculation.