CUB reduction with complex number and multiplication

Good morning,

I asked a similar question in another part of the forum but I think it is more suited in this part of the forum, so please forgive the repost.

I want to implement a reduction using CUDA CUB library. The problem is that the reduction is a multiplication rather than a summation as well as the datatype being employed is a complex value and not a simple double, float or integer. The HOST code that I want to convert to CUDA CUB follows:

#include <stdio.h>
#include <cuComplex.h>

int main(int argc, char **argv){
  cuComplex result;
  result.x = result.y = 1.0f;
  
  cuComplex *array;
  int numElems = 128;

  array = (cuComplex*)malloc(numElems*sizeof(cuComplex));

  // initialize with sample data
  for(int i = 0; i < numElems; ++i){
    array[i].x = i * 1.5f;
    array[i].y = 1.0f;
  }

  // -----------------------------------
  // how to convert this to CUDA CUB ??
  // -----------------------------------
  for(int i = 0; i < numElems; ++i){
    result.x = result.x * array[i].x;
    result.y = result.y * array[i].y;
  }
  // -----------------------------------
  // -----------------------------------

  free(array);

  return 0;
}

I just want to use CUB to execute the above for-loop. Any help is greatly appreciated.

Thank you.

I got this working with a great deal of help from Robert_Corvella in another area of this forum. THANK you a million times to Robert.

Below is the final working code employing CUB, not exactly like what original code post was but hopefully it will be of some help to someone else out there in same boat.

#include <stdio.h>
#include <cuComplex.h>
#include "cub/cub.cuh"  // assumes CUB is in current directory

struct ComplexProd{
  __device__ __forceinline__
  cuComplex operator()(cuComplex &a, cuComplex &b){
    a.x = a.x * b.x;
    a.y = a.y * b.y;
    return a;
  }
};

void computeReduction(const int N, cuComplex *d_a, cuComplex *d_b){
  // temporary variable employed for CUDA CUB
  ComplexProd prod;
  cuComplex init;
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
    
  // set initial value
  init.x = init.y = 1.0f;

  // determine temporary device storage requirements
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_b, N, prod, init);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
    
  // run actual reduction via CUB - result is stored in d_b
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_b, N, prod, init);
  cudaFree(&d_temp_storage);
}

int main(){
  const int numElems = 128;
  cuComplex *d_a, *d_b;
  cuComplex *h_a, h_b;
  cuComplex chk;

  h_a = (cuComplex*)malloc(numElems*sizeof(cuComplex));
  cudaMalloc((void**)&d_a, numElems*sizeof(cuComplex));
 
  // initialize
  h_b.x = h_b.y = 1.0f;
  for(int i = 0; i < numElems; ++i){
    h_a[i].x = (float)(i + 1);
    h_a[i].y = 1.0f;
  }

  // copy to DEVICE
  cudaMemcpy(d_a, h_a, numElems*sizeof(cuComplex), cudaMemcpyHostToDevice);
  
  // call using CUB
  computeReduction(const int N, cuComplex *d_a, cuComplex *d_b);

  // get result from DEVICE
  cudaMemcpy(&chk, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);

  // compute on HOST
  for(int i = 0; i < numElems; ++i){
    h_b.x *= h_a[i].x;
    h_b.y *= h_a[i].y;
  }

  print("HOST: %0.4f, DEVICE: %0.4f\n", h_b.x, chk.x);

  return 0;
}

Hello again,

Looks like I posted the solution too soon. I messed up on the complex multiplication but made the changes that I think I need ? Can anyone tell me if the following modified code will execute a CUDA CUB reduction properly using complex numbers?

#include <stdio.h>
#include <cuComplex.h>
#include "cub/cub.cuh"  // assumes CUB is in current directory

struct ComplexProd{
  __device__ __forceinline__
  cuComplex operator()(cuComplex &a, cuComplex &b){
    // -------------------------------------------
    // properly defined complex multiplication ?
    a.x = a.x * b.x - a.y * b.y;
    a.y = a.y * b.x + a.x * b.y;
    // -------------------------------------------
    //a.x = a.x * b.x;
    //a.y = a.y * b.y;
    return a;
  }
};

void computeReduction(const int N, cuComplex *d_a, cuComplex *d_b){
  // temporary variable employed for CUDA CUB
  ComplexProd prod;
  cuComplex init;
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
    
  // set initial value
  init.x = init.y = 1.0f;

  // determine temporary device storage requirements
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_b, N, prod, init);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
    
  // run actual reduction via CUB - result is stored in d_b
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_a, d_b, N, prod, init);
  cudaFree(&d_temp_storage);
}

int main(){
  const int numElems = 128;
  cuComplex *d_a, *d_b;
  cuComplex *h_a, h_b;
  cuComplex chk;

  h_a = (cuComplex*)malloc(numElems*sizeof(cuComplex));
  cudaMalloc((void**)&d_a, numElems*sizeof(cuComplex));
 
  // initialize
  h_b.x = h_b.y = 1.0f;
  for(int i = 0; i < numElems; ++i){
    h_a[i].x = (float)(i + 1);
    h_a[i].y = 1.0f;
  }

  // copy to DEVICE
  cudaMemcpy(d_a, h_a, numElems*sizeof(cuComplex), cudaMemcpyHostToDevice);
  
  // call using CUB
  computeReduction(const int N, cuComplex *d_a, cuComplex *d_b);

  // get result from DEVICE
  cudaMemcpy(&chk, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);

  // compute on HOST
  for(int i = 0; i < numElems; ++i){
    // ------------------------------------------
    // proper multiplication of complex numbers ?
    h_b.x = h_b.x * h_a[i].x - h_b.y * h_a[i].y;
    h_b.y = h_b.y * h_a[i].x + h_b.x * h_a[i].y;
    // ------------------------------------------
    //h_b.x *= h_a[i].x;
    //h_b.y *= h_a[i].y;
  }

  print("HOST: %0.4f, DEVICE: %0.4f\n", h_b.x, chk.x);

  return 0;
}

no that won’t work. you need a temporary variable to hold your results in while you are calculating. Once you overwrite a.x in the first line of calculation, you will comptue the wrong thing in the second line of calculation.

Thank you for the fast response.

Something like the following for complex product then?

...
struct ComplexProd{
  __device__ __forceinline__
  cuComplex operator()(cuComplex &a, cuComplex &b){
    // added temp. variable to store results while calculating
    cuComplex tmp;
    tmp.x = a.x * b.x - a.y * b.y;
    tmp.y = a.y * b.x + a.x * b.y;
    a.x = tmp.x;
    a.y = tmp.y;

    // -------------------------------------------
    // properly defined complex multiplication ?
    //a.x = a.x * b.x - a.y * b.y;
    //a.y = a.y * b.x + a.x * b.y;
    // -------------------------------------------
    //a.x = a.x * b.x;
    //a.y = a.y * b.y;
    return a;
  }
};
...

sure, that would be better, if it were me, I would just return tmp, but it shouldn’t matter - the compiler should optimize either case.

This doesn’t really have anything to do with CUDA, right? We’re just discussing how to get the correct result from a single thread of C++ code.

Thanks for the help.

Well, it has to do with creating a custom functor that is called from CUDA CUB for a reduction.