Boolean array packed into 32 bit registers?

Hey experts,

Is there a fast way to pack a boolean array into an array of 32-bit integers, so that I can apply instructions such as __popc or __clz to each packed word?

I don’t mind writing PTX if it gives me better performance :)

Here is my attempt, using warp shuffles:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cstdint>
#include <iostream>

// Packs one boolean per lane into a single 32-bit word.
//
// Bit k of the result is bit 0 of the `cval` supplied by lane k of the calling
// warp. Only the least significant bit of `cval` is used.
//
// Preconditions:
//   * All 32 lanes of the warp must call this function together: the vote
//     below names every lane in its participation mask (0xffffffff).
//
// Returns the packed word. Every lane receives the same value.
//
// Implementation note: this uses a single warp vote instead of a five-step
// XOR butterfly built on the legacy mask-less __shfl_xor. The legacy shuffle
// is deprecated and not available when compiling for Volta and newer, and the
// butterfly only produced the correctly ordered word on lane 0.
__device__ __forceinline__ uint32_t warp_pack32(uint8_t cval) {
  const unsigned int full_warp = 0xffffffffu;
  return __ballot_sync(full_warp, cval & 1u);
}

// Packs `size` booleans from `in` into 32-bit words in `out`.
//
// Launch layout: 1D grid of 1D blocks; blockDim.x must be a multiple of 32 so
// that every warp is complete when it reaches warp_pack32. The grid must cover
// at least `size` threads.
//
// in   - `size` bytes; only bit 0 of each byte is used.
// out  - receives one word per group of 32 inputs, i.e. (size + 31) / 32 words.
//        Bit k of out[w] is bit 0 of in[32 * w + k]; positions at or beyond
//        `size` are packed as 0.
// size - number of valid elements in `in`.
__global__ void kernel_pack32(const uint8_t* in, uint32_t* out, unsigned int size) {
  const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

  // Out-of-range lanes still take part in the warp-wide pack, contributing 0.
  const uint32_t val = warp_pack32((i < size) ? in[i] & 1 : 0);

  // One write per warp (not per block), so blocks holding several warps are
  // handled too. For the first lane of a warp, i is the index of the first
  // input covered by that warp; the `i < size` test keeps warps that lie
  // entirely past the input from writing beyond the end of `out`.
  if ((threadIdx.x & 31u) == 0 && i < size) out[i >> 5] = val;
}

// Host wrapper: launches kernel_pack32 on the default stream to pack `size`
// booleans from `in` into (size + 31) / 32 words in `out`.
//
// The launch is asynchronous and this function does not check for errors;
// the caller is expected to query cudaGetLastError() and synchronize before
// reading `out`.
__host__ void pack32(const uint8_t* in, uint32_t* out, unsigned int size) {
  // A grid with zero blocks is an invalid launch configuration, and there is
  // nothing to pack anyway.
  if (size == 0) return;

  // Ceiling division written so that it cannot wrap for sizes near UINT_MAX
  // (the usual (size + 31) / 32 form would).
  const unsigned int blocks = size / 32u + ((size % 32u != 0u) ? 1u : 0u);

  // One warp per block.
  kernel_pack32<<<blocks, 32>>>(in, out, size);
}

// Demo driver: expands a known 32-bit pattern into a boolean array (least
// significant bit first), packs it back on the GPU and prints both values.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
  const int n = 32; // # of bits
  const int words = (n + 31) / 32; // # of packed output words (rounded up)

  // Reports a failed CUDA call on stderr; returns true when the call succeeded.
  auto succeeded = [](cudaError_t err, const char* what) -> bool {
    if (err == cudaSuccess) return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
  };

  uint8_t*  in  = nullptr;
  uint32_t* out = nullptr;
  if (!succeeded(cudaMallocManaged(&in , n*sizeof(uint8_t)), "cudaMallocManaged(in)") ||
      !succeeded(cudaMallocManaged(&out, words*sizeof(uint32_t)), "cudaMallocManaged(out)")) {
    // cudaFree accepts a null pointer, so this is safe whichever call failed.
    cudaFree(in);
    cudaFree(out);
    return 1;
  }

  const uint32_t expected = 0x12345678UL;

  // make boolean array: in[i] holds bit i of `expected`
  uint32_t tmp = expected;
  for ( int i = 0; i < n; ++i ) {
    in[i] = tmp & 1;
    tmp >>= 1;
  }

  pack32(in, out, n);

  // Launch-configuration errors are reported by cudaGetLastError(); errors
  // raised while the kernel runs surface at the synchronization point. The
  // synchronization is also required before the host reads managed memory.
  const bool ran = succeeded(cudaGetLastError(), "pack32 launch") &&
                   succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  if ( ran ) {
    std::cout << "expected: " << std::hex << expected << std::endl;
    std::cout << "actual:   " << std::hex << out[0]   << std::endl;
  }

  cudaFree(in);
  cudaFree(out);
  return ran ? 0 : 1;
}