Hey experts,
Is there a fast way to pack a boolean array (one byte per flag) into an array of 32-bit integers, so that I can then apply intrinsics such as __popc or __clz to each packed word?
I don’t mind writing PTX if it gives me better performance :)
Hey experts,
Is there a fast way to pack a boolean array (one byte per flag) into an array of 32-bit integers, so that I can then apply intrinsics such as __popc or __clz to each packed word?
I don’t mind writing PTX if it gives me better performance :)
Here is my attempt using warp shuffles:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cstdint>
#include <iostream>
// Packs one flag per lane into a 32-bit word using an XOR-butterfly of warp
// shuffles: after the five exchange steps, bit k of lane 0's result holds the
// flag supplied by lane k.
//
// cval: this lane's flag; expected to be 0 or 1 (any higher bits would be
//       shifted and OR-ed into the result as well).
// Returns: on lane 0, the packed word in natural bit order. Other lanes end up
//          with a lane-permuted arrangement of the same bits (bit j of lane L
//          comes from lane L ^ j), so callers should consume lane 0's value.
//
// All 32 lanes of the warp must call this function together: the shuffles name
// the full warp in their participant mask.
// NOTE(review): for 0/1 inputs, __ballot_sync(0xffffffff, cval) should produce
// the same lane-0 word with a single instruction — worth considering if the
// per-lane permuted results are not relied on anywhere.
__device__ __forceinline__ uint32_t warp_pack32(uint8_t cval) {
    // The legacy mask-less __shfl_xor is not available on Volta and newer;
    // the *_sync variant with an explicit mask is required there.
    const unsigned int full_warp = 0xffffffffu;
    uint32_t val = cval;
    val |= __shfl_xor_sync(full_warp, val << 16, 16);
    val |= __shfl_xor_sync(full_warp, val <<  8,  8);
    val |= __shfl_xor_sync(full_warp, val <<  4,  4);
    val |= __shfl_xor_sync(full_warp, val <<  2,  2);
    val |= __shfl_xor_sync(full_warp, val <<  1,  1);
    return val;
}
// Packs `size` byte-sized flags from `in` into 32-bit words in `out`:
// bit k of out[w] is the low bit of in[32 * w + k].
//
// Launch layout: 1D grid of 1D blocks; blockDim.x must be a multiple of 32 so
// that every warp is complete and starts on a 32-element boundary. Each warp
// produces one output word.
// in:   `size` input bytes; only bit 0 of each byte is used.
// out:  must hold at least ceil(size / 32) words.
// size: number of input elements.
__global__ void kernel_pack32(const uint8_t* in, uint32_t* out, unsigned int size) {
    const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
    // Out-of-range lanes still take part in the warp exchange (contributing 0)
    // instead of exiting early, because warp_pack32 needs the whole warp.
    const uint32_t val = warp_pack32((i < size) ? in[i] & 1 : 0);
    // Lane 0 of each warp owns the result. Indexing by i / 32 (rather than by
    // blockIdx.x) keeps this correct for blocks holding more than one warp, and
    // the i < size guard stops a fully out-of-range warp from writing past the
    // ceil(size / 32) output words.
    if ((threadIdx.x & 31u) == 0 && i < size) out[i / 32] = val;
}
// Host wrapper: launches kernel_pack32 on the default stream with one
// 32-thread block (a single warp) per output word.
//
// in:   device-accessible array of `size` bytes.
// out:  device-accessible array of at least ceil(size / 32) words.
// size: number of input elements; 0 is a no-op.
//
// The launch is asynchronous and this function reports no status; callers
// should check cudaGetLastError() and synchronize before reading `out`.
__host__ void pack32(const uint8_t* in, uint32_t* out, unsigned int size) {
    // A zero-block grid is an invalid launch configuration, so skip it.
    if (size == 0) return;
    // Ceil-div written so it cannot wrap when size is close to UINT_MAX
    // (the (size + 31) / 32 form overflows there and yields 0 blocks).
    const unsigned int blocks = size / 32 + ((size % 32 != 0) ? 1u : 0u);
    kernel_pack32<<<blocks, 32>>>(in, out, size);
}
// Self-check: expands a known 32-bit pattern into one byte per bit, packs it
// back on the GPU with pack32, and prints the expected and actual words.
// Returns 0 when the packed word matches, 1 on a CUDA error or a mismatch.
int main() {
    const int n = 32; // # of bits
    const int words = (n + 31) / 32; // round up so a partial last word still gets storage

    // Reports a failed CUDA call on stderr; returns true on success.
    auto ok = [](cudaError_t err, const char* what) -> bool {
        if (err == cudaSuccess) return true;
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        return false;
    };

    uint8_t* in = nullptr;
    uint32_t* out = nullptr;
    if (!ok(cudaMallocManaged(&in, n * sizeof(uint8_t)), "cudaMallocManaged(in)")) {
        return 1;
    }
    if (!ok(cudaMallocManaged(&out, words * sizeof(uint32_t)), "cudaMallocManaged(out)")) {
        cudaFree(in);
        return 1;
    }

    uint32_t expected = 0x12345678UL;
    // make boolean array: in[i] is bit i of `expected`
    uint32_t tmp = expected;
    for (int i = 0; i < n; ++i) {
        in[i] = tmp & 1;
        tmp >>= 1;
    }

    pack32(in, out, n);
    // Launch-configuration errors surface through cudaGetLastError(); faults
    // during execution surface at the synchronize, which is also required
    // before the host reads the managed output buffer.
    bool good = ok(cudaGetLastError(), "kernel_pack32 launch")
             && ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    if (good) {
        std::cout << "expected: " << std::hex << expected << std::endl;
        std::cout << "actual: " << std::hex << out[0] << std::endl;
        good = (out[0] == expected);
    }

    cudaFree(in);
    cudaFree(out);
    return good ? 0 : 1;
}