Hi, I met some problems with calculating the prefix sum(inclusive scan). I used simple scan method. But I want to extend this program to scan large array(about 100,000,000). My idea is that we do the scan within each block cause we can only syncthreads in this scope. and than extract the last number of every blocks and then do the scan to these numbers and last add them to the previous array to get the final results. but I stuck at the first step :(.
I define every block has 1024 threads, and I initialized every number to 1. so I am supposed to get results like :
1,2,3 …1024,1,2,3…1024. but instead I got:
x1016 = 901
x1017 = 901
x1018 = 908
x1019 = 908
x1020 = 905
x1021 = 905
x1022 = 912
x1023 = 912
x1024 = 1
x1025 = 2
x1026 = 3
x1027 = 4
x1028 = 5
x1029 = 6
x1030 = 7
x1031 = 8
x1032 = 9
I think something is wrong with sycnthread. here is my code:
#include <iostream>
#include <time.h>
#define THREAD 1024
using namespace std;
// Kernel function to add the elements of two arrays
__global__
void scan(int n, float *x)
{
int thid = blockIdx.x * blockDim.x + threadIdx.x;
__syncthreads();
if(thid < n){
for(int k = 1; k <= threadIdx.x; k = k*2){
float temp1 = x[thid];
float temp2 = x[thid - k];
__syncthreads();
x[thid] = temp1 + temp2;
}
}
}
int main(void)
{
int n = 4000;
float *x, *d_x;
//Allocate memory on CPU
x = (float*)malloc(n * sizeof(float));
//Allocate memory on GPU
cudaMalloc(&d_x, n * sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < n; i++) {
x[i] = 1.0f;
}
//Copy memory from CPU to GPU
cudaMemcpy(d_x, x, n * sizeof(float), cudaMemcpyHostToDevice);
scan << <(n + THREAD - 1) / THREAD, THREAD >> >(n, d_x);
//Copy memory from GPU to CPU
cudaMemcpy(x, d_x, n * sizeof(float), cudaMemcpyDeviceToHost);
//Verify Results
for (int i = 0; i < n; i++){
cout<<"x"<<i<<" = "<<x[i]<<endl;
}
//Free memory on GPU
cudaFree(d_x);
//Free memory on CPU
free(x);
return 0;
}