CUDA Mersenne Twister re-written in OpenCL
Hello,

I am trying to re-write Nvidia's CUDA Mersenne Twister shared memory version in OpenCL. I basically needed it with generating on the fly random numbers rather than pre-generated ones.

this is essentially the CUDA version :

#define NVG80 /* For Nvidia G80 achitecture where mod is VERY slow */

#ifdef NVG80
#define mod(x, y) ((x) < (y) ? (x) : (x) - (y)) /* Short mod - known input range */
#else
#define mod(x, y) ((x) % (y))
#endif

#define N 624
#define M 397
#define INIT_MULT 1812433253 /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
#define ARRAY_SEED 19650218 /* Seed for initial setup before incorp array seed */
#define MATRIX_A 0x9908b0df /* Constant vector a */
#define UPPER_MASK 0x80000000 /* Most significant w-r bits */
#define LOWER_MASK 0x7fffffff /* Least significant r bits */
#define TEMPER1 0x9d2c5680
#define TEMPER2 0xefc60000

__shared__ int mtNexts; /* Start of next block of seeds */
__shared__ uint s_seeds[N + 1];

/* Init by single seed - single threaded as only used once */

__device__ void mt19937si(uint seed)
{
int i;

if (threadIdx.x == 0)
{
mtNexts = 0;
s_seeds[0] = seed;
for (i = 1; i < N; i++)
{
seed = (INIT_MULT * (seed ^ (seed >> 30)) + i);
s_seeds[i] = seed;
}
}
__syncthreads(); /* Ensure mtNexts set & needed for mt19937w() */
return;
}


__device__ uint mt19937s(void)
{
int kk;
uint y;
const int tid = threadIdx.x;

kk = mod(mtNexts + tid, N);
__syncthreads(); /* Finished with mtNexts & s_seed[] ready from last run */

if (tid == blockDim.x - 1)
{
mtNexts = kk + 1; /* Will get modded on next call */
}
y = (s_seeds[kk] & UPPER_MASK) | (s_seeds[kk + 1] & LOWER_MASK);
y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ mag01[y & 1];
//y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ (y & 1 ? MATRIX_A : 0); // Same speed
__syncthreads(); /* All done before we update */

s_seeds[kk] = y;
if (kk == 0) /* Copy up for next round */
{
s_seeds[N] = y;
}
y ^= (y >> 11); /* Tempering */
y ^= (y << 7) & TEMPER1;
y ^= (y << 15) & TEMPER2;
y ^= (y >> 18);
return y;
}

For the OpenCL bit, I merged the two functions as I did not know how to maintain state between those functions and could not declare mtNexts and s_seeds as shared out of any function. I came up with the following in OpenCL :

#define NVG80 /* For Nvidia G80 achitecture where mod is VERY slow */

#ifdef NVG80
#define mod(x, y) ((x) < (y) ? (x) : (x) - (y)) /* Short mod - known input range */
#else
#define mod(x, y) ((x) % (y))
#endif


#define N 624
#define M 397
#define INIT_MULT 1812433253 /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
#define ARRAY_SEED 19650218 /* Seed for initial setup before incorp array seed */
#define MATRIX_A 0x9908b0df /* Constant vector a */
#define UPPER_MASK 0x80000000 /* Most significant w-r bits */
#define LOWER_MASK 0x7fffffff /* Least significant r bits */
#define TEMPER1 0x9d2c5680
#define TEMPER2 0xefc60000

__constant int mag01[2] = {0,0x9908b0df}; // you can't use MATRIX_A as you have __constant

unsigned int my_mt19937s(unsigned int seed) {

int mtNexts; /* Start of next block of seeds */

unsigned int s_seeds[N + 1];

int i;

if (get_local_id(0) == 0)
{
mtNexts = 0;
s_seeds[0] = seed;
for (i = 1; i < N; i++)
{
seed = (INIT_MULT * (seed ^ (seed >> 30)) + i);
s_seeds[i] = seed;
}
}
barrier(CLK_LOCAL_MEM_FENCE); /* Ensure mtNexts set & needed for mt19937w() */

int kk;
unsigned int y;
const int tid = get_local_id(0);

kk = mod(mtNexts + tid, N);
barrier(CLK_LOCAL_MEM_FENCE); /* Finished with mtNexts & s_seed[] ready from last run */

if (tid == get_local_size(0) - 1)
{
mtNexts = kk + 1; /* Will get modded on next call */
}
y = (s_seeds[kk] & UPPER_MASK) | (s_seeds[kk + 1] & LOWER_MASK);
y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ mag01[y & 1];
//y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ (y & 1 ? MATRIX_A : 0); // Same speed
barrier(CLK_LOCAL_MEM_FENCE); /* All done before we update */

s_seeds[kk] = y;
if (kk == 0) /* Copy up for next round */
{
s_seeds[N] = y;
}
y ^= (y >> 11); /* Tempering */
y ^= (y << 7) & TEMPER1;
y ^= (y << 15) & TEMPER2;
y ^= (y >> 18);

return y;

}

In my kernel function I call my_mt19937s(get_global_id(0)) 1000 times but I get a correct value only for the first time, and for the rest I get 0, any ideas what is going wrong? If you require any additional info please let me know.

Thanks,

- vihan
Hello,



I am trying to re-write Nvidia's CUDA Mersenne Twister shared memory version in OpenCL. I basically needed it with generating on the fly random numbers rather than pre-generated ones.



this is essentially the CUDA version :



#define NVG80 /* For Nvidia G80 achitecture where mod is VERY slow */



#ifdef NVG80

#define mod(x, y) ((x) < (y) ? (x) : (x) - (y)) /* Short mod - known input range */

#else

#define mod(x, y) ((x) % (y))

#endif



#define N 624

#define M 397

#define INIT_MULT 1812433253 /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */

#define ARRAY_SEED 19650218 /* Seed for initial setup before incorp array seed */

#define MATRIX_A 0x9908b0df /* Constant vector a */

#define UPPER_MASK 0x80000000 /* Most significant w-r bits */

#define LOWER_MASK 0x7fffffff /* Least significant r bits */

#define TEMPER1 0x9d2c5680

#define TEMPER2 0xefc60000



__shared__ int mtNexts; /* Start of next block of seeds */

__shared__ uint s_seeds[N + 1];



/* Init by single seed - single threaded as only used once */



__device__ void mt19937si(uint seed)

{

int i;



if (threadIdx.x == 0)

{

mtNexts = 0;

s_seeds[0] = seed;

for (i = 1; i < N; i++)

{

seed = (INIT_MULT * (seed ^ (seed >> 30)) + i);

s_seeds[i] = seed;

}

}

__syncthreads(); /* Ensure mtNexts set & needed for mt19937w() */

return;

}





__device__ uint mt19937s(void)

{

int kk;

uint y;

const int tid = threadIdx.x;



kk = mod(mtNexts + tid, N);

__syncthreads(); /* Finished with mtNexts & s_seed[] ready from last run */



if (tid == blockDim.x - 1)

{

mtNexts = kk + 1; /* Will get modded on next call */

}

y = (s_seeds[kk] & UPPER_MASK) | (s_seeds[kk + 1] & LOWER_MASK);

y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ mag01[y & 1];

//y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ (y & 1 ? MATRIX_A : 0); // Same speed

__syncthreads(); /* All done before we update */



s_seeds[kk] = y;

if (kk == 0) /* Copy up for next round */

{

s_seeds[N] = y;

}

y ^= (y >> 11); /* Tempering */

y ^= (y << 7) & TEMPER1;

y ^= (y << 15) & TEMPER2;

y ^= (y >> 18);

return y;

}



For the OpenCL bit, I merged the two functions as I did not know how to maintain state between those functions and could not declare mtNexts and s_seeds as shared out of any function. I came up with the following in OpenCL :



#define NVG80 /* For Nvidia G80 achitecture where mod is VERY slow */



#ifdef NVG80

#define mod(x, y) ((x) < (y) ? (x) : (x) - (y)) /* Short mod - known input range */

#else

#define mod(x, y) ((x) % (y))

#endif





#define N 624

#define M 397

#define INIT_MULT 1812433253 /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */

#define ARRAY_SEED 19650218 /* Seed for initial setup before incorp array seed */

#define MATRIX_A 0x9908b0df /* Constant vector a */

#define UPPER_MASK 0x80000000 /* Most significant w-r bits */

#define LOWER_MASK 0x7fffffff /* Least significant r bits */

#define TEMPER1 0x9d2c5680

#define TEMPER2 0xefc60000



__constant int mag01[2] = {0,0x9908b0df}; // you can't use MATRIX_A as you have __constant



unsigned int my_mt19937s(unsigned int seed) {



int mtNexts; /* Start of next block of seeds */



unsigned int s_seeds[N + 1];



int i;



if (get_local_id(0) == 0)

{

mtNexts = 0;

s_seeds[0] = seed;

for (i = 1; i < N; i++)

{

seed = (INIT_MULT * (seed ^ (seed >> 30)) + i);

s_seeds[i] = seed;

}

}

barrier(CLK_LOCAL_MEM_FENCE); /* Ensure mtNexts set & needed for mt19937w() */



int kk;

unsigned int y;

const int tid = get_local_id(0);



kk = mod(mtNexts + tid, N);

barrier(CLK_LOCAL_MEM_FENCE); /* Finished with mtNexts & s_seed[] ready from last run */



if (tid == get_local_size(0) - 1)

{

mtNexts = kk + 1; /* Will get modded on next call */

}

y = (s_seeds[kk] & UPPER_MASK) | (s_seeds[kk + 1] & LOWER_MASK);

y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ mag01[y & 1];

//y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ (y & 1 ? MATRIX_A : 0); // Same speed

barrier(CLK_LOCAL_MEM_FENCE); /* All done before we update */



s_seeds[kk] = y;

if (kk == 0) /* Copy up for next round */

{

s_seeds[N] = y;

}

y ^= (y >> 11); /* Tempering */

y ^= (y << 7) & TEMPER1;

y ^= (y << 15) & TEMPER2;

y ^= (y >> 18);



return y;



}



In my kernel function I call my_mt19937s(get_global_id(0)) 1000 times but I get a correct value only for the first time, and for the rest I get 0, any ideas what is going wrong? If you require any additional info please let me know.



Thanks,



- vihan

#1
Posted 02/17/2012 03:50 PM   
Never mind, I sorted it out.

- vihan
Never mind, I sorted it out.



- vihan

#2
Posted 02/23/2012 09:18 PM   
Scroll To Top