Larger than 64-bit integer

I want to count with very big integers, larger than 64 bits.


Is there any way in CUDA to count with integers larger than 64 bits?
If it's possible, can you give me instructions on how to do this?

(unsigned) long long int provides 64-bits. There is no built-in non-vector integer type that is wider than 64 bits. However, you could easily build your own 128-bit integer type. For example:

/* Unsigned 128-bit integer represented as two unsigned long long halves
   (64 bits each, per the surrounding discussion). */
typedef struct {

  unsigned long long int lo;  /* least-significant half */

  unsigned long long int hi;  /* most-significant half; receives the carry out of lo */

} my_uint128;

/*
 * Add two my_uint128 values and return the sum.
 *
 * The halves are added separately with unsigned (wrapping) arithmetic and
 * the carry out of the low half is propagated into the high half.  A carry
 * out of the high half is discarded, so the result wraps around.
 */
my_uint128 add_uint128 (my_uint128 a, my_uint128 b)
{
  my_uint128 res;

  res.lo = a.lo + b.lo;
  /* Unsigned addition wraps, so the low sum ends up smaller than an
     operand exactly when a carry out of the low half occurred. */
  res.hi = a.hi + b.hi + (res.lo < a.lo);
  return res;
}

If a higher performance solution is desired, consider mapping a 128-bit integer to a uint4 and using inline PTX for more efficient handling of the carries between the four 32-bit chunks.

That's exactly what I want. Thank you very much!

More questions: :-)

1. How can I display the value in the decimal number system?

2. If I want to allocate an N-element my_uint128 array on my device, do I have to use this code?
cudaMalloc( (void**)&deviceArray, (N * sizeof(my_uint128)) )

Your cudaMalloc() call looks OK. Here is some code that does conversions. This is rather simplistic and will not win any speed records, and it does not provide industrial-strength robustness (for example, conversion from my_uint128 is limited to numbers < 10**38 instead of the full range). But it should be sufficient as a starting point for your own work.

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#define TRIALS 1000000  /* number of random string round-trips performed by main() */

/* Unsigned 128-bit integer represented as two unsigned long long halves
   (64 bits each, per the surrounding discussion). */
typedef struct {

  unsigned long long int lo;  /* least-significant half */

  unsigned long long int hi;  /* most-significant half; receives carries/borrows from lo */

} my_uint128;

/*
 * Return a + b for two my_uint128 values.
 *
 * Each half is added with wrapping unsigned arithmetic; the carry out of
 * the low half is added into the high half.  A carry out of the high half
 * is dropped, so the sum wraps around.
 */
my_uint128 add_my_uint128 (my_uint128 a, my_uint128 b)
{
  my_uint128 res;

  res.lo = a.lo + b.lo;
  /* The wrapped low sum is smaller than a.lo exactly when a carry occurred. */
  res.hi = a.hi + b.hi + (res.lo < a.lo);
  return res;
}

/*
 * Return a - b for two my_uint128 values.
 *
 * Each half is subtracted with wrapping unsigned arithmetic; a borrow out
 * of the low half is taken from the high half.  If b is larger than a the
 * result wraps around.
 */
my_uint128 sub_my_uint128 (my_uint128 a, my_uint128 b)
{
  my_uint128 res;

  res.lo = a.lo - b.lo;
  /* The wrapped low difference is larger than a.lo exactly when a.lo < b.lo,
     i.e. when a borrow from the high half is required. */
  res.hi = a.hi - b.hi - (res.lo > a.lo);
  return res;
}

/*
 * Shift a left by s bit positions and return the result.
 *
 * s <= 0      : a is returned unchanged (a shift by a negative count would
 *               otherwise be undefined behaviour).
 * 0 < s < 64  : bits shifted out of the low half move into the high half.
 * 64 <= s<128 : the low half becomes the high half, shifted by s - 64.
 * s >= 128    : every bit is shifted out, the result is zero.
 *
 * The code treats each half as exactly 64 bits wide.
 */
my_uint128 shl_my_uint128 (my_uint128 a, int s)
{
  if (s <= 0) {
    return a;
  }
  if (s >= 128) {
    a.hi = 0;
    a.lo = 0;
  } else if (s >= 64) {
    /* Shifting a 64-bit operand by 64 or more is undefined, so move the
       whole low half up and shift only by the remainder. */
    a.hi = a.lo << (s - 64);
    a.lo = 0;
  } else {
    /* 1 <= s <= 63 here, so (64 - s) is a valid shift count as well. */
    a.hi = (a.hi << s) | (a.lo >> (64 - s));
    a.lo = a.lo << s;
  }
  return a;
}

/*
 * Return a * 10, computed as (a << 3) + (a << 1), i.e. 8a + 2a.
 * Bits shifted or carried out of the top are lost, so the product wraps.
 */
my_uint128 mul10_my_uint128 (my_uint128 a)
{
  my_uint128 times8 = shl_my_uint128 (a, 3);
  my_uint128 times2 = shl_my_uint128 (a, 1);

  return add_my_uint128 (times8, times2);
}

/*
 * Powers of ten from 10**0 through 10**37, indexed by exponent.
 * Each entry is written as {lo, hi} to match the member order of my_uint128.
 */
static const my_uint128 pwrten [] =
{
  {0x0000000000000001, 0x0000000000000000}, /* 10**0  */
  {0x000000000000000a, 0x0000000000000000}, /* 10**1  */
  {0x0000000000000064, 0x0000000000000000}, /* 10**2  */
  {0x00000000000003e8, 0x0000000000000000}, /* 10**3  */
  {0x0000000000002710, 0x0000000000000000}, /* 10**4  */
  {0x00000000000186a0, 0x0000000000000000}, /* 10**5  */
  {0x00000000000f4240, 0x0000000000000000}, /* 10**6  */
  {0x0000000000989680, 0x0000000000000000}, /* 10**7  */
  {0x0000000005f5e100, 0x0000000000000000}, /* 10**8  */
  {0x000000003b9aca00, 0x0000000000000000}, /* 10**9  */
  {0x00000002540be400, 0x0000000000000000}, /* 10**10 */
  {0x000000174876e800, 0x0000000000000000}, /* 10**11 */
  {0x000000e8d4a51000, 0x0000000000000000}, /* 10**12 */
  {0x000009184e72a000, 0x0000000000000000}, /* 10**13 */
  {0x00005af3107a4000, 0x0000000000000000}, /* 10**14 */
  {0x00038d7ea4c68000, 0x0000000000000000}, /* 10**15 */
  {0x002386f26fc10000, 0x0000000000000000}, /* 10**16 */
  {0x016345785d8a0000, 0x0000000000000000}, /* 10**17 */
  {0x0de0b6b3a7640000, 0x0000000000000000}, /* 10**18 */
  {0x8ac7230489e80000, 0x0000000000000000}, /* 10**19 */
  {0x6bc75e2d63100000, 0x0000000000000005}, /* 10**20 */
  {0x35c9adc5dea00000, 0x0000000000000036}, /* 10**21 */
  {0x19e0c9bab2400000, 0x000000000000021e}, /* 10**22 */
  {0x02c7e14af6800000, 0x000000000000152d}, /* 10**23 */
  {0x1bcecceda1000000, 0x000000000000d3c2}, /* 10**24 */
  {0x161401484a000000, 0x0000000000084595}, /* 10**25 */
  {0xdcc80cd2e4000000, 0x000000000052b7d2}, /* 10**26 */
  {0x9fd0803ce8000000, 0x00000000033b2e3c}, /* 10**27 */
  {0x3e25026110000000, 0x00000000204fce5e}, /* 10**28 */
  {0x6d7217caa0000000, 0x00000001431e0fae}, /* 10**29 */
  {0x4674edea40000000, 0x0000000c9f2c9cd0}, /* 10**30 */
  {0xc0914b2680000000, 0x0000007e37be2022}, /* 10**31 */
  {0x85acef8100000000, 0x000004ee2d6d415b}, /* 10**32 */
  {0x38c15b0a00000000, 0x0000314dc6448d93}, /* 10**33 */
  {0x378d8e6400000000, 0x0001ed09bead87c0}, /* 10**34 */
  {0x2b878fe800000000, 0x0013426172c74d82}, /* 10**35 */
  {0xb34b9f1000000000, 0x00c097ce7bc90715}, /* 10**36 */
  {0x00f436a000000000, 0x0785ee10d5da46d9}, /* 10**37 */
};

/* Index of the last pwrten entry, i.e. the largest tabulated exponent. */
#define MAX_PWR ((int)(sizeof(pwrten)/sizeof(pwrten[0]))-1)

/* Number of pwrten entries; main() uses it as the digit count of its test strings. */
#define DIGITS  (MAX_PWR+1)

/*
 * Convert a to a NUL-terminated decimal string written to cp.
 *
 * a  : value to convert; must be smaller than 10 * pwrten[MAX_PWR], because
 *      every decimal digit is built from only four binary quotient bits.
 * cp : destination buffer; receives at most MAX_PWR + 1 digits plus the
 *      terminating NUL.  Leading zeros are suppressed; zero prints as "0".
 *
 * Each digit is produced by a small binary long division: for the current
 * power of ten p, the multiples 8p, 4p, 2p and 1p are subtracted in turn
 * whenever they still fit, and each successful subtraction contributes one
 * bit to the digit.
 */
void cvt_my_uint128_to_str (my_uint128 a, char *cp)
{
  my_uint128 t;
  int pwr, bit, digit, fits;
  int non_zero = 0;  /* becomes non-zero once a significant digit was seen */

  for (pwr = MAX_PWR; pwr >= 0; pwr--) {
    digit = 0;
    for (bit = 3; bit >= 0; bit--) {
      t = shl_my_uint128 (pwrten[pwr], bit);
      /* Compare before subtracting, instead of subtracting and inspecting
         the sign bit through an implementation-defined signed cast. */
      fits = (a.hi > t.hi) || (a.hi == t.hi && a.lo >= t.lo);
      digit = (digit << 1) | fits;
      if (fits) {
        a = sub_my_uint128 (a, t);
      }
    }
    non_zero |= digit;
    /* Skip leading zeros, but always emit the units digit. */
    if (non_zero || pwr == 0) {
      *cp++ = (char)('0' + digit);
    }
  }
  *cp = 0;
}

my_uint128 cvt_str_to_my_uint128 (char *cp)


  my_uint128 a = {0, 0};

  my_uint128 t = {0, 0};

  while (*cp) {

    a = mul10_my_uint128 (a);

    t.lo = *cp++ - '0';

    a = add_my_uint128 (a, t);


  return a;


/*
 * Fill cp with a random decimal number of exactly `digits` characters,
 * followed by a terminating NUL.
 *
 * cp     : destination buffer with room for digits + 1 characters.
 * digits : number of digit characters to generate; for values <= 0 only
 *          the terminator is written.
 *
 * The first digit is drawn from 1..9 so the number has no leading zero;
 * all remaining digits are drawn from 0..9 using rand().
 */
void rand_num_str (char *cp, int digits)
{
  int i;

  for (i = 0; i < digits; i++) {
    int d;

    if (i == 0) {
      d = rand() % 9 + 1;   /* leading digit: never zero */
    } else {
      d = rand() % 10;
    }
    *cp++ = (char)('0' + d);
  }
  *cp = 0;
}

int main (void)


  my_uint128 a;

  int i;  

  char in_str[DIGITS+1];

  char out_str[DIGITS+1];

  int errors = 0;

  printf ("^^^^ DIGITS=%d\n", DIGITS);

  for (i = 0; i < TRIALS; i++) {

    rand_num_str (in_str, DIGITS);

    a = cvt_str_to_my_uint128 (in_str);

    cvt_my_uint128_to_str (a, out_str);

    if (strcmp (in_str, out_str) != 0) {

      printf ("!!!! error: in_str = %s  out_str=%s\n", in_str, out_str);




  printf ("@@@@ %d errors in %d trials\n", errors, TRIALS);

  printf ("&&&& my_uint128 conversion test %s\n", errors ? "FAILED":"PASSED");

  return EXIT_SUCCESS;


Works fine! :D
You helped me a lot.
Thank you very much!

Have you considered using logarithms?