Why is my shared memory example not faster?
I made up an example to see if shared memory is really faster.

The two programs are x.cu and y.cu.
They both return the same answer, as they should.
x.cu uses shared memory, so it must declare an array large enough to hold one
working set per thread.

y.cu just uses the local (stack-frame) device memory, so each thread
gets its own copy of the working set that way.

Both do a reduction of the data set, computed per thread id, into item 0.

The timing results are essentially the same, and so is the answer.
1 time x
2 2089967186485776.000000
3
4 real. 0m10.53s
5 user. 0m10.42s
6 sys. 0m 0.09s
7
8 time y
9 2089967186485776.000000
10
11 real. 0m 9.75s
12 user. 0m 9.68s
13 sys. 0m 0.06s

Here is the source for both

cat x.cu|greatawk 1 10000
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 #include <math.h>
5 #include "book.h"
6
7 enum {M=2000};
8 enum {N=1024};
9 struct workdef
10 {
11 . int i,j,k1,k2;
12 . double A;
13 . double B;
14 . double S;
15 };
16
17 __global__ void K(double (*D)[N])
18 {
19 . //int n = gridDim.x;
20 . //int m = blockDim.x;
21 . //int bid = blockIdx.x;
22 . int tid = threadIdx.x;

-------------------------------------------------
23 . __shared__ struct workdef Z[N];
24 . struct workdef *W = &Z[tid];
-------------------------------------------------

25 . {
26 . . W->S = tid;
27 . . W->A = 0.0;
28 . . W->B = 0.0;
29 . }
30 . __syncthreads();
31 . for ( W->i=0;W->i < M;(W->i)++)
32 . {
33 . . W->k1 = (tid + W->i) % N;
34 . . W->A= (double) W->i;
35 . . for ( W->j=0;W->j < M;W->j++)
36 . . {
37 . . . W->k2 = (tid + W->j) % N;
38 . . . W->B = (double) W->k2;
39 . . . W->S+=max(102.0,max(10.0,W->A * W->B) - max(20.0,W->A/ min(1.0,W->B+1.0)));
40 . . }
41 . }
42 . (*D)[tid] = W->S;
43 . __syncthreads();
------------------------------------
44 . {
45 . . int s;
46 . . for (
47 . . . s=N/2;
48 . . . s>0;
49 . . . s>>=1)
50 . . {
51 . . . if ( tid < s)
52 . . . {
53 . . . . (*D)[tid] += (*D)[tid+s];
54 . . . }
55 . . . __syncthreads();
56 . . }
57 . }
------------------------------------
58 }
59
60 int main( void )
61 {
62 . double (*D)[N] = 0;
63 . double H;
64 . size_t sz = sizeof( double [N]);
65
66 . HANDLE_ERROR( cudaMalloc( (void**)&D , sz ));
67
68 . K<<<1,N>>>(D);
69 . cudaThreadSynchronize(); /* sync up mapped mem with host */
70 . {
71 . . cudaError_t e= cudaGetLastError();
72 . . assert ( e == cudaSuccess);
73 . }
74
75 . HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
76 . printf("%lf\n", H);
77 . HANDLE_ERROR( cudaFree(D));
78 }



cat y.cu|greatawk 1 10000
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 #include <math.h>
5 #include "book.h"
6
7 enum {M=2000};
8 enum {N=1024};
9 struct workdef
10 {
11 . int i,j,k1,k2;
12 . double A;
13 . double B;
14 . double S;
15 };
16
17 __global__ void K(double (*D)[N])
18 {
19 . //int n = gridDim.x;
20 . //int m = blockDim.x;
21 . //int bid = blockIdx.x;
22 . int tid = threadIdx.x;

---------------------------------------------
23 . struct workdef Z[1];
24 . struct workdef *W = &Z[0];
---------------------------------------------

25 . //if( tid == 0)
26 . {
27 . . int i;
28 . . W->S = tid;
29 . . for ( i=0;i < N;i++)
30 . . {
31 . . . W->A = 0.0;
32 . . . W->B = 0.0;
33 . . }
34 . }
35 . __syncthreads();
36 . for ( W->i=0;W->i < M;(W->i)++)
37 . {
38 . . W->k1 = (tid + W->i) % N;
39 . . W->A= (double) W->i;
40 . . for ( W->j=0;W->j < M;W->j++)
41 . . {
42 . . . W->k2 = (tid + W->j) % N;
43 . . . W->B = (double) W->k2;
44 . . . W->S+=max(102.0,max(10.0,W->A * W->B) - max(20.0,W->A/ min(1.0,W->B+1.0)));
45 . . }
46 . }
47 . (*D)[tid] = W->S;
48 . __syncthreads();
------------------------------------
49 . {
50 . . int s;
51 . . for (
52 . . . s=N/2;
53 . . . s>0;
54 . . . s>>=1)
55 . . {
56 . . . if ( tid < s)
57 . . . {
58 . . . . (*D)[tid] += (*D)[tid+s];
59 . . . }
60 . . . __syncthreads();
61 . . }
62 . }
------------------------------------
63 }
64
65 int main( void )
66 {
67 . double (*D)[N] = 0;
68 . double H;
69 . size_t sz = sizeof( double [N]);
70
71 . HANDLE_ERROR( cudaMalloc( (void**)&D , sz ));
72
73 . K<<<1,N>>>(D);
74 . cudaThreadSynchronize(); /* sync up mapped mem with host */
75 . {
76 . . cudaError_t e= cudaGetLastError();
77 . . assert ( e == cudaSuccess);
78 . }
79
80 . HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
81 . printf("%lf\n", H);
82 . HANDLE_ERROR( cudaFree(D));
83 }


Can you explain what I have to do to get the big time difference that shared memory is supposed to
yield?

#1
Posted 04/23/2012 04:29 AM   
A couple of comments:

[list=1]
[*]Are you running the code on a device of compute capability 1.x? On 2.x and 3.0 global memory is cached, so shared memory is unlikely to be any faster (it may even be slower, since you have to manually load data into shared memory first).

[*]That said, the actual reduction is in global memory in both versions. So as written, only the first part of the kernel could be accelerated by the use of shared memory; the reduction itself could also be moved into shared memory (see the sketch after this list).

[*]Why are you using a [font="Courier New"]struct workdef[/font] in shared or local memory at all? All accesses to it are thread-local, so it would be most appropriate to place its contents in registers. In particular, the loop counters [font="Courier New"]W->i[/font] and [font="Courier New"]W->j[/font] should be simple automatic variables [font="Courier New"]int i[/font] and [font="Courier New"]int j[/font] in registers.

[*]However, in the second version the compiler is likely able to determine that by itself, so it actually places these variables in registers. Taking the address of [font="Courier New"]Z[/font] and manipulating it through the pointer would spoil this optimization on older compilers (and thus isn't good style), but the CUDA compilers are likely to notice that the pointer never leaks outside the kernel, so it can safely be optimized away.

[*]Taking the address in shared memory [font="Courier New"]*W = &Z[tid][/font] in the first example and manipulating contents through that pointer might be slower than directly using [font="Courier New"]Z[tid][/font], as there is direct support in the machine instruction set for the latter and the compiler has been made very smart about it, while it may or may not be able to recognize and optimize the former form.

[*]Thanks for going to great lengths to make the code readable on the forums. It looks even nicer if posted between [font="Courier New"][code[i][/i]]...[/co[u][/u]de][/font] tags.
[/list]
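A minimal sketch of that reduction point, reusing the names K, D, N and tid from the kernels above (untested, and the real compute loop is elided into a placeholder): the per-thread partial sums are staged in a [font="Courier New"]__shared__[/font] buffer and the tree reduction runs there, so only the final result is written to global memory.

[code]
__global__ void K(double (*D)[N])
{
    int tid = threadIdx.x;
    __shared__ double P[N];        /* one partial sum per thread: 8 KB for N=1024 */

    double S = tid;                /* placeholder: accumulate the real per-thread work into S here */

    P[tid] = S;                    /* stage the partial sum in shared memory */
    __syncthreads();

    for (int s = N/2; s > 0; s >>= 1)
    {
        if (tid < s)
            P[tid] += P[tid + s];  /* tree reduction entirely in shared memory */
        __syncthreads();           /* outside the if, so every thread reaches it */
    }

    if (tid == 0)
        (*D)[0] = P[0];            /* only the final result is written to global memory */
}
[/code]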

    Always check return codes of CUDA calls for errors. Do not use __syncthreads() in conditional code unless the condition is guaranteed to evaluate identically for all threads of each block. Run your program under cuda-memcheck to detect stray memory accesses. If your kernel dies for larger problem sizes, it might exceed the runtime limit and trigger the watchdog timer.

    #2
    Posted 04/23/2012 08:59 AM   
    First of all, thanks VERY much for your answers, your help, and the tip about how to get
    the forum to leave my spaces alone in my cut-and-paste program fragments.

    It is easy to lose sight of the point when we get into details, but my point is that I want a way to recast this
    silly make-work problem using shared memory so that it executes in less time than the "naive" approach that just uses the
    automatic stack-frame storage.



    1) I am running a C2050, which I think is compute capability 2.0, so my search for the canonical
    example of shared memory speedup is not going to be successful. THIS is probably my answer - right here.

    2) The reduction part is designed to be a minor part of the loop cost. Let's test that by increasing
    M, which makes the compute part a larger fraction of the total time.


    [code]
    1 recall that x.cu uses shared memory
    2 x.cu that uses shared memory
    3 --------------------------------------
    4 __shared__ struct workdef Z[N]; <---need array so each thread tid has unique W
    5 struct workdef *W = &Z[tid]; <----- ref by tid
    6 --------------------------------------
    7
    8 y.cu does not use shared memory. stack frame makes W distinct for each thread.
    9 --------------------------------------
    10 struct workdef Z[1];
    11 struct workdef *W = &Z[0];
    12 --------------------------------------


    1 original M
    2 ----------------------------------------
    3 1 grep -n 'M=' [xy].cu
    4 2 x.cu:7:enum {M=2000};
    5 3 y.cu:7:enum {M=2000};
    6 4
    7 5 make x.exe;make y.exe
    8 6 `x.exe' is up to date
    9 7 `y.exe' is up to date
    10 8
    11 9 time x
    12 10 2089967186485776.000000
    13 11
    14 12 real 0m10.51s
    15 13 user 0m10.42s
    16 14 sys 0m 0.09s
    17 15 time y
    18 16 2089967186485776.000000
    19 17
    20 18 real 0m 9.73s
    21 19 user 0m 9.62s
    22 20 sys 0m 0.07s
    23 ----------------------------------------
    24
    25
    26 increase M to make reduction at the end less significant
    27 ------------------------------------
    28 1 grep -n 'M=' [xy].cu
    29 2 x.cu:7:enum {M=3000};
    30 3 y.cu:7:enum {M=3000};
    31 4 make x.exe;make y.exe
    32 5 `x.exe' is up to date
    33 6 `y.exe' is up to date
    34 7
    35 8 time x
    36 9 7054814761966776.000000
    37 10
    38 11 real 0m23.52s
    39 12 user 0m23.44s
    40 13 sys 0m 0.07s
    41 14
    42 15 time y
    43 16 7054814761966776.000000
    44 17
    45 18 real 0m21.82s
    46 19 user 0m21.74s
    47 20 sys 0m 0.07s
    48 ------------------------------------
    [/code]

    Conclusion:
    The reduction is a small fraction of the total time, and x.cu and y.cu
    run in the same time.


    3) I am using a struct to contain the working set of variables so that
    I can make a distinction between the case where the working set is in shared
    memory and the case where it is not. That is the reason for this organization. If I use shared
    memory, of course, I need to make an array of working variables, one for every
    thread in the thread block, so the threads don't step on one another.
    That's why I made everything, including the loop indexes, reside in the working
    set struct. In the case where we don't use shared memory, y.cu, there is
    the hope that the compiler will forgive the pointer dereference and figure out
    the equivalent thing.
    Just to make sure, I made a z.cu which is the cleaner non-shared-memory
    version of y.cu.
    x.cu must keep the working storage in the array of structs so I can reference it
    by tid.

    [code]
    cat z.cu|greatawk 1 1000
    1 #include <stdio.h>
    2 #include <stdlib.h>
    3 #include <assert.h>
    4 #include <math.h>
    5 #include "book.h"
    6
    7 enum {M=3000};
    8 enum {N=1024};
    9
    10 __global__ void K(double (*D)[N])
    11 {
    12 int tid = threadIdx.x;
    13 double S = 0;
    14 __syncthreads();
    15 {
    16 int i;
    17 for ( i=0;i < M;i++)
    18 {
    19 double A= (double) i;
    20 int j;
    21 for ( j=0;j < M;j++)
    22 {
    23 int k2 = (tid +j) % N;
    24 double B = (double) k2;
    25 S+=max(102.0,max(10.0,A * B) - max(20.0,A/min(1.0,B+1.0)));
    26 }
    27 }
    28 }
    29 (*D)[tid] = S;
    30
    31 __syncthreads();
    32 {
    33 int s;
    34 for (
    35 s=N/2;
    36 s>0;
    37 s>>=1)
    38 {
    39 if ( tid < s)
    40 {
    41 (*D)[tid] += (*D)[tid+s];
    42 }
    43 __syncthreads();
    44 }
    45 }
    46 }
    47
    48 int main( void )
    49 {
    50 double (*D)[N] = 0;
    51 double H;
    52 size_t sz = sizeof( double [N]);
    53
    54 HANDLE_ERROR( cudaMalloc( (void**)&D , sz ));
    55
    56 K<<<1,N>>>(D);
    57 cudaThreadSynchronize(); /* sync up mapped mem with host */
    58 {
    59 cudaError_t e= cudaGetLastError();
    60 assert ( e == cudaSuccess);
    61 }
    62
    63 HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
    64 printf("%lf\n", H);
    65 HANDLE_ERROR( cudaFree(D));
    66 }
    [/code]

    1 time z
    2 7054814761443000.000000
    3
    4 real 0m22.48s
    5 user 0m22.38s
    6 sys 0m 0.07s


    The hand-crafted version z.cu, using automatic variables and M=3000, runs in about the
    same time as x.cu, which kept the working variables in shared memory.


    Conclusion: moving working-set memory (scalars, anyway) into shared memory and indexing it by
    thread id is not a way to make things run faster.

    ===============================================================
    ===============================================================
    ===============================================================


    NEXT QUESTION: Does using shared memory to hold an array accessed by thread id confer any advantage
    over using device memory to hold the array?
    ------------------------------------------------------
    The next experiment also uses shared memory, still hoping for a canonical speedup case.
    I wonder: if I had an array involved in my
    calculation and kept that array in shared memory, is there some advantage?

    wxd.cu uses shared memory in this way.
    wyd.cu does not.


    make wxd.exe wyd.exe
    `wxd.exe' is up to date
    `wyd.exe' is up to date

    1 time wxd
    2 outside K -503450582508.729740
    3
    4 real 0m55.86s
    5 user 0m55.73s
    6 sys 0m 0.09s

    1 time wyd
    2 outside K -503450582508.729740
    3
    4 real 0m55.55s
    5 user 0m55.47s
    6 sys 0m 0.06s

    Conclusion: There is no evidence that using shared memory in this way yields any benefit
    at all. I would like to see some small example along these lines to illustrate some
    advantage in using it, with a compute capability 2.0 device. (A sketch of the kind of
    access pattern that might show an advantage follows the wyd.cu listing below.)


    [code]
    cat wxd.cu|greatawk 1 1000
    1 #include <stdio.h>
    2 #include <stdlib.h>
    3 #include <assert.h>
    4 #include <math.h>
    5 #include "book.h"
    6
    7 enum {M=5000};
    8 enum {N=1024};
    9 __global__ void K(double (*D)[N]) /* D is not input too */
    10 {
    11 int tid = threadIdx.x;
    12 __shared__ double Z[N];
    13 double S = 0.0;
    ----------------------

    14 Z[tid] = (*D)[tid];
    use shared memory to access the array D

    ----------------------
    15 __syncthreads();
    16 {
    17 int i;
    18 for ( i=0;i < M;i++)
    19 {
    20 //int k1 = (i % N);
    21 double A= Z[tid];
    22 int j;
    23 for ( j=0;j < M;j++)
    24 {
    25 double B = Z[tid];
    26 double C=A * B - max(20.0,A/min(1.0,B+1.0));
    27 S+=C;
    28 //printf("tid=%2ld,[%ld][%ld] A=%lf B=%lf C=%lf S=%lf\n", tid, i,j,A,B,C,S);
    29 }
    30 }
    31 }
    32 (*D)[tid] = S;
    33 __syncthreads();
    34 {
    35 int s;
    36 for (
    37 s=N/2;
    38 s>0;
    39 s>>=1)
    40 {
    41 if ( tid < s)
    42 {
    43 (*D)[tid] += (*D)[tid+s];
    44 }
    45 __syncthreads();
    46 }
    47 }
    48 __syncthreads();
    49 }
    50
    51 int main( void )
    52 {
    53 double (*D)[N] = 0;
    54 double H[N];
    55 size_t sz = sizeof( double [N]);
    56
    57 HANDLE_ERROR( cudaMalloc( (void**)&D , sz ));
    58 {
    59 int i;
    60 double denominator = (double) RAND_MAX;
    61 for( i=0;i<N;i++)
    62 {
    63 double numerator = (double) rand();
    64 H[i] = numerator / denominator;
    65 assert( H[i] >= 0 && H[i] <=1.0);
    66 }
    67 HANDLE_ERROR( cudaMemcpy( D, H, sz,cudaMemcpyHostToDevice)) ;
    68 }
    69
    70 K<<<1,N>>>(D);
    71 cudaThreadSynchronize(); /* sync up mapped mem with host */
    72 {
    73 cudaError_t e= cudaGetLastError();
    74 assert ( e == cudaSuccess);
    75 }
    76
    77 HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
    78 printf("outside K %lf\n", H[0]);
    79 HANDLE_ERROR( cudaFree(D));
    80 }
    [/code]

    [code]
    cat wyd.cu|greatawk 1 1000
    1 #include <stdio.h>
    2 #include <stdlib.h>
    3 #include <assert.h>
    4 #include <math.h>
    5 #include "book.h"
    6
    7 enum {M=5000};
    8 enum {N=1024};
    9 __global__ void K(double (*D)[N]) /* D is not input too */
    10 {
    11 int tid = threadIdx.x;

    ----------------------
    use plain old device memory D to access the input array
    ----------------------

    12 double S = 0.0;
    13 __syncthreads();
    14 {
    15 int i;
    16 for ( i=0;i < M;i++)
    17 {
    18 //int k1 = ( i % N);
    19 double A= (*D)[tid];
    20 int j;
    21 for ( j=0;j < M;j++)
    22 {
    23 double B = (*D)[tid];
    24 double C=A * B - max(20.0,A/min(1.0,B+1.0));
    25 S+=C;
    26 //printf("tid=%2ld,[%ld][%ld] A=%lf B=%lf C=%lf S=%lf\n", tid, i,j,A,B,C,S);
    27 }
    28 }
    29 }
    30 (*D)[tid] = S;
    31 __syncthreads();
    32
    33 __syncthreads();
    34 {
    35 int s;
    36 for (
    37 s=N/2;
    38 s>0;
    39 s>>=1)
    40 {
    41 if ( tid < s)
    42 {
    43 (*D)[tid] += (*D)[tid+s];
    44 }
    45 __syncthreads();
    46 }
    47 }
    48 __syncthreads();
    49 }
    50
    51 int main( void )
    52 {
    53 double (*D)[N] = 0;
    54 double H[N];
    55 size_t sz = sizeof( double [N]);
    56
    57 HANDLE_ERROR( cudaMalloc( (void**)&D , sz ));
    58 {
    59 int i;
    60 double denominator = (double) RAND_MAX;
    61 for( i=0;i<N;i++)
    62 {
    63 double numerator = (double) rand();
    64 H[i] = numerator / denominator;
    65 assert( H[i] >= 0 && H[i] <=1.0);
    66 }
    67 HANDLE_ERROR( cudaMemcpy( D, H, sz,cudaMemcpyHostToDevice)) ;
    68 }
    69
    70 K<<<1,N>>>(D);
    71 cudaThreadSynchronize(); /* sync up mapped mem with host */
    72 {
    73 cudaError_t e= cudaGetLastError();
    74 assert ( e == cudaSuccess);
    75 }
    76
    77 HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double),cudaMemcpyDeviceToHost ) );
    78 printf("outside K %lf\n", H[0]);
    79 HANDLE_ERROR( cudaFree(D));
    80 }
    [/code]
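    One access pattern that might actually show a benefit from shared memory (a sketch only, untested): in wxd.cu every thread only ever re-reads its own element Z[tid], which the compiler can simply keep in a register, so the shared array is never really exercised. If instead every thread sweeps over all N elements, each element loaded once from global memory is reused N times by N different threads, and that inter-thread reuse is what shared memory (or, on a cc 2.x card, the cache) is meant to capture. The names M, N and D follow the listings above.

    [code]
    __global__ void K(double (*D)[N])
    {
        int tid = threadIdx.x;
        __shared__ double Z[N];
        double S = 0.0;

        Z[tid] = (*D)[tid];                    /* each thread loads one element ...            */
        __syncthreads();                       /* ... and then every thread reuses all of them */

        for (int i = 0; i < M; i++)
        {
            for (int j = 0; j < N; j++)
            {
                double B = Z[(tid + j) % N];   /* other threads' data, not just our own */
                S += Z[tid] * B - max(20.0, Z[tid] / min(1.0, B + 1.0));
            }
        }
        (*D)[tid] = S;
    }
    [/code]

    Whether this actually beats the plain global-memory version on a C2050 is exactly the question of this thread, so it would still have to be measured.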

    #3
    Posted 04/23/2012 05:53 PM   
    The Fermi architecture made the programmer's work easier in many ways. I was reading a book with examples written for older compute capabilities, and many of the examples shown had no speed difference on cc 2.0. Just a suggestion: try disabling the cache and see what happens.

    #4
    Posted 04/23/2012 07:40 PM   
    Thanks.
    Does disabling the cache have any potential to increase my overall performance using shared memory, or does it merely demand that I use shared memory to get back to the nominal level of performance I have with the cache enabled?
    If the answer is yes, that it could improve performance, how do I disable the cache? Is it done on a device basis for all applications that use that device (which I suspect), or can I do it for a particular kernel launch?
    thanks again.

    #5
    Posted 04/23/2012 09:03 PM   
    Disabling the cache does not increase performance (except in the very special case where only a few bytes are used out of each 128-byte cache line, so that transferring a whole cache line at a time is wasteful).

    The main advantage of shared memory over the cache is that you have full control over it. Because the cache is quite small compared to the large number of threads running on each SM, each thread only gets a handful of bytes of cache. And you might have a much better idea of which data is going to be reused than the hardware can deduce from the access pattern. But if the LRU algorithm in the hardware does a good job, there is little to be gained from using shared memory.
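    On the earlier question of how to disable the cache: two knobs I am aware of (a sketch only, please check the current documentation) are the per-kernel L1/shared-memory split, set from the host with cudaFuncSetCacheConfig(), and the compiler switch -Xptxas -dlcm=cg, which makes global loads bypass L1 for the whole compilation unit. Neither is a device-wide setting that affects other applications. Using the kernel K and the HANDLE_ERROR macro from the earlier posts:

    [code]
    /* Hint the runtime to favour shared memory (48 KB shared / 16 KB L1) for this
       kernel; cudaFuncCachePreferL1 gives the opposite split.  This is a per-kernel
       hint, not a device-wide setting, and the runtime is free to ignore it. */
    HANDLE_ERROR( cudaFuncSetCacheConfig( K, cudaFuncCachePreferShared ) );

    K<<<1,N>>>(D);

    /* To stop global loads from being cached in L1 at all, recompile with
           nvcc -Xptxas -dlcm=cg ...
       (loads then go through L2 only). */
    [/code]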

    Always check return codes of CUDA calls for errors. Do not use __syncthreads() in conditional code unless the condition is guaranteed to evaluate identically for all threads of each block. Run your program under cuda-memcheck to detect stray memory accesses. If your kernel dies for larger problem sizes, it might exceed the runtime limit and trigger the watchdog timer.

    #6
    Posted 04/24/2012 10:28 AM   
    I have also experienced this with an n-body type problem on a GTS450. The "naive" approach (each thread gets a particle and computes the interaction with all other particles, loading each from global memory) is no slower than the shared memory approach (each thread gets a particle, and loads a tile's worth of other particles into shared memory, loops through the tiles, and computes the interactions from there). At first I thought there must be a problem with my code, but what has been said above makes sense. Everything I read prior to my own experimentation made shared memory use sound unquestionably good...maybe this is a case of the technology moving too fast for the academic papers and documentation?
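    For reference, the tiling pattern described above looks roughly like this. It is a sketch with made-up names (pos[] is the particle data in global memory, NP the particle count, and the inner-loop body stands in for the real pairwise interaction), and it assumes 1-D data and a block size equal to TILE:

    [code]
    enum {TILE = 256};

    __global__ void forces(const double *pos, double *out, int NP)
    {
        __shared__ double tile[TILE];                      /* one tile of "other" particles */
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        double mine = (i < NP) ? pos[i] : 0.0;
        double sum  = 0.0;

        for (int base = 0; base < NP; base += TILE)
        {
            int j = base + threadIdx.x;
            tile[threadIdx.x] = (j < NP) ? pos[j] : 0.0;   /* cooperative load of one tile */
            __syncthreads();

            for (int k = 0; k < TILE; k++)                 /* every thread reuses the whole tile */
                sum += mine * tile[k];                     /* stand-in for the real interaction  */
            __syncthreads();
        }
        if (i < NP)
            out[i] = sum;
    }
    [/code]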

    #7
    Posted 05/16/2012 03:42 PM   
    Hello,

    In both cases it would be good to check what the profiler says. I suspect that the kernel is compute-bound and has too many instructions per byte of data. The use of shared memory would not improve the code until other optimisations are done.

    #8
    Posted 05/16/2012 04:15 PM   