I made up an example to see whether shared memory is really faster.
The two programs are x.cu and y.cu.
They both return the same answer, as they should.
x.cu uses shared memory, so it must declare an array large enough to hold
the working set of every thread in the block.
y.cu just uses the per-thread local memory you get with the stack frame, so
each thread gets its own copy of the working set that way.
Both reduce the data set computed per thread id into item 0.
The timing results are essentially the same, and the answers match.
1 time x
2 2089967186485776.000000
3
4 real. 0m10.53s
5 user. 0m10.42s
6 sys. 0m 0.09s
7
8 time y
9 2089967186485776.000000
10
11 real. 0m 9.75s
12 user. 0m 9.68s
13 sys. 0m 0.06s
Here is the source for both
cat x.cu|greatawk 1 10000
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 #include <math.h>
5 #include "book.h"
6
enum { M = 2000 };   // trip count of each nested loop: M*M iterations per thread
enum { N = 1024 };   // threads per block == number of partial sums reduced

// Per-thread scratch state for kernel K: loop counters, derived indices,
// two working operands, and the running sum that the block later reduces.
struct workdef
{
    int i, j, k1, k2;   // i/j: loop counters; k1/k2: modular indices (k1 is never read)
    double A;           // outer-loop operand
    double B;           // inner-loop operand
    double S;           // running per-thread sum
};
16
// Kernel K (shared-memory variant): each thread accumulates a sum S over an
// M x M loop nest into its own slot of a __shared__ working-set array, then
// the block tree-reduces all N per-thread sums into (*D)[0].
//
// Expected launch: <<<1, N>>> — one block of exactly N threads; the reduction
// assumes every element of *D was written by a thread of this same block.
//
// NOTE(review): although Z[] is __shared__, each thread only ever touches its
// own slot Z[tid]; no data is exchanged between threads through it. Shared
// memory pays off only when threads REUSE EACH OTHER'S data (e.g. matrix
// tiling); purely private per-thread state is served equally well by
// registers/local memory — which is why this times the same as y.cu.
__global__ void K(double (*D)[N])
{
    int tid = threadIdx.x;
    __shared__ struct workdef Z[N];   // one private slot per thread
    struct workdef *W = &Z[tid];

    W->S = tid;                       // seed so every thread's sum differs
    W->A = 0.0;
    W->B = 0.0;
    __syncthreads();

    for (W->i = 0; W->i < M; (W->i)++)
    {
        W->k1 = (tid + W->i) % N;     // NOTE(review): computed but never used
        W->A = (double) W->i;
        for (W->j = 0; W->j < M; W->j++)
        {
            W->k2 = (tid + W->j) % N;
            W->B = (double) W->k2;
            W->S += max(102.0, max(10.0, W->A * W->B) - max(20.0, W->A / min(1.0, W->B + 1.0)));
        }
    }

    (*D)[tid] = W->S;                 // publish this thread's partial sum
    __syncthreads();                  // all partials visible before reducing

    // Tree reduction in global memory: halve the active range each pass.
    // The barrier stays OUTSIDE the divergent branch, as required.
    {
        int s;
        for (s = N / 2; s > 0; s >>= 1)
        {
            if (tid < s)
            {
                (*D)[tid] += (*D)[tid + s];
            }
            __syncthreads();
        }
    }
}
59
// Host driver: allocate the device result array, launch K with one block of
// N threads, and print the reduced total found in element 0.
int main( void )
{
    // D must be a POINTER to an array of N doubles; the original
    // "double (D)[N] = 0;" declared the array itself, which cannot hold the
    // device address that cudaMalloc writes through &D.
    double (*D)[N] = NULL;
    double H;
    size_t sz = sizeof( double [N] );

    HANDLE_ERROR( cudaMalloc( (void**)&D, sz ) );

    K<<<1, N>>>(D);
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // supported replacement. It also surfaces async kernel-execution errors.
    cudaDeviceSynchronize();
    {
        cudaError_t e = cudaGetLastError();
        assert( e == cudaSuccess );
    }

    // Only element 0 (the reduced sum) is needed on the host.
    HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double), cudaMemcpyDeviceToHost ) );
    printf("%lf\n", H);
    HANDLE_ERROR( cudaFree(D) );
    return 0;
}
cat y.cu|greatawk 1 10000
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 #include <math.h>
5 #include "book.h"
6
enum { M = 2000 };   // trip count of each nested loop: M*M iterations per thread
enum { N = 1024 };   // threads per block == number of partial sums reduced

// Per-thread scratch state for kernel K: loop counters, derived indices,
// two working operands, and the running sum that the block later reduces.
struct workdef
{
    int i, j, k1, k2;   // i/j: loop counters; k1/k2: modular indices (k1 is never read)
    double A;           // outer-loop operand
    double B;           // inner-loop operand
    double S;           // running per-thread sum
};
16
// Kernel K (local-memory variant): identical math to x.cu's kernel, but the
// working set Z lives in per-thread local storage (registers / local memory)
// instead of __shared__ memory. Each thread accumulates its sum S, then the
// block tree-reduces all N per-thread sums into (*D)[0].
//
// Expected launch: <<<1, N>>> — one block of exactly N threads.
__global__ void K(double (*D)[N])
{
    int tid = threadIdx.x;
    struct workdef Z[1];          // private per-thread working set
    struct workdef *W = &Z[0];

    W->S = tid;                   // seed so every thread's sum differs
    // The original looped i over [0, N) re-setting A and B each pass; the
    // loop body never depended on i, so a single initialization is identical.
    W->A = 0.0;
    W->B = 0.0;
    __syncthreads();

    for (W->i = 0; W->i < M; (W->i)++)
    {
        W->k1 = (tid + W->i) % N;     // NOTE(review): computed but never used
        W->A = (double) W->i;
        for (W->j = 0; W->j < M; W->j++)
        {
            W->k2 = (tid + W->j) % N;
            W->B = (double) W->k2;
            W->S += max(102.0, max(10.0, W->A * W->B) - max(20.0, W->A / min(1.0, W->B + 1.0)));
        }
    }

    (*D)[tid] = W->S;             // publish this thread's partial sum
    __syncthreads();              // all partials visible before reducing

    // Tree reduction in global memory: halve the active range each pass.
    // The barrier stays OUTSIDE the divergent branch, as required.
    {
        int s;
        for (s = N / 2; s > 0; s >>= 1)
        {
            if (tid < s)
            {
                (*D)[tid] += (*D)[tid + s];
            }
            __syncthreads();
        }
    }
}
64
// Host driver: allocate the device result array, launch K with one block of
// N threads, and print the reduced total found in element 0.
int main( void )
{
    // D must be a POINTER to an array of N doubles; the original
    // "double (D)[N] = 0;" declared the array itself, which cannot hold the
    // device address that cudaMalloc writes through &D.
    double (*D)[N] = NULL;
    double H;
    size_t sz = sizeof( double [N] );

    HANDLE_ERROR( cudaMalloc( (void**)&D, sz ) );

    K<<<1, N>>>(D);
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // supported replacement. It also surfaces async kernel-execution errors.
    cudaDeviceSynchronize();
    {
        cudaError_t e = cudaGetLastError();
        assert( e == cudaSuccess );
    }

    // Only element 0 (the reduced sum) is needed on the host.
    HANDLE_ERROR( cudaMemcpy( &H, D, sizeof(double), cudaMemcpyDeviceToHost ) );
    printf("%lf\n", H);
    HANDLE_ERROR( cudaFree(D) );
    return 0;
}
Can you explain what I have to do to get the big time difference that shared memory is supposed to
yield?