Thanks to avidday and SPWorley for your opinions.
__asm("ld.shared.f32 %0, %1;" : "=f" (b_reg) : "m" (b_ptr+j) );
and
asm("ld.shared.f32 %0, %1;" : "=f" (b_reg) : "m" (b_ptr+0) );
c[0] += A_reg * b_reg;
asm("ld.shared.f32 %0, %1;" : "=f" (b_reg) : "m" (b_ptr+1) );
c[1] += A_reg * b_reg;
asm("ld.shared.f32 %0, %1;" : "=f" (b_reg) : "m" (b_ptr+2) );
c[2] += A_reg * b_reg;
...
asm("ld.shared.f32 %0, %1;" : "=f" (b_reg) : "m" (b_ptr+15) );
c[15] += A_reg * b_reg;
do not work when compiling to a binary file, because nvcc translates
asm("ld.shared.f32 %0, %1;" : "=f" (b_reg) : "m" (b_ptr+0) );
to
ld.shared.f32 %f7, b_ptr ;
but nvcc does not recognize the variable “b_ptr”; the error messages are
./volkov_v2.cu(183): Advisory: Loop was not unrolled, inline assembly
ptxas C:\DOCUME~1\lschien\LOCALS~1\Temp/tmpxft_00001c9c_00000000-4_volkov_v2.ptx
, line 149; error : Address expected for argument 1 of instruction 'ld'
ptxas C:\DOCUME~1\lschien\LOCALS~1\Temp/tmpxft_00001c9c_00000000-4_volkov_v2.ptx
, line 149; error : Unknown symbol 'b_ptr'
ptxas fatal : Ptx assembly aborted due to errors
if one uses
asm("ld.shared.f32 %0, %1;" : "=f" (b_reg) : "m" (b+0) ); // b is shared memory
, then nvcc translates it to
ld.shared.f32 %f7, asm.by.address.temp_0 ;
I think that something is wrong with the translation of shared-memory operands.
Finally, I designed a workaround:
float A_reg;
float *b64 = (float*)b;
A_reg = A[0]; A += lda;
asm("ld.shared.f32 %0, [__cuda_%1+0];" : "=f" (b_reg) : "m"(b64) );
c[0] += A_reg * b_reg;
asm("ld.shared.f32 %0, [__cuda_%1+4];" : "=f" (b_reg) : "m"(b64) );
c[1] += A_reg * b_reg;
asm("ld.shared.f32 %0, [__cuda_%1+8];" : "=f" (b_reg) : "m"(b64) );
c[2] += A_reg * b_reg;
....
c[12] += A_reg * b_reg;
asm("ld.shared.f32 %0, [__cuda_%1+1072];" : "=f" (b_reg) : "m"(b64) );
c[13] += A_reg * b_reg;
asm("ld.shared.f32 %0, [__cuda_%1+1076];" : "=f" (b_reg) : "m"(b64) );
c[14] += A_reg * b_reg;
asm("ld.shared.f32 %0, [__cuda_%1+1080];" : "=f" (b_reg) : "m"(b64) );
c[15] += A_reg * b_reg;
which is the explicitly unrolled form of
float A_reg;
float *b64 = (float*)b;
#pragma unroll
for( int i = 0; i < 16; i++ ){
A_reg = A[0] ; A += lda;
#pragma unroll
for( int j = 0; j < 16; j++){
int offset = 4*(j+17*i);
asm("ld.shared.f32 %0, [__cuda_%1];" : "=f" (b_reg) : "m" (b64 + offset ) );
c[j] += A_reg * b_reg;
}
The PTX code shows the correct translation:
[code]
ld.shared.f32 %f22, [__cuda_b64+0];
ld.shared.f32 %f24, [__cuda_b64+4];
…
ld.shared.f32 %f278, [__cuda_b64+1080];
[/code]
However, the binary code still uses “MAD dest, [smem], src2, src3” rather than “MAD dest, src1, src2, src3”
(one can verify this with decuda). Even if we use inline assembly to write the PTX code,
nvcc still performs its own optimization, so the result may not match your design.