Regarding OP’s posting, it is as BulatZiganshin said.
The getVals kernel is entirely optimized out (I am assuming we are building without debug switches)
For me, the getVals kernel profiles with a 0% hitrate, not 100%. The setVals kernel profiles with 57% read hit rate.
$ cat t59.cu
#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
#include <curand_kernel.h>
#define SIZE (2048*1024/sizeof(int))
__device__ int *nonce;
// called by host, executed by GPU
__global__ void init() {
nonce = (int *)malloc(SIZE*sizeof(int));
}
__global__ void setVals() {
curandState_t state;
/* we have to initialize the state */
curand_init(0, /* the seed controls the sequence of random values that are produced */
0, /* the sequence number is only important with multiple cores */
0, /* the offset is how much extra we advance in the sequence for each call, can be 0 */
&state);
for(int i=0;i<SIZE;i++){
int r = curand(&state) % SIZE;
//printf("%d ", r);
*(nonce+r) = i;
}
}
__global__ void getVals() {
int j;
for(int i=0;i<SIZE;i++){
j = *(nonce+i);
//printf("%d ", j);
}
}
int main(void) {
//printf("%d\n", sizeof(int));
init<<<1, 1>>>();
getVals<<<1, 1>>>();
setVals<<<1, 1>>>();
return 0;
}
$ nvcc -arch=sm_60 -o t59 t59.cu -lcurand
t59.cu(41): warning: variable "j" was set but never used
t59.cu(41): warning: variable "j" was set but never used
$ cuobjdump -sass ./t59
Fatbin elf code:
================
arch = sm_60
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_60
Fatbin elf code:
================
arch = sm_60
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_60
Function : _Z7getValsv
.headerflags @"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
/* 0x001fbc00fde007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ NOP; /* 0x50b0000000070f00 */
/*0018*/ NOP; /* 0x50b0000000070f00 */
/* 0x001ffc00ffe007ed */
/*0028*/ NOP; /* 0x50b0000000070f00 */
/*0030*/ EXIT; /* 0xe30000000007000f */
/*0038*/ BRA 0x38; /* 0xe2400fffff87000f */
......................
Function : _Z7setValsv
.headerflags @"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
/* 0x001fc400fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ MOV R4, RZ; /* 0x5c9807800ff70004 */
/*0018*/ MOV32I R12, 0x58213ed2; /* 0x01058213ed27f00c */
/* 0x001fc400fe2007f1 */
/*0028*/ MOV32I R11, 0x455f2458; /* 0x010455f24587f00b */
/*0030*/ MOV32I R10, 0xf8a42704; /* 0x010f8a427047f00a */
/*0038*/ MOV32I R9, 0xdcd8f87c; /* 0x010dcd8f87c7f009 */
/* 0x001f8400fec007e1 */
/*0048*/ MOV32I R8, 0x511db0d6; /* 0x010511db0d67f008 */
/*0050*/ MOV R0, RZ; /* 0x5c9807800ff70000 */
/*0058*/ MOV32I R2, 0x0; /* 0x010000000007f002 */
/* 0x001f98000e2007e2 */
/*0068*/ MOV32I R3, 0x0; /* 0x010000000007f003 */
/*0070*/ LDG.E.64 R6, [R2]; /* 0xeed5200000070206 */
/*0078*/ SHR.U32 R5, R12, 0x2; /* 0x3828000000270c05 */
/* 0x001f9400fe2007f6 */
/*0088*/ LOP.XOR R5, R5, R12; /* 0x5c47040000c70505 */
/*0090*/ SHL R14, R5, 0x1; /* 0x384800000017050e */
/*0098*/ SHL R12, R8, 0x4; /* 0x384800000047080c */
/* 0x001f9800fec007e6 */
/*00a8*/ LOP3.LUT R5, R8, R14, R5, 0x96; /* 0x5be7028960e70805 */
/*00b0*/ LOP.XOR R5, R5, R12; /* 0x5c47040000c70505 */
/*00b8*/ IADD3 R12, R4, c[0x2][0x0], R5; /* 0x4cc002880007040c */
/* 0x011f9840fe2007f6 */
/*00c8*/ LOP32I.AND R12, R12, 0x7ffff; /* 0x0400007ffff70c0c */
/*00d0*/ SHR.U32 R15, R12.reuse, 0x1e; /* 0x3828000001e70c0f */
/*00d8*/ ISCADD R14.CC, R12, R6, 0x2; /* 0x5c18810000670c0e */
/* 0x0005c4001e2007e2 */
/*00e8*/ IADD.X R15, R7, R15; /* 0x5c10080000f7070f */
/*00f0*/ STG.E [R14], R0; /* 0xeedc200000070e00 */
/*00f8*/ LDG.E.64 R12, [R2]; /* 0xeed520000007020c */
/* 0x001f9800fec007e6 */
/*0108*/ SHR.U32 R6, R11, 0x2; /* 0x3828000000270b06 */
/*0110*/ LOP.XOR R6, R6, R11; /* 0x5c47040000b70606 */
/*0118*/ SHL R7, R6, 0x1; /* 0x3848000000170607 */
/* 0x001fd800fcc207f1 */
/*0128*/ LOP3.LUT R7, R5.reuse, R7, R6, 0x96; /* 0x5be7030960770507 */
/*0130*/ SHL R6, R5, 0x4; /* 0x3848000000470506 */
/*0138*/ LOP.XOR R6, R7, R6; /* 0x5c47040000670706 */
/* 0x001f8400ffa007f6 */
/*0148*/ IADD3 R7, R4, c[0x2][0x4], R6; /* 0x4cc0030800170407 */
/*0150*/ SHL R7, R7, 0x2; /* 0x3848000000270707 */
/*0158*/ LOP32I.AND R7, R7, 0x1ffffc; /* 0x040001ffffc70707 */
/* 0x001f8809fcc007e5 */
/*0168*/ IADD32I R11, R0, 0x1; /* 0x1c0000000017000b */
/*0170*/ IADD R14.CC, R12, R7; /* 0x5c10800000770c0e */
/*0178*/ IADD.X R15, RZ, R13; /* 0x5c10080000d7ff0f */
/* 0x001f98002e2000f1 */
/*0188*/ STG.E [R14], R11; /* 0xeedc200000070e0b */
/*0190*/ LDG.E.64 R12, [R2]; /* 0xeed520000007020c */
/*0198*/ SHR.U32 R7, R10, 0x2; /* 0x3828000000270a07 */
/* 0x081fc400fcc007f6 */
/*01a8*/ LOP.XOR R7, R7, R10; /* 0x5c47040000a70707 */
/*01b0*/ SHL R16, R7, 0x1; /* 0x3848000000170710 */
/*01b8*/ LOP3.LUT R16, R6.reuse, R16, R7, 0x96; /* 0x5be7038961070610 */
/* 0x001fd800fec007e6 */
/*01c8*/ SHL R7, R6, 0x4; /* 0x3848000000470607 */
/*01d0*/ LOP.XOR R7, R16, R7; /* 0x5c47040000771007 */
/*01d8*/ IADD3 R10, R4, c[0x2][0x8], R7; /* 0x4cc003880027040a */
/* 0x003f9400fc2007fd */
/*01e8*/ SHL R10, R10, 0x2; /* 0x3848000000270a0a */
/*01f0*/ LOP32I.AND R10, R10, 0x1ffffc; /* 0x040001ffffc70a0a */
/*01f8*/ IADD32I R14, R0, 0x2; /* 0x1c0000000027000e */
/* 0x0003c400fc4047e6 */
/*0208*/ IADD R12.CC, R12, R10; /* 0x5c10800000a70c0c */
/*0210*/ IADD.X R13, RZ, R13; /* 0x5c10080000d7ff0d */
/*0218*/ STG.E [R12], R14; /* 0xeedc200000070c0e */
/* 0x001fd800fcc00171 */
/*0228*/ LDG.E.64 R10, [R2]; /* 0xeed520000007020a */
/*0230*/ SHR.U32 R15, R9, 0x2; /* 0x382800000027090f */
/*0238*/ LOP.XOR R9, R15, R9; /* 0x5c47040000970f09 */
/* 0x001f9840fe2007e6 */
/*0248*/ SHL R16, R9, 0x1; /* 0x3848000000170910 */
/*0250*/ LOP3.LUT R16, R7.reuse, R16, R9, 0x96; /* 0x5be7048961070710 */
/*0258*/ SHL R9, R7, 0x4; /* 0x3848000000470709 */
/* 0x001ff400fec007f6 */
/*0268*/ LOP.XOR R9, R16, R9; /* 0x5c47040000971009 */
/*0270*/ IADD3 R15, R4, c[0x2][0xc], R9; /* 0x4cc004880037040f */
/*0278*/ SHL R15, R15, 0x2; /* 0x3848000000270f0f */
/* 0x011f9801fca007e1 */
/*0288*/ LOP32I.AND R15, R15, 0x1ffffc; /* 0x040001ffffc70f0f */
/*0290*/ IADD32I R12, R0, 0x3; /* 0x1c0000000037000c */
/*0298*/ IADD R14.CC, R10, R15; /* 0x5c10800000f70a0e */
/* 0x0005c4001e2007e2 */
/*02a8*/ IADD.X R15, RZ, R11; /* 0x5c10080000b7ff0f */
/*02b0*/ STG.E [R14], R12; /* 0xeedc200000070e0c */
/*02b8*/ LDG.E.64 R10, [R2]; /* 0xeed520000007020a */
/* 0x001fc400fec007e6 */
/*02c8*/ SHR.U32 R13, R8, 0x2; /* 0x382800000027080d */
/*02d0*/ LOP.XOR R8, R13, R8; /* 0x5c47040000870d08 */
/*02d8*/ SHL R18, R8, 0x1; /* 0x3848000000170812 */
/* 0x001fd800fcc007e5 */
/*02e8*/ SHL R13, R9, 0x4; /* 0x384800000047090d */
/*02f0*/ LOP3.LUT R8, R9, R18, R8, 0x96; /* 0x5be7040961270908 */
/*02f8*/ LOP.XOR R13, R8, R13; /* 0x5c47040000d7080d */
/* 0x001f9800ffa007f6 */
/*0308*/ IADD3 R8, R4, c[0x2][0x10], R13; /* 0x4cc0068800470408 */
/*0310*/ SHL R8, R8, 0x2; /* 0x3848000000270808 */
/*0318*/ LOP32I.AND R8, R8, 0x1ffffc; /* 0x040001ffffc70808 */
/* 0x001f8800fca04fe1 */
/*0328*/ IADD R14.CC, R10, R8; /* 0x5c10800000870a0e */
/*0330*/ IADD32I R8, R0, 0x4; /* 0x1c00000000470008 */
/*0338*/ IADD.X R15, RZ, R11; /* 0x5c10080000b7ff0f */
/* 0x001f98002e2000f1 */
/*0348*/ STG.E [R14], R8; /* 0xeedc200000070e08 */
/*0350*/ LDG.E.64 R10, [R2]; /* 0xeed520000007020a */
/*0358*/ SHR.U32 R12, R5, 0x2; /* 0x382800000027050c */
/* 0x001f9400fe2007f6 */
/*0368*/ LOP.XOR R12, R12, R5; /* 0x5c47040000570c0c */
/*0370*/ SHL R18, R12, 0x1; /* 0x3848000000170c12 */
/*0378*/ SHL R5, R13, 0x4; /* 0x3848000000470d05 */
/* 0x001ff400fec007e6 */
/*0388*/ LOP3.LUT R12, R13, R18, R12, 0x96; /* 0x5be7060961270d0c */
/*0390*/ LOP.XOR R5, R12, R5; /* 0x5c47040000570c05 */
/*0398*/ IADD3 R12, R4, c[0x2][0x14], R5; /* 0x4cc002880057040c */
/* 0x003f9400fc2007e6 */
/*03a8*/ SHL R12, R12, 0x2; /* 0x3848000000270c0c */
/*03b0*/ LOP32I.AND R12, R12, 0x1ffffc; /* 0x040001ffffc70c0c */
/*03b8*/ IADD32I R8, R0, 0x5; /* 0x1c00000000570008 */
/* 0x0003c400fc4047e6 */
/*03c8*/ IADD R14.CC, R10, R12; /* 0x5c10800000c70a0e */
/*03d0*/ IADD.X R15, RZ, R11; /* 0x5c10080000b7ff0f */
/*03d8*/ STG.E [R14], R8; /* 0xeedc200000070e08 */
/* 0x001fd800fcc00171 */
/*03e8*/ LDG.E.64 R10, [R2]; /* 0xeed520000007020a */
/*03f0*/ SHR.U32 R12, R6, 0x2; /* 0x382800000027060c */
/*03f8*/ LOP.XOR R6, R12, R6; /* 0x5c47040000670c06 */
/* 0x001f9840fe2007e6 */
/*0408*/ SHL R12, R6, 0x1; /* 0x384800000017060c */
/*0410*/ LOP3.LUT R12, R5.reuse, R12, R6, 0x96; /* 0x5be7030960c7050c */
/*0418*/ SHL R6, R5, 0x4; /* 0x3848000000470506 */
/* 0x001ff400fec007f6 */
/*0428*/ LOP.XOR R6, R12, R6; /* 0x5c47040000670c06 */
/*0430*/ IADD3 R12, R4, c[0x2][0x18], R6; /* 0x4cc003080067040c */
/*0438*/ SHL R12, R12, 0x2; /* 0x3848000000270c0c */
/* 0x011f9801fca007e1 */
/*0448*/ LOP32I.AND R12, R12, 0x1ffffc; /* 0x040001ffffc70c0c */
/*0450*/ IADD32I R8, R0, 0x6; /* 0x1c00000000670008 */
/*0458*/ IADD R10.CC, R10, R12; /* 0x5c10800000c70a0a */
/* 0x001dc4001e2007e2 */
/*0468*/ IADD.X R11, RZ, R11; /* 0x5c10080000b7ff0b */
/*0470*/ STG.E [R10], R8; /* 0xeedc200000070a08 */
/*0478*/ LDG.E.64 R2, [R2]; /* 0xeed5200000070202 */
/* 0x001f9800fec007e6 */
/*0488*/ SHR.U32 R12, R7, 0x2; /* 0x382800000027070c */
/*0490*/ LOP.XOR R7, R12, R7; /* 0x5c47040000770c07 */
/*0498*/ SHL R12, R7, 0x1; /* 0x384800000017070c */
/* 0x001fd800fcc207f1 */
/*04a8*/ LOP3.LUT R12, R6.reuse, R12, R7, 0x96; /* 0x5be7038960c7060c */
/*04b0*/ SHL R7, R6, 0x4; /* 0x3848000000470607 */
/*04b8*/ LOP.XOR R7, R12, R7; /* 0x5c47040000770c07 */
/* 0x001fc401fea007e1 */
/*04c8*/ IADD3 R12, R4, c[0x2][0x1c], R7; /* 0x4cc003880077040c */
/*04d0*/ IADD32I R8, R0, 0x8; /* 0x1c00000000870008 */
/*04d8*/ SHL R12, R12, 0x2; /* 0x3848000000270c0c */
/* 0x001f8400fc2007e5 */
/*04e8*/ ISETP.NE.AND P0, PT, R8, c[0x2][0x20], PT; /* 0x4b6b038800870807 */
/*04f0*/ LOP32I.AND R12, R12, 0x1ffffc; /* 0x040001ffffc70c0c */
/*04f8*/ IADD32I R15, R0, 0x7; /* 0x1c0000000077000f */
/* 0x001f8400fc2007e1 */
/*0508*/ MOV R0, R8; /* 0x5c98078000870000 */
/*0510*/ MOV R11, R13; /* 0x5c98078000d7000b */
/*0518*/ MOV R10, R5; /* 0x5c9807800057000a */
/* 0x011f8400fc4007e1 */
/*0528*/ MOV R8, R7; /* 0x5c98078000770008 */
/*0530*/ IADD32I R4, R4, 0x2c3e28; /* 0x1c0002c3e2870404 */
/*0538*/ IADD R2.CC, R2, R12; /* 0x5c10800000c70202 */
/* 0x001f9c00fc8007e1 */
/*0548*/ MOV R12, R9; /* 0x5c9807800097000c */
/*0550*/ MOV R9, R6; /* 0x5c98078000670009 */
/*0558*/ IADD.X R3, RZ, R3; /* 0x5c1008000037ff03 */
/* 0x001ffc01ffa000fd */
/*0568*/ STG.E [R2], R15; /* 0xeedc20000007020f */
/*0570*/ @P0 BRA 0x58; /* 0xe2400fffae00000f */
/*0578*/ EXIT; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0588*/ BRA 0x580; /* 0xe2400fffff07000f */
/*0590*/ NOP; /* 0x50b0000000070f00 */
/*0598*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*05a8*/ NOP; /* 0x50b0000000070f00 */
/*05b0*/ NOP; /* 0x50b0000000070f00 */
/*05b8*/ NOP; /* 0x50b0000000070f00 */
......................
Function : _Z4initv
.headerflags @"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
/* 0x001fd800fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ MOV32I R4, 0x200000; /* 0x010002000007f004 */
/*0018*/ MOV R5, RZ; /* 0x5c9807800ff70005 */
/* 0x001fc800fe2007fd */
/*0028*/ JCAL 0x0; /* 0xe220000000000040 */
/*0030*/ MOV32I R2, 0x0; /* 0x010000000007f002 */
/*0038*/ MOV32I R3, 0x0; /* 0x010000000007f003 */
/* 0x001fac00fde000f1 */
/*0048*/ STG.E.64 [R2], R4; /* 0xeedd200000070204 */
/*0050*/ NOP; /* 0x50b0000000070f00 */
/*0058*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000ffe007ff */
/*0068*/ EXIT; /* 0xe30000000007000f */
/*0070*/ BRA 0x70; /* 0xe2400fffff87000f */
/*0078*/ NOP; /* 0x50b0000000070f00 */
...................
Fatbin ptx code:
================
arch = sm_60
code version = [6,1]
producer = cuda
host = linux
compile_size = 64bit
compressed
$ nvprof -m l2_tex_read_hit_rate,l2_read_transactions,l2_write_transactions,tex_cache_transactions,global_hit_rate ./t59
==24000== NVPROF is profiling process 24000, command: ./t59
==24000== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
Replaying kernel "init(void)" (done)
Replaying kernel "getVals(void)" (done)
Replaying kernel "setVals(void)" (done)
==24000== Profiling application: ./t59
==24000== Profiling result:ector_queries
==24000== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla P100-PCIE-16GB (0)"
Kernel: getVals(void)
1 l2_tex_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
1 l2_read_transactions L2 Read Transactions 8 8 8
1 l2_write_transactions L2 Write Transactions 13 13 13
1 tex_cache_transactions Unified Cache Transactions 0 0 0
1 global_hit_rate Global Hit Rate in unified l1/tex 0.00% 0.00% 0.00%
Kernel: init(void)
1 l2_tex_read_hit_rate L2 Hit Rate (Texture Reads) 57.63% 57.63% 57.63%
1 l2_read_transactions L2 Read Transactions 297 297 297
1 l2_write_transactions L2 Write Transactions 72 72 72
1 tex_cache_transactions Unified Cache Transactions 63 63 63
1 global_hit_rate Global Hit Rate in unified l1/tex 0.00% 0.00% 0.00%
Kernel: setVals(void)
1 l2_tex_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
1 l2_read_transactions L2 Read Transactions 1417 1417 1417
1 l2_write_transactions L2 Write Transactions 584825 584825 584825
1 tex_cache_transactions Unified Cache Transactions 1048576 1048576 1048576
1 global_hit_rate Global Hit Rate in unified l1/tex 100.00% 100.00% 100.00%
$
CUDA 9.1, CentOS 7.4, Tesla P100PCIE
A small change to getVals like so:
__global__ void getVals() {
int j = 1;
for(int i=0;i<SIZE;i++){
j += *(nonce+i);
}
if (j==0) printf("%d \n", j);
}
avoids the unwanted compiler optimization
profiling that case produces:
$ nvprof -m l2_tex_read_hit_rate,l2_read_transactions,l2_write_transactions,tex_cache_transactions,global_hit_rate ./t59
==24549== NVPROF is profiling process 24549, command: ./t59
==24549== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
Replaying kernel "init(void)" (done)
Replaying kernel "getVals(void)" (done)
Replaying kernel "setVals(void)" (done)
==24549== Profiling application: ./t59
==24549== Profiling result:it_sectors
==24549== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla P100-PCIE-16GB (0)"
Kernel: getVals(void)
1 l2_tex_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
1 l2_read_transactions L2 Read Transactions 65953 65953 65953
1 l2_write_transactions L2 Write Transactions 13 13 13
1 tex_cache_transactions Unified Cache Transactions 524290 524290 524290
1 global_hit_rate Global Hit Rate in unified l1/tex 87.50% 87.50% 87.50%
Kernel: init(void)
1 l2_tex_read_hit_rate L2 Hit Rate (Texture Reads) 57.63% 57.63% 57.63%
1 l2_read_transactions L2 Read Transactions 305 305 305
1 l2_write_transactions L2 Write Transactions 72 72 72
1 tex_cache_transactions Unified Cache Transactions 63 63 63
1 global_hit_rate Global Hit Rate in unified l1/tex 0.00% 0.00% 0.00%
Kernel: setVals(void)
1 l2_tex_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
1 l2_read_transactions L2 Read Transactions 1337 1337 1337
1 l2_write_transactions L2 Write Transactions 584825 584825 584825
1 tex_cache_transactions Unified Cache Transactions 1048576 1048576 1048576
1 global_hit_rate Global Hit Rate in unified l1/tex 100.00% 100.00% 100.00%
$
the numbers now look fairly sensible to me, although I haven’t studied it closely.