Thank you for your help — the problem has been detected and solved.
The reason is, as njuffa suggested: when I iterate over the data, the solution converges to a steady state.
The problem may be simply avoided by starting computations from the previously computed data.
When I simply use the data produced by kernels without divisions as an input data to kernels with divisions, the performance stays almost constant.
Unfortunately, saving and restoring the data was time-consuming and was removed at some point, which caused the behaviour reported in the first post :(
Once again, sorry for the confusion.
Some details:
The division procedure works slower when a large number of the divisions divide 0.0 by a value close to 1.0 (previously I drew the wrong conclusions), because at the beginning of the division procedure nvvp reports 6% inactive threads (lines 2-28). When the majority of the divisions have numerators different from 0.0, with divisors still close to 1.0 (usually between 0.999995 and 1.00005), no inactive threads are reported and the kernel works faster.
//---------------------------------------------------------------------------
// __cuda_sm20_div_f64_slowpath_v2 -- NVIDIA SASS disassembly of the
// compiler's double-precision division slow path (sm_20-era naming).
// NOTE(review): this is disassembler output, not hand-written assembly.
// Register roles are inferred from the instruction stream and should be
// confirmed against the full kernel dump:
//   R42:R43 = divisor b   (doubles live in 32-bit register pairs)
//   R44:R45 = dividend a  (residual DFMAs below compute b*q - a)
//   R56:R57 = quotient candidate; copied to R42:R43 at .L_135 before RET
// PBK `(.L_135) arms a "break" target: every BRK below exits to the
// common epilogue.  Constant-bank values c[0x2][...] look like exponent
// thresholds and scale factors for over/underflow handling -- TODO confirm.
//---------------------------------------------------------------------------
__cuda_sm20_div_f64_slowpath_v2:
{ LOP32I.AND R47, R43, 0x40000000;          // isolate bit 30 of divisor high word (exponent MSB region)
PBK `(.L_135); }                            // arm break target: BRK anywhere below jumps to .L_135
ISETP.LT.U32.AND P0, PT, R47, c[0x2][0x38], PT;   // classify divisor exponent against a threshold -- confirm
MOV32I R66, 0x1ff00000;                     // high word of a power-of-two scale constant (exponent field 0x1ff)
MOV R56, RZ;                                // scale factor low word = 0
SEL R57, R66, c[0x2][0x3c], !P0;            // choose scale-factor high word by exponent class
DMUL R64, R42, R56;                         // R64:R65 = b * scale (rescaled divisor)
{ LOP32I.AND R47, R44, 0x7f800000;          // mask an exponent-like field of R44 (note: 0x7f800000 is the f32 exponent mask)
MUFU.RCP64H R58, R65; }                     // seed reciprocal approximation from high word of scaled divisor
ISETP.LT.U32.AND P0, PT, R47, c[0x2][0x40], PT;   // classify dividend magnitude against a threshold -- confirm
LOP.XOR R45, R45, R44;                      // XOR-swap step 1 of 3: swap R44 and R45 in place
MOV R59, R58;
MOV32I R58, 0x1;
LOP.XOR R44, R45, R44;                      // XOR-swap step 2 of 3
DFMA R60, R64, -R58, c[0x2][0x0];           // Newton-Raphson error: e = C - b_scaled*r (C presumably 1.0 -- confirm)
SEL R62, R66, c[0x2][0x3c], !P0;            // dividend scale-factor high word
LOP.XOR R45, R45, R44;                      // XOR-swap step 3 of 3
MOV R63, R62;
DFMA R60, R60, R60, R60;                    // e = e*e + e (fuses two refinement terms)
MOV R62, RZ;                                // dividend scale-factor low word = 0
DMUL R62, R44, R62;                         // R62:R63 = a * scale (rescaled dividend)
DFMA R58, R58, R60, R58;                    // r = r + r*e (refined reciprocal)
DMUL R60, R62, R58;                         // q = a_scaled * r (first quotient estimate)
DFMA R62, R64, -R60, R62;                   // residual = a_scaled - b_scaled*q
DFMA R58, R58, R62, R60;                    // q = q + r*residual (refined quotient)
DSETP.LEU.AND P0, PT, |R58|, RZ, PT;        // P0 = (|q| <= 0) or unordered, i.e. q is 0 or NaN
@P0 BRA `(.L_136);                          // take the special-case path for zero/NaN quotient
ISETP.GT.U32.AND P0, PT, R47, c[0x2][0x44], PT;   // further magnitude classification -- confirm
DMUL R62, R56, R58;                         // candidate A: q scaled back by one factor
SEL R60, R66, c[0x2][0x3c], !P0;
MOV R61, R60;
MOV R60, RZ;
DMUL R58, R58, R60;                         // candidate B: q scaled by the alternative factor
DMUL R60, R60, R62;
DMUL R62, R56, R58;
DFMA R58, R42.reuse, R60, -R44.reuse;       // residual of candidate A: b*qA - a
DFMA R56, R42, R62, -R44;                   // residual of candidate B: b*qB - a
DSETP.GT.AND P0, PT, |R58|, |R56|, PT;      // keep the candidate with the smaller residual
SEL R47, R63, R61, P0;
FSETP.GTU.AND P1, PT, |R47|, 1.469367938527859385e-39, PT;  // underflow test (threshold ~2^-129) -- subnormal fixup below if it fails
SEL R56, R62, R60, P0;
{ MOV R57, R47;
@P1 BRK; }                                  // result not subnormal: break out to .L_135 with q in R56:R57
// --- subnormal / near-underflow fixup: evaluate last-bit neighbours of q
// --- and keep the one with the smallest residual (rounding correction)
FSETP.LT.AND P0, PT, |R45|, 1.5046327690525280102e-36, PT;
MOV32I R58, 0x3ff00000;                     // high word of 1.0
LOP32I.AND R62, R56, 0xfffffffe;            // q with low bit cleared (even neighbour)
SEL R58, R58, c[0x2][0x48], !P0;            // rescale factor: 1.0 or constant-bank alternative
MOV R59, R58;
MOV R58, RZ;
LOP32I.OR R60, R56, 0x1;                    // q with low bit set (odd neighbour)
MOV R56, R62;
MOV R57, R47.reuse;
DMUL R42, R42, R58.reuse;                   // rescale divisor
DMUL R44, R44, R58;                         // rescale dividend
MOV R61, R47;
DFMA R58, R56, R42.reuse, -R44.reuse;       // residual of even-neighbour quotient
DFMA R56, R60, R42, -R44;                   // residual of odd-neighbour quotient
DSETP.GT.AND P0, PT, |R58|, |R56|, PT;
SEL R58, R60, R62, P0;                      // keep neighbour with the smaller residual
LOP32I.AND R56, R58, 0x1;                   // parity of the chosen mantissa word
IADD32I R61.CC, R58, 0x1;                   // q+1 (64-bit increment via carry chain)
ISETP.EQ.U32.AND P0, PT, R56, 0x1, PT;      // P0 = low bit of chosen q is 1
IADD.X R60, RZ, R47;                        // propagate carry into high word
IADD32I R56.CC, R58, -0x1;                  // q-1 (64-bit decrement via carry chain)
IADD32I.X R57, R47.reuse, -0x1;
SEL R61, R58, R61, !P0;                     // pick q or q+1 depending on parity
SEL R60, R47, R60, !P0;
SEL R47, R57, R47, !P0;                     // pick q or q-1 depending on parity
SEL R58, R56, R58, !P0;
MOV R56, R61;
MOV R57, R60;
MOV R59, R47;
DFMA R56, R42.reuse, R56, -R44.reuse;       // residual of first final candidate
DFMA R42, R42, R58, -R44;                   // residual of second final candidate
DSETP.GT.AND P0, PT, |R56|, |R42|, PT;
SEL R56, R58, R61, P0;                      // final rounding selection, low word
{ SEL R57, R47, R60, P0;                    // final rounding selection, high word
BRK; }                                      // exit to .L_135 with result in R56:R57
// q came out 0 or NaN: handle zero dividend / inf / NaN operands explicitly
.L_136:
DSETP.EQ.AND P0, PT, R58, RZ, PT;           // quotient exactly zero?
@P0 BRA `(.L_137);
{ MOV R56, RZ;
MUFU.RCP64H R57, R43; }                     // reciprocal of the raw divisor high word (inf/zero-divisor cases)
DSETP.GT.AND P1, PT, |R56|, RZ, PT;         // is the reciprocal pair nonzero? -- confirm intent
@!P1 DSETP.NEU.AND P0, PT, |R42|, +INF , PT;  // divisor finite? (NaN-aware compare)
@!P1 SEL R42, R42, R56, P0;                 // propagate NaN / select reciprocal as appropriate
@!P1 SEL R47, R43, R57, P0;
@!P1 MOV R56, R42;
@!P1 MOV R57, R47;
DMUL R56, R44, R56;                         // result = a * chosen reciprocal (signs/specials fall out)
BRK;                                        // exit to .L_135
.L_137:
DMUL R56, R44, R42;                         // zero-quotient case: result = a * b-derived value
BRK;                                        // exit to .L_135
// common epilogue (PBK target): move result into the R42:R43 return pair
.L_135:
MOV R42, R56;
{ MOV R43, R57;
RET; }
.L_138:
BRA `(.L_138);                              // self-loop after RET: unreachable padding / trap
NOP;
NOP;
NOP;
I have not analysed this code in detail; I have only observed that it is different from the code generated by CUDA 7.5 for a Kepler device.