www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.bugs - [Issue 3751] New: Optimalization error in some floating point code

reply d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751

           Summary: Optimalization error in some floating point code
           Product: D
           Version: 2.039
          Platform: x86
        OS/Version: Linux
            Status: NEW
          Severity: major
          Priority: P2
         Component: DMD
        AssignedTo: nobody puremagic.com
        ReportedBy: baryluk smp.if.uj.edu.pl


--- Comment #0 from Witold Baryluk <baryluk smp.if.uj.edu.pl> 2010-01-28
20:17:58 PST ---
I tested this in 2.039 and v2.028, so probably all version beetwen are
affected.
I don't know how far this bug was introducent.

Here is simplified test case (functionally it is buggy but it is sufficient to
show bug):

import std.math;
import std.stdio;

double bisect(double z) {
    double left = -7.0, right = 7.0, half;
    while (true) { // do {} while(true); // also have this problem
        half = (left+right)*0.5;
version(something) {
        writefln("%s", half); // adding this solves problem
}

        if (left == half) {
            return half; // or break
        }
        if (half == right) {
            return half; // or break
        }
/+
        // the same effect as two if statments.
        if ((left == half) || (half == right)) {
            return half; // or break
        }
+/
        double fhalf = exp(-0.5*half*half) * (half + 0.7) - z;
        if (fhalf > 0.0) {
            right = half;
        } else if (fhalf <= 0.0) {
            left = half;
        }
    };
    //return half; // not rechable, irrevelant
}

void main() {
    foreach (i; 1 .. 10) {
        auto x = 0.1 + 0.001*i;
        writefln("%g %g", x, bisect(x));
    }
}


compile without optimalisation:
# dmd blad.d; ./blad
0.101 -0.580467
0.102 -0.579361
0.103 -0.578256
0.104 -0.577153
0.105 -0.57605
0.106 -0.574949
0.107 -0.573849
0.108 -0.57275
0.109 -0.571653
#

compile with optimalisation:
# dmd -O blad.d; ./blad
0.101 -0.580467
.... nothing happens, CPU usage 100%
^C
#

compile with optimalisation and dummy write:
# dmd -O -version=something blad.d; ./blad
...
...
0.109 -0.571653
# // correctly ends just like without optimalisations



You can check asmbler code here:

version without optimalisations:
.text._D4blad6bisectFdZd        segment
        assume  CS:.text._D4blad6bisectFdZd
_D4blad6bisectFdZd:
                push    EBP
                mov     EBP,ESP
                sub     ESP,020h
                fld     qword ptr FLAT:.rodata[08h]
                fstp    qword ptr -020h[EBP]
                fld     qword ptr FLAT:.rodata[019h]
                fstp    qword ptr -018h[EBP]
                fld     qword ptr FLAT:.rodata[02Ah]
                fstp    qword ptr -010h[EBP]
L21:            fld     qword ptr -020h[EBP]
                fadd    qword ptr -018h[EBP]
                fmul    qword ptr _TMP6 SYM32[09h]
                fstp    qword ptr -010h[EBP]
                fld     qword ptr -020h[EBP]
                fld     qword ptr -010h[EBP]
                fucompp ST(1),ST
                fstsw   AX
                sahf
                jne     L46
                jp      L46
                fld     qword ptr -010h[EBP]
                leave
                ret     8
L46:            fld     qword ptr -010h[EBP]
                fld     qword ptr -018h[EBP]
                fucompp ST(1),ST
                fstsw   AX
                sahf
                jne     L5C
                jp      L5C
                fld     qword ptr -010h[EBP]
                leave
                ret     8
L5C:            fld     qword ptr -010h[EBP]
                fmul    qword ptr _TMP6 SYM32[049h]
                fmul    qword ptr -010h[EBP]
                sub     ESP,8
                fstp    qword ptr [ESP]
                call    near ptr _D3std4math3expFNaNbdZd PC32
                fld     qword ptr -010h[EBP]
                fadd    qword ptr _TMP10 SYM32
                fmulp   ST(1),ST
                fsub    qword ptr 8[EBP]
                fstp    qword ptr -8[EBP]
                fld     qword ptr -8[EBP]
                ftst
                fstsw   AX
                sahf
                fstp    ST
                jbe     L98
                fld     qword ptr -010h[EBP]
                fstp    qword ptr -018h[EBP]
                jmp short       L21
L98:            fld     qword ptr -8[EBP]
                ftst
                fstsw   AX
                sahf
                fstp    ST
                ja      L21
                jp      L21
                fld     qword ptr -010h[EBP]
                fstp    qword ptr -020h[EBP]
                jmp     near ptr L21
                nop
                nop
                nop
.text._D4blad6bisectFdZd        ends


Version with optimalistion:

_D4blad6bisectFdZd:
                push    EBP
                mov     EBP,ESP
                sub     ESP,020h
                mov     dword ptr -010h[EBP],0
                fld     qword ptr FLAT:.rodata[0Fh]
                fld     qword ptr FLAT:.rodata[01Dh]
                fxch    ST1
                mov     dword ptr -0Ch[EBP],0
                fstp    qword ptr -020h[EBP]
                fstp    qword ptr -018h[EBP]
L28:            fld     qword ptr -010h[EBP]
                fld     qword ptr -018h[EBP]
                fucompp ST(1),ST
                fstsw   AX
                sahf
                jne     L40
                jp      L40
L37:            fld     qword ptr -010h[EBP]
                mov     ESP,EBP
                pop     EBP
                ret     8
L40:            fld     qword ptr -010h[EBP]
                fmul    qword ptr _TMP6 SYM32[025h]
                fmul    qword ptr -010h[EBP]
                sub     ESP,8
                fstp    qword ptr [ESP]
                call    near ptr _D3std4math3expFNaNbdZd PC32
                fld     qword ptr -010h[EBP]
                fadd    qword ptr _TMP6 SYM32[044h]
                fmulp   ST(1),ST
                fsub    qword ptr 8[EBP]
                fst     qword ptr -8[EBP]
                ftst
                fstsw   AX
                fstp    ST
                sahf
                jbe     L93
                fld     qword ptr -010h[EBP]
                fstp    qword ptr -018h[EBP]
L77:            fld     qword ptr -020h[EBP]
                fld     ST0
                fadd    qword ptr -018h[EBP]
                fmul    qword ptr _TMP10 SYM32[09h]
                fst     qword ptr -010h[EBP]
                fucompp ST(1),ST
                fstsw   AX
                sahf
                jp      L28
                je      L37
                jmp short       L28
L93:            fld     qword ptr -8[EBP]
                ftst
                fstsw   AX
                fstp    ST
                sahf
                ja      L77
                jp      L77
                fld     qword ptr -010h[EBP]
                fstp    qword ptr -020h[EBP]
                jmp short       L77
                nop
                nop
                nop
.text._D4blad6bisectFdZd        ends

version with "something" added artifacialy:
.text._D4blad6bisectFdZd        segment
        assume  CS:.text._D4blad6bisectFdZd
_D4blad6bisectFdZd:
                push    EBP
                mov     EBP,ESP
                sub     ESP,020h
                mov     dword ptr -010h[EBP],0
                fld     qword ptr _TMP0 SYM32[017h]
                fld     qword ptr _TMP0 SYM32[025h]
                fxch    ST1
                mov     dword ptr -0Ch[EBP],0
                fstp    qword ptr -020h[EBP]
                fstp    qword ptr -018h[EBP]
                push    dword ptr _TMP0 SYM32[02Eh]
                push    dword ptr _TMP0 SYM32[030h]
                push    0
                push    0
                call    near ptr ...writeflnTAyaTdZ8writeflnFAyadZv PC32
L3D:            fld     qword ptr -010h[EBP]
                fld     qword ptr -018h[EBP]
                fucompp ST(1),ST
                fstsw   AX
                sahf
                jne     L55
                jp      L55
L4C:            fld     qword ptr -010h[EBP]
                mov     ESP,EBP
                pop     EBP
                ret     8
L55:            fld     qword ptr -010h[EBP]
                fmul    qword ptr _TMP7 SYM32[03Ah]
                fmul    qword ptr -010h[EBP]
                sub     ESP,8
                fstp    qword ptr [ESP]
                call    near ptr _D3std4math3expFNaNbdZd PC32
                fld     qword ptr -010h[EBP]
                fadd    qword ptr _TMP10 SYM32[09h]
                fmulp   ST(1),ST
                fsub    qword ptr 8[EBP]
                fst     qword ptr -8[EBP]
                ftst
                fstsw   AX
                fstp    ST
                sahf
                jbe     LCA
                fld     qword ptr -010h[EBP]
                fstp    qword ptr -018h[EBP]
L8C:            push    dword ptr _TMP10 SYM32[02h]
                push    dword ptr _TMP10 SYM32[04h]
                fld     qword ptr -020h[EBP]
                fadd    qword ptr -018h[EBP]
                fmul    qword ptr _TMP11 SYM32[028h]
                fst     qword ptr -010h[EBP]
                sub     ESP,8
                fstp    qword ptr [ESP]
                call    near ptr  ...writeflnTAyaTdZ8writeflnFAyadZv PC32
                fld     qword ptr -020h[EBP]
                fld     qword ptr -010h[EBP]
                fucompp ST(1),ST
                fstsw   AX
                sahf
                jp      L3D
                je      L4C
                jmp     near ptr L3D
LCA:            fld     qword ptr -8[EBP]
                ftst
                fstsw   AX
                fstp    ST
                sahf
                ja      L8C
                jp      L8C
                fld     qword ptr -010h[EBP]
                fstp    qword ptr -020h[EBP]
                jmp short       L8C
.text._D4blad6bisectFdZd        ends



What is interesting, that just adding single writefln makes this asembler code
change in much more places than just call of this function.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Jan 28 2010
next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751



--- Comment #1 from Witold Baryluk <baryluk smp.if.uj.edu.pl> 2010-01-28
20:24:33 PST ---
exp(-0.5*half*half) can be change to 1.0/(half*half) and bug is still the same.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Jan 28 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751


Stephan Dilly <spam extrawurst.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |spam extrawurst.org


--- Comment #2 from Stephan Dilly <spam extrawurst.org> 2010-01-28 22:30:29 PST
---
is this maybe related to issue 3736 ?

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Jan 28 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751



--- Comment #3 from Witold Baryluk <baryluk smp.if.uj.edu.pl> 2010-01-29
04:28:20 PST ---
(In reply to comment #2)
 is this maybe related to issue 3736 ?
I don't know. This is quite different issues (i'm not using structs, and i doesn't call functions when error occure). My bug have probably something with FPU register allocation, but this is just a guess. I further reduced test case, change "write("%s",half)" to nic(), where nic is empty function, and write code directly in main import std.math; import std.stdio; void nic() {} void main() { foreach (i; 1 .. 10) { auto z = 0.1 + 0.001*i; double left = -7.0, right = 7.0, half; while (true) { half = (left+right)*0.5; // or left+0.5*(right-left); version(something) { nic(); // adding this solves problem } if (left == half) { break; // or break } if (half == right) { break; // or break } /+ // the same effect if ((left == half) || (half == right)) { return half; // or break } +/ double fhalf = 1.0/(half*half) * (half + 0.7) - z; if (fhalf > 0.0) { right = half; } else if (fhalf <= 0.0) { left = half; } }; writefln("%g %g", z, half); } } One of the dide by side diff (left -O, right -O + nic): http://pastebin.com/f17a77299 -- Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email ------- You are receiving this mail because: -------
Jan 29 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751


Don <clugdbug yahoo.com.au> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |clugdbug yahoo.com.au


--- Comment #4 from Don <clugdbug yahoo.com.au> 2010-01-29 04:55:17 PST ---
I'm not certain that there's a bug here (difficult to tell without reducing the
test case further). It could be that in the optimised case you have a temporary
value stored in a register, increasing the precision. It's legal for the
compiler to do that. Adding a function call stops DMD from keeping the value in
a register.
Anyway you need to cut the test case down significantly.
BTW it behaves the same way on DMD0.175, so this is definitely not a
regression.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Jan 29 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751


Steven Schveighoffer <schveiguy yahoo.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |schveiguy yahoo.com


--- Comment #5 from Steven Schveighoffer <schveiguy yahoo.com> 2010-01-29
05:44:47 PST ---
(In reply to comment #4)
 I'm not certain that there's a bug here (difficult to tell without reducing the
 test case further). It could be that in the optimised case you have a temporary
 value stored in a register, increasing the precision. It's legal for the
 compiler to do that. Adding a function call stops DMD from keeping the value in
 a register.
 Anyway you need to cut the test case down significantly.
 BTW it behaves the same way on DMD0.175, so this is definitely not a
 regression.
I think this is exactly the problem. You are using floating point expecting that all floats are created equal. One rule about floating point is never to build an infinite loop waiting for two floats to converge using ==. You must use an epsilon if you expect any reasonable result. Your exit condition is this: if (left == half || right == half) What about trying this: if(half - left < SOME_EPSILON || right - half < SOME_EPSILON) where SOME_EPSILON is a very small insignificant number, like 0.000001 floating point convergence is an art form, you need to understand its limitations. -- Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email ------- You are receiving this mail because: -------
Jan 29 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751



--- Comment #6 from Witold Baryluk <baryluk smp.if.uj.edu.pl> 2010-01-29
05:48:52 PST ---
(In reply to comment #4)
 I'm not certain that there's a bug here (difficult to tell without reducing the
 test case further). It could be that in the optimised case you have a temporary
 value stored in a register, increasing the precision. It's legal for the
 compiler to do that. Adding a function call stops DMD from keeping the value in
 a register.
 Anyway you need to cut the test case down significantly.
 BTW it behaves the same way on DMD0.175, so this is definitely not a
 regression.
Hmm, You are probably right. Changing coditions to: if (left <= half) { break; } if (half >= right) { break; } solves problem. Interesingly compiling in gdc even with -O3 -finline -frelease -funsafe-math-optimizations -frounding-math -ffast-math doesn't bring this error. What is also problematic is that even adding forcibly something like cast(double) doesn't solve problem (they are noop's, becuase they are already of correct type). -- Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email ------- You are receiving this mail because: -------
Jan 29 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751



--- Comment #7 from Witold Baryluk <baryluk smp.if.uj.edu.pl> 2010-01-29
05:50:47 PST ---
(In reply to comment #5)
 
 I think this is exactly the problem.  You are using floating point expecting
 that all floats are created equal.
 
 One rule about floating point is never to build an infinite loop waiting for
 two floats to converge using ==.  You must use an epsilon if you expect any
 reasonable result.
I know this.
 
 Your exit condition is this:
 
 if (left == half || right == half)
 
 What about trying this:
 
 if(half - left < SOME_EPSILON || right - half < SOME_EPSILON)
 
 where SOME_EPSILON is a very small insignificant number, like 0.000001
 
 floating point convergence is an art form, you need to understand its
 limitations.
Yes I know, but I was relaying of IEEE 754 semantic, in which my code should work (there will be point in execuion when left == half or right == half exactly!). I was just trying to write this routine to be slightly faster just by not using EPSILONS and substractions. -- Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email ------- You are receiving this mail because: -------
Jan 29 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751



--- Comment #8 from Steven Schveighoffer <schveiguy yahoo.com> 2010-01-29
06:06:41 PST ---
(In reply to comment #7)
 Yes I know, but I was relaying of IEEE 754 semantic, in which my code should
 work (there will be point in execuion when left == half or right == half
 exactly!). I was just trying to write this routine to be slightly faster just
 by not using EPSILONS and substractions.
Are you sure? I'm not really a floating point expert, so I don't know what the rules of IEEE are, but I have written floating point convergence algorithms for things like programming competitions, and I had to learn that things will get stuck if you don't use an epsilon. You can picture what's happening if you make left and right integers. What ends up happening when left - right == 1? half becomes left + .5, so it does not compare equal to left or right, but if your calculation then assigns left equal to half, the .5 is dropped and left is stuck in the same loop. I would say that this is not a bug, but again, not a floating point expert. -- Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email ------- You are receiving this mail because: -------
Jan 29 2010
prev sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=3751


Witold Baryluk <baryluk smp.if.uj.edu.pl> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |RESOLVED
         Resolution|                            |INVALID


--- Comment #9 from Witold Baryluk <baryluk smp.if.uj.edu.pl> 2010-01-29
15:50:21 PST ---
(In reply to comment #8)
 (In reply to comment #7)
 Yes I know, but I was relaying of IEEE 754 semantic, in which my code should
 work (there will be point in execuion when left == half or right == half
 exactly!). I was just trying to write this routine to be slightly faster just
 by not using EPSILONS and substractions.
Are you sure? I'm not really a floating point expert, so I don't know what the rules of IEEE are, but I have written floating point convergence algorithms for things like programming competitions, and I had to learn that things will get stuck if you don't use an epsilon. You can picture what's happening if you make left and right integers. What ends up happening when left - right == 1? half becomes left + .5, so it does not compare equal to left or right, but if your calculation then assigns left equal to half, the .5 is dropped and left is stuck in the same loop.
Whell it can't be that "half is not equal to left", and then after assigning it is actually the same value. It is only becuase we have intermidiate half in higher precision than double precision. I'm actually using formula half = left + 0.5*(right-left); which is safer in regards of overflows, but this doesn't really metter.
 I would say that this is not a bug, but again, not a floating point expert.
Closing it. Sorry for a problem. -- Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email ------- You are receiving this mail because: -------
Jan 29 2010