www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.bugs - [Issue 10286] New: Better optimization for struct constructors?

http://d.puremagic.com/issues/show_bug.cgi?id=10286

           Summary: Better optimization for struct constructors?
           Product: D
           Version: D2
          Platform: All
        OS/Version: All
            Status: NEW
          Severity: enhancement
          Priority: P2
         Component: DMD
        AssignedTo: nobody puremagic.com
        ReportedBy: bearophile_hugs eml.cc



Created an attachment (id=1220)
A small raytracer

This is a benchmark program that shows the difference in run-time between a
struct with an explicit constructor and one without:


import core.stdc.stdio: printf;
import core.stdc.stdlib: atoi;

/// Three-component vector of doubles WITH an explicit constructor.
/// This is the variant that spam1 instantiates.
struct V3a {
    double x, y, z;

    /// Stores each argument in the field of the same name.
    this(in double x_, in double y_, in double z_)
    pure nothrow {
        this.x = x_;
        this.y = y_;
        this.z = z_;
    }
}

/// Same three double fields as V3a, but with no explicit constructor.
/// This is the variant that spam2 instantiates.
struct V3b {
    double x, y, z;
}

/// Constructor variant of the benchmark loop.
///
/// For every i in [0, N) builds a V3a through its explicit constructor
/// with all three arguments set to i, and accumulates the y field.
///
/// Returns: the sum of the y fields over all N iterations (0.0 when N == 0).
double spam1(in uint N) pure nothrow {
    double total = 0.0;

    for (uint i = 0; i < N; i++) {
        // Only v.y is read afterwards; x and z are never used.
        immutable v = V3a(i, i, i);
        total += v.y;
    }

    return total;
}

/// No-constructor variant of the benchmark loop.
///
/// Identical to spam1 except that the value is a V3b, which declares no
/// constructor, so V3b(i, i, i) initialises the fields directly.
///
/// Returns: the sum of the y fields over all N iterations (0.0 when N == 0).
double spam2(in uint N) pure nothrow {
    double total = 0.0;

    for (uint i = 0; i < N; i++) {
        // Only v.y is read afterwards; x and z are never used.
        immutable v = V3b(i, i, i);
        total += v.y;
    }

    return total;
}

/// Benchmark driver.
///
/// args[1] (optional): iteration count N, parsed with C atoi; defaults to 1_000.
/// args[2] (optional): when it equals "1" spam1 (V3a) is run, in every other
///                     case spam2 (V3b) is run.
/// The resulting sum is printed with printf("%f\n").
void main(in string[] args) {
    // D strings are not NUL-terminated, so a '\0' is appended before the
    // pointer is handed to the C function atoi.
    immutable uint N = (args.length >= 2) ?
                       atoi((args[1] ~ '\0').ptr) :
                       1_000;

    if (args.length >= 3 && args[2] == "1")
        printf("%f\n", spam1(N));
    else
        printf("%f\n", spam2(N));
}



If you run it you see a performance difference between creating V3a and V3b.
This is the asm generated by dmd (dmd 2.064alpha, -O -release -inline
-noboundscheck):



; dmd output for spam1 (the V3a loop, struct with an explicit constructor).
; Register/stack roles as used below:
;   EDX            = loop bound, copied from EAX on entry
;   ECX            = loop counter, starts at 0
;   034h[ESP] (8B) = running total
;   01Ch[ESP]      = destination of the V3a.init copy (the temporary struct)
_D5test25spam1FNaNbxkZd:
        sub    ESP,030h
        mov    EDX,EAX
        xor    ECX,ECX
        push    EBX
        test    EDX,EDX
        push    ESI
        push    EDI
        ; zero both dwords of the 8-byte running total
        mov    dword ptr 034h[ESP],0
        mov    dword ptr 038h[ESP],0
        ; bound == 0: skip the loop (flags are still those of the test above)
        je    L52
        ; Each iteration starts by copying the V3a.init image into the
        ; temporary: six dword string moves = 24 bytes. The spam2 listing
        ; has no equivalent of this copy.
L1E:        mov    ESI,offset FLAT:_D5test23V3a6__initZ
        lea    EDI,01Ch[ESP]
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        ; build a 64-bit integer at 0Ch[ESP]: low dword = counter, high dword = 0
        xor    EBX,EBX
        mov    EAX,ECX
        mov    0Ch[ESP],ECX
        inc    ECX
        cmp    ECX,EDX
        mov    010h[ESP],EBX
        ; convert it to floating point and store it at 024h[ESP]
        ; (offset 8 inside the temporary), then reload it and add it to the total
        fild    long64 ptr 0Ch[ESP]
        fstp    qword ptr 024h[ESP]
        fld    qword ptr 024h[ESP]
        fadd    qword ptr 034h[ESP]
        fstp    qword ptr 034h[ESP]
        ; loop while counter < bound (unsigned), using the flags of the cmp above
        jb    L1E
        ; load the total onto the x87 stack before returning
L52:        fld    qword ptr 034h[ESP]
        pop    EDI
        pop    ESI
        pop    EBX
        add    ESP,030h
        ret

; dmd output for spam2 (the V3b loop, struct without a constructor).
; Register/stack roles as used below:
;   EDX            = loop bound, copied from EAX on entry
;   ECX            = loop counter, starts at 0
;   02Ch[ESP] (8B) = running total
;   014h/01Ch/024h[ESP] = the three doubles written each iteration
; No init image is copied here; the three slots are written directly.
_D5test25spam2FNaNbxkZd:
        sub    ESP,030h
        mov    EDX,EAX
        xor    ECX,ECX
        push    EBX
        test    EDX,EDX
        ; zero both dwords of the 8-byte running total
        mov    dword ptr 02Ch[ESP],0
        mov    dword ptr 030h[ESP],0
        ; bound == 0: skip the loop (flags are still those of the test above)
        je    L63
        ; 1st conversion: 64-bit integer {low = counter, high = 0} at 4[ESP]
        ; -> floating point -> 014h[ESP]. EAX keeps the pre-increment counter.
L1C:        mov    4[ESP],ECX
        xor    EBX,EBX
        mov    EAX,ECX
        mov    8[ESP],EBX
        inc    ECX
        cmp    ECX,EDX
        fild    long64 ptr 4[ESP]
        fstp    qword ptr 014h[ESP]
        ; 2nd conversion of the same value -> 01Ch[ESP]
        mov    4[ESP],EAX
        mov    8[ESP],EBX
        fild    long64 ptr 4[ESP]
        fstp    qword ptr 01Ch[ESP]
        ; 3rd conversion of the same value; it is stored to 024h[ESP] below
        mov    4[ESP],EAX
        mov    8[ESP],EBX
        fild    long64 ptr 4[ESP]
        ; reload the value at 01Ch[ESP], swap so the 3rd conversion is on top,
        ; store that to 024h[ESP], then add the reloaded value to the total
        fld    qword ptr 01Ch[ESP]
        fxch    ST1
        fstp    qword ptr 024h[ESP]
        fadd    qword ptr 02Ch[ESP]
        fstp    qword ptr 02Ch[ESP]
        ; loop while counter < bound (unsigned), using the flags of the cmp above
        jb    L1C
        ; load the total onto the x87 stack before returning
L63:        fld    qword ptr 02Ch[ESP]
        pop    EBX
        add    ESP,030h
        ret


One visible difference is that block of movsd:
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd


Compiling with ldc2 (V. 0.11.0, based on DMD v2.062 and LLVM 3.3svn, -O5
-release -profile-verifier-noassert):

# ldc2 output for spam1 (AT&T syntax). No struct is stored to memory and no
# init image is copied; the loop works entirely in SSE registers:
#   eax  = remaining iterations, counted down to 0
#   xmm0 = floating-point counter, starts at 0
#   xmm1 = running total, starts at 0
#   xmm2 = per-iteration increment loaded from LCPI1_0
#          (presumably 1.0 -- the constant pool is not shown here)
__D5test25spam1FNaNbxkZd:
    pushl    %ebp
    movl    %esp, %ebp
    # align the stack to 8 bytes and reserve one 8-byte slot for the result
    andl    $-8, %esp
    subl    $8, %esp
    xorps    %xmm0, %xmm0
    # zero iterations: go and set the total to 0 at LBB1_1
    testl    %eax, %eax
    je    LBB1_1
    movsd    LCPI1_0, %xmm2
    xorps    %xmm1, %xmm1
    .align    16, 0x90
LBB1_3:
    # total += counter; counter += increment
    addsd    %xmm0, %xmm1
    addsd    %xmm2, %xmm0
    decl    %eax
    jne    LBB1_3
    jmp    LBB1_4
LBB1_1:
    xorps    %xmm1, %xmm1
LBB1_4:
    # move the total from xmm1 to the x87 stack through the stack slot
    movsd    %xmm1, (%esp)
    fldl    (%esp)
    movl    %ebp, %esp
    popl    %ebp
    ret

# ldc2 output for spam2 (AT&T syntax). Same loop shape as the ldc2 spam1
# listing, with the register roles permuted:
#   eax  = remaining iterations, counted down to 0
#   xmm0 = running total, starts at 0
#   xmm2 = floating-point counter, starts at 0
#   xmm1 = per-iteration increment loaded from LCPI2_0
#          (presumably 1.0 -- the constant pool is not shown here)
__D5test25spam2FNaNbxkZd:
    pushl    %ebp
    movl    %esp, %ebp
    # align the stack to 8 bytes and reserve one 8-byte slot for the result
    andl    $-8, %esp
    subl    $8, %esp
    xorps    %xmm0, %xmm0
    # zero iterations: the total in xmm0 is already 0, skip the loop
    testl    %eax, %eax
    je    LBB2_3
    movsd    LCPI2_0, %xmm1
    xorps    %xmm2, %xmm2
    .align    16, 0x90
LBB2_2:
    # total += counter; counter += increment
    addsd    %xmm2, %xmm0
    addsd    %xmm1, %xmm2
    decl    %eax
    jne    LBB2_2
LBB2_3:
    # move the total from xmm0 to the x87 stack through the stack slot
    movsd    %xmm0, (%esp)
    fldl    (%esp)
    movl    %ebp, %esp
    popl    %ebp
    ret


You can see that ldc2 compiles the two functions in the same way, and indeed
their run-time is the same.

But the problem is not limited to DMD. As soon as your program becomes a little
longer than such a tiny benchmark, the performance difference between structs
with constructors and structs without constructors becomes clearly visible even
with ldc2.

In the attachment you will find a small single-module ray tracer that contains
a V3 struct. The struct has a basic constructor like this:


    /// Field-wise constructor: stores each argument in the member of the
    /// same name and does nothing else.
    this(in double x_, in double y_, in double z_) pure nothrow {
        this.x = x_;
        this.y = y_;
        this.z = z_;
    }


If you run the program with or without that constructor, using both ldc2 and
dmd, you see a significant performance difference.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Jun 06 2013