
digitalmars.D.bugs - [Issue 10523] New: Don't call array op functions for short vector ops

http://d.puremagic.com/issues/show_bug.cgi?id=10523

           Summary: Don't call array op functions for short vector ops
           Product: D
           Version: D2
          Platform: All
        OS/Version: All
            Status: NEW
          Severity: enhancement
          Priority: P2
         Component: DMD
        AssignedTo: nobody puremagic.com
        ReportedBy: bearophile_hugs eml.cc



The D front-end lowers some vector ops, like a[] + b[], to calls to runtime
functions such as __arraySliceSliceAddSliceAssign_d. For more complex array
operations, like a[] + b[] + c[], no such function is available, so the
front-end lowers them to a loop instead.
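
As a rough illustration (the exact lowering is internal to the compiler; this
is only a sketch of the two behaviours described above):


import core.stdc.stdio: printf;

void main() {
    double[4] a = 1.0, b = 2.0, c = 3.0;
    double[4] d, e;

    // Simple op: currently lowered by the front-end to a call to a druntime
    // helper (the __arraySliceSliceAddSliceAssign_d seen in the assembly
    // below).
    d[] = a[] + b[];

    // No runtime helper covers this form, so the front-end already emits an
    // inline loop, roughly: foreach (i; 0 .. 4) e[i] = a[i] + b[i] + c[i];
    e[] = a[] + b[] + c[];

    printf("%f %f\n", d[0], e[0]);
}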

When vector ops are performed on small fixed-size arrays, the back-ends of the
gdc and ldc2 compilers are able to produce (very) efficient code from a loop.

So I suggest replacing _all_ vector ops with a loop when the array length is
known at compile time and is small (say, <= 16).
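
In user-level D, the intended rewrite for the simple c[] = a[] + b[] case would
look roughly like the function below (the name addSmall and the <= 16 threshold
are just illustrations of this suggestion, not existing compiler behaviour):


// What c[] = a[] + b[] would be turned into when the length is statically
// known and small: a plain loop that the back-end can unroll and vectorize.
void addSmall(ref double[4] c, ref const double[4] a, ref const double[4] b) {
    foreach (i; 0 .. c.length)
        c[i] = a[i] + b[i];
}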

Such a rewrite avoids killing optimization opportunities for the ldc2 and gdc
back-ends, and it probably improves performance even with dmd for arrays of,
say, 4 ints or 4 doubles.

This is not a general solution to the problem, but I think it's very simple to
implement and it significantly improves a wide class of common cases. I'd call
it very low-hanging fruit.

- - - - - - - - - - -

Below is some data to support this request.

The ldc2 compiler is generally a good optimizing compiler: it is able to unroll
loops, it should be able to perform some vectorization and de-virtualization,
it will be able to perform link-time optimization, and so on.

For this small program:


import core.stdc.stdio: printf;
void main() {
    double[4] a = void, b = void, c = void;
    a[] = 1.0;
    b[] = 2.0;
    c[] = a[] + b[];
    printf("%f %f %f %f\n", c[0], c[1], c[2], c[3]);
}



On 32-bit x86, ldc2 gives:

__Dmain:
    pushl   %ebp
    movl    %esp, %ebp
    andl    $-8, %esp
    subl    $136, %esp
    movl    $1072693248, 108(%esp)
    movl    $0, 104(%esp)
    movl    $1072693248, 116(%esp)
    movl    $0, 112(%esp)
    movl    $1072693248, 124(%esp)
    movl    $0, 120(%esp)
    movl    $1072693248, 132(%esp)
    movl    $0, 128(%esp)
    movl    $1073741824, 76(%esp)
    movl    $0, 72(%esp)
    movl    $1073741824, 84(%esp)
    movl    $0, 80(%esp)
    movl    $1073741824, 92(%esp)
    movl    $0, 88(%esp)
    movl    $1073741824, 100(%esp)
    movl    $0, 96(%esp)
    leal    104(%esp), %eax
    movl    %eax, 20(%esp)
    leal    72(%esp), %eax
    movl    %eax, 12(%esp)
    leal    40(%esp), %eax
    movl    %eax, 4(%esp)
    movl    $4, 16(%esp)
    movl    $4, 8(%esp)
    movl    $4, (%esp)
    calll   __arraySliceSliceAddSliceAssign_d
    movsd   40(%esp), %xmm0
    movsd   48(%esp), %xmm1
    movsd   56(%esp), %xmm2
    movsd   64(%esp), %xmm3
    movsd   %xmm3, 28(%esp)
    movsd   %xmm2, 20(%esp)
    movsd   %xmm1, 12(%esp)
    movsd   %xmm0, 4(%esp)
    movl    $_.str, (%esp)
    calll   ___mingw_printf
    xorl    %eax, %eax
    movl    %ebp, %esp
    popl    %ebp
    ret



The first group of movl instructions initializes the two arrays "a" and "b"
(dmd calls __memset64 for this, but for such small arrays these plain stores
can be a bit faster):

    movl    $1072693248, 108(%esp)
    movl    $0, 104(%esp)
    movl    $1072693248, 116(%esp)
    movl    $0, 112(%esp)
    movl    $1072693248, 124(%esp)
    movl    $0, 120(%esp)
    movl    $1072693248, 132(%esp)
    movl    $0, 128(%esp)
    movl    $1073741824, 76(%esp)
    movl    $0, 72(%esp)
    movl    $1073741824, 84(%esp)
    movl    $0, 80(%esp)
    movl    $1073741824, 92(%esp)
    movl    $0, 88(%esp)
    movl    $1073741824, 100(%esp)
    movl    $0, 96(%esp)


Then comes the setup for the call to the druntime function that performs the
vector op, including the array lengths:

    movl    $4, 16(%esp)
    movl    $4, 8(%esp)
    movl    $4, (%esp)
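
Each double[] is passed as a (length, pointer) pair, which is why three "$4"
lengths and three array addresses are stored before the call. A hypothetical
stand-in with the same overall shape (the real druntime function's parameter
order and exact signature may differ) would be:


// Purely illustrative: each slice argument carries its length at run time,
// even though the compiler already knew it statically at the call site.
extern (C) void sliceAddSketch(double[] res, double[] lhs, double[] rhs) {
    foreach (i; 0 .. res.length)
        res[i] = lhs[i] + rhs[i];
}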

The call:

    calll   __arraySliceSliceAddSliceAssign_d

And then it prepares the call to printf (I don't know why there are 8 movsd
instead of 4; apparently the four results are first loaded from "c" into XMM
registers and then stored into printf's outgoing stack arguments):

    movsd   40(%esp), %xmm0
    movsd   48(%esp), %xmm1
    movsd   56(%esp), %xmm2
    movsd   64(%esp), %xmm3
    movsd   %xmm3, 28(%esp)
    movsd   %xmm2, 20(%esp)
    movsd   %xmm1, 12(%esp)
    movsd   %xmm0, 4(%esp)
    movl    $_.str, (%esp)
    calll   ___mingw_printf


This is a similar program that uses a loop instead of vector ops:

import core.stdc.stdio: printf;

__gshared double x0 = 1.0,
                 x1 = 2.0,
                 x2 = 3.0,
                 x3 = 4.0,
                 y0 = 10.0,
                 y1 = 20.0,
                 y2 = 30.0,
                 y3 = 40.0;

void main() {
    double[4] a = [x0, x1, x2, x3];
    double[4] b = [y0, y1, y2, y3];
    double[4] c = void;
    foreach (i; 0 .. 4)
        c[i] = a[i] + b[i];
    printf("%f %f %f %f\n", c[0], c[1], c[2], c[3]);
}


LLVM currently has some vectorization capabilities, but apparently ldc2 is not
able to use two addpd instructions (or one vaddpd) for this loop, which would
look like this:

    movapd  208(%esp), %xmm0
    movapd  224(%esp), %xmm1
    addpd   176(%esp), %xmm0
    addpd   192(%esp), %xmm1
    movhpd  %xmm1, 28(%esp)
    movlpd  %xmm1, 20(%esp)
    movhpd  %xmm0, 12(%esp)
    movlpd  %xmm0, 4(%esp)
    movl    $_.str, (%esp)
    calll   ___mingw_printf


Instead, ldc2 currently generates code like this for the manual loop, with four
scalar addsd instructions:

    movsd   136(%esp), %xmm0
    movsd   144(%esp), %xmm1
    addsd   104(%esp), %xmm0
    addsd   112(%esp), %xmm1
    movsd   152(%esp), %xmm2
    addsd   120(%esp), %xmm2
    movsd   160(%esp), %xmm3
    addsd   128(%esp), %xmm3
    movsd   %xmm3, 28(%esp)
    movsd   %xmm2, 20(%esp)
    movsd   %xmm1, 12(%esp)
    movsd   %xmm0, 4(%esp)
    movl    $_.str, (%esp)
    calll   ___mingw_printf



Even so, four scalar addsd instructions are faster than the call to
__arraySliceSliceAddSliceAssign_d.
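
A rough timing harness like the one below can be used to check that claim (it
is not part of the measurements above; the __gshared inputs and the sink
variable only exist to keep the optimizer from deleting the work, and the
numbers depend on compiler and flags):


import core.stdc.stdio: printf;
import core.stdc.time: clock, CLOCKS_PER_SEC;

__gshared double x = 1.0, y = 2.0;

void main() {
    enum N = 50_000_000;
    double[4] a, b, c;
    a[] = x;
    b[] = y;
    double sink = 0.0;

    auto t0 = clock();
    foreach (_; 0 .. N) {
        c[] = a[] + b[];            // currently a druntime call
        sink += c[0];
    }
    auto t1 = clock();
    foreach (_; 0 .. N) {
        foreach (i; 0 .. 4)
            c[i] = a[i] + b[i];     // plain loop the back-end can optimize
        sink += c[0];
    }
    auto t2 = clock();

    printf("vector op: %f s\n", cast(double)(t1 - t0) / CLOCKS_PER_SEC);
    printf("loop:      %f s\n", cast(double)(t2 - t1) / CLOCKS_PER_SEC);
    printf("sink: %f\n", sink);     // keep the results alive
}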

I think the cause of the problem is that the ldc2 back-end never sees the array
lengths (here 4), even though they are known at compile time. A simple solution
is to not call functions like __arraySliceSliceAddSliceAssign_d when the array
length is known to be small, and to just inline a small loop. The D front-end
is already able to do this when there is no run-time function available:


import core.stdc.stdio: printf;

__gshared double x0 = 1.0,
                 x1 = 2.0,
                 x2 = 3.0,
                 x3 = 4.0,
                 y0 = 10.0,
                 y1 = 20.0,
                 y2 = 30.0,
                 y3 = 40.0,
                 z0 = 100.0,
                 z1 = 200.0,
                 z2 = 300.0,
                 z3 = 400.0;
void main() {
    double[4] a = [x0, x1, x2, x3];
    double[4] b = [y0, y1, y2, y3];
    double[4] c = [z0, z1, z2, z3];
    double[4] d = void;
    foreach (i; 0 .. 4)
        d[i] = a[i] + b[i] + c[i];
    printf("%f %f %f %f\n", d[0], d[1], d[2], d[3]);
}

    movsd   200(%esp), %xmm0
    movsd   208(%esp), %xmm1
    addsd   168(%esp), %xmm0
    addsd   176(%esp), %xmm1
    addsd   144(%esp), %xmm1
    movsd   216(%esp), %xmm2
    addsd   184(%esp), %xmm2
    addsd   152(%esp), %xmm2
    movsd   224(%esp), %xmm3
    addsd   192(%esp), %xmm3
    addsd   160(%esp), %xmm3
    movsd   136(%esp), %xmm4
    movsd   %xmm3, 28(%esp)
    movsd   %xmm2, 20(%esp)
    movsd   %xmm1, 12(%esp)
    addsd   %xmm0, %xmm4
    movsd   %xmm4, 4(%esp)
    movl    $_.str, (%esp)
    calll   ___mingw_printf

Jul 01 2013