digitalmars.D - Two CTFE benchmarks

bearophile (123/123) Mar 28 2010 It's a lot of time I don't show benchmarks here :-)

Don (3/160) Mar 28 2010 Memory reduction would speed it up enormously. Almost all CTFE issues

bearophile (95/104) Mar 28 2010 For comparison, I have run the same programs in Python and Python+Psyco ...

bearophile <bearophileHUGS lycos.com> writes:

It's a lot of time I don't show benchmarks here :-)
I have taken the timings of run time or compile time of a integer-based
function (nqueens) and a function that works with small arrays of integers
(adapted from the Shootout).

I have used dmd 2.042 with -O -release -inline
The timings for the CFTE version don't include the printing on screen, I have
just timed the compile time.
The CPU is a Celeron at about 2.3 GHz. Code is shown below.


Timings nqueens, seconds:
  N=         10    11     12    13    14    15
  ctfe:    0.43  1.83  23.40     -     -     -
  runtime: 0.04  0.04   0.05  0.13  0.55  3.21


Timings fannkuch, seconds:
  N=          6     7      8     9    10    11
  ctfe:    0.22  0.51  17.50     -     -     -
  runtime: 0.03  0.03   0.04  0.07  0.43  4.63


I have not timed ldc, but usually it's about 20-30% faster than dmd at CFTE
(and often 100-300% faster than dmd at runtime).

The CFTE is quite slower, but the main problem is that it uses a too much large
amount of RAM. Reducing the RAM used can probably speed up too a little.

----------------------------

// nqueens

import std.c.stdio: printf;

// the original code is not mine
uint nqueens(uint c, uint ll=0, uint r=0, uint n=0) {
    uint b;
    if (~c)
        for (uint
             S = ll | c | r; ~0U != S;
             b = ~S & (S + 1), S |= b, n = nqueens(c | b, (ll | b) << 1, (r |
b) >> 1, n)
            ) {}
    else
        n++;
    return n;
}

enum int n = 10;
version(ctfe) enum uint result = nqueens(~0U >> n); // ctfe
void main() {
    version(ctfe) {} else uint result = nqueens(~0U >> n); // runtime
    printf("nqueens(%d) = %u\n", n, result);
}

----------------------------

// fannkuch

import std.c.stdio: printf;

int fannkuch(int n) {
    int i, j, k, t, flips, maxFlipsCount, check;
    int r = n;
    auto perm = new int[n];
    auto perm1 = new int[n];
    auto count = new int[n];

    foreach (ist; 0 .. n)
        perm1[ist] = ist;
    for ( ; ; ) {
        if (check < 30) {
            //pragma(msg, permToString(perm1)); // not possible
            check++;
        }

        while (r != 1) {
            count[r - 1] = r;
            r--;
        }

        if (!(perm1[0] == 0 || perm1[n-1] == n-1)) {
            foreach (ist; 0 .. n)
                perm[ist] = perm1[ist];

            flips = 0;
            i = perm[0];
            do {
                for (j = 1, k = i - 1; j < k; j++, k--) {
                    t = perm[j];
                    perm[j] = perm[k];
                    perm[k] = t;
                }
                flips++;
                t = perm[i];
                perm[i] = i;
                i = t;
            } while(i);

            if (flips > maxFlipsCount)
                maxFlipsCount = flips;
        }

        for ( ; ; ) {
            if (r == n)
                return maxFlipsCount;
            t = perm1[0];
            for (i = 0; i < r; ) {
                j = i + 1;
                perm1[i] = perm1[j];
                i = j;
            }
            perm1[r] = t;

            count[r]--;
            if (count[r] > 0)
                break;
            r++;
        }
    }
}


/*
This is faster as run-time function:

int fannkuch(int n)() {
    int i, j, k, t, flips, maxFlipsCount, check;
    int r = n;
    int[n] perm, perm1, count;

    foreach (ist; Range!(0, n))
        perm1[ist] = ist;
    for ( ; ; ) {
        if (check < 30) {
            //pragma(msg, permToString(perm1)); // not possible because of many
D bugs
            check++;
        }

        while (r != 1) {
            count[r - 1] = r;
            r--;
        }

        if (!(perm1[0] == 0 || perm1[n-1] == n-1)) {
            foreach (ist; Range!(0, n))
                perm[ist] = perm1[ist];
...
*/


enum int n = 11;
version(ctfe) enum int result = fannkuch(n); // ctfe
void main() {
    version(ctfe) {} else int result = fannkuch(n); // runtime
    printf("Pfannkuchen(%d) = %d\n", n, result);
}

----------------------------

Bye,
bearophile

Mar 28 2010

Don <nospam nospam.com> writes:

bearophile wrote:
 It's a lot of time I don't show benchmarks here :-)
 I have taken the timings of run time or compile time of a integer-based
function (nqueens) and a function that works with small arrays of integers
(adapted from the Shootout).
 
 I have used dmd 2.042 with -O -release -inline
 The timings for the CFTE version don't include the printing on screen, I have
just timed the compile time.
 The CPU is a Celeron at about 2.3 GHz. Code is shown below.
 
 
 Timings nqueens, seconds:
   N=         10    11     12    13    14    15
   ctfe:    0.43  1.83  23.40     -     -     -
   runtime: 0.04  0.04   0.05  0.13  0.55  3.21
 
 
 Timings fannkuch, seconds:
   N=          6     7      8     9    10    11
   ctfe:    0.22  0.51  17.50     -     -     -
   runtime: 0.03  0.03   0.04  0.07  0.43  4.63
 
 
 I have not timed ldc, but usually it's about 20-30% faster than dmd at CFTE
(and often 100-300% faster than dmd at runtime).
 
 The CFTE is quite slower, but the main problem is that it uses a too much
large amount of RAM. Reducing the RAM used can probably speed up too a little.

Memory reduction would speed it up enormously. Almost all CTFE issues 
trace back to bug 1330. Thanks for the benchmarks, they're very helpful.

 
 ----------------------------
 
 // nqueens
 
 import std.c.stdio: printf;
 
 // the original code is not mine
 uint nqueens(uint c, uint ll=0, uint r=0, uint n=0) {
     uint b;
     if (~c)
         for (uint
              S = ll | c | r; ~0U != S;
              b = ~S & (S + 1), S |= b, n = nqueens(c | b, (ll | b) << 1, (r |
b) >> 1, n)
             ) {}
     else
         n++;
     return n;
 }
 
 enum int n = 10;
 version(ctfe) enum uint result = nqueens(~0U >> n); // ctfe
 void main() {
     version(ctfe) {} else uint result = nqueens(~0U >> n); // runtime
     printf("nqueens(%d) = %u\n", n, result);
 }
 
 ----------------------------
 
 // fannkuch
 
 import std.c.stdio: printf;
 
 int fannkuch(int n) {
     int i, j, k, t, flips, maxFlipsCount, check;
     int r = n;
     auto perm = new int[n];
     auto perm1 = new int[n];
     auto count = new int[n];
 
     foreach (ist; 0 .. n)
         perm1[ist] = ist;
     for ( ; ; ) {
         if (check < 30) {
             //pragma(msg, permToString(perm1)); // not possible
             check++;
         }
 
         while (r != 1) {
             count[r - 1] = r;
             r--;
         }
 
         if (!(perm1[0] == 0 || perm1[n-1] == n-1)) {
             foreach (ist; 0 .. n)
                 perm[ist] = perm1[ist];
 
             flips = 0;
             i = perm[0];
             do {
                 for (j = 1, k = i - 1; j < k; j++, k--) {
                     t = perm[j];
                     perm[j] = perm[k];
                     perm[k] = t;
                 }
                 flips++;
                 t = perm[i];
                 perm[i] = i;
                 i = t;
             } while(i);
 
             if (flips > maxFlipsCount)
                 maxFlipsCount = flips;
         }
 
         for ( ; ; ) {
             if (r == n)
                 return maxFlipsCount;
             t = perm1[0];
             for (i = 0; i < r; ) {
                 j = i + 1;
                 perm1[i] = perm1[j];
                 i = j;
             }
             perm1[r] = t;
 
             count[r]--;
             if (count[r] > 0)
                 break;
             r++;
         }
     }
 }
 
 
 /*
 This is faster as run-time function:
 
 int fannkuch(int n)() {
     int i, j, k, t, flips, maxFlipsCount, check;
     int r = n;
     int[n] perm, perm1, count;
 
     foreach (ist; Range!(0, n))
         perm1[ist] = ist;
     for ( ; ; ) {
         if (check < 30) {
             //pragma(msg, permToString(perm1)); // not possible because of
many D bugs
             check++;
         }
 
         while (r != 1) {
             count[r - 1] = r;
             r--;
         }
 
         if (!(perm1[0] == 0 || perm1[n-1] == n-1)) {
             foreach (ist; Range!(0, n))
                 perm[ist] = perm1[ist];
 ...
 */
 
 
 enum int n = 11;
 version(ctfe) enum int result = fannkuch(n); // ctfe
 void main() {
     version(ctfe) {} else int result = fannkuch(n); // runtime
     printf("Pfannkuchen(%d) = %d\n", n, result);
 }
 
 ----------------------------
 
 Bye,
 bearophile

Mar 28 2010

bearophile <bearophileHUGS lycos.com> writes:

Don:

Thanks for the benchmarks, they're very helpful.<

You are welcome :-)


 Timings nqueens, seconds:
   N=         10    11     12    13    14    15
   ctfe:    0.43  1.83  23.40     -     -     -
   runtime: 0.04  0.04   0.05  0.13  0.55  3.21


 Timings fannkuch, seconds:
   N=          6     7      8     9    10    11
   ctfe:    0.22  0.51  17.50     -     -     -
   runtime: 0.03  0.03   0.04  0.07  0.43  4.63


For comparison, I have run the same programs in Python and Python+Psyco
(remember that Python integers are multi-precision, so they are slower).

Python 2.6.5 and the latest Psyco, same CPU and OS.


Timings nqueens, seconds:
  N=        10    11     12     13     14     15
  Python: 0.17  0.51   2.34  12.19  71.
  Psyco:  0.15  0.35   1.40   7.15  45.61   261.

Timings fannkuch, seconds:
  N=          6     7     8     9    10      11
  Python:  0.08  0.11  0.32  2.47  28.4  361.
  Psyco:   0.09  0.10  0.10  0.15  0.84    9.80

------------------

The Python code I have used (the extra operations done with 0xFFFFFFFF are
necessary because Python has no uints that wrap, it has multi-precision ints
that keep growing):


import sys, psyco

def search(cb, lb=0, rb=0, cnt=0):
    if cb == 0xFFFFFFFF:
        cnt += 1
    else:
        bs = lb | cb | rb
        while bs != 0xFFFFFFFF:
            b = ~bs & (bs + 1)
            bs |= b
            cnt = search(cb | b, ((lb | b) << 1) & 0xFFFFFFFF,
                         (rb | b) >> 1, cnt)
    return cnt

psyco.full()
N = int(sys.argv[1])
print search(0xFFFFFFFF >> N)

------------------

fannkuch:


import sys, array
import psyco
USES_PSYCO = True

def fannkuch(n):
    if USES_PSYCO:
        perm = array.array('l', [0] * n)
        perm1 = array.array('l', range(n))
        count = array.array('l', [0] * n)
    else:
        perm = [0] * n
        perm1 = range(n)
        count = [0] * n

    m = n - 1
    r = n
    maxFlipsCount = 0
    didpr = 0

    while True:
        if didpr < 30:
            #print "".join(str(i+1) for i in perm1)
            didpr += 1

        while r != 1:
            count[r-1] = r
            r -= 1

        if perm1[0] and perm1[m] != m:
            for i in xrange(n):


            i = perm[0]
            flips = 0
            while i:
                temp = perm[i]
                perm[i] = i
                i = temp

                j = 1
                k = i - 1
                while j < k:
                    temp = perm[j]
                    perm[j] = perm[k]
                    perm[k] = temp
                    j += 1
                    k -= 1
                flips += 1

            if flips > maxFlipsCount:
                maxFlipsCount = flips

        while True:
            if r == n:
                return maxFlipsCount
            temp = perm1[0]
            i = 0
            while i < r:
                j = i + 1
                perm1[i] = perm1[j]
                i = j
            perm1[r] = temp

            count[r] -= 1
            if count[r] > 0:
                break
            r += 1


if USES_PSYCO:
    psyco.bind(fannkuch)
N = int(sys.argv[1])
print "Pfannkuchen(%d) = %d" % (N, fannkuch(N))

------------------

Bye,
bearophile

Mar 28 2010

D Programming

C/C++ Programming

Other

digitalmars.D - Two CTFE benchmarks