www.digitalmars.com         C & C++   DMDScript  

digitalmars.D - Two CTFE benchmarks

reply bearophile <bearophileHUGS lycos.com> writes:
It's a lot of time I don't show benchmarks here :-)
I have taken the timings of run time or compile time of a integer-based
function (nqueens) and a function that works with small arrays of integers
(adapted from the Shootout).

I have used dmd 2.042 with -O -release -inline
The timings for the CFTE version don't include the printing on screen, I have
just timed the compile time.
The CPU is a Celeron at about 2.3 GHz. Code is shown below.


Timings nqueens, seconds:
  N=         10    11     12    13    14    15
  ctfe:    0.43  1.83  23.40     -     -     -
  runtime: 0.04  0.04   0.05  0.13  0.55  3.21


Timings fannkuch, seconds:
  N=          6     7      8     9    10    11
  ctfe:    0.22  0.51  17.50     -     -     -
  runtime: 0.03  0.03   0.04  0.07  0.43  4.63


I have not timed ldc, but usually it's about 20-30% faster than dmd at CFTE
(and often 100-300% faster than dmd at runtime).

The CFTE is quite slower, but the main problem is that it uses a too much large
amount of RAM. Reducing the RAM used can probably speed up too a little.

----------------------------

// nqueens

import std.c.stdio: printf;

// the original code is not mine
uint nqueens(uint c, uint ll=0, uint r=0, uint n=0) {
    uint b;
    if (~c)
        for (uint
             S = ll | c | r; ~0U != S;
             b = ~S & (S + 1), S |= b, n = nqueens(c | b, (ll | b) << 1, (r |
b) >> 1, n)
            ) {}
    else
        n++;
    return n;
}

enum int n = 10;
version(ctfe) enum uint result = nqueens(~0U >> n); // ctfe
void main() {
    version(ctfe) {} else uint result = nqueens(~0U >> n); // runtime
    printf("nqueens(%d) = %u\n", n, result);
}

----------------------------

// fannkuch

import std.c.stdio: printf;

int fannkuch(int n) {
    int i, j, k, t, flips, maxFlipsCount, check;
    int r = n;
    auto perm = new int[n];
    auto perm1 = new int[n];
    auto count = new int[n];

    foreach (ist; 0 .. n)
        perm1[ist] = ist;
    for ( ; ; ) {
        if (check < 30) {
            //pragma(msg, permToString(perm1)); // not possible
            check++;
        }

        while (r != 1) {
            count[r - 1] = r;
            r--;
        }

        if (!(perm1[0] == 0 || perm1[n-1] == n-1)) {
            foreach (ist; 0 .. n)
                perm[ist] = perm1[ist];

            flips = 0;
            i = perm[0];
            do {
                for (j = 1, k = i - 1; j < k; j++, k--) {
                    t = perm[j];
                    perm[j] = perm[k];
                    perm[k] = t;
                }
                flips++;
                t = perm[i];
                perm[i] = i;
                i = t;
            } while(i);

            if (flips > maxFlipsCount)
                maxFlipsCount = flips;
        }

        for ( ; ; ) {
            if (r == n)
                return maxFlipsCount;
            t = perm1[0];
            for (i = 0; i < r; ) {
                j = i + 1;
                perm1[i] = perm1[j];
                i = j;
            }
            perm1[r] = t;

            count[r]--;
            if (count[r] > 0)
                break;
            r++;
        }
    }
}


/*
This is faster as run-time function:

int fannkuch(int n)() {
    int i, j, k, t, flips, maxFlipsCount, check;
    int r = n;
    int[n] perm, perm1, count;

    foreach (ist; Range!(0, n))
        perm1[ist] = ist;
    for ( ; ; ) {
        if (check < 30) {
            //pragma(msg, permToString(perm1)); // not possible because of many
D bugs
            check++;
        }

        while (r != 1) {
            count[r - 1] = r;
            r--;
        }

        if (!(perm1[0] == 0 || perm1[n-1] == n-1)) {
            foreach (ist; Range!(0, n))
                perm[ist] = perm1[ist];
...
*/


enum int n = 11;
version(ctfe) enum int result = fannkuch(n); // ctfe
void main() {
    version(ctfe) {} else int result = fannkuch(n); // runtime
    printf("Pfannkuchen(%d) = %d\n", n, result);
}

----------------------------

Bye,
bearophile
Mar 28 2010
parent reply Don <nospam nospam.com> writes:
bearophile wrote:
 It's a lot of time I don't show benchmarks here :-)
 I have taken the timings of run time or compile time of a integer-based
function (nqueens) and a function that works with small arrays of integers
(adapted from the Shootout).
 
 I have used dmd 2.042 with -O -release -inline
 The timings for the CFTE version don't include the printing on screen, I have
just timed the compile time.
 The CPU is a Celeron at about 2.3 GHz. Code is shown below.
 
 
 Timings nqueens, seconds:
   N=         10    11     12    13    14    15
   ctfe:    0.43  1.83  23.40     -     -     -
   runtime: 0.04  0.04   0.05  0.13  0.55  3.21
 
 
 Timings fannkuch, seconds:
   N=          6     7      8     9    10    11
   ctfe:    0.22  0.51  17.50     -     -     -
   runtime: 0.03  0.03   0.04  0.07  0.43  4.63
 
 
 I have not timed ldc, but usually it's about 20-30% faster than dmd at CFTE
(and often 100-300% faster than dmd at runtime).
 
 The CFTE is quite slower, but the main problem is that it uses a too much
large amount of RAM. Reducing the RAM used can probably speed up too a little.

Memory reduction would speed it up enormously. Almost all CTFE issues trace back to bug 1330. Thanks for the benchmarks, they're very helpful.
 
 ----------------------------
 
 // nqueens
 
 import std.c.stdio: printf;
 
 // the original code is not mine
 uint nqueens(uint c, uint ll=0, uint r=0, uint n=0) {
     uint b;
     if (~c)
         for (uint
              S = ll | c | r; ~0U != S;
              b = ~S & (S + 1), S |= b, n = nqueens(c | b, (ll | b) << 1, (r |
b) >> 1, n)
             ) {}
     else
         n++;
     return n;
 }
 
 enum int n = 10;
 version(ctfe) enum uint result = nqueens(~0U >> n); // ctfe
 void main() {
     version(ctfe) {} else uint result = nqueens(~0U >> n); // runtime
     printf("nqueens(%d) = %u\n", n, result);
 }
 
 ----------------------------
 
 // fannkuch
 
 import std.c.stdio: printf;
 
 int fannkuch(int n) {
     int i, j, k, t, flips, maxFlipsCount, check;
     int r = n;
     auto perm = new int[n];
     auto perm1 = new int[n];
     auto count = new int[n];
 
     foreach (ist; 0 .. n)
         perm1[ist] = ist;
     for ( ; ; ) {
         if (check < 30) {
             //pragma(msg, permToString(perm1)); // not possible
             check++;
         }
 
         while (r != 1) {
             count[r - 1] = r;
             r--;
         }
 
         if (!(perm1[0] == 0 || perm1[n-1] == n-1)) {
             foreach (ist; 0 .. n)
                 perm[ist] = perm1[ist];
 
             flips = 0;
             i = perm[0];
             do {
                 for (j = 1, k = i - 1; j < k; j++, k--) {
                     t = perm[j];
                     perm[j] = perm[k];
                     perm[k] = t;
                 }
                 flips++;
                 t = perm[i];
                 perm[i] = i;
                 i = t;
             } while(i);
 
             if (flips > maxFlipsCount)
                 maxFlipsCount = flips;
         }
 
         for ( ; ; ) {
             if (r == n)
                 return maxFlipsCount;
             t = perm1[0];
             for (i = 0; i < r; ) {
                 j = i + 1;
                 perm1[i] = perm1[j];
                 i = j;
             }
             perm1[r] = t;
 
             count[r]--;
             if (count[r] > 0)
                 break;
             r++;
         }
     }
 }
 
 
 /*
 This is faster as run-time function:
 
 int fannkuch(int n)() {
     int i, j, k, t, flips, maxFlipsCount, check;
     int r = n;
     int[n] perm, perm1, count;
 
     foreach (ist; Range!(0, n))
         perm1[ist] = ist;
     for ( ; ; ) {
         if (check < 30) {
             //pragma(msg, permToString(perm1)); // not possible because of
many D bugs
             check++;
         }
 
         while (r != 1) {
             count[r - 1] = r;
             r--;
         }
 
         if (!(perm1[0] == 0 || perm1[n-1] == n-1)) {
             foreach (ist; Range!(0, n))
                 perm[ist] = perm1[ist];
 ...
 */
 
 
 enum int n = 11;
 version(ctfe) enum int result = fannkuch(n); // ctfe
 void main() {
     version(ctfe) {} else int result = fannkuch(n); // runtime
     printf("Pfannkuchen(%d) = %d\n", n, result);
 }
 
 ----------------------------
 
 Bye,
 bearophile

Mar 28 2010
parent bearophile <bearophileHUGS lycos.com> writes:
Don:

Thanks for the benchmarks, they're very helpful.<

You are welcome :-)
 Timings nqueens, seconds:
   N=         10    11     12    13    14    15
   ctfe:    0.43  1.83  23.40     -     -     -
   runtime: 0.04  0.04   0.05  0.13  0.55  3.21


 Timings fannkuch, seconds:
   N=          6     7      8     9    10    11
   ctfe:    0.22  0.51  17.50     -     -     -
   runtime: 0.03  0.03   0.04  0.07  0.43  4.63

For comparison, I have run the same programs in Python and Python+Psyco (remember that Python integers are multi-precision, so they are slower). Python 2.6.5 and the latest Psyco, same CPU and OS. Timings nqueens, seconds: N= 10 11 12 13 14 15 Python: 0.17 0.51 2.34 12.19 71. Psyco: 0.15 0.35 1.40 7.15 45.61 261. Timings fannkuch, seconds: N= 6 7 8 9 10 11 Python: 0.08 0.11 0.32 2.47 28.4 361. Psyco: 0.09 0.10 0.10 0.15 0.84 9.80 ------------------ The Python code I have used (the extra operations done with 0xFFFFFFFF are necessary because Python has no uints that wrap, it has multi-precision ints that keep growing): import sys, psyco def search(cb, lb=0, rb=0, cnt=0): if cb == 0xFFFFFFFF: cnt += 1 else: bs = lb | cb | rb while bs != 0xFFFFFFFF: b = ~bs & (bs + 1) bs |= b cnt = search(cb | b, ((lb | b) << 1) & 0xFFFFFFFF, (rb | b) >> 1, cnt) return cnt psyco.full() N = int(sys.argv[1]) print search(0xFFFFFFFF >> N) ------------------ fannkuch: import sys, array import psyco USES_PSYCO = True def fannkuch(n): if USES_PSYCO: perm = array.array('l', [0] * n) perm1 = array.array('l', range(n)) count = array.array('l', [0] * n) else: perm = [0] * n perm1 = range(n) count = [0] * n m = n - 1 r = n maxFlipsCount = 0 didpr = 0 while True: if didpr < 30: #print "".join(str(i+1) for i in perm1) didpr += 1 while r != 1: count[r-1] = r r -= 1 if perm1[0] and perm1[m] != m: for i in xrange(n): perm[i] = perm1[i] # To avoid memory trashing i = perm[0] flips = 0 while i: temp = perm[i] perm[i] = i i = temp j = 1 k = i - 1 while j < k: temp = perm[j] perm[j] = perm[k] perm[k] = temp j += 1 k -= 1 flips += 1 if flips > maxFlipsCount: maxFlipsCount = flips while True: if r == n: return maxFlipsCount temp = perm1[0] i = 0 while i < r: j = i + 1 perm1[i] = perm1[j] i = j perm1[r] = temp count[r] -= 1 if count[r] > 0: break r += 1 if USES_PSYCO: psyco.bind(fannkuch) N = int(sys.argv[1]) print "Pfannkuchen(%d) = %d" % (N, fannkuch(N)) ------------------ Bye, bearophile
Mar 28 2010