digitalmars.D - Two CTFE benchmarks
- bearophile (123/123) Mar 28 2010 It's a lot of time I don't show benchmarks here :-)
- Don (3/160) Mar 28 2010 Memory reduction would speed it up enormously. Almost all CTFE issues
- bearophile (95/104) Mar 28 2010 For comparison, I have run the same programs in Python and Python+Psyco ...
It's a lot of time I don't show benchmarks here :-) I have taken the timings of run time or compile time of a integer-based function (nqueens) and a function that works with small arrays of integers (adapted from the Shootout). I have used dmd 2.042 with -O -release -inline The timings for the CFTE version don't include the printing on screen, I have just timed the compile time. The CPU is a Celeron at about 2.3 GHz. Code is shown below. Timings nqueens, seconds: N= 10 11 12 13 14 15 ctfe: 0.43 1.83 23.40 - - - runtime: 0.04 0.04 0.05 0.13 0.55 3.21 Timings fannkuch, seconds: N= 6 7 8 9 10 11 ctfe: 0.22 0.51 17.50 - - - runtime: 0.03 0.03 0.04 0.07 0.43 4.63 I have not timed ldc, but usually it's about 20-30% faster than dmd at CFTE (and often 100-300% faster than dmd at runtime). The CFTE is quite slower, but the main problem is that it uses a too much large amount of RAM. Reducing the RAM used can probably speed up too a little. ---------------------------- // nqueens import std.c.stdio: printf; // the original code is not mine uint nqueens(uint c, uint ll=0, uint r=0, uint n=0) { uint b; if (~c) for (uint S = ll | c | r; ~0U != S; b = ~S & (S + 1), S |= b, n = nqueens(c | b, (ll | b) << 1, (r | b) >> 1, n) ) {} else n++; return n; } enum int n = 10; version(ctfe) enum uint result = nqueens(~0U >> n); // ctfe void main() { version(ctfe) {} else uint result = nqueens(~0U >> n); // runtime printf("nqueens(%d) = %u\n", n, result); } ---------------------------- // fannkuch import std.c.stdio: printf; int fannkuch(int n) { int i, j, k, t, flips, maxFlipsCount, check; int r = n; auto perm = new int[n]; auto perm1 = new int[n]; auto count = new int[n]; foreach (ist; 0 .. n) perm1[ist] = ist; for ( ; ; ) { if (check < 30) { //pragma(msg, permToString(perm1)); // not possible check++; } while (r != 1) { count[r - 1] = r; r--; } if (!(perm1[0] == 0 || perm1[n-1] == n-1)) { foreach (ist; 0 .. n) perm[ist] = perm1[ist]; flips = 0; i = perm[0]; do { for (j = 1, k = i - 1; j < k; j++, k--) { t = perm[j]; perm[j] = perm[k]; perm[k] = t; } flips++; t = perm[i]; perm[i] = i; i = t; } while(i); if (flips > maxFlipsCount) maxFlipsCount = flips; } for ( ; ; ) { if (r == n) return maxFlipsCount; t = perm1[0]; for (i = 0; i < r; ) { j = i + 1; perm1[i] = perm1[j]; i = j; } perm1[r] = t; count[r]--; if (count[r] > 0) break; r++; } } } /* This is faster as run-time function: int fannkuch(int n)() { int i, j, k, t, flips, maxFlipsCount, check; int r = n; int[n] perm, perm1, count; foreach (ist; Range!(0, n)) perm1[ist] = ist; for ( ; ; ) { if (check < 30) { //pragma(msg, permToString(perm1)); // not possible because of many D bugs check++; } while (r != 1) { count[r - 1] = r; r--; } if (!(perm1[0] == 0 || perm1[n-1] == n-1)) { foreach (ist; Range!(0, n)) perm[ist] = perm1[ist]; ... */ enum int n = 11; version(ctfe) enum int result = fannkuch(n); // ctfe void main() { version(ctfe) {} else int result = fannkuch(n); // runtime printf("Pfannkuchen(%d) = %d\n", n, result); } ---------------------------- Bye, bearophile
Mar 28 2010
bearophile wrote:It's a lot of time I don't show benchmarks here :-) I have taken the timings of run time or compile time of a integer-based function (nqueens) and a function that works with small arrays of integers (adapted from the Shootout). I have used dmd 2.042 with -O -release -inline The timings for the CFTE version don't include the printing on screen, I have just timed the compile time. The CPU is a Celeron at about 2.3 GHz. Code is shown below. Timings nqueens, seconds: N= 10 11 12 13 14 15 ctfe: 0.43 1.83 23.40 - - - runtime: 0.04 0.04 0.05 0.13 0.55 3.21 Timings fannkuch, seconds: N= 6 7 8 9 10 11 ctfe: 0.22 0.51 17.50 - - - runtime: 0.03 0.03 0.04 0.07 0.43 4.63 I have not timed ldc, but usually it's about 20-30% faster than dmd at CFTE (and often 100-300% faster than dmd at runtime). The CFTE is quite slower, but the main problem is that it uses a too much large amount of RAM. Reducing the RAM used can probably speed up too a little.Memory reduction would speed it up enormously. Almost all CTFE issues trace back to bug 1330. Thanks for the benchmarks, they're very helpful.---------------------------- // nqueens import std.c.stdio: printf; // the original code is not mine uint nqueens(uint c, uint ll=0, uint r=0, uint n=0) { uint b; if (~c) for (uint S = ll | c | r; ~0U != S; b = ~S & (S + 1), S |= b, n = nqueens(c | b, (ll | b) << 1, (r | b) >> 1, n) ) {} else n++; return n; } enum int n = 10; version(ctfe) enum uint result = nqueens(~0U >> n); // ctfe void main() { version(ctfe) {} else uint result = nqueens(~0U >> n); // runtime printf("nqueens(%d) = %u\n", n, result); } ---------------------------- // fannkuch import std.c.stdio: printf; int fannkuch(int n) { int i, j, k, t, flips, maxFlipsCount, check; int r = n; auto perm = new int[n]; auto perm1 = new int[n]; auto count = new int[n]; foreach (ist; 0 .. n) perm1[ist] = ist; for ( ; ; ) { if (check < 30) { //pragma(msg, permToString(perm1)); // not possible check++; } while (r != 1) { count[r - 1] = r; r--; } if (!(perm1[0] == 0 || perm1[n-1] == n-1)) { foreach (ist; 0 .. n) perm[ist] = perm1[ist]; flips = 0; i = perm[0]; do { for (j = 1, k = i - 1; j < k; j++, k--) { t = perm[j]; perm[j] = perm[k]; perm[k] = t; } flips++; t = perm[i]; perm[i] = i; i = t; } while(i); if (flips > maxFlipsCount) maxFlipsCount = flips; } for ( ; ; ) { if (r == n) return maxFlipsCount; t = perm1[0]; for (i = 0; i < r; ) { j = i + 1; perm1[i] = perm1[j]; i = j; } perm1[r] = t; count[r]--; if (count[r] > 0) break; r++; } } } /* This is faster as run-time function: int fannkuch(int n)() { int i, j, k, t, flips, maxFlipsCount, check; int r = n; int[n] perm, perm1, count; foreach (ist; Range!(0, n)) perm1[ist] = ist; for ( ; ; ) { if (check < 30) { //pragma(msg, permToString(perm1)); // not possible because of many D bugs check++; } while (r != 1) { count[r - 1] = r; r--; } if (!(perm1[0] == 0 || perm1[n-1] == n-1)) { foreach (ist; Range!(0, n)) perm[ist] = perm1[ist]; ... */ enum int n = 11; version(ctfe) enum int result = fannkuch(n); // ctfe void main() { version(ctfe) {} else int result = fannkuch(n); // runtime printf("Pfannkuchen(%d) = %d\n", n, result); } ---------------------------- Bye, bearophile
Mar 28 2010
Don:Thanks for the benchmarks, they're very helpful.<You are welcome :-)Timings nqueens, seconds: N= 10 11 12 13 14 15 ctfe: 0.43 1.83 23.40 - - - runtime: 0.04 0.04 0.05 0.13 0.55 3.21 Timings fannkuch, seconds: N= 6 7 8 9 10 11 ctfe: 0.22 0.51 17.50 - - - runtime: 0.03 0.03 0.04 0.07 0.43 4.63For comparison, I have run the same programs in Python and Python+Psyco (remember that Python integers are multi-precision, so they are slower). Python 2.6.5 and the latest Psyco, same CPU and OS. Timings nqueens, seconds: N= 10 11 12 13 14 15 Python: 0.17 0.51 2.34 12.19 71. Psyco: 0.15 0.35 1.40 7.15 45.61 261. Timings fannkuch, seconds: N= 6 7 8 9 10 11 Python: 0.08 0.11 0.32 2.47 28.4 361. Psyco: 0.09 0.10 0.10 0.15 0.84 9.80 ------------------ The Python code I have used (the extra operations done with 0xFFFFFFFF are necessary because Python has no uints that wrap, it has multi-precision ints that keep growing): import sys, psyco def search(cb, lb=0, rb=0, cnt=0): if cb == 0xFFFFFFFF: cnt += 1 else: bs = lb | cb | rb while bs != 0xFFFFFFFF: b = ~bs & (bs + 1) bs |= b cnt = search(cb | b, ((lb | b) << 1) & 0xFFFFFFFF, (rb | b) >> 1, cnt) return cnt psyco.full() N = int(sys.argv[1]) print search(0xFFFFFFFF >> N) ------------------ fannkuch: import sys, array import psyco USES_PSYCO = True def fannkuch(n): if USES_PSYCO: perm = array.array('l', [0] * n) perm1 = array.array('l', range(n)) count = array.array('l', [0] * n) else: perm = [0] * n perm1 = range(n) count = [0] * n m = n - 1 r = n maxFlipsCount = 0 didpr = 0 while True: if didpr < 30: #print "".join(str(i+1) for i in perm1) didpr += 1 while r != 1: count[r-1] = r r -= 1 if perm1[0] and perm1[m] != m: for i in xrange(n): i = perm[0] flips = 0 while i: temp = perm[i] perm[i] = i i = temp j = 1 k = i - 1 while j < k: temp = perm[j] perm[j] = perm[k] perm[k] = temp j += 1 k -= 1 flips += 1 if flips > maxFlipsCount: maxFlipsCount = flips while True: if r == n: return maxFlipsCount temp = perm1[0] i = 0 while i < r: j = i + 1 perm1[i] = perm1[j] i = j perm1[r] = temp count[r] -= 1 if count[r] > 0: break r += 1 if USES_PSYCO: psyco.bind(fannkuch) N = int(sys.argv[1]) print "Pfannkuchen(%d) = %d" % (N, fannkuch(N)) ------------------ Bye, bearophile
Mar 28 2010