www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.ldc - Timing too good to be true

reply "Paul Jurczak" <pauljurczak yahoo.com> writes:
I'm running this test program with multiple compilers on Xubuntu 
and I'm getting impossibly short timings with LDC compared to DMD 
and GDC (timing details in comments below). I would appreciate 
any ideas about what is going on here?

/*
Xubuntu 13.04 64-bit Core i5 3450S 2.8GHz:
GDC 4.8.1:     gdc-4.8 -m64 -march=native -fno-bounds-check 
-frename-registers -frelease -O3
669171001  826ns  e28_0
669171001  824ns  e28_1

DMD64 2.063.2: dmd -O -noboundscheck -inline -release
669171001  1115ns  e28_0
669171001  1955ns  e28_1

LDC 0.11.0: ldmd2 -m64 -O -noboundscheck -inline -release
669171001  25ns  e28_0
669171001  25ns  e28_1
*/

module main;

import std.stdio, std.algorithm, std.range, std.datetime, 
std.typetuple;


int e28_0(int N = 1002) {
	int diagNumber = 1;					
	int sum        = diagNumber;	

	for (int width = 2; width < N; width += 2)	
		for (int j = 0; j < 4; ++j) {			
			diagNumber += width;				
			sum        += diagNumber;			
		}

	return sum;
}


int e28_1(int N = 1002) {
	int diagNumber = 1;					
	int sum        = diagNumber;	

	foreach (width; iota(2, N, 2))
		foreach (_; 0..4) {			
			diagNumber += width;				
			sum        += diagNumber;			
		}

	return sum;
}


void main()
{
   enum      N = 100_000;
   StopWatch sw;

    foreach (iFun, fun; TypeTuple!(e28_0, e28_1)) {
       int [N] results;
       long[N] timings;

       foreach (i; 0..N) {
          sw.reset;
          sw.start;
          results[i] = fun();
          sw.stop;
          timings[i] = sw.peek.nsecs;
       }

       writeln(results.reduce!min, "  ", timings.reduce!min, "ns  
e28_", iFun);
   }
}
Sep 21 2013
next sibling parent "Paul Jurczak" <pauljurczak yahoo.com> writes:
On Saturday, 21 September 2013 at 22:07:27 UTC, Paul Jurczak 
wrote:
 [...]
One more thing: I have multiple versions of gcc installed, but my ldc installation was from binaries provided on dlang.org. I'm not sure if there are any gcc dependencies, which could cause this problem.
Sep 21 2013
prev sibling next sibling parent reply "bearophile" <bearophileHUGS lycos.com> writes:
Paul Jurczak:

 I'm running this test program with multiple compilers on 
 Xubuntu and I'm getting impossibly short timings with LDC 
 compared to DMD and GDC (timing details in comments below). I 
 would appreciate any ideas about what is going on here?
Use -output-s on ldc and show us the asm of just the two functions. Bye, bearophile
Sep 21 2013
parent reply "Paul Jurczak" <pauljurczak yahoo.com> writes:
On Saturday, 21 September 2013 at 22:47:16 UTC, bearophile wrote:
 Paul Jurczak:

 I'm running this test program with multiple compilers on 
 Xubuntu and I'm getting impossibly short timings with LDC 
 compared to DMD and GDC (timing details in comments below). I 
 would appreciate any ideas about what is going on here?
Use -output-s on ldc and show us the asm of just the two functions. Bye, bearophile
Here it is: .globl _D4main5e28_0FiZi .align 16, 0x90 .type _D4main5e28_0FiZi, function _D4main5e28_0FiZi: .cfi_startproc movl $1, %eax cmpl $3, %edi jl .LBB0_3 movl $2, %r8d movl $20, %edx movl $8, %esi movl %eax, %ecx .align 16, 0x90 .LBB0_2: addl %edx, %eax leal (%rax,%rcx,4), %eax addl %esi, %ecx addl $8, %esi addl $20, %edx addl $2, %r8d cmpl %edi, %r8d jl .LBB0_2 .LBB0_3: ret .Ltmp0: .size _D4main5e28_0FiZi, .Ltmp0-_D4main5e28_0FiZi .cfi_endproc .globl _D4main5e28_1FiZi .align 16, 0x90 .type _D4main5e28_1FiZi, function _D4main5e28_1FiZi: .cfi_startproc movl $1, %eax cmpl $3, %edi jl .LBB1_4 leal -3(%rdi), %ecx shrl $31, %ecx leal -3(%rdi,%rcx), %ecx andl $-2, %ecx negl %ecx leal -3(%rdi,%rcx), %ecx leal 1(%rdi), %edx subl %ecx, %edx cmpl $2, %edx je .LBB1_4 decl %edi subl %ecx, %edi movl $1, %ecx movl $20, %edx movl $8, %esi movl $1, %eax .align 16, 0x90 .LBB1_3: addl %edx, %eax leal (%rax,%rcx,4), %eax addl %esi, %ecx addl $8, %esi addl $20, %edx addl $-2, %edi jne .LBB1_3 .LBB1_4: ret .Ltmp1: .size _D4main5e28_1FiZi, .Ltmp1-_D4main5e28_1FiZi .cfi_endproc
Sep 21 2013
parent reply David Nadlinger <code klickverbot.at> writes:
On Sun, Sep 22, 2013 at 2:27 AM, Paul Jurczak <pauljurczak yahoo.com> wrote:
 On Saturday, 21 September 2013 at 22:47:16 UTC, bearophile wrote:
 Paul Jurczak:

 I'm running this test program with multiple compilers on Xubuntu and I'm
 getting impossibly short timings with LDC compared to DMD and GDC (timing
 details in comments below). I would appreciate any ideas about what is going
 on here?
Use -output-s on ldc and show us the asm of just the two functions. Bye, bearophile
Here it is: [...]
As mentioned in my previous message, this is actually irrelevant for your benchmark, as the functions are not even executed once by the optimized test program. David
Sep 21 2013
parent reply "Paul Jurczak" <pauljurczak yahoo.com> writes:
On Sunday, 22 September 2013 at 00:39:43 UTC, David Nadlinger 
wrote:
 [...]
As mentioned in my previous message, this is actually irrelevant for your benchmark, as the functions are not even executed once by the optimized test program. David
If it is not executed even once, how is the value computed? At compile time?
Sep 21 2013
parent reply "Paul Jurczak" <pauljurczak yahoo.com> writes:
On Sunday, 22 September 2013 at 00:58:42 UTC, Paul Jurczak wrote:
 On Sunday, 22 September 2013 at 00:39:43 UTC, David Nadlinger 
 wrote:
 [...]
As mentioned in my previous message, this is actually irrelevant for your benchmark, as the functions are not even executed once by the optimized test program. David
If it is not executed even once, how is the value computed? At compile time?
OK, you've wrote "optimizer constant-folds the entire e28 functions away", which means compile time. Very clever optimizer! To defeat this behavior I changed timing loop to: foreach (i; 0..N) { sw.reset; sw.start; results[i] = fun(1002 + i%2); // a trick to defeat LLVM optimizer sw.stop; timings[i] = sw.peek.nsecs; } and now I'm getting sensible results: GDC 4.8.1: gdc-4.8 -m64 -march=native -fno-bounds-check -frename-registers -frelease -O3 669171001 830ns e28_0 669171001 830ns e28_1 DMD64 2.063.2: dmd -O -noboundscheck -inline -release 669171001 1115ns e28_0 669171001 1958ns e28_1 LDC 0.11.0: ldmd2 -m64 -O -noboundscheck -inline -release 669171001 454ns e28_0 669171001 395ns e28_1 LDC is smoking fast here!
Sep 21 2013
parent David Nadlinger <code klickverbot.at> writes:
On Sun, Sep 22, 2013 at 6:48 AM, Paul Jurczak <pauljurczak yahoo.com> wrote:
 OK, you've wrote "optimizer constant-folds the entire e28 functions away",
 which means compile time. Very clever optimizer! To defeat this behavior I
 changed timing loop to:
By the way, there is an LDC-specific pragma to selectively disable inlining for some functions that you could have used to the same effect: http://wiki.dlang.org/LDC-specific_language_changes#LDC_never_inline. David
Sep 22 2013
prev sibling next sibling parent reply David Nadlinger <code klickverbot.at> writes:
On Sun, Sep 22, 2013 at 12:07 AM, Paul Jurczak <pauljurczak yahoo.com> wrote:
 I'm running this test program with multiple compilers on Xubuntu and I'm
 getting impossibly short timings with LDC compared to DMD and GDC (timing
 details in comments below). I would appreciate any ideas about what is going
 on here?
 [...]
First, a remark unrelated to the actual issue at hand: Your program uses at least 2.4 MB of stack (for the result/timing variables), which is large enough to cause issues e.g. on Win32, where the default stack size is 1 MB. That being said, the LDC-compiled version is so fast because the LLVM optimizer constant-folds the entire e28 functions away. Thus, you are timing the single store of the constant 669171001 to a memory location (or rather, the overhead involved in calling the StopWatch functions). Hope this answers your question, David
Sep 21 2013
parent "Paul Jurczak" <pauljurczak yahoo.com> writes:
On Saturday, 21 September 2013 at 23:15:05 UTC, David Nadlinger 
wrote:
 [...]
First, a remark unrelated to the actual issue at hand: Your program uses at least 2.4 MB of stack (for the result/timing variables), which is large enough to cause issues e.g. on Win32, where the default stack size is 1 MB.
Thanks, that explains the segfault when I was using N = 1_000_000.
 That being said, the LDC-compiled version is so fast because 
 the LLVM
 optimizer constant-folds the entire e28 functions away. Thus, 
 you are
 timing the single store of the constant 669171001 to a memory 
 location
 (or rather, the overhead involved in calling the StopWatch 
 functions).

 Hope this answers your question,
 David
It sounds convincing, but I tried it again with small Ns. For N = 1, function is called only once, so the minimum of all timings should be accurate, but the result is still unbelievably short: N = 1 669171001 57ns e28_0 669171001 58ns e28_1 N = 2 669171001 42ns e28_0 669171001 43ns e28_1 N = 10 669171001 26ns e28_0 669171001 25ns e28_1
Sep 21 2013
prev sibling parent "John Colvin" <john.loughran.colvin gmail.com> writes:
On Saturday, 21 September 2013 at 22:07:27 UTC, Paul Jurczak 
wrote:
 I'm running this test program with multiple compilers on 
 Xubuntu and I'm getting impossibly short timings with LDC 
 compared to DMD and GDC (timing details in comments below). I 
 would appreciate any ideas about what is going on here?

 /*
 Xubuntu 13.04 64-bit Core i5 3450S 2.8GHz:
 GDC 4.8.1:     gdc-4.8 -m64 -march=native -fno-bounds-check 
 -frename-registers -frelease -O3
 669171001  826ns  e28_0
 669171001  824ns  e28_1

 DMD64 2.063.2: dmd -O -noboundscheck -inline -release
 669171001  1115ns  e28_0
 669171001  1955ns  e28_1

 LDC 0.11.0: ldmd2 -m64 -O -noboundscheck -inline -release
 669171001  25ns  e28_0
 669171001  25ns  e28_1
 */

 module main;

 import std.stdio, std.algorithm, std.range, std.datetime, 
 std.typetuple;


 int e28_0(int N = 1002) {
 	int diagNumber = 1;					
 	int sum        = diagNumber;	

 	for (int width = 2; width < N; width += 2)	
 		for (int j = 0; j < 4; ++j) {			
 			diagNumber += width;				
 			sum        += diagNumber;			
 		}

 	return sum;
 }


 int e28_1(int N = 1002) {
 	int diagNumber = 1;					
 	int sum        = diagNumber;	

 	foreach (width; iota(2, N, 2))
 		foreach (_; 0..4) {			
 			diagNumber += width;				
 			sum        += diagNumber;			
 		}

 	return sum;
 }


 void main()
 {
   enum      N = 100_000;
   StopWatch sw;

    foreach (iFun, fun; TypeTuple!(e28_0, e28_1)) {
       int [N] results;
       long[N] timings;

       foreach (i; 0..N) {
          sw.reset;
          sw.start;
          results[i] = fun();
          sw.stop;
          timings[i] = sw.peek.nsecs;
       }

       writeln(results.reduce!min, "  ", timings.reduce!min, "ns
  e28_", iFun);
   }
 }
Generally speaking, you should benchmark using outside input, unknown to the optimizer, that is representative of the use you are expecting. You'll be much less likely to fall foul of situations like this one.
Sep 23 2013