digitalmars.D - Array reverse

bearophile (116/116) Nov 23 2012 I've seen a difference in the performance of

monarch_dodra (21/41) Nov 23 2012 I'm not surprised: Algorithm's reverse is written for the generic

bearophile (62/72) Nov 27 2012 This is the full program:

Walter Bright (3/4) Nov 23 2012 You've got array bounds checking turned on for algorithm.reverse, and yo...

bearophile (75/78) Nov 27 2012 Right, sorry. This is compiled with the latest DMD 2.061alpha

David Nadlinger (64/66) Nov 28 2012 fwiw, here is what LDC produces:

"bearophile" <bearophileHUGS lycos.com> writes:

I've seen a difference in the performance of 
std.algorithm.reverse applied on an array compared to home-made 
code, so I have written a little test.

Below you see the code, and the asm of the functions, compiled 
with DMD 2.060, 32 bit (-O -release -inline).

Feel free to recompile the code yourself to see if I have done 
some mistake :-)

------------------------------

import core.stdc.stdio: printf;
import std.algorithm: reverse;

void reverseArr(T)(T[] arr) {
     for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; ) {
         auto aux = *x;
         *x++ = *y;
         *y-- = aux;
     }
}

void main() {
     auto a = [10, 20, 30, 40];
     foreach(x; a) printf("%d ", x); printf("\n");
     a.reverseArr();
     foreach(x; a) printf("%d ", x); printf("\n");
     a.reverse();
     foreach(x; a) printf("%d ", x); printf("\n");
}

------------------------------

reverseArr:
		push	EBX
		mov	ECX,0Ch[ESP]
		mov	EBX,0Ch[ESP]
		push	ESI
		mov	ESI,0Ch[ESP]
		lea	ESI,-4[ESI*4][ECX]
		cmp	ECX,ESI
		jae	L2B
L19:	mov	EAX,[EBX]
		mov	EDX,[ESI]
		mov	[EBX],EDX
		add	EBX,4
		mov	[ESI],EAX
		add	ESI,0FFFFFFFCh
		cmp	EBX,ESI
		jb	L19
L2B:	pop	ESI
		pop	EBX
		ret	8

-------------------

std.algorithm.reverse:
L0:		sub	ESP,020h
		push	EBX
		push	ESI
		push	EDI
		cmp	dword ptr 030h[ESP],0
		je	LE1
L11:	cmp	dword ptr 030h[ESP],0
		mov	EAX,030h[ESP]
		mov	EDX,034h[ESP]
		mov	018h[ESP],EAX
		mov	01Ch[ESP],EDX
		jne	L32
		mov	EAX,022Dh
		call	near ptr _D3std9algorithm7__arrayZ
L32:	mov	EAX,030h[ESP]
		mov	EDX,034h[ESP]
		mov	024h[ESP],EDX
		mov	ECX,030h[ESP]
		lea	EDX,-1[ECX]
		mov	020h[ESP],EAX
		cmp	EDX,ECX
		mov	010h[ESP],EDX
		jb	L5E
		mov	EAX,0258h
		call	near ptr _D3std9algorithm7__arrayZ
L5E:	mov	ECX,01Ch[ESP]
		mov	EBX,030h[ESP]
		mov	EDX,024h[ESP]
		lea	EBX,-4[EBX*4][EDX]
		mov	ESI,[ECX]
		mov	EDX,[EBX]
		mov	[ECX],EDX
		mov	ECX,030h[ESP]
		cmp	ECX,ECX
		mov	[EBX],ESI
		ja	L86
		cmp	ECX,1
		jae	L90
L86:	mov	EAX,0173h
		call	near ptr _D3std9algorithm7__arrayZ
L90:	mov	EDX,034h[ESP]
		mov	EDI,010h[ESP]
		mov	030h[ESP],EDI
		add	EDX,4
		mov	034h[ESP],EDX
		cmp	dword ptr 030h[ESP],0
		je	LE1
		mov	EBX,030h[ESP]
		lea	ECX,-1[EBX]
		cmp	ECX,EBX
		mov	014h[ESP],ECX
		jbe	LC6
		mov	EAX,01E8h
		call	near ptr _D3std9algorithm7__arrayZ
LC6:	mov	ESI,014h[ESP]
		mov	EDX,034h[ESP]
		mov	030h[ESP],ESI
		mov	034h[ESP],EDX
		cmp	dword ptr 030h[ESP],0
		jne	L11
LE1:	pop	EDI
		pop	ESI
		pop	EBX
		add	ESP,020h
		ret	8

------------------------------

Bye,
bearophile

Nov 23 2012

"monarch_dodra" <monarchdodra gmail.com> writes:

On Friday, 23 November 2012 at 09:14:20 UTC, bearophile wrote:
 I've seen a difference in the performance of 
 std.algorithm.reverse applied on an array compared to home-made 
 code, so I have written a little test.

 Below you see the code, and the asm of the functions, compiled 
 with DMD 2.060, 32 bit (-O -release -inline).

 Feel free to recompile the code yourself to see if I have done 
 some mistake :-)

 ------------------------------

 import core.stdc.stdio: printf;
 import std.algorithm: reverse;

 void reverseArr(T)(T[] arr) {
     for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; ) {
         auto aux = *x;
         *x++ = *y;
         *y-- = aux;
     }
 }

 [SNIP]

 Bye,
 bearophile

I'm not surprised: Algorithm's reverse is written for the generic 
bidir range. It could be made faster using an approach like 
yours', and even faster using a specialized pointer 
implementation for pointers (no bounds checking when accessing 
elements).

The *1* thing I'd be curious about is what costs are 
iteration-related, and what costs are "swap" related.

In theory, swap's implementation should be reduced to what you 
have for integers.

Could you maybe also add this in your study?

void reverseArr(T)(T[] arr) {
     for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; )
         swap(*x++, *y--);
}

--------
Related, the starting condition, as written: "y = &arr[$ - 1]" 
will choke on empty ranges.

I'd have suggested: "y = arr.ptr + arr.length - 1", but that will 
also choke if the arr.ptr is null. I'd say you should just add an 
empty short-circuit test int there.

Nov 23 2012

"bearophile" <bearophileHUGS lycos.com> writes:

monarch_dodra:

 Could you maybe also add this in your study?

 void reverseArr(T)(T[] arr) {
     for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; )
         swap(*x++, *y--);
 }


This is the full program:

import core.stdc.stdio: printf;
import std.algorithm: reverse, swap;

void reverseArr(T)(T[] arr) {
     for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; ) {
         auto aux = *x;
         *x++ = *y;
         *y-- = aux;
     }
}

void reverseArray(T)(T[] arr) {
     for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; )
         swap(*x++, *y--);
}

void main() {
     auto a = [10, 20, 30, 40];
     foreach(x; a) printf("%d ", x); printf("\n");
     a.reverseArr();
     foreach(x; a) printf("%d ", x); printf("\n");
     a.reverse();
     foreach(x; a) printf("%d ", x); printf("\n");
     a.reverseArray();
     foreach(x; a) printf("%d ", x); printf("\n");
}


And this is the asm of the reverseArray:

reverseArray:
         push EBX
         mov ECX, 0Ch[ESP]
         mov EBX, 0Ch[ESP]
         push ESI
         mov ESI, 0Ch[ESP]
         lea ESI, -4[ESI*4][ECX]
         push EDI
         cmp ECX, ESI
         jae L30
L1A:    mov EAX, EBX
         mov EDI, ESI
         mov EDX, [EAX]
         mov ECX, [EDI]
         add EBX, 4
         sub ESI, 4
         mov [EAX], ECX
         cmp EBX, ESI
         mov [EDI], EDX
         jb  L1A
L30:    pop EDI
         pop ESI
         pop EBX
         ret 8


So the inner loop is 10 asm instructions instead of the 8 asm 
instructions of the loop of reverseArr(), that is 2 more register 
swaps, that are performed very quickly by the hardware register 
renaming in the modern CPUs. So this is a small difference, 
probably acceptable in many situations. While the difference 
between the performance of reverseArray() and 
std.algorithm.reverse is significant.


 Related, the starting condition, as written: "y = &arr[$ - 1]" 
 will choke on empty ranges.

 I'd have suggested: "y = arr.ptr + arr.length - 1", but that 
 will also choke if the arr.ptr is null. I'd say you should just 
 add an empty short-circuit test int there.

Right. But my study was mostly about what's inside the loop of 
those functions. What's outside the loop is not influencing 
performance significantly.

Bye,
bearophile

Nov 27 2012

Walter Bright <newshound2 digitalmars.com> writes:

On 11/23/2012 1:14 AM, bearophile wrote:
 Feel free to recompile the code yourself to see if I have done some mistake :-)

You've got array bounds checking turned on for algorithm.reverse, and you 
avoided that in your code by using unchecked pointers.

Nov 23 2012

"bearophile" <bearophileHUGS lycos.com> writes:

Walter Bright:

Sorry for the late reply.

 You've got array bounds checking turned on for 
 algorithm.reverse, and you avoided that in your code by using 
 unchecked pointers.

Right, sorry. This is compiled with the latest DMD 2.061alpha 
with -O -release -inline -noboundscheck. I have used the same 
program source code as above.

Now the calls inside std.algorithm.reverse are fully inlined, 
below there are no "call" istrunctions.

But the number of instructions performed in the loop is rather 
different.

dpaste is currently down and I don't have gdc2/dc2 installed 
here, so I can't show the asm from those other compilers.

I have also done benchmarks, and I have seen performance 
difference in real code between std.algorithm.reverse and a 
reverseArr function.


reverseArr:
         push EBX
         mov ECX, 0Ch[ESP]
         mov EBX, 0Ch[ESP]
         push ESI
         mov ESI, 0Ch[ESP]
         lea ESI, -4[ESI*4][ECX]
         cmp ECX, ESI
         jae L2B
L19:    mov EAX, [EBX]
         mov EDX, [ESI]
         mov [EBX], EDX
         add EBX, 4
         mov [ESI], EAX
         add ESI, 0FFFFFFFCh
         cmp EBX, ESI
         jb  L19
L2B:    pop ESI
         pop EBX
         ret 8


std.algorithm.reverse:
         sub ESP, 014h
         push EBX
         push ESI
         cmp dword ptr 020h[ESP], 0
         je  L76
LC:     mov EAX, 020h[ESP]
         mov EDX, 024h[ESP]
         mov EBX, 020h[ESP]
         mov 0Ch[ESP], EDX
         mov ECX, 0Ch[ESP]
         mov ESI, [ECX]
         mov 014h[ESP], EDX
         mov EDX, 014h[ESP]
         lea EBX, -4[EBX*4][EDX]
         mov 8[ESP], EAX
         mov EDX, [EBX]
         mov [ECX], EDX
         mov ECX, 020h[ESP]
         mov EDX, 024h[ESP]
         mov 010h[ESP], EAX
         lea EAX, -1[ECX]
         add EDX, 4
         mov 020h[ESP], EAX
         mov [EBX], ESI
         mov 024h[ESP], EDX
         cmp dword ptr 020h[ESP], 0
         je  L76
         mov EBX, 020h[ESP]
         lea ESI, -1[EBX]
         mov ECX, 024h[ESP]
         mov 020h[ESP], ESI
         mov 024h[ESP], ECX
         cmp dword ptr 020h[ESP], 0
         jne LC
L76:    pop ESI
         pop EBX
         add ESP, 014h
         ret 8

Bye,
bearophile

Nov 27 2012

"David Nadlinger" <see klickverbot.at> writes:

On Wednesday, 28 November 2012 at 00:19:33 UTC, bearophile wrote:
 dpaste is currently down and I don't have gdc2/dc2 installed 
 here, so I can't show the asm from those other compilers.

fwiw, here is what LDC produces:

---
_D4test18__T10reverseArrTiZ10reverseArrFAiZv:
	lea	RAX, QWORD PTR [RSI + 4*RDI - 4]
	cmp	RSI, RAX
	jae	.LBB1_3
	lea	RAX, QWORD PTR [RSI + 4*RDI - 8]
	.align	16, 0x90
.LBB1_2:
	mov	ECX, DWORD PTR [RSI]
	mov	EDX, DWORD PTR [RAX + 4]
	mov	DWORD PTR [RSI], EDX
	mov	DWORD PTR [RAX + 4], ECX
	add	RSI, 4
	cmp	RSI, RAX
	lea	RAX, QWORD PTR [RAX - 4]
	jb	.LBB1_2
.LBB1_3:
	ret
---

---
_D3std9algorithm15__T7reverseTAiZ7reverseFAiZv:
	test	RDI, RDI
	je	.LBB2_4
	lea	RAX, QWORD PTR [RSI + 4*RDI - 4]
	.align	16, 0x90
.LBB2_2:
	mov	ECX, DWORD PTR [RSI]
	mov	EDX, DWORD PTR [RAX]
	mov	DWORD PTR [RSI], EDX
	mov	DWORD PTR [RAX], ECX
	cmp	RDI, 1
	je	.LBB2_4
	add	RAX, -4
	add	RSI, 4
	add	RDI, -2
	jne	.LBB2_2
.LBB2_4:
	ret
---

---
_D4test20__T12reverseArrayTiZ12reverseArrayFAiZv:
	lea	RAX, QWORD PTR [RSI + 4*RDI - 4]
	cmp	RSI, RAX
	jae	.LBB3_3
	add	RSI, 4
	.align	16, 0x90
.LBB3_2:
	mov	ECX, DWORD PTR [RSI - 4]
	mov	EDX, DWORD PTR [RAX]
	mov	DWORD PTR [RSI - 4], EDX
	mov	DWORD PTR [RAX], ECX
	add	RAX, -4
	cmp	RSI, RAX
	lea	RSI, QWORD PTR [RSI + 4]
	jb	.LBB3_2
.LBB3_3:
	ret
---

The extra jump in the std.algorithm version is still there – I 
haven't checked whether it would be feasible to optimize the 
induction variable away entirely.

David

Nov 28 2012

D Programming

C/C++ Programming

Other

digitalmars.D - Array reverse