www.digitalmars.com         C & C++   DMDScript  

c++.command-line - Handling MMX Instructions

reply Isma'il Adeniran <ismail tamarindseed.com> writes:
I think there's an error in the way dmc handles inline assembler MMX 
instructions.

I compiled and ran this code using dmc:

<code>
#include <stdio.h>

/*
 * Test program for inline-assembler MMX handling.
 *
 * Prints the two 4-element int arrays a1 and b1, adds them element-wise
 * inside an _asm block using packed 16-bit arithmetic, stores the four sums
 * into c1, and prints them.  For the inputs below the expected sums are
 * 29, 8, 38 and 50.
 *
 * Always returns 0.
 */
int main(void)
{
    	int cnt;
	int a1[4] = {12, 1, 34, 17};
	int b1[4] = {17, 7, 4, 33};
	int c1[4];	/* written only by the _asm block below */

	printf(" a1: ");
	for (cnt = 0; cnt < 4;cnt++)
		printf("%d\t", a1[cnt]);
	
	printf("\n b1: ");
     	for (cnt = 0; cnt < 4;cnt++)
		printf("%d\t", b1[cnt]);

	/*
	 * Inline-assembly steps, in order:
	 *  1. movq/movq/packssdw: load the four 32-bit ints of a1 as two
	 *     quadwords and pack them (signed saturation) into four 16-bit
	 *     words in mm0.
	 *  2. Same for b1, packed into mm1 (mm2 is a scratch register).
	 *  3. paddw: add the four 16-bit lanes, leaving the sums in mm0.
	 *  4. lea ESI, c1: point ESI at the output array; EDI is cleared.
	 *  5. For each lane 0..3: pextrw copies the selected 16-bit word of
	 *     mm0 into EDI, which is stored as a dword at [ESI]; ESI then
	 *     advances by 4 bytes to the next element of c1.
	 *  6. emms: leave MMX state before returning to C code that may use
	 *     the FPU.
	 * Note that pextrw is not part of the original MMX instruction set; it
	 * was introduced with the SSE extensions.
	 */
	_asm {
		  movq     mm0, qword ptr a1
		  movq     mm1, qword ptr a1+8
		  packssdw mm0, mm1
		
		  movq     mm1, qword ptr b1
		  movq     mm2, qword ptr b1+8
		  packssdw mm1, mm2
		
          	  paddw    mm0, mm1
		
           	  lea      ESI, c1
		  xor      EDI,EDI
		
    		  pextrw   EDI,mm0, 0
		  mov      dword ptr [ESI], EDI
		  add      ESI, 4
		
		  pextrw   EDI,mm0, 1
		  mov      dword ptr [ESI], EDI
		  add      ESI, 4

		  pextrw   EDI,mm0, 2
		  mov      dword ptr [ESI], EDI
		  add      ESI, 4
		
		  pextrw   EDI,mm0, 3
		  mov      dword ptr [ESI], EDI
		  add      ESI, 4
		
	      emms
	};

	/* Print each element-wise sum produced by the _asm block. */
	printf("\n\n          c1: \n");
	for (cnt = 0; cnt < 4;cnt++)
		printf(" a1[%d] + b1[%d] = %d\n", cnt, cnt, c1[cnt]);
	
	return 0;
}
</code>

<Output on execution:>

D:>pack (using dmc)
  a1: 12 1       34      17
  b1: 17 7       4       33

           c1:
  a1[0] + b1[0] = 0
  a1[1] + b1[1] = 0
  a1[2] + b1[2] = 0
  a1[3] + b1[3] = 0

This is incorrect.

I compiled and ran it with Open Watcom C/C++ and I got the correct result 
below:

D:>pack (using Open Watcom)
  a1: 12 1       34      17
  b1: 17 7       4       33

           c1:
  a1[0] + b1[0] = 29
  a1[1] + b1[1] = 8
  a1[2] + b1[2] = 38
  a1[3] + b1[3] = 50

Is this a bug with dmc?

Best regards

Isma'il
-----
Mar 25 2005
next sibling parent reply Isma'il Adeniran <ismail tamarindseed.com> writes:
It also compiles correctly with VC++.NET.

Isma'il Adeniran wrote:
 I think there's an error in the way dmc handles inline assembler MMX 
 instructions.
 
 I compiled and ran this code using dmc:
 
 <code>
 #include <stdio.h>
 
 int main(void)
 {
        int cnt;
     int a1[4] = {12, 1, 34, 17};
     int b1[4] = {17, 7, 4, 33};
     int c1[4];
 
     printf(" a1: ");
     for (cnt = 0; cnt < 4;cnt++)
         printf("%d\t", a1[cnt]);
     
     printf("\n b1: ");
         for (cnt = 0; cnt < 4;cnt++)
         printf("%d\t", b1[cnt]);
 
     _asm {
           movq     mm0, qword ptr a1
           movq     mm1, qword ptr a1+8
           packssdw mm0, mm1
        
           movq     mm1, qword ptr b1
           movq     mm2, qword ptr b1+8
           packssdw mm1, mm2
        
                paddw    mm0, mm1
        
                 lea      ESI, c1
           xor      EDI,EDI
        
              pextrw   EDI,mm0, 0
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           pextrw   EDI,mm0, 1
           mov      dword ptr [ESI], EDI
           add      ESI, 4
 
           pextrw   EDI,mm0, 2
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           pextrw   EDI,mm0, 3
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           emms
     };
 
     printf("\n\n          c1: \n");
     for (cnt = 0; cnt < 4;cnt++)
         printf(" a1[%d] + b1[%d] = %d\n", cnt, cnt, c1[cnt]);
     
     return 0;
 }
 </code>
 
 <Output on execution:>
 
 D:>pack (using dmc)
  a1: 12 1       34      17
  b1: 17 7       4       33
 
           c1:
  a1[0] + b1[0] = 0
  a1[1] + b1[1] = 0
  a1[2] + b1[2] = 0
  a1[3] + b1[3] = 0
 
 This is incorrect.
 
 I compiled and ran it with Open Watcom C/C++ and I got the correct resul 
 below:
 
 D:>pack (using Open Watcom)
  a1: 12 1       34      17
  b1: 17 7       4       33
 
           c1:
  a1[0] + b1[0] = 29
  a1[1] + b1[1] = 8
  a1[2] + b1[2] = 38
  a1[3] + b1[3] = 50
 
 Is this a bug with dmc?
 
 Best regards
 
 Isma'il
 -----
Mar 25 2005
next sibling parent reply "Walter" <newshound digitalmars.com> writes:
If you could post assembler code generated by OW C++ for that function, I
can compare the two.

"Isma'il Adeniran" <ismail tamarindseed.com> wrote in message
news:d21smg$mi7$1 digitaldaemon.com...
 I compiled and ran it with Open Watcom C/C++ and I got the correct resul
Mar 28 2005
next sibling parent Isma'il Adeniran <ismail tamarindseed.com> writes:
This is the assembler code generated with bits cut out. I haven't really 
combed through it extensively but both compilers generate practically 
identical code for the inline assembly (as expected)!

<assembly>

_TEXT		SEGMENT	BYTE PUBLIC USE32 'CODE'
		ASSUME CS:_TEXT, DS:DGROUP, SS:DGROUP
L$1:
     DB	0cH, 0, 0, 0, 1, 0, 0, 0
     DB	22H, 0, 0, 0, 11H, 0, 0, 0
L$2:
     DB	11H, 0, 0, 0, 7, 0, 0, 0
     DB	4, 0, 0, 0, 21H, 0, 0, 0
main:
     push        54H
     call        near ptr FLAT:__CHK
     push        ebx
     push        esi
     push        edi
     push        ebp
     mov         ebp,esp
     sub         esp,30H
     lea         edi,-30H[ebp]
     mov         esi,offset FLAT:L$1
     movsd
     movsd
     movsd
     movsd
     lea         edi,-20H[ebp]
     mov         esi,offset FLAT:L$2
     movsd
     movsd
     movsd
     movsd
     push        offset FLAT:L$9
     call        near ptr FLAT:printf
     add         esp,4
     xor         ebx,ebx
L$3:
     mov         edx,dword ptr -30H[ebp+ebx*4]
     push        edx
     push        offset FLAT:L$10
     call        near ptr FLAT:printf
     add         esp,8
     inc         ebx
     cmp         ebx,4
     jl          L$3
     push        offset FLAT:L$11
     call        near ptr FLAT:printf
     add         esp,4
     xor         ebx,ebx
L$4:
     mov         ecx,dword ptr -20H[ebp+ebx*4]
     push        ecx
     push        offset FLAT:L$10
     call        near ptr FLAT:printf
     add         esp,8
     inc         ebx
     cmp         ebx,4
     jl          L$4
     movq        mm0,-30H[ebp]
     movq        mm1,-28H[ebp]
     packssdw    mm0,mm1
     movq        mm1,-20H[ebp]
     movq        mm2,-18H[ebp]
     packssdw    mm1,mm2
     paddw       mm0,mm1
     lea         esi,-10H[ebp]
     xor         edi,edi
     pextrw      edi,mm0,0
     mov         dword ptr [esi],edi
     add         esi,4
     pextrw      edi,mm0,1
     mov         dword ptr [esi],edi
     add         esi,4
     pextrw      edi,mm0,2
     mov         dword ptr [esi],edi
     add         esi,4
     pextrw      edi,mm0,3
     mov         dword ptr [esi],edi
     add         esi,4
     emms
     push        offset FLAT:L$12
     call        near ptr FLAT:printf
     add         esp,4
     xor         ebx,ebx
L$5:
     mov         esi,dword ptr -10H[ebp+ebx*4]
     push        esi
     push        ebx
     push        ebx
     push        offset FLAT:L$13
     call        near ptr FLAT:printf
     add         esp,10H
     inc         ebx
     cmp         ebx,4
     jl          L$5
     mov         edi,dword ptr FLAT:__iob+4
     test        edi,edi
     jle         L$6
     mov         eax,dword ptr FLAT:__iob
     xor         ebx,ebx
     mov         bl,byte ptr [eax]
     sub         ebx,0dH
     cmp         ebx,0dH
     ja          L$7
L$6:
     push        offset FLAT:__iob
     call        near ptr FLAT:fgetc
     add         esp,4
     jmp         L$8
L$7:
     lea         edx,-1[edi]
     mov         dword ptr FLAT:__iob+4,edx
     inc         eax
     mov         dword ptr FLAT:__iob,eax
L$8:
     xor         eax,eax
     mov         esp,ebp
     pop         ebp
     pop         edi
     pop         esi
     pop         ebx
     ret
_TEXT		ENDS

</assembly>

Walter wrote:
 If you could post assembler code generated by OW C++ for that function, I
 can compare the two.
 
-- Knowledge comes from finding the answers, yes but understanding what the answers mean is what brings wisdom. - Lionel Luthor
Mar 29 2005
prev sibling parent Isma'il Adeniran <ismail tamarindseed.com> writes:
Jack (check the other posts) just found the bug.
It's with the 'pextrw' instruction.
The word is extracted into the EAX register (and from mm7 rather than mm0), 
but it is the EDI register (which has been zeroed out) that is actually 
stored to the memory at [ESI].
This produces the zeroes on output.

Comparing the assembly output from DMC and OW compilers confirms this.

Reposting the pertinent assembly listing for the function. Apologies for 
the one I posted yesterday.

****************DMC************          ********Open Watcom************
-------------------------------          -------------------------------
  X$5:
      movq        mm0,-0x30[ebp]		movq        mm0,-0x30[ebp]
      movq        mm1,-0x28[ebp]		movq        mm1,-0x28[ebp]
      packssdw    mm0,mm1		packssdw    mm0,mm1
      movq        mm1,-0x20[ebp]		movq        mm1,-0x20[ebp]
      movq        mm2,-0x18[ebp]		movq        mm2,-0x18[ebp]
      packssdw    mm1,mm2		packssdw    mm1,mm2
      paddw       mm0,mm1		paddw       mm0,mm1
      lea         esi,-0x10[ebp]		lea         esi,-0x10[ebp]
      xor         edi,edi		xor         edi,edi
      pextrw      eax,mm7,0x00		pextrw      edi,mm0,0x00
      mov         [esi],edi		mov         [esi],edi
      add         esi,0x00000004		add         esi,0x00000004
      pextrw      eax,mm7,0x01		pextrw      edi,mm0,0x01
      mov         [esi],edi		mov         [esi],edi
      add         esi,0x00000004		add         esi,0x00000004
      pextrw      eax,mm7,0x02		pextrw      edi,mm0,0x02
      mov         [esi],edi		mov         [esi],edi
      add         esi,0x00000004		add         esi,0x00000004
      pextrw      eax,mm7,0x03		pextrw      edi,mm0,0x03
      mov         [esi],edi		mov         [esi],edi
      add         esi,0x00000004		add         esi,0x00000004
      emms				emms



Walter wrote:
 If you could post assembler code generated by OW C++ for that function, I
 can compare the two.
 
 "Isma'il Adeniran" <ismail tamarindseed.com> wrote in message
 news:d21smg$mi7$1 digitaldaemon.com...
 
I compiled and ran it with Open Watcom C/C++ and I got the correct resul
-- Knowledge comes from finding the answers, yes but understanding what the answers mean is what brings wisdom. - Lionel Luthor
Mar 30 2005
prev sibling parent reply Jack <Jack_member pathlink.com> writes:
Spotted the bug. There is a bug with the 'pextrw' instruction in the code
compiled with DMC (the first operand is always changed when linked).

With obj2asm, everything is correct in the compiled object code (.obj), first
operand of 'pextrw' is just same as specified in the source code.

When linked into an executable, the first operand of the 'pextrw' instruction
is changed to EAX (this always happens, no matter what the first operand in the
source code is changed to).

Perhaps a bug with the linker?

In article <d21smg$mi7$1 digitaldaemon.com>, Isma'il Adeniran says...
It also compiles correctly with VC++.NET.

Isma'il Adeniran wrote:
 I think there's an error in the way dmc handles inline assembler MMX 
 instructions.
 
 I compiled and ran this code using dmc:
 
 <code>
 #include <stdio.h>
 
 int main(void)
 {
        int cnt;
     int a1[4] = {12, 1, 34, 17};
     int b1[4] = {17, 7, 4, 33};
     int c1[4];
 
     printf(" a1: ");
     for (cnt = 0; cnt < 4;cnt++)
         printf("%d\t", a1[cnt]);
     
     printf("\n b1: ");
         for (cnt = 0; cnt < 4;cnt++)
         printf("%d\t", b1[cnt]);
 
     _asm {
           movq     mm0, qword ptr a1
           movq     mm1, qword ptr a1+8
           packssdw mm0, mm1
        
           movq     mm1, qword ptr b1
           movq     mm2, qword ptr b1+8
           packssdw mm1, mm2
        
                paddw    mm0, mm1
        
                 lea      ESI, c1
           xor      EDI,EDI
        
              pextrw   EDI,mm0, 0
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           pextrw   EDI,mm0, 1
           mov      dword ptr [ESI], EDI
           add      ESI, 4
 
           pextrw   EDI,mm0, 2
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           pextrw   EDI,mm0, 3
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           emms
     };
 
     printf("\n\n          c1: \n");
     for (cnt = 0; cnt < 4;cnt++)
         printf(" a1[%d] + b1[%d] = %d\n", cnt, cnt, c1[cnt]);
     
     return 0;
 }
 </code>
 
 <Output on execution:>
 
 D:>pack (using dmc)
  a1: 12 1       34      17
  b1: 17 7       4       33
 
           c1:
  a1[0] + b1[0] = 0
  a1[1] + b1[1] = 0
  a1[2] + b1[2] = 0
  a1[3] + b1[3] = 0
 
 This is incorrect.
 
 I compiled and ran it with Open Watcom C/C++ and I got the correct resul 
 below:
 
 D:>pack (using Open Watcom)
  a1: 12 1       34      17
  b1: 17 7       4       33
 
           c1:
  a1[0] + b1[0] = 29
  a1[1] + b1[1] = 8
  a1[2] + b1[2] = 38
  a1[3] + b1[3] = 50
 
 Is this a bug with dmc?
 
 Best regards
 
 Isma'il
 -----
Mar 30 2005
parent reply Jack <Jack_member pathlink.com> writes:
Just found out not only first operand is different from the source code but
second operand too! The second operand of 'pextrw' instruction is always mm7
when linked.

In article <d2egdn$1cge$1 digitaldaemon.com>, Jack says...
Spotted the bug. There is a bug with the with 'pextrw' instruction in the code
compiled with DMC (first operand is always changed when linked).

With obj2asm, everything is correct in the compiled object code (.obj), first
operand of 'pextrw' is just same as specified in the source code.

When linked to an excutable, the first operand of 'pextrw' command changed to
EAX (happened always, no matter the first operand in the source code change to
whatever).

Perhaps a bug with the linker?

In article <d21smg$mi7$1 digitaldaemon.com>, Isma'il Adeniran says...
It also compiles correctly with VC++.NET.

Isma'il Adeniran wrote:
 I think there's an error in the way dmc handles inline assembler MMX 
 instructions.
 
 I compiled and ran this code using dmc:
 
 <code>
 #include <stdio.h>
 
 int main(void)
 {
        int cnt;
     int a1[4] = {12, 1, 34, 17};
     int b1[4] = {17, 7, 4, 33};
     int c1[4];
 
     printf(" a1: ");
     for (cnt = 0; cnt < 4;cnt++)
         printf("%d\t", a1[cnt]);
     
     printf("\n b1: ");
         for (cnt = 0; cnt < 4;cnt++)
         printf("%d\t", b1[cnt]);
 
     _asm {
           movq     mm0, qword ptr a1
           movq     mm1, qword ptr a1+8
           packssdw mm0, mm1
        
           movq     mm1, qword ptr b1
           movq     mm2, qword ptr b1+8
           packssdw mm1, mm2
        
                paddw    mm0, mm1
        
                 lea      ESI, c1
           xor      EDI,EDI
        
              pextrw   EDI,mm0, 0
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           pextrw   EDI,mm0, 1
           mov      dword ptr [ESI], EDI
           add      ESI, 4
 
           pextrw   EDI,mm0, 2
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           pextrw   EDI,mm0, 3
           mov      dword ptr [ESI], EDI
           add      ESI, 4
        
           emms
     };
 
     printf("\n\n          c1: \n");
     for (cnt = 0; cnt < 4;cnt++)
         printf(" a1[%d] + b1[%d] = %d\n", cnt, cnt, c1[cnt]);
     
     return 0;
 }
 </code>
 
 <Output on execution:>
 
 D:>pack (using dmc)
  a1: 12 1       34      17
  b1: 17 7       4       33
 
           c1:
  a1[0] + b1[0] = 0
  a1[1] + b1[1] = 0
  a1[2] + b1[2] = 0
  a1[3] + b1[3] = 0
 
 This is incorrect.
 
 I compiled and ran it with Open Watcom C/C++ and I got the correct resul 
 below:
 
 D:>pack (using Open Watcom)
  a1: 12 1       34      17
  b1: 17 7       4       33
 
           c1:
  a1[0] + b1[0] = 29
  a1[1] + b1[1] = 8
  a1[2] + b1[2] = 38
  a1[3] + b1[3] = 50
 
 Is this a bug with dmc?
 
 Best regards
 
 Isma'il
 -----
Mar 30 2005
next sibling parent Isma'il Adeniran <ismail tamarindseed.com> writes:
You're right. The problem is with the 'pextrw' instruction: it uses the 
wrong register. EDI is zeroed, and the word is extracted into the EAX 
register, but the instruction that follows is still mov [esi], edi rather 
than mov [esi], eax. Consequently, the value stored at [esi] is always 0.

Nice work Jack!!!


Jack wrote:
 Just found out not only first operand is different from the source code but
 second operand too! The second operand of 'pextrw' instruction is always mm7
 when linked.
 
 In article <d2egdn$1cge$1 digitaldaemon.com>, Jack says...
 
Spotted the bug. There is a bug with the with 'pextrw' instruction in the code
compiled with DMC (first operand is always changed when linked).

With obj2asm, everything is correct in the compiled object code (.obj), first
operand of 'pextrw' is just same as specified in the source code.

When linked to an excutable, the first operand of 'pextrw' command changed to
EAX (happened always, no matter the first operand in the source code change to
whatever).

Perhaps a bug with the linker?

In article <d21smg$mi7$1 digitaldaemon.com>, Isma'il Adeniran says...

It also compiles correctly with VC++.NET.

Isma'il Adeniran wrote:

I think there's an error in the way dmc handles inline assembler MMX 
instructions.

I compiled and ran this code using dmc:

<code>
#include <stdio.h>

int main(void)
{
       int cnt;
    int a1[4] = {12, 1, 34, 17};
    int b1[4] = {17, 7, 4, 33};
    int c1[4];

    printf(" a1: ");
    for (cnt = 0; cnt < 4;cnt++)
        printf("%d\t", a1[cnt]);
    
    printf("\n b1: ");
        for (cnt = 0; cnt < 4;cnt++)
        printf("%d\t", b1[cnt]);

    _asm {
          movq     mm0, qword ptr a1
          movq     mm1, qword ptr a1+8
          packssdw mm0, mm1
       
          movq     mm1, qword ptr b1
          movq     mm2, qword ptr b1+8
          packssdw mm1, mm2
       
               paddw    mm0, mm1
       
                lea      ESI, c1
          xor      EDI,EDI
       
             pextrw   EDI,mm0, 0
          mov      dword ptr [ESI], EDI
          add      ESI, 4
       
          pextrw   EDI,mm0, 1
          mov      dword ptr [ESI], EDI
          add      ESI, 4

          pextrw   EDI,mm0, 2
          mov      dword ptr [ESI], EDI
          add      ESI, 4
       
          pextrw   EDI,mm0, 3
          mov      dword ptr [ESI], EDI
          add      ESI, 4
       
          emms
    };

    printf("\n\n          c1: \n");
    for (cnt = 0; cnt < 4;cnt++)
        printf(" a1[%d] + b1[%d] = %d\n", cnt, cnt, c1[cnt]);
    
    return 0;
}
</code>

<Output on execution:>

D:>pack (using dmc)
 a1: 12 1       34      17
 b1: 17 7       4       33

          c1:
 a1[0] + b1[0] = 0
 a1[1] + b1[1] = 0
 a1[2] + b1[2] = 0
 a1[3] + b1[3] = 0

This is incorrect.

I compiled and ran it with Open Watcom C/C++ and I got the correct resul 
below:

D:>pack (using Open Watcom)
 a1: 12 1       34      17
 b1: 17 7       4       33

          c1:
 a1[0] + b1[0] = 29
 a1[1] + b1[1] = 8
 a1[2] + b1[2] = 38
 a1[3] + b1[3] = 50

Is this a bug with dmc?

Best regards

Isma'il
-----
-- Knowledge comes from finding the answers, yes but understanding what the answers mean is what brings wisdom. - Lionel Luthor
Mar 30 2005
prev sibling parent Isma'il Adeniran <ismail tamarindseed.com> writes:
I also just posted the assembly output from both DMC and OW for the 
function to my reply to Walter above. Check it out (skewed).

Isma'il


-- 
Knowledge comes from finding the answers, yes but
understanding what the answers mean
is what brings wisdom.
				  - Lionel Luthor
Mar 30 2005
prev sibling parent "Walter" <newshound digitalmars.com> writes:
"Isma'il Adeniran" <ismail tamarindseed.com> wrote in message
news:d21qsd$kq1$1 digitaldaemon.com...
 I think there's an error in the way dmc handles inline assembler MMX
 instructions.
You're right. I have it fixed now, it'll go out in the next update.
Apr 02 2005