www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.learn - SIMD

reply Wyverex <wyverex.cypher gmail.com> writes:
Was messing around with SIMD, SSE stuff..  didn't know how much faster 
it could be!  It's been a few years since I did any assembly.
Thought I'd just share this; any word on adding this to the lib, or on 
compiler optimizations for this?


my results:
Parallel	 Single

SQRTPS:0.000120  FSQRT:0.001021
SQRTPS:0.000114  FSQRT:0.001026
SQRTPS:0.000114  FSQRT:0.001021
SQRTPS:0.000114  FSQRT:0.001026


codepad if you wish to play with it..
http://codepad.org/oqq5jsbJ

...times from codepad
SQRTPS:0.000291  FSQRT:0.000634
SQRTPS:0.000289  FSQRT:0.000632
SQRTPS:0.000300  FSQRT:0.000642
SQRTPS:0.000291  FSQRT:0.000632


//used these sites as a resource..
http://www.neilkemp.us/v3/tutorials/SSE_Tutorial_1.html
http://www.tommesani.com/SSEArithmetic.html



import std.stdio : putr = writefln;
import tango.time.StopWatch;


/**
 * Writes the element-wise square root of a into b, processing four
 * floats per iteration with the SSE SQRTPS instruction.
 *
 * Params:
 *   a = input values; length must be a multiple of 4
 *   b = output buffer; must have the same length as a
 *
 * Throws: Exception if the lengths differ or are not a multiple of 4.
 */
void fastsqrt( float[] a, float[] b )
{
   if(a.length != b.length || a.length % 4 != 0)
     throw new Exception("fsqrt bad params!");

   // LOOP decrements ECX *before* testing it, so entering the loop with
   // ECX == 0 would wrap the counter and run off the end of the buffers.
   if(a.length == 0)
     return;

   float* pa = a.ptr, pb = b.ptr;
   uint times = a.length>>2;   // number of 4-float groups

   asm
   {
     mov ECX, times;
     mov EAX, [pa];
     mov EBX, [pb];

   REP:
     movups XMM0, [EAX];   // unaligned load of 4 packed floats
     sqrtps XMM0, XMM0;    // 4 square roots at once
     movups [EBX], XMM0;
     add EAX, 16;          // advance both pointers by 4 floats
     add EBX, 16;
     loop REP;
   }
}

/**
 * Writes the element-wise square root of a into b, one float at a time,
 * using the x87 FSQRT instruction.
 *
 * Params:
 *   a = input values
 *   b = output buffer; must have the same length as a
 *
 * Throws: Exception if the lengths differ.
 */
void sqrt( float[] a, float[] b )
{
   if(a.length != b.length)
     throw new Exception("fsqrt bad params!");

   // LOOP decrements ECX *before* testing it, so entering the loop with
   // ECX == 0 would wrap the counter and run off the end of the buffers.
   if(a.length == 0)
     return;

   float* pa = a.ptr, pb = b.ptr;
   uint times = a.length;

   asm
   {
     mov EAX, [pa];
     mov EBX, [pb];
     mov ECX, times; //error on a.length

   REP2:
     // fld pushes the 32-bit element at [EAX]; fldpi would push the
     // constant pi and never read the input.
     fld float ptr [EAX];
     fsqrt;
     fstp float ptr[EBX];  // store as float and pop, keeping the FPU stack balanced
     add EAX, 4;
     add EBX, 4;
     loop REP2;
   }
}

/// Benchmark driver: times fastsqrt against sqrt on the same 40,000-element
/// input, checks that both produce identical output, and prints the two
/// timings.
void main()
{
   float[40_000] a, b, c;

   // Input is simply 0, 1, 2, ...
   for( size_t n = 0; n < a.length; ++n )
     a[n] = cast(float)n;

   StopWatch timer;

   // Packed SSE version.
   timer.start();
   fastsqrt( a, b );
   double A = timer.stop();

   // Scalar x87 version.
   timer.start();
   sqrt( a, c );
   double B = timer.stop();

   // Both implementations must agree element for element.
   for( size_t n = 0; n < b.length; ++n )
     assert( b[n] == c[n] );

   putr("SQRTPS:%.6f  FSQRT:%.6f", A, B);
}
Aug 14 2008
parent reply Don <nospam nospam.com.au> writes:
Wyverex wrote:
 Was messing around with SIMD, SSE stuff..  didn't know how much faster 
 it could be!  Its been a few years since I did any assembly.
 Though Id just share this, any word of adding this to the lib or 
 compiler optimizations for this?
 
 
 my results:
 Parallel     Single
 
 SQRTPS:0.000120  FSQRT:0.001021
 SQRTPS:0.000114  FSQRT:0.001026
 SQRTPS:0.000114  FSQRT:0.001021
 SQRTPS:0.000114  FSQRT:0.001026
 
 
 codepad if you wish to play with it..
 http://codepad.org/oqq5jsbJ
 
 ...times from codepad
 SQRTPS:0.000291  FSQRT:0.000634
 SQRTPS:0.000289  FSQRT:0.000632
   asm
   {
     mov EAX, [pa];
     mov EBX, [pb];
     mov ECX, times; //error on a.length
 
   REP2:
     fldpi float ptr[EAX];
Shouldn't that be fld? fldpi loads 3.1415...! It doesn't make any difference to the time, though. This is exactly why D just got array operations.
Aug 15 2008
parent Wyverex <wyverex.cypher gmail.com> writes:
Don wrote:
 Wyverex wrote:
 Was messing around with SIMD, SSE stuff..  didn't know how much faster 
 it could be!  Its been a few years since I did any assembly.
 Though Id just share this, any word of adding this to the lib or 
 compiler optimizations for this?


 my results:
 Parallel     Single

 SQRTPS:0.000120  FSQRT:0.001021
 SQRTPS:0.000114  FSQRT:0.001026
 SQRTPS:0.000114  FSQRT:0.001021
 SQRTPS:0.000114  FSQRT:0.001026


 codepad if you wish to play with it..
 http://codepad.org/oqq5jsbJ

 ...times from codepad
 SQRTPS:0.000291  FSQRT:0.000634
 SQRTPS:0.000289  FSQRT:0.000632
   asm
   {
     mov EAX, [pa];
     mov EBX, [pb];
     mov ECX, times; //error on a.length

   REP2:
     fldpi float ptr[EAX];
shouldn't that be fld ? fldpi loads 3.1415.... ! Doesn't make any difference to the time, though. This is exactly why D just got array operations.
I had fld first, but the sqrt of 5 or higher came back as -nan....
Aug 15 2008