www.digitalmars.com         C & C++   DMDScript  

digitalmars.D - [Performance] shootout.binarytrees when implemented with gc is 10x slower than C# on .NET 2.0

reply "Andrey Khropov" <andkhropov_nosp m_mtu-net.ru> writes:
Here is the D code for shootout.binarytrees with GC. Absolutely equal to the C#
code (I also embedded a timer):

------------------------------------------------------------------------
import std.c.stdlib, std.stdio, std.perf, std.gc;

int main(char[][] args)
{
    int N = args.length > 1 ? atoi(args[1]) : 1;
    
    auto t = new HighPerformanceCounter();
    t.start();

    int minDepth = 4;
    int maxDepth = (minDepth + 2) > N ? minDepth + 2 : N;
    int stretchDepth = maxDepth + 1;

    TreeNode stretchTree = TreeNode.BottomUpTree(0, stretchDepth);
    writefln("stretch tree of depth ",stretchDepth,"\t check:
",stretchTree.ItemCheck);

    TreeNode longLivedTree = TreeNode.BottomUpTree(0, maxDepth);

    int depth;
    for(depth = minDepth; depth <= maxDepth; depth += 2)
    {
        int check, iterations = 1 << (maxDepth - depth + minDepth);

        for(int i = 1; i <= iterations; i++)
        {
            auto tempTree = TreeNode.BottomUpTree(i, depth);
            check += tempTree.ItemCheck;
            //delete tempTree;

            tempTree = TreeNode.BottomUpTree(-i, depth);
            check += tempTree.ItemCheck;
            //delete tempTree;
        }

        writefln(iterations * 2,"\t trees of depth ",depth,"\t check: ",check);
    }

    writefln("long lived tree of depth ",maxDepth,"\t check:
",longLivedTree.ItemCheck);
    
    t.stop();
    
    writefln(t.milliseconds()," ms elapsed.");

    return 0;
}

class TreeNode
{
public:
    this(int item, TreeNode left = null, TreeNode right = null)
    {
      this.item  = item;
      this.left  = left;
      this.right = right;
    }

    static TreeNode BottomUpTree(int item, int depth)
    {
        if(depth > 0)
            return new TreeNode(item
                                ,BottomUpTree(2 * item - 1, depth - 1)
                                ,BottomUpTree(2 * item, depth - 1));
        return new TreeNode(item);
    }

    int ItemCheck()
    {
        if(left)
            return item + left.ItemCheck() - right.ItemCheck();
        return item;
    }
private:
    TreeNode            left, right;
    int                 item;
}
------------------------------------------------------------------------
Here's the C# code (also with embedded timer):
------------------------------------------------------------------------
using System;
using System.Diagnostics;

class BinaryTrees
{
   const int minDepth = 4;

   public static void Main(String[] args) 
   {        
      int n = 0;
      if (args.Length > 0) n = Int32.Parse(args[0]);
      
      Stopwatch t = new Stopwatch();
      t.Start();  
    
      int maxDepth = Math.Max(minDepth + 2, n);
      int stretchDepth = maxDepth + 1;

      int check = (TreeNode.bottomUpTree(0,stretchDepth)).itemCheck();
      Console.WriteLine("stretch tree of depth {0}\t check: {1}", stretchDepth,
check);

      TreeNode longLivedTree = TreeNode.bottomUpTree(0,maxDepth);

      for (int depth=minDepth; depth<=maxDepth; depth+=2){
         int iterations = 1 << (maxDepth - depth + minDepth);

         check = 0;
         for (int i=1; i<=iterations; i++)
         {
            check += (TreeNode.bottomUpTree(i,depth)).itemCheck();  
            check += (TreeNode.bottomUpTree(-i,depth)).itemCheck(); 
         }

         Console.WriteLine("{0}\t trees of depth {1}\t check: {2}", 
            iterations*2, depth, check);
      }
 
      Console.WriteLine("long lived tree of depth {0}\t check: {1}", 
         maxDepth, longLivedTree.itemCheck());
         
      t.Stop();
      Console.WriteLine(t.ElapsedMilliseconds + "ms elapsed.");      
   }


   class TreeNode 
   {     
      private TreeNode left, right;
      private int item;

      TreeNode(int item){
         this.item = item;
      }
      
      TreeNode(int item,TreeNode left, TreeNode right){
         this.left = left; 
         this.right = right;
         this.item = item;
      }

      internal static TreeNode bottomUpTree(int item, int depth){
         if (depth>0){
            return new TreeNode(
                 item,
                 bottomUpTree(2*item-1, depth-1),
                 bottomUpTree(2*item, depth-1)
               );
         }
         else {
            return new TreeNode(item);
         }
      }

      internal int itemCheck(){
         // if necessary deallocate here 
         if (left==null) return item;
         else return item + left.itemCheck() - right.itemCheck(); 
      }
   }
}
------------------------------------------------------------------------

So the results are (with cmdline parameter = 16):

C# on .NET 2.0	    				 :  1.902 sec
DMD 0.173 - malloc(original version)  :  5.189 sec
C# on Mono 1.1.18				 :  6.630 sec
DMD 0.173 - full GC 				 : 19.720 sec

-- 
AKhropov
Nov 11 2006
next sibling parent reply Kyle Furlong <kylefurlong gmail.com> writes:
Andrey Khropov wrote:
 Here is the D code for shootout.binarytrees with GC. Absolutely equal to the C#
 code (I also embedded a timer):
 
 ------------------------------------------------------------------------
 import std.c.stdlib, std.stdio, std.perf, std.gc;
 
 int main(char[][] args)
 {
     int N = args.length > 1 ? atoi(args[1]) : 1;
     
     auto t = new HighPerformanceCounter();
     t.start();
 
     int minDepth = 4;
     int maxDepth = (minDepth + 2) > N ? minDepth + 2 : N;
     int stretchDepth = maxDepth + 1;
 
     TreeNode stretchTree = TreeNode.BottomUpTree(0, stretchDepth);
     writefln("stretch tree of depth ",stretchDepth,"\t check:
 ",stretchTree.ItemCheck);
 
     TreeNode longLivedTree = TreeNode.BottomUpTree(0, maxDepth);
 
     int depth;
     for(depth = minDepth; depth <= maxDepth; depth += 2)
     {
         int check, iterations = 1 << (maxDepth - depth + minDepth);
 
         for(int i = 1; i <= iterations; i++)
         {
             auto tempTree = TreeNode.BottomUpTree(i, depth);
             check += tempTree.ItemCheck;
             //delete tempTree;
 
             tempTree = TreeNode.BottomUpTree(-i, depth);
             check += tempTree.ItemCheck;
             //delete tempTree;
         }
 
         writefln(iterations * 2,"\t trees of depth ",depth,"\t check: ",check);
     }
 
     writefln("long lived tree of depth ",maxDepth,"\t check:
 ",longLivedTree.ItemCheck);
     
     t.stop();
     
     writefln(t.milliseconds()," ms elapsed.");
 
     return 0;
 }
 
 class TreeNode
 {
 public:
     this(int item, TreeNode left = null, TreeNode right = null)
     {
       this.item  = item;
       this.left  = left;
       this.right = right;
     }
 
     static TreeNode BottomUpTree(int item, int depth)
     {
         if(depth > 0)
             return new TreeNode(item
                                 ,BottomUpTree(2 * item - 1, depth - 1)
                                 ,BottomUpTree(2 * item, depth - 1));
         return new TreeNode(item);
     }
 
     int ItemCheck()
     {
         if(left)
             return item + left.ItemCheck() - right.ItemCheck();
         return item;
     }
 private:
     TreeNode            left, right;
     int                 item;
 }
 ------------------------------------------------------------------------
 Here's the C# code (also with embedded timer):
 ------------------------------------------------------------------------
 using System;
 using System.Diagnostics;
 
 class BinaryTrees
 {
    const int minDepth = 4;
 
    public static void Main(String[] args) 
    {        
       int n = 0;
       if (args.Length > 0) n = Int32.Parse(args[0]);
       
       Stopwatch t = new Stopwatch();
       t.Start();  
     
       int maxDepth = Math.Max(minDepth + 2, n);
       int stretchDepth = maxDepth + 1;
 
       int check = (TreeNode.bottomUpTree(0,stretchDepth)).itemCheck();
       Console.WriteLine("stretch tree of depth {0}\t check: {1}", stretchDepth,
 check);
 
       TreeNode longLivedTree = TreeNode.bottomUpTree(0,maxDepth);
 
       for (int depth=minDepth; depth<=maxDepth; depth+=2){
          int iterations = 1 << (maxDepth - depth + minDepth);
 
          check = 0;
          for (int i=1; i<=iterations; i++)
          {
             check += (TreeNode.bottomUpTree(i,depth)).itemCheck();  
             check += (TreeNode.bottomUpTree(-i,depth)).itemCheck(); 
          }
 
          Console.WriteLine("{0}\t trees of depth {1}\t check: {2}", 
             iterations*2, depth, check);
       }
  
       Console.WriteLine("long lived tree of depth {0}\t check: {1}", 
          maxDepth, longLivedTree.itemCheck());
          
       t.Stop();
       Console.WriteLine(t.ElapsedMilliseconds + "ms elapsed.");      
    }
 
 
    class TreeNode 
    {     
       private TreeNode left, right;
       private int item;
 
       TreeNode(int item){
          this.item = item;
       }
       
       TreeNode(int item,TreeNode left, TreeNode right){
          this.left = left; 
          this.right = right;
          this.item = item;
       }
 
       internal static TreeNode bottomUpTree(int item, int depth){
          if (depth>0){
             return new TreeNode(
                  item,
                  bottomUpTree(2*item-1, depth-1),
                  bottomUpTree(2*item, depth-1)
                );
          }
          else {
             return new TreeNode(item);
          }
       }
 
       internal int itemCheck(){
          // if necessary deallocate here 
          if (left==null) return item;
          else return item + left.itemCheck() - right.itemCheck(); 
       }
    }
 }
 ------------------------------------------------------------------------
 
 So the results are (with cmdline parameter = 16):
 
 C# on .NET 2.0	    				 :  1.902 sec
 DMD 0.173 - malloc(original version)  :  5.189 sec
 C# on Mono 1.1.18				 :  6.630 sec
 DMD 0.173 - full GC 				 : 19.720 sec
 

Andrey, its no secret that the D GC is not implemented to be the fastest around. C# I believe uses at the least a generational collector, which I think is also copying. D currently uses a conservative mark and sweep collector. Rock solid, but slow.
Nov 11 2006
parent reply "Andrey Khropov" <andkhropov_nosp m_mtu-net.ru> writes:
Kyle Furlong wrote:

 Andrey, its no secret that the D GC is not implemented to be the fastest
 around. C# I believe uses at the least a generational collector, which I
 think is also copying.
 
 D currently uses a conservative mark and sweep collector. Rock solid, but
 slow.

My point here is that if we want to successfully compete with .NET we should adapt a better (copying,generational) collector. Of course it poses some problems (pointers to GC data for example), but if we have to go back to new/delete/malloc/free to get fast code (and it's actually very likely that C#'s GC with the fast allocation will be faster anyway (as in this case) unless we employ some special trickery like allocation lists) it spoils the whole idea of GC as a primary memory management mechanism for D. -- AKhropov
Nov 12 2006
parent Kyle Furlong <kylefurlong gmail.com> writes:
Andrey Khropov wrote:
 Kyle Furlong wrote:
 
 Andrey, its no secret that the D GC is not implemented to be the fastest
 around. C# I believe uses at the least a generational collector, which I
 think is also copying.

 D currently uses a conservative mark and sweep collector. Rock solid, but
 slow.

My point here is that if we want to successfully compete with .NET we should adapt a better (copying,generational) collector. Of course it poses some problems (pointers to GC data for example), but if we have to go back to new/delete/malloc/free to get fast code (and it's actually very likely that C#'s GC with the fast allocation will be faster anyway (as in this case) unless we employ some special trickery like allocation lists) it spoils the whole idea of GC as a primary memory management mechanism for D.

One step ahead of you... ;)
Nov 12 2006
prev sibling next sibling parent "Craig Black" <cblack ara.com> writes:
That really sucks.

Eventually, I know that it is Walter's intention to  optimize D's GC.  In
the mean time, when performance is critical, we can do heap allocation in
the traditional manner using malloc and free.  However, IMO, there should be
a more simple way to do this than overloading new and delete.

-Craig
Nov 11 2006
prev sibling next sibling parent reply "Craig Black" <cblack ara.com> writes:
 C# on .NET 2.0     :  1.902 sec
 DMD 0.173 - malloc(original version)  :  5.189 sec
 C# on Mono 1.1.18 :  6.630 sec
 DMD 0.173 - full GC : 19.720 sec

I see that C# GC is faster than DMD's malloc. How is this possible? Did Microsoft figure out how to make the GC really fast, or is D's malloc slow? I wonder how D's malloc compares to nedmalloc. It is a very fast cross-platform allocator. If it is the case that D's malloc is slow, why don't we adopt a public domain allocator that is real fast? It would be very easy to do. -Craig
Nov 11 2006
parent reply "Chris Miller" <chris dprogramming.com> writes:
On Sat, 11 Nov 2006 23:01:50 -0500, Craig Black <cblack ara.com> wrote:

 C# on .NET 2.0     :  1.902 sec
 DMD 0.173 - malloc(original version)  :  5.189 sec
 C# on Mono 1.1.18 :  6.630 sec
 DMD 0.173 - full GC : 19.720 sec

I see that C# GC is faster than DMD's malloc. How is this possible? Did Microsoft figure out how to make the GC really fast, or is D's malloc slow?

The type of GC used by .NET is just about as fast as stack allocation; a simple pointer is advanced. It's an upside to relocating data (copying collector).
Nov 11 2006
parent reply "Craig Black" <cblack ara.com> writes:
GC that performs faster than malloc!?  Incredible!  D needs this.

-Craig

"Chris Miller" <chris dprogramming.com> wrote in message
news:op.tiv11offpo9bzi tanu...
 On Sat, 11 Nov 2006 23:01:50 -0500, Craig Black <cblack ara.com> wrote:

 C# on .NET 2.0     :  1.902 sec
 DMD 0.173 - malloc(original version)  :  5.189 sec
 C# on Mono 1.1.18 :  6.630 sec
 DMD 0.173 - full GC : 19.720 sec

I see that C# GC is faster than DMD's malloc. How is this possible?


 Microsoft figure out how to make the GC really fast, or is D's malloc
 slow?

The type of GC used by .NET is just about as fast as stack allocation; a simple pointer is advanced. It's an upside to relocating data (copying collector).

Nov 12 2006
parent David Medlock <noone nowhere.com> writes:
Craig Black wrote:
 GC that performs faster than malloc!?  Incredible!  D needs this.
 
 -Craig
 

http://citeseer.ist.psu.edu/appel87garbage.html and http://home.pipeline.com/~hbaker1/CheneyMTA.html -DavidM
Nov 13 2006
prev sibling parent reply Dave <Dave_member pathlink.com> writes:
Andrey Khropov wrote:
 So the results are (with cmdline parameter = 16):
 
 C# on .NET 2.0	    				 :  1.902 sec
 DMD 0.173 - malloc(original version)  :  5.189 sec
 C# on Mono 1.1.18				 :  6.630 sec
 DMD 0.173 - full GC 				 : 19.720 sec
 

How about this (diff of original version)? --- binarytrees.d 2006-06-02 09:34:38.000000000 -0500 +++ binarytrees_list.d 2006-11-11 21:30:23.000000000 -0600 -41,7 +41,6 } writefln("long lived tree of depth ",maxDepth,"\t check: ",longLivedTree.ItemCheck); - TreeNode.DeleteTree(longLivedTree); return 0; } -80,6 +79,9 TreeNode* left, right; int item; + static TreeNode* list; + TreeNode* next; + static TreeNode* opCall(int item, TreeNode* left = null, TreeNode* right = null) { TreeNode* t = new TreeNode; -91,11 +93,26 new(uint sz) { - return malloc(sz); + TreeNode* tn; + if(list) + { + tn = list; + list = tn.next; + } + else + { + tn = cast(TreeNode*)malloc(sz); + } + return tn; } delete(void* p) { - free(p); + if(p) + { + TreeNode* tn = cast(TreeNode*)p; + tn.next = list; + list = tn; + } } }
Nov 11 2006
parent reply "Andrey Khropov" <andkhropov_nosp m_mtu-net.ru> writes:
Dave wrote:

Much better :) (you should post it to shootout maybe) :

1) DMD 0.173 - allocation lists             :  1.605 sec
2) C# on .NET 2.0                           :  1.902 sec
3) DMD 0.173 - malloc(original version)     :  5.189 sec
4) C# on Mono 1.1.18                        :  6.630 sec
5) DMD 0.173 - full GC                      : 19.720 sec

But anyway I'd like D's GC to work reasonably well out of the box.

-- 
AKhropov
Nov 12 2006
parent Dave <Dave_member pathlink.com> writes:
Andrey Khropov wrote:
 Dave wrote:
 
 Much better :) (you should post it to shootout maybe) :
 

It doesn't qualify <g>
 1) DMD 0.173 - allocation lists             :  1.605 sec
 2) C# on .NET 2.0                           :  1.902 sec
 3) DMD 0.173 - malloc(original version)     :  5.189 sec
 4) C# on Mono 1.1.18                        :  6.630 sec
 5) DMD 0.173 - full GC                      : 19.720 sec
 
 But anyway I'd like D's GC to work reasonably well out of the box.
 

Ditto.
Nov 12 2006