digitalmars.D - [Performance] shootout.binarytrees when implemented with gc is 10x slower than C# on .NET 2.0
- Andrey Khropov (147/147) Nov 11 2006 Here is the D code for shootout.binarytrees with GC. Absolutely equal to...
- Kyle Furlong (6/180) Nov 11 2006 Andrey, its no secret that the D GC is not implemented to be the fastest...
- Andrey Khropov (10/16) Nov 12 2006 My point here is that if we want to successfully compete with .NET we sh...
- Kyle Furlong (2/20) Nov 12 2006 One step ahead of you... ;)
- Craig Black (6/6) Nov 11 2006 That really sucks.
- Craig Black (7/11) Nov 11 2006 I see that C# GC is faster than DMD's malloc. How is this possible? Di...
- Chris Miller (4/11) Nov 11 2006 The type of GC used by .NET is just about as fast as stack allocation; a...
- Craig Black (5/17) Nov 12 2006 GC that performs faster than malloc!? Incredible! D needs this.
- David Medlock (6/10) Nov 13 2006 see:
- Dave (46/53) Nov 11 2006 How about this (diff of original version)?
- Andrey Khropov (10/10) Nov 12 2006 Dave wrote:
- Dave (3/15) Nov 12 2006 Ditto.
code (I also embedded a timer): ------------------------------------------------------------------------ import std.c.stdlib, std.stdio, std.perf, std.gc; int main(char[][] args) { int N = args.length > 1 ? atoi(args[1]) : 1; auto t = new HighPerformanceCounter(); t.start(); int minDepth = 4; int maxDepth = (minDepth + 2) > N ? minDepth + 2 : N; int stretchDepth = maxDepth + 1; TreeNode stretchTree = TreeNode.BottomUpTree(0, stretchDepth); writefln("stretch tree of depth ",stretchDepth,"\t check: ",stretchTree.ItemCheck); TreeNode longLivedTree = TreeNode.BottomUpTree(0, maxDepth); int depth; for(depth = minDepth; depth <= maxDepth; depth += 2) { int check, iterations = 1 << (maxDepth - depth + minDepth); for(int i = 1; i <= iterations; i++) { auto tempTree = TreeNode.BottomUpTree(i, depth); check += tempTree.ItemCheck; //delete tempTree; tempTree = TreeNode.BottomUpTree(-i, depth); check += tempTree.ItemCheck; //delete tempTree; } writefln(iterations * 2,"\t trees of depth ",depth,"\t check: ",check); } writefln("long lived tree of depth ",maxDepth,"\t check: ",longLivedTree.ItemCheck); t.stop(); writefln(t.milliseconds()," ms elapsed."); return 0; } class TreeNode { public: this(int item, TreeNode left = null, TreeNode right = null) { this.item = item; this.left = left; this.right = right; } static TreeNode BottomUpTree(int item, int depth) { if(depth > 0) return new TreeNode(item ,BottomUpTree(2 * item - 1, depth - 1) ,BottomUpTree(2 * item, depth - 1)); return new TreeNode(item); } int ItemCheck() { if(left) return item + left.ItemCheck() - right.ItemCheck(); return item; } private: TreeNode left, right; int item; } ------------------------------------------------------------------------ ------------------------------------------------------------------------ using System; using System.Diagnostics; class BinaryTrees { const int minDepth = 4; public static void Main(String[] args) { int n = 0; if (args.Length > 0) n = Int32.Parse(args[0]); Stopwatch t = new Stopwatch(); t.Start(); int maxDepth = Math.Max(minDepth + 2, n); int stretchDepth = maxDepth + 1; int check = (TreeNode.bottomUpTree(0,stretchDepth)).itemCheck(); Console.WriteLine("stretch tree of depth {0}\t check: {1}", stretchDepth, check); TreeNode longLivedTree = TreeNode.bottomUpTree(0,maxDepth); for (int depth=minDepth; depth<=maxDepth; depth+=2){ int iterations = 1 << (maxDepth - depth + minDepth); check = 0; for (int i=1; i<=iterations; i++) { check += (TreeNode.bottomUpTree(i,depth)).itemCheck(); check += (TreeNode.bottomUpTree(-i,depth)).itemCheck(); } Console.WriteLine("{0}\t trees of depth {1}\t check: {2}", iterations*2, depth, check); } Console.WriteLine("long lived tree of depth {0}\t check: {1}", maxDepth, longLivedTree.itemCheck()); t.Stop(); Console.WriteLine(t.ElapsedMilliseconds + "ms elapsed."); } class TreeNode { private TreeNode left, right; private int item; TreeNode(int item){ this.item = item; } TreeNode(int item,TreeNode left, TreeNode right){ this.left = left; this.right = right; this.item = item; } internal static TreeNode bottomUpTree(int item, int depth){ if (depth>0){ return new TreeNode( item, bottomUpTree(2*item-1, depth-1), bottomUpTree(2*item, depth-1) ); } else { return new TreeNode(item); } } internal int itemCheck(){ // if necessary deallocate here if (left==null) return item; else return item + left.itemCheck() - right.itemCheck(); } } } ------------------------------------------------------------------------ So the results are (with cmdline parameter = 16): DMD 0.173 - malloc(original version) : 5.189 sec DMD 0.173 - full GC : 19.720 sec -- AKhropov
Nov 11 2006
Andrey Khropov wrote:code (I also embedded a timer): ------------------------------------------------------------------------ import std.c.stdlib, std.stdio, std.perf, std.gc; int main(char[][] args) { int N = args.length > 1 ? atoi(args[1]) : 1; auto t = new HighPerformanceCounter(); t.start(); int minDepth = 4; int maxDepth = (minDepth + 2) > N ? minDepth + 2 : N; int stretchDepth = maxDepth + 1; TreeNode stretchTree = TreeNode.BottomUpTree(0, stretchDepth); writefln("stretch tree of depth ",stretchDepth,"\t check: ",stretchTree.ItemCheck); TreeNode longLivedTree = TreeNode.BottomUpTree(0, maxDepth); int depth; for(depth = minDepth; depth <= maxDepth; depth += 2) { int check, iterations = 1 << (maxDepth - depth + minDepth); for(int i = 1; i <= iterations; i++) { auto tempTree = TreeNode.BottomUpTree(i, depth); check += tempTree.ItemCheck; //delete tempTree; tempTree = TreeNode.BottomUpTree(-i, depth); check += tempTree.ItemCheck; //delete tempTree; } writefln(iterations * 2,"\t trees of depth ",depth,"\t check: ",check); } writefln("long lived tree of depth ",maxDepth,"\t check: ",longLivedTree.ItemCheck); t.stop(); writefln(t.milliseconds()," ms elapsed."); return 0; } class TreeNode { public: this(int item, TreeNode left = null, TreeNode right = null) { this.item = item; this.left = left; this.right = right; } static TreeNode BottomUpTree(int item, int depth) { if(depth > 0) return new TreeNode(item ,BottomUpTree(2 * item - 1, depth - 1) ,BottomUpTree(2 * item, depth - 1)); return new TreeNode(item); } int ItemCheck() { if(left) return item + left.ItemCheck() - right.ItemCheck(); return item; } private: TreeNode left, right; int item; } ------------------------------------------------------------------------ ------------------------------------------------------------------------ using System; using System.Diagnostics; class BinaryTrees { const int minDepth = 4; public static void Main(String[] args) { int n = 0; if (args.Length > 0) n = Int32.Parse(args[0]); Stopwatch t = new Stopwatch(); t.Start(); int maxDepth = Math.Max(minDepth + 2, n); int stretchDepth = maxDepth + 1; int check = (TreeNode.bottomUpTree(0,stretchDepth)).itemCheck(); Console.WriteLine("stretch tree of depth {0}\t check: {1}", stretchDepth, check); TreeNode longLivedTree = TreeNode.bottomUpTree(0,maxDepth); for (int depth=minDepth; depth<=maxDepth; depth+=2){ int iterations = 1 << (maxDepth - depth + minDepth); check = 0; for (int i=1; i<=iterations; i++) { check += (TreeNode.bottomUpTree(i,depth)).itemCheck(); check += (TreeNode.bottomUpTree(-i,depth)).itemCheck(); } Console.WriteLine("{0}\t trees of depth {1}\t check: {2}", iterations*2, depth, check); } Console.WriteLine("long lived tree of depth {0}\t check: {1}", maxDepth, longLivedTree.itemCheck()); t.Stop(); Console.WriteLine(t.ElapsedMilliseconds + "ms elapsed."); } class TreeNode { private TreeNode left, right; private int item; TreeNode(int item){ this.item = item; } TreeNode(int item,TreeNode left, TreeNode right){ this.left = left; this.right = right; this.item = item; } internal static TreeNode bottomUpTree(int item, int depth){ if (depth>0){ return new TreeNode( item, bottomUpTree(2*item-1, depth-1), bottomUpTree(2*item, depth-1) ); } else { return new TreeNode(item); } } internal int itemCheck(){ // if necessary deallocate here if (left==null) return item; else return item + left.itemCheck() - right.itemCheck(); } } } ------------------------------------------------------------------------ So the results are (with cmdline parameter = 16): DMD 0.173 - malloc(original version) : 5.189 sec DMD 0.173 - full GC : 19.720 secAndrey, its no secret that the D GC is not implemented to be the fastest think is also copying. D currently uses a conservative mark and sweep collector. Rock solid, but slow.
Nov 11 2006
Kyle Furlong wrote:Andrey, its no secret that the D GC is not implemented to be the fastest think is also copying. D currently uses a conservative mark and sweep collector. Rock solid, but slow.My point here is that if we want to successfully compete with .NET we should adapt a better (copying,generational) collector. Of course it poses some problems (pointers to GC data for example), but if we have to go back to new/delete/malloc/free to get fast code (and it's actually this case) unless we employ some special trickery like allocation lists) it spoils the whole idea of GC as a primary memory management mechanism for D. -- AKhropov
Nov 12 2006
Andrey Khropov wrote:Kyle Furlong wrote:One step ahead of you... ;)Andrey, its no secret that the D GC is not implemented to be the fastest think is also copying. D currently uses a conservative mark and sweep collector. Rock solid, but slow.My point here is that if we want to successfully compete with .NET we should adapt a better (copying,generational) collector. Of course it poses some problems (pointers to GC data for example), but if we have to go back to new/delete/malloc/free to get fast code (and it's actually this case) unless we employ some special trickery like allocation lists) it spoils the whole idea of GC as a primary memory management mechanism for D.
Nov 12 2006
That really sucks. Eventually, I know that it is Walter's intention to optimize D's GC. In the mean time, when performance is critical, we can do heap allocation in the traditional manner using malloc and free. However, IMO, there should be a more simple way to do this than overloading new and delete. -Craig
Nov 11 2006
DMD 0.173 - malloc(original version) : 5.189 sec DMD 0.173 - full GC : 19.720 secMicrosoft figure out how to make the GC really fast, or is D's malloc slow? I wonder how D's malloc compares to nedmalloc. It is a very fast cross-platform allocator. If it is the case that D's malloc is slow, why don't we adopt a public domain allocator that is real fast? It would be very easy to do. -Craig
Nov 11 2006
On Sat, 11 Nov 2006 23:01:50 -0500, Craig Black <cblack ara.com> wrote:The type of GC used by .NET is just about as fast as stack allocation; a simple pointer is advanced. It's an upside to relocating data (copying collector).DMD 0.173 - malloc(original version) : 5.189 sec DMD 0.173 - full GC : 19.720 secMicrosoft figure out how to make the GC really fast, or is D's malloc slow?
Nov 11 2006
GC that performs faster than malloc!? Incredible! D needs this. -Craig "Chris Miller" <chris dprogramming.com> wrote in message news:op.tiv11offpo9bzi tanu...On Sat, 11 Nov 2006 23:01:50 -0500, Craig Black <cblack ara.com> wrote:DidDMD 0.173 - malloc(original version) : 5.189 sec DMD 0.173 - full GC : 19.720 secMicrosoft figure out how to make the GC really fast, or is D's malloc slow?The type of GC used by .NET is just about as fast as stack allocation; a simple pointer is advanced. It's an upside to relocating data (copying collector).
Nov 12 2006
Craig Black wrote:GC that performs faster than malloc!? Incredible! D needs this. -Craigsee: http://citeseer.ist.psu.edu/appel87garbage.html and http://home.pipeline.com/~hbaker1/CheneyMTA.html -DavidM
Nov 13 2006
Andrey Khropov wrote:So the results are (with cmdline parameter = 16): DMD 0.173 - malloc(original version) : 5.189 sec DMD 0.173 - full GC : 19.720 secHow about this (diff of original version)? --- binarytrees.d 2006-06-02 09:34:38.000000000 -0500 +++ binarytrees_list.d 2006-11-11 21:30:23.000000000 -0600 -41,7 +41,6 } writefln("long lived tree of depth ",maxDepth,"\t check: ",longLivedTree.ItemCheck); - TreeNode.DeleteTree(longLivedTree); return 0; } -80,6 +79,9 TreeNode* left, right; int item; + static TreeNode* list; + TreeNode* next; + static TreeNode* opCall(int item, TreeNode* left = null, TreeNode* right = null) { TreeNode* t = new TreeNode; -91,11 +93,26 new(uint sz) { - return malloc(sz); + TreeNode* tn; + if(list) + { + tn = list; + list = tn.next; + } + else + { + tn = cast(TreeNode*)malloc(sz); + } + return tn; } delete(void* p) { - free(p); + if(p) + { + TreeNode* tn = cast(TreeNode*)p; + tn.next = list; + list = tn; + } } }
Nov 11 2006
Dave wrote: Much better :) (you should post it to shootout maybe) : 1) DMD 0.173 - allocation lists : 1.605 sec 3) DMD 0.173 - malloc(original version) : 5.189 sec 5) DMD 0.173 - full GC : 19.720 sec But anyway I'd like D's GC to work reasonably well out of the box. -- AKhropov
Nov 12 2006
Andrey Khropov wrote:Dave wrote: Much better :) (you should post it to shootout maybe) :It doesn't qualify <g>1) DMD 0.173 - allocation lists : 1.605 sec 3) DMD 0.173 - malloc(original version) : 5.189 sec 5) DMD 0.173 - full GC : 19.720 sec But anyway I'd like D's GC to work reasonably well out of the box.Ditto.
Nov 12 2006