Welcome to Web-News
A Web-based News Reader
Subject Python dictionary inspired hash table implementation
From Moritz Warning <moritzwarning@web.de>
Date Wed, 02 Jul 2008 14:59:23 -0400
Newsgroups digitalmars.D
Attachment(s) dict3.d

I like to share a hash table implementation inspired by Pythons Dictionary.

It's a so called closed hashing table, that is, keys and elements are stored in the table.
Pointers are avoided when possible.
The table size is 2**n and the used load factor is 0.75.

It works for most key types (also static arrays).
All kind of values types should be supported.

For comparison, I did a simple/stupid benchmark,
adding 10.000.000 uint pairs and looking them up
in consecutive order (see attached file).

The build-in AA uint[uint]:
inserts:  1901966/s (5.26s) lookups: 8065673/s (1.24s)

tango.util.container.HashMap!(uint, uint, ..):
inserts:  6346664/s (1.58s) lookups: 23780047/s (0.42s)

The above implementation, Dict!(uint, uint):
inserts:  8199181/s (1.22s) lookups: 34506306/s (0.29s)

This implementation is far from perfect and has room for improvements.
E.g., it is twice as slow for lookups of ulong keys compared to Tango HashMap
(3 times slower for insertion). It's also slower for arrays.
But that's likely to change with some more tweaking.

The attached code is placed into Public Domain.



module dict3; import tango.io.Stdout; import tango.core.Memory; import tango.core.Traits; import tango.util.container.HashMap; import tango.time.StopWatch; import tango.stdc.stdio; //for printf import tango.stdc.string; //for memset /* * This file includes a closed hash map implementation inspired by Pythons Dictionary implementation. * The code of this file is placed into Public Domain. * * Author: Moritz Warning * * * Compiler command used for testing: * gdc dict3.d -o dict3 -fversion=Posix -fversion=Tango -I/home/user/local/include/d/4.1.2 -L/home/user/local/lib -lgtango -ffast-math -O3 -frelease -finline-functions -msse -mfpmath=sse */ const uint M = 1; //number of tests to run const uint N = 1_000_000;  //number of inserts and lookups double lookup_time = 0; double insert_time = 0; StopWatch w; void main() {         GC.disable();                  for (size_t i = 0; i < M; i++)         {                 run();         }                  printf("%u x %u iterations\n", M, N);         printf("inserts:  %u/s (%fs)\n", cast(uint) (M * N / insert_time), (insert_time / M));         printf("lookups: %u/s (%fs)\n",cast(uint) (M * N / lookup_time), (lookup_time / M)); } void run() {         //printf("uint[uint]\n");         //uint[uint] aa;                  //printf("HashMap!(ulong, uint, ..)\n");         //scope aa = new HashMap!(uint, uint, Container.hash, Container.reap, Container.Chunk);         printf("Dict!(uint, uint)\n");         scope aa = new Dict!(uint, uint)();                  w.start;         for (uint i=N ; i--;)         {                 aa[i] = i;         }         insert_time += w.stop();                  w.start;         for (uint i=N; i--;)         {                 auto foo = (i in aa);         }         lookup_time += w.stop(); } class Dict(K, V) { private:         //ulong, double etc. on 32bit         struct BigPODWrapper(T)         {                 T data;                 size_t hash;                                  void ctor()                 {                         static assert(T.sizeof >= size_t.sizeof);                                                  hash = typeid(T).getHash(&data);                                                  //will work for ulong with additional hash                         //hash = *cast(size_t*) &data + (cast(size_t*) &data)[1];                                                  //avoid special hashes                         if(isSpecialKey(*this)) hash += 2;                 }                                  void markDummy()                 {                         this.hash = cast(T) dummy_hash;                 }                 alias typeof(*this) TT;                 static bool cmp1(TT a, TT b) { return a.data == b.data; }                 static bool cmp2(TT a, TT b) { return false; }                 static bool cmp3(TT a, TT b) { return a.data == b.data; }         }         //byte, uint etc. on 32bit         struct SmallPODWrapper(T)         {                 T data;                 alias data hash;                                  void ctor()                 {                         static assert(T.sizeof <= size_t.sizeof && !isPointerType!(T));                 }                                  void markDummy()                 {                         this.data = cast(T) dummy_hash;                 }                 alias typeof(*this) TT;                 static bool cmp1(TT a, TT b) { return a.data == b.data; }                 static bool cmp2(TT a, TT b) { return false; }                 static bool cmp3(TT a, TT b) { return a.data == b.data; }         }         struct PointerWrapper(T)         {                 T data;                                  void ctor()                 {                         static assert(isReferenceType!(T));                 }                                  void markDummy()                 {                         void* tmp = cast(void*) dummy_hash;                         this.data = cast(T) tmp;                 }                                  size_t hash() { return cast(size_t) cast(void*) data; }                                  alias typeof(*this) TT;                 static bool cmp1(TT a, TT b) { return a.data is b.data; }                 static bool cmp2(TT a, TT b)                 {                         static if(is(a.opEquals))                         {                                 return a.opEquals(b);                         }                         else                         {                                 return false;                         }                 }                 static bool cmp3(TT a, TT b)                 {                         static if(is(a.opEquals))                         {                                 return a.data is b.data || a.opEquals(b);                         }                         else                         {                                 return a.data is b.data;                         }                 }         }         struct ArrayWrapper(T)         {                 T data;                 size_t hash;                                  void ctor()                 {                         static assert(isDynamicArrayType!(T) || isStaticArrayType!(T));                                                  if(data.length == 0)                         {                                 if(cast(size_t) cast(void*) data.ptr == unused_hash)                                 {                                         hash = unused_hash;                                         return;                                 }                                 else if(cast(size_t) cast(void*) data.ptr == dummy_hash)                                 {                                         hash = dummy_hash;                                         return;                                 }                         }                                                  //hash function                         void[] a = cast(void[]) data;                         auto len = a.length;                         ubyte* p = cast(ubyte *) a.ptr;                         hash = *p << 7;                         while (--len != 0)                         {                                 hash = (1000003 * hash) ^ *p++;                         }                         hash ^= a.length;                                                  //avoid special hashes                         if(isSpecialKey(*this)) hash += 2;                 }                                  void markDummy()                 {                         this.hash = dummy_hash;                         this.data = null;                 }                                  alias typeof(*this) TT;                 static bool cmp1(TT a, TT b) { return (a.hash == b.hash && a.data == b.data); }                 static bool cmp2(TT a, TT b) { return false; }                 static bool cmp3(TT a, TT b) { return (a.hash == b.hash && a.data == b.data); }         }         /*         struct StructWrapper(T)         {                 size_t hash;                 T data;                                  void ctor()                 {                         this.hash = typeid(T).getHash(&data);                 }                                  //set dummy                 void markDummy()                 {                         this.hash = hash;                         this.data = T.init;                 }                                  T getData() { return data; }                 size_t getHash() { return hash; }                                  alias typeof(*this) TT;                 static bool cmp1(TT a, TT b) { return typeid(T).equals(&a.data, &b.data); }                 static bool cmp2(TT a, TT b) { return false; }                 static bool cmp3(TT a, TT b) { return typeid(T).equals(&a.data, &b.data); }         }         */         struct GenericWrapper(T)         {                 T data;                 size_t hash;                                  void ctor()                 {                         this.hash = typeid(T).getHash(&data);                 }                                  void markDummy()                 {                         this.hash = 1;                         this.data = T.init;                 }                                  size_t getHash() { return hash; }                                  alias typeof(*this) TT;                 static bool cmp1(TT a, TT b) { return cast(bool) typeid(T).equals(&a.data, &b.data); }                 static bool cmp2(TT a, TT b) { return false; }                 static bool cmp3(TT a, TT b) { return cast(bool) typeid(T).equals(&a.data, &b.data); }         }         /*         * The hash may overlap, but not for values 1, 0.         * Good approach is to assign 0,1 to rare (or even invalid) key values         * and prevent the hash function from generating 1 and 0.         */         template KeyWrapper(K)         {                 static if (isDynamicArrayType!(K) || isStaticArrayType!(K))                 {                         alias ArrayWrapper!(K) type;                 }                 else static if (isReferenceType!(K))                 {                         alias PointerWrapper!(K) type;                 }                 else static if (K.sizeof <= size_t.sizeof)                 {                         alias SmallPODWrapper!(K) type;                 }                 else static if (K.sizeof > size_t.sizeof)                 {                         alias BigPODWrapper!(K) type;                 }                 else                 {                         //uses TypeInfo                         alias GenericWrapper!(K) type;                 }         }         //key wrapper type         alias KeyWrapper!(K).type KW;         //need to be 0 for the algorithm to terminate         static const size_t unused_hash = 0;         static const size_t dummy_hash = 1;         //need to be a power of two         static const size_t MINSIZE = 8;         static const size_t PERTURB_SHIFT = 5;         struct Entry         {                 KW key;                 V value;         }                  //active + dummy entries         size_t fill = 0;                  //active entries         size_t used = 0;         /*         * The table contains mask + 1 slots, and that's a power of 2.         * We store the mask instead of the size because the mask         * is more frequently needed.         */         size_t mask = MINSIZE - 1;                  //table of size 2**n         Entry* table = void;         /*         * Since this.table can't hold entries for both special keys,         * they have to be stored and handled separately.         */         bool is_unused = false;         KW unused_key = KW.init;         V unused_value = V.init;                  bool is_dummy = false;         KW dummy_key = KW.init;         V dummy_value = V.init;         public this()         {                 static assert(unused_hash != dummy_hash);                 this.table = cast(Entry*) GC.calloc(Entry.sizeof * MINSIZE);         }                  ~this()         {                 delete this.table;         }         /*         * Any key that is not special key is active.         */         private static bool isActiveKey(KW key)         {                 return (key.hash > 1);         }                  private static bool isDummyKey(KW key)         {                 return (key.hash == dummy_hash);         }                  private static bool isUnusedKey(KW key)         {                 return (key.hash == unused_hash);         }                  private static bool isSpecialKey(KW key)         {                 return (key.hash < 2);         }                  /*         * Lookup an entry in the table.         * This is the workhorse.         */         private Entry* lookdict(KW key)         {                 assert(!isSpecialKey(key));                                  size_t hash = key.hash;                                  size_t perturb = void;                 Entry *freeslot = void;                 size_t mask = this.mask;                 Entry *ep0 = this.table;                                  size_t i = hash & mask;                 Entry *ep = &ep0[i];                                  /*                 * This first lookup will succeed in the very most cases (=> no collision).                 */                                  if (isUnusedKey(ep.key) || KW.cmp1(ep.key, key))                 {                         return ep;                 }                                  if (isDummyKey(ep.key))                 {                         freeslot = ep;                 }                 else                 {                         if (KW.cmp2(ep.key, key))                         {                                 return ep;                         }                                                  freeslot = null;                 }                                  /*                 * In the loop, key == dummy is by far (factor of 100s) the                 * least likely outcome, so test for that last.                 */                 for (perturb = hash; ; perturb >>= PERTURB_SHIFT)                 {                         i = (i << 2) + i + perturb + 1;                         ep = &ep0[i & mask];                                                  if (isUnusedKey(ep.key))                         {                                 return (freeslot is null) ? ep : freeslot;                         }                                                  if (KW.cmp3(ep.key, key))                         {                                 return ep;                         }                                                  if (freeslot is null && isDummyKey(ep.key))                         {                                 freeslot = ep;                         }                 }                 assert(0);        //never reached         }                  public V* opIn_r(K k)         {                 //wrap                 auto key = KW(k);                 key.ctor();                                  if (isSpecialKey(key))                 {                         if (isUnusedKey(key))                         {                                 return is_unused ? &unused_value : null;                         }                         else //must be dummy                         {                                 assert(isDummyKey(key));                                 return is_dummy ? &dummy_value : null;                         }                         assert(0);                 }                                  Entry* ep = lookdict(key);                 assert(ep);                                  if (isActiveKey(ep.key))                 {                         return &ep.value;                 }                 else                 {                         return null;                 }         }         public void opIndexAssign(V value, K k)         {                 assert(this.fill <= this.mask);  //algorithm need at least one empty slot                 //wrap                 auto key = KW(k);                 key.ctor();                                  if (isSpecialKey(key))                 {                         if (isUnusedKey(key))                         {                                 is_unused = true;                                 unused_key = key;                                 unused_value = value;                                 return;                         }                         else //must be dummy                         {                                 assert(isDummyKey(key));                                 is_dummy = true;                                 dummy_key = key;                                 dummy_value = value;                                 return;                         }                 }                                  Entry* ep = lookdict(key);                 assert(ep);                                  if (isActiveKey(ep.key))                 {                         ep.value = value;                 }                 else                 {                         if (isUnusedKey(ep.key))                         {                                 this.fill++;                         }                         else                         {                                 assert(isDummyKey(ep.key));                         }                                                  ep.key = key;                         ep.value = value;                                                  this.used++;                         checkLoad();                 }         }         /*         * Check load factor and allocate two times of the current table size         * if we have > 50_000 items; otherwise four times as much.         */         private void checkLoad()         {                 //Make table bigger if load factor > 3/4.                 //This can also result in smaller table if there are many dummy entries)                 if (this.fill * 4 >= (this.mask + 1) * 3) //load factor is 3/4                 {                         dictresize((this.used > 50000 ? 2 : 4) * this.used);                 }                 /*                 //make table smaller, table size > MINSIZE and load factor is < 1/8                 else if (this.mask + 1 > MINSIZE && this.fill * 4 < (this.mask + 1) * 1)                 {                         dictresize(this.used / (this.used > 50000 ? 4 : 2));                 }*/         }                  public void remove(K k)         {                 //wrap                 auto key = KW(k);                 key.ctor();                                  if (isSpecialKey(key))                 {                         if (isUnusedKey(key))                         {                                 is_unused = false;                                 unused_value = V.init;                                 unused_key = KW.init;                                 return;                         }                         else //must be dummy                         {                                 assert(isDummyKey(key));                                 is_dummy = false;                                 dummy_value = V.init;                                 dummy_key = KW.init;                                 return;                         }                 }                                  Entry* ep = lookdict(key);                 assert(ep);                                  ep.key.markDummy();                 ep.value = V.init; //not needed for POD?                 this.used--;         }                  private void dictresize(size_t minused)         {                 Entry* newtable = void;                 Entry[MINSIZE] small_copy;                 // Find the smallest table size > minused and size == 2**n.                 size_t newsize = MINSIZE;                 while(newsize <= minused)                 {                         newsize <<= 1;                 }                                  // Get space for a new table.                 Entry* oldtable = this.table;                 assert(oldtable !is null);                                  newtable = cast(Entry*) GC.malloc(Entry.sizeof * newsize);                                  assert(newtable);                 assert(newtable != oldtable);                                  this.table = newtable;                 this.mask = newsize - 1;                                  memset(newtable, 0, Entry.sizeof * newsize);                                  this.used = 0;                 size_t i = this.fill;                 this.fill = 0;                 //copy the data over; filter out dummies                 for (Entry* ep = oldtable; i > 0; ep++)                 {                         if (isActiveKey(ep.key))                         {                                 --i;                                 insertdict_clean(ep.key, ep.value);                         }                         else if (isDummyKey(ep.key))                         {                                 --i;                         }                 }                 delete oldtable;         }                  /*         * Insert an item which is known to be absent from the dict.         * This routine also assumes that the dict contains no deleted entries.         */         private void insertdict_clean(KW key, V value)         {                 assert(!isSpecialKey(key));                                  size_t hash = key.hash;                 size_t perturb = void;                 size_t mask = this.mask;                 Entry *ep0 = this.table;                 size_t i = hash & mask;                 Entry* ep = &ep0[i];                                  for (perturb = hash; !isUnusedKey(ep.key); perturb >>= PERTURB_SHIFT)                 {                         i = (i << 2) + i + perturb + 1;                         ep = &ep0[i & mask];                 }                                  this.fill++;                 ep.key = key;                 ep.value = value;                 this.used++;         }                  /*         * We use this template to cast a static array to a dynamic one in opApply,         * since the dmd specs don't allow them as ref parameters :F         */         template Helper(T)         {                 static if(isStaticArrayType!(T))                 {                         alias typeof(T.init[0])[] type; //the equivalent dynamic array                 }                 else                 {                         alias T type;                 }         }                  alias Helper!(K).type K_;         alias Helper!(V).type V_;                  public int opApply(int delegate(ref V_ value) dg)         {                 return opApply((ref K_ k, ref V_ v) { return dg(v); });         }                  public int opApply(int delegate(ref K_ key, ref V_ value) dg)         {                 Entry* ep = this.table;                 int result = 0;                                  if (is_unused)                 {                         auto key = cast(K_) unused_key.data;                         auto value = cast(V_) unused_value;                         result = dg(key, value);                         if(result != 0)                         {                                 return result;                         }                 }                                  if (is_dummy)                 {                         auto key = cast(K_) dummy_key.data;                         auto value = cast(V_) dummy_value;                         result = dg(key, value);                                                  if(result != 0)                         {                                 return result;                         }                 }                                  for (size_t i = 0; i <= this.mask; ++i)                 {                         if (isSpecialKey(ep[i].key))                         {                                 continue;                         }                                                  auto key = cast(K_) ep[i].key.data;                         auto value = cast(V_) ep[i].value;                         result = dg(key, value);                                                  if (result != 0)                         {                                 break;                         }                 }                                  return result;         }                  /*         * Get number of active entries stored.         */         public size_t size()         {                 return used + is_dummy + is_unused;         }                  public K[] keys()         {                 K[] keys = new K[](this.size());                 Entry* ep = this.table;                 size_t length = this.mask;                 size_t n = 0;                                  if (is_unused) keys[n++] = unused_key.data;                 if (is_dummy) keys[n++] = dummy_key.data;                                  for (size_t i = 0; i < length; ++i)                 {                         if (isSpecialKey(ep[i].key))                         {                                 continue;                         }                         keys[n] = ep[i].key.data;                         ++n;                 }                          return keys;         }                  public V[] values()         {                 V[] values = new V[](this.size());                 Entry* ep = this.table;                 size_t length = this.mask;                 size_t n = 0;                                  if (is_unused) values[n] = unused_value;                 if (is_dummy) values[n++] = dummy_value;                                  for (size_t i = 0; i < length; ++i)                 {                         if (isSpecialKey(ep[i].key))                         {                                 continue;                         }                         values[n] = ep[i].value;                         ++n;                 }                                  return values;         } }
Recent messages in this thread
 
-# Python dictionary inspired hash table implementation (Current message) Moritz Warning 02-Jul-2008 02:59 pm
.-# Re: Python dictionary inspired hash table implementation bearophile 02-Jul-2008 05:11 pm
..-# Re: Python dictionary inspired hash table implementation Moritz Warning 02-Jul-2008 06:31 pm
...-# Re: Python dictionary inspired hash table implementation bearophile 02-Jul-2008 09:55 pm
....-# Re: Python dictionary inspired hash table implementation Jarrett Billingsley 02-Jul-2008 10:30 pm
....|\# Re: Python dictionary inspired hash table implementation Jarrett Billingsley 02-Jul-2008 10:31 pm
....-# Re: Python dictionary inspired hash table implementation Manfred_Nowak 03-Jul-2008 02:49 am
....|-# Re: Python dictionary inspired hash table implementation Moritz Warning 03-Jul-2008 06:26 am
....|.-# Re: Python dictionary inspired hash table implementation Manfred_Nowak 04-Jul-2008 03:15 am
....|..\# Re: Python dictionary inspired hash table implementation Moritz Warning 04-Jul-2008 02:09 pm
....\# Re: Python dictionary inspired hash table implementation Moritz Warning 03-Jul-2008 07:01 am