www.digitalmars.com         C & C++   DMDScript  

digitalmars.D - Re: [OT] The horizon of a stream

bearophile Wrote:
 There's a third possible solution, that is very fast: assuming the hash values
are uint (32 bit), then you can create a bitfield of 2^32 bits, it requires
just 1/2 GB, that is 1/4 of my RAM. So each bit represents a possible different
value of the hash.

If the file is "small" my first implementation is good enough (small means the associative array has to fit in for example 2 GB). For larger files here's the implementation of the third idea, with D1, Phobos + my libs: import d.all, std.string, std.c.stdlib, std.outofmemory; const uint bpc = uint.sizeof * 8; bool isFlagSet(uint* flags, uint i) { int offset = i / bpc; uint mask = 1 << (i % bpc); return (flags[offset] & mask) != 0; } void setFlag(uint* flags, uint i) { int offset = i / bpc; uint mask = 1 << (i % bpc); if ((flags[offset] & mask) == 0) flags[offset] |= mask; } void main() { uint* seen = cast(uint*)calloc(1 << 29, 1); uint* seen_again = cast(uint*)calloc(1 << 29, 1); if (seen is null || seen_again is null) throw new OutOfMemoryException(); foreach (line; xfile("data.txt")) { uint h = hash(line.chomp()); if (isFlagSet(seen, h)) // not used std.intrinsics, because they segfault here setFlag(seen_again, h); else setFlag(seen, h); } free(seen); int[string] aa; int horizon; foreach (nline, line; xfile("data.txt")) { auto clean_line = line.chomp(); if (isFlagSet(seen_again, hash(clean_line))) { auto pos_ptr = clean_line in aa; if (pos_ptr) { if ((nline - *pos_ptr) > horizon) horizon = nline - *pos_ptr; } else aa[clean_line.dup] = nline; } } putr(horizon); } With D1, just Phobos: import std.stdio, std.string, std.c.stdlib, std.stream, std.outofmemory; const uint bpc = uint.sizeof * 8; bool isFlagSet(uint* flags, uint i) { int offset = i / bpc; uint mask = 1 << (i % bpc); return (flags[offset] & mask) != 0; } void setFlag(uint* flags, uint i) { int offset = i / bpc; uint mask = 1 << (i % bpc); if ((flags[offset] & mask) == 0) flags[offset] |= mask; } void main() { uint* seen = cast(uint*)calloc(1 << 29, 1); uint* seen_again = cast(uint*)calloc(1 << 29, 1); if (seen is null || seen_again is null) throw new OutOfMemoryException(); foreach (string line; new BufferedFile("data.txt")) { auto clean_line = line.chomp(); uint h = typeid(string).getHash(&clean_line); if (isFlagSet(seen, h)) // not used std.intrinsics, because they segfault here setFlag(seen_again, h); else setFlag(seen, h); } free(seen); int[string] aa; int horizon, nline; foreach (string line; new BufferedFile("data.txt")) { auto clean_line = line.chomp(); if (isFlagSet(seen_again, typeid(string).getHash(&clean_line))) { auto pos_ptr = clean_line in aa; if (pos_ptr) { if ((nline - *pos_ptr) > horizon) horizon = nline - *pos_ptr; } else aa[clean_line.dup] = nline; } nline++; } writefln(horizon); } The worst case is like this: A B C D E E D C B A In this case the aa associative array has to store half of the lines of the file, and it be too much. As you can see with this little program I have found another bug in Phobos, the intrinsics in this case don't work (and I have improved the Record of my libs too, used in another version of this code). Bye, bearophile
Oct 24 2008