www.digitalmars.com         C & C++   DMDScript  

digitalmars.D - How to accelerate this program?

reply Li Jie <cpunion gmail.com> writes:
At first, thank all my helpers. :)


I have some email addresses in a text file, about 780000 lines. There are some
repeated lines, about 8%, and I want to remove them.

I write a D version:
=======================================
import std.stdio;
import std.string;
import std.stdio;
import std.string;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3) {
writefln("Wrong arguments");
return 1;
}

const int READ_SIZE = 1024;

FILE* fin = fopen(argv[1], "r");
FILE* fout = fopen(argv[2], "w");
char buffer[READ_SIZE];
int[char[]] emails;

PerformanceCounter counter = new PerformanceCounter();
counter.start();
while (!feof(fin)){
fgets(cast(char*)buffer, READ_SIZE, fin);
char[] email = toString(cast(char*)buffer);
if (!(email in emails)){
emails[toString(buffer)] = 0;
fputs(cast(char*)email, fout);
}
}

fclose(fout);
fclose(fin);
counter.stop();

writefln(counter.milliseconds());
return 0;
}
======================================

It used 1080 ms on my computer.

A optimized c++ version:
======================================
#include <iostream>
#include <string>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <vector>
using namespace std;


size_t my_hash (const char* str)
{
size_t ret = 0;
while (*str)
ret = 11 * ret + *str++;
return ret;
}

class Email
{
private:
size_t hash;
const char* mail;
friend bool operator < (const Email& lhs, const Email& rhs);
public:
Email (const char* mail_)
: hash(my_hash(mail_)), mail(mail_)
{
}

bool operator == (const Email& rhs)
{
if (hash == rhs.hash)
return strcmp(mail, rhs.mail) == 0;
return false;
}

const char* getEmail()const
{
return mail;
}
};

bool operator < (const Email& lhs, const Email& rhs)
{
if (lhs.hash == rhs.hash)
return strcmp(lhs.mail, rhs.mail) < 0;
return lhs.hash < rhs.hash;
}

int main(int argc, char** argv)
{
if (argc < 3)
{
cout << "Wrong arguments" << endl;
return 1;
}

FILE* fin = fopen(argv[1], "r");
if (!fin)
{
cout << "Invalid input file" << endl;
return 2;
}
FILE* fout = fopen(argv[2], "w");
if (!fout)
{
fclose(fin);
cout << "Invalid output file" << endl;
return 3;
}

timeval start, end;

const int BUF_SIZE = 20 * 1024 * 1024;
char* buffer = new char[BUF_SIZE];
memset(buffer, 0, BUF_SIZE);

gettimeofday(&start, 0);
vector<Email> emails;

size_t read = fread (buffer, BUF_SIZE, 1, fin);
char* begin = buffer;
char* current = buffer;

while (*current != '\0')
{
if (*current == '\n')
{
*current = '\0';
emails.push_back(begin);
begin = current + 1;
}
++ current;
}
fclose(fin);
sort(emails.begin(), emails.end());
emails.erase (unique( emails.begin(), emails.end() ), emails.end());

for (vector<Email>::const_iterator iter = emails.begin();
iter != emails.end();
iter ++)
{
fputs((*iter).getEmail(), fout);
fwrite("\n", 1, 1, fout);
}

fclose(fout);

gettimeofday(&end, 0);

printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
- start.tv_usec) / 1000));

delete[] buffer;

return 0;
}
=====================================================
It used 680 ms.

I modified the D version to:
====================================================
import std.stdio;
import std.string;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3) {
writefln("Wrong arguments");
return 1;
}

const int BUF_SIZE = 20 * 1024 * 1024;
char* buffer = new char[BUF_SIZE];

PerformanceCounter counter = new PerformanceCounter();
counter.start();

FILE* fin = fopen(argv[1], "r");
FILE* fout = fopen(argv[2], "w");

fread(buffer, BUF_SIZE, 1, fin);
fclose(fin);

int[char[]] emails;
char* begin = buffer;
char* current = begin;
char* newline = cast(char*)"\n";

counter.stop();
writefln(counter.milliseconds());

while (*current != '\0')
{
if (*current == '\n')
{
*current = '\0';
char[] email = toString(begin);
if (!(email in emails))
{
emails[email] = 0;
fputs (begin, fout);
fwrite(newline, 1, 1, fout);
}
begin = current + 1;
}
++ current;
}

fclose(fout);
counter.stop();

writefln(counter.milliseconds());

delete buffer;
return 0;
}
====================================================

But it used 3600 ms... :(


I don't know how to accelerate it in D. :(
Can you help me?
Apr 03 2006
next sibling parent John Demme <me teqdruid.com> writes:
Li Jie wrote:

 At first, thank all my helpers. :)
 
 
 I have some email addresses in a text file, about 780000 lines. There are
 some repeated lines, about 8%, and I want to remove them.
 
 I write a D version:
 =======================================
 import std.stdio;
 import std.string;
 import std.stdio;
 import std.string;
 import std.perf;
 
 int main(char[][] argv)
 {
 if (argv.length < 3) {
 writefln("Wrong arguments");
 return 1;
 }
 
 const int READ_SIZE = 1024;
 
 FILE* fin = fopen(argv[1], "r");
 FILE* fout = fopen(argv[2], "w");
 char buffer[READ_SIZE];
 int[char[]] emails;
 
 PerformanceCounter counter = new PerformanceCounter();
 counter.start();
 while (!feof(fin)){
 fgets(cast(char*)buffer, READ_SIZE, fin);
 char[] email = toString(cast(char*)buffer);
 if (!(email in emails)){
 emails[toString(buffer)] = 0;
 fputs(cast(char*)email, fout);
 }
 }
 
 fclose(fout);
 fclose(fin);
 counter.stop();
 
 writefln(counter.milliseconds());
 return 0;
 }
 ======================================
 
 It used 1080 ms on my computer.
 
 A optimized c++ version:
 ======================================
 #include <iostream>
 #include <string>
 #include <fstream>
 #include <iterator>
 #include <sys/time.h>
 #include <vector>
 using namespace std;
 
 
 size_t my_hash (const char* str)
 {
 size_t ret = 0;
 while (*str)
 ret = 11 * ret + *str++;
 return ret;
 }
 
 class Email
 {
 private:
 size_t hash;
 const char* mail;
 friend bool operator < (const Email& lhs, const Email& rhs);
 public:
 Email (const char* mail_)
 : hash(my_hash(mail_)), mail(mail_)
 {
 }
 
 bool operator == (const Email& rhs)
 {
 if (hash == rhs.hash)
 return strcmp(mail, rhs.mail) == 0;
 return false;
 }
 
 const char* getEmail()const
 {
 return mail;
 }
 };
 
 bool operator < (const Email& lhs, const Email& rhs)
 {
 if (lhs.hash == rhs.hash)
 return strcmp(lhs.mail, rhs.mail) < 0;
 return lhs.hash < rhs.hash;
 }
 
 int main(int argc, char** argv)
 {
 if (argc < 3)
 {
 cout << "Wrong arguments" << endl;
 return 1;
 }
 
 FILE* fin = fopen(argv[1], "r");
 if (!fin)
 {
 cout << "Invalid input file" << endl;
 return 2;
 }
 FILE* fout = fopen(argv[2], "w");
 if (!fout)
 {
 fclose(fin);
 cout << "Invalid output file" << endl;
 return 3;
 }
 
 timeval start, end;
 
 const int BUF_SIZE = 20 * 1024 * 1024;
 char* buffer = new char[BUF_SIZE];
 memset(buffer, 0, BUF_SIZE);
 
 gettimeofday(&start, 0);
 vector<Email> emails;
 
 size_t read = fread (buffer, BUF_SIZE, 1, fin);
 char* begin = buffer;
 char* current = buffer;
 
 while (*current != '\0')
 {
 if (*current == '\n')
 {
 *current = '\0';
 emails.push_back(begin);
 begin = current + 1;
 }
 ++ current;
 }
 fclose(fin);
 sort(emails.begin(), emails.end());
 emails.erase (unique( emails.begin(), emails.end() ), emails.end());
 
 for (vector<Email>::const_iterator iter = emails.begin();
 iter != emails.end();
 iter ++)
 {
 fputs((*iter).getEmail(), fout);
 fwrite("\n", 1, 1, fout);
 }
 
 fclose(fout);
 
 gettimeofday(&end, 0);
 
 printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 +
 (end.tv_usec - start.tv_usec) / 1000));
 
 delete[] buffer;
 
 return 0;
 }
 =====================================================
 It used 680 ms.
 
 I modified the D version to:
 ====================================================
 import std.stdio;
 import std.string;
 import std.perf;
 
 int main(char[][] argv)
 {
 if (argv.length < 3) {
 writefln("Wrong arguments");
 return 1;
 }
 
 const int BUF_SIZE = 20 * 1024 * 1024;
 char* buffer = new char[BUF_SIZE];
 
 PerformanceCounter counter = new PerformanceCounter();
 counter.start();
 
 FILE* fin = fopen(argv[1], "r");
 FILE* fout = fopen(argv[2], "w");
 
 fread(buffer, BUF_SIZE, 1, fin);
 fclose(fin);
 
 int[char[]] emails;
 char* begin = buffer;
 char* current = begin;
 char* newline = cast(char*)"\n";
 
 counter.stop();
 writefln(counter.milliseconds());
 
 while (*current != '\0')
 {
 if (*current == '\n')
 {
 *current = '\0';
 char[] email = toString(begin);
 if (!(email in emails))
 {
 emails[email] = 0;
 fputs (begin, fout);
 fwrite(newline, 1, 1, fout);
 }
 begin = current + 1;
 }
 ++ current;
 }
 
 fclose(fout);
 counter.stop();
 
 writefln(counter.milliseconds());
 
 delete buffer;
 return 0;
 }
 ====================================================
 
 But it used 3600 ms... :(
 
 
 I don't know how to accelerate it in D. :(
 Can you help me?

I would recommend using the Mango library and it's textual iterators. See the example at: http://www.dsource.org/projects/mango/browser/trunk/example/lineio.d A few tweaks to that example should get you on your way. Also, when compiling, make sure to use the -O -inlinc -release flags. -O optimizes, -inline allows smaller functions to be inlined, and -release removes things like bound checking. I've found that using these flags improves performance significantly. ~John Demme
Apr 03 2006
prev sibling next sibling parent reply Wang Zhen <nehzgnaw gmail.com> writes:
Li Jie wrote:
 At first, thank all my helpers. :)
 
 
 I have some email addresses in a text file, about 780000 lines. There are some
 repeated lines, about 8%, and I want to remove them.
 
 I write a D version:
 =======================================
 import std.stdio;
 import std.string;
 import std.stdio;
 import std.string;
 import std.perf;
 
 int main(char[][] argv)
 {
 if (argv.length < 3) {
 writefln("Wrong arguments");
 return 1;
 }
 
 const int READ_SIZE = 1024;
 
 FILE* fin = fopen(argv[1], "r");
 FILE* fout = fopen(argv[2], "w");
 char buffer[READ_SIZE];
 int[char[]] emails;
 
 PerformanceCounter counter = new PerformanceCounter();
 counter.start();
 while (!feof(fin)){
 fgets(cast(char*)buffer, READ_SIZE, fin);
 char[] email = toString(cast(char*)buffer);
 if (!(email in emails)){
 emails[toString(buffer)] = 0;
 fputs(cast(char*)email, fout);
 }
 }
 
 fclose(fout);
 fclose(fin);
 counter.stop();
 
 writefln(counter.milliseconds());
 return 0;
 }
 ======================================
 
 It used 1080 ms on my computer.
 
 A optimized c++ version:
 ======================================
 #include <iostream>
 #include <string>
 #include <fstream>
 #include <iterator>
 #include <sys/time.h>
 #include <vector>
 using namespace std;
 
 
 size_t my_hash (const char* str)
 {
 size_t ret = 0;
 while (*str)
 ret = 11 * ret + *str++;
 return ret;
 }
 
 class Email
 {
 private:
 size_t hash;
 const char* mail;
 friend bool operator < (const Email& lhs, const Email& rhs);
 public:
 Email (const char* mail_)
 : hash(my_hash(mail_)), mail(mail_)
 {
 }
 
 bool operator == (const Email& rhs)
 {
 if (hash == rhs.hash)
 return strcmp(mail, rhs.mail) == 0;
 return false;
 }
 
 const char* getEmail()const
 {
 return mail;
 }
 };
 
 bool operator < (const Email& lhs, const Email& rhs)
 {
 if (lhs.hash == rhs.hash)
 return strcmp(lhs.mail, rhs.mail) < 0;
 return lhs.hash < rhs.hash;
 }
 
 int main(int argc, char** argv)
 {
 if (argc < 3)
 {
 cout << "Wrong arguments" << endl;
 return 1;
 }
 
 FILE* fin = fopen(argv[1], "r");
 if (!fin)
 {
 cout << "Invalid input file" << endl;
 return 2;
 }
 FILE* fout = fopen(argv[2], "w");
 if (!fout)
 {
 fclose(fin);
 cout << "Invalid output file" << endl;
 return 3;
 }
 
 timeval start, end;
 
 const int BUF_SIZE = 20 * 1024 * 1024;
 char* buffer = new char[BUF_SIZE];
 memset(buffer, 0, BUF_SIZE);
 
 gettimeofday(&start, 0);
 vector<Email> emails;
 
 size_t read = fread (buffer, BUF_SIZE, 1, fin);
 char* begin = buffer;
 char* current = buffer;
 
 while (*current != '\0')
 {
 if (*current == '\n')
 {
 *current = '\0';
 emails.push_back(begin);
 begin = current + 1;
 }
 ++ current;
 }
 fclose(fin);
 sort(emails.begin(), emails.end());
 emails.erase (unique( emails.begin(), emails.end() ), emails.end());
 
 for (vector<Email>::const_iterator iter = emails.begin();
 iter != emails.end();
 iter ++)
 {
 fputs((*iter).getEmail(), fout);
 fwrite("\n", 1, 1, fout);
 }
 
 fclose(fout);
 
 gettimeofday(&end, 0);
 
 printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
 - start.tv_usec) / 1000));
 
 delete[] buffer;
 
 return 0;
 }
 =====================================================
 It used 680 ms.
 
 I modified the D version to:
 ====================================================
 import std.stdio;
 import std.string;
 import std.perf;
 
 int main(char[][] argv)
 {
 if (argv.length < 3) {
 writefln("Wrong arguments");
 return 1;
 }
 
 const int BUF_SIZE = 20 * 1024 * 1024;
 char* buffer = new char[BUF_SIZE];
 
 PerformanceCounter counter = new PerformanceCounter();
 counter.start();
 
 FILE* fin = fopen(argv[1], "r");
 FILE* fout = fopen(argv[2], "w");
 
 fread(buffer, BUF_SIZE, 1, fin);
 fclose(fin);
 
 int[char[]] emails;
 char* begin = buffer;
 char* current = begin;
 char* newline = cast(char*)"\n";
 
 counter.stop();
 writefln(counter.milliseconds());
 
 while (*current != '\0')
 {
 if (*current == '\n')
 {
 *current = '\0';
 char[] email = toString(begin);
 if (!(email in emails))
 {
 emails[email] = 0;
 fputs (begin, fout);
 fwrite(newline, 1, 1, fout);
 }
 begin = current + 1;
 }
 ++ current;
 }
 
 fclose(fout);
 counter.stop();
 
 writefln(counter.milliseconds());
 
 delete buffer;
 return 0;
 }
 ====================================================
 
 But it used 3600 ms... :(
 
 
 I don't know how to accelerate it in D. :(
 Can you help me?
 
 

Two improvements based on your first D version: 0. Output in a separate loop. 1. Remove the "if(!(email in emails))" check. Code: while(!feof(fin)){ fgets(cast(char*)buffer, READ_SIZE, fin); emails[toString(buffer)] = 0; } foreach(char[] k, int v; emails) fputs(cast(char*)k, fout);
Apr 03 2006
next sibling parent reply Wang Zhen <nehzgnaw gmail.com> writes:
Wang Zhen wrote:
 
 
 Li Jie wrote:
 
 At first, thank all my helpers. :)


 I have some email addresses in a text file, about 780000 lines. There 
 are some
 repeated lines, about 8%, and I want to remove them.

 I write a D version:


BTW, a simple shell command "sort emails.txt | uniq > output.txt" can get the job done in a few seconds. I assume you went all the trouble writing a D version just for the fun of it. No?
Apr 03 2006
parent reply BCS <BCS_member pathlink.com> writes:
Wang Zhen wrote:
  <snip>
 
 BTW, a simple shell command "sort emails.txt | uniq > output.txt" can 
 get the job done in a few seconds. I assume you went all the trouble 
 writing a D version just for the fun of it. No?

or sort -u emails.txt
Apr 03 2006
parent Li Jie <cpunion gmail.com> writes:
In article <e0s9oi$1uih$1 digitaldaemon.com>, BCS says...
Wang Zhen wrote:
  <snip>
 
 BTW, a simple shell command "sort emails.txt | uniq > output.txt" can 
 get the job done in a few seconds. I assume you went all the trouble 
 writing a D version just for the fun of it. No?

or sort -u emails.txt

Thanks. I want to compare the speed of C++, D and python. The best python version only takes 1700 ms, a "stand" c++ version (used set<string>) takes 5000 ms, an optimized c++ version takes 680 ms. I thinks an optimized D version can faster, shorter and easier to write than the c++ version.
Apr 03 2006
prev sibling parent reply Li Jie <cpunion gmail.com> writes:
In article <e0rcej$10it$1 digitaldaemon.com>, Wang Zhen says...

Two improvements based on your first D version:
0. Output in a separate loop.
1. Remove the "if(!(email in emails))" check.

Code:

while(!feof(fin)){
     fgets(cast(char*)buffer, READ_SIZE, fin);
     emails[toString(buffer)] = 0;
}
foreach(char[] k, int v; emails)
     fputs(cast(char*)k, fout);

Thanks. It takes 1080 ms on my system, it's not fast enough. I think "cast(char*)(char[])" and "toString" called too much, and it's very slowly.
Apr 03 2006
parent Dave <Dave_member pathlink.com> writes:
In article <e0slt4$2cki$1 digitaldaemon.com>, Li Jie says...
In article <e0rcej$10it$1 digitaldaemon.com>, Wang Zhen says...

Two improvements based on your first D version:
0. Output in a separate loop.
1. Remove the "if(!(email in emails))" check.

Code:

while(!feof(fin)){
     fgets(cast(char*)buffer, READ_SIZE, fin);
     emails[toString(buffer)] = 0;
}
foreach(char[] k, int v; emails)
     fputs(cast(char*)k, fout);

Thanks. It takes 1080 ms on my system, it's not fast enough. I think "cast(char*)(char[])" and "toString" called too much, and it's very slowly.

That won't give the correct output because the buffer is overwritten with each fgets and a memcpy is *not* done somewhere in the background for the AA keys.
Apr 03 2006
prev sibling parent reply Dave <Dave_member pathlink.com> writes:
From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a
buffer for each readline. On my system with a file made of 440,000 lines of
(just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for
your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the
built-in AA's are pretty fast.

Let us know your results (you'll have a lot more AA inserts and email dups).
Note you have to dup the string when you insert into your AA.

Try this (dmd v0.150):

import std.stdio;
import std.stream;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3)
{
writefln("Wrong arguments");
return 1;
}

char[8192] bufr;
int[char[]] emails;
char[] email;

PerformanceCounter counter = new PerformanceCounter();
counter.start();

BufferedFile bsi = new BufferedFile(argv[1]);
BufferedFile bso = new BufferedFile(argv[2],FileMode.Out);
while(!bsi.eof)
{
email = bsi.readLine(bufr); // bufr is key to perf.
if (!(email in emails))
{
emails[email.dup] = 0; // Note .dup
}
}

foreach(key; emails.keys.sort)
{
bso.write(cast(ubyte[])key);
bso.write('\n');
}
bso.close;
bsi.close;

counter.stop();
writefln(counter.milliseconds());

return 0;
}

In article <e0qqb3$3gt$1 digitaldaemon.com>, Li Jie says...
At first, thank all my helpers. :)


I have some email addresses in a text file, about 780000 lines. There are some
repeated lines, about 8%, and I want to remove them.

I write a D version:
=======================================
import std.stdio;
import std.string;
import std.stdio;
import std.string;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3) {
writefln("Wrong arguments");
return 1;
}

const int READ_SIZE = 1024;

FILE* fin = fopen(argv[1], "r");
FILE* fout = fopen(argv[2], "w");
char buffer[READ_SIZE];
int[char[]] emails;

PerformanceCounter counter = new PerformanceCounter();
counter.start();
while (!feof(fin)){
fgets(cast(char*)buffer, READ_SIZE, fin);
char[] email = toString(cast(char*)buffer);
if (!(email in emails)){
emails[toString(buffer)] = 0;
fputs(cast(char*)email, fout);
}
}

fclose(fout);
fclose(fin);
counter.stop();

writefln(counter.milliseconds());
return 0;
}
======================================

It used 1080 ms on my computer.

A optimized c++ version:
======================================
#include <iostream>
#include <string>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <vector>
using namespace std;


size_t my_hash (const char* str)
{
size_t ret = 0;
while (*str)
ret = 11 * ret + *str++;
return ret;
}

class Email
{
private:
size_t hash;
const char* mail;
friend bool operator < (const Email& lhs, const Email& rhs);
public:
Email (const char* mail_)
: hash(my_hash(mail_)), mail(mail_)
{
}

bool operator == (const Email& rhs)
{
if (hash == rhs.hash)
return strcmp(mail, rhs.mail) == 0;
return false;
}

const char* getEmail()const
{
return mail;
}
};

bool operator < (const Email& lhs, const Email& rhs)
{
if (lhs.hash == rhs.hash)
return strcmp(lhs.mail, rhs.mail) < 0;
return lhs.hash < rhs.hash;
}

int main(int argc, char** argv)
{
if (argc < 3)
{
cout << "Wrong arguments" << endl;
return 1;
}

FILE* fin = fopen(argv[1], "r");
if (!fin)
{
cout << "Invalid input file" << endl;
return 2;
}
FILE* fout = fopen(argv[2], "w");
if (!fout)
{
fclose(fin);
cout << "Invalid output file" << endl;
return 3;
}

timeval start, end;

const int BUF_SIZE = 20 * 1024 * 1024;
char* buffer = new char[BUF_SIZE];
memset(buffer, 0, BUF_SIZE);

gettimeofday(&start, 0);
vector<Email> emails;

size_t read = fread (buffer, BUF_SIZE, 1, fin);
char* begin = buffer;
char* current = buffer;

while (*current != '\0')
{
if (*current == '\n')
{
*current = '\0';
emails.push_back(begin);
begin = current + 1;
}
++ current;
}
fclose(fin);
sort(emails.begin(), emails.end());
emails.erase (unique( emails.begin(), emails.end() ), emails.end());

for (vector<Email>::const_iterator iter = emails.begin();
iter != emails.end();
iter ++)
{
fputs((*iter).getEmail(), fout);
fwrite("\n", 1, 1, fout);
}

fclose(fout);

gettimeofday(&end, 0);

printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
- start.tv_usec) / 1000));

delete[] buffer;

return 0;
}
=====================================================
It used 680 ms.

I modified the D version to:
====================================================
import std.stdio;
import std.string;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3) {
writefln("Wrong arguments");
return 1;
}

const int BUF_SIZE = 20 * 1024 * 1024;
char* buffer = new char[BUF_SIZE];

PerformanceCounter counter = new PerformanceCounter();
counter.start();

FILE* fin = fopen(argv[1], "r");
FILE* fout = fopen(argv[2], "w");

fread(buffer, BUF_SIZE, 1, fin);
fclose(fin);

int[char[]] emails;
char* begin = buffer;
char* current = begin;
char* newline = cast(char*)"\n";

counter.stop();
writefln(counter.milliseconds());

while (*current != '\0')
{
if (*current == '\n')
{
*current = '\0';
char[] email = toString(begin);
if (!(email in emails))
{
emails[email] = 0;
fputs (begin, fout);
fwrite(newline, 1, 1, fout);
}
begin = current + 1;
}
++ current;
}

fclose(fout);
counter.stop();

writefln(counter.milliseconds());

delete buffer;
return 0;
}
====================================================

But it used 3600 ms... :(


I don't know how to accelerate it in D. :(
Can you help me?

Apr 03 2006
next sibling parent reply Li Jie <cpunion gmail.com> writes:
In article <e0re95$12mg$1 digitaldaemon.com>, Dave says...
From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a
buffer for each readline. On my system with a file made of 440,000 lines of
(just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for
your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the
built-in AA's are pretty fast.

Let us know your results (you'll have a lot more AA inserts and email dups).
Note you have to dup the string when you insert into your AA.

Try this (dmd v0.150):

Thanks Dave. But it takes ~3400 ms on my system, on windows and gentoo.
Apr 03 2006
parent reply Dave <Dave_member pathlink.com> writes:
In article <e0sltd$2cl7$1 digitaldaemon.com>, Li Jie says...
In article <e0re95$12mg$1 digitaldaemon.com>, Dave says...
From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a
buffer for each readline. On my system with a file made of 440,000 lines of
(just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for
your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the
built-in AA's are pretty fast.

Let us know your results (you'll have a lot more AA inserts and email dups).
Note you have to dup the string when you insert into your AA.

Try this (dmd v0.150):

Thanks Dave. But it takes ~3400 ms on my system, on windows and gentoo.

There's something screwy going on there... Even if I .dup every line of a 880000 line file with average size e-mails in it, I'm still getting 2x better performance than your C++ version, more in line with this: http://shootout.alioth.debian.org/gp4/benchmark.php?test=knucleotide&lang=all which has similiar operations as your test (I/O and hash table update). I'm using g++ v4.02 and -O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4. Please send: - the output from 'dmd -v' - output from 'll /usr/lib/libphobos.a'. - what type of system is 'your system' - how large is the email file in bytes - which c++ compiler and flags are you using? - dmd compiler flags for your test. There are other ways to make the D code faster but a little less straight-forward. See the above benchmark site for those. Thanks.
Apr 03 2006
parent reply Li Jie <cpunion gmail.com> writes:
In article <e0ssod$2q8b$1 digitaldaemon.com>, Dave says...
Please send:

- the output from 'dmd -v' 

Digital Mars D Compiler v0.150 Copyright (c) 1999-2006 by Digital Mars written by Walter Bright Documentation: www.digitalmars.com/d/index.html Usage: dmd files.d ... { -switch } ....
- output from 'll /usr/lib/libphobos.a'.

-rw-r--r-- 1 root root 1157824 4月 4 13:05 /usr/lib/libphobos.a
- what type of system is 'your system'

Linux kernel version is 2.6.15 g++ version is 3.4.4 dmd version is 0.150
- how large is the email file in bytes

-rw-r--r-- 1 root root 19388869 4月 3 09:27 email.txt
- which c++ compiler and flags are you using?

And change to -O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4, it takes ~630 ms.
- dmd compiler flags for your test.

some tests: ======================================= lijie t # g++ -o test7 test7.cpp -O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4 lijie t # ./test7 email.txt email-new.txt Total used: 637 ms. lijie t # ./test7 email.txt email-new.txt Total used: 628 ms. lijie t # ./test7 email.txt email-new.txt Total used: 629 ms. lijie t # ./test7 email.txt email-new.txt Total used: 639 ms. lijie t # ./test7 email.txt email-new.txt Total used: 627 ms. lijie t # dmd dave_test.d -O -release -inline gcc dave_test.o -o dave_test -lphobos -lpthread -lm lijie t # ./dave_test email.txt email-new.txt 3360 lijie t # ./dave_test email.txt email-new.txt 3315 lijie t # ./dave_test email.txt email-new.txt 3344 lijie t # ./dave_test email.txt email-new.txt 3333 lijie t # ./dave_test email.txt email-new.txt 3398 Thanks.
Apr 03 2006
next sibling parent "Unknown W. Brackets" <unknown simplemachines.org> writes:
DMD 0.150 had some inlining bugs.  Try upgrading to 0.151.

Thanks,
-[Unknown]


 In article <e0ssod$2q8b$1 digitaldaemon.com>, Dave says...
 Please send:

 - the output from 'dmd -v' 

Digital Mars D Compiler v0.150 Copyright (c) 1999-2006 by Digital Mars written by Walter Bright Documentation: www.digitalmars.com/d/index.html Usage: dmd files.d ... { -switch } ....
 - output from 'll /usr/lib/libphobos.a'.

-rw-r--r-- 1 root root 1157824 4月 4 13:05 /usr/lib/libphobos.a
 - what type of system is 'your system'

Linux kernel version is 2.6.15 g++ version is 3.4.4 dmd version is 0.150
 - how large is the email file in bytes

-rw-r--r-- 1 root root 19388869 4月 3 09:27 email.txt
 - which c++ compiler and flags are you using?

And change to -O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4, it takes ~630 ms.
 - dmd compiler flags for your test.

some tests: ======================================= lijie t # g++ -o test7 test7.cpp -O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4 lijie t # ./test7 email.txt email-new.txt Total used: 637 ms. lijie t # ./test7 email.txt email-new.txt Total used: 628 ms. lijie t # ./test7 email.txt email-new.txt Total used: 629 ms. lijie t # ./test7 email.txt email-new.txt Total used: 639 ms. lijie t # ./test7 email.txt email-new.txt Total used: 627 ms. lijie t # dmd dave_test.d -O -release -inline gcc dave_test.o -o dave_test -lphobos -lpthread -lm lijie t # ./dave_test email.txt email-new.txt 3360 lijie t # ./dave_test email.txt email-new.txt 3315 lijie t # ./dave_test email.txt email-new.txt 3344 lijie t # ./dave_test email.txt email-new.txt 3333 lijie t # ./dave_test email.txt email-new.txt 3398 Thanks.

Apr 04 2006
prev sibling parent Dave <Dave_member pathlink.com> writes:
In article <e0svke$2tok$1 digitaldaemon.com>, Li Jie says...
In article <e0ssod$2q8b$1 digitaldaemon.com>, Dave says...
Please send:


Something was screwy - my test data! <g> My test file wasn't even close to what it should've been - the bottleneck is in the AA operations. I apologize for any wasted time... - Dave
Apr 04 2006
prev sibling parent reply Lionello Lunesu <lio remove.lunesu.com> writes:
Dave wrote:
 From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a
 buffer for each readline. 

C's FILE and fopen also create a buffered stream, so I doubt that will make a difference. But why the sort?? Try this instead: #import std.stdio; #import std.perf; #import std.stream; # #int main(char[][] argv) #{ # if (argv.length < 3) # { # writefln("Wrong arguments"); # return 1; # } # # char[8192] bufr; # int[char[]] emails; # char[] email; # # PerformanceCounter counter = new PerformanceCounter(); # counter.start(); # # BufferedFile bsi = new BufferedFile(argv[1]); # BufferedFile bso = new BufferedFile(argv[2],FileMode.Out); # while(!bsi.eof) # { # email = bsi.readLine(bufr); // bufr is key to perf. # if (!(email in emails)) # { # emails[email.dup] = 0; // Note .dup # bso.writeLine(email); # } # } # bso.close; # bsi.close; # # counter.stop(); # writefln(counter.milliseconds()); # # return 0; #}
Apr 04 2006
next sibling parent reply Dave <Dave_member pathlink.com> writes:
In article <e0tkil$kkn$1 digitaldaemon.com>, Lionello Lunesu says...
Dave wrote:
 From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a
 buffer for each readline. 

C's FILE and fopen also create a buffered stream, so I doubt that will make a difference.

Yea, but it does make about a 15% difference (with readLine(bufr) faster). I/O is definately not the bottleneck though - my test data was screwed up in that it was only doing 4 AA inserts, which is the actual bottleneck.
But why the sort?? Try this instead:

You're right that is not needed. I saw the sort in the C++ version (but didn't look closely enough at what it was doing).
#import std.stdio;
#import std.perf;
#import std.stream;
#
#int main(char[][] argv)
#{
#	if (argv.length < 3)
#	{
#		writefln("Wrong arguments");
#		return 1;
#	}
#
#	char[8192] bufr;
#	int[char[]] emails;
#	char[] email;
#
#	PerformanceCounter counter = new PerformanceCounter();
#	counter.start();
#
#	BufferedFile bsi = new BufferedFile(argv[1]);
#	BufferedFile bso = new BufferedFile(argv[2],FileMode.Out);
#	while(!bsi.eof)
#	{
#		email = bsi.readLine(bufr); // bufr is key to perf.
#		if (!(email in emails))
#		{
#			emails[email.dup] = 0; // Note .dup
#			bso.writeLine(email);
#		}
#	}
#	bso.close;
#	bsi.close;
#
#	counter.stop();
#	writefln(counter.milliseconds());
#
#	return 0;
#}

Apr 04 2006
parent Georg Wrede <georg.wrede nospam.org> writes:
Dave wrote:
 Lionello Lunesu says...
 Dave wrote:
 
 From what I've seen, the bottleneck is probably in I/O. Use
 BufferedFile and a buffer for each readline.

C's FILE and fopen also create a buffered stream, so I doubt that will make a difference.

Yea, but it does make about a 15% difference (with readLine(bufr) faster). I/O is definately not the bottleneck though - my test data was screwed up in that it was only doing 4 AA inserts, which is the actual bottleneck.

If I'd do this program, the first thing I'd look for is the "sector size" of the media from which the file is read. Then I'd have the buffer that size.
Apr 04 2006
prev sibling parent Li Jie <cpunion gmail.com> writes:
In article <e0tkil$kkn$1 digitaldaemon.com>, Lionello Lunesu says...
Dave wrote:
 From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a
 buffer for each readline. 

C's FILE and fopen also create a buffered stream, so I doubt that will make a difference. But why the sort?? Try this instead:

Thanks. It takes ~1800ms on my Gentoo, and ~2000ms on my Windows XP.
Apr 04 2006