www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.announce - Shuffle

reply Walter Bright <newshound1 digitalmars.com> writes:
For fun, I ordered a new car stereo that would play music from an SD 
card or USB stick(rather than from CD), and installed it over the 
weekend. The problem is loading songs onto the SD card from my home 
music server, which I like to hear played at random.

The solution is a simple D program, shuffle, which will randomly copy 
music files to an SD card until it fills up. Have some fun with it!
----------------------------

/* Program to randomly copy music files from source to destination device.
  * Written in the D programming language.
  * Written by Walter Bright, http://www.digitalmars.com
  * Placed into the Public Domain.
  */


import std.file;
import std.stdio;
import std.string;
import std.c.stdlib;
import std.path;
import std.random;

int main(string[] args)
{
     if (args.length != 3)
     {	writefln("Usage: shuffle fromdir todir");
	exit(1);
     }
     auto fromdir = args[1];
     auto todir = args[2];

     /* Recursively search for all the mp3 and wma files in directory 
fromdir
      * and put them into files[]
      */
     string[] files;
     bool callback(DirEntry *de)
     {
	if (de.isdir)
	    listdir(de.name, &callback); // recurse into subdirectories
	else
	{
	    // Collect only files with mp3 and wma extensions
	    auto ext = getExt(de.name);
	    if (fnmatch(ext, "mp3") || fnmatch(ext, "wma"))
		files ~= de.name;
	}
	return true;	// keep going
     }
     std.file.listdir(fromdir, &callback);

     writefln(files.length, " music files");

     /* The loop will normally quit via an exception when the target device
      * is full. But if there are not enough files in the source to fill
      * up the target, the loop ensures it will still eventually quit.
      */
     for (size_t i = 0; i < files.length; i++)
     {
	auto j = std.random.rand() % files.length;
	auto fromfile = files[j];
	auto tofile = std.path.join(todir, basename(fromfile));
	writefln("%s => %s", fromfile, tofile);
	std.file.copy(fromfile, tofile);
     }

     writefln("Done");
     return 0;
}
Jan 24 2008
next sibling parent reply John Reimer <terminal.node gmail.com> writes:
Walter Bright wrote:
 For fun, I ordered a new car stereo that would play music from an SD 
 card or USB stick(rather than from CD), and installed it over the 
 weekend. The problem is loading songs onto the SD card from my home 
 music server, which I like to hear played at random.
 
 The solution is a simple D program, shuffle, which will randomly copy 
 music files to an SD card until it fills up. Have some fun with it!
 ----------------------------
 
 /* Program to randomly copy music files from source to destination device.
  * Written in the D programming language.
  * Written by Walter Bright, http://www.digitalmars.com
  * Placed into the Public Domain.
  */
 
 
 import std.file;
 import std.stdio;
 import std.string;
 import std.c.stdlib;
 import std.path;
 import std.random;
 
 int main(string[] args)
 {
     if (args.length != 3)
     {    writefln("Usage: shuffle fromdir todir");
     exit(1);
     }
     auto fromdir = args[1];
     auto todir = args[2];
 
     /* Recursively search for all the mp3 and wma files in directory 
 fromdir
      * and put them into files[]
      */
     string[] files;
     bool callback(DirEntry *de)
     {
     if (de.isdir)
         listdir(de.name, &callback); // recurse into subdirectories
     else
     {
         // Collect only files with mp3 and wma extensions
         auto ext = getExt(de.name);
         if (fnmatch(ext, "mp3") || fnmatch(ext, "wma"))
         files ~= de.name;
     }
     return true;    // keep going
     }
     std.file.listdir(fromdir, &callback);
 
     writefln(files.length, " music files");
 
     /* The loop will normally quit via an exception when the target device
      * is full. But if there are not enough files in the source to fill
      * up the target, the loop ensures it will still eventually quit.
      */
     for (size_t i = 0; i < files.length; i++)
     {
     auto j = std.random.rand() % files.length;
     auto fromfile = files[j];
     auto tofile = std.path.join(todir, basename(fromfile));
     writefln("%s => %s", fromfile, tofile);
     std.file.copy(fromfile, tofile);
     }
 
     writefln("Done");
     return 0;
 }

And what would be the Tango equivalent, I wonder? ;)
Jan 24 2008
next sibling parent reply Walter Bright <newshound1 digitalmars.com> writes:
John Reimer wrote:
 And what would be the Tango equivalent, I wonder?  ;)

What I'd like to do is collect a bunch of simple "cookbook" utilities like this that fit nicely into a single file, which can be put on the website to help people get started with D. I have several floating around my hard disk.
Jan 24 2008
next sibling parent John Reimer <terminal.node gmail.com> writes:
Walter Bright wrote:
 John Reimer wrote:
 And what would be the Tango equivalent, I wonder?  ;)

What I'd like to do is collect a bunch of simple "cookbook" utilities like this that fit nicely into a single file, which can be put on the website to help people get started with D. I have several floating around my hard disk.

It's a great idea. These sort of snippets are probably practical learning tools for those interested in the language.... might turn out to be a good compilation for a d book someday, too :) (The book "Practical Common Lisp" comes to mind as an example of this style of teaching a language). -JJR
Jan 24 2008
prev sibling parent "Saaa" <empty needmail.com> writes:
Yes please :)

 What I'd like to do is collect a bunch of simple "cookbook" utilities like 
 this that fit nicely into a single file, which can be put on the website 
 to help people get started with D. I have several floating around my hard 
 disk. 

Jan 25 2008
prev sibling next sibling parent Lars Noschinski <lars-2006-1 usenet.noschinski.de> writes:
* John Reimer <terminal.node gmail.com> [08-01-25 06:03]:
Walter Bright wrote:
For fun, I ordered a new car stereo that would play music from an SD card or 
USB stick(rather than from CD), and installed it over the weekend. The problem 
is loading songs onto the SD card from my home music server, which I like to 
hear played at random.
The solution is a simple D program, shuffle, which will randomly copy music 
files to an SD card until it fills up. Have some fun with it!


And what would be the Tango equivalent, I wonder?  ;)

I don't know about Tango, but this is the solution with GNU coreutils - lame, I know ;) find -type f -a \( -name '*.mp3' -o -name '*.wma' \) -print0 | \ shuf -z | \ xargs -0 -i cp {} SDCARD_PATH
Jan 25 2008
prev sibling parent Lars Ivar Igesund <larsivar igesund.net> writes:
John Reimer wrote:
 
 And what would be the Tango equivalent, I wonder?  ;)

(see http://pastie.caboo.se/143887 for a syntax highlighted edition) import tango.io.File, tango.io.Stdout, tango.io.FilePath, tango.io.FileScan; import Array = tango.core.Array; int main (char[][] args) { if (args.length != 3) { Stdout.formatln ("usage: shuffle fromdir todir"); return 1; } // set destination path, and add trailing separator as necessary auto dst = new FilePath; dst.path = args[2]; // recursively search for all the mp3, wma and w4a files in specified directory auto songs = new FileScan; songs (args[1], (FilePath fp, bool isDir) {return isDir || fp.suffix == ".mp3" || fp.suffix == ".wma"|| fp.suffix == ".m4a";}); // shuffle the files Stdout.formatln ("{} music files", songs.files.length); Array.shuffle (songs.files); // sequentially fill the target until done, or it quits with an // exception when the device is full foreach (song; songs.files) dst.file(song.file).copy(song); Stdout.formatln ("Done"); return 0; } -- Lars Ivar Igesund blog at http://larsivi.net DSource, #d.tango & #D: larsivi Dancing the Tango
Jan 26 2008
prev sibling next sibling parent reply eao197 <eao197 intervale.ru> writes:
On Fri, 25 Jan 2008 07:35:04 +0300, Walter Bright  =

<newshound1 digitalmars.com> wrote:

 The solution is a simple D program, shuffle, which will randomly copy =

 music files to an SD card until it fills up. Have some fun with it!

I think than the code could be yet more compact, simple and script-like = if = std.file (or std.path) module will contain something like Dir.glob from = = Ruby (http://www.ruby-doc.org/core/classes/Dir.html#M002347)
 /* Program to randomly copy music files from source to destination  =

 device.
   * Written in the D programming language.
   * Written by Walter Bright, http://www.digitalmars.com
   * Placed into the Public Domain.
   */


 import std.file;
 import std.stdio;
 import std.string;
 import std.c.stdlib;
 import std.path;
 import std.random;

 int main(string[] args)
 {
      if (args.length !=3D 3)
      {	writefln("Usage: shuffle fromdir todir");
 	exit(1);
      }
      auto fromdir =3D args[1];
      auto todir =3D args[2];

      /* Recursively search for all the mp3 and wma files in directory =

 fromdir
       * and put them into files[]
       */

wma}" = ) );
      writefln(files.length, " music files");

      /* The loop will normally quit via an exception when the target  =

 device
       * is full. But if there are not enough files in the source to fi=

       * up the target, the loop ensures it will still eventually quit.=

       */
      for (size_t i =3D 0; i < files.length; i++)
      {
 	auto j =3D std.random.rand() % files.length;
 	auto fromfile =3D files[j];
 	auto tofile =3D std.path.join(todir, basename(fromfile));
 	writefln("%s =3D> %s", fromfile, tofile);
 	std.file.copy(fromfile, tofile);
      }

      writefln("Done");
      return 0;
 }

-- = Regards, Yauheni Akhotnikau
Jan 24 2008
next sibling parent Walter Bright <newshound1 digitalmars.com> writes:
eao197 wrote:
 I think than the code could be yet more compact, simple and script-like 
 if std.file (or std.path) module will contain something like Dir.glob 
 from Ruby (http://www.ruby-doc.org/core/classes/Dir.html#M002347)

I agree.
Jan 24 2008
prev sibling parent Brad Roberts <braddr puremagic.com> writes:
Walter Bright wrote:
 eao197 wrote:
 I think than the code could be yet more compact, simple and 
 script-like if std.file (or std.path) module will contain something 
 like Dir.glob from Ruby 
 (http://www.ruby-doc.org/core/classes/Dir.html#M002347)

I agree.

Part of it already exists in 2.0, in the std.file.dirEntries and .DirIterator code. It needs to take a delegate or other filter mechanism to gain the rest of the power. But as is it'd simply the example of a bit of it (untested). string[] files; foreach (DirEntry de; dirEntries(fromdir, SpanMode.depth)) { // Collect only files with mp3 and wma extensions auto ext = getExt(de.name); if (fnmatch(ext, "mp3") || fnmatch(ext, "wma")) files ~= de.name; } Later, Brad
Jan 24 2008
prev sibling next sibling parent reply Roberto Mariottini <rmariottini mail.com> writes:
Walter Bright wrote:
[...]
     for (size_t i = 0; i < files.length; i++)
     {
     auto j = std.random.rand() % files.length;
     auto fromfile = files[j];
     auto tofile = std.path.join(todir, basename(fromfile));
     writefln("%s => %s", fromfile, tofile);
     std.file.copy(fromfile, tofile);
     }

Here you are copying duplicate files over and over. You don't see duplicates because a file copy will overwrite the previously written file, but if you have enough space on the destination device you'll get less files on the destination than those on the source, because duplicates will overwrite. I've found players that implement the shuffle function this way: the pseudo random generator always "prefers" a certain subset of the songs and will play them frequently, while other songs are rarely selected (if ever). A real "shuffle" function should generate the same list of songs that the source contains, changing only the order. Ciao -- Roberto Mariottini, http://www.mariottini.net/roberto/ SuperbCalc, a free tape calculator: http://www.mariottini.net/roberto/superbcalc/
Jan 25 2008
next sibling parent reply Walter Bright <newshound1 digitalmars.com> writes:
Roberto Mariottini wrote:
 Here you are copying duplicate files over and over. You don't see 
 duplicates because a file copy will overwrite the previously written 
 file, but if you have enough space on the destination device you'll get 
 less files on the destination than those on the source, because 
 duplicates will overwrite.

True, the algorithm kinda sucks, but the program was created where I have way, way more songs on the server than will fit on the SD card. So even if it copies the same file multiple times, it doesn't affect the result (except that it takes longer). My defense is I only spent a few minutes writing it <g>.
 I've found players that implement the shuffle function this way: the 
 pseudo random generator always "prefers" a certain subset of the songs 
 and will play them frequently, while other songs are rarely selected (if 
 ever).
 A real "shuffle" function should generate the same list of songs that 
 the source contains, changing only the order.

I agree, a much better algorithm would be to shuffle the files[] array, then sequentially write the shuffled result.
Jan 25 2008
parent Sean Kelly <sean f4.ca> writes:
Walter Bright wrote:
 Roberto Mariottini wrote:
 I've found players that implement the shuffle function this way: the
 pseudo random generator always "prefers" a certain subset of the songs
 and will play them frequently, while other songs are rarely selected
 (if ever).
 A real "shuffle" function should generate the same list of songs that
 the source contains, changing only the order.

I agree, a much better algorithm would be to shuffle the files[] array, then sequentially write the shuffled result.

Strangely, I added a shuffle routine to tango.core.Array just yesterday. Sean
Jan 25 2008
prev sibling next sibling parent reply Walter Bright <newshound1 digitalmars.com> writes:
Roberto Mariottini wrote:
 A real "shuffle" function should generate the same list of songs that 
 the source contains, changing only the order.

Here's a revised one: /* Program to randomly copy music files from source to destination device. * Written in the D programming language. * Written by Walter Bright, http://www.digitalmars.com * Placed into the Public Domain. */ import std.file; import std.stdio; import std.string; import std.c.stdlib; import std.path; import std.random; int main(string[] args) { if (args.length != 3) { writefln("Usage: shuffle fromdir todir"); exit(1); } auto fromdir = args[1]; auto todir = args[2]; /* Recursively search for all the mp3 and wma files in directory fromdir * and put them into files[] */ string[] files; bool callback(DirEntry *de) { if (de.isdir) listdir(de.name, &callback); // recurse into subdirectories else { // Collect only files with mp3 and wma extensions auto ext = getExt(de.name); if (fnmatch(ext, "mp3") || fnmatch(ext, "wma")) files ~= de.name; } return true; // keep going } std.file.listdir(fromdir, &callback); writefln(files.length, " music files"); /* Shuffle the files[] array */ for (size_t i = 0; i < files.length; i++) { auto j = std.random.rand() % files.length; // swap [i] and [j] auto tmp = files[i]; files[i] = files[j]; files[j] = tmp; } /* Sequentially fill the target until done or it quits with an * exception when the device is full. */ foreach (fromfile; files) { auto tofile = std.path.join(todir, basename(fromfile)); writefln("%s => %s", fromfile, tofile); std.file.copy(fromfile, tofile); } writefln("Done"); return 0; }
Jan 25 2008
next sibling parent reply Oskar Linde <oskar.lindeREM OVEgmail.com> writes:
Walter Bright wrote:

     /* Shuffle the files[] array
      */

To be nitpicky, your algorithm will not give a perfect shuffling. Consider shuffling the sequence 012. The algorithm has 3^3 = 27 different ways it can shuffle the sequence, and there exists 3! = 6 different permutations of the sequence. Since 27 isn't divisible by 6, the algorithm will give a slight bias towards some end sequences. (021,102 and 120 in this case). A quick fix is to change the first line of the loop body:
     for (size_t i = 0; i < files.length; i++)
     {
     auto j = std.random.rand() % files.length;

auto j = std.random.rand() % (files.length - i) + i;
     // swap [i] and [j]
     auto tmp = files[i];
     files[i] = files[j];
     files[j] = tmp;
     }

(the last iteration will be a nop, so the loop condition could be changed into i < files.length-1) Regards, -- Oskar
Jan 25 2008
next sibling parent reply Bill Baxter <dnewsgroup billbaxter.com> writes:
Oskar Linde wrote:
 Walter Bright wrote:
 
     /* Shuffle the files[] array
      */

To be nitpicky, your algorithm will not give a perfect shuffling. Consider shuffling the sequence 012. The algorithm has 3^3 = 27 different ways it can shuffle the sequence, and there exists 3! = 6 different permutations of the sequence. Since 27 isn't divisible by 6, the algorithm will give a slight bias towards some end sequences. (021,102 and 120 in this case). A quick fix is to change the first line of the loop body:
     for (size_t i = 0; i < files.length; i++)
     {
     auto j = std.random.rand() % files.length;

auto j = std.random.rand() % (files.length - i) + i;
     // swap [i] and [j]
     auto tmp = files[i];
     files[i] = files[j];
     files[j] = tmp;
     }

(the last iteration will be a nop, so the loop condition could be changed into i < files.length-1)

Yep, this is a classic problem from homeworks sets for algorithms classes. How to make a random permutation of N elements in O(N) time and O(1) additional space such that every element is equally likely to end up in any slot. And the answer is what Oskar said. Walter, your MechE background is showing. :-) --bb
Jan 25 2008
parent reply Walter Bright <newshound1 digitalmars.com> writes:
Bill Baxter wrote:
 Yep, this is a classic problem from homeworks sets for algorithms 
 classes.  How to make a random permutation of N elements in O(N) time 
 and O(1) additional space such that every element is equally likely to 
 end up in any slot.  And the answer is what Oskar said.    Walter, your 
 MechE background is showing. :-)

I remember reading a criticism of the shuffle/swap algorithm and the fix a while back, but don't remember where I read it or what the fix was. I'm glad Oskar has set it straight here. Although this is just a dumb program, because it's going into a 'cookbook' that others may use for more serious applications, the algorithms should be correct. Kudos to Oskar.
Jan 25 2008
parent Walter Bright <newshound1 digitalmars.com> writes:
Anders Bergh wrote:
 Might have been that?

Yes, I think so!
Jan 26 2008
prev sibling next sibling parent reply bearophile <bearophileHUGS lycos.com> writes:
Oskar Linde:
 To be nitpicky, your algorithm will not give a perfect shuffling. 
 Consider shuffling the sequence 012. The algorithm has 3^3 = 27 
 different ways it can shuffle the sequence, and there exists 3! = 6 
 different permutations of the sequence. Since 27 isn't divisible by 6, 
 the algorithm will give a slight bias towards some end sequences.

Thank you Oskar for showing why a shuffle() function is necessary in std.random, among other things. You can find one in my d libs, based on Knuth simple algorithm ;-) Bye, bearophile
Jan 25 2008
parent Lars Ivar Igesund <larsivar igesund.net> writes:
bearophile wrote:

 Oskar Linde:
 To be nitpicky, your algorithm will not give a perfect shuffling.
 Consider shuffling the sequence 012. The algorithm has 3^3 = 27
 different ways it can shuffle the sequence, and there exists 3! = 6
 different permutations of the sequence. Since 27 isn't divisible by 6,
 the algorithm will give a slight bias towards some end sequences.

Thank you Oskar for showing why a shuffle() function is necessary in std.random, among other things. You can find one in my d libs, based on Knuth simple algorithm ;-) Bye, bearophile

FWIW, tango.core.Array gained shuffle last night :) -- Lars Ivar Igesund blog at http://larsivi.net DSource, #d.tango & #D: larsivi Dancing the Tango
Jan 25 2008
prev sibling parent reply BCS <BCS pathlink.com> writes:
Oskar Linde wrote:
 Walter Bright wrote:
 
     /* Shuffle the files[] array
      */

To be nitpicky, your algorithm will not give a perfect shuffling. Consider shuffling the sequence 012. The algorithm has 3^3 = 27 different ways it can shuffle the sequence, and there exists 3! = 6 different permutations of the sequence. Since 27 isn't divisible by 6, the algorithm will give a slight bias towards some end sequences. (021,102 and 120 in this case). A quick fix is to change the first line of the loop body:

What would you get if you qsorted the list with rand used as the compare function?
Jan 25 2008
parent Bill Baxter <dnewsgroup billbaxter.com> writes:
BCS wrote:
 Oskar Linde wrote:
 Walter Bright wrote:

     /* Shuffle the files[] array
      */

To be nitpicky, your algorithm will not give a perfect shuffling. Consider shuffling the sequence 012. The algorithm has 3^3 = 27 different ways it can shuffle the sequence, and there exists 3! = 6 different permutations of the sequence. Since 27 isn't divisible by 6, the algorithm will give a slight bias towards some end sequences. (021,102 and 120 in this case). A quick fix is to change the first line of the loop body:

What would you get if you qsorted the list with rand used as the compare function?

At best, an O(N log N) shuffle instead of O(N). --bb
Jan 25 2008
prev sibling parent reply Frits van Bommel <fvbommel REMwOVExCAPSs.nl> writes:
Walter Bright wrote:
     /* Shuffle the files[] array
      */
     for (size_t i = 0; i < files.length; i++)
     {
     auto j = std.random.rand() % files.length;
     // swap [i] and [j]
     auto tmp = files[i];
     files[i] = files[j];
     files[j] = tmp;
     }
 
     /* Sequentially fill the target until done or it quits with an
      * exception when the device is full.
      */
     foreach (fromfile; files)
     {
     auto tofile = std.path.join(todir, basename(fromfile));
     writefln("%s => %s", fromfile, tofile);
     std.file.copy(fromfile, tofile);
     }
 
     writefln("Done");
     return 0;
 }

As mentioned by Oskar, this won't generate uniformly random lists. However, both your algorithm and his version have an additional flaw: both shuffle the entire list when you only need a random prefix (i.e. you only need to select as many random files as will fit onto your second disk). Try this small modification to the original algorithm instead: ----- /* The loop will normally quit via an exception when the target * device is full. But if there are not enough files in the source * to fill up the target, the loop ensures it will still eventually * quit. */ while (files.length > 0) { auto j = std.random.rand() % files.length; auto fromfile = files[j]; auto tofile = std.path.join(todir, basename(fromfile)); writefln("%s => %s", fromfile, tofile); std.file.copy(fromfile, tofile); // Remove copied file from the list files[j] = files[$-1]; files = files[0 .. $-1]; } ----- Though technically, this will still prefer the files at the front of the list during each iteration because (rand() % files.length) is more likely to be a number <= (RAND_MAX/files.length) than a larger number. (In fact, for RAND_MAX < files.length the higher-ups will *never* be chosen during that iteration) However, this shouldn't be a significant problem as long as your number of files is much less than RAND_MAX; and RAND_MAX appears to be uint.max for std.random, so for any reasonable music collection it should be close enough. To fix that, ignore the result of rand() if it's > ((RAND_MAX / files.length) * files.length), for example by replacing the first line of the loop with: --- uint r = std.random.max; // assumes RAND_MAX == uint.max auto max = (uint.max / files.length) * files.length; while (r > max) r = std.random.rand(); auto j = r % files.length; --- (also, add "assert(files.length <= RAND_MAX);" before the loop if RAND_MAX < uint.max)
Jan 25 2008
parent reply Walter Bright <newshound1 digitalmars.com> writes:
Frits van Bommel wrote:
 As mentioned by Oskar, this won't generate uniformly random lists.
 However, both your algorithm and his version have an additional flaw: 
 both shuffle the entire list when you only need a random prefix (i.e. 
 you only need to select as many random files as will fit onto your 
 second disk).
 
 Try this small modification to the original algorithm instead:
 -----
 
     /* The loop will normally quit via an exception when the target
      * device is full. But if there are not enough files in the source
      * to fill up the target, the loop ensures it will still eventually
      * quit.
      */
     while (files.length > 0)
     {
         auto j = std.random.rand() % files.length;
         auto fromfile = files[j];
         auto tofile = std.path.join(todir, basename(fromfile));
         writefln("%s => %s", fromfile, tofile);
         std.file.copy(fromfile, tofile);
 
         // Remove copied file from the list
         files[j] = files[$-1];
         files = files[0 .. $-1];
     }
 -----

You're right, but I prefer having the shuffle bit be separate so people will see how to do a shuffle.
 Though technically, this will still prefer the files at the front of the 
 list during each iteration because (rand() % files.length) is more 
 likely to be a number <= (RAND_MAX/files.length) than a larger number. 
 (In fact, for RAND_MAX < files.length the higher-ups will *never* be 
 chosen during that iteration)
 However, this shouldn't be a significant problem as long as your number 
 of files is much less than RAND_MAX; and RAND_MAX appears to be uint.max 
 for std.random, so for any reasonable music collection it should be 
 close enough.

You're right again. (This algorithm is obviously harder than it looks!)
 To fix that, ignore the result of rand() if it's > ((RAND_MAX / 
 files.length) * files.length), for example by replacing the first line 
 of the loop with:
 ---
     uint r = std.random.max;
     // assumes RAND_MAX == uint.max
     auto max = (uint.max / files.length) * files.length;
     while (r > max)
         r = std.random.rand();
     auto j = r % files.length;
 ---
 (also, add "assert(files.length <= RAND_MAX);" before the loop if 
 RAND_MAX < uint.max)

I think it's (r >= max).
Jan 25 2008
parent Frits van Bommel <fvbommel REMwOVExCAPSs.nl> writes:
Walter Bright wrote:
 Frits van Bommel wrote:

 
 To fix that, ignore the result of rand() if it's > ((RAND_MAX / 
 files.length) * files.length), for example by replacing the first line 
 of the loop with:
 ---
     uint r = std.random.max;
     // assumes RAND_MAX == uint.max
     auto max = (uint.max / files.length) * files.length;
     while (r > max)
         r = std.random.rand();
     auto j = r % files.length;
 ---
 (also, add "assert(files.length <= RAND_MAX);" before the loop if 
 RAND_MAX < uint.max)

I think it's (r >= max).

You're right, that loop condition was missing an '='. I was typing all that from memory of a class I took years ago, so I'm just glad I got the most important stuff right.
Jan 25 2008
prev sibling parent "Anders Bergh" <anders1 gmail.com> writes:
On Jan 25, 2008 8:51 PM, Walter Bright <newshound1 digitalmars.com> wrote:
 I remember reading a criticism of the shuffle/swap algorithm and the fix
 a while back, but don't remember where I read it or what the fix was.
 I'm glad Oskar has set it straight here.

 Although this is just a dumb program, because it's going into a
 'cookbook' that others may use for more serious applications, the
 algorithms should be correct.

 Kudos to Oskar.

http://www.codinghorror.com/blog/archives/001015.html Might have been that? Anders
Jan 26 2008
prev sibling next sibling parent reply Walter Bright <newshound1 digitalmars.com> writes:
Latest based on suggestions:


/* Program to randomly copy music files from source to destination device.
  * Written in the D programming language.
  * Written by Walter Bright, http://www.digitalmars.com
  * Placed into the Public Domain.
  * Thanks to help from Oskar Linde, Anders Bergh, and Fritz van Bommel
  */


import std.file;
import std.stdio;
import std.string;
import std.c.stdlib;
import std.path;
import std.random;

int main(string[] args)
{
     if (args.length != 3)
     {	writefln("Usage: shuffle fromdir todir");
	exit(1);
     }
     auto fromdir = args[1];
     auto todir = args[2];

     /* Recursively search for all the mp3 and wma files in directory 
fromdir
      * and put them into files[]
      */
     string[] files;
     bool callback(DirEntry *de)
     {
	if (de.isdir)
	    listdir(de.name, &callback); // recurse into subdirectories
	else
	{
	    // Collect only files with mp3 and wma extensions
	    auto ext = getExt(de.name);
	    if (fnmatch(ext, "mp3") || fnmatch(ext, "wma"))
		files ~= de.name;
	}
	return true;	// keep going
     }
     std.file.listdir(fromdir, &callback);

     auto n = files.length;
     writefln(n, " music files");
     if (!n)
	return 0;

     /* Shuffle the files[] array using Durstenfeld's algorithm
      * based on the Fisher-Yates method
      */
     auto max = (typeof(std.random.rand()).max / n) * n;
     while (--n)
     {
	/* Pick random r in range 0..max, discarding others
	 * to eliminate modulo bias
	 */
	auto r = max;
	do
	    r = std.random.rand();
	while (r >= max);

	auto j = r % (n + 1);	// pick element to swap with

	// swap [n] and [j]
	auto tmp = files[n];
	files[n] = files[j];
	files[j] = tmp;
     }

     /* Sequentially fill the target until done or it quits with an
      * exception when the device is full.
      */
     foreach (fromfile; files)
     {
	auto tofile = std.path.join(todir, basename(fromfile));
	writefln("%s => %s", fromfile, tofile);
	std.file.copy(fromfile, tofile);
     }

     writefln("Done");
     return 0;
}
Jan 26 2008
next sibling parent reply Frits van Bommel <fvbommel REMwOVExCAPSs.nl> writes:
Walter Bright wrote:
 Latest based on suggestions:
 
 

     /* Shuffle the files[] array using Durstenfeld's algorithm
      * based on the Fisher-Yates method
      */
     auto max = (typeof(std.random.rand()).max / n) * n;
     while (--n)
     {
     /* Pick random r in range 0..max, discarding others
      * to eliminate modulo bias
      */
     auto r = max;
     do
         r = std.random.rand();
     while (r >= max);
 
     auto j = r % (n + 1);    // pick element to swap with
 
     // swap [n] and [j]
     auto tmp = files[n];
     files[n] = files[j];
     files[j] = tmp;
     }

Wrong again. The calculationof "max" should be inside the loop, since it depends on "n" which is different for each iteration.
Jan 27 2008
next sibling parent Walter Bright <newshound1 digitalmars.com> writes:
Frits van Bommel wrote:
 Wrong again. The calculationof "max" should be inside the loop, since it 
 depends on "n" which is different for each iteration.

Criminy!
Jan 27 2008
prev sibling parent reply "Kris" <foo bar.com> writes:
I hope tango.core.Array.shuffle got it right?

 Wrong again. The calculationof "max" should be inside the loop, since it 
 depends on "n" which is different for each iteration. 

Jan 27 2008
parent reply Bill Baxter <dnewsgroup billbaxter.com> writes:
Kris wrote:
 I hope tango.core.Array.shuffle got it right?
 
 Wrong again. The calculationof "max" should be inside the loop, since it 
 depends on "n" which is different for each iteration. 


It looks like Tango's shuffle doesn't attempt to eliminate the modulo bias in generating a random number. So it doesn't have attempt to calculate any "n" or "max" to begin with. --bb
Jan 27 2008
parent reply Sean Kelly <sean f4.ca> writes:
Bill Baxter wrote:
 Kris wrote:
 I hope tango.core.Array.shuffle got it right?

 Wrong again. The calculationof "max" should be inside the loop, since
 it depends on "n" which is different for each iteration. 


It looks like Tango's shuffle doesn't attempt to eliminate the modulo bias in generating a random number. So it doesn't have attempt to calculate any "n" or "max" to begin with.

The Tango shuffle uses the Knuth-Fischer-Yates shuffle algorithm but iterates from 0 .. N rather than N .. 0. It may be that reversing the direction causes some bias and that it really has to be top-down however--I'm planning on reading up on the algorithm today to be sure. I actually stumbled on the current implementation by accident simply by following the requirements for random_shuffle in the C++ spec. Sean
Jan 28 2008
parent reply Frits van Bommel <fvbommel REMwOVExCAPSs.nl> writes:
Sean Kelly wrote:
 Bill Baxter wrote:
 It looks like Tango's shuffle doesn't attempt to eliminate the modulo
 bias in generating a random number.  So it doesn't have attempt to
 calculate any "n" or "max" to begin with.

The Tango shuffle uses the Knuth-Fischer-Yates shuffle algorithm but iterates from 0 .. N rather than N .. 0. It may be that reversing the direction causes some bias and that it really has to be top-down however--I'm planning on reading up on the algorithm today to be sure. I actually stumbled on the current implementation by accident simply by following the requirements for random_shuffle in the C++ spec.

It really needs to be top-down with that loop body. The way tango.core.Array.shuffle is implemented[1], position n may be swapped with something else after having received its random replacement if n comes up as a random number afterwards. Changing for( size_t pos = 2, end = buf.length; pos < buf.length; ++pos ) to for( size_t pos = buf.length - 1; pos > 0; --pos ) should fix it. However, it's possible to implement a correct shuffle with an upwards-counting loop. The loop body should then be something like buf.swap( n, n + Rand(buf.length - n) ); (assuming 0 <= Rand(N) < N) so that the random argument is always >= n. This is basically a reversed Knuth-Fischer-Yates shuffle. Both versions would be better if the randomizer eliminated modulo bias, though :P. [1]: At least, according to http://www.dsource.org/projects/tango/docs/current/source/tango.core.Array.html
Jan 28 2008
next sibling parent Walter Bright <newshound1 digitalmars.com> writes:
Frits van Bommel wrote:
 However, it's possible to implement a correct shuffle  [...]

std.random.randomShuffle() in Phobos 2.0 gets it right. The shuffle code can be replaced with: version (D_Version2) { auto rnd = Random(unpredictableSeed); randomShuffle(files, rnd); } else { auto n = files.length; writefln(n, " music files"); if (!n) return 0; /* Shuffle the files[] array using Durstenfeld's algorithm * based on the Fisher-Yates method */ while (--n) { /* Pick random r in range 0..max, discarding others * to eliminate modulo bias */ auto max = (typeof(std.random.rand()).max / (n + 1)) * (n + 1); auto r = max; do r = std.random.rand(); while (r >= max); auto j = r % (n + 1); // pick element to swap with // swap [n] and [j] auto tmp = files[n]; files[n] = files[j]; files[j] = tmp; } }
Jan 28 2008
prev sibling parent reply Sean Kelly <sean f4.ca> writes:
Frits van Bommel wrote:
 Sean Kelly wrote:
 Bill Baxter wrote:
 It looks like Tango's shuffle doesn't attempt to eliminate the modulo
 bias in generating a random number.  So it doesn't have attempt to
 calculate any "n" or "max" to begin with.

The Tango shuffle uses the Knuth-Fischer-Yates shuffle algorithm but iterates from 0 .. N rather than N .. 0. It may be that reversing the direction causes some bias and that it really has to be top-down however--I'm planning on reading up on the algorithm today to be sure. I actually stumbled on the current implementation by accident simply by following the requirements for random_shuffle in the C++ spec.

It really needs to be top-down with that loop body. The way tango.core.Array.shuffle is implemented[1], position n may be swapped with something else after having received its random replacement if n comes up as a random number afterwards.

Yup. I just wasn't sure if this would be a problem, but it looks like it will be. I also had an off-by-one error in the limit value being passed to rand, which actually turned it into Sattolo's algorithm. Go figure.
 Both versions would be better if the randomizer eliminated modulo bias,
 though :P.

I just fixed that as well I think. Let me know if I screwed it up: http://dsource.org/projects/tango/changeset/3139 (A change to tango.stdc.posix.sys.stat creeped in as well. Ignore that.) Sean
Jan 28 2008
parent reply Frits van Bommel <fvbommel REMwOVExCAPSs.nl> writes:
Sean Kelly wrote:
 Frits van Bommel wrote:
 Both versions would be better if the randomizer eliminated modulo bias,
 though :P.

I just fixed that as well I think. Let me know if I screwed it up: http://dsource.org/projects/tango/changeset/3139

Looks good, assuming libc's rand() has a uniform distribution over the lower 16 bits. (This is probably just nitpicking:) AFAICT that assumption isn't guaranteed by the C standard though, it seems RAND_MAX is only required to be >= 32767 (so rand() may well return 15-bit random numbers :( [1]). Also, I can't find any mention of what the (lower-bitwise) distribution is supposed to be. For instance, if RAND_MAX happens to be (<some prime> - 1) instead of (2^N - 1) the lower bits won't be uniformly distributed even if the results of rand() are themselves distributed perfectly uniformly in [0 ... RAND_MAX]. For these reasons, it may not be a very good idea to use libc rand(). It's just underspecified AFAICT... (I don't have the actual C standard though, just some library reference sites from my bookmarks and what Google drags up) [1] For glibc RAND_MAX is 2^32-1 though, so this shouldn't be a problem on typical *nix systems. I'm not sure about Windows systems though; IIRC mingw (and presumably gdcwin) uses the MS libc and DMD uses its own libc...
Jan 28 2008
parent reply Sean Kelly <sean f4.ca> writes:
Frits van Bommel wrote:
 Sean Kelly wrote:
 Frits van Bommel wrote:
 Both versions would be better if the randomizer eliminated modulo bias,
 though :P.

I just fixed that as well I think. Let me know if I screwed it up: http://dsource.org/projects/tango/changeset/3139

Looks good, assuming libc's rand() has a uniform distribution over the lower 16 bits.

Cool, thanks.
 (This is probably just nitpicking:)
 AFAICT that assumption isn't guaranteed by the C standard though, it
 seems RAND_MAX is only required to be >= 32767 (so rand() may well
 return 15-bit random numbers :( [1]).
 Also, I can't find any mention of what the (lower-bitwise) distribution
 is supposed to be. For instance, if RAND_MAX happens to be (<some prime>
 - 1) instead of (2^N - 1) the lower bits won't be uniformly distributed
 even if the results of rand() are themselves distributed perfectly
 uniformly in [0 ... RAND_MAX].
 For these reasons, it may not be a very good idea to use libc rand().
 It's just underspecified AFAICT...

Yup. I really just use C's rand as a default because it's available and doing so avoids creating a dependency on tango.math.Random. I half considered just requiring the user to supply a randomizer, but I think people would complain. Still, I'd expect anyone who was really serious about having a uniform distribution would supply one--I intend to add an adapter to the Random module to simplify plugging it into the shuffle routine to simplify this.
 (I don't have the actual C standard though, just some library reference
 sites from my bookmarks and what Google drags up)

I have it, here's what it says: "The rand function computes a sequence of pseudo-random integers in the range 0 to RAND_MAX." Not much there, huh? :-) It's also worth mentioning that C's rand function isn't required to be threadsafe, so it's not a wonderful choice for a number of reasons.
 [1] For glibc RAND_MAX is 2^32-1 though, so this shouldn't be a problem
 on typical *nix systems. I'm not sure about Windows systems though; IIRC
 mingw (and presumably gdcwin) uses the MS libc and DMD uses its own libc...

The current impl in Tango assumes only 16 bits of randomness, so rand is called twice to produce a 32-bit value. I thought about switching based on RAND_MAX, but that would require it to be up to date for all OSes, so being conservative just seemed easier. Sean
Jan 28 2008
parent reply Bill Baxter <dnewsgroup billbaxter.com> writes:
Sean Kelly wrote:
 Frits van Bommel wrote:
 Sean Kelly wrote:
 Frits van Bommel wrote:
 Both versions would be better if the randomizer eliminated modulo bias,
 though :P.



range 0 to RAND_MAX."

I definitely recall reading that low-order bits of most rand() implementations were far from uniform. So it kind of makes me wonder if it's even worth the effort to eliminate the small modulo bias given that the underlying random numbers may not be all that random to begin with. That's why I avoided criticizing Tango for not implementing it in my previous message, but just commented that it didn't implement it. It has a smell of being the random number generation version of "premature optimization". --bb
Jan 28 2008
parent Sean Kelly <sean f4.ca> writes:
Bill Baxter wrote:
 Sean Kelly wrote:
 Frits van Bommel wrote:
 Sean Kelly wrote:
 Frits van Bommel wrote:
 Both versions would be better if the randomizer eliminated modulo
 bias,
 though :P.



range 0 to RAND_MAX."

I definitely recall reading that low-order bits of most rand() implementations were far from uniform. So it kind of makes me wonder if it's even worth the effort to eliminate the small modulo bias given that the underlying random numbers may not be all that random to begin with. That's why I avoided criticizing Tango for not implementing it in my previous message, but just commented that it didn't implement it. It has a smell of being the random number generation version of "premature optimization".

Well, I figure the impl may as well do what it can, so I'm glad you mentioned it. If adjusting for the bias had been overly complex I wouldn't have bothered. Sean
Jan 28 2008
prev sibling parent 0ffh <frank youknow.what.todo.interNETz> writes:
Has it already been proposed to just throw files already
copied out of the list? It's a minimal amendment to the
original (changing one line and adding two):

/* Program to randomly copy music files from source to destination device.
  * Written in the D programming language.
  * Written by Walter Bright, http://www.digitalmars.com
  * Placed into the Public Domain.
  * Minimal amendment by 0ffh to copy any file <= once
  */

import std.file;
import std.stdio;
import std.string;
import std.c.stdlib;
import std.path;
import std.random;

int main(string[] args)
{
     if (args.length != 3)
     {    writefln("Usage: shuffle fromdir todir");
     exit(1);
     }
     auto fromdir = args[1];
     auto todir = args[2];

     /* Recursively search for all the mp3 and wma files in directory fromdir
      * and put them into files[]
      */
     string[] files;
     bool callback(DirEntry *de)
     {
     if (de.isdir)
         listdir(de.name, &callback); // recurse into subdirectories
     else
     {
         // Collect only files with mp3 and wma extensions
         auto ext = getExt(de.name);
         if (fnmatch(ext, "mp3") || fnmatch(ext, "wma"))
         files ~= de.name;
     }
     return true;    // keep going
     }
     std.file.listdir(fromdir, &callback);

     writefln(files.length, " music files");

     /* The loop will normally quit via an exception when the target device
      * is full. But if there are not enough files in the source to fill
      * up the target, the loop ensures it will still eventually quit.
      */
     while (files.length) // 0ffh: changed exit condition and loop type
     {
     auto j = std.random.rand() % files.length;
     auto fromfile = files[j];
     auto tofile = std.path.join(todir, basename(fromfile));
     writefln("%s => %s", fromfile, tofile);
     std.file.copy(fromfile, tofile);
     files[j]=files[$-1]; // 0ffh: copy last list element
     files.length=files.length-1; // 0ffh: decrease length
     }

     writefln("Done");
     return 0;
}
Jan 27 2008
prev sibling parent reply "Tyro[a.c.edwards]" <no spam.com> writes:
Walter Bright さんは書きました:
 For fun, I ordered a new car stereo that would play music from an SD 
 card or USB stick(rather than from CD), and installed it over the 
 weekend. The problem is loading songs onto the SD card from my home 
 music server, which I like to hear played at random.
 
 The solution is a simple D program, shuffle, which will randomly copy 
 music files to an SD card until it fills up. Have some fun with it!
 ----------------------------
 
 /* Program to randomly copy music files from source to destination device.
  * Written in the D programming language.
  * Written by Walter Bright, http://www.digitalmars.com
  * Placed into the Public Domain.
  */
 
 
 import std.file;
 import std.stdio;
 import std.string;
 import std.c.stdlib;
 import std.path;
 import std.random;
 
 int main(string[] args)
 {
     if (args.length != 3)
     {    writefln("Usage: shuffle fromdir todir");
     exit(1);
     }
     auto fromdir = args[1];
     auto todir = args[2];
 
     /* Recursively search for all the mp3 and wma files in directory 
 fromdir
      * and put them into files[]
      */
     string[] files;
     bool callback(DirEntry *de)
     {
     if (de.isdir)
         listdir(de.name, &callback); // recurse into subdirectories
     else
     {
         // Collect only files with mp3 and wma extensions
         auto ext = getExt(de.name);
         if (fnmatch(ext, "mp3") || fnmatch(ext, "wma"))
         files ~= de.name;
     }
     return true;    // keep going
     }
     std.file.listdir(fromdir, &callback);
 
     writefln(files.length, " music files");
 
     /* The loop will normally quit via an exception when the target device
      * is full. But if there are not enough files in the source to fill
      * up the target, the loop ensures it will still eventually quit.
      */
     for (size_t i = 0; i < files.length; i++)
     {
     auto j = std.random.rand() % files.length;
     auto fromfile = files[j];
     auto tofile = std.path.join(todir, basename(fromfile));
     writefln("%s => %s", fromfile, tofile);
     std.file.copy(fromfile, tofile);
     }
 
     writefln("Done");
     return 0;
 }

Appreciate the example, it actually came in quite handy. Quick question, how does one modify this to copy an entire directory? Additionaly, version(linux) std.file.copy() preserves the filestamps of the files copied. Is there a way to build this characteristic into the windows version? Thanks
Jan 27 2008
parent "Tyro[a.c.edwards]" <no spam.com> writes:
Tyro[a.c.edwards] さんは書きました:
 Walter Bright さんは書きました:
 For fun, I ordered a new car stereo that would play music from an SD 
 card or USB stick(rather than from CD), and installed it over the 
 weekend. The problem is loading songs onto the SD card from my home 
 music server, which I like to hear played at random.

 The solution is a simple D program, shuffle, which will randomly copy 
 music files to an SD card until it fills up. Have some fun with it!
 ----------------------------

 /* Program to randomly copy music files from source to destination 
 device.
  * Written in the D programming language.
  * Written by Walter Bright, http://www.digitalmars.com
  * Placed into the Public Domain.
  */


 import std.file;
 import std.stdio;
 import std.string;
 import std.c.stdlib;
 import std.path;
 import std.random;

 int main(string[] args)
 {
     if (args.length != 3)
     {    writefln("Usage: shuffle fromdir todir");
     exit(1);
     }
     auto fromdir = args[1];
     auto todir = args[2];

     /* Recursively search for all the mp3 and wma files in directory 
 fromdir
      * and put them into files[]
      */
     string[] files;
     bool callback(DirEntry *de)
     {
     if (de.isdir)
         listdir(de.name, &callback); // recurse into subdirectories
     else
     {
         // Collect only files with mp3 and wma extensions
         auto ext = getExt(de.name);
         if (fnmatch(ext, "mp3") || fnmatch(ext, "wma"))
         files ~= de.name;
     }
     return true;    // keep going
     }
     std.file.listdir(fromdir, &callback);

     writefln(files.length, " music files");

     /* The loop will normally quit via an exception when the target 
 device
      * is full. But if there are not enough files in the source to fill
      * up the target, the loop ensures it will still eventually quit.
      */
     for (size_t i = 0; i < files.length; i++)
     {
     auto j = std.random.rand() % files.length;
     auto fromfile = files[j];
     auto tofile = std.path.join(todir, basename(fromfile));
     writefln("%s => %s", fromfile, tofile);
     std.file.copy(fromfile, tofile);
     }

     writefln("Done");
     return 0;
 }

Appreciate the example, it actually came in quite handy. Quick question, how does one modify this to copy an entire directory? Additionaly, version(linux) std.file.copy() preserves the filestamps of the files copied. Is there a way to build this characteristic into the windows version? Thanks

I thought there might be a way to copy the entire directory intact. I didn't find anyting in the library that allows this but it really isn't that important. I can recreat the directory structure as I go along. Thanks. Andrew
Jan 27 2008