www.digitalmars.com         C & C++   DMDScript  

digitalmars.D - String theory by example

reply "Regan Heath" <regan netwin.co.nz> writes:
------------P4hyhsZr1p8vYZEA9ADCH7
Content-Type: text/plain; format=flowed; delsp=yes; charset=iso-8859-15
Content-Transfer-Encoding: 8bit

Ok, this time I thought I'd see if I could come up with a string  
struct/class that behaved how I think an in-built string type should  
behave in D, here is the best I could do.

Thoughts, comments, etc.

---

module string;

import std.utf;
import std.stdio;

/**
 * A string class that stores its text in one of the three UTF encodings
 * (T is char[], wchar[] or dchar[]) while indexing, slicing and length
 * all work in whole characters (code points) rather than code units.
 *
 * Every out-of-range access throws Error("String bounds error").
 */
class StringType(T)
{
	// The text, held in the encoding selected by T.
	private T data;
	
	/// Construct from UTF-8 text; transcoded to T's encoding if needed.
	this(char[] value)
	{
		assign!(char[])(value);
	}
	
	/// Construct from UTF-16 text; transcoded to T's encoding if needed.
	this(wchar[] value)
	{
		assign!(wchar[])(value);
	}

	/// Construct from UTF-32 text; transcoded to T's encoding if needed.
	this(dchar[] value)
	{
		assign!(dchar[])(value);
	}

	// Replace the stored text with `value`, converting from S's encoding
	// to T's. Exactly one static-if branch is compiled for a given T.
	template assign(S) {
		private void assign(S value)
		{
			static if (is(T:char[])) data = toUTF8(value);
			static if (is(T:wchar[])) data = toUTF16(value);
			static if (is(T:dchar[])) data = toUTF32(value);
		}
	}
	
	/**
	 * Return the character at character position `index` (0-based).
	 *
	 * The position is counted by hand: the index that foreach supplies
	 * when decoding char[]/wchar[] into dchar is a code-unit offset, not a
	 * character number, and the two differ after any multi-unit character.
	 *
	 * Throws: Error if index >= length.
	 */
	dchar opIndex(uint index)
	{
		uint pos = 0;
		
		foreach(dchar c; data)
		{
			if (pos == index) return c;
			pos++;
		}
		
		throw new Error("String bounds error");
	}
	
	/**
	 * Replace the character at character position `index` with `u`.
	 *
	 * The text is rebuilt as dchar[] and re-encoded, because the new
	 * character may need a different number of code units than the old.
	 * The stored text is left untouched when the index is out of range.
	 *
	 * Returns: u
	 * Throws: Error if index >= length.
	 */
	dchar opIndexAssign(dchar u, uint index)
	{
		dchar[] res;
		uint count = 0;
		
		foreach(dchar c; data)
		{
			if (count == index) res ~= u;
			else res ~= c;
			count++;
		}
		
		// count is now the number of characters, so index == count is
		// already one past the end and must be rejected as well.
		if (count <= index) throw new Error("String bounds error");
		
		assign!(dchar[])(res);
		
		return u;
	}
	
	// Inside the template body the name StringType refers to the current
	// instantiation, so it does not need to be spelled StringType!(T).
	
	/**
	 * Return a new string holding characters [start, end) of this one,
	 * counted in characters. An empty string results when start >= end.
	 *
	 * Throws: Error if end > length.
	 */
	StringType opSlice(uint start, uint end)
	{
		dchar[] res;
		uint pos = 0;

		foreach(dchar c; data)
		{
			// Test the position before advancing it, so the character at
			// end-1 is kept (the slice is half-open, like native arrays).
			if (pos >= start && pos < end) res ~= c;
			pos++;
		}

		// pos is now the total character count.
		if (pos < end) throw new Error("String bounds error");

		return new StringType(res);
	}
	
	/// Return a new string made of this string followed by rhs.
	StringType opCat(StringType rhs)
	{
		return new StringType(data ~ rhs.data);
	}
	
	/// Append rhs to this string in place and return this.
	StringType opCatAssign(StringType rhs)
	{
		data ~= rhs.data;
		return this;
	}
	
	/// Number of characters (code points), found by decoding the text.
	int length()
	{
		uint count = 0;
		foreach(dchar c; data) count++;
		return count;
	}
	
	/**
	 * Resize the string to `newlength` characters.
	 *
	 * Shrinking cuts the stored array at the code-unit offset where
	 * character number `newlength` begins, so no character is split.
	 * Growing pads with NUL characters, each one code unit in every UTF
	 * encoding, so the stored text stays decodable.
	 *
	 * Returns: newlength
	 * Throws: Error if newlength is negative.
	 */
	int length(int newlength)
	{
		if (newlength < 0) throw new Error("String bounds error");
		
		uint target = newlength;
		uint count = 0;            // characters decoded so far
		size_t cut = data.length;  // code-unit offset where character #target starts
		
		foreach(uint offset, dchar c; data)
		{
			if (count == target)
			{
				cut = offset;
				break;
			}
			count++;
		}
		
		if (count == target)
		{
			// Shrinking, or already the right size (then cut == data.length).
			data.length = cut;
		}
		else
		{
			// Fewer than target characters present: pad the tail with NULs.
			size_t old = data.length;
			data.length = old + (target - count);
			data[old .. data.length] = '\0';
		}
		
		return newlength;
	}
	
	/// The text as UTF-8.
	char[] opCast()
	{
		return toUTF8(data);
	}
	
	/// The text as UTF-8.
	char[] toString()
	{
		return toUTF8(data);
	}
}

//Choose your native encoding: leave exactly one of these aliases active.
//The array type given here becomes the type of StringType's stored data.
alias StringType!(char[]) String;
//alias StringType!(wchar[]) String;
//alias StringType!(dchar[]) String;

//NOTE: for this to work on the windows console you have to:
//      - left-click top left corner of command prompt window
//      - select "properties"
//      - select "font"
//      - select "Lucida Console"
//      - type "chcp 65001" into command prompt
//and now you can finally run this example.

// Demonstration: build a String, slice it, concatenate two slices, replace
// one character, then print the three results separated by spaces.
void main()
{
	//hopefully the suffix becomes redundant
	String test = new String("smörgåsbord"c);
	
	//sadly this creates a new string.
	String two = test[0..4];
	
	//as does this, but this time it's expected to
	String three = test[0..4] ~ test[5..test.length];
	
	//modify the original string: this replaces the character at position 3
	//(counting from 0), making 'a' the 4th character in this string
	test[3] = 'a';
	
	//for some odd reason if you change any of these to writefln it stops any
	//more data appearing
	//I suspect a bug specific to the windows, perhaps in phobos?
	writef("%s",test);
	writef(" ");
	writef("%s",two);
	writef(" ");
	writef("%s",three);
}

------------P4hyhsZr1p8vYZEA9ADCH7
Content-Disposition: attachment; filename=string.d
Content-Type: application/octet-stream; name=string.d
Content-Transfer-Encoding: 8bit

module string;

import std.utf;
import std.stdio;

/**
 * A string class that stores its text in one of the three UTF encodings
 * (T is char[], wchar[] or dchar[]) while indexing, slicing and length
 * all work in whole characters (code points) rather than code units.
 *
 * Every out-of-range access throws Error("String bounds error").
 */
class StringType(T)
{
	// The text, held in the encoding selected by T.
	private T data;
	
	/// Construct from UTF-8 text; transcoded to T's encoding if needed.
	this(char[] value)
	{
		assign!(char[])(value);
	}
	
	/// Construct from UTF-16 text; transcoded to T's encoding if needed.
	this(wchar[] value)
	{
		assign!(wchar[])(value);
	}

	/// Construct from UTF-32 text; transcoded to T's encoding if needed.
	this(dchar[] value)
	{
		assign!(dchar[])(value);
	}

	// Replace the stored text with `value`, converting from S's encoding
	// to T's. Exactly one static-if branch is compiled for a given T.
	template assign(S) {
		private void assign(S value)
		{
			static if (is(T:char[])) data = toUTF8(value);
			static if (is(T:wchar[])) data = toUTF16(value);
			static if (is(T:dchar[])) data = toUTF32(value);
		}
	}
	
	/**
	 * Return the character at character position `index` (0-based).
	 *
	 * The position is counted by hand: the index that foreach supplies
	 * when decoding char[]/wchar[] into dchar is a code-unit offset, not a
	 * character number, and the two differ after any multi-unit character.
	 *
	 * Throws: Error if index >= length.
	 */
	dchar opIndex(uint index)
	{
		uint pos = 0;
		
		foreach(dchar c; data)
		{
			if (pos == index) return c;
			pos++;
		}
		
		throw new Error("String bounds error");
	}
	
	/**
	 * Replace the character at character position `index` with `u`.
	 *
	 * The text is rebuilt as dchar[] and re-encoded, because the new
	 * character may need a different number of code units than the old.
	 * The stored text is left untouched when the index is out of range.
	 *
	 * Returns: u
	 * Throws: Error if index >= length.
	 */
	dchar opIndexAssign(dchar u, uint index)
	{
		dchar[] res;
		uint count = 0;
		
		foreach(dchar c; data)
		{
			if (count == index) res ~= u;
			else res ~= c;
			count++;
		}
		
		// count is now the number of characters, so index == count is
		// already one past the end and must be rejected as well.
		if (count <= index) throw new Error("String bounds error");
		
		assign!(dchar[])(res);
		
		return u;
	}
	
	// Inside the template body the name StringType refers to the current
	// instantiation, so it does not need to be spelled StringType!(T).
	
	/**
	 * Return a new string holding characters [start, end) of this one,
	 * counted in characters. An empty string results when start >= end.
	 *
	 * Throws: Error if end > length.
	 */
	StringType opSlice(uint start, uint end)
	{
		dchar[] res;
		uint pos = 0;

		foreach(dchar c; data)
		{
			// Test the position before advancing it, so the character at
			// end-1 is kept (the slice is half-open, like native arrays).
			if (pos >= start && pos < end) res ~= c;
			pos++;
		}

		// pos is now the total character count.
		if (pos < end) throw new Error("String bounds error");

		return new StringType(res);
	}
	
	/// Return a new string made of this string followed by rhs.
	StringType opCat(StringType rhs)
	{
		return new StringType(data ~ rhs.data);
	}
	
	/// Append rhs to this string in place and return this.
	StringType opCatAssign(StringType rhs)
	{
		data ~= rhs.data;
		return this;
	}
	
	/// Number of characters (code points), found by decoding the text.
	int length()
	{
		uint count = 0;
		foreach(dchar c; data) count++;
		return count;
	}
	
	/**
	 * Resize the string to `newlength` characters.
	 *
	 * Shrinking cuts the stored array at the code-unit offset where
	 * character number `newlength` begins, so no character is split.
	 * Growing pads with NUL characters, each one code unit in every UTF
	 * encoding, so the stored text stays decodable.
	 *
	 * Returns: newlength
	 * Throws: Error if newlength is negative.
	 */
	int length(int newlength)
	{
		if (newlength < 0) throw new Error("String bounds error");
		
		uint target = newlength;
		uint count = 0;            // characters decoded so far
		size_t cut = data.length;  // code-unit offset where character #target starts
		
		foreach(uint offset, dchar c; data)
		{
			if (count == target)
			{
				cut = offset;
				break;
			}
			count++;
		}
		
		if (count == target)
		{
			// Shrinking, or already the right size (then cut == data.length).
			data.length = cut;
		}
		else
		{
			// Fewer than target characters present: pad the tail with NULs.
			size_t old = data.length;
			data.length = old + (target - count);
			data[old .. data.length] = '\0';
		}
		
		return newlength;
	}
	
	/// The text as UTF-8.
	char[] opCast()
	{
		return toUTF8(data);
	}
	
	/// The text as UTF-8.
	char[] toString()
	{
		return toUTF8(data);
	}
}

//Choose your native encoding: leave exactly one of these aliases active.
//The array type given here becomes the type of StringType's stored data.
alias StringType!(char[]) String;
//alias StringType!(wchar[]) String;
//alias StringType!(dchar[]) String;

//NOTE: for this to work on the windows console you have to:
//      - left-click top left corner of command prompt window
//      - select "properties"
//      - select "font"
//      - select "Lucida Console"
//      - type "chcp 65001" into command prompt
//and now you can finally run this example.

// Demonstration: build a String, slice it, concatenate two slices, replace
// one character, then print the three results separated by spaces.
void main()
{
	//hopefully the suffix becomes redundant
	String test = new String("smörgåsbord"c);
	
	//sadly this creates a new string.
	String two = test[0..4]; 
	
	//as does this, but this time it's expected to
	String three = test[0..4] ~ test[5..test.length];
	
	//modify the original string: this replaces the character at position 3
	//(counting from 0), making 'a' the 4th character in this string
	test[3] = 'a';
	
	//for some odd reason if you change any of these to writefln it stops any
	//more data appearing
	//I suspect a bug specific to the windows, perhaps in phobos?
	writef("%s",test);
	writef(" ");
	writef("%s",two);
	writef(" ");
	writef("%s",three);
}

------------P4hyhsZr1p8vYZEA9ADCH7--
Nov 24 2005
parent reply "Kris" <fu bar.com> writes:
"Regan Heath" <regan netwin.co.nz> wrote
 Ok, this time I thought I'd see if I could come up with a string
 struct/class that behaved how I think an in-built string type should
 behave in D, here is the best I could do.

 Thoughts, comments, etc.

It seems clear that any unified string notion would be better off as a library suite; not built into the compiler. It's difficult enough to evolve the code within Phobos, let alone something hard-coded into the compiler. Thus, at this point, you're surely talking about a pre-packaged Phobos String class? Exactly the kind of thing that many have discussed in the past. The reason it hasn't yet happened are not fully clear, but I would bet it's partly to do with the following: a) it seems everyone has a different set of requirements for a String class -- tradeoffs regarding performance, flexibility, favourite methods, etc, etc. To wit: there are perfectly good String classes all over the place. Many different implementations to choose from. Some would argue that's a good thing. b) a String class to support Unicode is hardly a trivial undertaking. You really have to consider very hard what the goals are before putting something in stone (as in getting it added to Phobos). I say that from experience with the ICU project ~ there's code in there to handle the kinds of things that would frighten many people. Unicode ain't trivial and, frankly, I think AJ would have a hard time coming up with a "suitable" set of compromises. The latter is important: there will be many compromises one way or another. I think a good place to start is to ask yourself and others (particularly those who actually use unicode on a regular basis) why not just use ICU and be done with it ~ after all, ICU can do just about anything vis-a-vis Unicode. The outcome may be able to provide some guidance?
Nov 24 2005
next sibling parent reply "Regan Heath" <regan netwin.co.nz> writes:
On Thu, 24 Nov 2005 17:41:55 -0800, Kris <fu bar.com> wrote:
 "Regan Heath" <regan netwin.co.nz> wrote
 Ok, this time I thought I'd see if I could come up with a string
 struct/class that behaved how I think an in-built string type should
 behave in D, here is the best I could do.

 Thoughts, comments, etc.

It seems clear that any unified string notion would be better off as a library suite; not built into the compiler.

Perhaps, however the syntax can be better if it's built in.
 Thus, at this point, you're surely talking about a pre-packaged Phobos
 String class?

I have used a class here. I'd have preferred to use a struct but several things didn't work when it was a struct. I'd prefer it was built in most of all, like the arrays are.
 Exactly the kind of thing that many have discussed in the
 past. The reason it hasn't yet happened are not fully clear, but I would  
 bet it's partly to do with the following:

 a) it seems everyone has a different set of requirements for a String
 class -- tradeoffs regarding performance, flexibility, favourite methods,
 etc, etc. To wit: there are perfectly good String classes all over the
 place. Many different implementations to choose from. Some would argue
 that's a good thing.

This is true. I've read/heard many of the arguments. However, I reckon it's possible to make everyone happy with a built in type that doesn't try to do too much. That is what the purpose of this thread is.
 b) a String class to support Unicode is hardly a trivial undertaking. You
 really have to consider very hard what the goals are before putting
 something in stone (as in getting it added to Phobos).

Certainly and it appears to me that there already exists in DMD and Phobos the required code to handle the idea I have in mind. My goal is a built-in type which can store strings in any of the 3 UTF encodings, when sliced will give characters (as opposed to character fragments) and will be transcoded either implcitly or explicitly. Further, if the array feature that allows this: void foo(char[] a) {} char[] a; a.foo(); is also implemented for this type, then it becomes extensible and people can add their favourite methods, tho I would hope that phobos came with many already provided. It doesn't need anything else, from this point we provide the ICU features via methods and libraries, very little else needs to be built in, the class I posted almost does everything I see this built in type doing and it almost does it exactly how I wanted it done. Where it falls short is in the fact that it's not built in and does not have the syntax that would enable us to have.
 I say that from
 experience with the ICU project ~ there's code in there to handle the  
 kinds of things that would frighten many people. Unicode ain't trivial  
 and,
 frankly, I think AJ would have a hard time coming up with a "suitable"  
 set of compromises. The latter is important: there will be many  
 compromises one way or another.

I believe you, your experience would be useful in exploring this idea.
 I think a good place to start is to ask yourself and others (particularly
 those who actually use unicode on a regular basis) why not just use ICU  
 and be done with it ~ after all, ICU can do just about anything vis-a-vis
 Unicode. The outcome may be able to provide some guidance?

I think using ICU is a great idea. As I said above, this would be part of a library and would extend the built in type. All the built in type needs to do is store the 3 encodings, transcode between them and slice full characters (as opposed to fragments). Nothing more. Regan
Nov 24 2005
parent reply kris <fu bar.org> writes:
Regan Heath wrote:
 My goal is a built-in type which can store strings in any of the 3 UTF  
 encodings, when sliced will give characters (as opposed to character  
 fragments)

That would be require 32bits, then. A dchar.
 and will be transcoded either implcitly or explicitly. 
 Further,  if the array feature that allows this:
 
 void foo(char[] a) {}
 char[] a;
 a.foo();
 
 is also implemented for this type, then it becomes extensible

And, just then, the vehicle swerved off the road and over a cliff. Bon voyage.
Nov 24 2005
parent reply "Regan Heath" <regan netwin.co.nz> writes:
On Thu, 24 Nov 2005 20:22:53 -0800, kris <fu bar.org> wrote:
 Regan Heath wrote:
 My goal is a built-in type which can store strings in any of the 3 UTF   
 encodings, when sliced will give characters (as opposed to character   
 fragments)

That would be require 32bits, then. A dchar.

Yes. Note opIndex in the code I posted. Have you had a close look at std.format.doFormat and std.stdio.writefx? Have you noticed that UTF-8 characters are all transcoded to individual dchars then transcoded back to UTF-8 to be output? This doesn't prove anything but it suggests that using a dchar sized variable for characters will have little or no real effect on performance.. maybe, a conclusive test should really be made. My original idea was horribly broken because I tried to fight against the fact that the only type that can store a complete character all the time is the dchar, a 32 bit type. I was trying to make the ASCII app programmers happy, happy because they can store their characters in an 8 bit wide type.
 and will be transcoded either implcitly or explicitly. Further,  if the  
 array feature that allows this:
  void foo(char[] a) {}
 char[] a;
 a.foo();
  is also implemented for this type, then it becomes extensible

And, just then, the vehicle swerved off the road and over a cliff. Bon voyage.

I take it you don't like this feature? or.. I don't mind either way: a) string foo; foo.method(); b) string foo; method(foo); but then, I'm a C programmer by trade. Regan
Nov 24 2005
next sibling parent reply kris <fu bar.org> writes:
Regan Heath wrote:
 On Thu, 24 Nov 2005 20:22:53 -0800, kris <fu bar.org> wrote:
 
 Regan Heath wrote:

 My goal is a built-in type which can store strings in any of the 3 
 UTF   encodings, when sliced will give characters (as opposed to 
 character   fragments)

That would be require 32bits, then. A dchar.

Yes.

So, just use dchar. All it needs are properties to convert it to utf8 and utf16. Wait! You don't need any properties either, since you can use that awful hack below for those purposes <g> Seriously, the extent of what you appear to propose can be done right now, in multiple different ways. No compiler changes required. I'd like to see true properties for UTF transcoding, but that would just be convenient. There's already sufficient to build upon, assuming one would do the necessary research to construct a great API.
 
 Have you had a close look at std.format.doFormat and std.stdio.writefx?  
 Have you noticed that UTF-8 characters are all transcoded to individual  
 dchars then transcoded back to UTF-8 to be output?

I'm rather surprised that wasn't already widely known.
 This doesn't proove anything but it suggests that using a dchar sized  
 variable for characters will have little or no real effect on  
 performance.. 

Pardon me, but this sounds a bit naive. One has to consider the use case involved ~ printf() can hardly be considered a high-performance, uh, anything. The goal is convenience, not speed (though the writef design could certainly be improved upon quite dramatically). Your above statement is trying to extrapolate an equivalent measure of acceptability in the general case. That doesn't hold up to much scrutiny, IMO. Confusing convenience with acceptable performance is a mistake.
 maybe, a conclusive test should really be made.

A conclusive test of what? This thing about writef is a total red herring. Horses for courses.
 but then, I'm a C programmer by trade.

C makes a great language to write nicely structured OO-style code. Don't knock it <g> Some would claim it's also more maintainable than C++ :-)
Nov 24 2005
parent reply "Regan Heath" <regan netwin.co.nz> writes:
On Thu, 24 Nov 2005 21:13:34 -0800, kris <fu bar.org> wrote:
 Regan Heath wrote:
 On Thu, 24 Nov 2005 20:22:53 -0800, kris <fu bar.org> wrote:

 Regan Heath wrote:

 My goal is a built-in type which can store strings in any of the 3  
 UTF   encodings, when sliced will give characters (as opposed to  
 character   fragments)

That would be require 32bits, then. A dchar.


So, just use dchar.

The advantage the type I'm imagining would have is the ability to store the data as UTF-8 internally. (like my class can). Characters would only exist as dchar sized units rarely i.e. when you actually indexed the string or asking it for them, one at a time. (like my class does).
 Seriously, the extent of what you appear to propose can be done right  
 now, in multiple different ways. No compiler changes required.

Yes, with a class, like I posted. But the syntax could be much nicer if it was built in, and if it came standard (built in or as part of the library) the other 3 array types could fade into obscurity, i.e. only get used when accessing code fragments was desired. It should mean that everyone writing code in D would use it and not one of the other 3, meaning we get no more "this library uses char[]" but "this library uses wchar[]" problems and no more "I have to write 3 functions one for each char type" problems either. Regan.
Nov 24 2005
parent reply kris <fu bar.org> writes:
Regan Heath wrote:
 On Thu, 24 Nov 2005 21:13:34 -0800, kris <fu bar.org> wrote:
 
 Regan Heath wrote:

 On Thu, 24 Nov 2005 20:22:53 -0800, kris <fu bar.org> wrote:

 Regan Heath wrote:

 My goal is a built-in type which can store strings in any of the 3  
 UTF   encodings, when sliced will give characters (as opposed to  
 character   fragments)

That would be require 32bits, then. A dchar.

Yes.

So, just use dchar.

The advantage the type I'm imagining would have is the ability to store the data as UTF-8 internally. (like my class can). Characters would only exist as dchar sized units rarely i.e. when you actually indexed the string or asking it for them, one at a time. (like my class does).
 Seriously, the extent of what you appear to propose can be done right  
 now, in multiple different ways. No compiler changes required.

Yes, with a class, like I posted. But the syntax could be much nicer if it was built in, and if it came standard (built in or as part of the library) the other 3 array types could fade into obscurity, i.e. only get used when accessing code fragments was desired. It should mean that everyone writing code in D would use it and not one of the other 3, meaning we get no more "this library uses char[]" but "this library uses wchar[]" problems and no more "I have to write 3 functions one for each char type" problems either. Regan.

Yep, it's clear what you're after. And you're not the first to try. But you won't get there by ignoring the problems inherent in building a compromise. This whole subject needs some serious research, rather than chit chat in a NG. Better to look at how it's done everywhere else, and learn how that could be adapted appropriately? This is a wheel that's been invented before, by those with far more expertise than you or I will likely ever have in this field. It ain't hard to put together a useful String class. Making it extensible is easy too, given tools like interfaces and class inheritance. Designing it with respect to performance and immutability are also not so tough (though D badly needs read-only arrays). What's really hard is getting the initial set of compromises worked out, as I keep repeating. Then comes the hard work of dealing with the edge-conditions, special cases, unexpected gotchas and, in some cases, just plain old grey-matter and hard work. You mentioned before that this built-in notion would somehow interface with ICU? Well, that would be a consideration. But first you need to review how ICU, and other packages like it, operate before assuming some binding to a native type (other than a class) could make it an attractive marriage. I strongly suspect, based on experience, that you'd end up with a class-based interface anyway. And why not? What on earth is wrong with classes? Especially when they're native to the language?
Nov 24 2005
next sibling parent reply "Regan Heath" <regan netwin.co.nz> writes:
On Thu, 24 Nov 2005 22:19:33 -0800, kris <fu bar.org> wrote:

<snip good advice>

 I stongly suspect, based on experience, that you'd end up with a  
 class-based interface anyway. And why not? What on earth is wrong with  
 classes? Especially when they're native to the language?

To answer that question you have to ask "what is the difference between a class and the built in array types?". Regan
Nov 24 2005
parent reply kris <fu bar.org> writes:
Regan Heath wrote:
 On Thu, 24 Nov 2005 22:19:33 -0800, kris <fu bar.org> wrote:
 
 <snip good advice>
 
 I stongly suspect, based on experience, that you'd end up with a  
 class-based interface anyway. And why not? What on earth is wrong 
 with  classes? Especially when they're native to the language?

To answer that question you have to ask "what is the difference between a class and the built in array types?". Regan

You don't know? :-) If I get your drift, the question should perhaps be thus: at what point of complexity does it become generally acceptable to leave native types behind. Everyone seems to have different opinion. What do you expect? The key to powerful, easy-to-use, practical, and extensible Unicode handling is, IMO, far away on the other side of that divide. I suspect/hope you'd ultimately agree. Since this thread is called "String theory by example", I'll encourage those interested to take a critical look at the ICU project here: http://icu.sourceforge.net/userguide/ and the D wrappers over here: http://svn.dsource.org/projects/mango/trunk/mango/icu/ No, I'm not saying that ICU is the "way and the truth". But one has to start researching somewhere.
Nov 24 2005
next sibling parent reply "Regan Heath" <regan netwin.co.nz> writes:
On Thu, 24 Nov 2005 23:12:45 -0800, kris <fu bar.org> wrote:
 Regan Heath wrote:
 On Thu, 24 Nov 2005 22:19:33 -0800, kris <fu bar.org> wrote:
  <snip good advice>

 I stongly suspect, based on experience, that you'd end up with a   
 class-based interface anyway. And why not? What on earth is wrong  
 with  classes? Especially when they're native to the language?

 To answer that question you have to ask "what is the difference between a class and the built in array types?". Regan

You don't know? :-) If I get your drift, the question should perhaps be thus: at what point of complexity does it become generally acceptable to leave native types behind.

Yes, or rather to what degree should the built in types go in order to support feature X. X in this case being string handling and/or unicode string handling. What I'd like is for the built in types to go as far as providing support for indexing characters in strings regardless of the encoding(*). The reason I think that is the degree to which it should go is that once it does that anyone can write a function in D which will correctly handle any string in any encoding(*) without having to think about UTF code fragments and the problems associated with that. (*) The 3 UTF encodings are all it needs to support. Other encodings should be handled by libraries i.e. ICU.
 The key to powerful, easy-to-use, practical, and extensible Unicode  
 handling is, IMO, far away on the other side of that divide. I  
 suspect/hope you'd ultimately agree.

A complete solution is certainly, as you say, something a library should handle. But I don't want a complete solution, just a small step really. Regan
Nov 24 2005
parent reply John Reimer <terminal.node gmail.com> writes:
I have a proposal... okay it's not about strings.  After trying to 
follow all these posts, I now can say I'm thoroughly confused about 
everything UTF-like, unicodish, pointless, valuable, or 
characteristically encoded in 8, 16, and 32 discrete portions.

I propose a name change to this thread.

String Theory by Relentless Debate

Regan you're a great guy, but you sure are insatiably persistent!
Kris, is it worth it? I don't think it's getting through to him yet. :)

Cheers to both of you!

-JJR
Nov 25 2005
next sibling parent reply kris <fu bar.org> writes:
John Reimer wrote:
 I propose a name change to this thread.
 
 String Theory by Relentless Debate

:-D
 
 Regan you're a great guy, but you sure are insatiably persistant!
 Kris, is it worth it? I don't think it's getting through to him yet. :)

We're learning how to be nice to each other ;-)
 Cheers to both of you!

To you too ~ you don't frequent here as much as you once did. That's a shame.
Nov 25 2005
parent "Regan Heath" <regan netwin.co.nz> writes:
On Fri, 25 Nov 2005 00:47:25 -0800, kris <fu bar.org> wrote:
 John Reimer wrote:
 I propose a name change to this thread.
  String Theory by Relentless Debate

:-D
  Regan you're a great guy, but you sure are insatiably persistant!
 Kris, is it worth it? I don't think it's getting through to him yet. :)

We're learning how to be nice to each other ;-)

Yeah, seems to be working. Regan
Nov 25 2005
prev sibling next sibling parent "Regan Heath" <regan netwin.co.nz> writes:
On Fri, 25 Nov 2005 00:04:06 -0800, John Reimer <terminal.node gmail.com>  
wrote:
 I have a proposal... okay it's not about strings.  After trying to  
 follow all these posts, I now can say I'm thoroughly confused about  
 everything UTF-like, unicodish, pointless, valueable, or  
 characteristically encoded in 8, 16, and 32 discrete portions.

 I propose a name change to this thread.

 String Theory by Relentless Debate

 Regan you're a great guy, but you sure are insatiably persistant!

Fair comment.
 Kris, is it worth it? I don't think it's getting through to him yet. :)

I sure wish I knew what "it" was :) Regan
Nov 25 2005
prev sibling parent reply Georg Wrede <georg.wrede nospam.org> writes:
John Reimer wrote:
 I have a proposal... okay it's not about strings.  After trying to 
 follow all these posts, I now can say I'm thoroughly confused about 
 everything UTF-like, unicodish, pointless, valueable, or 
 characteristically encoded in 8, 16, and 32 discrete portions.

Yes, and I bet a bunch of other folks who don't write are even more confused. And that gets us conveniently to the exact point of all this: we who carry on this debate, do it precisely so that future D users could gain a few things: 1. Not get drowned in the current utf maze of glass walls and mirrors. 2. Real Soon Now, be able to do their coding without being forced to know a single thing about utf. 3. Not have downright disinformation stuffed down their throats by D documentation, specs, or the existing choice of data types in D. 4. Get rid of all the gotchas (especially the unobvious) hidden in the current framework of what appears to be character types and handling. At least I personally expect this whole "utf" issue to be over and done with, in a couple of weeks. (Heh, knowing SW projects, that probably means before year's end.) Once this is fixed, we have a _much_ smoother API, both factually, but especially in concept. And then -- we'll not hear a word about utf during the entire next year. 8-| And you know what: I actually think we don't have to do very much coding to get that done. Most of the issues here are with the language spec, removing and renaming existing datatypes. The major part of needed code actually exists already within Phobos, so this is technically trivial (the others in this ng. may not agree, but I think so).
 I propose a name change to this thread.
 
 String Theory by Relentless Debate

Nice quip! :-) And hey, I guess most of the already bewildered have skipped this thread for ages ago. It's not only the precisely wrong thing to read if one needs to learn about International Character Set Issues, this thread is downright counter productive for that.
 Regan you're a great guy, but you sure are insatiably persistant! 

While I may be a thinker, visionary and a loudmouth, at least Regan is one who gets things done! And he's persistent, I agree!
 Kris, is it worth it? I don't think it's getting through to him yet.

Well, IMHO, Kris and Regan have been talking about apples and oranges, without either noticing. Regan is talking about this utf thing in terms of what we here have been discussing, while Kris means the entire ICU issue. At least I believe this is so, and that they've not necessarily believed the other one understands the ("same") issue. (Kris, Regan, correct me if I'm wrong here.) And John, I assume you think, taking on the whole ICU issue (I'm using a wrong term here, I know, but you know what I mean) is a little too big job for us, right? Which I wholeheartedly agree with. More specifically, the ICU thing is not something I believe D should even tackle. For the next couple of years, I think *those* application programmers who care about such, should use a library (like ICU, or whatever). What D provides will be adequate UTF handling -- as far as slicin' n' streamin' are concerned, nothing fancier. After a couple of years, we can always check the issue again. Maybe by that time a bunch of now broken issues have been settled, maybe there actually is some need for such functionality, maybe by that time they have stopped fighting with the available compilers (bugs), maybe... Then we can check it out. Upon our solid (but not all encompassing) basement, everybody can build a Galaxy Wide Character set. But we do need the basement first, and it has to be solid.
Nov 25 2005
parent reply John Reimer <terminal.node gmail.com> writes:
Georg Wrede wrote:
 John Reimer wrote:
 
 I have a proposal... okay it's not about strings.  After trying to 
 follow all these posts, I now can say I'm thoroughly confused about 
 everything UTF-like, unicodish, pointless, valueable, or 
 characteristically encoded in 8, 16, and 32 discrete portions.

Yes, and I bet a bunch of other folks who don't write are even more confused. And that gets us conveniently to the exact point of all this: we who carry on this debate, do it precisely so that future D users could gain a few things: 1. Not get drowned in the current utf maze of glass walls and mirrors. 2. Real Soon Now, be able to do their coding without being forced to know a single thing about utf. 3. Not have downright disinformation stuffed down their throats by D documentation, specs, or the existing choice of data types in D. 4. Get rid of all the gotchas (especially the unobvious) hidden in the current framework of what appears to be character types and handling.

I don't doubt in the least the importance of this debate. Despite being unable to wholly understand even half the material presented, I respect the necessity of the wrangling... although in Kris' and Regan's case, I don't think it was getting anywhere.
 At least I personally expect this whole "utf" issue to be over and done 
 with, in a couple of weeks. (Heh, knowing SW projects, that probably 
 means before year's end.) Once this is fixed, we have a _much_ smoother 
 API, both factually, but especially in concept. And then -- we'll not 
 hear a word about utf during the entire next year. 8-|

Really? Such optimism! :) Who says the smoother API will be agreed upon or even adopted? I hope it does, whatever that API is or wherever that API currently exists. If that API resides in ICU, then there's a stopgap solution until people make up their minds (specifically people like Walter). If otherwise, then a solution will be long in coming, I think, and will be debated until the end of our days. Although, I'm still curious to know why people think they can change D without Walter's input in the matter. I expect you people are planning on making a submission to phobos or something and hoping Walter will agree?
 And you know what: I actually think we don't have to do very much coding 
 to get that done. Most of the issues here are with the language spec, 
 removing and renaming existing datatypes. The major part of needed code 
 actually exists already within Phobos, so this is technically trivial 
 (the others in this ng. may not agree, but I think so).

Could be. I don't know much to agree or not... but I'm always hopeful. :D
 I propose a name change to this thread.

 String Theory by Relentless Debate

Nice quip! :-) And hey, I guess most of the already bewildered have skipped this thread ages ago. It's not only the precisely wrong thing to read if one needs to learn about International Character Set Issues, this thread is downright counterproductive for that.

Ho.. yeah. I don't think it helped me. I tried reading some of this but it certainly just confused the issue for me. But that's okay. The purpose of the thread wasn't for education of UTF novices... I can accept that. :)
 Regan you're a great guy, but you sure are insatiably persistent! 

While I may be a thinker, visionary and a loudmouth, at least Regan is one who gets things done! And he's persistent, I agree!

What can I say? this board wouldn't be the same without gents like you two. This topic is an obvious necessity. Discourse is important.
 Kris, is it worth it? I don't think it's getting through to him yet.

Well, IMHO, Kris and Regan have been talking about apples and oranges, without either noticing. Regan is talking about this utf thing in terms of what we here have been discussing, while Kris means the entire ICU issue. At least I believe this is so, and that they've not necessarily believed the other one understands the ("same") issue. (Kris, Regan, correct me if I'm wrong here.)

I'm not so sure they're on different wavelengths. I think they merely see a different solution to the same problem. One sees a broad solution that is feasible in the current D universe. The other sees a narrower one that requires an intimate influence over the development of D. I'll let you decide which is which. Yet the debate is only as productive as far as the language can be influenced; so, of necessity, only one will be more successful than the other, no matter which solution is best. Nonetheless, it is good to discuss alternative solutions, on the off chance that language change may happen. But it appears that any sort of change will create a tremendously large commitment to solving an extremely complicated problem. Have we got the language designer behind us on this? Because if we don't, it'll be the toughest ride of our lives.
 
 And John, I assume you think, taking on the whole ICU issue (I'm using a 
 wrong term here, I know, but you know what I mean) is a little too big 
 job for us, right? Which I wholeheartedly agree with.

Well, it is. But, I certainly respect the necessity of the discussion. I get a little listless, though, wondering whether it's going anywhere.
 More specifically, the ICU thing is not something I believe D should 
 even tackle. For the next couple of years, I think *those* application 
 programmers who care about such, should use a library (like ICU, or 
 whatever). What D provides will be adequate UTF handling -- as far as 
 slicin' n' streamin' are concerned, nothing fancier. After a couple of 
 years, we can always check the issue again. Maybe by that time a bunch 
 of now broken issues have been settled, maybe there actually is some 
 need for such functionality, maybe by that time they have stopped 
 fighting with the available compilers (bugs), maybe... Then we can check 
 it out.

That's a fair assessment of the situation. That makes much more sense.
 Upon our solid (but not all encompassing) basement, everybody can build 
 a Galaxy Wide Character set. But we do need the basement first, and it 
 has to be solid.
 

Yes. I can appreciate that. Thanks for your remarks, Georg. It's always a pleasure. -John
Nov 25 2005
next sibling parent reply "Kris" <fu bar.com> writes:
Long post, with code examples.

Originally thought I was responding to John, but that might not be the case 
anymore. It's just a general heads'-up kind of reply on this lengthy topic. 
No offence intended to anyone.


"John Reimer" <terminal.node gmail.com> wrote
 Georg Wrede wrote:
 John Reimer wrote:

 I have a proposal... okay it's not about strings.  After trying to 
 follow all these posts, I now can say I'm thoroughly confused about 
 everything UTF-like, unicodish, pointless, valuable, or 
 characteristically encoded in 8, 16, and 32 discrete portions.

Yes, and I bet a bunch of other folks who don't write are even more confused. And that gets us conveniently to the exact point of all this: we who carry on this debate, do it precisely so that future D users could gain a few things: 1. Not get drowned in the current utf maze of glass walls and mirrors. 2. Real Soon Now, be able to do their coding without being forced to know a single thing about utf. 3. Not have downright disinformation stuffed down their throats by D documentation, specs, or the existing choice of data types in D. 4. Get rid of all the gotchas (especially the unobvious) hidden in the current framework of what appears to be character types and handling.


============== I certainly don't wish to discourage anyone from writing a useful String API. Quite the opposite, in fact. To that end, I'll add some long-winded concerns to watch out for. 1. It seems reasonable that one should come up with an abstraction of what the String should do, using either an abstract class or an interface. This eliminates ctor() considerations completely, and permits pretty much anyone to write a compatible String. Compatibility is important if this is going to become fundamental to D. So, write an abstract specification; and then provide a rudimentary concrete class that implements the spec. Perhaps a simple dchar[] implementation? Anyway, onto an example specification: 2. Suppose we have this: ~~~~~~~~~~~~ class String { // some read only methods abstract bool startsWith (String); abstract bool endsWith (String); abstract int indexOf (String, int start=0); // transcoding methods char* cStr(); char[] utf8(); wchar[] utf16(); dchar[] utf32(); ... // some mutating methods abstract void prepend (String); abstract void append (String); abstract void setCharAt (int index, dchar chr); ... } ~~~~~~~~~~~~~ There are immediately three things to note. (a) many arguments are other instances of String, since otherwise you'd have to provide char/wchar/dchar instances of every method (what you're trying to avoid, right?). (b) the setCharAt() method takes a dchar. How do you avoid that without wrapping dchar too? I don't think it would be practical, so dchar it stays? (c) the operations noted explicitly avoid certain functionality that would add seriously to the complexity of a basic implementation (add your favourite collation-sequence example here). On the other hand, one can hide such nasties with careful choice of methods. For example, one could add a trimWhitespace() method, which can be implemented without the requirement of full Unicode character classification. The point is to be careful about the methods chosen. 3. 
Notice the distinction between read-only and mutating methods. To assist in writing deterministic (and performant) multi-threaded code, it would be advantageous to split the specification into mutable and non-mutable variations (I'll assume the benefits of doing so are acknowledged) ~~~~~~~~~~ class String // a read-only String { // some read only methods abstract bool startsWith (String); abstract bool endsWith (String); abstract int indexOf (String, int start=0); // transcoding methods char* cStr(); char[] utf8(); wchar[] utf16(); dchar[] utf32(); .. } class MutableString : String // a modifiable String { // some mutating methods abstract void prepend (String); abstract void append (String); abstract void setCharAt (int index, dchar chr); ... } ~~~~~~~~~~~ Now you can pass either type to a method that accepts the read-only String, yet can be somewhat assured of the intent when a called function expects a MutableString as an argument (it is expecting to change the darned thing <g>), and the compiler will catch such mismatches appropriately. 4. Using abstract classes is cool, but it limits the ability of someone trying to build compatible, alternate, implementations: they'd be limited in what to use as a base-class. To open up the compatability aspect, we adopt interfaces instead: ~~~~~~~~~~~~~ interface IString { // some read only methods bool startsWith (String); bool endsWith (String); int indexOf (String, int start=0); // transcoding methods char* cStr(); char[] utf8(); wchar[] utf16(); dchar[] utf32(); .. } interface IMutableString : IString { // some mutating methods void prepend (String); void append (String); void setCharAt (int index, dchar chr); ... } ~~~~~~~~~~~ At this point, there's little stopping another developer making a compatible implementation, yet with completely different internals (and ctors) than the original reference implementation. For example: the ICU wrappers could implement these interfaces, and Hey Presto! 
Full compatibility with the basic specification! 5. So here's where some little gotchas come into play: (a) Notice that the transcoding routines should never be providing access to the internal content? After all, it's read-only. This is where a read-only attribute would come in handy; e.g. "readonly char[] utf8();". The upside here is that the class could 'cache' the utf8 transcoding, or not transcode at all if the implementation is native utf8. Unfortunately, D currently expects the recipient to "play by the rules" of CoW; something that is completely unenforceable by the class designer. String is just the kind of class that needs readonly support. Let's hope that support comes along soon (and, yes, it can be done with CoW ~ but that's not enforceable). (b) Notice also that the prepend() and append() methods take a String as an argument. They do this such that alternate implementations are allowed to play too. However, this requires any implementation of append(String) to call one of the transcoder methods of its argument, to get the appending content. From a functional perspective, this is wonderful. From a performance perspective it's not. This is another reason why a String class might 'cache' its transcodings. Again, there's the readonly concern, since CoW is not enforceable by the class designer.
 At least I personally expect this whole "utf" issue to be over and done 
 with, in a couple of weeks. (Heh, knowing SW projects, that probably 
 means before year's end.) Once this is fixed, we have a _much_ smoother 
 API, both factually, but especially in concept. And then -- we'll not 
 hear a word about utf during the entire next year. 8-|

Really? Such optimism! :) Who says the smoother API will be agreed upon or even adopted? I hope it does, whatever that API is or wherever that API currently exists. If that API resides in ICU, then there's a stopgap solution until people make up their minds (specifically people like Walter). If otherwise, then a solution will be long in coming, I think, and will be debated until the end of our days. Although, I'm still curious to know why people think they can change D without Walter's input in the matter. I expect you people are planning on making a submission to phobos or something and hoping Walter will agree?

Some very real concerns. Trying to get the NG to agree on anything has historically been a notable waste of time. Example: I don't expect many people to agree with the considerations laid out above. Getting Walter to agree on something even where there's group consensus has, at times, proved futile in the past also. However, this is presumably a Phobos submission rather than a language change? Big difference there. The reason such a String class does not exist today (in Phobos) is that nobody could agree if they even wanted a class implementation; let alone what it should do! :) I hope something good happens here. But ain't holding my breath for long <g>
 Well, IMHO, Kris and Regan have been talking about apples and oranges, 
 without either noticing.

 Regan is talking about this utf thing in terms of what we here have been 
 discussing, while Kris means the entire ICU issue. At least I believe 
 this is so, and that they've not necessarily believed the other one 
 understands the ("same") issue. (Kris, Regan, correct me if I'm wrong 
 here.)


I do hope it's perfectly clear by now that I've been saying "take a look at the bigger picture first!" all along?
 And John, I assume you think, taking on the whole ICU issue (I'm using a 
 wrong term here, I know, but you know what I mean) is a little too big 
 job for us, right? Which I wholeheartedly agree with.

Well, it is. But, I certainly respect the necessity of the discussion. I get a little listless, though, wondering whether it's going anywhere.

It's a misconception that my stance is about adopting ICU, so I hope this has been clarified for all who might have felt that way <g>
 More specifically, the ICU thing is not something I believe D should even 
 tackle. For the next couple of years, I think *those* application 
 programmers who care about such, should use a library (like ICU, or 
 whatever). What D provides will be adequate UTF handling -- as far as 
 slicin' n' streamin' are concerned, nothing fancier. After a couple of 
 years, we can always check the issue again. Maybe by that time a bunch of 
 now broken issues have been settled, maybe there actually is some need 
 for such functionality, maybe by that time they have stopped fighting 
 with the available compilers (bugs), maybe... Then we can check it out.

That's a fair assessment of the situation. That makes much more sense.

Again; to do a good job we have to take such things into account <g> Really thought I would be replying to John here, but it turned out otherwise. Hope that's OK with JJR?
Nov 25 2005
next sibling parent reply John Reimer <terminal.node gmail.com> writes:
Kris wrote:
 Long post, with code examples.

<snip long post>
 
More specifically, the ICU thing is not something I believe D should even 
tackle. For the next couple of years, I think *those* application 
programmers who care about such, should use a library (like ICU, or 
whatever). What D provides will be adequate UTF handling -- as far as 
slicin' n' streamin' are concerned, nothing fancier. After a couple of 
years, we can always check the issue again. Maybe by that time a bunch of 
now broken issues have been settled, maybe there actually is some need 
for such functionality, maybe by that time they have stopped fighting 
with the available compilers (bugs), maybe... Then we can check it out.

That's a fair assessment of the situation. That makes much more sense.

Again; to do a good job we have to take such things into account <g> Really thought I would be replying to John here, but it turned out otherwise. Hope that's OK with JJR?

This is perfectly fine, Kris, and I thank you for it. I think you've clarified your perspective well here. Really, I think we were referring to "ICU" rather loosely here as a _symbol_ representing a comprehensive unicode solution for D, which would be a major undertaking. That said, I've always liked the idea of a solid String class, something that could be built upon or expanded over time. Your sample specification is food for thought. If that's the type of API that people could agree upon, then I think the D community can get somewhere. Georg, when you mentioned an API, is that the general idea to which you were referring? Or did you mean something else? Regan, your thoughts? In the past, quite a few people in the community rejected the idea of a string class; they said it wasn't necessary, or they didn't want any string management turning Object Oriented. Their resistance perplexed me because I figured there could be only benefits to adopting such a package. Those that didn't want to use it could stick to the basic D types. Another benefit of adopting a string package is that it can be a ready addition to Phobos. And that, like you said Kris, is much more likely to happen than any promotions for language changes. -JJR
Nov 25 2005
next sibling parent "Regan Heath" <regan netwin.co.nz> writes:
------------Xr2wCNjwY35HIecq3niYHO
Content-Type: text/plain; format=flowed; delsp=yes; charset=iso-8859-15
Content-Transfer-Encoding: 8bit

On Fri, 25 Nov 2005 14:31:05 -0800, John Reimer <terminal.node gmail.com>  
wrote:
 Kris wrote:
 Long post, with code examples.

<snip long post>
 More specifically, the ICU thing is not something I believe D should  
 even tackle. For the next couple of years, I think *those*  
 application programmers who care about such, should use a library  
 (like ICU, or whatever). What D provides will be adequate UTF  
 handling -- as far as slicin' n' streamin' are concerned, nothing  
 fancier. After a couple of years, we can always check the issue  
 again. Maybe by that time a bunch of now broken issues have been  
 settled, maybe there actually is some need for such functionality,  
 maybe by that time they have stopped fighting with the available  
 compilers (bugs), maybe... Then we can check it out.

That's a fair assessment of the situation. That makes much more sense.

Really thought I would be replying to John here, but it turned out otherwise. Hope that's OK with JJR?

This is perfectly fine, Kris, and I thank you for it. I think you've clarified your perspective well here. Really, I think we were referring to "ICU" rather loosely here as a _symbol_ representing a comprehensive unicode solution for D, which would be a major undertaking.

Not just a "unicode" solution but a solution for all major character encodings. It's waaay more than I am looking for at this point in time. D can already convert between the 3 UTF encodings it uses and I'm not looking for anything more at this stage.
 That said, I've always liked the idea of a solid String class, something  
 that could be built upon or expanded over time.  Your sample  
 specification is food for thought.  If that's the type of API that  
 people could agree upon, then I think the D community can get somewhere.  
   Georg, when you mentioned an API, is that the general idea to which  
 you were referring?  Or did you mean something else? Regan, your  
 thoughts?

I'm looking at this from a slightly different angle. I don't want a class with an API which defines methods like "startsWith" and "endsWith" etc. I think they're un-necessary at this stage and here's why... The first stage, to my mind is being able to index and slice complete characters as opposed to fragments of characters and to be able to do this regardless of the actual encoding used to store the data. For example: string test = "smörgåsbord"; assert(test[2] == 'ö'); Regardless of whether this is stored in UTF-8, UTF-16 or UTF-32 this should just work. (the string class I posted to start this thread can do this) Once we can do this, we can write "startsWith" and "endsWith" trivially. bool startsWith(string s, string text) {} bool endsWith(string s, string text) {} Provided D supports its array method calling feature here too, we could call these like so: string s; s.startsWith(new string("test")); Which should hopefully keep the people who prefer the object call style happy. Further, if "string" becomes a built in type this becomes: string s; s.startsWith("test"); I now see this "string" type as an addition to the language, not as a replacement for char[], wchar[] and dchar[]. "string" would ideally become the type everyone used for general purpose string handling, only in areas where the encoding itself or code fragments were important would people use the char[], wchar[] or dchar[] types and then some of that could be handled by having a "string" which can use an encoding specified at run time (as opposed to only compile time like the class I posted) My latest "string" class effort is attached. It is by no means exactly what I envision, it's more of a "test the theory", "explore the consequences" sort of thing. As Kris said, there are consequences and trade-offs, speed for space etc. I hope for a "string" which can provide the trade offs each person and situation desires, and if not, we still have char[], wchar[] and dchar[] to fall back to.
 In the past, quite a few people in the community rejected the idea of a  
 string class; they said it wasn't necessary, or they didn't want any  
 string management turning Object Oriented.  Their resistance perplexed  
 me because I figured there could be only benefits to adopting such a  
 package. Those that didn't want to use it could stick to the basic D  
 types.

These are all reasons why I think it should be a built in type, and why we should not define an class-style API at this stage. Regan ------------Xr2wCNjwY35HIecq3niYHO Content-Disposition: attachment; filename=string.d Content-Type: application/octet-stream; name=string.d Content-Transfer-Encoding: 8bit /+ Module: string Author: Regan Heath Date : 26/11/2005 +/ module string; import std.utf; import std.stdio; import std.string; class stringT(T:T[]) { private T[] data; // convert data to internal type +/ private template convert(S:S[]) { private T[] convert(S[] value) { static if (is(T == char)) return toUTF8 (value); static if (is(T == wchar)) return toUTF16(value); static if (is(T == dchar)) return toUTF32(value); } } // replaces the character specified by range (start,end) with a new character u private void assignChar(uint start, uint end, char u) { T[] tmp; std.utf.encode(tmp,u); assignChar(start,end,tmp); } // replaces the character specified by range (start,end) with a new character u private void assignChar(uint start, uint end, wchar u) { T[] tmp; std.utf.encode(tmp,u); assignChar(start,end,tmp); } // replaces the character specified by range (start,end) with a new character u private void assignChar(uint start, uint end, dchar u) { T[] tmp; std.utf.encode(tmp,u); assignChar(start,end,tmp); } // replaces the character specified by range (start,end) with a new character tmp private void assignChar(uint start, uint end, T[] tmp) { int x; if (tmp.length == end-start) data[start..end] = tmp[]; else { x = (end-start) - tmp.length; if (x > 0) { memmove(data.ptr + (T.sizeof * (end-x)), data.ptr + (T.sizeof * end), (T.sizeof * (data.length-start))); data.length = data.length - x; data[start..end-x] = tmp[]; } else { x = -x; data.length = data.length + x; memmove(data.ptr + (T.sizeof * (end+x)), data.ptr + (T.sizeof * end), (T.sizeof * (data.length-(end+x)))); data[start..end+x] = tmp[]; } } } // properties for trancoding between UTF encodings char[] utf8 () { 
static if(is(T == char)) return data; return toUTF8(data); } wchar[] utf16() { static if(is(T == wchar)) return data; return toUTF16(data); } dchar[] utf32() { static if(is(T == dchar)) return data; return toUTF32(data); } // properties for trancoding between UTF encodings char[] utf8 ( char[] value) { static if(is(T == char)) data = value; data = convert!(char[])(value); return utf8; } wchar[] utf16(wchar[] value) { static if(is(T == wchar)) data = value; data = convert!(wchar[])(value); return utf16; } dchar[] utf32(dchar[] value) { static if(is(T == dchar)) data = value; data = convert!(dchar[])(value); return utf32; } // construction from any UTF encoding this( char[] value) { utf8 = value; } this(wchar[] value) { utf16 = value; } this(dchar[] value) { utf32 = value; } // Indexing always gives a dchar dchar opIndex(uint index) { static if(is(T == dchar)) { if(index >= data.length) throw new Error("String bounds error"); return data[index]; } else { foreach(uint i, dchar c; data) if (i == index) return c; throw new Error("String bounds error"); } } // Index assignment always takes a dchar and returns a dchar dchar opIndexAssign(dchar u, uint index) { static if(is(T == dchar)) { if (index >= data.length) throw new Error("String bounds error"); data[index] = u; } else { uint idx,start,end; dchar c; for(end = idx = start = 0; end < data.length; idx++, start = end) { c = std.utf.decode(data,end); if (idx == index) { if (u != c) assignChar(start,end,u); break; } } if (idx != index) throw new Error("String bounds error"); } return u; } // slicing happens on character boundaries, always returning complete characters // the slice result is not a copy, yay! 
stringT opSlice(uint ss, uint se) { static if(is(T == dchar)) { if (ss < 0) throw new Error("String bounds error"); if (se >= data.length) throw new Error("String bounds error"); if (ss > se) throw new Error("String bounds error"); return new string(data[ss..se]); } else { uint end,idx,start; uint ssi,sei; dchar c; for(end = idx = start = 0; end < data.length; idx++, start = end) { c = std.utf.decode(data,end); if (idx == ss) ssi = start; if (idx == se) { sei = start; break; } } if (idx != se) throw new Error("String bounds error"); return new string(data[ssi..sei]); } } // returns a new string, with a copy of the data involved stringT opCat(stringT rhs) { return new stringT(data ~ rhs.data); } // appends the new data to the current data stringT opCatAssign(stringT rhs) { data ~= rhs.data; return this; } // Read only char foreach int opApply(int delegate(char) dg) { int r = 0; foreach(char c; data) { r = dg(c); if (r) break; } return r; } // Read only wchar foreach int opApply(int delegate(wchar) dg) { int r = 0; foreach(wchar c; data) { r = dg(c); if (r) break; } return r; } // Read only dchar foreach int opApply(int delegate(dchar) dg) { int r = 0; foreach(dchar c; data) { r = dg(c); if (r) break; } return r; } // Read/write char foreach int opApply(int delegate(inout char) dg) { int r = 0; static if(is(T == char)) { foreach(inout T c; data) { r = dg(c); if (r) break; } } else { char[] tmp; foreach(uint i, T c; data) { tmp.length = 0; std.utf.encode(tmp,c); foreach(inout char c; tmp) { r = dg(c); if (r) break; } if (r) break; assignChar(i,i+1,convert!(char[])(tmp)); } } return r; } // Read/write wchar foreach int opApply(int delegate(inout wchar) dg) { int r = 0; static if(is(T == wchar)) { foreach(inout T c; data) { r = dg(c); if (r) break; } } else { wchar[] tmp; foreach(uint i, T c; data) { tmp.length = 0; std.utf.encode(tmp,c); foreach(inout wchar c; tmp) { r = dg(c); if (r) break; } if (r) break; assignChar(i,i+1,convert!(wchar[])(tmp)); } } return r; } // 
Read/write dchar foreach int opApply(int delegate(inout dchar) dg) { int r = 0; static if(is(T == dchar)) { foreach(inout T c; data) { r = dg(c); if (r) break; } } else { uint end,start; dchar c,u; for(end = 0; end < data.length; start = end) { c = u = std.utf.decode(data,end); r = dg(u); if (r) break; if (u != c) assignChar(start,end,u); } } return r; } // the ubiquitous dup function stringT dup() { return new string(data.dup); } // getter, string length int length() { uint len = 0; foreach(dchar c; data) len++; return len; } // setter, string length int length(int newlength) { uint nl = newlength - length(); data.length = data.length + nl; return newlength; } // can be cast to char[] char[] opCast() { static if(is(T == char)) return data; else return toUTF8(data); } // string representation char[] toString() { static if(is(T == char)) return data; else return toUTF8(data); } } //The 'space saver' encoding alias stringT!(char[]) string; //The 'compromise' encoding //alias stringT!(wchar[]) string; //The 'fast' encoding //alias stringT!(dchar[]) string; //NOTE: for this to work on the windows console you have to: // - left-click top left corner of command prompt window // - select "properties" // - select "font" // - select "Lucida Console" // - type "chcp 65001" into command prompt //and now you can finally run this example. void main() { string test = new string("smörgåsbord"c); //test string here! } ------------Xr2wCNjwY35HIecq3niYHO--
Nov 26 2005
prev sibling parent Georg Wrede <georg.wrede nospam.org> writes:
 More specifically, the ICU thing is not something I believe D
 should even tackle. For the next couple of years, I think
 *those* application programmers who care about such, should use
 a library (like ICU, or whatever). What D provides will be
 adequate UTF handling -- as far as slicin' n' streamin' are
 concerned, nothing fancier. After a couple of years, we can
 always check the issue again. Maybe by that time a bunch of now
 broken issues have been settled, maybe there actually is some
 need for such functionality, maybe by that time they have
 stopped fighting with the available compilers (bugs), maybe...
 Then we can check it out.

That's a fair assessment of the situation. That makes much more sense.

Again; to do a good job we have to take such things into account <g> Really thought I would be replying to John here, but it turned out otherwise. Hope that's OK with JJR?

This is perfectly fine, Kris, and I thank you for it. I think you've clarified your perspective well here. Really, I think we were referring to "ICU" rather loosely here as a _symbol_ representing a comprehensive unicode solution for D, which would be a major undertaking. That said, I've always liked the idea of a solid String class, something that could be built upon or expanded over time. Your sample specification is food for thought. If that's the type of API that people could agree upon, then I think the D community can get somewhere. Georg, when you mentioned an API, is that the general idea to which you were referring? Or did you mean something else? Regan, your thoughts?

Yes. Except I wasn't including the String class at the time, but I'm now picking up the can of Weider Body Building Proteins, which will be needed in the upcoming debate with Walter. ;-) Ideally, we'd have both OO and non-OO for strings. The Docs would have the Strings prominently placed, while the non-OO API would be somewhere that the casual reader doesn't stumble upon too soon. :-) The String class would be needed for normal usage of D, and the non-OO API "because D is a systems language", or whatever. And, the non-OO API would be much smaller, the excuse being that mostly Systems Programmers will use it, for other purposes than general application programming. (Slicin' n' streamin', not very much else.) (And I believe the changes needed for the non-OO api are truly minimal. The rewards however would be big.)
 In the past, quite a few people in the community rejected the idea of
 a string class; they said it wasn't necessary, or they didn't want
 any string management turning Object Oriented.  Their resistance
 perplexed me because I figured there could be only benefits to
 adopting such a package. Those that didn't want to use it could stick
 to the basic D types.

Understandable. The past was USASCII, while folks (not even noticing) already used utf. So strings-as-arrays seemed too lucrative. "Want a String class in your app? Write one into the app."
 Another benefit of adopting a string package is that it can be a
 ready addition to Phobos.

Yup. Sending Walter the diffs (and docs) would probably stand a better chance than just asking.
Nov 26 2005
prev sibling next sibling parent Georg Wrede <georg.wrede nospam.org> writes:
Kris wrote:
 Long post, with code examples.

Read it. Now it is 3:15 AM, so I won't waste my sleep and everybody else's reading bandwidth with replying to this (well thought out, and containing crucial issues) post. I hope I'll have time enough to write a decent reply already tomorrow. This post certainly deserves it. :-) g
Nov 25 2005
prev sibling parent Georg Wrede <georg.wrede nospam.org> writes:
Kris wrote:

...

 1. Not get drowned in the current utf maze of glass walls and
 mirrors.
 
 2. Real Soon Now, be able to do their coding without being forced
 to know a single thing about utf.
 
 3. Not have downright disinformation stuffed down their throats
 by D documentation, specs, or the existing choice of data types
 in D.
 
 4. Get rid of all the gotchas (especially the unobvious) hidden
 in the current framework of what appears to be character types
 and handling.



...
 1. It seems reasonable that one should come up with an abstraction
 of what the String should do, using either an abstract class or an
 interface.
 
 This eliminates ctor() considerations completely, and permits
 pretty much anyone to write a compatible String. Compatibility
 is important if this is going to become fundamental to D. So,
 write an abstract specification; and then provide a rudimentary
 concrete class that implements the spec. Perhaps a simple dchar[]
 implementation?

Sounds good.
 Anyway, onto an example specification:

 2. Suppose we have this:
 
 ~~~~~~~~~~~~
 
 class String
 {
     // some read only methods
     abstract bool startsWith (String);
     abstract bool endsWith (String);
     abstract int indexOf (String, int start=0);
 
     // transcoding methods
     char*    cStr();
     char[]    utf8();
     wchar[] utf16();
     dchar[]  utf32();
     ...
 
     // some mutating methods
     abstract void prepend (String);
     abstract void append (String);
 
     abstract void setCharAt (int index, dchar chr);
     ...
 }
 
 ~~~~~~~~~~~~~
 
 There's immediately three things to note.
 
 (a) many arguments are other instances of String, since otherwise
 you'd have to provide char/wchar/dchar instances of every method
 (what you're trying to avoid, right?).
 
 (b) the setCharAt() method takes a dchar. How do you avoid that
 without wrapping dchar too? I don't think it would be practical, so
 dchar it stays?
 
 (c) the operations noted explicitly avoid certain functionality that
 would add seriously to the complexity of a basic implementation (add
 your favourite collation-sequence example here). On the other hand,
 one can hide such nasties with careful choice of methods. For
 example, one could add a trimWhitespace() method, which can be
 implemented without the requirement of full Unicode character
 classification. The point is to be careful about the methods chosen.
 
 3. Notice the distinction between read-only and mutating methods.
 To assist in writing deterministic (and performant) multi-threaded
 code, it would be advantageous to split the specification into
 mutable and non-mutable variations (I'll assume the benefits of
 doing so are acknowledged)
 
 ~~~~~~~~~~
 
 class String    // a read-only String
 {
     // some read only methods
     abstract bool startsWith (String);
     abstract bool endsWith (String);
     abstract int indexOf (String, int start=0);
 
     // transcoding methods
     char*    cStr();
     char[]    utf8();
     wchar[] utf16();
     dchar[]  utf32();
     ..
 }
 
 
 class MutableString : String   // a modifiable String
 {
     // some mutating methods
     abstract void prepend (String);
     abstract void append (String);
 
     abstract void setCharAt (int index, dchar chr);
     ...
 }
 
 ~~~~~~~~~~~
 
 Now you can pass either type to a method that accepts the read-only
 String, yet can be somewhat assured of the intent when a called
 function expects a MutableString as an argument (it is expecting
 to change the darned thing <g>), and the compiler will catch 
 such mismatches appropriately.
 
 4. Using abstract classes is cool, but it limits the ability of
 someone trying to build compatible, alternate, implementations:
 they'd be  limited in what to use as a base-class.
 To open up the compatibility aspect, we adopt interfaces instead:
 
 ~~~~~~~~~~~~~
 
 interface IString
 {
     // some read only methods
     bool startsWith (String);
     bool endsWith (String);
     int indexOf (String, int start=0);
 
     // transcoding methods
     char*    cStr();
     char[]    utf8();
     wchar[] utf16();
     dchar[]  utf32();
     ..
 }
 
 interface IMutableString : IString
 {
     // some mutating methods
     void prepend (String);
     void append (String);
 
     void setCharAt (int index, dchar chr);
     ...
 }
 
 ~~~~~~~~~~~
 
 At this point, there's little stopping another developer making a
 compatible implementation, yet with completely different internals
 (and ctors) than the original reference implementation.
 For example: the ICU wrappers could implement these interfaces,
 and Hey Presto! Full compatibility with the basic specification!

Now this looks good! Not having looked closer at the ICU wrappers etc. yet, I can't say anything for or against the ICU-part, but you certainly made a compelling case for using interfaces.
 5. So here's where some little gotcha's come into play:
 
 (a) Notice that the transcoding routines should never be providing
 access to the internal content? After all, it's read-only. This
 is where a read-only attribute would come in handy; e.g. 
 "readonly char[] utf8();". The upside here is that the class could
 'cache' the utf8 transcoding, or not transcode
 at all if the implementation is native utf8. Unfortunately, D
 currently expects the recipient to "play by the rules" of CoW;
 something that is completely unenforceable by the class designer.
 String is just the kind of class that needs readonly support. Let's
 hope that support comes along soon
 (and, yes, it can be done with CoW ~ but that's not enforceable).
 
 (b) Notice also that the prepend() and append() methods take a String
 as an argument. They do this such that alternate implementations are
 allowed to play too. However, this requires any implementation of
 append(String) to call one of the transcoder methods of its
 argument, to get the appending content. From a functional
 perspective, this is wonderful. From a performance perspective it's
 not. This is another reason why a String class
 might 'cache' its transcodings. Again, there's the readonly concern,
 since CoW is not enforceable by the class designer.

Nov 26 2005
prev sibling parent reply Georg Wrede <georg.wrede nospam.org> writes:
John Reimer wrote:
 Georg Wrede wrote:
 
 Thanks for your remarks, Georg.  It's always a pleasure.

Good thing I always read the whole thing before commenting. :-)
Nov 25 2005
parent John Reimer <terminal.node gmail.com> writes:
Georg Wrede wrote:
 John Reimer wrote:
 
 Georg Wrede wrote:

 Thanks for your remarks, Georg.  It's always a pleasure.

Good thing I always read the whole thing before commenting. :-)

Uh, oh, I must have said something bad... :-P
Nov 25 2005
prev sibling parent Georg Wrede <georg.wrede nospam.org> writes:
kris wrote:

 Since this thread is called "String theory by example", I'll
 encourage those interested to take a critical look at the ICU project
 here:

Aaaaarrrghhh, nooooo! I've been to that place, now THAT was scary. Let's avoid that all till DMD v 1.0!
Nov 25 2005
prev sibling next sibling parent reply Georg Wrede <georg.wrede nospam.org> writes:
kris wrote:
 Regan Heath wrote:
 
 Designing it with respect to performance and immutability are also
 not so tough (though D badly needs read-only arrays).

(OT) never thought about that! Please elaborate.
 What's really hard is getting the initial set of compromises worked
 out, as I keep repeating. Then comes the hard work of dealing with
 the edge-conditions, special cases, unexpected gotcha's and, in some
 cases, just plain old grey-matter and hard work.

I take it you refer here to character classification, collation and other cans of (Unicode-related) _real_ boas and anacondas? I agree.
 You mentioned before that this built-in notion would somehow 
 interface with ICU? Well, that would be a consideration. But first 
 you need to review how ICU, and other packages like it, operate 
 before assuming some binding to a native type (other than a class) 
 could make it an attractive marriage.

******************************************* I seriously suggest, or actually ask all here: Looking at the riot we had before _any_ understanding of utf or unicode things percolated, we just HAVE TO decide that BEFORE D 2.0 we _will_not_ touch any of the above issues!! Promise, everybody? Let's only do the character widths, i/o, and polishing of the utf API (and the language spec) -- and do that well. After D 1.0 we'll have all the time in the world to do the rest of the world.
Nov 25 2005
parent reply kris <fu bar.org> writes:
Georg Wrede wrote:
 kris wrote:
 
 Regan Heath wrote:

 Designing it with respect to performance and immutability are also
 not so tough (though D badly needs read-only arrays).

(OT) never thought about that! Please elaborate.

On Read-Only arrays? Sure. One can easily design a class such that it cannot be mutated when passed from one function to another. However, when it comes to arrays, access to content by the callee is wide open to abuse. That is, if funcA wants to give funcB read-only access to a large quantity of data, one should clone the thing /just in case/ funcB mutates it. This then pervades throughout structs and classes without respect to attribute visibility. The D notion is that CoW will somehow be adhered to by the callee ~ it will be a "good" function, and clone the array before touching it. Yet this is not enforced by the compiler, to any degree. Thus the caller ends up doing the work, just to be sure. This, I'm sure you'll agree, is a bit daft. It's also a significant performance problem for server-code, or anywhere where immutability is a high priority. Anyone who regularly uses multiple threads will attest that enforced immutability is a welcoming lifeboat within a cold sea of unrest and uncertainty.
 *******************************************
 
 I seriously suggest, or actually ask all here:
 
 Looking at the riot we had before _any_ understanding of utf or unicode 
 things percolated, we just HAVE TO decide that BEFORE D 2.0 we 
 _will_not_ touch any of the above issues!!

Talk about making things compatible with certain libraries has to take some general requirements into consideration. That requires research.
Nov 25 2005
parent reply Georg Wrede <georg.wrede nospam.org> writes:
kris wrote:
 Georg Wrede wrote:
 
 kris wrote:
 
 Regan Heath wrote:
 
 Designing it with respect to performance and immutability are
 also not so tough (though D badly needs read-only arrays).

(OT) never thought about that! Please elaborate.

On Read-Only arrays? Sure. One can easily design a class such that it cannot be mutated when passed from one function to another. However, when it comes to arrays, access to content by the callee is wide open to abuse. That is, if funcA wants to give funcB read-only access to a large quantity of data, one should clone the thing /just in case/ funcB mutates it. This then pervades throughout structs and classes without respect to attribute visibility. The D notion is that CoW will somehow be adhered to by the callee ~ it will be a "good" function, and clone the array before touching it. Yet this is not enforced by the compiler, to any degree. Thus the caller ends up doing the work, just to be sure. This, I'm sure you'll agree, is a bit daft. It's also a significant performance problem for server-code, or anywhere where immutability is a high priority. Anyone who regularly uses multiple threads will attest that enforced immutability is a welcoming lifeboat within a cold sea of unrest and uncertainty.

Ah, right. Interesting that there's no convenient hardware support for such. Well, can't have everything, do we. :-) Should we take this up, like after the holiday season? (Not that I'm expecting a sudden panacea invented, but who knows, maybe we could make some small steps.)
 *******************************************
 
 I seriously suggest, or actually ask all here:
 
 Looking at the riot we had before _any_ understanding of utf or 
 unicode things percolated, we just HAVE TO decide that BEFORE D 2.0
 we _will_not_ touch any of the above issues!!

Talk about making things compatible with certain libraries has to take some general requirements into consideration. That requires research.

Agreed. Right now, we've come a long way since what folks understood a month ago. We even have a kind of consensus that the current "utf" state of affairs is, ehh, not perhaps very good. So, I feel, right now would be a bad spot to drop everything and go out with the dragons and lizards, looking for Wisdom. Instead, I feel it is absolutely vital that we fix the few issues we're at -- and get that over and done with, before we poison the minds of the next thousand D newcomers. Compared to that, I must confess, compatibility with ICU-like things is not a great priority. Next summer, or sometime, but not right now. I'd love if you'd agree with me on this?
Nov 25 2005
parent reply kris <fu bar.org> writes:
Georg Wrede wrote:
 kris wrote:
 
 Georg Wrede wrote:

 kris wrote:

 Regan Heath wrote:

 Designing it with respect to performance and immutability are
 also not so tough (though D badly needs read-only arrays).

(OT) never thought about that! Please elaborate.

On Read-Only arrays? Sure. One can easily design a class such that it cannot be mutated when passed from one function to another. However, when it comes to arrays, access to content by the callee is wide open to abuse. That is, if funcA wants to give funcB read-only access to a large quantity of data, one should clone the thing /just in case/ funcB mutates it. This then pervades throughout structs and classes without respect to attribute visibility. The D notion is that CoW will somehow be adhered to by the callee ~ it will be a "good" function, and clone the array before touching it. Yet this is not enforced by the compiler, to any degree. Thus the caller ends up doing the work, just to be sure. This, I'm sure you'll agree, is a bit daft. It's also a significant performance problem for server-code, or anywhere where immutability is a high priority. Anyone who regularly uses multiple threads will attest that enforced immutability is a welcoming lifeboat within a cold sea of unrest and uncertainty.

Ah, right. Interesting that there's no convenient hardware support for such. Well, can't have everything, do we. :-)

Hardware support is not needed for such things. Instead the language needs a means to decorate a return-type as being read only (or something akin), and enforce subsequent usage as an rValue only. At compile-time. Support is already there for arrays-as-arguments (the 'in' modifier), though I wonder if that is robust enough? I mean, it's the caller who's concerned about the immutability; not the callee (whose sig could easily change). Yes, there's another case whereby it's the callee who's concerned about the caller changing the content on the fly. But that one is purely the responsibility of the caller, and can thus be managed. The fundamental issue is ensuring an unknown callee can be trusted with the family jewels.
Nov 25 2005
parent Georg Wrede <georg.wrede nospam.org> writes:
kris wrote:
 Georg Wrede wrote:
 kris wrote:
 Georg Wrede wrote:
 kris wrote:
 
 Designing it with respect to performance and immutability are
 also not so tough (though D badly needs read-only arrays).

(OT) never thought about that! Please elaborate.

On Read-Only arrays? Sure. One can easily design a class such that it cannot be mutated when passed from one function to another. However, when it comes to arrays, access to content by the callee is wide open to abuse. That is, if funcA wants to give funcB read-only access to a large quantity of data, one should clone the thing /just in case/ funcB mutates it. This then pervades throughout structs and classes without respect to attribute visibility. The D notion is that CoW will somehow be adhered to by the callee ~ it will be a "good" function, and clone the array before touching it. Yet this is not enforced by the compiler, to any degree. Thus the caller ends up doing the work, just to be sure. This, I'm sure you'll agree, is a bit daft. It's also a significant performance problem for server-code, or anywhere where immutability is a high priority. Anyone who regularly uses multiple threads will attest that enforced immutability is a welcoming lifeboat within a cold sea of unrest and uncertainty.

Ah, right. Interesting that there's no convenient hardware support for such. Well, can't have everything, do we. :-)

Hardware support is not needed for such things.

True.
 Instead the language needs a means to decorate a return-type as being
 read only (or something akin), and enforce subsequent usage as an
 rValue only.

Since we use references in D instead of pointers, it might not be too hard to do. The reference might have an attribute for read-only. Of course there could (and would) be more than one reference to an array in the application, some of which are "read-only", but that is no problem. I'd imagine this would be quite easy to implement into D. And add the same to references to structs, while at it.
Nov 26 2005
prev sibling parent reply Georg Wrede <georg.wrede nospam.org> writes:
kris wrote:

...

I asked you about where you work, etc. and never answered your post.

I consider that bad manners from myself, especially when such might even 
be considered a bit personal to ask on a public ng. Sorry!

I saw the post, and decided to look at the two links to your own 
projects, before I'd comment. (As you've probably seen) I've spent quite 
some time writing about this utf thing, so now I've lost your post. 
Shame on me!

---

But I do remember that you worked at PARC!

I think that's about as cool as working next door to Linus Torvalds or 
(for the other half of mankind) next door to Bill Gates.

That's one place where I'll do a Tourist Pilgrimage one day!
Nov 25 2005
parent kris <fu bar.org> writes:
Georg Wrede wrote:
 I asked you about where you work, etc. and never answered your post.
 
 I consider that bad manners from myself, especially when such might even 
 be considered a bit personal to ask on a public ng. Sorry!
 
 I saw the post, and decided to look at the two links to your own 
 projects, before I'd comment. (As you've probably seen) I've spent quite 
 some time writing about this utf thing, so now I've lost your post. 
 Shame on me!

Yes ~ terribly poor form :-p No problem. Appreciate the thought.
Nov 25 2005
prev sibling parent reply Georg Wrede <georg.wrede nospam.org> writes:
Regan Heath wrote:
 
 This doesn't prove anything but it suggests that using a dchar sized
  variable for characters will have little or no real effect on 
 performance.. maybe, a conclusive test should really be made.

Well, the neat thing here is that since i/o is inherently very slow, at that particular point one can afford to do just about anything -- for free, so to say! I/o goes to/from the display, to a file, to the net, to the printer. They're all so slow that I'd say one can do the transformations ten times over(!), and nobody could see the difference. Probably, Walter's so familiar with this idea that he hardly noticed. So he instinctively was liberal with clock cycles in the right place. (No use optimising to death where it doesn't count. Pun intended.) What I left out from the i/o list above was pipes. But on a Windows machine I guess that's slow anyhow. So we're left with Unix command line chaining, which I guess is about the only place where one would see a difference. (And even there the data ultimately comes from the disk (or the others) and goes somewhere. At least in real life.) --- This actually gives an idea: to compare the efficiency of the different UTF widths in some specific job, it might be a good idea to first have the input data collected in memory, then time whatever operations one wants to test, and then "stop the clock" before either the output or the discard of the resultant data. (Oh yes, and large datasets should absolutely be tested on a quiet machine, or they will get swapped out inbetween. So a single-user mode unix might be pretty close to what's needed.) Similarly, when one talks about the real-life efficiency of utf-this or utf-that, it is _imperative_ to include the i/o (as from+to disk or whatever) in the comparisons.
Nov 24 2005
parent reply kris <fu bar.org> writes:
Georg Wrede wrote:
 Regan Heath wrote:
 
 This doesn't prove anything but it suggests that using a dchar sized
  variable for characters will have little or no real effect on 
 performance.. maybe, a conclusive test should really be made.

Well, the neat thing here is that since i/o is inherently very slow, at that particular point one can afford to do just about anything -- for free, so to say!

Forgive me, Georg; but that sounds like codswallop. You're making an assumption there's just one task taking place, which may be partly true for your machine at home, but it ain't true for real-time systems or servers of any variety. Asynchronous I/O exists for a reason ~ so that one can do as much as possible /whilst/ waiting. Alternatively, one uses multiple threads to keep the CPU occupied whilst others are I/O-bound. For example, I seriously doubt this NG server sits down and twiddles its thumbs whilst waiting for socket transfers <g> Even in this day and age there's little excuse for slothfulness (though it appears less egregious at the high level). Besides; the wprintf thing is a total red-herring, since the goal there is convenience; it's pretty obvious performance was not a priority.
Nov 25 2005
parent reply Georg Wrede <georg.wrede nospam.org> writes:
kris wrote:
 Georg Wrede wrote:
 Regan Heath wrote:
 
 This doesn't prove anything but it suggests that using a dchar
 sized variable for characters will have little or no real effect
 on performance.. maybe, a conclusive test should really be made.

Well, the neat thing here is that since i/o is inherently very slow, at that particular point one can afford to do just about anything -- for free, so to say!

Forgive me, Georg; but that sounds like codswallop.

No panic.
 You're making an assumption there's just one task taking place, which
 may be partly true for your machine at home, but it ain't true for
 real-time systems or servers of any variety.

 Asynchronous I/O exists for a reason ~ so that one can do as much as
 possible /whilst/ waiting. Alternatively, one uses multiple threads
 to keep the CPU occupied whilst others are I/O-bound. For example, I
 seriously doubt this NG server sits down and twiddles its thumbs
 whilst waiting for socket transfers <g>

I'm thinking of the average speed of i/o. Every once in a while a swoosh of data comes into the in-buffer, then it takes some time before we get the next swoosh, even if we'd "use" the data in zero time. If the time before the next buffer fill is used to decode-and-read, then we get the decode "for free". (See below, before answering. :-) )
 Even in this day and age there's little excuse for slothfulness
 (though it appears less egregious at the high level). Besides; the
 wprintf thing is a total red-herring, since the goal there is
 convenience; it's pretty obvious performance was not a priority.

Slothfullness... I'll tell Bill you're picking on me! Of course, on a multiuser system it is part of table manners not to waste clock cycles, there's no disputing that. So you are right. Even more (as I think you also mean), there's no place anywhere, where a code can be slothful without using up from the total of clock cycles available, so other processes of course get less. Be it doing "for free" i/o conversion, or whatever else. But I'm trying to maintain a balance here. Right now we are pressured for time (or actually Walter is -- I can't imagine that his wife hasn't left him already, especially considering how quickly .140 came out with all those things), and we should get this utf thing out of the way, so other things can be tackled. I'd say we can right now implement stuff less-than well -- _as_long_ as the setup is drawn right. In other words, so that later we (without changing the API) can rewrite and optimize the individual routines. Kind of "Why start rocking the boat, when we've just got off the underwater rock, with such effort, too." (...whatever "underwater rock" is in proper English...) PS, I've already suggested doing such tests. ;-) So, fixing a print function that does superfluous intermediate conversions, should, IMHO, not be on our agenda at all before spring. DMD 1.0 or not.
Nov 25 2005
parent reply kris <fu bar.org> writes:
Georg Wrede wrote:

 
 Slothfullness... I'll tell Bill you're picking on me!

Sorry. That wasn't intended to be a personal attribution <g>
 But I'm trying to maintain a balance here. Right now we are pressured 
 for time (or actually Walter is -- I can't imagine that his wife hasn't 
 left him already, especially considering how quickly .140 came out with 
 all those things), and we should get this utf thing out of the way, so 
 other things can be tackled.

Which UTF thing? There's so many threads going on it's hard to keep track. There's the one that says "default all argument string-literals to char[]". That would increase consistency on a number of fronts, so that would be great. Bring it on! There's the one that says "add some array properties as a convenience for transcoding", such as adding .utf8 .utf16 and .utf32 properties as appropriate. That would be nice! There's a call for a "unified" string, which is a String class by any other name. Yet there's precious little evidence of a well considered class at this time. I hope you're not referring to the latter?
 I'd say we can right now implement stuff less-than well -- _as_long_ as 
 the setup is drawn right. In other words, so that later we (without 
 changing the API) can rewrite and optimize the individual routines.

I'm really missing something here. You're talking about an API for what? It must be a String class, yes? Getting it "right" so that it doesn't change, is not something that can be done on a whim. I know you know that, so what's the huge rush all of a sudden? Don't you think it would be better to build something and let it mature with use for a period of time? Why not pick up one of the String classes that's been around for a year or more? There's at least three of them that old. As you might guess, this is not a new topic at all :-)
 So, fixing a print function that does superfluous intermediate 
 conversions, should, IMHO, not be on our agenda at all before spring. 
 DMD 1.0 or not.

I believe you're badly misconstruing something here, Georg. Who ever said anything about fixing writef? I certainly countered an argument that was effectively stating "if writef can do it like that, then that's probably good enough for everything else". Merely pointing out that printf exists for convenience, not performance, should hardly be interpreted in this manner :-D Can you please tell me about this Spring date? Is something important happening then?
Nov 25 2005
next sibling parent reply Derek Parnell <derek psych.ward> writes:
On Fri, 25 Nov 2005 10:32:47 -0800, kris wrote:


[snip]
 
 There's the one that says "add some array properties as a convenience 
 for transcoding", such as adding .utf8 .utf16 and .utf32 properties as 
 appropriate. That would be nice!

For what it's worth, here's a small convenience module... ================================== module transcode; private import std.utf; void transcode( char[] a, inout char[] b ) { b = a; } void transcode( char[] a, inout wchar[] b ) { b = std.utf.toUTF16(a); } void transcode( char[] a, inout dchar[] b ) { b = std.utf.toUTF32(a); } void transcode( wchar[] a, inout char[] b ) { b = std.utf.toUTF8 (a); } void transcode( wchar[] a, inout wchar[] b ) { b = a; } void transcode( wchar[] a, inout dchar[] b ) { b = std.utf.toUTF32(a); } void transcode( dchar[] a, inout char[] b ) { b = std.utf.toUTF8 (a); } void transcode( dchar[] a, inout wchar[] b ) { b = std.utf.toUTF16(a); } void transcode( dchar[] a, inout dchar[] b ) { b = a; } unittest { char[] s8; wchar[] s16; dchar[] s32; char[] t8; wchar[] t16; dchar[] t32; s8 = "some text"; transcode(s8, s16); transcode(s16, s32); transcode(s32, t16); transcode(t16, t8); assert(s8 == t8); transcode(t8,t32); assert(t32 == s32); transcode(s32,t8); assert(t8 == s8); assert(s8 != cast(char[])s16); } ================================= -- Derek Parnell Melbourne, Australia 26/11/2005 9:41:52 AM
Nov 25 2005
parent reply "Kris" <fu bar.com> writes:
"Derek Parnell" <derek psych.ward> wrote in message 
news:1ypwa2hwmja.q9pdllu3i85s.dlg 40tude.net...
 On Fri, 25 Nov 2005 10:32:47 -0800, kris wrote:


 [snip]

 There's the one that says "add some array properties as a convenience
 for transcoding", such as adding .utf8 .utf16 and .utf32 properties as
 appropriate. That would be nice!

For what it's worth, here's a small convenience module... ================================== module transcode; private import std.utf; void transcode( char[] a, inout char[] b ) { b = a; } void transcode( char[] a, inout wchar[] b ) { b = std.utf.toUTF16(a); } void transcode( char[] a, inout dchar[] b ) { b = std.utf.toUTF32(a); } void transcode( wchar[] a, inout char[] b ) { b = std.utf.toUTF8 (a); } void transcode( wchar[] a, inout wchar[] b ) { b = a; } void transcode( wchar[] a, inout dchar[] b ) { b = std.utf.toUTF32(a); } void transcode( dchar[] a, inout char[] b ) { b = std.utf.toUTF8 (a); } void transcode( dchar[] a, inout wchar[] b ) { b = std.utf.toUTF16(a); } void transcode( dchar[] a, inout dchar[] b ) { b = a; } unittest { char[] s8; wchar[] s16; dchar[] s32; char[] t8; wchar[] t16; dchar[] t32; s8 = "some text"; transcode(s8, s16); transcode(s16, s32); transcode(s32, t16); transcode(t16, t8); assert(s8 == t8); transcode(t8,t32); assert(t32 == s32); transcode(s32,t8); assert(t8 == s8); assert(s8 != cast(char[])s16); } =================================

or this somewhat dubious variation :) s8 = "some text"; s8.transcode(s16); Now, let's assume for a moment that the user intends to somehow modify the return content. This again brings up the issue about CoW ~ a user might consider these as always being /copies/ of the original content, since they've been transcoded. Right? After being transcoded into a freshly allocated chunk of the heap, as a user I wouldn't expect to .dup the result. Yet, this is not a valid assumption ~ your example returns the original content directly in 3 cases, which a user might happily modify, thinking s/he's working with a private copy. To get around this, the user must explicitly follow CoW and always .dup the result. Even when it's presumably redundant to do so. Alternatively, as the utility designer, you must always .dup the non-transcoded return value. Just in case. This seems utterly wrong. And it's such a fundamental thing too. Perhaps I don't get it?
Nov 25 2005
parent reply Derek Parnell <derek psych.ward> writes:
On Fri, 25 Nov 2005 15:09:44 -0800, Kris wrote:

 "Derek Parnell" <derek psych.ward> wrote in message 
 news:1ypwa2hwmja.q9pdllu3i85s.dlg 40tude.net...
 On Fri, 25 Nov 2005 10:32:47 -0800, kris wrote:


 [snip]

 There's the one that says "add some array properties as a convenience
 for transcoding", such as adding .utf8 .utf16 and .utf32 properties as
 appropriate. That would be nice!

For what it's worth, here's a small convenience module... ================================== module transcode; private import std.utf; void transcode( char[] a, inout char[] b ) { b = a; } void transcode( char[] a, inout wchar[] b ) { b = std.utf.toUTF16(a); } void transcode( char[] a, inout dchar[] b ) { b = std.utf.toUTF32(a); } void transcode( wchar[] a, inout char[] b ) { b = std.utf.toUTF8 (a); } void transcode( wchar[] a, inout wchar[] b ) { b = a; } void transcode( wchar[] a, inout dchar[] b ) { b = std.utf.toUTF32(a); } void transcode( dchar[] a, inout char[] b ) { b = std.utf.toUTF8 (a); } void transcode( dchar[] a, inout wchar[] b ) { b = std.utf.toUTF16(a); } void transcode( dchar[] a, inout dchar[] b ) { b = a; } unittest { char[] s8; wchar[] s16; dchar[] s32; char[] t8; wchar[] t16; dchar[] t32; s8 = "some text"; transcode(s8, s16); transcode(s16, s32); transcode(s32, t16); transcode(t16, t8); assert(s8 == t8); transcode(t8,t32); assert(t32 == s32); transcode(s32,t8); assert(t8 == s8); assert(s8 != cast(char[])s16); } =================================

or this somewhat dubious variation :) s8 = "some text"; s8.transcode(s16): Now, let's assume for a moment that the user intends to somehow modify the return content. This again brings up the issue about CoW ~ a user might consider these as always being /copies/ of the original content, since they've been transcoded. Right? After being transcoding into a freshly allocated chunk of the heap, as a user I wouldn't expect to .dup the result. Yet, this is not a valid assumption ~ your example return the original content directly in 3 cases,

Yeah, I realized this just before I went out to do the shopping. The 'fix' is easy though. void transcode( char[] a, inout char[] b ) { b = a.dup; } void transcode( wchar[] a, inout wchar[] b ) { b = a.dup; } void transcode( dchar[] a, inout dchar[] b ) { b = a.dup; }
 which a user might happily modify, thinking 
 s/he's working with a private copy. To get around this, the user must 
 explicitly follow CoW and always .dup the result. Even when it's presumeably 
 redundant to do so. Alternatively, as the utility designer, you must always 
 .dup the non-transcoded return value. Just in case.
 
 This seems utterly wrong. And it's such a fundamental thing too. Perhaps I 
 don't get it?

It's called a mistake, Kris. And yes, even I make them on rare occasions ;-) -- Derek Parnell Melbourne, Australia 26/11/2005 6:39:42 PM
Nov 25 2005
parent reply "Kris" <fu bar.com> writes:
"Derek Parnell" <derek psych.ward> wrote in message 
news:k04hb75t8pzh$.q9xcre0h0fgw.dlg 40tude.net...
 On Fri, 25 Nov 2005 15:09:44 -0800, Kris wrote:

 "Derek Parnell" <derek psych.ward> wrote in message
 news:1ypwa2hwmja.q9pdllu3i85s.dlg 40tude.net...
 On Fri, 25 Nov 2005 10:32:47 -0800, kris wrote:


 [snip]

 There's the one that says "add some array properties as a convenience
 for transcoding", such as adding .utf8 .utf16 and .utf32 properties as
 appropriate. That would be nice!

For what it's worth, here's a small convenience module... ================================== module transcode; private import std.utf; void transcode( char[] a, inout char[] b ) { b = a; } void transcode( char[] a, inout wchar[] b ) { b = std.utf.toUTF16(a); } void transcode( char[] a, inout dchar[] b ) { b = std.utf.toUTF32(a); } void transcode( wchar[] a, inout char[] b ) { b = std.utf.toUTF8 (a); } void transcode( wchar[] a, inout wchar[] b ) { b = a; } void transcode( wchar[] a, inout dchar[] b ) { b = std.utf.toUTF32(a); } void transcode( dchar[] a, inout char[] b ) { b = std.utf.toUTF8 (a); } void transcode( dchar[] a, inout wchar[] b ) { b = std.utf.toUTF16(a); } void transcode( dchar[] a, inout dchar[] b ) { b = a; } unittest { char[] s8; wchar[] s16; dchar[] s32; char[] t8; wchar[] t16; dchar[] t32; s8 = "some text"; transcode(s8, s16); transcode(s16, s32); transcode(s32, t16); transcode(t16, t8); assert(s8 == t8); transcode(t8,t32); assert(t32 == s32); transcode(s32,t8); assert(t8 == s8); assert(s8 != cast(char[])s16); } =================================

or this somewhat dubious variation :) s8 = "some text"; s8.transcode(s16): Now, let's assume for a moment that the user intends to somehow modify the return content. This again brings up the issue about CoW ~ a user might consider these as always being /copies/ of the original content, since they've been transcoded. Right? After being transcoding into a freshly allocated chunk of the heap, as a user I wouldn't expect to .dup the result. Yet, this is not a valid assumption ~ your example return the original content directly in 3 cases,

Yeah, I realized this just before I went out to do the shopping. The 'fix' is easy though. void transcode( char[] a, inout char[] b ) { b = a.dup; } void transcode( wchar[] a, inout wchar[] b ) { b = a.dup; } void transcode( dchar[] a, inout dchar[] b ) { b = a.dup; }
 which a user might happily modify, thinking
 s/he's working with a private copy. To get around this, the user must
 explicitly follow CoW and always .dup the result. Even when it's 
 presumeably
 redundant to do so. Alternatively, as the utility designer, you must 
 always
 .dup the non-transcoded return value. Just in case.

 This seems utterly wrong. And it's such a fundamental thing too. Perhaps 
 I
 don't get it?

It's called a mistake, Kris. And yes, even I make them on rare occasions ;-)

:-) I didn't mean your code was utterly wrong, Derek. I meant the philosophy ain't straight. One should not have to make copies "just in case". It's terribly wasteful ...
Nov 26 2005
parent Derek Parnell <derek psych.ward> writes:
On Sat, 26 Nov 2005 12:18:08 -0800, Kris wrote:


 I didn't mean your code was utterly wrong, Derek. I meant the philosophy 
 ain't straight. One should not have to make copies "just in case". It's 
 terribly wasteful ...

My code was mistaken because its behaviour was not consistent, but you're right too. -- Derek Parnell Melbourne, Australia 27/11/2005 8:03:39 AM
Nov 26 2005
prev sibling parent Georg Wrede <georg.wrede nospam.org> writes:
kris wrote:
 Georg Wrede wrote:
 
 Slothfullness... I'll tell Bill you're picking on me!

Sorry. That wasn't intended to be a personal attribution <g>

Actually I was picking on Bill and Windows. He'd be the last person there's any use discussing slothfulness with. :-)
 But I'm trying to maintain a balance here. Right now we are
 pressured for time (or actually Walter is -- I can't imagine that
 his wife hasn't left him already, especially considering how
 quickly .140 came out with all those things), and we should get
 this utf thing out of the way, so other things can be tackled.

Which UTF thing? There's so many threads going on its hard to keep track.

True. :-) And everybody has their own idea, so I'm referring mostly to what I've written. And judging from the verbosity and diversity of this "utf discussion", there are still a lot of misconceptions, ideas based on them, and plain beliefs around.
 I'd say we can right now implement stuff less-than well --
 _as_long_ as the setup is drawn right. In other words, so that
 later we (without changing the API) can rewrite and optimize the
 individual routines. 

I'm really missing something here. You're talking about an API for what? It must be a String class, yes? Getting it "right" so that it doesn't change, is not something that can be done on a whim. I know you know that, so what's the huge rush all of a sudden? Don't you think it would be better to build something and let it mature with use for a period of time?

The "rush": We almost had a feature freeze already. The metaprogramming thing kind of broke it. Then the "utf"-or-whatchamacallit popped up. If you read my posts from the last month or so, you'll see what effort it took to get us even to this level. Now it's time to put that to use, and make the few changes to D I've suggested. And then let this "utf" thing rest for a good while. (Let things "sink in", so to speak.) At the current rate, it'll take some 6 months before enough folks are so clear on these issues that a meaningful try to go further is feasible, without 95% of the ng. writing time going to re-re-re-explaining the obvious to each participant separately. Or to apples and oranges. (This was not personal! I meant in general between posters.)
 Why not pick up one of the String classes that's been around for a
 year or more? There's at least three of them that old. As you might
 guess, this is not a new topic at all :-)

:-) And the likelihood of String classes getting into D may not have changed. (Which is why I'm not pushing it.) UTF32 would not need any String class, it can simply be used as an array. The other two (16 and 8) are problematic, so it would be natural to have them as classes. (And for symmetry, obviously a 32 then too.) But then again, I can understand Walter's reluctance. D is a C-family language, and it would be kind of neat to have the language itself nice and tight. With USASCII there was no problem with that. And that made it possible to have strings just be arrays, which is kinda cool. Having string classes in libraries, or even in Phobos, would be natural now. Somehow I understand Walter's reluctance, though. The compilers (DMC, DMD) are (almost?) totally non-OO, and Walter writes the DMD front-end in D, i.e. he uses D in another way than the future average D programmers will (and should). This doesn't necessarily mean I agree, of course.
 So, fixing a print function that does superfluous intermediate 
 conversions, should, IMHO, not be on our agenda at all before
 spring. DMD 1.0 or not.

I believe you're badly misconstruing something here, Georg. Who ever said anything about fixing writef? I certainly countered an argument that was effectively stating "if writef can do it like that, then that's probably good enough for everything else". Merely pointing out that printf exists for convenience, not performance, should hardly be interpreted in this manner :-D

I totally agree with you! My mistake, I read you as promoting an aggressively time-optimizing rewrite of all such things. ;-(
 Can you please tell me about this Spring date? Is something important
  happening then?

There's no date. Just the everlasting assumption that 1.0 is 3 to 6 months away from now. :-D :-D
Nov 26 2005
prev sibling parent reply =?ISO-8859-15?Q?Jari-Matti_M=E4kel=E4?= <jmjmak invalid_utu.fi> writes:
Kris wrote:
 It seems clear that any unified string notion would be better off as a 
 library suite; not built into the compiler. It's difficult enough to evolve 
 the code within Phobos, let alone something hard-coded into the compiler.

D has associative arrays as a hard-coded feature too.
 b) a String class to support Unicode is hardly a trivial undertaking. You 
 really have to consider very hard what the goals are before putting 
 something in stone (as in getting it added to Phobos). I say that from 
 experience with the ICU project ~ there's code in there to handle the kinds 
 of things that would frighten many people. Unicode ain't trivial and, 
 frankly, I think AJ would have a hard time coming up with a "suitable" set 
 of compromises. The latter is important: there will be many compromises one 
 way or another.

I don't see it as a major compromise if one wants to have an abstract UTF-representation independent string type. If we could create a basic string type that does all its major operations in O(1) or O(n) time, these 'advanced' operations would be fast enough (even if they're not, you can always handle the string as a raw stream of bytes). Even the current implementation is a compromise. The language doesn't want to take care of any Unicode operations, all the 'hard' work (including char[] symbol-based indexing) is left for the programmer.
Nov 25 2005
parent Georg Wrede <georg.wrede nospam.org> writes:
Jari-Matti Mäkelä wrote:
 If we could create a basic string type that does all it's major
 operations in O(1) or O(n) time, these 'advanced' operations would be
 fast enough (even if they're not, you can always handle the string as
 a raw stream of bytes)

????
 Even the current implementation is a compromise. The language doesn't
  want to take care of any Unicode operations, all the 'hard' work 
 (including char[] symbol-based indexing) is left for the programmer.

That change is just around the corner.
Nov 25 2005