digitalmars.D - UTF-8 to dchar conversion
- Arcane Jill <Arcane_member pathlink.com> Jul 28 2004
- Arcane Jill <Arcane_member pathlink.com> Jul 28 2004
- Arcane Jill <Arcane_member pathlink.com> Jul 28 2004
- Arcane Jill <Arcane_member pathlink.com> Jul 28 2004
- parabolis <parabolis softhome.net> Jul 28 2004
- Arcane Jill <Arcane_member pathlink.com> Jul 28 2004
- Sean Kelly <sean f4.ca> Jul 28 2004
- "Walter" <newshound digitalmars.com> Jul 28 2004
- Arcane Jill <Arcane_member pathlink.com> Jul 28 2004
- "Walter" <newshound digitalmars.com> Jul 29 2004
- Arcane Jill <Arcane_member pathlink.com> Jul 29 2004
- Arcane Jill <Arcane_member pathlink.com> Jul 29 2004
- "Walter" <newshound digitalmars.com> Jul 29 2004
For Sean...
I noticed your std.utf update on the bugs forum. Using delegates is obviously
sensible, but I noticed the routine looked a tad on the slow side. Here's a
faster algorithm - it doesn't use delegates, but I'm sure you could do some
mixing and matching to get the best of both. Here's my fast converter:
# const ubyte[256] LENGTH = // indexed by the first byte of a sequence: total sequence length in bytes; decode() treats 0 as an invalid lead byte
# [
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x00-0x7F (this row and the next seven): length 1
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0xBF (four rows): length 0
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0-0xDF (two rows): length 2; NOTE: the repost elsewhere in this thread changes the first entry of each of these rows to 0
# 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
# 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0-0xEF: length 3
# 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0, // 0xF0-0xF7: length 4; 0xF8-0xFF: length 0
# ];
#
# // Indexed by the first byte of a sequence: the payload bits that byte
# // contributes to the decoded value (the whole byte for 0x00-0x7F, the low
# // 5/4/3 bits for two-/three-/four-byte lead bytes, 0 elsewhere).
# const ubyte[256] START_CALC =
# [
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0x00-0x7F: the byte itself
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
# 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
# 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, // trailing comma was missing here
# 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
# 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
# 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
# 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
# 0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
# 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
# 0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
# 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
# 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
# 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
# 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x80-0xBF: unused (not lead bytes)
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xC0-0xDF: low five bits (entry for 0xC1 was 0x00)
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
# 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xE0-0xEF: low four bits
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xF0-0xF7: low three bits
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xF8-0xFF: unused
# ];
#
# dchar decode(inout char[] s) // Decodes the UTF-8 sequence at the front of s and returns its code point; s is inout so the caller's slice is meant to be advanced past that sequence.
# { // NOTE: posted untested; other posts in this thread correct several of the lines below.
# if (s.length > 0)
# {
# uint firstChar = s[0];
# uint len = LENGTH[firstChar]; // sequence length for this lead byte; 0 marks a byte that may not start a sequence
# if (len != 0 && s.length >= len) // also rejects input that ends before the sequence is complete
# {
# if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) && // meant to reject overlong three-byte forms (0xE0 followed by 0x80-0x9F); NOTE: an opening '(' is missing after "if"
# (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80)) // meant to reject overlong four-byte forms (0xF0 followed by 0x80-0x8F)
# {
# uint c = START_CALC[firstChar]; // start from the payload bits carried by the lead byte
# for (uint i=1; i<len; ++i)
# {
# c <<= 6; // make room for the next byte's bits
# c |= s[i]; // NOTE: corrected elsewhere in this thread to "c |= s[i] & 0x3F;"
# }
# if (isValidDchar(s)) // NOTE: other posts in this thread point out this should be isValidDchar(c)
# {
# p = p[len..p.length]; // NOTE: corrected elsewhere in this thread to "s = s[len..s.length];"
# return c;
# }
# }
# }
# }
# throw new UtfError("invalid UTF-8 sequence"); // reached for empty input and for every sequence rejected above
# }
(and no nasty gotos either!)
Jill
Jul 28 2004
In article <ce91ga$jnj$1 digitaldaemon.com>, Arcane Jill says... Ah, bugger! # c |= s[i]; should read: # c |= s[i] & 0x3F; That'll teach me to post code without testing it first! Jill
Jul 28 2004
In article <ce91t7$jrt$1 digitaldaemon.com>, Arcane Jill says... And # p = p[len..p.length]; should read # s = s[len..s.length]; (Aren't you glad I'm not writing real code myself just now. Just think how many bugs it would end up with! Still - the /principle/ is sound.)
Jul 28 2004
Aaargh!
Found even more bugs. Fixed them. Let's just start again. HERE's the fast UTF-8
routine... (If there are any more bugs after this, someone else can find them).
# // Indexed by the first byte of a UTF-8 sequence: the total sequence length
# // in bytes, or 0 for a byte that can never begin a well-formed sequence.
# const ubyte[256] LENGTH =
# [
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x00-0x7F: ASCII, one byte each
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0xBF: continuation bytes, never a lead byte
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0 and 0xC1 can only begin overlong encodings of U+0000-U+007F
# 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xD0-0xDF are all valid (0xD0 begins U+0400-U+043F)
# 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0-0xEF: three bytes
# 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xF0-0xF4: four bytes; 0xF5-0xFF would encode values above U+10FFFF
# ];
#
# // Indexed by the first byte of a sequence: the value bits that byte supplies
# // before any following bytes are shifted in. One row per high nibble.
# const ubyte[256] START_CALC =
# [
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, // 0x00-0x0F
# 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, // 0x10-0x1F
# 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, // 0x20-0x2F
# 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, // 0x30-0x3F
# 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F, // 0x40-0x4F
# 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F, // 0x50-0x5F
# 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, // 0x60-0x6F
# 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, // 0x70-0x7F
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x80-0x8F
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x90-0x9F
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xA0-0xAF
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xB0-0xBF
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, // 0xC0-0xCF
# 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, // 0xD0-0xDF
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, // 0xE0-0xEF
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xF0-0xFF
# ];
#
# // Decodes one UTF-8 sequence from the front of s, advances s past it and
# // returns the decoded code point.
# // Throws UtfError when s is empty, the first byte has a zero LENGTH entry,
# // the sequence is truncated, a following byte is not of the form 10xxxxxx,
# // the sequence is an overlong three- or four-byte form, or isValidDchar
# // rejects the result. s is left untouched whenever UtfError is thrown.
# dchar convert(inout char[] s)
# {
#     if (s.length > 0)
#     {
#         uint firstChar = s[0];
#         uint len = LENGTH[firstChar];
#         if (len != 0 && s.length >= len)
#         {
#             // Overlong forms: 0xE0 followed by 0x80-0x9F, and 0xF0 followed by
#             // 0x80-0x8F. s[1] is only read when firstChar is 0xE0 or 0xF0,
#             // in which case len >= 3 and the length check above covers it.
#             if ((firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                 (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#             {
#                 uint c = START_CALC[firstChar];
#                 uint i;
#                 for (i = 1; i < len; ++i)
#                 {
#                     // Every byte after the first must be 10xxxxxx.
#                     if ((s[i] & 0xC0) != 0x80) break;
#                     c <<= 6;
#                     c |= s[i] & 0x3F;
#                 }
#                 // i == len only if no continuation byte was rejected.
#                 if (i == len && isValidDchar(c))
#                 {
#                     s = s[len..s.length];
#                     return c;
#                 }
#             }
#         }
#     }
#     throw new UtfError("invalid UTF-8 sequence");
# }
Jul 28 2004
This function does not verify any non-first byte in a UTF-8
sequence actually starts with 10xxxxxx... So it accepts
0xC1,0xBF (correct)
and
0xC1,0xFF (incorrect)
You also probably wanted
isValidDchar(c)
instead of
isValidDchar(s)
and
s = s[len..s.length];
instead of
p = p[len..p.length];
(I also noticed you used uint exclusively... :P)
Out of curiosity why did you define the LENGTH and the
START_CALC arrays?
Arcane Jill wrote:
For Sean...
I noticed your std.utf update on the bugs forum. Using delegates is obviously
sensible, but I noticed the routine looked a tad on the slow side. Here's a
faster algorithm - it doesn't use delegates, but I'm sure you could do some
mixing and matching to get the best of both. Here's my fast converter:
# const ubyte[256] LENGTH = // indexed by the first byte of a sequence: total sequence length in bytes; decode() treats 0 as an invalid lead byte
# [
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x00-0x7F (this row and the next seven): length 1
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0xBF (four rows): length 0
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0-0xDF (two rows): length 2; NOTE: the repost elsewhere in this thread changes the first entry of each of these rows to 0
# 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
# 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0-0xEF: length 3
# 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0, // 0xF0-0xF7: length 4; 0xF8-0xFF: length 0
# ];
#
# // Indexed by the first byte of a sequence: the payload bits that byte
# // contributes to the decoded value (the whole byte for 0x00-0x7F, the low
# // 5/4/3 bits for two-/three-/four-byte lead bytes, 0 elsewhere).
# const ubyte[256] START_CALC =
# [
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0x00-0x7F: the byte itself
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
# 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
# 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, // trailing comma was missing here
# 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
# 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
# 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
# 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
# 0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
# 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
# 0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
# 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
# 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
# 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
# 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x80-0xBF: unused (not lead bytes)
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xC0-0xDF: low five bits (entry for 0xC1 was 0x00)
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
# 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xE0-0xEF: low four bits
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xF0-0xF7: low three bits
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xF8-0xFF: unused
# ];
#
# dchar decode(inout char[] s) // Decodes the UTF-8 sequence at the front of s and returns its code point; s is inout so the caller's slice is meant to be advanced past that sequence.
# { // NOTE: posted untested; other posts in this thread correct several of the lines below.
# if (s.length > 0)
# {
# uint firstChar = s[0];
# uint len = LENGTH[firstChar]; // sequence length for this lead byte; 0 marks a byte that may not start a sequence
# if (len != 0 && s.length >= len) // also rejects input that ends before the sequence is complete
# {
# if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) && // meant to reject overlong three-byte forms (0xE0 followed by 0x80-0x9F); NOTE: an opening '(' is missing after "if"
# (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80)) // meant to reject overlong four-byte forms (0xF0 followed by 0x80-0x8F)
# {
# uint c = START_CALC[firstChar]; // start from the payload bits carried by the lead byte
# for (uint i=1; i<len; ++i)
# {
# c <<= 6; // make room for the next byte's bits
# c |= s[i]; // NOTE: corrected elsewhere in this thread to "c |= s[i] & 0x3F;"
# }
# if (isValidDchar(s)) // NOTE: other posts in this thread point out this should be isValidDchar(c)
# {
# p = p[len..p.length]; // NOTE: corrected elsewhere in this thread to "s = s[len..s.length];"
# return c;
# }
# }
# }
# }
# throw new UtfError("invalid UTF-8 sequence"); // reached for empty input and for every sequence rejected above
# }
(and no nasty gotos either!)
Jill
Jul 28 2004
In article <ce9483$kq0$1 digitaldaemon.com>, parabolis says...
> This function does not verify any non-first byte in a UTF-8 sequence actually starts with 10xxxxxx... So it accepts 0xC1,0xBF (correct) and 0xC1,0xFF (incorrect)
Well spotted. Okay, so replace
# c |= s[i] & 0x3F;
# // etc
with
# if ((s[i] & 0xC0) == 0x80)
# {
# c |= s[i] & 0x3F;
# // etc
# }
Thanks very much for pointing that out. I appreciate it.
> You also probably wanted
Yeah, there were some typos in the original post. I fixed them in the repost.
> Out of curiosity why did you define the LENGTH and the START_CALC arrays?
Because they're the fast lookup tables.
Jill
Jul 28 2004
The routines themselves were left unaltered from the original UTF functions. I'll play with your suggestions and see if I can get it all working though. If the code can be made faster then that's fine with me :) Sean
Jul 28 2004
One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data.
Jul 28 2004
In article <ce98eo$n71$1 digitaldaemon.com>, Walter says...One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data.
Good point. Here's a new version then, which tests for ASCII first. (It also makes the lookup tables half the size!)
# // Indexed by (first byte - 0x80): total sequence length in bytes, or 0 for
# // a byte that can never begin a well-formed sequence.
# const ubyte[128] LENGTH =
# [
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0xBF: continuation bytes, never a lead byte
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0 and 0xC1 can only begin overlong encodings of U+0000-U+007F
# 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xD0-0xDF are all valid (0xD0 begins U+0400-U+043F)
# 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0-0xEF: three bytes
# 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xF0-0xF4: four bytes; 0xF5-0xFF would encode values above U+10FFFF
# ];
#
# // Indexed by (first byte - 0x80): the value bits supplied by the lead byte.
# const ubyte[128] START_CALC =
# [
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x80-0xBF: unused
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xC0-0xDF: low five bits
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
# 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xE0-0xEF: low four bits
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xF0-0xF7: low three bits
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xF8-0xFF: unused
# ];
#
# // Decodes one UTF-8 sequence from the front of s, advances s past it and
# // returns the decoded code point. ASCII is handled before any table lookup.
# // Throws UtfError when s is empty, the lead byte is invalid, the sequence
# // is truncated, a following byte is not of the form 10xxxxxx, the sequence
# // is overlong, or isValidDchar rejects the result.
# dchar convert(inout char[] s)
# {
#     if (s.length > 0)
#     {
#         uint firstChar = s[0];
#         if (firstChar < 0x80) // ASCII
#         {
#             s = s[1..s.length];
#             return firstChar;
#         }
#         // The tables cover 0x80-0xFF only. Use a separate index so that
#         // firstChar keeps the real byte value for the 0xE0/0xF0 tests below.
#         uint index = firstChar - 0x80;
#         uint len = LENGTH[index];
#         if (len != 0 && s.length >= len)
#         {
#             // Overlong forms: 0xE0 followed by 0x80-0x9F, and 0xF0 followed
#             // by 0x80-0x8F.
#             if ((firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                 (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#             {
#                 uint c = START_CALC[index];
#                 uint i;
#                 for (i = 1; i < len; ++i)
#                 {
#                     if ((s[i] & 0xC0) != 0x80) break;
#                     c <<= 6;
#                     c |= s[i] & 0x3F;
#                 }
#                 if (i == len && isValidDchar(c))
#                 {
#                     s = s[len..s.length];
#                     return c;
#                 }
#             }
#         }
#     }
#     throw new UtfError("invalid UTF-8 sequence");
# }
Jill
Jul 28 2004
Does your version also reject UTF-8 sequences that produce the correct value, but are not the shortest possible sequence?
"Arcane Jill" <Arcane_member pathlink.com> wrote in message news:cea792$14f4$1 digitaldaemon.com...
In article <ce98eo$n71$1 digitaldaemon.com>, Walter says...
One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data.
Good point. Here's a new version then, which tests for ASCII first. (It also makes the lookup tables half the size!)
# // Indexed by (first byte - 0x80): total sequence length in bytes, or 0 for
# // a byte that can never begin a well-formed sequence.
# const ubyte[128] LENGTH =
# [
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0xBF: continuation bytes, never a lead byte
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0 and 0xC1 can only begin overlong encodings of U+0000-U+007F
# 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xD0-0xDF are all valid (0xD0 begins U+0400-U+043F)
# 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0-0xEF: three bytes
# 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xF0-0xF4: four bytes; 0xF5-0xFF would encode values above U+10FFFF
# ];
#
# // Indexed by (first byte - 0x80): the value bits supplied by the lead byte.
# const ubyte[128] START_CALC =
# [
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0x80-0xBF: unused
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xC0-0xDF: low five bits
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
# 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xE0-0xEF: low four bits
# 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
# 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, // 0xF0-0xF7: low three bits
# 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 0xF8-0xFF: unused
# ];
#
# // Decodes one UTF-8 sequence from the front of s, advances s past it and
# // returns the decoded code point. ASCII is handled before any table lookup.
# // Throws UtfError when s is empty, the lead byte is invalid, the sequence
# // is truncated, a following byte is not of the form 10xxxxxx, the sequence
# // is overlong, or isValidDchar rejects the result.
# dchar convert(inout char[] s)
# {
#     if (s.length > 0)
#     {
#         uint firstChar = s[0];
#         if (firstChar < 0x80) // ASCII
#         {
#             s = s[1..s.length];
#             return firstChar;
#         }
#         // The tables cover 0x80-0xFF only. Use a separate index so that
#         // firstChar keeps the real byte value for the 0xE0/0xF0 tests below.
#         uint index = firstChar - 0x80;
#         uint len = LENGTH[index];
#         if (len != 0 && s.length >= len)
#         {
#             // Overlong forms: 0xE0 followed by 0x80-0x9F, and 0xF0 followed
#             // by 0x80-0x8F.
#             if ((firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                 (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#             {
#                 uint c = START_CALC[index];
#                 uint i;
#                 for (i = 1; i < len; ++i)
#                 {
#                     if ((s[i] & 0xC0) != 0x80) break;
#                     c <<= 6;
#                     c |= s[i] & 0x3F;
#                 }
#                 if (i == len && isValidDchar(c))
#                 {
#                     s = s[len..s.length];
#                     return c;
#                 }
#             }
#         }
#     }
#     throw new UtfError("invalid UTF-8 sequence");
# }
Jill
Jul 29 2004
In article <cebj7l$1mro$1 digitaldaemon.com>, Walter says...Does your version also reject UTF-8 sequences that produce the correct value, but are not the shortest possible sequence?
Theoretically, yes. Two-byte sequences starting with 0xC0 and 0xD0 are caught by the relevant zero entries in the LENGTH table (at offsets 0x40 and 0x41); Overlong three and four byte sequences are ruled out by the test: # if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) && # (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80)) and overlong five or more byte sequences (indeed, /all/ five or more byte sequences) are ruled out, again, by zeroes in the LENGTH table (at offset 0x78 to 0x7F). I have to confess, though, I have not tested this. I wrote it and posted it without testing it, which is bad form, I know, but it's the first D I've written since the funeral and I'm just getting back into practice. I figured you wouldn't want to use it as-is anyway, because you'll want all that delegate stuff with get() and put() instead of just assuming everyone wants a string. That said, I can't /see/ any bugs in it, and it's quite short so there are not many places for them to hide. (So, if you use this, or a variant of it, keep the unit tests in). If you want UTF conversion to /really/ zip along, you could consider dropping to assembler. Just a thought. Jill
Jul 29 2004
In article <cebljb$1nu9$1 digitaldaemon.com>, Arcane Jill says... Textual typo correction:(at offsets 0x40 and 0x41);
should read(at offsets 0x40 and 0x50);
Jul 29 2004
"Arcane Jill" <Arcane_member pathlink.com> wrote in message news:cebljb$1nu9$1 digitaldaemon.com...I have to confess, though, I have not tested this.
It would be nice to have a comprehensive set of test data for these things. Are there any on the UTF sites you look at?
Jul 29 2004









Arcane Jill <Arcane_member pathlink.com> 