X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mcs%2Fclass%2Fcorlib%2FMono.Globalization.Unicode%2Fcreate-mscompat-collation-table.cs;h=02bee38d373c50a0221d9d970d25cbfcb98d3590;hb=bd9f9ee7cb81823608edc76ef9d0b6416783fe71;hp=2e7a3ec5ac6b181c54fa826b2ba4bf3a7dd82bb1;hpb=0ffe4b26fc18fea18825d6d617cc47a5465465ad;p=mono.git diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs index 2e7a3ec5ac6..02bee38d373 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs @@ -1,4 +1,31 @@ // +// create-mscompat-collation-table.cs : generates Windows-like sortkey tables. +// +// Author: +// Atsushi Enomoto +// +// Copyright (C) 2005 Novell, Inc (http://www.novell.com) +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + // // There are two kind of sort keys : which are computed and which are laid out // as an indexed array. Computed sort keys are: @@ -6,31 +33,20 @@ // - Surrogate // - PrivateUse // -// Also, for composite characters it should prepare different index table. -// // Though it is possible to "compute" level 3 weights, they are still dumped // to an array to avoid execution cost. // - -// -// * sortkey getter signature -// -// int GetSortKey (string s, int index, SortKeyBuffer buf) -// Stores sort key for corresponding character element into buf and -// returns the length of the consumed _source_ character element in s. -// -// * character length to consume -// -// If there are characters whose primary weight is 0, they are consumed -// and considered as a part of the character element. 
-// +#define Binary using System; using System.IO; using System.Collections; using System.Globalization; +using System.Text; using System.Xml; +using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil; + namespace Mono.Globalization.Unicode { internal class MSCompatSortKeyTableGenerator @@ -59,7 +75,8 @@ namespace Mono.Globalization.Unicode const int DecompositionCompat = 0x11; const int DecompositionCanonical = 0x12; - TextWriter Result = Console.Out; + TextWriter CSResult = Console.Out; + TextWriter CResult = TextWriter.Null; byte [] fillIndex = new byte [256]; // by category CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1]; @@ -94,19 +111,32 @@ namespace Mono.Globalization.Unicode byte [] diacritical = new byte [char.MaxValue + 1]; string [] diacritics = new string [] { - // LATIN - "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;", - "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;", - " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;", - " OGONEK;", " CEDILLA;", + // LATIN, CYRILLIC etc. 
+ "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK", + "ABKHASIAN", + "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS", + "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;", + "WITH ACUTE;", "WITH GRAVE;", + // + "WITH DOT ABOVE;", " MIDDLE DOT;", + "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;", + "WITH DIALYTIKA;", + "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;", + "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", + "ABKHASIAN CHE WITH DESCENDER", + "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;", + "WITH OGONEK;", "WITH CEDILLA;", + // " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;", - " STROKE;", " CIRCUMFLEX AND ACUTE;", + "WITH STROKE;", " CIRCUMFLEX AND ACUTE;", + "STROKE OVERLAY", " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;", " DIAERESIS AND GRAVE;", " BREVE AND ACUTE;", " CARON AND DOT ABOVE;", " BREVE AND GRAVE;", " MACRON AND ACUTE;", " MACRON AND GRAVE;", + // " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE", " RING ABOVE AND ACUTE", " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS", @@ -116,45 +146,63 @@ namespace Mono.Globalization.Unicode " BREVE AND TILDE", " CEDILLA AND BREVE", " OGONEK AND MACRON", - " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;", - " DOUBLE GRAVE;", + // 0x40 + "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE", + "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;", + " DOUBLE GRAVE", " INVERTED BREVE", + "ROMAN NUMERAL", " PRECEDED BY APOSTROPHE", - " HORN;", + "WITH HORN;", " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE", " PALATAL HOOK", " DOT BELOW;", - " RETROFLEX;", "DIAERESIS BELOW", - " RING BELOW", + " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK", + " RING BELOW", "LOW VERTICAL LINE", + // " CIRCUMFLEX BELOW", "HORN AND ACUTE", " BREVE BELOW;", " HORN AND GRAVE", + " LOW MACRON", " TILDE BELOW", + " TOPBAR", " DOT BELOW AND DOT ABOVE", " RIGHT HALF RING", " HORN AND TILDE", " CIRCUMFLEX AND DOT BELOW", " BREVE AND DOT BELOW", " DOT BELOW AND MACRON", + " TONE TWO", " HORN AND HOOK 
ABOVE", " HORN AND DOT", // CIRCLED, PARENTHESIZED and so on - "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA", + "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", + "CIRCLED KATAKANA", "CIRCLED SANS-SERIF", "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN", }; byte [] diacriticWeights = new byte [] { // LATIN. - 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x19, 0x1A, 0x1B, 0x1C, - 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, + 3, 3, 3, 5, 5, 5, 5, + 0xE, 0xF, + 0xE, 0xF, + // + 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, + 0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C, + // + 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x20, 0x21, 0x22, 0x22, 0x23, 0x24, + // 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30, - 0x43, 0x43, 0x43, 0x44, 0x46, 0x48, - 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A, - 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, + // + 0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48, + 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59, + 0x5A, 0x5A, + // + 0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68, 0x69, 0x69, 0x6A, 0x6D, 0x6E, - 0x95, 0xAA, + 0x87, 0x95, 0xAA, // CIRCLED, PARENTHESIZED and so on. - 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3 + 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, + 0xF3, 0xF3, 0xF3 }; int [] numberSecondaryWeightBounds = new int [] { @@ -165,7 +213,6 @@ namespace Mono.Globalization.Unicode 0xE50, 0xE60, 0xED0, 0xEE0 }; - char [] orderedCyrillic; char [] orderedGurmukhi; char [] orderedGujarati; char [] orderedGeorgian; @@ -175,18 +222,21 @@ namespace Mono.Globalization.Unicode // based on traditional Tamil consonants, except for // Grantha (where Microsoft breaks traditionalism). 
// http://www.angelfire.com/empire/thamizh/padanGaL - '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3', - '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF', - '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3', - '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7', - '\u0BB9'}; + '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', + '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', + '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', + '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', + '\u0BB7', '\u0BB9'}; // cp -> character name (only for some characters) ArrayList sortableCharNames = new ArrayList (); - // cp -> arrow value 0:UPWARDS 1:RIGHTWARDS 2:DOWNWARDS 3:LEFTWARDS + // cp -> arrow value (int) ArrayList arrowValues = new ArrayList (); + // cp -> box value (int) + ArrayList boxValues = new ArrayList (); + // cp -> level1 value Hashtable arabicLetterPrimaryValues = new Hashtable (); @@ -202,127 +252,285 @@ namespace Mono.Globalization.Unicode ArrayList jisJapanese = new ArrayList (); ArrayList nonJisJapanese = new ArrayList (); - ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00]; - ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100]; - ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00]; - ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00]; - byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00]; + ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00]; + ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100]; + ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00]; + ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00]; + byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00]; byte [] ignorableFlags = new byte [char.MaxValue + 1]; - double [] unicodeAge = new double [char.MaxValue + 1]; + static double [] unicodeAge = new double [char.MaxValue + 1]; + + ArrayList tailorings = new ArrayList (); void Run (string [] args) { string dirname = args.Length == 0 ? 
"downloaded" : args [0]; - FillIgnorables (); - ParseSources (dirname); Console.Error.WriteLine ("parse done."); ModifyParsedValues (); GenerateCore (); Console.Error.WriteLine ("generation done."); + CResult = new StreamWriter ("collation-tables.h", false); Serialize (); + CResult.Close (); Console.Error.WriteLine ("serialization done."); +/* +StreamWriter sw = new StreamWriter ("agelog.txt"); +for (int i = 0; i < char.MaxValue; i++) { +bool shouldBe = false; +switch (Char.GetUnicodeCategory ((char) i)) { +case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned: + shouldBe = true; break; +} +if (unicodeAge [i] >= 3.1) + shouldBe = true; +//if (IsIgnorable (i) != shouldBe) +sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' '); +} +sw.Close (); +*/ + } + + byte [] CompressArray (byte [] source, CodePointIndexer i) + { + return (byte []) CodePointIndexer.CompressArray ( + source, typeof (byte), i); + } + + ushort [] CompressArray (ushort [] source, CodePointIndexer i) + { + return (ushort []) CodePointIndexer.CompressArray ( + source, typeof (ushort), i); + } + + void WriteByte (byte value) + { + } void Serialize () { + // Tailorings + SerializeTailorings (); + + byte [] categories = new byte [map.Length]; + byte [] level1 = new byte [map.Length]; + byte [] level2 = new byte [map.Length]; + byte [] level3 = new byte [map.Length]; +// widthCompat is now removed from the mapping table. +// If it turned out that it is still required, grep this source and uncomment +// widthCompat related lines. FIXME: remove those lines in the future. 
+// ushort [] widthCompat = new ushort [map.Length]; + for (int i = 0; i < map.Length; i++) { + categories [i] = map [i].Category; + level1 [i] = map [i].Level1; + level2 [i] = map [i].Level2; + level3 [i] = ComputeLevel3Weight ((char) i); +/* + // For Japanese Half-width characters, don't + // map widthCompat. It is IgnoreKanaType that + // handles those width differences. + if (0xFF6D <= i && i <= 0xFF9D) + continue; + switch (decompType [i]) { + case DecompositionNarrow: + case DecompositionWide: + case DecompositionSuper: + case DecompositionSub: + // they are always 1 char + widthCompat [i] = (ushort) decompValues [decompIndex [i]]; + break; + } +*/ + } + + // compress + ignorableFlags = CompressArray (ignorableFlags, + UUtil.Ignorable); + categories = CompressArray (categories, UUtil.Category); + level1 = CompressArray (level1, UUtil.Level1); + level2 = CompressArray (level2, UUtil.Level2); + level3 = CompressArray (level3, UUtil.Level3); +// widthCompat = (ushort []) CodePointIndexer.CompressArray ( +// widthCompat, typeof (ushort), UUtil.WidthCompat); + cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS); + cjkCHT = CompressArray (cjkCHT,UUtil.Cjk); + cjkJA = CompressArray (cjkJA, UUtil.Cjk); + cjkKO = CompressArray (cjkKO, UUtil.Cjk); + cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk); + // Ignorables - Result.WriteLine ("static byte [] ignorableFlags = new byte [] {"); - for (int i = 0; i <= char.MaxValue; i++) { + CResult.WriteLine ("static const guint8 collation_table_ignorableFlags [] = {"); + CSResult.WriteLine ("static readonly byte [] ignorableFlagsArr = new byte [] {"); +#if Binary + MemoryStream ms = new MemoryStream (); + BinaryWriter binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); + binary.Write (ignorableFlags.Length); +#endif + for (int i = 0; i < ignorableFlags.Length; i++) { byte value = ignorableFlags [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", 
value); - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Ignorable.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Primary category - Result.WriteLine ("static byte [] categories = new byte [] {"); - for (int i = 0; i < map.Length; i++) { - byte value = map [i].Category; + CResult.WriteLine ("static const guint8 collation_table_category [] = {"); + CSResult.WriteLine ("static readonly byte [] categoriesArr = new byte [] {"); +#if Binary + binary.Write (categories.Length); +#endif + for (int i = 0; i < categories.Length; i++) { + byte value = categories [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Category.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Primary weight value - Result.WriteLine ("static byte [] level1 = new byte [] {"); - for (int i = 0; i < map.Length; i++) { - byte value = map [i].Level1; + CResult.WriteLine ("static const guint8 collation_table_level1 [] = {"); + CSResult.WriteLine ("static readonly byte [] level1Arr = new byte [] {"); +#if Binary + binary.Write (level1.Length); +#endif + for (int i = 0; i < level1.Length; i++) { + byte value = level1 [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", 
value); else - Result.Write ("0x{0:X02},", value); - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level1.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Secondary weight - Result.WriteLine ("static byte [] level2 = new byte [] {"); - for (int i = 0; i < map.Length; i++) { - int value = map [i].Level2; + CResult.WriteLine ("static const guint8 collation_table_level2 [] = {"); + CSResult.WriteLine ("static readonly byte [] level2Arr = new byte [] {"); +#if Binary + binary.Write (level2.Length); +#endif + for (int i = 0; i < level2.Length; i++) { + byte value = level2 [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level2.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Thirtiary weight - Result.WriteLine ("static byte [] level3 = new byte [] {"); - for (int i = 0; i < map.Length; i++) { - byte value = ComputeLevel3Weight ((char) i); + CResult.WriteLine ("static const guint8 collation_table_level3 [] = {"); + CSResult.WriteLine ("static readonly byte [] level3Arr = new byte [] {"); +#if Binary + binary.Write (level3.Length); +#endif + for (int i = 0; i < level3.Length; i++) { + byte value = level3 [i]; if (value < 10) - Result.Write ("{0},", value); + 
CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level3.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +/* // Width insensitivity mappings // (for now it is more lightweight than dumping the // entire NFKD table). - Result.WriteLine ("static int [] widthCompat = new int [] {"); - for (int i = 0; i < char.MaxValue; i++) { - int value = 0; - switch (decompType [i]) { - case DecompositionNarrow: - case DecompositionWide: - case DecompositionSuper: - case DecompositionSub: - // they are always 1 char - value = decompValues [decompIndex [i]]; - break; - } + CResult.WriteLine ("static const guint16* widthCompat [] = {"); + CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {"); +#if Binary + binary.Write (widthCompat.Length); +#endif + for (int i = 0; i < widthCompat.Length; i++) { + ushort value = widthCompat [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X04},", value); - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.WidthCompat.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } + } + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +*/ + +#if Binary + using (FileStream fs = File.Create ("../resources/collation.core.bin")) { + byte [] array = ms.ToArray (); + fs.Write (array, 0, array.Length); } - Result.WriteLine ("};"); - 
Result.WriteLine (); +#endif // CJK SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue); @@ -332,42 +540,195 @@ namespace Mono.Globalization.Unicode SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0); } - void SerializeCJK (string name, ushort [] cjk, int max) + void SerializeCJK (string name, ushort [] cjk, int max_unused) { - int offset = char.MaxValue - cjk.Length; - Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name); +// CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length); + CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length); + + int len = cjk.Length; + CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name); + CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name); + // the actual length is *2 + for (int i = 0; i < 4; i++, len /= 256) { + CResult.Write ("{0},", len & 0xFF); + CSResult.Write ("0x{0:X04},", len & 0xFF); + } + CResult.WriteLine (); + CSResult.WriteLine (); +#if Binary + MemoryStream ms = new MemoryStream (); + BinaryWriter binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); + binary.Write (cjk.Length); // the actual size is *2. 
+#endif + // category for (int i = 0; i < cjk.Length; i++) { - if (i + offset == max) - break; - ushort value = cjk [i]; +// if (i == max) +// break; + byte value = (byte) (cjk [i] >> 8); + if (value < 10) + CSResult.Write ("{0},", value); + else + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } + } + + // level 1 + for (int i = 0; i < cjk.Length; i++) { +// if (i == max) +// break; + byte value = (byte) (cjk [i] & 0xFF); if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X04},", value); - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF + offset); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } + } + + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +#if Binary + using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) { + byte [] array = ms.ToArray (); + fs.Write (array, 0, array.Length); } - Result.WriteLine ("};"); - Result.WriteLine (); +#endif } void SerializeCJK (string name, byte [] cjk, int max) { - int offset = char.MaxValue - cjk.Length; - Result.WriteLine ("static byte [] {0} = new byte [] {{", name); + CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name); + CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name); +#if Binary + MemoryStream ms = new MemoryStream (); + BinaryWriter binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); +#endif for (int i = 0; i < cjk.Length; i++) { - if (i + offset == max) + if (i == max) break; byte value = cjk [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write 
("{0},", value); else - Result.Write ("0x{0:X02},", value); - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF + offset); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } + } + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +#if Binary + using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) { + byte [] array = ms.ToArray (); + fs.Write (array, 0, array.Length); } - Result.WriteLine ("};"); - Result.WriteLine (); +#endif + } + + void SerializeTailorings () + { + Hashtable indexes = new Hashtable (); + Hashtable counts = new Hashtable (); + CResult.WriteLine ("static const guint16 collation_table_tailoring [] = {"); + CSResult.WriteLine ("static char [] tailoringArr = new char [] {"); + int count = 0; +#if Binary + MemoryStream ms = new MemoryStream (); + BinaryWriter binary = new BinaryWriter (ms); + // Here we don't need to output resource version. + // This is cached. 
+#endif + foreach (Tailoring t in tailorings) { + if (t.Alias != 0) + continue; + CResult.Write ("/*{0}*/", t.LCID); + CSResult.Write ("/*{0}*/", t.LCID); + indexes.Add (t.LCID, count); + char [] values = t.ItemToCharArray (); + counts.Add (t.LCID, values.Length); + foreach (char c in values) { + CSResult.Write ("'\\x{0:X}', ", (int) c); + CResult.Write ("{0},", (int) c); + if (++count % 16 == 0) { + CSResult.WriteLine (" // {0:X04}", count - 16); + CResult.WriteLine (); + } +#if Binary + binary.Write ((ushort) c); +#endif + } + } + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + + CResult.WriteLine ("static const guint32 collation_table_tailoring_infos [] = {"); + CResult.WriteLine ("{0}, /*count*/", tailorings.Count); + CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {"); +#if Binary + byte [] rawdata = ms.ToArray (); + ms = new MemoryStream (); + binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); + binary.Write (tailorings.Count); +#endif + foreach (Tailoring t in tailorings) { + int target = t.Alias != 0 ? t.Alias : t.LCID; + if (!indexes.ContainsKey (target)) { + throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias)); + continue; + } + int idx = (int) indexes [target]; + int cnt = (int) counts [target]; + bool french = t.FrenchSort; + if (t.Alias != 0) + foreach (Tailoring t2 in tailorings) + if (t2.LCID == t.LCID) + french = t2.FrenchSort; + CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false"); + CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 
1 : 0); +#if Binary + binary.Write (t.LCID); + binary.Write (idx); + binary.Write (cnt); + binary.Write (french); +#endif + } + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); +#if Binary + binary.Write ((byte) 0xFF); + binary.Write ((byte) 0xFF); + binary.Write (rawdata.Length / 2); + binary.Write (rawdata, 0, rawdata.Length); + + + using (FileStream fs = File.Create ("../resources/collation.tailoring.bin")) { + byte [] array = ms.ToArray (); + fs.Write (array, 0, array.Length); + } +#endif } #region Parse @@ -389,11 +750,111 @@ namespace Mono.Globalization.Unicode string koXML = dirname + "/common/collation/ko.xml"; ParseDerivedAge (derivedAge); + + FillIgnorables (); + ParseJISOrder (cp932); // in prior to ParseUnidata() ParseUnidata (unidata); + ModifyUnidata (); ParseDerivedCoreProperties (derivedCoreProps); ParseScripts (scripts); ParseCJK (chXML, jaXML, koXML); + + ParseTailorings ("mono-tailoring-source.txt"); + } + + void ParseTailorings (string filename) + { + Tailoring t = null; + int line = 0; + using (StreamReader sr = new StreamReader (filename)) { + try { + while (sr.Peek () >= 0) { + line++; + ProcessTailoringLine (ref t, + sr.ReadLine ().Trim ()); + } + } catch (Exception) { + Console.Error.WriteLine ("ERROR at line {0}", line); + throw; + } + } + } + + // For now this is enough. 
+ string ParseTailoringSourceValue (string s) + { + StringBuilder sb = new StringBuilder (); + for (int i = 0; i < s.Length; i++) { + if (i + 5 < s.Length && + s [i] == '\\' && s [i + 1] == 'u') { + sb.Append ( + (char) int.Parse ( + s.Substring (i + 2, 4), + NumberStyles.HexNumber), + 1); + i += 5; + } + else + sb.Append (s [i]); + } + return sb.ToString (); + } + + void ProcessTailoringLine (ref Tailoring t, string s) + { + int idx = s.IndexOf ('#'); + if (idx > 0) + s = s.Substring (0, idx).Trim (); + if (s.Length == 0 || s [0] == '#') + return; + if (s [0] == '@') { + idx = s.IndexOf ('='); + if (idx > 0) + t = new Tailoring ( + int.Parse (s.Substring (1, idx - 1)), + int.Parse (s.Substring (idx + 1))); + else + t = new Tailoring (int.Parse (s.Substring (1))); + tailorings.Add (t); + return; + } + if (s.StartsWith ("*FrenchSort")) { + t.FrenchSort = true; + return; + } + string d = "*Diacritical"; + if (s.StartsWith (d)) { + idx = s.IndexOf ("->"); + t.AddDiacriticalMap ( + byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (), + NumberStyles.HexNumber), + byte.Parse (s.Substring (idx + 2).Trim (), + NumberStyles.HexNumber)); + return; + } + idx = s.IndexOf (':'); + if (idx > 0) { + string source = s.Substring (0, idx).Trim (); + string [] l = s.Substring (idx + 1).Trim ().Split (' '); + byte [] b = new byte [4]; + for (int i = 0; i < 4; i++) { + if (l [i] == "*") + b [i] = 0; + else + b [i] = byte.Parse (l [i], + NumberStyles.HexNumber); + } + t.AddSortKeyMap (ParseTailoringSourceValue (source), + b); + } + idx = s.IndexOf ('='); + if (idx > 0) + t.AddReplacementMap ( + ParseTailoringSourceValue ( + s.Substring (0, idx).Trim ()), + ParseTailoringSourceValue ( + s.Substring (idx + 1).Trim ())); } void ParseDerivedAge (string filename) @@ -421,10 +882,12 @@ namespace Mono.Globalization.Unicode if (cp > char.MaxValue) continue; + double v = double.Parse (value); for (int i = cp; i <= cpEnd; i++) - unicodeAge [i] = double.Parse (value); + unicodeAge [i] = v; 
} } + unicodeAge [0] = double.MaxValue; // never be supported } void ParseUnidata (string filename) @@ -444,7 +907,10 @@ namespace Mono.Globalization.Unicode this.decompValues = (int []) decompValues.ToArray (typeof (int)); } - + + char previousLatinTarget = char.MinValue; + byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; + void ProcessUnidataLine (string s, ArrayList decompValues) { int idx = s.IndexOf ('#'); @@ -462,37 +928,112 @@ namespace Mono.Globalization.Unicode if (IsIgnorable (cp)) return; + string name = values [0]; + + // SPECIAL CASE: rename some characters for diacritical + // remapping. FIXME: why are they different? + // FIXME: it's still not working. + if (cp == 0x018B || cp == 0x018C) + name = name.Replace ("TOPBAR", "STROKE"); + // isSmallCapital if (s.IndexOf ("SMALL CAPITAL") > 0) isSmallCapital [cp] = true; // latin mapping by character name - if (s.IndexOf ("LATIN") > 0) { + if (s.IndexOf ("LATIN") >= 0) { int lidx = s.IndexOf ("LETTER DOTLESS "); int offset = lidx + 15; if (lidx < 0) { lidx = s.IndexOf ("LETTER TURNED "); offset = lidx + 14; } + if (lidx < 0) { + lidx = s.IndexOf ("LETTER CAPITAL "); + offset = lidx + 15; + } + if (lidx < 0) { + lidx = s.IndexOf ("LETTER SCRIPT "); + offset = lidx + 14; + } if (lidx < 0) { lidx = s.IndexOf ("LETTER "); offset = lidx + 7; } char c = lidx > 0 ? s [offset] : char.MinValue; + char n = s [offset + 1]; + char target = char.MinValue; if ('A' <= c && c <= 'Z' && - (s.Length == offset + 1 || s [offset + 1] == ' ')) { - ArrayList entry = (ArrayList) latinMap [c]; + (n == ' ') || n == ';') { + target = c; + // FIXME: After 'Z', I cannot reset this state. + previousLatinTarget = c == 'Z' ? 
char.MinValue : c; + } + + if (s.Substring (offset).StartsWith ("ALPHA")) + target = 'A'; + else if (s.Substring (offset).StartsWith ("TONE SIX")) + target = 'B'; + else if (s.Substring (offset).StartsWith ("OPEN O")) + target = 'C'; + else if (s.Substring (offset).StartsWith ("ETH")) + target = 'D'; + else if (s.Substring (offset).StartsWith ("SCHWA")) + target = 'E'; + else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3 + target = 'O'; + else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3 + target = 'R'; + else if (s.Substring (offset).StartsWith ("TONE TWO")) + target = 'S'; + else if (s.Substring (offset).StartsWith ("ESH")) + target = 'S'; + else if (s.Substring (offset).StartsWith ("OUNCE")) + target = 'Z'; + + // For remaining IPA chars, direct mapping is + // much faster. + switch (cp) { + case 0x0166: case 0x0167: + // Though they are 'T', they have different weight + target = char.MinValue; break; + case 0x0299: target = 'B'; break; + case 0x029A: target = 'E'; break; + case 0x029B: target = 'G'; break; + case 0x029C: target = 'H'; break; + case 0x029D: target = 'J'; break; + case 0x029E: target = 'K'; break; + case 0x029F: target = 'L'; break; + case 0x02A0: target = 'Q'; break; + case 0x02A7: target = 'T'; break; + case 0x02A8: target = 'T'; break; + } + + if (target == char.MinValue) + target = previousLatinTarget; + + if (target != char.MinValue) { + ArrayList entry = (ArrayList) latinMap [target]; if (entry == null) { entry = new ArrayList (); - latinMap [c] = entry; + latinMap [target] = entry; } entry.Add (cp); + // FIXME: This secondary weight is hack. + // They are here because they must not + // be identical to the corresponding + // ASCII latins. + if (c != target && diacritical [cp] == 0) { + diacriticalOffset [c - 'A']++; + diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C); + } } } // Arrow names if (0x2000 <= cp && cp < 0x3000) { int value = 0; + // SPECIAL CASES. FIXME: why? 
switch (cp) { case 0x21C5: value = -1; break; // E2 case 0x261D: value = 1; break; @@ -516,12 +1057,23 @@ namespace Mono.Globalization.Unicode "SOUTH WEST", "LEFTWARDS", "NORTH WEST", + "LEFT RIGHT", + "UP DOWN", }; + if (s.IndexOf ("RIGHTWARDS") >= 0 && + s.IndexOf ("LEFTWARDS") >= 0) + value = 0xE1 - 0xD8; + else if (s.IndexOf ("UPWARDS") >= 0 && + s.IndexOf ("DOWNWARDS") >= 0) + value = 0xE2 - 0xD8; + else if (s.IndexOf ("ARROW") >= 0 && + s.IndexOf ("COMBINING") < 0 && + s.IndexOf ("CLOCKWISE") >= 0) + value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8; if (value == 0) for (int i = 1; value == 0 && i < arrowTargets.Length; i++) if (s.IndexOf (arrowTargets [i]) > 0 && s.IndexOf ("BARB " + arrowTargets [i]) < 0 && -// s.IndexOf (" WITH TIP " + arrowTargets [i]) < 0 && s.IndexOf (" OVER") < 0 ) value = i; @@ -530,25 +1082,176 @@ namespace Mono.Globalization.Unicode cp, value)); } + // Box names + if (0x2500 <= cp && cp < 0x2600) { + int value = int.MinValue; + // flags: + // up:1 down:2 right:4 left:8 vert:16 horiz:32 + // [h,rl] [r] [l] + // [v,ud] [u] [d] + // [dr] [dl] [ur] [ul] + // [vr,udr] [vl,vdl] + // [hd,rld] [hu,rlu] + // [hv,udrl,rlv,udh] + ArrayList flags = new ArrayList (new int [] { + 32, 8 + 4, 8, 4, + 16, 1 + 2, 1, 2, + 4 + 2, 8 + 2, 4 + 1, 8 + 1, + 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8, + 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1, + 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32 + }); + byte [] offsets = new byte [] { + 0, 0, 1, 2, + 3, 3, 4, 5, + 6, 7, 8, 9, + 10, 10, 11, 11, + 12, 12, 13, 13, + 14, 14, 14, 14}; + if (s.IndexOf ("BOX DRAWINGS ") >= 0) { + int flag = 0; + if (s.IndexOf (" UP") >= 0) + flag |= 1; + if (s.IndexOf (" DOWN") >= 0) + flag |= 2; + if (s.IndexOf (" RIGHT") >= 0) + flag |= 4; + if (s.IndexOf (" LEFT") >= 0) + flag |= 8; + if (s.IndexOf (" VERTICAL") >= 0) + flag |= 16; + if (s.IndexOf (" HORIZONTAL") >= 0) + flag |= 32; + + int fidx = flags.IndexOf (flag); + if (fidx >= 0) + value = offsets [fidx]; + } else if 
(s.IndexOf ("BLOCK") >= 0) { + if (s.IndexOf ("ONE EIGHTH") >= 0) + value = 0x12; + else if (s.IndexOf ("ONE QUARTER") >= 0) + value = 0x13; + else if (s.IndexOf ("THREE EIGHTHS") >= 0) + value = 0x14; + else if (s.IndexOf ("HALF") >= 0) + value = 0x15; + else if (s.IndexOf ("FIVE EIGHTHS") >= 0) + value = 0x16; + else if (s.IndexOf ("THREE QUARTERS") >= 0) + value = 0x17; + else if (s.IndexOf ("SEVEN EIGHTHS") >= 0) + value = 0x18; + else + value = 0x19; + } + else if (s.IndexOf ("SHADE") >= 0) + value = 0x19; + else if (s.IndexOf ("SQUARE") >= 0) + value = 0xBC - 0xE5; + else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0) + value = 0xBE - 0xE5; + else if (s.IndexOf ("RECTANGLE") >= 0) + value = 0xBD - 0xE5; + else if (s.IndexOf ("PARALLELOGRAM") >= 0) + value = 0xBF - 0xE5; + else if (s.IndexOf ("TRIANGLE") >= 0) { + if (s.IndexOf ("UP-POINTING") >= 0) + value = 0xC0 - 0xE5; + else if (s.IndexOf ("RIGHT-POINTING") >= 0) + value = 0xC1 - 0xE5; + else if (s.IndexOf ("DOWN-POINTING") >= 0) + value = 0xC2 - 0xE5; + else if (s.IndexOf ("LEFT-POINTING") >= 0) + value = 0xC3 - 0xE5; + } + else if (s.IndexOf ("POINTER") >= 0) { + if (s.IndexOf ("RIGHT-POINTING") >= 0) + value = 0xC4 - 0xE5; + else if (s.IndexOf ("LEFT-POINTING") >= 0) + value = 0xC5 - 0xE5; + } + else if (s.IndexOf ("DIAMOND") >= 0) + value = 0xC6 - 0xE5; + else if (s.IndexOf ("FISHEYE") >= 0) + value = 0xC7 - 0xE5; + else if (s.IndexOf ("LOZENGE") >= 0) + value = 0xC8 - 0xE5; + else if (s.IndexOf ("BULLSEYE") >= 0) + value = 0xC9 - 0xE5; + else if (s.IndexOf ("CIRCLE") >= 0) { + if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE") + value = 0xCA - 0xE5; + else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE") + value = 0xCB - 0xE5; + else + value = 0xC9 - 0xE5; + } + else if (s.IndexOf ("BULLET") >= 0) + value = 0xCC - 0xE5; + if (0x25DA <= cp && cp <= 0x25E5) + value = 0xCD + cp - 0x25DA - 0xE5; + + // SPECIAL CASE: BOX DRAWING DIAGONAL patterns + switch (cp) { + case 
0x2571: value = 0xF; break; + case 0x2572: value = 0x10; break; + case 0x2573: value = 0x11; break; + } + if (value != int.MinValue) + boxValues.Add (new DictionaryEntry ( + cp, value)); + } // For some characters store the name and sort later // to determine sorting. if (0x2100 <= cp && cp <= 0x213F && Char.IsSymbol ((char) cp)) sortableCharNames.Add ( - new DictionaryEntry (cp, values [0])); + new DictionaryEntry (cp, name)); else if (0x3380 <= cp && cp <= 0x33DD) sortableCharNames.Add (new DictionaryEntry ( - cp, values [0].Substring (7))); + cp, name.Substring (7))); + + if (Char.GetUnicodeCategory ((char) cp) == + UnicodeCategory.MathSymbol) { + if (name.StartsWith ("CIRCLED ")) + diacritical [cp] = 0xEE; + if (name.StartsWith ("SQUARED ")) + diacritical [cp] = 0xEF; + } // diacritical weights by character name - for (int d = 0; d < diacritics.Length; d++) - if (s.IndexOf (diacritics [d]) > 0) - diacritical [cp] |= diacriticWeights [d]; +if (diacritics.Length != diacriticWeights.Length) +throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length)); + for (int d = diacritics.Length - 1; d >= 0; d--) { + if (s.IndexOf (diacritics [d]) > 0) { + diacritical [cp] += diacriticWeights [d]; + if (s.IndexOf ("COMBINING") >= 0) + diacritical [cp] -= (byte) 2; + break; + } + // also process "COMBINING blah" here + // For now it is limited to cp < 0x0370 +// if (cp < 0x0300 || cp >= 0x0370) +// continue; + string tmp = diacritics [d].TrimEnd (';'); + if (tmp.IndexOf ("WITH ") == 0) + tmp = tmp.Substring (4); + tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp); + if (name == tmp) { + diacritical [cp] = (byte) (diacriticWeights [d] - 2); + break; + } +//if (name == tmp) +//Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp); + } // Two-step grep required for it. 
if (s.IndexOf ("FULL STOP") > 0 && (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0)) diacritical [cp] |= 0xF4; + if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0) + diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 : + s.IndexOf ("CAPITAL") > 0 ? 5 : 4); // Arabic letter name if (0x0621 <= cp && cp <= 0x064A && @@ -574,8 +1277,8 @@ namespace Mono.Globalization.Unicode (cp == 0x0640) ? // 0x0640 is special: it does // not start with ARABIC LETTER - values [0] : - values [0].Substring (14); + name : + name.Substring (14); int tmpIdx = letterName.IndexOf (' '); letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx); //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName); @@ -591,7 +1294,7 @@ namespace Mono.Globalization.Unicode // Japanese square letter if (0x3300 <= cp && cp <= 0x3357) if (!ExistsJIS (cp)) - nonJisJapanese.Add (new NonJISCharacter (cp, values [0])); + nonJisJapanese.Add (new NonJISCharacter (cp, name)); // normalizationType string decomp = values [4]; @@ -765,7 +1468,6 @@ namespace Mono.Globalization.Unicode void ParseScripts (string filename) { - ArrayList cyrillic = new ArrayList (); ArrayList gurmukhi = new ArrayList (); ArrayList gujarati = new ArrayList (); ArrayList georgian = new ArrayList (); @@ -795,11 +1497,6 @@ namespace Mono.Globalization.Unicode continue; switch (value) { - case "Cyrillic": - for (int x = cp; x <= cpEnd; x++) - if (!IsIgnorable (x)) - cyrillic.Add ((char) x); - break; case "Gurmukhi": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) @@ -823,12 +1520,10 @@ namespace Mono.Globalization.Unicode } } } - cyrillic.Sort (UCAComparer.Instance); gurmukhi.Sort (UCAComparer.Instance); gujarati.Sort (UCAComparer.Instance); georgian.Sort (UCAComparer.Instance); thaana.Sort (UCAComparer.Instance); - orderedCyrillic = (char []) cyrillic.ToArray (typeof (char)); orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char)); orderedGujarati = (char []) gujarati.ToArray 
(typeof (char)); orderedGeorgian = (char []) georgian.ToArray (typeof (char)); @@ -837,26 +1532,37 @@ namespace Mono.Globalization.Unicode void ParseJISOrder (string filename) { - using (StreamReader file = - new StreamReader (filename)) { - while (file.Peek () >= 0) { - string s = file.ReadLine (); - int idx = s.IndexOf ('#'); - if (idx >= 0) - s = s.Substring (0, idx).Trim (); - if (s.Length == 0) - continue; - idx = s.IndexOf (' '); - if (idx < 0) - continue; - // They start with "0x" so cut them out. - int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber); - int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber); - jisJapanese.Add (new JISCharacter (cp, jis)); + int line = 1; + try { + using (StreamReader file = + new StreamReader (filename)) { + for (;file.Peek () >= 0; line++) + ProcessJISOrderLine (file.ReadLine ()); } + } catch (Exception) { + Console.Error.WriteLine ("---- line {0}", line); + throw; } } + char [] ws = new char [] {'\t', ' '}; + + void ProcessJISOrderLine (string s) + { + int idx = s.IndexOf ('#'); + if (idx >= 0) + s = s.Substring (0, idx).Trim (); + if (s.Length == 0) + return; + idx = s.IndexOfAny (ws); + if (idx < 0) + return; + // They start with "0x" so cut them out. 
+ int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber); + int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber); + jisJapanese.Add (new JISCharacter (cp, jis)); + } + void ParseCJK (string zhXML, string jaXML, string koXML) { XmlDocument doc = new XmlDocument (); @@ -870,7 +1576,7 @@ namespace Mono.Globalization.Unicode // Chinese Simplified category = "chs"; arr = cjkCHS; - offset = char.MaxValue - arr.Length; + offset = 0;//char.MaxValue - arr.Length; doc.Load (zhXML); s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText; v = 0x8008; @@ -887,7 +1593,7 @@ namespace Mono.Globalization.Unicode // Chinese Traditional category = "cht"; arr = cjkCHT; - offset = char.MaxValue - arr.Length; + offset = 0;//char.MaxValue - arr.Length; s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText; v = 0x8002; foreach (char c in s) { @@ -903,17 +1609,56 @@ namespace Mono.Globalization.Unicode // Japanese category = "ja"; arr = cjkJA; - offset = char.MaxValue - arr.Length; - doc.Load (jaXML); - s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText; + offset = 0;//char.MaxValue - arr.Length; + + // SPECIAL CASES + arr [0x4EDD] = 0x8002; // Chinese repetition mark? 
+ arr [0x337B] = 0x8004; // Those 4 characters are Gengou + arr [0x337E] = 0x8005; + arr [0x337D] = 0x8006; + arr [0x337C] = 0x8007; + v = 0x8008; - foreach (char c in s) { + foreach (JISCharacter jc in jisJapanese) { + if (jc.JIS < 0x8800) + continue; + char c = (char) jc.CP; + if (c < '\u4E00') - Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); + // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); + continue; else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; + + // SPECIAL CASES: + if (c == '\u662D') // U+337C + continue; + if (c == '\u5927') // U+337D + continue; + if (c == '\u5E73') // U+337B + continue; + if (c == '\u660E') // U+337E + continue; + if (c == '\u9686') // U+F9DC + continue; + + // FIXME: there are still remaining + // characters after U+FA0C. +// for (int k = 0; k < char.MaxValue; k++) { + for (int k = 0; k < '\uFA0D'; k++) { + if (decompIndex [k] == 0 || IsIgnorable (k)) + continue; + if (decompValues [decompIndex [k]] == c /*&& + decompLength [k] == 1*/ || + decompLength [k] == 3 && + decompValues [decompIndex [k] + 1] == c) { + arr [k - offset] = (ushort) v++; + if (v % 256 == 0) + v += 2; + } + } } } @@ -929,7 +1674,7 @@ namespace Mono.Globalization.Unicode // category = "ko"; arr = cjkKO; - offset = char.MaxValue - arr.Length; + offset = 0;//char.MaxValue - arr.Length; doc.Load (koXML); foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) { XmlElement sc = (XmlElement) reset.NextSibling; @@ -969,8 +1714,124 @@ namespace Mono.Globalization.Unicode } } + void ModifyUnidata () + { + ArrayList decompValues = new ArrayList (this.decompValues); + + // Hebrew uppercase letters. 
+ foreach (int i in new int [] + {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6}) + isUppercase [i] = true; + + + // Modify some decomposition equivalence + for (int i = 0xFE31; i <= 0xFE34; i++) { + decompType [i] = 0; + decompIndex [i] = 0; + decompLength [i] = 0; + } + decompType [0x037E] = 0; + decompIndex [0x037E] = 0; + decompLength [0x037E] = 0; + + // Hangzhou numbers + for (int i = 0x3021; i <= 0x3029; i++) + diacritical [i] = 0x4E; + // Korean parens numbers + for (int i = 0x3200; i <= 0x321C; i++) + diacritical [i] = 0xA; + for (int i = 0x3260; i <= 0x327B; i++) + diacritical [i] = 0xC; + + // LAMESPEC: these remapping should not be done. + // Windows have incorrect CJK compat mappings. + decompValues [decompIndex [0x32A9]] = 0x91AB; + decompLength [0x323B] = 1; + decompValues [decompIndex [0x323B]] = 0x5B78; + decompValues [decompIndex [0x32AB]] = 0x5B78; + decompValues [decompIndex [0x32A2]] = 0x5BEB; + decompLength [0x3238] = 1; + decompValues [decompIndex [0x3238]] = 0x52DE; + decompValues [decompIndex [0x3298]] = 0x52DE; + + // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things) + decompIndex [0xFA0C] = decompValues.Count; + decompValues.Add ((int) 0x5140); + decompLength [0xFA0C] = 1; + decompIndex [0xF929] = decompLength [0xF929] = 0; + + decompValues [decompIndex [0xF92C]] = 0x90DE; + + decompIndex [0x2125] = decompValues.Count; + decompValues.Add ((int) 0x005A); + decompLength [0x2125] = 1; + decompType [0x2125] = DecompositionFont; + + this.decompValues = decompValues.ToArray (typeof (int)) as int []; + } + void ModifyParsedValues () { + // Sometimes STROKE don't work fine + diacritical [0xD8] = diacritical [0xF8] = 0x21; + diacritical [0x141] = diacritical [0x142] = 0x1F; + // FIXME: why? 
+ diacritical [0xAA] = diacritical [0xBA] = 3; + diacritical [0xD0] = diacritical [0xF0] = 0x68; + diacritical [0x131] = 3; + diacritical [0x138] = 3; + // TOPBAR does not work as an identifier for the weight + diacritical [0x182] = diacritical [0x183] = 0x68; // B + diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D + // TONE TWO + diacritical [0x1A7] = diacritical [0x1A8] = 0x87; + // TONE SIX + diacritical [0x184] = diacritical [0x185] = 0x87; + // OPEN E + diacritical [0x190] = diacritical [0x25B] = 0x7B; + // There are many letters w/ diacritical weight 0x7B + diacritical [0x0192] = diacritical [0x0194] = + diacritical [0x0195] = diacritical [0x0196] = + diacritical [0x019C] = diacritical [0x019E] = + diacritical [0x01A6] = diacritical [0x01B1] = + diacritical [0x01B2] = diacritical [0x01BF] = 0x7B; + // ... as well as 0x7C + diacritical [0x01A2] = diacritical [0x01A3] = 0x7C; + + // NFKD characters seem to have diacritical + // weight as 3,4,5... but the order does not look + // by codepoint and I have no idea how they are sorted. + diacritical [0x210E] = 3; + diacritical [0x210F] = 0x68; + diacritical [0x2110] = 4; + diacritical [0x2111] = 5; + diacritical [0x2112] = 4; + diacritical [0x2113] = 4; + diacritical [0x211B] = 4; + diacritical [0x211C] = 5; + + // some cyrillic diacritical weight. They seem to be + // based on old character names, so it's quicker to + // set them directly here. + // FIXME: they are by mostly unknown reason + diacritical [0x0496] = diacritical [0x0497] = 7; + diacritical [0x0498] = diacritical [0x0499] = 0x1A; + diacritical [0x049A] = diacritical [0x049B] = 0x17; + diacritical [0x049C] = diacritical [0x049D] = 9; + diacritical [0x049E] = diacritical [0x049F] = 4; + diacritical [0x04A0] = diacritical [0x04A1] = 0xA; + diacritical [0x04A2] = diacritical [0x04A3] = 7; + diacritical [0x04A4] = diacritical [0x04A5] = 8; + diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA? 
+ diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2 + diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U? + diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC + diacritical [0x04B4] = diacritical [0x04B5] = 3; + diacritical [0x04B6] = 8; + diacritical [0x04B7] = 7; + diacritical [0x04B8] = diacritical [0x04B9] = 9; + diacritical [0x04BA] = diacritical [0x04BB] = 9; + // number, secondary weights byte weight = 0x38; int [] numarr = numberSecondaryWeightBounds; @@ -979,11 +1840,12 @@ namespace Mono.Globalization.Unicode if (Char.IsNumber ((char) cp)) diacritical [cp] = weight; - // Korean parens numbers - for (int i = 0x3200; i <= 0x321C; i++) - diacritical [i] = 0xA; - for (int i = 0x3260; i <= 0x327B; i++) - diacritical [i] = 0xC; + // Gurmukhi special letters' diacritical weight + for (int i = 0x0A50; i < 0x0A60; i++) + diacritical [i] = 4; + // Oriya special letters' diacritical weight + for (int i = 0x0B5C; i < 0x0B60; i++) + diacritical [i] = 6; // Update name part of named characters for (int i = 0; i < sortableCharNames.Count; i++) { @@ -1025,14 +1887,25 @@ namespace Mono.Globalization.Unicode #region Specially ignored // 01 // This will raise "Defined" flag up. + // FIXME: Check If it is really fine. Actually for + // Japanese voice marks this code does remapping. 
foreach (char c in specialIgnore) map [(int) c] = new CharMapEntry (0, 0, 0); #endregion + #region Extenders (FF FF) + fillIndex [0xFF] = 0xFF; + char [] specialBiggest = new char [] { + '\u3005', '\u3031', '\u3032', '\u309D', + '\u309E', '\u30FC', '\u30FD', '\u30FE', + '\uFE7C', '\uFE7D', '\uFF70'}; + foreach (char c in specialBiggest) + AddCharMap (c, 0xFF, 0); + #endregion #region Variable weights // Controls : 06 03 - 06 3D - fillIndex [6] = 3; + fillIndex [0x6] = 3; for (int i = 0; i < 65536; i++) { if (IsIgnorable (i)) continue; @@ -1045,22 +1918,41 @@ namespace Mono.Globalization.Unicode } // Apostrophe 06 80 - fillIndex [6] = 0x80; - AddCharMapGroup ('\'', 6, 1, 0); + fillIndex [0x6] = 0x80; + AddCharMap ('\'', 6, 0); + AddCharMap ('\uFF07', 6, 1); AddCharMap ('\uFE63', 6, 1); + // SPECIAL CASE: fill FE32 here in prior to be added + // at 2013. Windows does not always respect NFKD. + map [0xFE32] = new CharMapEntry (6, 0x90, 0); + // Hyphen/Dash : 06 81 - 06 90 for (int i = 0; i < char.MaxValue; i++) { - if (Char.GetUnicodeCategory ((char) i) - == UnicodeCategory.DashPunctuation) - AddCharMapGroupTail ((char) i, 6, 1); + if (!IsIgnorable (i) && + Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.DashPunctuation) { + AddCharMapGroup2 ((char) i, 6, 1, 0); + if (i == 0x2011) { + // SPECIAL: add 2027 and 2043 + // Maybe they are regarded the + // same hyphens in "central" + // position. 
+ AddCharMap ('\u2027', 6, 1); + AddCharMap ('\u2043', 6, 1); + } + } } + // They are regarded as primarily equivalent to '-' + map [0x208B] = new CharMapEntry (6, 0x82, 0); + map [0x207B] = new CharMapEntry (6, 0x82, 0); + map [0xFF0D] = new CharMapEntry (6, 0x82, 0); // Arabic variable weight chars 06 A0 - fillIndex [6] = 0xA0; // vowels for (int i = 0x64B; i <= 0x650; i++) - AddCharMapGroupTail ((char) i, 6, 1); + AddArabicCharMap ((char) i, 6, 1, 0); // sukun AddCharMapGroup ('\u0652', 6, 1, 0); // shadda @@ -1080,10 +1972,11 @@ namespace Mono.Globalization.Unicode for (int i = 0x0329; i <= 0x0334; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); + fillIndex [0x1]++; for (int i = 0x0339; i <= 0x0341; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); - fillIndex [0x1] = 0x72; + fillIndex [0x1] = 0x74; for (int i = 0x0346; i <= 0x0348; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); @@ -1096,6 +1989,7 @@ namespace Mono.Globalization.Unicode for (int i = 0x02CE; i <= 0x02CF; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); + fillIndex [0x1]++; for (int i = 0x02D1; i <= 0x02D3; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); @@ -1104,12 +1998,87 @@ namespace Mono.Globalization.Unicode if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); + + // FIXME: needs more love here (it should eliminate + // all the hacky code above). 
+ for (int i = 0x0300; i < 0x0370; i++) + if (!IsIgnorable (i) && diacritical [i] != 0 + && !map [i].Defined) + map [i] = new CharMapEntry ( + 0x1, 0x1, diacritical [i]); + + // Cyrillic and Armenian nonspacing mark + fillIndex [0x1] = 0x94; + for (int i = 0x400; i < 0x580; i++) + if (!IsIgnorable (i) && + Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.NonSpacingMark) + AddCharMap ((char) i, 1, 1); + + fillIndex [0x1] = 0x8D; + // syriac dotted nonspacing marks (1) + AddCharMap ('\u0740', 0x1, 1); + AddCharMap ('\u0741', 0x1, 1); + AddCharMap ('\u0742', 0x1, 1); + // syriac oblique nonspacing marks + AddCharMap ('\u0747', 0x1, 1); + AddCharMap ('\u0748', 0x1, 1); + // syriac dotted nonspacing marks (2) + fillIndex [0x1] = 0x94; // this reset is mandatory + AddCharMap ('\u0732', 0x1, 1); + AddCharMap ('\u0735', 0x1, 1); + AddCharMap ('\u0738', 0x1, 1); + AddCharMap ('\u0739', 0x1, 1); + AddCharMap ('\u073C', 0x1, 1); + // SPECIAL CASES: superscripts + AddCharMap ('\u073F', 0x1, 1); + AddCharMap ('\u0711', 0x1, 1); + // syriac "DOTS" + for (int i = 0x0743; i <= 0x0746; i++) + AddCharMap ((char) i, 0x1, 1); + for (int i = 0x0730; i <= 0x0780; i++) + if (!map [i].Defined && + Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.NonSpacingMark) + AddCharMap ((char) i, 0x1, 1); + // LAMESPEC: It should not stop at '\u20E1'. There are // a few more characters (that however results in // overflow of level 2 unless we start before 0xDD). 
- fillIndex [0x1] = 0xDC; - for (int i = 0x20d0; i <= 0x20e1; i++) + fillIndex [0x1] = 0xDD; + for (int i = 0x20D0; i <= 0x20DC; i++) + AddCharMap ((char) i, 0x1, 1); + fillIndex [0x1] = 0xEC; + for (int i = 0x20DD; i <= 0x20E1; i++) + AddCharMap ((char) i, 0x1, 1); + fillIndex [0x1] = 0x4; + AddCharMap ('\u0CD5', 0x1, 1); + AddCharMap ('\u0CD6', 0x1, 1); + AddCharMap ('\u093C', 0x1, 1); + for (int i = 0x302A; i <= 0x302D; i++) AddCharMap ((char) i, 0x1, 1); + AddCharMap ('\u0C55', 0x1, 1); + AddCharMap ('\u0C56', 0x1, 1); + + fillIndex [0x1] = 0x50; // I wonder how they are sorted + for (int i = 0x02D4; i <= 0x02D7; i++) + AddCharMap ((char) i, 0x1, 1); + + // They are not part of Nonspacing marks, but have + // only diacritical weight. + for (int i = 0x3099; i <= 0x309C; i++) + map [i] = new CharMapEntry (1, 1, 1); + map [0xFF9E] = new CharMapEntry (1, 1, 1); + map [0xFF9F] = new CharMapEntry (1, 1, 2); + map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1); + map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1); + for (int i = 0x30FC; i <= 0x30FE; i++) + map [i] = new CharMapEntry (0xFF, 0xFF, 1); + + fillIndex [0x1] = 0xA; + for (int i = 0x0951; i <= 0x0954; i++) + AddCharMap ((char) i, 0x1, 2); + #endregion @@ -1132,35 +2101,51 @@ namespace Mono.Globalization.Unicode // while they aren't. AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol AddCharMap ('\u2423', 0x7, 1, 0); // open box + #endregion - // FIXME: 09 should be more complete. 
+ // category 09 - continued symbols from 08 fillIndex [0x9] = 2; // misc tech mark for (int cp = 0x2300; cp <= 0x237A; cp++) AddCharMap ((char) cp, 0x9, 1, 0); // arrows - byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3}; + byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; foreach (DictionaryEntry de in arrowValues) { int idx = (int) de.Value; int cp = (int) de.Key; if (map [cp].Defined) continue; fillIndex [0x9] = (byte) (0xD8 + idx); -//Console.Error.WriteLine ("-------- {0} {1:X04} {2:X02} {3:X02}", idx, cp, fillIndex [0x9], arrowLv2 [idx]); AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]); arrowLv2 [idx]++; -// map [cp] = new CharMapEntry (0x9, fillIndex [0x9]++, arrowLv2 [idx]++); } - - // FIXME: 08 should be more complete. - fillIndex [0x8] = 2; - for (int cp = 0; cp < char.MaxValue; cp++) - if (!map [cp].Defined && - Char.GetUnicodeCategory ((char) cp) == - UnicodeCategory.MathSymbol) - AddCharMapGroup ((char) cp, 0x8, 1, 0); + // boxes + byte [] boxLv2 = new byte [128]; + // 0-63 will be used for those offsets are positive, + // and 64-127 are for negative ones. + for (int i = 0; i < boxLv2.Length; i++) + boxLv2 [i] = 3; + foreach (DictionaryEntry de in boxValues) { + int cp = (int) de.Key; + int off = (int) de.Value; + if (map [cp].Defined) + continue; + if (off < 0) { + fillIndex [0x9] = (byte) (0xE5 + off); + AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++); + } + else { + fillIndex [0x9] = (byte) (0xE5 + off); + AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++); + } + } + // Some special characters (slanted) + fillIndex [0x9] = 0xF4; + AddCharMap ('\u2571', 0x9, 3); + AddCharMap ('\u2572', 0x9, 3); + AddCharMap ('\u2573', 0x9, 3); // FIXME: implement 0A #region Symbols @@ -1175,26 +2160,58 @@ namespace Mono.Globalization.Unicode } // byte other symbols for (int cp = 0; cp < 0x100; cp++) { + if (cp == 0xA6) + continue; // SPECIAL: skip FIXME: why? 
uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && - uc == UnicodeCategory.OtherSymbol) + uc == UnicodeCategory.OtherSymbol || + cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7') AddCharMapGroup ((char) cp, 0xA, 1, 0); } + // U+30FB here + AddCharMapGroup ('\u30FB', 0xA, 1, 0); + + for (int cp = 0x2020; cp <= 0x2031; cp++) + if (Char.IsPunctuation ((char) cp)) + AddCharMap ((char) cp, 0xA, 1, 0); + // SPECIAL CASES: why? + AddCharMap ('\u203B', 0xA, 1, 0); + AddCharMap ('\u2040', 0xA, 1, 0); + AddCharMap ('\u2041', 0xA, 1, 0); + AddCharMap ('\u2042', 0xA, 1, 0); + + for (int cp = 0x20A0; cp <= 0x20AB; cp++) + AddCharMap ((char) cp, 0xA, 1, 0); + + // 3004 is skipped at first... + for (int cp = 0x3010; cp <= 0x3040; cp++) + if (Char.IsSymbol ((char) cp)) + AddCharMap ((char) cp, 0xA, 1, 0); + // SPECIAL CASES: added here + AddCharMap ('\u3004', 0xA, 1, 0); + AddCharMap ('\u327F', 0xA, 1, 0); - fillIndex [0xA] = 0x2F; // FIXME: it won't be needed for (int cp = 0x2600; cp <= 0x2613; cp++) AddCharMap ((char) cp, 0xA, 1, 0); + // Dingbats for (int cp = 0x2620; cp <= 0x2770; cp++) if (Char.IsSymbol ((char) cp)) AddCharMap ((char) cp, 0xA, 1, 0); - + // OCR + for (int i = 0x2440; i < 0x2460; i++) + AddCharMap ((char) i, 0xA, 1, 0); + + // SPECIAL CASES: why? 
+ AddCharMap ('\u0E3F', 0xA, 1, 0); + AddCharMap ('\u2117', 0xA, 1, 0); + AddCharMap ('\u20AC', 0xA, 1, 0); #endregion #region Numbers // 0C 02 - 0C E1 fillIndex [0xC] = 2; // 9F8 : Bengali "one less than the denominator" - AddCharMap ('\u09F8', 0xC, 1); + AddCharMap ('\u09F8', 0xC, 1, 0x3C); ArrayList numbers = new ArrayList (); for (int i = 0; i < 65536; i++) @@ -1206,11 +2223,15 @@ namespace Mono.Globalization.Unicode ArrayList numberValues = new ArrayList (); foreach (int i in numbers) numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i])); + // SPECIAL CASE: Cyrillic Thousand sign + numberValues.Add (new DictionaryEntry (0x0482, 1000m)); numberValues.Sort (DecimalDictionaryValueComparer.Instance); //foreach (DictionaryEntry de in numberValues) //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]); + // FIXME: fillIndex adjustment lines are too + // complicated. It must be simpler. decimal prevValue = -1; foreach (DictionaryEntry de in numberValues) { int cp = (int) de.Key; @@ -1228,14 +2249,25 @@ namespace Mono.Globalization.Unicode fillIndex [0xC]++; int xcp; - xcp = (int) prevValue + 0x2170 - 1; - AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = (int) prevValue + 0x2160 - 1; - AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - fillIndex [0xC] += 2; - xcp = (int) prevValue + 0x3021 - 1; - AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - fillIndex [0xC]++; + if (currValue <= 13) { + if (currValue == 4) + fillIndex [0xC]++; + // SPECIAL CASE + if (currValue == 11) + AddCharMap ('\u0BF0', 0xC, 1); + xcp = (int) prevValue + 0x2160 - 1; + AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); + xcp = (int) prevValue + 0x2170 - 1; + AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); + fillIndex [0xC]++; + } + if (currValue < 12) + fillIndex [0xC]++; + if (currValue <= 10) { + xcp = (int) prevValue + 0x3021 - 1; + AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); + fillIndex 
[0xC]++; + } } if (prevValue < currValue) prevValue = currValue; @@ -1243,39 +2275,56 @@ namespace Mono.Globalization.Unicode continue; // HangZhou and Roman are add later // (code is above) - else if (0x3021 <= cp && cp < 0x302A - || 0x2160 <= cp && cp < 0x216A - || 0x2170 <= cp && cp < 0x217A) + if (0x3021 <= cp && cp < 0x302A + || 0x2160 <= cp && cp < 0x216C + || 0x2170 <= cp && cp < 0x217C) continue; - if (cp == 0x215B) // FIXME: why? + if (cp == 0x215B) // FIXME: why? fillIndex [0xC] += 2; else if (cp == 0x3021) // FIXME: why? fillIndex [0xC]++; - AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]); - if (addnew || cp <= '9') { + int mod = (int) currValue - 1; int xcp; - if (1 <= currValue && currValue <= 10) { - xcp = cp - 0x31 + 0x2776; + if (1 <= currValue && currValue <= 11) { + xcp = mod + 0x2776; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = cp - 0x31 + 0x2780; + xcp = mod + 0x2780; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = cp - 0x31 + 0x278A; + xcp = mod + 0x278A; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } if (1 <= currValue && currValue <= 20) { - xcp = cp - 0x31 + 0x2460; + xcp = mod + 0x2460; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = cp - 0x31 + 0x2474; + xcp = mod + 0x2474; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = cp - 0x31 + 0x2488; + xcp = mod + 0x2488; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } } + if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9) + fillIndex [0xC]++; + AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true); - if (cp != 0x09E7 && cp != 0x09EA) + switch (cp) { + // Maybe Bengali digit numbers do not increase + // indexes, but 0x09E6 does. 
+ case 0x09E7: case 0x09E8: case 0x09E9: + case 0x09EA: + // SPECIAL CASES + case 0x0BF0: case 0x2180: case 0x2181: + break; + // SPECIAL CASE + case 0x0BF1: fillIndex [0xC]++; + break; + default: + if (currValue < 11 || currValue == 1000) + fillIndex [0xC]++; + break; + } // Add special cases that are not regarded as // numbers in UnicodeCategory speak. @@ -1284,7 +2333,7 @@ namespace Mono.Globalization.Unicode AddCharMapGroup ('\u01BD', 0xC, 0, 0); AddCharMapGroup ('\u01BC', 0xC, 1, 0); } - else if (cp == '6') // FIXME: why? + else if (cp == '2' || cp == '6') // FIXME: why? fillIndex [0xC]++; } @@ -1299,7 +2348,6 @@ namespace Mono.Globalization.Unicode for (int i = 0; i < alphabets.Length; i++) AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]); - // non-ASCII Latin alphabets // FIXME: there is no such characters that are placed // *after* "alphabets" array items. This is nothing @@ -1319,6 +2367,9 @@ namespace Mono.Globalization.Unicode // but inside a-to-z range. // 3.there are some expanded characters that // are not part of Unicode Standard NFKD. + // 4. some characters are letter in IsLetter + // but not in sortkeys (maybe unicode version + // difference caused it). switch (i) { // 1. skipping them does not make sense // case 0xD0: case 0xF0: case 0x131: case 0x138: @@ -1336,61 +2387,188 @@ namespace Mono.Globalization.Unicode case 0xFE: // Icelandic Thorn case 0xDF: // German ss case 0xFF: // German ss + // 4. + case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // not classified yet // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9: // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8: // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF: -// case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // case 0x1DD: continue; } AddCharMapGroup ((char) i, 0xE, 1, 0); } - // Greek and Coptic - fillIndex [0xF] = 02; - for (int i = 0x0380; i < 0x0390; i++) + // IPA extensions + // FIXME: this results in not equivalent values to + // Windows, but is safer for comparison. 
+ char [] ipaArray = new char [0x300 - 0x250 + 0x20]; + for (int i = 0x40; i < 0x60; i++) if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); - fillIndex [0xF] = 02; - for (int i = 0x0391; i < 0x03CF; i++) + ipaArray [i - 0x40] = (char) (i); + for (int i = 0x250; i < 0x300; i++) if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); + ipaArray [i - 0x250 + 0x20] = (char) i; + Array.Sort (ipaArray, UCAComparer.Instance); + int targetASCII = 0; + byte latinDiacritical = 0x7B; + foreach (char c in ipaArray) { + if (c <= 'Z') { + targetASCII = c; + latinDiacritical = 0x7B; + } + else + map [(int) c] = new CharMapEntry ( + 0xE, + map [targetASCII].Level1, + latinDiacritical++); + } + + // Greek and Coptic + + // FIXME: this is (mysterious and) incomplete. + for (int i = 0x0380; i < 0x0400; i++) + if (diacritical [i] == 0 && + decompLength [i] == 1 && + decompType [i] == DecompositionCompat) + diacritical [i] = 3; + + fillIndex [0xF] = 2; + for (int i = 0x0391; i < 0x03AA; i++) + if (i != 0x03A2) + AddCharMap ((char) i, 0xF, 1, + diacritical [i]); + fillIndex [0xF] = 2; + for (int i = 0x03B1; i < 0x03CA; i++) + if (i != 0x03C2) + AddCharMap ((char) i, 0xF, 1, + diacritical [i]); + // Final Sigma + map [0x03C2] = new CharMapEntry (0xF, + map [0x03C3].Level1, map [0x03C3].Level2); + fillIndex [0xF] = 0x40; - for (int i = 0x03D0; i < 0x0400; i++) - if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); + for (int i = 0x03DA; i < 0x03F0; i++) + AddCharMap ((char) i, 0xF, + (byte) (i % 2 == 0 ? 0 : 2), + diacritical [i]); + + // NFKD + for (int i = 0x0386; i <= 0x0400; i++) + FillLetterNFKD (i, true, true); + + // Cyrillic. + // Cyrillic letters are sorted like Latin letters i.e. + // containing culture-specific letters between the + // standard Cyrillic sequence. + // + // We can't use UCA here; it has different sorting. 
+ char [] orderedCyrillic = new char [] { + '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', + '\u0452', // DJE for Serbocroatian + '\u0435', + '\u0454', // IE for Ukrainian + '\u0436', '\u0437', + '\u0455', // DZE + '\u0438', + '\u0456', // Byelorussian-Ukrainian I + '\u0457', // YI + '\u0439', + '\u0458', // JE + '\u043A', '\u043B', + '\u0459', // LJE + '\u043C', '\u043D', + '\u045A', // NJE + '\u043E', + // 4E9 goes here. + '\u043F', '\u0440', '\u0441', '\u0442', + '\u045B', // TSHE for Serbocroatian + '\u0443', + '\u045E', // Short U for Byelorussian + '\u04B1', // Straight U w/ stroke (diacritical!) + '\u0444', '\u0445', '\u0446', '\u0447', + '\u045F', // DZHE + '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', + '\u044D', '\u044E', '\u044F'}; + + // For some characters here is a map to basic cyrillic + // letters. See UnicodeData.txt character names for + // the sources. Here I simply declare an equiv. array. + // The content characters are map from U+490(,491), + // skipping small letters. + char [] cymap_src = new char [] { + '\u0433', '\u0433', '\u0433', '\u0436', + '\u0437', '\u043A', '\u043A', '\u043A', + '\u043A', '\u043D', '\u043D', '\u043F', + '\u0445', '\u0441', '\u0442', '\u0443', + '\u0443', '\u0445', '\u0446', '\u0447', + '\u0447', '\u0432', '\u0435', '\u0435', + '\u0406', '\u0436', '\u043A', '\u043D', + '\u0447', '\u0435'}; + + fillIndex [0x10] = 0x8D; + for (int i = 0x0460; i < 0x0481; i++) { + if (Char.IsLetter ((char) i)) { + if (i == 0x0476) + // U+476/477 have the same + // primary weight as U+474/475. + fillIndex [0x10] -= 3; + AddLetterMap ((char) i, 0x10, 3); + } + } - // Cyrillic - UCA order w/ some modification - fillIndex [0x10] = 0x3; - // table which is moslty from UCA DUCET. 
+ fillIndex [0x10] = 0x6; for (int i = 0; i < orderedCyrillic.Length; i++) { - char c = orderedCyrillic [i]; - if (Char.IsLetter (c)) - AddLetterMap (c, 0x10, 3); + char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture); + if (!IsIgnorable ((int) c) && + Char.IsLetter (c) && + !map [c].Defined) { + AddLetterMap (c, 0x10, 0); + fillIndex [0x10] += 3; + } } - for (int i = 0x0460; i < 0x0481; i++) { - if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0x10, 3); + + // NFKD + for (int i = 0x0401; i <= 0x045F; i++) + FillLetterNFKD (i, false, false); + + for (int i = 0; i < cymap_src.Length; i++) { + char c = cymap_src [i]; + fillIndex [0x10] = map [c].Level1; + int c2 = 0x0490 + i * 2; + AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false); } // Armenian fillIndex [0x11] = 0x3; - for (int i = 0x0531; i < 0x0586; i++) + fillIndex [0x1] = 0x98; + for (int i = 0x0531; i < 0x0586; i++) { + if (i == 0x0559 || i == 0x55A) + AddCharMap ((char) i, 1, 1); if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x11, 1); + } // Hebrew // -Letters - fillIndex [0x12] = 0x3; + fillIndex [0x12] = 0x2; for (int i = 0x05D0; i < 0x05FF; i++) - if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0x12, 1); + if (Char.IsLetter ((char) i)) { + if (isUppercase [i]) { + fillIndex [0x12]--; + AddLetterMap ((char) i, 0x12, 2); + } + else + AddLetterMap ((char) i, 0x12, 1); + } // -Accents fillIndex [0x1] = 0x3; - for (int i = 0x0591; i <= 0x05C2; i++) + for (int i = 0x0591; i <= 0x05C2; i++) { + if (i == 0x05A3 || i == 0x05BB) + fillIndex [0x1]++; if (i != 0x05BE) AddCharMap ((char) i, 0x1, 1); + } // Arabic fillIndex [0x1] = 0x8E; @@ -1408,24 +2586,59 @@ namespace Mono.Globalization.Unicode // (byte) arabicLetterPrimaryValues [i], 1); fillIndex [0x13] = (byte) arabicLetterPrimaryValues [i]; - AddLetterMap ((char) i, 0x13, 0); + byte formDiacritical = 8; // default + // SPECIAL CASES: + switch (i) { + case 0x0622: formDiacritical = 9; break; + case 0x0623: 
formDiacritical = 0xA; break; + case 0x0624: formDiacritical = 5; break; + case 0x0625: formDiacritical = 0xB; break; + case 0x0626: formDiacritical = 7; break; + case 0x0649: formDiacritical = 5; break; + case 0x064A: formDiacritical = 7; break; + } +// AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false); + AddArabicCharMap ((char) i, 0x13, 1, formDiacritical); } + for (int i = 0x0670; i < 0x0673; i++) + map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670)); fillIndex [0x13] = 0x84; for (int i = 0x0674; i < 0x06D6; i++) if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0x13, 1); + AddLetterMapCore ((char) i, 0x13, 1, 0, false); // Devanagari + + // FIXME: this could be fixed in more decent way + for (int i = 0x0958; i <= 0x095F; i++) + diacritical [i] = 8; + // FIXME: it does seem straight codepoint mapping. fillIndex [0x14] = 04; for (int i = 0x0901; i < 0x0905; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); fillIndex [0x14] = 0xB; - for (int i = 0x0905; i < 0x093A; i++) + for (int i = 0x0905; i < 0x093A; i++) { + if (i == 0x0928) + AddCharMap ('\u0929', 0x14, 0, 8); + if (i == 0x0930) + AddCharMap ('\u0931', 0x14, 0, 8); + if (i == 0x0933) + AddCharMap ('\u0934', 0x14, 0, 8); if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x14, 4); - for (int i = 0x093E; i < 0x094F; i++) + if (i == 0x090B) + AddCharMap ('\u0960', 0x14, 4); + if (i == 0x090C) + AddCharMap ('\u0961', 0x14, 4); + } + fillIndex [0x14] = 0xDA; + for (int i = 0x093E; i < 0x0945; i++) + if (!IsIgnorable (i)) + AddLetterMap ((char) i, 0x14, 2); + fillIndex [0x14] = 0xEC; + for (int i = 0x0945; i < 0x094F; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); @@ -1454,36 +2667,90 @@ namespace Mono.Globalization.Unicode // Gurmukhi. orderedGurmukhi is from UCA // FIXME: it does not look equivalent to UCA. 
- fillIndex [0x1] = 03; - fillIndex [0x16] = 02; + fillIndex [0x16] = 04; + fillIndex [0x1] = 3; for (int i = 0; i < orderedGurmukhi.Length; i++) { char c = orderedGurmukhi [i]; if (IsIgnorable ((int) c)) continue; - if (!Char.IsLetter (c)) { + if (IsIgnorableNonSpacing (c)) { AddLetterMap (c, 0x1, 1); continue; } if (c == '\u0A3C' || c == '\u0A4D' || '\u0A66' <= c && c <= '\u0A71') continue; - AddLetterMap (c, 0x16, 4); + // SPECIAL CASES + byte shift = 4; + switch (c) { + case '\u0A33': case '\u0A36': case '\u0A16': + case '\u0A17': case '\u0A5B': case '\u0A5E': + shift = 0; + break; + } + if (c == '\u0A3E') // Skip + fillIndex [0x16] = 0xC0; + AddLetterMap (c, 0x16, shift); } // Gujarati. orderedGujarati is from UCA - fillIndex [0x17] = 02; - for (int i = 0; i < orderedGujarati.Length; i++) - AddLetterMap (orderedGujarati [i], 0x17, 4); + fillIndex [0x17] = 0x4; + // nonspacing marks + map [0x0A4D] = new CharMapEntry (1, 0, 0x3); + map [0x0ABD] = new CharMapEntry (1, 0, 0x3); + map [0x0A3C] = new CharMapEntry (1, 0, 0x4); + map [0x0A71] = new CharMapEntry (1, 0, 0x6); + map [0x0ABC] = new CharMapEntry (1, 0, 0xB); + map [0x0A70] = new CharMapEntry (1, 0, 0xE); + // letters go first. 
+ for (int i = 0; i < orderedGujarati.Length; i++) { + // SPECIAL CASE + char c = orderedGujarati [i]; + if (Char.IsLetter (c)) { + // SPECIAL CASES + if (c == '\u0AB3' || c == '\u0A32') + continue; + if (c == '\u0A33') { + AddCharMap ('\u0A32', 0x17, 0); + AddCharMap ('\u0A33', 0x17, 4, 4); + continue; + } + if (c == '\u0A8B') + AddCharMap ('\u0AE0', 0x17, 0, 5); + AddCharMap (c, 0x17, 4); + + if (c == '\u0AB9') + AddCharMap ('\u0AB3', 0x17, 6); + } + } + // non-letters + byte gujaratiShift = 4; + fillIndex [0x17] = 0xC0; + for (int i = 0; i < orderedGujarati.Length; i++) { + char c = orderedGujarati [i]; + if (fillIndex [0x17] == 0xCC) + gujaratiShift = 3; + if (!Char.IsLetter (c)) { + // SPECIAL CASES + if (c == '\u0A82') + AddCharMap ('\u0A81', 0x17, 2); + if (c == '\u0AC2') + fillIndex [0x17]++; + AddLetterMap (c, 0x17, gujaratiShift); + } + } // Oriya + fillIndex [0x1] = 03; fillIndex [0x18] = 02; for (int i = 0x0B00; i < 0x0B7F; i++) { switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.NonSpacingMark: case UnicodeCategory.DecimalDigitNumber: + AddLetterMap ((char) i, 0x1, 1); continue; } - AddLetterMap ((char) i, 0x18, 1); + AddLetterMapCore ((char) i, 0x18, 1, 0, true); } // Tamil @@ -1491,13 +2758,11 @@ namespace Mono.Globalization.Unicode AddCharMap ('\u0BD7', 0x19, 0); fillIndex [0x19] = 0xA; // vowels - for (int i = 0x0BD7; i < 0x0B94; i++) - if (Char.IsLetter ((char) i)) + for (int i = 0x0B82; i <= 0x0B94; i++) + if (!IsIgnorable ((char) i)) AddCharMap ((char) i, 0x19, 2); // special vowel - fillIndex [0x19] = 0x24; - AddCharMap ('\u0B94', 0x19, 0); - fillIndex [0x19] = 0x26; + fillIndex [0x19] = 0x28; // The array for Tamil consonants is a constant. // Windows have almost similar sequence to TAM from // tamilnet but a bit different in Grantha. 
@@ -1529,47 +2794,82 @@ namespace Mono.Globalization.Unicode for (int i = 0x0C80; i < 0x0CE5; i++) { if (i == 0x0CD5 || i == 0x0CD6) continue; // ignore + if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE) + continue; // shift after 0xCB9 AddCharMap ((char) i, 0x1B, 3); + if (i == 0x0CB9) { + // SPECIAL CASES: but why? + AddCharMap ('\u0CB1', 0x1B, 3); // RRA + AddCharMap ('\u0CB3', 0x1B, 3); // LLA + AddCharMap ('\u0CDE', 0x1B, 3); // FA + } + if (i == 0x0CB2) + AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL } // Malayalam fillIndex [0x1C] = 2; - for (int i = 0x0D02; i < 0x0D61; i++) + fillIndex [0x1] = 3; + for (int i = 0x0D02; i < 0x0D61; i++) { // FIXME: I avoided MSCompatUnicodeTable usage // here (it results in recursion). So check if // using NonSpacingMark makes sense or not. if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) // if (!MSCompatUnicodeTable.IsIgnorable ((char) i)) AddCharMap ((char) i, 0x1C, 1); + else if (!IsIgnorable ((char) i)) + AddCharMap ((char) i, 1, 1); + } // Thai ... note that it breaks 0x1E wall after E2B! // Also, all Thai characters have level 2 value 3. fillIndex [0x1E] = 2; - for (int i = 0xE44; i < 0xE48; i++) + fillIndex [0x1] = 3; + for (int i = 0xE40; i <= 0xE44; i++) AddCharMap ((char) i, 0x1E, 1, 3); for (int i = 0xE01; i < 0xE2B; i++) - AddCharMap ((char) i, 0x1E, 6, 0); + AddCharMap ((char) i, 0x1E, 6, 3); fillIndex [0x1F] = 5; for (int i = 0xE2B; i < 0xE30; i++) - AddCharMap ((char) i, 0x1F, 6, 0); + AddCharMap ((char) i, 0x1F, 6, 3); + fillIndex [0x1F] = 0x1E; for (int i = 0xE30; i < 0xE3B; i++) AddCharMap ((char) i, 0x1F, 1, 3); // some Thai characters remains. 
char [] specialThai = new char [] {'\u0E45', '\u0E46', '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'}; foreach (char c in specialThai) - AddCharMap (c, 0x1F, 1); + AddCharMap (c, 0x1F, 1, 3); + + for (int i = 0xE00; i < 0xE80; i++) + if (Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.NonSpacingMark) + AddCharMap ((char) i, 1, 1); // Lao fillIndex [0x1F] = 2; - for (int i = 0xE80; i < 0xEDF; i++) - if (Char.IsLetter ((char) i)) + fillIndex [0x1] = 3; + for (int i = 0xE80; i < 0xEDF; i++) { + if (IsIgnorable ((char) i)) + continue; + else if (Char.IsLetter ((char) i)) AddCharMap ((char) i, 0x1F, 1); + else if (Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.NonSpacingMark) + AddCharMap ((char) i, 1, 1); + } // Georgian. orderedGeorgian is from UCA DUCET. fillIndex [0x21] = 5; - for (int i = 0; i < orderedGeorgian.Length; i++) - AddLetterMap (orderedGeorgian [i], 0x21, 5); + for (int i = 0; i < orderedGeorgian.Length; i++) { + char c = orderedGeorgian [i]; + if (map [(int) c].Defined) + continue; + AddCharMap (c, 0x21, 0); + if (c < '\u10F6') + AddCharMap ((char) (c - 0x30), 0x21, 0); + fillIndex [0x21] += 5; + } // Japanese Kana. 
fillIndex [0x22] = 2; @@ -1594,6 +2894,16 @@ namespace Mono.Globalization.Unicode AddKanaMap (cp, kanaLines [gyo]); fillIndex [0x22]++; + if (cp == 0x30AB) { + // add small 'ka' (before normal one) + AddKanaMap (0x30F5, 1); + kanaOffset++; + } + if (cp == 0x30B1) { + // add small 'ke' (before normal one) + AddKanaMap (0x30F6, 1); + kanaOffset++; + } if (cp == 0x3061) { // add small 'Tsu' (before normal one) AddKanaMap (0x3063, 1); @@ -1624,11 +2934,27 @@ namespace Mono.Globalization.Unicode AddLetterMap ((char) 0x3093, 0x22, 0); AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0); + map [0x3094] = new CharMapEntry (map [0x30A6].Category, + map [0x30A6].Level1, 3);// voiced hiragana U + map [0x30F4] = new CharMapEntry (map [0x30A6].Category, + map [0x30A6].Level1, 3);// voiced katakana U + + map [0x30F5] = new CharMapEntry (map [0x30AB].Category, + map [0x30AB].Level1, 0);// small katakana Ka + map [0x30F6] = new CharMapEntry (map [0x30B1].Category, + map [0x30B1].Level1, 0);// small katakana Ke + // voiced Wa lines + for (int i = 0x30F7; i < 0x30FB; i++) + map [i] = new CharMapEntry (map [i - 8].Category, + map [i - 8].Level1, + 3); + // JIS Japanese square chars. fillIndex [0x22] = 0x97; jisJapanese.Sort (JISComparer.Instance); foreach (JISCharacter j in jisJapanese) - AddCharMap ((char) j.CP, 0x22, 1); + if (0x3300 <= j.CP && j.CP <= 0x3357) + AddCharMap ((char) j.CP, 0x22, 1); // non-JIS Japanese square chars. nonJisJapanese.Sort (NonJISComparer.Instance); foreach (NonJISCharacter j in nonJisJapanese) @@ -1658,14 +2984,19 @@ namespace Mono.Globalization.Unicode map [cp] = new CharMapEntry (0x24, (byte) (map [cp - 1].Level1 + 2), 0); + // FIXME: Syriac NonSpacingMark should go here. 
// Thaana // FIXME: it turned out that it does not look like UCA fillIndex [0x24] = 0x6E; + fillIndex [0x1] = 0xAC; for (int i = 0; i < orderedThaana.Length; i++) { - if (IsIgnorableNonSpacing (i)) - continue; - AddCharMap (orderedThaana [i], 0x24, 2); + char c = orderedThaana [i]; + if (IsIgnorableNonSpacing ((int) c)) + AddCharMap (c, 1, 1); + AddCharMap (c, 0x24, 2); + if (c == '\u0782') // SPECIAL CASE: why? + fillIndex [0x24] += 2; } #endregion @@ -1704,28 +3035,31 @@ namespace Mono.Globalization.Unicode + "<{\u1113 \u1116}, \u3165," + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8," + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >" - + "\u11CA, \u1104, \u11CB > \u1105 >" - + "\u11B0, [\u11CC \u11D0], \u11B1, [\u11D1 \u11D2]," - + "\u11B2, [\u11D3 \u11D5], \u11B3," + + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >" + + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1," + + "[\u11D1 \u11D2], \u11B2," + + "[\u11D3 \u11D5], \u11B3," + "[\u11D6 \u11D7], \u11B4, \u11B5," + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >" - + "[\u11DA \u11E2], \u1107=\u11B8 >" - + "<{\u111E \u1120}, \u3172,, \u3173, " - + "\u11E3, \u1108 >" - + "\u11B9,,,,,,,,, [\u11E4 \u11E6],, \u1109=\u11BA,,," - + "\u3214=\u3274 <>" - + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],," - + "\u11EA,, \u110A=\u11BB,,, >" - + "{\u1134 \u1140}, \u317E,,,,,, \u11EB," - + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >" - + "\u11EE, \u11EC, \u11ED,,,,, \u11F1,, \u11F2,,," - + "\u11EF,,, \u11F0, \u110C=\u11BD,, >" - + "\u110D,, >" + + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >" + + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >" + + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, " + + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178," + + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>" + + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C " + + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >" + + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB," + + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >" + + "<{\u1141 
\u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, " + + "\u11F1,, \u11F2,,," + + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >" + + "<\u114D, \u110D,, >" + "<{\u114E \u1151},, \u110E=\u11BE,, >" + "<{\u1152 \u1155},,, \u110F=\u11BF >" + "\u1110=\u11C0 > \u1111=\u11C1 >" - + "\u11F3, \u11F4, \u1112=\u11C2 >" - + "\u11F9, [\u11F5 \u11F8]" + + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >" + + "<\u1158=\u1159=\u115F, \u3185, \u11F9," + + "[\u11F5 \u11F8]" ; byte hangulCat = 0x52; @@ -1781,6 +3115,40 @@ namespace Mono.Globalization.Unicode } } + // Some Jamo NFKD. + for (int i = 0x3200; i < 0x3300; i++) { + if (IsIgnorable (i) || map [i].Defined) + continue; + int ch = 0; + // w/ bracket + if (decompLength [i] == 4 && + decompValues [decompIndex [i]] == '(') + ch = decompIndex [i] + 1; + // circled + else if (decompLength [i] == 2 && + decompValues [decompIndex [i] + 1] == '\u1161') + ch = decompIndex [i]; + else if (decompLength [i] == 1) + ch = decompIndex [i]; + else + continue; + ch = decompValues [ch]; + if (ch < 0x1100 || 0x1200 < ch && + ch < 0xAC00 || 0xD800 < ch) + continue; + + // SPECIAL CASE ? + int offset = i < 0x3260 ? 1 : 0; + if (0x326E <= i && i <= 0x3273) + offset = 1; + + map [i] = new CharMapEntry (map [ch].Category, + (byte) (map [ch].Level1 + offset), + map [ch].Level2); +// Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]); + } + + #endregion // Letterlike characters and CJK compatibility square @@ -1817,30 +3185,31 @@ namespace Mono.Globalization.Unicode // PrivateUse ... computed. // remaining Surrogate ... computed. 
- #region Special "biggest" area (FF FF) - fillIndex [0xFF] = 0xFF; - char [] specialBiggest = new char [] { - '\u3005', '\u3031', '\u3032', '\u309D', - '\u309E', '\u30FC', '\u30FD', '\u30FE', - '\uFE7C', '\uFE7D', '\uFF70'}; - foreach (char c in specialBiggest) - AddCharMap (c, 0xFF, 0); - #endregion - #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07 // non-alphanumeric ASCII except for: + - < = > ' for (int i = 0x21; i < 0x7F; i++) { + // SPECIAL CASE: 02C6 looks regarded as + // equivalent to '^', which does not conform + // to Unicode standard character database. + if (i == 0x005B) + AddCharMap ('\u2045', 0x7, 0, 0x1C); + if (i == 0x005D) + AddCharMap ('\u2046', 0x7, 0, 0x1C); + if (i == 0x005E) + AddCharMap ('\u02C6', 0x7, 0, 3); + if (i == 0x0060) + AddCharMap ('\u02CB', 0x7, 0, 3); + if (Char.IsLetterOrDigit ((char) i) || "+-<=>'".IndexOf ((char) i) >= 0) continue; // they are not added here. - AddCharMapGroup2 ((char) i, 0x7, 1, 0); + + AddCharMapGroup2 ((char) i, 0x7, 1, 0); // Insert 3001 after ',' and 3002 after '.' if (i == 0x2C) AddCharMapGroup2 ('\u3001', 0x7, 1, 0); - else if (i == 0x2E) { - fillIndex [0x7]--; + else if (i == 0x2E) AddCharMapGroup2 ('\u3002', 0x7, 1, 0); - } else if (i == 0x3A) AddCharMap ('\uFE30', 0x7, 1, 0); } @@ -1851,9 +3220,37 @@ namespace Mono.Globalization.Unicode if (IsIgnorable (i)) continue; + // FIXME: actually those reset should not be + // done but here I put for easy goal. + if (i == 0x05C3) + fillIndex [0x7]++; + if (i == 0x0700) + fillIndex [0x7] = 0xE2; + if (i == 0x2016) + fillIndex [0x7] = 0x77; + if (i == 0x3008) + fillIndex [0x7] = 0x93; + + if (0x02C8 <= i && i <= 0x02CD) + continue; // nonspacing marks + + // SPECIAL CASE: maybe they could be allocated + // dummy NFKD mapping and no special processing + // would be required here. 
+ if (i == 0x00AF) + AddCharMap ('\u02C9', 0x7, 0, 3); + if (i == 0x00B4) + AddCharMap ('\u02CA', 0x7, 0, 3); + if (i == 0x02C7) + AddCharMap ('\u02D8', 0x7, 0, 3); + // SPECIAL CASES: switch (i) { case 0xAB: // 08 + case 0xB7: // 0A + case 0xBB: // 08 + case 0x02B9: // 01 + case 0x02BA: // 01 case 0x2329: // 09 case 0x232A: // 09 continue; @@ -1862,23 +3259,107 @@ namespace Mono.Globalization.Unicode switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.OtherPunctuation: case UnicodeCategory.ClosePunctuation: + case UnicodeCategory.OpenPunctuation: + case UnicodeCategory.ConnectorPunctuation: case UnicodeCategory.InitialQuotePunctuation: case UnicodeCategory.FinalQuotePunctuation: case UnicodeCategory.ModifierSymbol: // SPECIAL CASES: // 0xA - if (0x2020 <= i && i <= 0x2042) + if (0x2020 <= i && i <= 0x2031) + continue; + if (i == 0x3003) // added later continue; - AddCharMapGroup ((char) i, 0x7, 1, 0); + AddCharMapGroup2 ((char) i, 0x7, 1, 0); break; default: - if (i == 0xA6) // SPECIAL CASE. FIXME: why? + if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why? goto case UnicodeCategory.OtherPunctuation; break; } } + + // Control pictures + // FIXME: it should not need to reset level 1, but + // it's for easy goal. + fillIndex [0x7] = 0xB6; + for (int i = 0x2400; i <= 0x2424; i++) + AddCharMap ((char) i, 0x7, 1, 0); + + // FIXME: what are they? + AddCharMap ('\u3003', 0x7, 1); + AddCharMap ('\u3006', 0x7, 1); + AddCharMap ('\u02D0', 0x7, 1); + AddCharMap ('\u10FB', 0x7, 1); + AddCharMap ('\u0950', 0x7, 1); + AddCharMap ('\u093D', 0x7, 1); + AddCharMap ('\u0964', 0x7, 1); + AddCharMap ('\u0965', 0x7, 1); + AddCharMap ('\u0970', 0x7, 1); + + #endregion + + #region category 08 - symbols + fillIndex [0x8] = 2; + // Here Windows mapping is not straightforward. It is + // not based on computation but seems manual sorting. 
+ AddCharMapGroup ('+', 0x8, 1, 0); // plus + AddCharMapGroup ('\u2212', 0x8, 1); // minus + AddCharMapGroup ('\u229D', 0x8, 1); // minus + AddCharMapGroup ('\u2297', 0x8, 1); // mul + AddCharMapGroup ('\u2044', 0x8, 1); // div + AddCharMapGroup ('\u2215', 0x8, 0); // div + AddCharMapGroup ('\u2298', 0x8, 1); // div slash + AddCharMapGroup ('\u2217', 0x8, 0); // mul + AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper + AddCharMapGroup ('\u2218', 0x8, 0); // ring + AddCharMapGroup ('\u229A', 0x8, 1); // ring + AddCharMapGroup ('\u2219', 0x8, 0); // bullet + AddCharMapGroup ('\u2299', 0x8, 1); // dot oper + AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus + AddCharMapGroup ('\u003C', 0x8, 1); // < + AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation + AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation + + for (int cp = 0; cp < 0x2300; cp++) { + if (cp == 0xAC) // SPECIAL CASE: skip + continue; + if (cp == 0x200) { + cp = 0x2200; // skip to 2200 + fillIndex [0x8] = 0x21; + } + if (cp == 0x2295) + fillIndex [0x8] = 0x3; + if (cp == 0x22A2) + fillIndex [0x8] = 0xAB; + if (cp == 0x22B2) + fillIndex [0x8] = 0xB9; + if (!map [cp].Defined && +// Char.GetUnicodeCategory ((char) cp) == +// UnicodeCategory.MathSymbol) + Char.IsSymbol ((char) cp)) + AddCharMapGroup ((char) cp, 0x8, 1); + // SPECIAL CASES: no idea why Windows sorts as such + switch (cp) { + case 0x3E: + AddCharMap ('\u227B', 0x8, 1, 0); + AddCharMap ('\u22B1', 0x8, 1, 0); + break; + case 0xB1: + AddCharMapGroup ('\u00AB', 0x8, 1); + AddCharMapGroup ('\u226A', 0x8, 1); + AddCharMapGroup ('\u00BB', 0x8, 1); + AddCharMapGroup ('\u226B', 0x8, 1); + break; + case 0xF7: + AddCharMap ('\u01C0', 0x8, 1, 0); + AddCharMap ('\u01C1', 0x8, 1, 0); + AddCharMap ('\u01C2', 0x8, 1, 0); + break; + } + } #endregion - // FIXME: for 07 xx we need more love. + #region Hack! 
// Characters w/ diacritical marks (NFKD) for (int i = 0; i <= char.MaxValue; i++) { @@ -1889,7 +3370,7 @@ namespace Mono.Globalization.Unicode int start = decompIndex [i]; int primaryChar = decompValues [start]; - int secondary = 0; + int secondary = diacritical [i]; bool skip = false; int length = decompLength [i]; // special processing for parenthesized ones. @@ -1918,7 +3399,8 @@ namespace Mono.Globalization.Unicode } - #region Level2 adjustment + // Diacritical weight adjustment + // Arabic Hamzah diacritical [0x624] = 0x5; diacritical [0x626] = 0x7; @@ -1928,7 +3410,6 @@ namespace Mono.Globalization.Unicode diacritical [0x649] = 0x5; // 'alif maqs.uurah diacritical [0x64A] = 0x7; // Yaa' - for (int i = 0; i < char.MaxValue; i++) { byte mod = 0; byte cat = map [i].Category; @@ -1938,7 +3419,11 @@ namespace Mono.Globalization.Unicode mod = diacritical [i]; break; case 0x13: // Arabic - if (diacritical [i] == 0) + if (i == 0x0621) + break; // 0 + if (diacritical [i] == 0 && decompLength [i] != 0) + diacritical [i] = map [decompValues [decompIndex [i]]].Level2; + if (diacritical [i] == 0 && i >= 0xFE8D) mod = 0x8; // default for arabic break; } @@ -1948,17 +3433,79 @@ namespace Mono.Globalization.Unicode map [i] = new CharMapEntry ( cat, map [i].Level1, mod); } - #endregion - // FIXME: this is hack but those which are - // NonSpacingMark characters and still undefined - // are likely to be nonspacing. - for (int i = 0; i < char.MaxValue; i++) - if (!map [i].Defined && - !IsIgnorable (i) && - Char.GetUnicodeCategory ((char) i) == + // FIXME: this is halfly hack but those NonSpacingMark + // characters and still undefined are likely to + // be nonspacing. + for (int i = 0; i < char.MaxValue; i++) { + if (map [i].Defined || + IsIgnorable (i)) + continue; + switch (i) { + // SPECIAL CASES. 
+ case 0x02B9: + case 0x02BA: + break; + default: + if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) + continue; + break; + } + if (diacritical [i] != 0) + map [i] = new CharMapEntry (1, 1, diacritical [i]); + else AddCharMap ((char) i, 1, 1); + } + + #endregion + } + + TextInfo ti = CultureInfo.InvariantCulture.TextInfo; + + private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap) + { + if (map [i].Defined) + return; + int up = (int) ti.ToUpper ((char) i); + if (checkUpper && map [up].Category == 0xF) { + if (i == up) + return; + FillLetterNFKD (up, checkUpper, greekRemap); + map [i] = new CharMapEntry (0xF, + map [up].Level1, + map [up].Level2); + } else { + int idx = decompIndex [i]; + if (idx == 0) + return; + int primary = decompValues [decompIndex [i]]; + FillLetterNFKD (primary, checkUpper, greekRemap); + + int lv2 = map [primary].Level2; + byte off = 0; + for (int l = 1; l < decompLength [i]; l++) { + int tmp = decompValues [idx + l]; + if (map [tmp].Category != 1) + return; + if (greekRemap && map [tmp].Level2 == 0xC) + off += 3; + else + off += map [tmp].Level2; + } + if (off > 0) { + if (lv2 == 0) + lv2 += 2; + lv2 += off; + } + // ... but override if the value already exists. + if (diacritical [i] != 0) + lv2 = diacritical [i]; + map [i] = new CharMapEntry ( + map [primary].Category, + map [primary].Level1, + (byte) lv2); + } } private void IncrementSequentialIndex (ref byte hangulCat) @@ -1990,32 +3537,32 @@ namespace Mono.Globalization.Unicode char c = (char) (i + b); byte arg = (byte) (b > 0 ? 
b + 2 : 0); // Hiragana - AddLetterMapCore (c, 0x22, 0, arg); + AddLetterMapCore (c, 0x22, 0, arg, false); // Katakana - AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg); + AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false); } } private void AddLetterMap (char c, byte category, byte updateCount) { - AddLetterMapCore (c, category, updateCount, 0); + AddLetterMapCore (c, category, updateCount, 0, true); } - private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2) + private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2) { char c2; // updates index c2 = ToSmallForm (c); if (c2 != c) - AddCharMapGroup (c2, category, updateCount, level2); + AddCharMapGroup (c2, category, updateCount, level2, deferLevel2); c2 = Char.ToLower (c, CultureInfo.InvariantCulture); if (c2 != c && !map [(int) c2].Defined) - AddLetterMapCore (c2, category, 0, level2); + AddLetterMapCore (c2, category, 0, level2, deferLevel2); bool doUpdate = true; if (IsIgnorable ((int) c) || map [(int) c].Defined) doUpdate = false; else - AddCharMapGroup (c, category, 0, level2); + AddCharMapGroup (c, category, 0, level2, deferLevel2); if (doUpdate) fillIndex [category] += updateCount; } @@ -2036,19 +3583,6 @@ namespace Mono.Globalization.Unicode return true; } - private void AddCharMapGroupTail (char c, byte category, byte updateCount) - { - char c2 = ToSmallFormTail (c); - if (c2 != c) - AddCharMap (c2, category, updateCount, 0); - // itself - AddCharMap (c, category, updateCount, 0); - // - c2 = ToFullWidthTail (c); - if (c2 != c) - AddCharMapGroupTail (c2, category, updateCount); - } - // // Adds characters to table in the order below // (+ increases weight): @@ -2070,11 +3604,24 @@ namespace Mono.Globalization.Unicode DecompositionWide, DecompositionNarrow, }; + private void AddCharMapGroup (char c, byte category, byte updateCount) + { + AddCharMapGroup (c, category, updateCount, 0, true); + } + private void AddCharMapGroup 
(char c, byte category, byte updateCount, byte level2) + { + AddCharMapGroup (c, category, updateCount, level2, false); + } + + private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2) { if (map [(int) c].Defined) return; + if (deferLevel2) + level2 = diacritical [(int) c]; + char small = char.MinValue; char vertical = char.MinValue; Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; @@ -2088,8 +3635,11 @@ namespace Mono.Globalization.Unicode } // updates index - if (small != char.MinValue) - AddCharMap (small, category, updateCount); + if (small != char.MinValue) { + if (level2 == 0 && deferLevel2) + level2 = diacritical [small]; + AddCharMap (small, category, updateCount, level2); + } // itself AddCharMap (c, category, 0, level2); @@ -2097,16 +3647,22 @@ namespace Mono.Globalization.Unicode if (nfkd != null) { foreach (int weight in sameWeightItems) { object wv = nfkd [(byte) weight]; - if (wv != null) + if (wv != null) { + if (deferLevel2) + level2 = diacritical [(int) wv]; AddCharMap ((char) ((int) wv), category, 0, level2); + } } } // update index here. fillIndex [category] += updateCount; - if (vertical != char.MinValue) + if (vertical != char.MinValue) { + if (level2 == 0 && deferLevel2) + level2 = diacritical [vertical]; AddCharMap (vertical, category, updateCount, level2); + } } private void AddCharMapCJK (char c, ref byte category) @@ -2124,6 +3680,10 @@ namespace Mono.Globalization.Unicode AddCharMapCJK (c, ref category); // LAMESPEC: see below. + if (c == '\u5B78') { + AddCharMapCJK ('\u32AB', ref category); + AddCharMapCJK ('\u323B', ref category); + } if (c == '\u52DE') { AddCharMapCJK ('\u3298', ref category); AddCharMapCJK ('\u3238', ref category); @@ -2153,7 +3713,8 @@ namespace Mono.Globalization.Unicode // mix Chinise and Japanese Kanji when // ordering those characters. 
switch (w) { - case 0x32A2: case 0x3298: case 0x3238: case 0x32A9: + case 0x32A2: case 0x3298: case 0x3238: + case 0x32A9: case 0x323B: case 0x32AB: continue; } @@ -2164,23 +3725,44 @@ namespace Mono.Globalization.Unicode // For now it is only for 0x7 category. private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2) { - char small = char.MinValue; - char vertical = char.MinValue; - Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; - if (nfkd != null) { - object smv = nfkd [(byte) DecompositionSmall]; - if (smv != null) - small = (char) ((int) smv); - object vv = nfkd [(byte) DecompositionVertical]; - if (vv != null) - vertical = (char) ((int) vv); + if (map [(int) c].Defined) + return; + + bool updateWeight = false; + // Process in advance (lower primary weight) + for (int c2 = 0; c2 < char.MaxValue; c2++) { + if (!map [c2].Defined && + decompLength [c2] == 1 && + (int) (decompValues [decompIndex [c2]]) == (int) c) { + switch (decompType [c2]) { + case DecompositionSmall: + updateWeight = true; + AddCharMap ((char) c2, category, + 0, level2); + break; + } + } } + if (updateWeight) + fillIndex [category] = (byte) + (fillIndex [category] + updateCount); - // updates index - if (small != char.MinValue) - // SPECIAL CASE excluded (FIXME: why?) - if (small != '\u2024') - AddCharMap (small, category, updateCount); + // Identical weight + for (int c2 = 0; c2 < char.MaxValue; c2++) { + if (!map [c2].Defined && + decompLength [c2] == 1 && + (int) (decompValues [decompIndex [c2]]) == (int) c) { + switch (decompType [c2]) { + case DecompositionSub: + case DecompositionSuper: + case DecompositionWide: + case DecompositionNarrow: + AddCharMap ((char) c2, category, + 0, level2); + break; + } + } + } // itself AddCharMap (c, category, updateCount, level2); @@ -2188,30 +3770,40 @@ namespace Mono.Globalization.Unicode // Since nfkdMap is problematic to have two or more // NFKD to an identical character, here I iterate all. 
for (int c2 = 0; c2 < char.MaxValue; c2++) { - if (decompLength [c2] == 1 && + if (!map [c2].Defined && + decompLength [c2] == 1 && (int) (decompValues [decompIndex [c2]]) == (int) c) { switch (decompType [c2]) { - case DecompositionCompat: + case DecompositionWide: + case DecompositionNarrow: + case DecompositionSmall: + case DecompositionSub: + case DecompositionSuper: + continue; + default: AddCharMap ((char) c2, category, updateCount, level2); break; } } } - - if (vertical != char.MinValue) - // SPECIAL CASE excluded (FIXME: why?) - if (vertical != '\uFE33' && vertical != '\uFE34') - AddCharMap (vertical, category, updateCount, level2); } - char ToFullWidth (char c) + private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2) { - return ToDecomposed (c, DecompositionFull, false); - } + // itself + AddCharMap (c, category, 0, level2); - char ToFullWidthTail (char c) - { - return ToDecomposed (c, DecompositionFull, true); + // Since nfkdMap is problematic to have two or more + // NFKD to an identical character, here I iterate all. 
+ for (int c2 = 0; c2 < char.MaxValue; c2++) { + if (decompLength [c2] == 0) + continue; + int idx = decompIndex [c2] + decompLength [c2] - 1; + if ((int) (decompValues [idx]) == (int) c) + AddCharMap ((char) c2, category, + 0, level2); + } + fillIndex [category] += updateCount; } char ToSmallForm (char c) @@ -2219,11 +3811,6 @@ namespace Mono.Globalization.Unicode return ToDecomposed (c, DecompositionSmall, false); } - char ToSmallFormTail (char c) - { - return ToDecomposed (c, DecompositionSmall, true); - } - char ToDecomposed (char c, byte d, bool tail) { if (decompType [(int) c] != d) @@ -2254,6 +3841,30 @@ namespace Mono.Globalization.Unicode private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value { + // CJK compat + if ('\u3192' <= c && c <= '\u319F') + return 0; + + // They have NFKD mapping, and on Windows + // those narrow characters are regarded as "normal", + // thus those characters themselves are regarded as + // "wide". grep "<narrow>" and you can pick them up + // (ignoring Kana, Hangul etc.) 
+ switch (c) { + case '\u3002': + case '\u300C': + case '\u300D': + case '\u3001': + case '\u30FB': + case '\u2502': + case '\u2190': + case '\u2191': + case '\u2192': + case '\u2193': + case '\u25A0': + case '\u25CB': + return 1; + } // Korean if ('\u11A8' <= c && c <= '\u11F9') return 2; @@ -2261,6 +3872,11 @@ namespace Mono.Globalization.Unicode return 4; if ('\u3130' <= c && c <= '\u3164') return 5; + if ('\u3165' <= c && c <= '\u318E') + return 4; + // Georgian Capital letters + if ('\u10A0' <= c && c <= '\u10C5') + return 0x10; // numbers if ('\u2776' <= c && c <= '\u277F') return 4; @@ -2269,24 +3885,34 @@ namespace Mono.Globalization.Unicode if ('\u2776' <= c && c <= '\u2793') return 0xC; if ('\u2160' <= c && c <= '\u216F') - return 0x18; + return 0x10; if ('\u2181' <= c && c <= '\u2182') - return 0x18; + return 0x10; // Arabic if ('\u2135' <= c && c <= '\u2138') return 4; - if ('\uFE80' <= c && c < '\uFE8E') { + // I believe that Windows has a bug on setting level 3 + // weight here. NFKD results in different values. + if ('\uFE80' < c && c < '\uFF00') { // 2(Isolated)/8(Final)/0x18(Medial) switch (decompType [(int) c]) { case DecompositionIsolated: - return 2; + return 0; // 2; case DecompositionFinal: return 8; case DecompositionMedial: return 0x18; + case DecompositionInitial: + return 0x10; } } + // I have no idea why those symbols have level 3 weight + if (c == '\u2104' || c == '\u212B') + return 0x18; + if ('\u211E' <= c && c <= '\u212B') + return 0x10; + // actually I dunno the reason why they have weights. 
switch (c) { case '\u01BC': @@ -2295,17 +3921,23 @@ namespace Mono.Globalization.Unicode return 0x20; case '\u06AA': return 0x28; + // Gurmukhi + case '\u0A39': + case '\u0A59': + case '\u0A5A': + case '\u0A5B': + case '\u0A5E': + return 0x10; } byte ret = 0; switch (c) { case '\u03C2': - case '\u2104': case '\u212B': - ret |= 8; + ret = 8; break; case '\uFE42': - ret |= 0xC; + ret = 0xA; break; } @@ -2328,6 +3960,20 @@ namespace Mono.Globalization.Unicode #endregion #region IsIgnorable +/* + static bool IsIgnorable (int i) + { + if (unicodeAge [i] >= 3.1) + return true; + switch (char.GetUnicodeCategory ((char) i)) { + case UnicodeCategory.OtherNotAssigned: + case UnicodeCategory.Format: + return true; + } + return false; + } +*/ + // FIXME: In the future use DerivedAge.txt to examine character // versions and set those ones that have higher version than // 1.0 as ignorable. @@ -2362,6 +4008,7 @@ namespace Mono.Globalization.Unicode // those ranges. case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9: + case 0x70F: case 0x3036: case 0x303f: case 0x337b: case 0xfb1e: return false; @@ -2720,7 +4367,7 @@ namespace Mono.Globalization.Unicode { JISCharacter j1 = (JISCharacter) o1; JISCharacter j2 = (JISCharacter) o2; - return j2.JIS - j1.JIS; + return j1.JIS - j2.JIS; } } @@ -2831,4 +4478,136 @@ namespace Mono.Globalization.Unicode return l1 - l2; } } + + class Tailoring + { + int lcid; + int alias; + bool frenchSort; + ArrayList items = new ArrayList (); + + public Tailoring (int lcid) + : this (lcid, 0) + { + } + + public Tailoring (int lcid, int alias) + { + this.lcid = lcid; + this.alias = alias; + } + + public int LCID { + get { return lcid; } + } + + public int Alias { + get { return alias; } + } + + public bool FrenchSort { + get { return frenchSort; } + set { frenchSort = value; } + } + + public void AddDiacriticalMap (byte target, byte replace) + { + items.Add (new DiacriticalMap (target, replace)); + } + + public void AddSortKeyMap (string source, byte [] sortkey) 
+ { + items.Add (new SortKeyMap (source, sortkey)); + } + + public void AddReplacementMap (string source, string replace) + { + items.Add (new ReplacementMap (source, replace)); + } + + public char [] ItemToCharArray () + { + ArrayList al = new ArrayList (); + foreach (ITailoringMap m in items) + al.AddRange (m.ToCharArray ()); + return al.ToArray (typeof (char)) as char []; + } + + interface ITailoringMap + { + char [] ToCharArray (); + } + + class DiacriticalMap : ITailoringMap + { + public readonly byte Target; + public readonly byte Replace; + + public DiacriticalMap (byte target, byte replace) + { + Target = target; + Replace = replace; + } + + public char [] ToCharArray () + { + char [] ret = new char [3]; + ret [0] = (char) 02; // kind:DiacriticalMap + ret [1] = (char) Target; + ret [2] = (char) Replace; + return ret; + } + } + + class SortKeyMap : ITailoringMap + { + public readonly string Source; + public readonly byte [] SortKey; + + public SortKeyMap (string source, byte [] sortkey) + { + Source = source; + SortKey = sortkey; + } + + public char [] ToCharArray () + { + char [] ret = new char [Source.Length + 7]; + ret [0] = (char) 01; // kind:SortKeyMap + for (int i = 0; i < Source.Length; i++) + ret [i + 1] = Source [i]; + // null terminate + for (int i = 0; i < 4; i++) + ret [i + Source.Length + 2] = (char) SortKey [i]; + return ret; + } + } + + class ReplacementMap : ITailoringMap + { + public readonly string Source; + public readonly string Replace; + + public ReplacementMap (string source, string replace) + { + Source = source; + Replace = replace; + } + + public char [] ToCharArray () + { + char [] ret = new char [Source.Length + Replace.Length + 3]; + ret [0] = (char) 03; // kind:ReplaceMap + int pos = 1; + for (int i = 0; i < Source.Length; i++) + ret [pos++] = Source [i]; + // null terminate + pos++; + for (int i = 0; i < Replace.Length; i++) + ret [pos++] = Replace [i]; + // null terminate + return ret; + } + } + } }