X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mcs%2Fclass%2Fcorlib%2FMono.Globalization.Unicode%2Fcreate-mscompat-collation-table.cs;h=c54b9ab8eb936c46967e4952531edbb3207b16a5;hb=ff228e1c801bda9666b6edab3ee962e05edcf480;hp=f6d20851627e7e2c96e1ad81400db9ea61921af8;hpb=967ebd04adb87eaf73970fb297322820abe5393e;p=mono.git diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs index f6d20851627..c54b9ab8eb9 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs @@ -24,7 +24,6 @@ // If there are characters whose primary weight is 0, they are consumed // and considered as a part of the character element. // -#define Binary using System; using System.IO; @@ -96,25 +95,19 @@ namespace Mono.Globalization.Unicode byte [] diacritical = new byte [char.MaxValue + 1]; string [] diacritics = new string [] { - // LATIN, CYRILLIC etc. - "UPTURN", "DOUBLE-STRUCK", - "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", - "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;", - "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;", + // LATIN + "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;", "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;", - " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;", - "WITH OGONEK;", "WITH CEDILLA;", - // + " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;", + " OGONEK;", " CEDILLA;", " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;", - "WITH STROKE;", " CIRCUMFLEX AND ACUTE;", - "STROKE OVERLAY", + " STROKE;", " CIRCUMFLEX AND ACUTE;", " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;", " DIAERESIS AND GRAVE;", " BREVE AND ACUTE;", " CARON AND DOT ABOVE;", " BREVE AND GRAVE;", " MACRON AND ACUTE;", " MACRON AND GRAVE;", - // " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE", " RING ABOVE AND ACUTE", " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS", @@ -124,59 +117,45 @@ namespace Mono.Globalization.Unicode " BREVE AND TILDE", " CEDILLA AND BREVE", " OGONEK AND MACRON", - // - "WITH OVERLINE", - "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;", - " DOUBLE GRAVE", + " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;", + " DOUBLE GRAVE;", " INVERTED BREVE", - "ROMAN NUMERAL", " PRECEDED BY APOSTROPHE", - "WITH HORN;", + " HORN;", " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE", " PALATAL HOOK", " DOT BELOW;", " RETROFLEX;", "DIAERESIS BELOW", " RING BELOW", - // " CIRCUMFLEX BELOW", "HORN AND ACUTE", " BREVE BELOW;", " HORN AND GRAVE", " TILDE BELOW", - " TOPBAR", " DOT BELOW AND DOT ABOVE", " RIGHT HALF RING", " HORN AND TILDE", " CIRCUMFLEX AND DOT BELOW", " BREVE AND DOT BELOW", " DOT BELOW AND MACRON", - " TONE TWO", " HORN AND HOOK ABOVE", " HORN AND DOT", // CIRCLED, PARENTHESIZED and so on - "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", - "CIRCLED KATAKANA", "CIRCLED SANS-SERIF", + "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA", "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN", }; byte [] diacriticWeights = new byte [] { // LATIN. - 3, 3, 5, 5, - 0xF, 0xE, 0x12, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C, - // - 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, + 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x20, 0x21, 0x22, 0x22, 0x23, 0x24, - // 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30, - // - 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48, + 0x43, 0x43, 0x43, 0x44, 0x46, 0x48, 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A, - // - 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68, + 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x69, 0x69, 0x6A, 0x6D, 0x6E, - 0x87, 0x95, 0xAA, + 0x95, 0xAA, // CIRCLED, PARENTHESIZED and so on. - 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, - 0xF3, 0xF3, 0xF3 + 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3 }; int [] numberSecondaryWeightBounds = new int [] { @@ -187,6 +166,7 @@ namespace Mono.Globalization.Unicode 0xE50, 0xE60, 0xED0, 0xEE0 }; + char [] orderedCyrillic; char [] orderedGurmukhi; char [] orderedGujarati; char [] orderedGeorgian; @@ -196,11 +176,11 @@ namespace Mono.Globalization.Unicode // based on traditional Tamil consonants, except for // Grantha (where Microsoft breaks traditionalism). // http://www.angelfire.com/empire/thamizh/padanGaL - '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', - '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', - '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', - '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', - '\u0BB7', '\u0BB9'}; + '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3', + '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF', + '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3', + '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7', + '\u0BB9'}; // cp -> character name (only for some characters) ArrayList sortableCharNames = new ArrayList (); @@ -226,11 +206,11 @@ namespace Mono.Globalization.Unicode ArrayList jisJapanese = new ArrayList (); ArrayList nonJisJapanese = new ArrayList (); - ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00]; - ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100]; - ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00]; - ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00]; - byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00]; + ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00]; + ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100]; + ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00]; + ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00]; + byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00]; byte [] ignorableFlags = new byte [char.MaxValue + 1]; @@ -272,12 +252,6 @@ sw.Close (); source, typeof (byte), i); } - ushort [] CompressArray (ushort [] source, CodePointIndexer i) - { - return (ushort []) CodePointIndexer.CompressArray ( - source, typeof (ushort), i); - } - void Serialize () { // Tailorings @@ -287,24 +261,19 @@ sw.Close (); byte [] level1 = new byte [map.Length]; byte [] level2 = new byte [map.Length]; byte [] level3 = new byte [map.Length]; - ushort [] widthCompat = new ushort [map.Length]; + int [] widthCompat = new int [map.Length]; for (int i = 0; i < map.Length; i++) { categories [i] = map [i].Category; level1 [i] = map [i].Level1; level2 [i] = map [i].Level2; level3 [i] = ComputeLevel3Weight ((char) i); - // For Japanese Half-width characters, don't - // map widthCompat. It is IgnoreKanaType that - // handles those width differences. - if (0xFF6D <= i && i <= 0xFF9D) - continue; switch (decompType [i]) { case DecompositionNarrow: case DecompositionWide: case DecompositionSuper: case DecompositionSub: // they are always 1 char - widthCompat [i] = (ushort) decompValues [decompIndex [i]]; + widthCompat [i] = decompValues [decompIndex [i]]; break; } } @@ -320,36 +289,18 @@ sw.Close (); MSCompatUnicodeTableUtil.Level2); level3 = CompressArray (level3, MSCompatUnicodeTableUtil.Level3); - widthCompat = (ushort []) CodePointIndexer.CompressArray ( - widthCompat, typeof (ushort), + widthCompat = (int []) CodePointIndexer.CompressArray ( + widthCompat, typeof (int), MSCompatUnicodeTableUtil.WidthCompat); - cjkCHS = CompressArray (cjkCHS, - MSCompatUnicodeTableUtil.CjkCHS); - cjkCHT = CompressArray (cjkCHT, - MSCompatUnicodeTableUtil.Cjk); - cjkJA = CompressArray (cjkJA, - MSCompatUnicodeTableUtil.Cjk); - cjkKO = CompressArray (cjkKO, - MSCompatUnicodeTableUtil.Cjk); - cjkKOlv2 = CompressArray (cjkKOlv2, - MSCompatUnicodeTableUtil.Cjk); // Ignorables - Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {"); -#if Binary - MemoryStream ms = new MemoryStream (); - BinaryWriter binary = new BinaryWriter (ms); - binary.Write (ignorableFlags.Length); -#endif + Result.WriteLine ("static byte [] ignorableFlags = new byte [] {"); for (int i = 0; i < ignorableFlags.Length; i++) { byte value = ignorableFlags [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); -#if Binary - binary.Write (value); -#endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } @@ -357,19 +308,13 @@ sw.Close (); Result.WriteLine (); // Primary category - Result.WriteLine ("internal static readonly byte [] categories = new byte [] {"); -#if Binary - binary.Write (categories.Length); -#endif + Result.WriteLine ("static byte [] categories = new byte [] {"); for (int i = 0; i < categories.Length; i++) { byte value = categories [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); -#if Binary - binary.Write (value); -#endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } @@ -377,19 +322,13 @@ sw.Close (); Result.WriteLine (); // Primary weight value - Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {"); -#if Binary - binary.Write (level1.Length); -#endif + Result.WriteLine ("static byte [] level1 = new byte [] {"); for (int i = 0; i < level1.Length; i++) { byte value = level1 [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); -#if Binary - binary.Write (value); -#endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } @@ -397,19 +336,13 @@ sw.Close (); Result.WriteLine (); // Secondary weight - Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {"); -#if Binary - binary.Write (level2.Length); -#endif + Result.WriteLine ("static byte [] level2 = new byte [] {"); for (int i = 0; i < level2.Length; i++) { - byte value = level2 [i]; + int value = level2 [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); -#if Binary - binary.Write (value); -#endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } @@ -417,19 +350,13 @@ sw.Close (); Result.WriteLine (); // Thirtiary weight - Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {"); -#if Binary - binary.Write (level3.Length); -#endif + Result.WriteLine ("static byte [] level3 = new byte [] {"); for (int i = 0; i < level3.Length; i++) { byte value = level3 [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); -#if Binary - binary.Write (value); -#endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } @@ -439,30 +366,18 @@ sw.Close (); // Width insensitivity mappings // (for now it is more lightweight than dumping the // entire NFKD table). - Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {"); -#if Binary - binary.Write (widthCompat.Length); -#endif + Result.WriteLine ("static int [] widthCompat = new int [] {"); for (int i = 0; i < widthCompat.Length; i++) { - ushort value = widthCompat [i]; + int value = widthCompat [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); -#if Binary - binary.Write (value); -#endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); -#if Binary - using (FileStream fs = File.Create ("../collation.core.bin")) { - byte [] array = ms.ToArray (); - fs.Write (array, 0, array.Length); - } -#endif // CJK SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue); @@ -474,13 +389,8 @@ sw.Close (); void SerializeCJK (string name, ushort [] cjk, int max) { - int offset = 0;//char.MaxValue - cjk.Length; + int offset = char.MaxValue - cjk.Length; Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name); -#if Binary - MemoryStream ms = new MemoryStream (); - BinaryWriter binary = new BinaryWriter (ms); - binary.Write (cjk.Length); -#endif for (int i = 0; i < cjk.Length; i++) { if (i + offset == max) break; @@ -489,30 +399,17 @@ sw.Close (); Result.Write ("{0},", value); else Result.Write ("0x{0:X04},", value); -#if Binary - binary.Write (value); -#endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF + offset); } Result.WriteLine ("};"); Result.WriteLine (); -#if Binary - using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) { - byte [] array = ms.ToArray (); - fs.Write (array, 0, array.Length); - } -#endif } void SerializeCJK (string name, byte [] cjk, int max) { - int offset = 0;//char.MaxValue - cjk.Length; + int offset = char.MaxValue - cjk.Length; Result.WriteLine ("static byte [] {0} = new byte [] {{", name); -#if Binary - MemoryStream ms = new MemoryStream (); - BinaryWriter binary = new BinaryWriter (ms); -#endif for (int i = 0; i < cjk.Length; i++) { if (i + offset == max) break; @@ -521,20 +418,11 @@ sw.Close (); Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); -#if Binary - binary.Write (value); -#endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF + offset); } Result.WriteLine ("};"); Result.WriteLine (); -#if Binary - using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) { - byte [] array = ms.ToArray (); - fs.Write (array, 0, array.Length); - } -#endif } void SerializeTailorings () @@ -543,10 +431,6 @@ sw.Close (); Hashtable counts = new Hashtable (); Result.WriteLine ("static char [] tailorings = new char [] {"); int count = 0; -#if Binary - MemoryStream ms = new MemoryStream (); - BinaryWriter binary = new BinaryWriter (ms); -#endif foreach (Tailoring t in tailorings) { if (t.Alias != 0) continue; @@ -558,24 +442,15 @@ sw.Close (); Result.Write ("'\\x{0:X}', ", (int) c); if (++count % 16 == 0) Result.WriteLine (" // {0:X04}", count - 16); -#if Binary - binary.Write ((ushort) c); -#endif } } Result.WriteLine ("};"); Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {"); -#if Binary - byte [] rawdata = ms.ToArray (); - ms = new MemoryStream (); - binary = new BinaryWriter (ms); - binary.Write (tailorings.Count); -#endif foreach (Tailoring t in tailorings) { int target = t.Alias != 0 ? t.Alias : t.LCID; if (!indexes.ContainsKey (target)) { - throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias)); + Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias); continue; } int idx = (int) indexes [target]; @@ -586,26 +461,8 @@ sw.Close (); if (t2.LCID == t.LCID) french = t2.FrenchSort; Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false"); -#if Binary - binary.Write (t.LCID); - binary.Write (idx); - binary.Write (cnt); - binary.Write (french); -#endif } Result.WriteLine ("};"); -#if Binary - binary.Write ((byte) 0xFF); - binary.Write ((byte) 0xFF); - binary.Write (rawdata.Length / 2); - binary.Write (rawdata, 0, rawdata.Length); - - - using (FileStream fs = File.Create ("../collation.tailoring.bin")) { - byte [] array = ms.ToArray (); - fs.Write (array, 0, array.Length); - } -#endif } #region Parse @@ -632,7 +489,6 @@ sw.Close (); ParseJISOrder (cp932); // in prior to ParseUnidata() ParseUnidata (unidata); - ModifyUnidata (); ParseDerivedCoreProperties (derivedCoreProps); ParseScripts (scripts); ParseCJK (chXML, jaXML, koXML); @@ -711,8 +567,8 @@ sw.Close (); if (idx > 0) { string source = s.Substring (0, idx).Trim (); string [] l = s.Substring (idx + 1).Trim ().Split (' '); - byte [] b = new byte [4]; - for (int i = 0; i < 4; i++) { + byte [] b = new byte [5]; + for (int i = 0; i < 5; i++) { if (l [i] == "*") b [i] = 0; else @@ -756,9 +612,8 @@ sw.Close (); if (cp > char.MaxValue) continue; - double v = double.Parse (value); for (int i = cp; i <= cpEnd; i++) - unicodeAge [i] = v; + unicodeAge [i] = double.Parse (value); } } unicodeAge [0] = double.MaxValue; // never be supported @@ -781,10 +636,7 @@ sw.Close (); this.decompValues = (int []) decompValues.ToArray (typeof (int)); } - - char previousLatinTarget = char.MinValue; - byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; - + void ProcessUnidataLine (string s, ArrayList decompValues) { int idx = s.IndexOf ('#'); @@ -804,98 +656,31 @@ sw.Close (); string name = values [0]; - // SPECIAL CASE: rename some characters for diacritical - // remapping. FIXME: why are they different? - // FIXME: it's still not working. - if (cp == 0x018B || cp == 0x018C) - name = name.Replace ("TOPBAR", "STROKE"); - // isSmallCapital if (s.IndexOf ("SMALL CAPITAL") > 0) isSmallCapital [cp] = true; // latin mapping by character name - if (s.IndexOf ("LATIN") >= 0) { + if (s.IndexOf ("LATIN") > 0) { int lidx = s.IndexOf ("LETTER DOTLESS "); int offset = lidx + 15; if (lidx < 0) { lidx = s.IndexOf ("LETTER TURNED "); offset = lidx + 14; } - if (lidx < 0) { - lidx = s.IndexOf ("LETTER CAPITAL "); - offset = lidx + 15; - } - if (lidx < 0) { - lidx = s.IndexOf ("LETTER SCRIPT "); - offset = lidx + 14; - } if (lidx < 0) { lidx = s.IndexOf ("LETTER "); offset = lidx + 7; } char c = lidx > 0 ? s [offset] : char.MinValue; - char n = s [offset + 1]; - char target = char.MinValue; if ('A' <= c && c <= 'Z' && - (n == ' ') || n == ';') { - target = c; - // FIXME: After 'Z', I cannot reset this state. - previousLatinTarget = c == 'Z' ? char.MinValue : c; - } - - if (s.Substring (offset).StartsWith ("ALPHA")) - target = 'A'; - else if (s.Substring (offset).StartsWith ("TONE SIX")) - target = 'B'; - else if (s.Substring (offset).StartsWith ("OPEN O")) - target = 'C'; - else if (s.Substring (offset).StartsWith ("SCHWA")) - target = 'E'; - else if (s.Substring (offset).StartsWith ("ENG")) - target = 'N'; - else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3 - target = 'O'; - else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3 - target = 'R'; - else if (s.Substring (offset).StartsWith ("TONE TWO")) - target = 'S'; - else if (s.Substring (offset).StartsWith ("ESH")) - target = 'S'; - - // For remaining IPA chars, direct mapping is - // much faster. - switch (cp) { - case 0x0299: target = 'B'; break; - case 0x029A: target = 'E'; break; - case 0x029B: target = 'G'; break; - case 0x029C: target = 'H'; break; - case 0x029D: target = 'J'; break; - case 0x029E: target = 'K'; break; - case 0x029F: target = 'L'; break; - case 0x02A0: target = 'Q'; break; - case 0x02A7: target = 'T'; break; - case 0x02A8: target = 'T'; break; - } - - if (target == char.MinValue) - target = previousLatinTarget; - - if (target != char.MinValue) { - ArrayList entry = (ArrayList) latinMap [target]; + (s.Length == offset + 1 || s [offset + 1] == ' ')) { + ArrayList entry = (ArrayList) latinMap [c]; if (entry == null) { entry = new ArrayList (); - latinMap [target] = entry; + latinMap [c] = entry; } entry.Add (cp); - // FIXME: This secondary weight is hack. - // They are here because they must not - // be identical to the corresponding - // ASCII latins. - if (c != target && diacritical [cp] == 0) { - diacriticalOffset [c - 'A']++; - diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C); - } } } @@ -940,7 +725,7 @@ sw.Close (); } // Box names - if (0x2500 <= cp && cp < 0x2600) { + if (0x2500 <= cp && cp < 0x25B0) { int value = 0; // flags: // up:1 down:2 right:4 left:8 vert:16 horiz:32 @@ -965,93 +750,42 @@ sw.Close (); 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14}; - if (s.IndexOf ("BOX DRAWINGS ") >= 0) { + if (s.IndexOf ("BOX DRAWINGS ") > 0) { int flag = 0; - if (s.IndexOf (" UP") >= 0) + if (s.IndexOf (" UP") > 0) flag |= 1; - if (s.IndexOf (" DOWN") >= 0) + if (s.IndexOf (" DOWN") > 0) flag |= 2; - if (s.IndexOf (" RIGHT") >= 0) + if (s.IndexOf (" RIGHT") > 0) flag |= 4; - if (s.IndexOf (" LEFT") >= 0) + if (s.IndexOf (" LEFT") > 0) flag |= 8; - if (s.IndexOf (" VERTICAL") >= 0) + if (s.IndexOf (" VERTICAL") > 0) flag |= 16; - if (s.IndexOf (" HORIZONTAL") >= 0) + if (s.IndexOf (" HORIZONTAL") > 0) flag |= 32; int fidx = flags.IndexOf (flag); value = fidx < 0 ? fidx : offsets [fidx]; - } else if (s.IndexOf ("BLOCK") >= 0) { - if (s.IndexOf ("ONE EIGHTH") >= 0) + } else if (s.IndexOf ("BLOCK") > 0) { + if (s.IndexOf ("ONE EIGHTH") > 0) value = 0x12; - else if (s.IndexOf ("ONE QUARTER") >= 0) + else if (s.IndexOf ("ONE QUARTER") > 0) value = 0x13; - else if (s.IndexOf ("THREE EIGHTHS") >= 0) + else if (s.IndexOf ("THREE EIGHTHS") > 0) value = 0x14; - else if (s.IndexOf ("HALF") >= 0) + else if (s.IndexOf ("HALF") > 0) value = 0x15; - else if (s.IndexOf ("FIVE EIGHTHS") >= 0) + else if (s.IndexOf ("FIVE EIGHTHS") > 0) value = 0x16; - else if (s.IndexOf ("THREE QUARTERS") >= 0) + else if (s.IndexOf ("THREE QUARTERS") > 0) value = 0x17; - else if (s.IndexOf ("SEVEN EIGHTHS") >= 0) + else if (s.IndexOf ("SEVEN EIGHTHS") > 0) value = 0x18; else value = 0x19; } - else if (s.IndexOf ("SHADE") >= 0) - value = 0x19; - else if (s.IndexOf ("SQUARE") >= 0) - value = 0xBC - 0xE5; - else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0) - value = 0xBE - 0xE5; - else if (s.IndexOf ("RECTANGLE") >= 0) - value = 0xBD - 0xE5; - else if (s.IndexOf ("PARALLELOGRAM") >= 0) - value = 0xBF - 0xE5; - else if (s.IndexOf ("TRIANGLE") >= 0) { - if (s.IndexOf ("UP-POINTING") >= 0) - value = 0xC0 - 0xE5; - else if (s.IndexOf ("RIGHT-POINTING") >= 0) - value = 0xC1 - 0xE5; - else if (s.IndexOf ("DOWN-POINTING") >= 0) - value = 0xC2 - 0xE5; - else if (s.IndexOf ("LEFT-POINTING") >= 0) - value = 0xC3 - 0xE5; - } - else if (s.IndexOf ("POINTER") >= 0) { - if (s.IndexOf ("RIGHT-POINTING") >= 0) - value = 0xC4 - 0xE5; - else if (s.IndexOf ("LEFT-POINTING") >= 0) - value = 0xC5 - 0xE5; - } - else if (s.IndexOf ("DIAMOND") >= 0) - value = 0xC6 - 0xE5; - else if (s.IndexOf ("FISHEYE") >= 0) - value = 0xC7 - 0xE5; - else if (s.IndexOf ("LOZENGE") >= 0) - value = 0xC8 - 0xE5; - else if (s.IndexOf ("BULLSEYE") >= 0) - value = 0xC9 - 0xE5; - else if (s.IndexOf ("CIRCLE") >= 0) { - if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE") - value = 0xCA - 0xE5; - else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE") - value = 0xCB - 0xE5; - else - value = 0xC9 - 0xE5; - } - if (0x25DA <= cp && cp <= 0x25E5) - value = 0xCD + cp - 0x25DA - 0xE5; - - // SPECIAL CASE: BOX DRAWING DIAGONAL patterns - switch (cp) { - case 0x2571: value = 0xF; break; - case 0x2572: value = 0x10; break; - case 0x2573: value = 0x11; break; - } - if (value != 0) + if (value >= 0) boxValues.Add (new DictionaryEntry ( cp, value)); } @@ -1061,42 +795,15 @@ sw.Close (); if (0x2100 <= cp && cp <= 0x213F && Char.IsSymbol ((char) cp)) sortableCharNames.Add ( - new DictionaryEntry (cp, name)); + new DictionaryEntry (cp, values [0])); else if (0x3380 <= cp && cp <= 0x33DD) sortableCharNames.Add (new DictionaryEntry ( - cp, name.Substring (7))); - - if (Char.GetUnicodeCategory ((char) cp) == - UnicodeCategory.MathSymbol) { - if (name.StartsWith ("CIRCLED ")) - diacritical [cp] = 0xEE; - if (name.StartsWith ("SQUARED ")) - diacritical [cp] = 0xEF; - } + cp, values [0].Substring (7))); // diacritical weights by character name -if (diacritics.Length != diacriticWeights.Length) -throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length)); - for (int d = 0; d < diacritics.Length; d++) { - if (s.IndexOf (diacritics [d]) > 0) { - diacritical [cp] += diacriticWeights [d]; - if (s.IndexOf ("COMBINING") >= 0) - diacritical [cp] -= (byte) 2; - continue; - } - // also process "COMBINING blah" here - // For now it is limited to cp < 0x0370 -// if (cp < 0x0300 || cp >= 0x0370) -// continue; - string tmp = diacritics [d].TrimEnd (';'); - if (tmp.IndexOf ("WITH ") == 0) - tmp = tmp.Substring (4); - tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp); - if (name == tmp) - diacritical [cp] = (byte) (diacriticWeights [d] - 2); -//if (name == tmp) -//Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp); - } + for (int d = 0; d < diacritics.Length; d++) + if (s.IndexOf (diacritics [d]) > 0) + diacritical [cp] |= diacriticWeights [d]; // Two-step grep required for it. if (s.IndexOf ("FULL STOP") > 0 && (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0)) @@ -1126,8 +833,8 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la (cp == 0x0640) ? // 0x0640 is special: it does // not start with ARABIC LETTER - name : - name.Substring (14); + values [0] : + values [0].Substring (14); int tmpIdx = letterName.IndexOf (' '); letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx); //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName); @@ -1143,7 +850,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Japanese square letter if (0x3300 <= cp && cp <= 0x3357) if (!ExistsJIS (cp)) - nonJisJapanese.Add (new NonJISCharacter (cp, name)); + nonJisJapanese.Add (new NonJISCharacter (cp, values [0])); // normalizationType string decomp = values [4]; @@ -1317,6 +1024,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la void ParseScripts (string filename) { + ArrayList cyrillic = new ArrayList (); ArrayList gurmukhi = new ArrayList (); ArrayList gujarati = new ArrayList (); ArrayList georgian = new ArrayList (); @@ -1346,6 +1054,11 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la continue; switch (value) { + case "Cyrillic": + for (int x = cp; x <= cpEnd; x++) + if (!IsIgnorable (x)) + cyrillic.Add ((char) x); + break; case "Gurmukhi": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) @@ -1369,10 +1082,12 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } } + cyrillic.Sort (UCAComparer.Instance); gurmukhi.Sort (UCAComparer.Instance); gujarati.Sort (UCAComparer.Instance); georgian.Sort (UCAComparer.Instance); thaana.Sort (UCAComparer.Instance); + orderedCyrillic = (char []) cyrillic.ToArray (typeof (char)); orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char)); orderedGujarati = (char []) gujarati.ToArray (typeof (char)); orderedGeorgian = (char []) georgian.ToArray (typeof (char)); @@ -1381,37 +1096,26 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la void ParseJISOrder (string filename) { - int line = 1; - try { - using (StreamReader file = - new StreamReader (filename)) { - for (;file.Peek () >= 0; line++) - ProcessJISOrderLine (file.ReadLine ()); + using (StreamReader file = + new StreamReader (filename)) { + while (file.Peek () >= 0) { + string s = file.ReadLine (); + int idx = s.IndexOf ('#'); + if (idx >= 0) + s = s.Substring (0, idx).Trim (); + if (s.Length == 0) + continue; + idx = s.IndexOf (' '); + if (idx < 0) + continue; + // They start with "0x" so cut them out. + int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber); + int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber); + jisJapanese.Add (new JISCharacter (cp, jis)); } - } catch (Exception) { - Console.Error.WriteLine ("---- line {0}", line); - throw; } } - char [] ws = new char [] {'\t', ' '}; - - void ProcessJISOrderLine (string s) - { - int idx = s.IndexOf ('#'); - if (idx >= 0) - s = s.Substring (0, idx).Trim (); - if (s.Length == 0) - return; - idx = s.IndexOfAny (ws); - if (idx < 0) - return; - // They start with "0x" so cut them out. - int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber); - int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber); - jisJapanese.Add (new JISCharacter (cp, jis)); - } - void ParseCJK (string zhXML, string jaXML, string koXML) { XmlDocument doc = new XmlDocument (); @@ -1425,7 +1129,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Chinese Simplified category = "chs"; arr = cjkCHS; - offset = 0;//char.MaxValue - arr.Length; + offset = char.MaxValue - arr.Length; doc.Load (zhXML); s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText; v = 0x8008; @@ -1442,7 +1146,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Chinese Traditional category = "cht"; arr = cjkCHT; - offset = 0;//char.MaxValue - arr.Length; + offset = char.MaxValue - arr.Length; s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText; v = 0x8002; foreach (char c in s) { @@ -1458,56 +1162,17 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Japanese category = "ja"; arr = cjkJA; - offset = 0;//char.MaxValue - arr.Length; - - // SPECIAL CASES - arr [0x4EDD] = 0x8002; // Chinese repetition mark? - arr [0x337B] = 0x8004; // Those 4 characters are Gengou - arr [0x337E] = 0x8005; - arr [0x337D] = 0x8006; - arr [0x337C] = 0x8007; - + offset = char.MaxValue - arr.Length; + doc.Load (jaXML); + s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText; v = 0x8008; - foreach (JISCharacter jc in jisJapanese) { - if (jc.JIS < 0x8800) - continue; - char c = (char) jc.CP; - + foreach (char c in s) { if (c < '\u4E00') - // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); - continue; + Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; - - // SPECIAL CASES: - if (c == '\u662D') // U+337C - continue; - if (c == '\u5927') // U+337D - continue; - if (c == '\u5E73') // U+337B - continue; - if (c == '\u660E') // U+337E - continue; - if (c == '\u9686') // U+F9DC - continue; - - // FIXME: there are still remaining - // characters after U+FA0C. -// for (int k = 0; k < char.MaxValue; k++) { - for (int k = 0; k < '\uFA0D'; k++) { - if (decompIndex [k] == 0 || IsIgnorable (k)) - continue; - if (decompValues [decompIndex [k]] == c /*&& - decompLength [k] == 1*/ || - decompLength [k] == 3 && - decompValues [decompIndex [k] + 1] == c) { - arr [k - offset] = (ushort) v++; - if (v % 256 == 0) - v += 2; - } - } } } @@ -1523,7 +1188,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // category = "ko"; arr = cjkKO; - offset = 0;//char.MaxValue - arr.Length; + offset = char.MaxValue - arr.Length; doc.Load (koXML); foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) { XmlElement sc = (XmlElement) reset.NextSibling; @@ -1563,42 +1228,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } - void ModifyUnidata () - { - // Modify some decomposition equivalence - decompType [0xFE31] = 0; - decompIndex [0xFE31] = 0; - decompLength [0xFE31] = 0; - decompType [0xFE32] = 0; - decompIndex [0xFE32] = 0; - decompLength [0xFE32] = 0; - - // Korean parens numbers - for (int i = 0x3200; i <= 0x321C; i++) - diacritical [i] = 0xA; - for (int i = 0x3260; i <= 0x327B; i++) - diacritical [i] = 0xC; - - // LAMESPEC: these remapping should not be done. - // Windows have incorrect CJK compat mappings. - decompValues [decompIndex [0x32A9]] = 0x91AB; - decompLength [0x323B] = 1; - decompValues [decompIndex [0x323B]] = 0x5B78; - decompValues [decompIndex [0x32AB]] = 0x5B78; - decompValues [decompIndex [0x32A2]] = 0x5BEB; - decompLength [0x3238] = 1; - decompValues [decompIndex [0x3238]] = 0x52DE; - decompValues [decompIndex [0x3298]] = 0x52DE; - - // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things) - decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty) - decompValues [decompIndex [0xFA0C]] = 0x5140; - decompLength [0xFA0C] = 1; - decompIndex [0xF929] = decompLength [0xF929] = 0; - - decompValues [decompIndex [0xF92C]] = 0x90DE; - } - void ModifyParsedValues () { // number, secondary weights @@ -1609,6 +1238,12 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (Char.IsNumber ((char) cp)) diacritical [cp] = weight; + // Korean parens numbers + for (int i = 0x3200; i <= 0x321C; i++) + diacritical [i] = 0xA; + for (int i = 0x3260; i <= 0x327B; i++) + diacritical [i] = 0xC; + // Update name part of named characters for (int i = 0; i < sortableCharNames.Count; i++) { DictionaryEntry de = @@ -1675,26 +1310,16 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Hyphen/Dash : 06 81 - 06 90 for (int i = 0; i < char.MaxValue; i++) { - if (!IsIgnorable (i) && - Char.GetUnicodeCategory ((char) i) == - UnicodeCategory.DashPunctuation) { - AddCharMapGroup2 ((char) i, 6, 1, 0); - if (i == 0x2011) { - // SPECIAL: add 2027 and 2043 - // Maybe they are regarded the - // same hyphens in "central" - // position. - AddCharMap ('\u2027', 6, 1); - AddCharMap ('\u2043', 6, 1); - } - } + if (Char.GetUnicodeCategory ((char) i) + == UnicodeCategory.DashPunctuation) + AddCharMapGroupTail ((char) i, 6, 1); } // Arabic variable weight chars 06 A0 - fillIndex [6] = 0xA0; // vowels for (int i = 0x64B; i <= 0x650; i++) - AddArabicCharMap ((char) i); + AddCharMapGroupTail ((char) i, 6, 1); // sukun AddCharMapGroup ('\u0652', 6, 1, 0); // shadda @@ -1738,67 +1363,12 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); - // FIXME: needs more love here (it should eliminate - // all the hacky code above). - for (int i = 0x0300; i < 0x0370; i++) - if (!IsIgnorable (i) && diacritical [i] != 0 - /* especiall here*/ && !map [i].Defined) - map [i] = new CharMapEntry ( - 0x1, 0x1, diacritical [i]); - - // Cyrillic and Armenian nonspacing mark - fillIndex [0x1] = 0x94; - for (int i = 0x400; i < 0x580; i++) - if (!IsIgnorable (i) && - Char.GetUnicodeCategory ((char) i) == - UnicodeCategory.NonSpacingMark) - AddCharMap ((char) i, 1, 1); - - fillIndex [0x1] = 0x8D; - // syriac dotted nonspacing marks (1) - AddCharMap ('\u0740', 0x1, 1); - AddCharMap ('\u0741', 0x1, 1); - AddCharMap ('\u0742', 0x1, 1); - // syriac oblique nonspacing marks - AddCharMap ('\u0747', 0x1, 1); - AddCharMap ('\u0748', 0x1, 1); - // syriac dotted nonspacing marks (2) - fillIndex [0x1] = 0x94; // this reset is mandatory - AddCharMap ('\u0732', 0x1, 1); - AddCharMap ('\u0735', 0x1, 1); - AddCharMap ('\u0738', 0x1, 1); - AddCharMap ('\u0739', 0x1, 1); - AddCharMap ('\u073C', 0x1, 1); - // SPECIAL CASES: superscripts - AddCharMap ('\u073F', 0x1, 1); - AddCharMap ('\u0711', 0x1, 1); - // syriac "DOTS" - for (int i = 0x0743; i <= 0x0746; i++) - AddCharMap ((char) i, 0x1, 1); - for (int i = 0x0730; i <= 0x0780; i++) - if (!map [i].Defined && - Char.GetUnicodeCategory ((char) i) == - UnicodeCategory.NonSpacingMark) - AddCharMap ((char) i, 0x1, 1); - // LAMESPEC: It should not stop at '\u20E1'. There are // a few more characters (that however results in // overflow of level 2 unless we start before 0xDD). - fillIndex [0x1] = 0xDD; + fillIndex [0x1] = 0xDC; for (int i = 0x20d0; i <= 0x20e1; i++) AddCharMap ((char) i, 0x1, 1); - - // They are not part of Nonspacing marks, but have - // only diacritical weight. - for (int i = 0x3099; i <= 0x309C; i++) - map [i] = new CharMapEntry (1, 1, 1); - map [0xFF9E] = new CharMapEntry (1, 1, 1); - map [0xFF9F] = new CharMapEntry (1, 1, 2); - map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1); - map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1); - for (int i = 0x30FC; i <= 0x30FE; i++) - map [i] = new CharMapEntry (0xFF, 0xFF, 1); - #endregion @@ -1823,7 +1393,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMap ('\u2423', 0x7, 1, 0); // open box #endregion - // category 09 - continued symbols from 08 + // FIXME: 09 should be more complete. fillIndex [0x9] = 2; // misc tech mark for (int cp = 0x2300; cp <= 0x237A; cp++) @@ -1846,17 +1416,12 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la boxLv2 [i] = 3; foreach (DictionaryEntry de in boxValues) { int cp = (int) de.Key; - int off = (int) de.Value; + int idx = (int) de.Value; if (map [cp].Defined) continue; - if (off < 0) { - fillIndex [0x9] = (byte) (0xE5 + off); - AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++); - } - else { - fillIndex [0x9] = (byte) (0xE5 + off); - AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++); - } + fillIndex [0x9] = (byte) (0xE5 + idx); + AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]); + boxLv2 [idx]++; } // Some special characters (slanted) fillIndex [0x9] = 0xF4; @@ -1872,8 +1437,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && uc == UnicodeCategory.CurrencySymbol && - cp != '$' || - cp == 0xAC) + cp != '$') AddCharMapGroup ((char) cp, 0xA, 1, 0); } // byte other symbols @@ -1882,24 +1446,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la continue; // SPECIAL: skip FIXME: why? uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && - uc == UnicodeCategory.OtherSymbol || - cp == '\u00B5' || cp == '\u00B7') + uc == UnicodeCategory.OtherSymbol) AddCharMapGroup ((char) cp, 0xA, 1, 0); } - // U+30FB here - AddCharMapGroup ('\u30FB', 0xA, 1, 0); - for (int cp = 0x2020; cp <= 0x2031; cp++) - if (Char.IsPunctuation ((char) cp)) - AddCharMap ((char) cp, 0xA, 1, 0); - // SPECIAL CASES: why? - AddCharMap ('\u203B', 0xA, 1, 0); - AddCharMap ('\u2040', 0xA, 1, 0); - AddCharMap ('\u2041', 0xA, 1, 0); - AddCharMap ('\u2042', 0xA, 1, 0); - - for (int cp = 0x20A0; cp <= 0x20AB; cp++) - AddCharMap ((char) cp, 0xA, 1, 0); fillIndex [0xA] = 0x2F; // FIXME: it won't be needed for (int cp = 0x2600; cp <= 0x2613; cp++) AddCharMap ((char) cp, 0xA, 1, 0); @@ -1951,18 +1501,14 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la fillIndex [0xC]++; int xcp; - if (currValue <= 10) { - xcp = (int) prevValue + 0x2170 - 1; - AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = (int) prevValue + 0x2160 - 1; - AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - fillIndex [0xC] += 2; - xcp = (int) prevValue + 0x3021 - 1; - AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - fillIndex [0xC]++; - } - else if (currValue == 11) - fillIndex [0xC]++; + xcp = (int) prevValue + 0x2170 - 1; + AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); + xcp = (int) prevValue + 0x2160 - 1; + AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); + fillIndex [0xC] += 2; + xcp = (int) prevValue + 0x3021 - 1; + AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); + fillIndex [0xC]++; } if (prevValue < currValue) prevValue = currValue; @@ -1980,23 +1526,23 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la else if (cp == 0x3021) // FIXME: why? fillIndex [0xC]++; AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]); + if (addnew || cp <= '9') { - int mod = (int) currValue - 1; int xcp; if (1 <= currValue && currValue <= 10) { - xcp = mod + 0x2776; + xcp = cp - 0x31 + 0x2776; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = mod + 0x2780; + xcp = cp - 0x31 + 0x2780; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = mod + 0x278A; + xcp = cp - 0x31 + 0x278A; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } if (1 <= currValue && currValue <= 20) { - xcp = mod + 0x2460; + xcp = cp - 0x31 + 0x2460; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = mod + 0x2474; + xcp = cp - 0x31 + 0x2474; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - xcp = mod + 0x2488; + xcp = cp - 0x31 + 0x2488; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } } @@ -2026,6 +1572,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la for (int i = 0; i < alphabets.Length; i++) AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]); + // non-ASCII Latin alphabets // FIXME: there is no such characters that are placed // *after* "alphabets" array items. This is nothing @@ -2045,9 +1592,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // but inside a-to-z range. // 3.there are some expanded characters that // are not part of Unicode Standard NFKD. - // 4. some characters are letter in IsLetter - // but not in sortkeys (maybe unicode version - // difference caused it). switch (i) { // 1. skipping them does not make sense // case 0xD0: case 0xF0: case 0x131: case 0x138: @@ -2065,12 +1609,11 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la case 0xFE: // Icelandic Thorn case 0xDF: // German ss case 0xFF: // German ss - // 4. - case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // not classified yet // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9: // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8: // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF: +// case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // case 0x1DD: continue; } @@ -2091,82 +1634,17 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0xF, 1); - // Cyrillic. - // Cyrillic letters are sorted like Latin letters i.e. - // containing culture-specific letters between the - // standard Cyrillic sequence. - // - // We can't use UCA here; it has different sorting. - char [] orderedCyrillic = new char [] { - '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', - '\u0452', // DJE for Serbocroatian - '\u0435', - '\u0454', // IE for Ukrainian - '\u0436', '\u0437', - '\u0455', // DZE - '\u0438', - '\u0456', // Byelorussian-Ukrainian I - '\u0457', // YI - '\u0439', - '\u0458', // JE - '\u043A', '\u043B', - '\u0459', // LJE - '\u043C', '\u043D', - '\u045A', // NJE - '\u043E', - // 4E9 goes here. - '\u043F', '\u0440', '\u0441', '\u0442', - '\u045B', // TSHE for Serbocroatian - '\u0443', - '\u045E', // Short U for Byelorussian - '\u04B1', // Straight U w/ stroke (diacritical!) - '\u0444', '\u0445', '\u0446', '\u0447', - '\u045F', // DZHE - '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', - '\u044D', '\u044E', '\u044F'}; - - // For some characters here is a map to basic cyrillic - // letters. See UnicodeData.txt character names for - // the sources. Here I simply declare an equiv. array. - // The content characters are map from U+490(,491), - // skipping small letters. - char [] cymap_src = new char [] { - '\u0433', '\u0433', '\u0433', '\u0436', - '\u0437', '\u043A', '\u043A', '\u043A', - '\u043A', '\u043D', '\u043D', '\u043F', - '\u0445', '\u0441', '\u0442', '\u0443', - '\u0443', '\u0445', '\u0446', '\u0447', - '\u0447', '\u0432', '\u0435', '\u0435', - '\u0406', '\u0436', '\u043A', '\u043D', - '\u0447', '\u0435'}; - - fillIndex [0x10] = 0x8D; - for (int i = 0x0460; i < 0x0481; i++) { - if (Char.IsLetter ((char) i)) { - if (i == 0x0476) - // U+476/477 have the same - // primary weight as U+474/475. - fillIndex [0x10] -= 3; - AddLetterMap ((char) i, 0x10, 3); - } - } - - fillIndex [0x10] = 0x6; + // Cyrillic - UCA order w/ some modification + fillIndex [0x10] = 0x3; + // table which is moslty from UCA DUCET. for (int i = 0; i < orderedCyrillic.Length; i++) { - char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture); - if (!IsIgnorable ((int) c) && - Char.IsLetter (c) && - !map [c].Defined) { - AddLetterMap (c, 0x10, 0); - fillIndex [0x10] += 3; - } + char c = orderedCyrillic [i]; + if (Char.IsLetter (c)) + AddLetterMap (c, 0x10, 3); } - - for (int i = 0; i < cymap_src.Length; i++) { - char c = cymap_src [i]; - fillIndex [0x10] = map [c].Level1; - AddLetterMap ((char) (0x0490 + i * 2), - 0x10, 0); + for (int i = 0x0460; i < 0x0481; i++) { + if (Char.IsLetter ((char) i)) + AddLetterMap ((char) i, 0x10, 3); } // Armenian @@ -2177,18 +1655,15 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Hebrew // -Letters - fillIndex [0x12] = 0x2; + fillIndex [0x12] = 0x3; for (int i = 0x05D0; i < 0x05FF; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x12, 1); // -Accents fillIndex [0x1] = 0x3; - for (int i = 0x0591; i <= 0x05C2; i++) { - if (i == 0x05A3 || i == 0x05BB) - fillIndex [0x1]++; + for (int i = 0x0591; i <= 0x05C2; i++) if (i != 0x05BE) AddCharMap ((char) i, 0x1, 1); - } // Arabic fillIndex [0x1] = 0x8E; @@ -2206,18 +1681,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // (byte) arabicLetterPrimaryValues [i], 1); fillIndex [0x13] = (byte) arabicLetterPrimaryValues [i]; - byte formDiacritical = 8; // default - // SPECIAL CASES: - switch (i) { - case 0x0622: formDiacritical = 9; break; - case 0x0623: formDiacritical = 0xA; break; - case 0x0624: formDiacritical = 5; break; - case 0x0625: formDiacritical = 0xB; break; - case 0x0626: formDiacritical = 7; break; - case 0x0649: formDiacritical = 5; break; - case 0x064A: formDiacritical = 7; break; - } - AddLetterMapCore ((char) i, 0x13, 1, formDiacritical); + AddLetterMap ((char) i, 0x13, 0); } fillIndex [0x13] = 0x84; for (int i = 0x0674; i < 0x06D6; i++) @@ -2231,26 +1695,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); fillIndex [0x14] = 0xB; - for (int i = 0x0905; i < 0x093A; i++) { - if (i == 0x0928) - AddCharMap ('\u0929', 0x14, 0, 8); - if (i == 0x0930) - AddCharMap ('\u0931', 0x14, 0, 8); - if (i == 0x0933) - AddCharMap ('\u0934', 0x14, 0, 8); + for (int i = 0x0905; i < 0x093A; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x14, 4); - if (i == 0x090B) - AddCharMap ('\u0960', 0x14, 4); - if (i == 0x090C) - AddCharMap ('\u0961', 0x14, 4); - } - fillIndex [0x14] = 0xDA; - for (int i = 0x093E; i < 0x0945; i++) - if (!IsIgnorable (i)) - AddLetterMap ((char) i, 0x14, 2); - fillIndex [0x14] = 0xEC; - for (int i = 0x0945; i < 0x094F; i++) + for (int i = 0x093E; i < 0x094F; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); @@ -2279,81 +1727,33 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Gurmukhi. orderedGurmukhi is from UCA // FIXME: it does not look equivalent to UCA. - fillIndex [0x16] = 04; - fillIndex [0x1] = 3; + fillIndex [0x1] = 03; + fillIndex [0x16] = 02; for (int i = 0; i < orderedGurmukhi.Length; i++) { char c = orderedGurmukhi [i]; if (IsIgnorable ((int) c)) continue; - if (IsIgnorableNonSpacing (c)) { + if (!Char.IsLetter (c)) { AddLetterMap (c, 0x1, 1); continue; } if (c == '\u0A3C' || c == '\u0A4D' || '\u0A66' <= c && c <= '\u0A71') continue; - // SPECIAL CASE: U+A38 = U+A36 at primary level (why?) - byte shift = 4; - if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E') - shift = 0; - AddLetterMap (c, 0x16, shift); + AddLetterMap (c, 0x16, 4); } // Gujarati. orderedGujarati is from UCA - fillIndex [0x17] = 0x4; - // nonspacing marks - map [0x0A4D] = new CharMapEntry (1, 0, 0x3); - map [0x0ABD] = new CharMapEntry (1, 0, 0x3); - map [0x0A3C] = new CharMapEntry (1, 0, 0x4); - map [0x0A71] = new CharMapEntry (1, 0, 0x6); - map [0x0ABC] = new CharMapEntry (1, 0, 0xB); - map [0x0A70] = new CharMapEntry (1, 0, 0xE); - // letters go first. - for (int i = 0; i < orderedGujarati.Length; i++) { - // SPECIAL CASE - char c = orderedGujarati [i]; - if (Char.IsLetter (c)) { - // SPECIAL CASES - if (c == '\u0AB3' || c == '\u0A32') - continue; - if (c == '\u0A33') { - AddCharMap ('\u0A32', 0x17, 0); - AddCharMap ('\u0A33', 0x17, 4, 4); - continue; - } - if (c == '\u0A8B') - AddCharMap ('\u0AE0', 0x17, 0, 5); - AddCharMap (c, 0x17, 4); - - if (c == '\u0AB9') - AddCharMap ('\u0AB3', 0x17, 6); - } - } - // non-letters - byte gujaratiShift = 4; - fillIndex [0x17] = 0xC0; - for (int i = 0; i < orderedGujarati.Length; i++) { - char c = orderedGujarati [i]; - if (fillIndex [0x17] == 0xCC) - gujaratiShift = 3; - if (!Char.IsLetter (c)) { - // SPECIAL CASES - if (c == '\u0A82') - AddCharMap ('\u0A81', 0x17, 2); - if (c == '\u0AC2') - fillIndex [0x17]++; - AddLetterMap (c, 0x17, gujaratiShift); - } - } + fillIndex [0x17] = 02; + for (int i = 0; i < orderedGujarati.Length; i++) + AddLetterMap (orderedGujarati [i], 0x17, 4); // Oriya - fillIndex [0x1] = 03; fillIndex [0x18] = 02; for (int i = 0x0B00; i < 0x0B7F; i++) { switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.NonSpacingMark: case UnicodeCategory.DecimalDigitNumber: - AddLetterMap ((char) i, 0x1, 1); continue; } AddLetterMap ((char) i, 0x18, 1); @@ -2364,11 +1764,13 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMap ('\u0BD7', 0x19, 0); fillIndex [0x19] = 0xA; // vowels - for (int i = 0x0B82; i <= 0x0B94; i++) - if (!IsIgnorable ((char) i)) + for (int i = 0x0BD7; i < 0x0B94; i++) + if (Char.IsLetter ((char) i)) AddCharMap ((char) i, 0x19, 2); // special vowel - fillIndex [0x19] = 0x28; + fillIndex [0x19] = 0x24; + AddCharMap ('\u0B94', 0x19, 0); + fillIndex [0x19] = 0x26; // The array for Tamil consonants is a constant. // Windows have almost similar sequence to TAM from // tamilnet but a bit different in Grantha. @@ -2400,82 +1802,47 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la for (int i = 0x0C80; i < 0x0CE5; i++) { if (i == 0x0CD5 || i == 0x0CD6) continue; // ignore - if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE) - continue; // shift after 0xCB9 AddCharMap ((char) i, 0x1B, 3); - if (i == 0x0CB9) { - // SPECIAL CASES: but why? - AddCharMap ('\u0CB1', 0x1B, 3); // RRA - AddCharMap ('\u0CB3', 0x1B, 3); // LLA - AddCharMap ('\u0CDE', 0x1B, 3); // FA - } - if (i == 0x0CB2) - AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL } // Malayalam fillIndex [0x1C] = 2; - fillIndex [0x1] = 3; - for (int i = 0x0D02; i < 0x0D61; i++) { + for (int i = 0x0D02; i < 0x0D61; i++) // FIXME: I avoided MSCompatUnicodeTable usage // here (it results in recursion). So check if // using NonSpacingMark makes sense or not. if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) // if (!MSCompatUnicodeTable.IsIgnorable ((char) i)) AddCharMap ((char) i, 0x1C, 1); - else if (!IsIgnorable ((char) i)) - AddCharMap ((char) i, 1, 1); - } // Thai ... note that it breaks 0x1E wall after E2B! // Also, all Thai characters have level 2 value 3. fillIndex [0x1E] = 2; - fillIndex [0x1] = 3; - for (int i = 0xE40; i <= 0xE44; i++) + for (int i = 0xE44; i < 0xE48; i++) AddCharMap ((char) i, 0x1E, 1, 3); for (int i = 0xE01; i < 0xE2B; i++) - AddCharMap ((char) i, 0x1E, 6, 3); + AddCharMap ((char) i, 0x1E, 6, 0); fillIndex [0x1F] = 5; for (int i = 0xE2B; i < 0xE30; i++) - AddCharMap ((char) i, 0x1F, 6, 3); - fillIndex [0x1F] = 0x1E; + AddCharMap ((char) i, 0x1F, 6, 0); for (int i = 0xE30; i < 0xE3B; i++) AddCharMap ((char) i, 0x1F, 1, 3); // some Thai characters remains. char [] specialThai = new char [] {'\u0E45', '\u0E46', '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'}; foreach (char c in specialThai) - AddCharMap (c, 0x1F, 1, 3); - - for (int i = 0xE00; i < 0xE80; i++) - if (Char.GetUnicodeCategory ((char) i) == - UnicodeCategory.NonSpacingMark) - AddCharMap ((char) i, 1, 1); + AddCharMap (c, 0x1F, 1); // Lao fillIndex [0x1F] = 2; - fillIndex [0x1] = 3; - for (int i = 0xE80; i < 0xEDF; i++) { - if (IsIgnorable ((char) i)) - continue; - else if (Char.IsLetter ((char) i)) + for (int i = 0xE80; i < 0xEDF; i++) + if (Char.IsLetter ((char) i)) AddCharMap ((char) i, 0x1F, 1); - else if (Char.GetUnicodeCategory ((char) i) == - UnicodeCategory.NonSpacingMark) - AddCharMap ((char) i, 1, 1); - } // Georgian. orderedGeorgian is from UCA DUCET. fillIndex [0x21] = 5; - for (int i = 0; i < orderedGeorgian.Length; i++) { - char c = orderedGeorgian [i]; - if (map [(int) c].Defined) - continue; - AddCharMap (c, 0x21, 0); - if (c < '\u10F6') - AddCharMap ((char) (c - 0x30), 0x21, 0); - fillIndex [0x21] += 5; - } + for (int i = 0; i < orderedGeorgian.Length; i++) + AddLetterMap (orderedGeorgian [i], 0x21, 5); // Japanese Kana. fillIndex [0x22] = 2; @@ -2500,16 +1867,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddKanaMap (cp, kanaLines [gyo]); fillIndex [0x22]++; - if (cp == 0x30AB) { - // add small 'ka' (before normal one) - AddKanaMap (0x30F5, 1); - kanaOffset++; - } - if (cp == 0x30B1) { - // add small 'ke' (before normal one) - AddKanaMap (0x30F6, 1); - kanaOffset++; - } if (cp == 0x3061) { // add small 'Tsu' (before normal one) AddKanaMap (0x3063, 1); @@ -2540,27 +1897,11 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddLetterMap ((char) 0x3093, 0x22, 0); AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0); - map [0x3094] = new CharMapEntry (map [0x30A6].Category, - map [0x30A6].Level1, 3);// voiced hiragana U - map [0x30F4] = new CharMapEntry (map [0x30A6].Category, - map [0x30A6].Level1, 3);// voiced katakana U - - map [0x30F5] = new CharMapEntry (map [0x30AB].Category, - map [0x30AB].Level1, 0);// small katakana Ka - map [0x30F6] = new CharMapEntry (map [0x30B1].Category, - map [0x30B1].Level1, 0);// small katakana Ke - // voiced Wa lines - for (int i = 0x30F7; i < 0x30FB; i++) - map [i] = new CharMapEntry (map [i - 8].Category, - map [i - 8].Level1, - 3); - // JIS Japanese square chars. fillIndex [0x22] = 0x97; jisJapanese.Sort (JISComparer.Instance); foreach (JISCharacter j in jisJapanese) - if (0x3300 <= j.CP && j.CP <= 0x3357) - AddCharMap ((char) j.CP, 0x22, 1); + AddCharMap ((char) j.CP, 0x22, 1); // non-JIS Japanese square chars. nonJisJapanese.Sort (NonJISComparer.Instance); foreach (NonJISCharacter j in nonJisJapanese) @@ -2590,19 +1931,14 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la map [cp] = new CharMapEntry (0x24, (byte) (map [cp - 1].Level1 + 2), 0); - // FIXME: Syriac NonSpacingMark should go here. // Thaana // FIXME: it turned out that it does not look like UCA fillIndex [0x24] = 0x6E; - fillIndex [0x1] = 0xAC; for (int i = 0; i < orderedThaana.Length; i++) { - char c = orderedThaana [i]; - if (IsIgnorableNonSpacing ((int) c)) - AddCharMap (c, 1, 1); - AddCharMap (c, 0x24, 2); - if (c == '\u0782') // SPECIAL CASE: why? - fillIndex [0x24] += 2; + if (IsIgnorableNonSpacing (i)) + continue; + AddCharMap (orderedThaana [i], 0x24, 2); } #endregion @@ -2641,7 +1977,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la + "<{\u1113 \u1116}, \u3165," + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8," + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >" - + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >" + + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >" + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1," + "[\u11D1 \u11D2], \u11B2," + "[\u11D3 \u11D5], \u11B3," @@ -2649,11 +1985,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >" + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >" + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >" - + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, " - + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178," - + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>" - + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C " - + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >" + + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],," + + "\u1109=\u11BA,,, \u3214=\u3274 <>" + + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],," + + "\u11EA,, \u110A=\u11BB,,, >" + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB," + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >" + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, " @@ -2721,40 +2056,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } - // Some Jamo NFKD. - for (int i = 0x3200; i < 0x3300; i++) { - if (IsIgnorable (i) || map [i].Defined) - continue; - int ch = 0; - // w/ bracket - if (decompLength [i] == 4 && - decompValues [decompIndex [i]] == '(') - ch = decompIndex [i] + 1; - // circled - else if (decompLength [i] == 2 && - decompValues [decompIndex [i] + 1] == '\u1161') - ch = decompIndex [i]; - else if (decompLength [i] == 1) - ch = decompIndex [i]; - else - continue; - ch = decompValues [ch]; - if (ch < 0x1100 || 0x1200 < ch && - ch < 0xAC00 || 0xD800 < ch) - continue; - - // SPECIAL CASE ? - int offset = i < 0x3260 ? 1 : 0; - if (0x326E <= i && i <= 0x3273) - offset = 1; - - map [i] = new CharMapEntry (map [ch].Category, - (byte) (map [ch].Level1 + offset), - map [ch].Level2); -// Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]); - } - - #endregion // Letterlike characters and CJK compatibility square @@ -2811,8 +2112,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Insert 3001 after ',' and 3002 after '.' if (i == 0x2C) AddCharMapGroup2 ('\u3001', 0x7, 1, 0); - else if (i == 0x2E) + else if (i == 0x2E) { + fillIndex [0x7]--; AddCharMapGroup2 ('\u3002', 0x7, 1, 0); + } else if (i == 0x3A) AddCharMap ('\uFE30', 0x7, 1, 0); } @@ -2823,18 +2126,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (IsIgnorable (i)) continue; - // FIXME: actually those reset should not be - // done but here I put for easy goal. - if (i == 0x0700) - fillIndex [0x7] = 0xE2; - if (i == 0x2016) - fillIndex [0x7] = 0x77; - // SPECIAL CASES: switch (i) { case 0xAB: // 08 case 0xB7: // 0A - case 0xBB: // 08 case 0x2329: // 09 case 0x232A: // 09 continue; @@ -2848,7 +2143,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la case UnicodeCategory.FinalQuotePunctuation: case UnicodeCategory.ModifierSymbol: // SPECIAL CASES: // 0xA - if (0x2020 <= i && i <= 0x2031) + if (0x2020 <= i && i <= 0x2042) continue; AddCharMapGroup ((char) i, 0x7, 1, 0); break; @@ -2859,15 +2154,20 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } // Control pictures - // FIXME: it should not need to reset level 1, but - // it's for easy goal. - fillIndex [0x7] = 0xB6; for (int i = 0x2400; i <= 0x2421; i++) AddCharMap ((char) i, 0x7, 1, 0); #endregion // FIXME: for 07 xx we need more love. + // FIXME: 08 should be more complete. + fillIndex [0x8] = 2; + for (int cp = 0; cp < char.MaxValue; cp++) + if (!map [cp].Defined && + Char.GetUnicodeCategory ((char) cp) == + UnicodeCategory.MathSymbol) + AddCharMapGroup ((char) cp, 0x8, 1, 0); + // Characters w/ diacritical marks (NFKD) for (int i = 0; i <= char.MaxValue; i++) { if (map [i].Defined || IsIgnorable (i)) @@ -2906,60 +2206,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } - // category 08 - symbols - fillIndex [0x8] = 2; - // Here Windows mapping is not straightforward. It is - // not based on computation but seems manual sorting. - AddCharMapGroup ('+', 0x8, 1, 0); // plus - AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus - AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus - AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul - AddCharMapGroup ('\u2044', 0x8, 1, 0); // div - AddCharMapGroup ('\u2215', 0x8, 1, 0); // div - AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul - AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring - AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet - AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus - AddCharMapGroup ('\u003C', 0x8, 1, 0); // < - AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation - AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation - - for (int cp = 0; cp < 0x2300; cp++) { - if (cp == 0xAC) // SPECIAL CASE: skip - continue; - if (cp == 0x200) { - cp = 0x2200; // skip to 2200 - fillIndex [0x8] = 0x21; - } - if (cp == 0x2295) - fillIndex [0x8] = 0x3; - if (cp == 0x22B2) - fillIndex [0x8] = 0xB9; - if (!map [cp].Defined && -// Char.GetUnicodeCategory ((char) cp) == -// UnicodeCategory.MathSymbol) - Char.IsSymbol ((char) cp)) - AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]); - // SPECIAL CASES: no idea why Windows sorts as such - switch (cp) { - case 0x3E: - AddCharMap ('\u227B', 0x8, 1, 0); - AddCharMap ('\u22B1', 0x8, 1, 0); - break; - case 0xB1: - AddCharMapGroup ('\u00AB', 0x8, 1, 0); - AddCharMapGroup ('\u226A', 0x8, 1, 0); - AddCharMapGroup ('\u00BB', 0x8, 1, 0); - AddCharMapGroup ('\u226B', 0x8, 1, 0); - break; - case 0xF7: - AddCharMap ('\u01C0', 0x8, 1, 0); - AddCharMap ('\u01C1', 0x8, 1, 0); - AddCharMap ('\u01C2', 0x8, 1, 0); - break; - } - } - #region Level2 adjustment // Arabic Hamzah diacritical [0x624] = 0x5; @@ -2970,6 +2216,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la diacritical [0x649] = 0x5; // 'alif maqs.uurah diacritical [0x64A] = 0x7; // Yaa' + for (int i = 0; i < char.MaxValue; i++) { byte mod = 0; byte cat = map [i].Category; @@ -2979,7 +2226,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la mod = diacritical [i]; break; case 0x13: // Arabic - if (diacritical [i] == 0 && i >= 0xFE8D) + if (diacritical [i] == 0) mod = 0x8; // default for arabic break; } @@ -2991,23 +2238,15 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } #endregion - // FIXME: this is hack but those NonSpacingMark - // characters and still undefined are likely to - // be nonspacing. + // FIXME: this is hack but those which are + // NonSpacingMark characters and still undefined + // are likely to be nonspacing. for (int i = 0; i < char.MaxValue; i++) if (!map [i].Defined && !IsIgnorable (i) && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); - - // FIXME: this is hack but those Symbol characters - // are likely to fall into 0xA category. - for (int i = 0; i < char.MaxValue; i++) - if (!map [i].Defined && - !IsIgnorable (i) && - Char.IsSymbol ((char) i)) - AddCharMap ((char) i, 0xA, 1); } private void IncrementSequentialIndex (ref byte hangulCat) @@ -3085,6 +2324,19 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la return true; } + private void AddCharMapGroupTail (char c, byte category, byte updateCount) + { + char c2 = ToSmallFormTail (c); + if (c2 != c) + AddCharMap (c2, category, updateCount, 0); + // itself + AddCharMap (c, category, updateCount, 0); + // + c2 = ToFullWidthTail (c); + if (c2 != c) + AddCharMapGroupTail (c2, category, updateCount); + } + // // Adds characters to table in the order below // (+ increases weight): @@ -3160,10 +2412,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMapCJK (c, ref category); // LAMESPEC: see below. - if (c == '\u5B78') { - AddCharMapCJK ('\u32AB', ref category); - AddCharMapCJK ('\u323B', ref category); - } if (c == '\u52DE') { AddCharMapCJK ('\u3298', ref category); AddCharMapCJK ('\u3238', ref category); @@ -3193,8 +2441,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // mix Chinise and Japanese Kanji when // ordering those characters. switch (w) { - case 0x32A2: case 0x3298: case 0x3238: - case 0x32A9: case 0x323B: case 0x32AB: + case 0x32A2: case 0x3298: case 0x3238: case 0x32A9: continue; } @@ -3245,26 +2492,14 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMap (vertical, category, updateCount, level2); } - private void AddArabicCharMap (char c) + char ToFullWidth (char c) { - byte category = 6; - byte updateCount = 1; - byte level2 = 0; - - // itself - AddCharMap (c, category, 0, level2); + return ToDecomposed (c, DecompositionFull, false); + } - // Since nfkdMap is problematic to have two or more - // NFKD to an identical character, here I iterate all. - for (int c2 = 0; c2 < char.MaxValue; c2++) { - if (decompLength [c2] == 0) - continue; - int idx = decompIndex [c2] + decompLength [c2] - 1; - if ((int) (decompValues [idx]) == (int) c) - AddCharMap ((char) c2, category, - 0, level2); - } - fillIndex [category] += updateCount; + char ToFullWidthTail (char c) + { + return ToDecomposed (c, DecompositionFull, true); } char ToSmallForm (char c) @@ -3272,6 +2507,11 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la return ToDecomposed (c, DecompositionSmall, false); } + char ToSmallFormTail (char c) + { + return ToDecomposed (c, DecompositionSmall, true); + } + char ToDecomposed (char c, byte d, bool tail) { if (decompType [(int) c] != d) @@ -3302,30 +2542,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value { - // CJK compat - if ('\u3192' <= c && c <= '\u319F') - return 0; - - // They have NFKD mapping, and on Windows - // those narrow characters are regarded as "normal", - // thus those characters themselves are regarded as - // "wide". grep "" and you can pick them up - // (ignoring Kana, Hangul etc.) - switch (c) { - case '\u3002': - case '\u300C': - case '\u300D': - case '\u3001': - case '\u30FB': - case '\u2502': - case '\u2190': - case '\u2191': - case '\u2192': - case '\u2193': - case '\u25A0': - case '\u25CB': - return 1; - } // Korean if ('\u11A8' <= c && c <= '\u11F9') return 2; @@ -3333,11 +2549,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la return 4; if ('\u3130' <= c && c <= '\u3164') return 5; - if ('\u3165' <= c && c <= '\u318E') - return 4; - // Georgian Capital letters - if ('\u10A0' <= c && c <= '\u10C5') - return 0x10; // numbers if ('\u2776' <= c && c <= '\u277F') return 4; @@ -3346,13 +2557,13 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if ('\u2776' <= c && c <= '\u2793') return 0xC; if ('\u2160' <= c && c <= '\u216F') - return 0x10; + return 0x18; if ('\u2181' <= c && c <= '\u2182') return 0x18; // Arabic if ('\u2135' <= c && c <= '\u2138') return 4; - if ('\uFE80' <= c && c < '\uFF00') { + if ('\uFE80' <= c && c < '\uFE8E') { // 2(Isolated)/8(Final)/0x18(Medial) switch (decompType [(int) c]) { case DecompositionIsolated: @@ -3453,7 +2664,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // those ranges. case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9: - case 0x70F: case 0x3036: case 0x303f: case 0x337b: case 0xfb1e: return false; @@ -3812,7 +3022,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la { JISCharacter j1 = (JISCharacter) o1; JISCharacter j2 = (JISCharacter) o2; - return j1.JIS - j2.JIS; + return j2.JIS - j1.JIS; } } @@ -4022,7 +3232,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la for (int i = 0; i < Source.Length; i++) ret [i + 1] = Source [i]; // null terminate - for (int i = 0; i < 4; i++) + for (int i = 0; i < 5; i++) ret [i + Source.Length + 2] = (char) SortKey [i]; return ret; }