From: Atsushi Eno Date: Wed, 25 May 2005 16:42:33 +0000 (-0000) Subject: 2005-05-25 Atsushi Enomoto X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=commitdiff_plain;h=5bd7cf358da3a8d27e43fdd361f7542f04883a38;hp=e8295e94cf86519d79ee3125f452b6c20597ff85;p=mono.git 2005-05-25 Atsushi Enomoto * Collation-notes.txt : more info. Started letter sortkey analysis (some of other stuff are really non-understandable right now.) * create-mscompat-collation-table.cs : table generator proof-of- concept source (not compilable). * MSCompatUnicodeTable.cs : moved some code to the new source. Some more fixes. svn path=/branches/atsushi/mcs/; revision=45005 --- diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog index c25faaf39a8..e98eda9db39 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog +++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog @@ -1,3 +1,12 @@ +2005-05-25 Atsushi Enomoto + + * Collation-notes.txt : more info. Started letter sortkey analysis + (some of other stuff are really non-understandable right now.) + * create-mscompat-collation-table.cs : table generator proof-of- + concept source (not compilable). + * MSCompatUnicodeTable.cs : moved some code to the new source. + Some more fixes. + 2005-05-20 Atsushi Enomoto * Collation-notes.txt : started level 2 weight analysis. diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt index b1dc50b64d2..61d78e9ae29 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt +++ b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt @@ -210,12 +210,22 @@ **** level 2 + For Japanese voice marks, it just sums the count up. There also seems special rule for Thai (E01-E4F) e.g. E47 works like Japanese voice marks. For other letters, there will be a table. 
+ + + It looks like all level 2 keys are just accumulated, however without + considering overflow. It sometimes makes sense (e.g. diaeresis and + acute) but it causes many conflicts (e.g. "A\u0308\u0301" and "\u1EA6" + are incorrectly regarded as equal). + + Anyways since Japanese voice mark has level 2 value as 1 it just + looked like the sum of voice marks. **** level 3 @@ -404,7 +414,7 @@ Characters in non "0E" category are out of scope. - They could be grepped in UnicodeData.txt. + They can be grepped in UnicodeData.txt. -0E: acute -0F: grave -10: dot above @@ -535,6 +545,7 @@ 1 specially ignored ones (Japanese, Tamil, Thai) + IdentifyBy: constants Unicode: 3099-309C, BCD, E47, E4C, FF9E, FF9F SortKey: 01 01 01 01 00 @@ -546,21 +557,25 @@ 2.1 control characters (specified as such in Unicode), except for whitespaces (0009-000D). + ProcessAfter: 4.1 + IdentifyBy: UnicodeCategory.Control Unicode: 0001-000F minus 0009-000D, 007F-009F - SortKey: 06 80 07 06 03 00 - 06 80 07 06 3D 00 + SortKey: 06 03 - 06 3D 2.2 Apostrophe + IdentifyBy: constant Unicode: 0027,FF07 (') - SortKey: 06 80 (and nonspace equivalent) + SortKey: 06 80 (and width insensitive equivalents) 2.3 minus sign, hyphen, dash minus signs: FE63, 207B (super), 208B (sub), 002D, 00FD (full-width) hyphens: 00AD (soft), 2010, 2011 (nonbreaking) ... Unicode HYPHEN? dashes, horizontal bars: FE58 ... UnicodeCategory.DashPunctuation + IdentifyBy: UnicodeCategory.DashPunctuation SortKey: 06 81 - 06 90 (and nonspace equivalents) - 2.4 Arabic spacing and equivalents (64B-651, FE70-FE7F) + 2.4 Arabic spacing and equivalents (64B-652, FE70-FE7F) They are part of nonspacing mark, but not equal. SortKey: 06 A0 - 06 A7 (and nonspace equivalents) @@ -571,7 +586,7 @@ (i.e. < 128) nor those equivalents NonSpacingMark which is ignorable (IsIgnorableNonSpacing()) - // 30D, CD5-CD6, ABD, 2B9-2C1, 2C8, 2CB-2CD, 591-5C2. NonSpacingMark in + // 30D, CD5-CD6, ABD, 2B9-2C5, 2C8, 2CB-2CD, 591-5C2. NonSpacingMark in // 981-A3C. 
A4D, A70, A71, ABC ... TODO: I need more insight to write table generator. @@ -596,6 +611,11 @@ If in "discriminatory mode", those tables could be still provided as to be compatible to Windows. + Additionally there seems some bugs around Modifier letter collection. + For example, 2C6 should be nonspacing diacritical character but it + is regarded as a primary character. The same applies to Mandarin + tone marks (2C9-2CB) (and there's a plenty of such characters). + 4 space separators and some kind of marks 4.1 whitespaces, paragraph separator etc. @@ -607,8 +627,12 @@ SortKey : 07 19 - 07 1A - 4.3 other marks ('!', '^', ...) - Non-alpha-numeric < 0x7F except for '+' (math) and '-' (math/hyphen) + 4.3 ASCII compatible marks ('!', '^', ...) + Non-alpha-numeric < 0x7F except for [[+-<=>']] + small compatibility equivalents -> itself, wide + + 4.3 other marks + FIXME: how to identify them? some Punctuations: InitialQuote/FinalQuote/Open/Close/Connector some OtherSymbols: 2400-2424 3003, 3006, 2D0, 10FB @@ -622,7 +646,7 @@ (not Quotation_Mark property in PropList.txt ; 22, 27) byte area MathSymbol: 2B,3C,3D,3E,AB,B1,BB,D7,F7 except for AC - MathSymbol (2044, 208A, 208C, 207A, 207C) + some MathSymbol (2044, 208A, 208C, 207A, 207C) OtherLetter (1C0-1C2) 2200-22FF MathSymbol except for 221E (INF. ; regarded as a number) @@ -630,6 +654,7 @@ 6 Arrows and Box drawings 09 02 .. 09 7C : 2300-237A + only primary differences 09 BC ... 09 FE : 25A0-AB, 25E7-EB, 25AC-B5, 25EC-EF, 25B6-B9, 25BC-C3, 25BA-25BB, 25C4-25D8, 25E6, 25DA-25E5 21*,25*,26*,27* @@ -674,17 +699,24 @@ This ordering is nothing to do with European Ordering Rules (EOR). - 10 (F) greek letters - 0F: 386-3F2 - 10: 400-4E9 exc. 482-486 - 11: 531-586 exc. 559-55F - 12: 5D0-5F2 - 13: 621-64A, 670-6D3, 6D5 + 10 culture dependent letters (general) + 0F: 386-3F2 ... Greek and Coptic + 386-3CF: 0F 02 - 0F 19 (consider primary equivalents) + 3D0-3EF: 0F 40 - 0F 54 + 10: 400-4E9 ... Cyrillic. 
+ For 400-45F and 4B1, they are mostly UCA DUCET order. + After that 460-481 follows, by codepoint. + (490-4FF except for 4B1 and Cyrillic supplementary are unused.) + 11: 531-586 ... Armenian. + Simply sorted by codepoint (handle case). + 12: 5D0-5F2 ... Hebrew + Codepoint order (handle case). + 13: 621-6D5 plus 670 (NonSpacingMark) ... Arabic + 14: 901-963 exc. 93C-93D 950-954 15: 982-9FA exc. NonSpacingMark DecimalDigitNumber OtherNumber 16: A05-A74 exc. A3C A4D A66-A71 17: A81-AE0 exc. ABC-ABD - 18: ... @@ -744,13 +776,14 @@ 3400-4DB5. Ordered, considering case/width equivalents. - 20 (FF FF 01 01 01 01 00) Some supplemental Japanese/Arabic marks + 20 (FF FF 01 01 01 01 00) Some Japanese/Arabic extenders + Actually FE7C and FE7D are not extender in Unicode (PropList.txt) 3005, 3031, 3032, 309D, 309E, 30FC, 30FD, 30FE, FE7C, FE7D, FF70 - by UnicodeCategory - - DashPunctuation 1 1 1 1 (no exception) + DashPunctuation 6 (no exception) DecimalDigitNumber C (no exception) EnclosingMark 1 E (no exception) Format 7 (only 70F) @@ -763,7 +796,7 @@ OtherNumber C(<3192), 9E-A7 (3124<) - Control 1 1 1 1 except for 9-D (7) + Control 6 except for 9-D (7) FinalQuotePunctuation 7 except for BB (8) InitialQuotePunctuation 7 except for AB (8) ClosePunctuation 7 except for 232A (9) @@ -771,7 +804,7 @@ ConnectorPunctuation 7 except for FF65, 30FB, 2040 (A) OtherLetter 1, 7, 8 (1C0-1C2), C, 12-FF - MathSymbol 8, 9, 1 1 1 1, 7, A, C + MathSymbol 8, 9, 6, 7, A, C OtherSymbol 7, 9, A, C, E, F, <22, 52< CurrencySymbol A except for FF69,24,FF04 (7) and 9F2,9F3 (15) @@ -779,8 +812,8 @@ TitlecaseLetter E (no exception) UppercaseLetter E,F,10,11,21 except for 1BC (C) ModifierLetter 1, 7, E, 1F, FF - ModifierSymbol 1 1 1 1, 1, 7 - NonSpacingMark 1 1 1 1, 1, 13-1F + ModifierSymbol 1, 6, 7 + NonSpacingMark 1, 6, 13-1F OtherPunctuation 1, 7, A, 1F SpacingCombiningMark 1, 14-22 @@ -1038,6 +1071,9 @@ (UCD) is informative (it's informative but not normative to us) 
http://www.unicode.org/Public/UNIDATA/UCD.html + Decent char-by-char explanation is available here: + http://www.fileformat.info/info/unicode/ + Wine uses UCA default element table, but has windows-like character filterings support in their LCMapString implementation: http://cvs.winehq.com/cvsweb/wine/dlls/kernel/locale.c diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs b/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs index 460f40c8499..9d5184714d2 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs @@ -10,7 +10,7 @@ namespace Mono.Globalization.Unicode { switch (i) { case 0: - // No idea why each of those is ignored. + // No idea why they are ignored. case 0x2df: case 0x387: case 0x3d7: case 0x3d8: case 0x3d9: case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6: @@ -28,9 +28,14 @@ namespace Mono.Globalization.Unicode case 0xfffc: case 0xfffd: return true; // exceptional characters filtered by the - // following conditions (no idea why though). - case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9: - case 0x70f: case 0x3036: case 0x303f: + // following conditions. Originally those exceptional + // ranges are incorrect (they should not be ignored) + // and most of those characters are unfortunately in + // those ranges. 
+ case 0x4d8: case 0x4d9: + case 0x4e8: case 0x4e9: + case 0x70f: + case 0x3036: case 0x303f: case 0x337b: case 0xfb1e: return false; } @@ -364,84 +369,101 @@ namespace Mono.Globalization.Unicode return Normalization.ToWidthInsensitive (i); } - #region Level 3 properties (Case/Width) + #region Utilities - public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value + public static void GetPrimaryWeight (char c, bool variable, + out byte category, out byte value) { - // Korean - if (0x1100 <= c && c <= 0x11F9) - return 2; - if (0xFFA0 <= c && c <= 0xFFDC) - return 4; - if (0x3130 <= c && c <= 0x3164) - return 5; - // numbers - if (0x2776 <= c && c <= 0x277F) - return 4; - if (0x2780 <= c && c <= 0x2789) - return 8; - if (0x2776 <= c && c <= 0x2793) - return 0xC; - if (0x2160 <= c && c <= 0x216F) - return 0x10; - if (0x2181 <= c && c <= 0x2182) - return 0x10; - // Arabic - if (0x2135 <= c && c <= 0x2138) - return 4; - if (0xFE80 <= c && c <= 0xFE8E) - return GetArabicFormInPresentationB (c); - - // actually I dunno the reason why they have weights. 
- switch (c) { - case 0x01BC: - return 0x10; - case 0x06A9: - return 0x20; - case 0x06AA: - return 0x28; - } + } - byte ret = 0; + public static string GetExpansion (char c) + { switch (c) { - case 0x03C2: - case 0x2104: - case 0x212B: - ret |= 8; - break; - case 0xFE42: - ret |= 0xC; - break; - } - - // misc - switch (GetNormalizationType (c)) { - case 1: // - ret |= 1; - break; - case 2: // - ret |= 1; - break; - case 3: // - ret |= 0xE; - break; + case '\u00C6': + return "AE"; + case '\u00DE': + return "TH"; + case '\u00DF': + return "ss"; + case '\u00E6': + return "ae"; + case '\u00FE': + return "th"; + case '\u0132': + return "IJ"; + case '\u0133': + return "ij"; + case '\u0152': + return "OE"; + case '\u0153': + return "oe"; + case '\u01C4': + return "DZ\u030C"; // surprisingly Windows works fine here + case '\u01C5': + return "Dz\u030C"; + case '\u01C6': + return "dz\u030C"; + case '\u01C7': + return "LJ"; + case '\u01C8': + return "Lj"; + case '\u01C9': + return "lj"; + case '\u01CA': + return "NJ"; + case '\u01CB': + return "Nj"; + case '\u01CC': + return "nj"; + case '\u01E2': + return "A\u0304E\u0304"; // LAMESPEC: should be \u00C6\u0304 + case '\u01E3': + return "a\u0304e\u0304"; // LAMESPEC: should be \u00E6\u0304 + case '\u01F1': + return "DZ"; + case '\u01F2': + return "Dz"; + case '\u01F3': + return "dz"; + case '\u01FC': + return "A\u0301E\u0301"; // LAMESPEC: should be \u00C6\u0301 + case '\u01FD': + return "a\u0301e\u0301"; // LAMESPEC: should be \u00C6\u0301 + case '\u05F0': + return "\u05D5\u05D5"; + case '\u05F1': + return "\u05D5\u05D9"; + case '\u05F2': + return "\u05D9\u05D9"; + case '\uFB00': + return "ff"; + case '\uFB01': + return "fi"; + case '\uFB02': + return "fl"; } - if (IsSmallCapital (c)) // grep "SMALL CAPITAL" - ret |= 8; - if (IsUppercase (c)) // DerivedCoreProperties - ret |= 0x10; - - return ret; +// if ('\u1113' <= c && c <= '\u115F') Korean Jamo +// return true; + return null; } - - // TODO: implement 
GetArabicFormInRepresentationD(), - // GetNormalizationType(), IsSmallCapital() and IsUppercase(). - // (They can be easily to be generated.) - #endregion + #region Level 4 properties (Kana) + public static bool HasSpecialWeight (char c) + { + if (c < '\u3041') + return false; + else if (c < '\u3100') + return true; + else if (c < '\uFF60') + return false; + else if (c < '\uFF9F') + return true; + return true; + } + public static byte GetJapaneseDashType (char c) { switch (c) { @@ -497,7 +519,25 @@ namespace Mono.Globalization.Unicode return false; } - #endregion\ + #endregion + + + // 0 means no primary weight. 6 means variable weight + // For expanded character the value is 0. + // Those arrays will be split into blocks (<3400 and >F800) + byte [] categories; + byte [] level1; + byte [] level2; + byte [] level3; + // level 4 is computed. + + // public static bool HasSpecialWeight (char c) + // { return level1 [(int) c] == 6; } + + // + // Maybe autogenerated code or icall to fill array runs here + // } } + diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs new file mode 100644 index 00000000000..59c672aa88b --- /dev/null +++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs @@ -0,0 +1,380 @@ +// +// +// There are two kind of sort keys : which are computed and which are laid out +// as an indexed array. Computed sort keys are: +// +// - CJK, which largely vary depending on LCID (namely kr,jp,zh-CHS,zh-TW) +// - Surrogate +// - PrivateUse +// +// Also, for composite characters it should prepare different index table. +// +// Except for them, it should use precomputed index array. +// + +// +// * sortkey getter signature +// +// int GetSortKey (string s, int index, byte [] buf) +// Stores sort key for corresponding character element into buf and +// returns the length of the consumed _source_ character element in s. 
//
// * character length to consume; default implementation
//
//	If there is a diacritic after the base character, they are consumed
//	and they are considered as a part of the character element.
//

using System;
using System.Collections;
using System.Globalization;

namespace Mono.Globalization.Unicode
{
	// Proof-of-concept generator for the Windows-compatible collation
	// table. Run() fills "map" with one CharMapEntry per UTF-16 code unit.
	//
	// NOTE: this source is not self-contained. MSCompatGenerated,
	// AddLetterMap() and orderedCyrillic are referenced here but are not
	// defined in this file, and AddAlphaMap() is still a stub.
	internal class MSCompatSortKeyTableGenerator
	{
		public static void Main ()
		{
			new MSCompatSortKeyTableGenerator ().Run ();
		}

		// Next weight to hand out, indexed by category byte. It has 256
		// slots so that every possible byte value is a valid index.
		byte [] fillIndex = new byte [256]; // by category
		CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];

		char [] specialIgnore = new char [] {
			'\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
			'\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
			};

		// FIXME: need more love (as always)
		//
		// alphabets and alphaWeights are parallel arrays: alphaWeights [i]
		// is the weight passed to AddAlphaMap() for alphabets [i], so both
		// must have the same length.
		char [] alphabets = new char [] {
			'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
			'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
			'\u0292', '\u01BE', '\u0298'};
		// NOTE(review): the list used to have 28 entries for 29 letters
		// (nothing between 0xA6 and 0xA9). 0xA7 was inserted for 'Y' so
		// that 'Z' and the following letters line up again - confirm the
		// value against actual Windows sort keys.
		byte [] alphaWeights = new byte [] {
			2, 9, 0xA, 0x1A, 0x21, 0x23, 0x25, 0x2C, 0x32, 0x35, 0x36, 0x48, 0x51,
			0x70, 0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99, 0x9F, 0xA2, 0xA4, 0xA6, 0xA7, 0xA9,
			0xAA, 0xB3, 0xB4};


		// Fills the character map, one region per sort key category.
		public void Run ()
		{
			#region Specially ignored // 01
			// This will raise "Defined" flag up.
			foreach (char c in specialIgnore)
				map [(int) c] = new CharMapEntry (0, 0, 0);
			#endregion


			#region Variable weights
			// Controls : 06 03 - 06 3D
			//
			// U+0000 is skipped and only 0009-000D are excluded as
			// whitespaces, as described in Collation-notes.txt.
			// Char.IsWhiteSpace() cannot be used for the exclusion
			// because it is also true for U+0085 (NEL), which is
			// inside the 007F-009F control range handled here.
			fillIndex [6] = 3;
			for (int i = 1; i < 65536; i++) {
				if (0x9 <= i && i <= 0xD)
					continue;
				if (Char.GetUnicodeCategory ((char) i)
					== UnicodeCategory.Control)
					AddCharMap ((char) i, 6, true);
			}

			// Apostrophe 06 80 (hexadecimal, right before the
			// hyphens that start at 06 81).
			map ['\''] = new CharMapEntry (6, 0x80, 1);
			map ['\uFF07'] = new CharMapEntry (6, 0x80, 1); // full

			// Hyphen/Dash : 06 81 - 06 90
			fillIndex [6] = 0x81;
			for (int i = 0; i < 65536; i++) {
				if (Char.GetUnicodeCategory ((char) i)
					== UnicodeCategory.DashPunctuation)
					AddCharMapGroup ((char) i, 6, true, true);
			}

			// Arabic variable weight chars 06 A0 -
			fillIndex [6] = 0xA0;
			// vowels
			for (int i = 0x64B; i <= 0x650; i++)
				AddCharMapGroup ((char) i, 6, true, true);
			// NOTE(review): sukun and shadda are added without
			// incrementing the index for themselves, so they can end
			// up with the same weight - confirm this is intended.
			// sukun
			AddCharMapGroup ('\u0652', 6, false, true);
			// shadda
			AddCharMapGroup ('\u0651', 6, false, true);
			#endregion


			#region Nonspacing marks // 01
			// FIXME: 01 03 - 01 B6 ... annoyance :(

			// Combining diacritical marks: 01 DC -

			// LAMESPEC: It should not stop at '\u20E1'. There are
			// a few more characters (that however results in
			// overflow of level 2 unless we start before 0xDD).
			fillIndex [1] = 0xDC;
			for (int i = 0x20d0; i <= 0x20e1; i++)
				AddCharMap ((char) i, 1, true);
			#endregion


			#region Whitespaces // 07 03 -
			fillIndex [7] = 0x3;
			AddCharMapGroup (' ', 7, false, true);
			AddCharMap ('\u00A0', 7, true);
			for (int i = 9; i <= 0xD; i++)
				AddCharMap ((char) i, 7, true);
			for (int i = 0x2000; i <= 0x200B; i++)
				AddCharMap ((char) i, 7, true);
			AddCharMapGroup ('\u2028', 7, false, true);
			AddCharMapGroup ('\u2029', 7, false, true);

			// LAMESPEC: Windows developers seem to have thought
			// that those characters are kind of whitespaces,
			// while they aren't.
			AddCharMapGroup ('\u2422', 7, false, true); // blank symbol
			AddCharMapGroup ('\u2423', 7, false, true); // open box
			#endregion


			#region ASCII non-alphanumeric // 07
			// non-alphanumeric ASCII except for: + - < = > '
			for (int i = 0x21; i < 0x7F; i++) {
				if (Char.IsLetterOrDigit ((char) i)
					|| "+-<=>'".IndexOf ((char) i) >= 0)
					continue; // they are not added here.
				AddCharMapGroup ((char) i, 7, false, true);
			}
			#endregion


			// FIXME: for 07 xx we need more love.


			#region Numbers // 0C 02 - 0C E1
			// Numbers live in category 0C (see the region header).
			fillIndex [0xC] = 2;

			// 9F8 : Bengali "one less than the denominator"
			AddCharMap ('\u09F8', 0xC, true);

			ArrayList numbers = new ArrayList ();
			for (int i = 0; i < 65536; i++)
				if (Char.IsNumber ((char) i))
					numbers.Add (i);

			// Pair every number character with its numeric value. The
			// value is boxed as decimal because both the comparer and
			// the loop below unbox it as decimal.
			ArrayList numberValues = new ArrayList ();
			foreach (int i in numbers)
				numberValues.Add (new DictionaryEntry (i,
					(decimal) CharUnicodeInfo.GetNumericValue ((char) i)));
			numberValues.Sort (DictionaryValueComparer.Instance);

			// Characters that share a numeric value share a weight; the
			// index advances only when the value grows.
			decimal prevValue = -1;
			foreach (DictionaryEntry de in numberValues) {
				decimal currValue = (decimal) de.Value;
				if (prevValue < currValue) {
					prevValue = currValue;
					fillIndex [0xC] += 1;
				}
				AddCharMap ((char) ((int) de.Key), 0xC, false);
			}

			// 221E: infinity
			// No increment here: the index is already at 0xFF and
			// adding one more would wrap the byte around to 0.
			fillIndex [0xC] = 0xFF;
			AddCharMap ('\u221E', 0xC, false);
			#endregion


			#region Latin alphabets
			for (int i = 0; i < alphabets.Length; i++) {
				AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
			}
			#endregion

			#region Letters

			// Greek and Coptic
			// The upper bounds below are inclusive, following the
			// ranges in Collation-notes.txt (386-3CF, 460-481, 531-586).
			fillIndex [0xF] = 02;
			for (int i = 0x0380; i <= 0x03CF; i++)
				if (Char.IsLetter ((char) i))
					AddLetterMap ((char) i, 0xF, true);
			fillIndex [0xF] = 0x40;
			for (int i = 0x03D0; i < 0x0400; i++)
				if (Char.IsLetter ((char) i))
					AddLetterMap ((char) i, 0xF, true);

			// Cyrillic - UCA order w/ some modification
			fillIndex [0x10] = 0x3;
			// FIXME: For \u0400-\u045F we need "ordered Cyrillic"
			// table which is mostly from UCA DUCET.
			for (int i = 0; i < orderedCyrillic.Length; i++) {
				char c = orderedCyrillic [i];
				if (Char.IsLetter (c)) {
					AddLetterMap (c, 0x10, false);
					fillIndex [0x10] += 3;
				}
			}
			for (int i = 0x0460; i <= 0x0481; i++) {
				if (Char.IsLetter ((char) i)) {
					AddLetterMap ((char) i, 0x10, false);
					fillIndex [0x10] += 3;
				}
			}

			// Armenian
			fillIndex [0x11] = 0x3;
			for (int i = 0x0531; i <= 0x0586; i++)
				if (Char.IsLetter ((char) i))
					AddLetterMap ((char) i, 0x11, true);

			// Hebrew
			fillIndex [0x12] = 0x3;
			for (int i = 0x05D0; i < 0x05FF; i++)
				if (Char.IsLetter ((char) i))
					AddLetterMap ((char) i, 0x12, true);

			// Arabic

			#endregion
		}

		// Not implemented yet; Run() calls it for every entry of
		// "alphabets", so Run() currently throws at the Latin region.
		private void AddAlphaMap (char c, byte category, byte alphaWeight)
		{
			throw new NotImplementedException ();
		}

		// Orders DictionaryEntry items by their Value, which must be a
		// boxed decimal.
		class DictionaryValueComparer : IComparer
		{
			public static readonly DictionaryValueComparer Instance
				= new DictionaryValueComparer ();

			private DictionaryValueComparer ()
			{
			}

			public /*static*/ int Compare (object o1, object o2)
			{
				DictionaryEntry e1 = (DictionaryEntry) o1;
				DictionaryEntry e2 = (DictionaryEntry) o2;
				// FIXME: in case of 0, compare decomposition categories
				return Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
			}
		}

		// Adds c together with its small form and full width variants as
		// reported by MSCompatGenerated: the small form first (always
		// incrementing the index), then c itself (incrementing only when
		// updateIndexForSelf is true), then the full width variant,
		// recursively and without incrementing for itself. A variant of
		// char.MinValue is treated as "no such variant". "tail" selects
		// the *Tail flavor of the MSCompatGenerated lookups.
		private void AddCharMapGroup (char c, byte category, bool tail, bool updateIndexForSelf)
		{
			// update index
			char c2 = tail ?
				MSCompatGenerated.ToSmallFormTail (c) :
				MSCompatGenerated.ToSmallForm (c);
			if (c2 > char.MinValue)
				AddCharMap (c2, category, true);
			// itself
			AddCharMap (c, category, updateIndexForSelf);
			//
			c2 = tail ?
				MSCompatGenerated.ToFullWidthTail (c) :
				MSCompatGenerated.ToFullWidth (c);
			if (c2 > char.MinValue)
				AddCharMapGroup (c2, category, tail, false);
		}

		// Stores the current fill index of the category as the weight of
		// c and optionally advances the index. For category 1 the fill
		// index is the level 2 weight (level 1 is fixed to 1); for any
		// other category it is the level 1 weight (level 2 is fixed to 1).
		private void AddCharMap (char c, byte category, bool increment)
		{
			byte weight = fillIndex [category];
			map [(int) c] = category == 1 ?
				new CharMapEntry (category, 1, weight) :
				new CharMapEntry (category, weight, 1);
			if (increment)
				fillIndex [category] += 1;
		}

		#region Level 3 properties (Case/Width)

		// Returns the raw level 3 (case/width) weight of c.
		public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
		{
			// Korean
			if ('\u1100' <= c && c <= '\u11F9')
				return 2;
			if ('\uFFA0' <= c && c <= '\uFFDC')
				return 4;
			if ('\u3130' <= c && c <= '\u3164')
				return 5;
			// numbers
			if ('\u2776' <= c && c <= '\u277F')
				return 4;
			if ('\u2780' <= c && c <= '\u2789')
				return 8;
			// 2776-2789 already returned above, so only 278A-2793
			// can reach this test.
			if ('\u278A' <= c && c <= '\u2793')
				return 0xC;
			if ('\u2160' <= c && c <= '\u216F')
				return 0x18;
			if ('\u2181' <= c && c <= '\u2182')
				return 0x18;
			// Arabic
			if ('\u2135' <= c && c <= '\u2138')
				return 4;
			if ('\uFE80' <= c && c <= '\uFE8E')
				return MSCompatGenerated.GetArabicFormInPresentationB (c);

			// actually I dunno the reason why they have weights.
			switch (c) {
			case '\u01BC':
				return 0x10;
			case '\u06A9':
				return 0x20;
			case '\u06AA':
				return 0x28;
			}

			byte ret = 0;
			switch (c) {
			case '\u03C2':
			case '\u2104':
			case '\u212B':
				ret |= 8;
				break;
			case '\uFE42':
				ret |= 0xC;
				break;
			}

			// misc
			switch (MSCompatGenerated.GetNormalizationType (c)) {
			case 1: //
				ret |= 1;
				break;
			case 2: //
				ret |= 2;
				break;
			case 3: //
				ret |= 0xE;
				break;
			}
			if (MSCompatGenerated.IsSmallCapital (c)) // grep "SMALL CAPITAL"
				ret |= 8;
			if (MSCompatGenerated.IsUppercase (c)) // DerivedCoreProperties
				ret |= 0x10;

			return ret;
		}

		// TODO: implement GetArabicFormInPresentationB(),
		// GetNormalizationType(), IsSmallCapital() and IsUppercase().
		// (They can easily be generated.)

		#endregion

	}

	// Category and level 1/2 weights of a single character. "Defined" is
	// set only by the constructor, so default (CharMapEntry) values, such
	// as untouched slots of a freshly allocated array, report false.
	internal struct CharMapEntry
	{
		public readonly byte Category;
		public readonly byte Level1;
		public readonly byte Level2; // It is always single byte.
		public readonly bool Defined;

		public CharMapEntry (byte category, byte level1, byte level2)
		{
			Category = category;
			Level1 = level1;
			Level2 = level2;
			Defined = true;
		}
	}
}