2005-05-25 Atsushi Enomoto <atsushi@ximian.com>
authorAtsushi Eno <atsushieno@gmail.com>
Wed, 25 May 2005 16:42:33 +0000 (16:42 -0000)
committerAtsushi Eno <atsushieno@gmail.com>
Wed, 25 May 2005 16:42:33 +0000 (16:42 -0000)
* Collation-notes.txt : more info. Started letter sortkey analysis
  (some of other stuff are really non-understandable right now.)
* create-mscompat-collation-table.cs : table generator proof-of-
  concept source (not compilable).
* MSCompatUnicodeTable.cs : moved some code to the new source.
  Some more fixes.

svn path=/branches/atsushi/mcs/; revision=45005

mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs
mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs [new file with mode: 0644]

index c25faaf39a8c286d6d39bdfe8ad56ffa3d9d6aa4..e98eda9db396423bc60d32c26271d03d4d532b17 100644 (file)
@@ -1,3 +1,12 @@
+2005-05-25  Atsushi Enomoto  <atsushi@ximian.com>
+
+       * Collation-notes.txt : more info. Started letter sortkey analysis
+         (some of other stuff are really non-understandable right now.)
+       * create-mscompat-collation-table.cs : table generator proof-of-
+         concept source (not compilable).
+       * MSCompatUnicodeTable.cs : moved some code to the new source.
+         Some more fixes.
+
 2005-05-20  Atsushi Enomoto  <atsushi@ximian.com>
 
        * Collation-notes.txt : started level 2 weight analysis.
index b1dc50b64d2ce004be45d164046322d568bf5cd1..61d78e9ae2938761870d15f371f2eb8adb7bf43c 100644 (file)
 
 **** level 2
 
+       <del>
        For Japanese voice marks, it just sums the count up.
 
        There also seems special rule for Thai (E01-E4F) e.g. E47 works like
        Japanese voice marks.
 
        For other letters, there will be a table.
+       </del>
+
+       It looks like all level 2 keys are just accumulated, however without
+       considering overflow. It sometimes makes sense (e.g. diaeresis and
+       acute) but it causes many conflicts (e.g. "A\u0308\u0301" and "\u1EA6"
+       are incorrectly regarded as equal).
+
+       Since each Japanese voice mark has a level 2 value of 1, the
+       accumulated key just looked like a count of voice marks.
 
 **** level 3
 
 
                <primary category 0E : diacritics>
                Characters in non "0E" category are out of scope.
-               They could be grepped in UnicodeData.txt.
+               They can be grepped in UnicodeData.txt.
                -0E: acute
                -0F: grave
                -10: dot above
 
        1 specially ignored ones (Japanese, Tamil, Thai)
 
+               IdentifyBy: constants
                Unicode: 3099-309C, BCD, E47, E4C, FF9E, FF9F
                SortKey: 01 01 01 01 00
 
        2.1 control characters (specified as such in Unicode), except for
        whitespaces (0009-000D).
 
+               ProcessAfter: 4.1
+               IdentifyBy: UnicodeCategory.Control
                Unicode: 0001-000F minus 0009-000D, 007F-009F
-               SortKey: 06 80 07 06 03 00 - 06 80 07 06 3D 00
+               SortKey: 06 03 - 06 3D
 
        2.2 Apostrophe
+               IdentifyBy: constant
                Unicode: 0027,FF07 (')
-               SortKey: 06 80 (and nonspace equivalent)
+               SortKey: 06 80 (and width insensitive equivalents)
 
        2.3  minus sign, hyphen, dash
          minus signs: FE63, 207B (super), 208B (sub), 002D, FF0D (full-width)
          hyphens: 00AD (soft), 2010, 2011 (nonbreaking) ... Unicode HYPHEN?
          dashes, horizontal bars: FE58 ... UnicodeCategory.DashPunctuation
 
+               IdentifyBy: UnicodeCategory.DashPunctuation
                SortKey: 06 81 - 06 90 (and nonspace equivalents)
 
-       2.4 Arabic spacing and equivalents (64B-651, FE70-FE7F)
+       2.4 Arabic spacing and equivalents (64B-652, FE70-FE7F)
          They are part of nonspacing mark, but not equal.
 
                SortKey: 06 A0 - 06 A7 (and nonspace equivalents)
          (i.e. < 128) nor those equivalents
 
          NonSpacingMark which is ignorable (IsIgnorableNonSpacing())
-         // 30D, CD5-CD6, ABD, 2B9-2C1, 2C8, 2CB-2CD, 591-5C2. NonSpacingMark in
+         // 30D, CD5-CD6, ABD, 2B9-2C5, 2C8, 2CB-2CD, 591-5C2. NonSpacingMark in
          // 981-A3C. A4D, A70, A71, ABC ...
 
          TODO: I need more insight to write table generator.
          If in "discriminatory mode", those tables could be still provided
          as to be compatible to Windows.
 
+         Additionally there seem to be some bugs around Modifier letter collection.
+         For example, 2C6 should be a nonspacing diacritical character but it
+         is regarded as a primary character. The same applies to Mandarin
+         tone marks (2C9-2CB) (and there are plenty of such characters).
+
        4 space separators and some kind of marks
 
        4.1 whitespaces, paragraph separator etc.
        
          SortKey : 07 19 - 07 1A
 
-       4.3 other marks ('!', '^', ...)
-         Non-alpha-numeric < 0x7F except for '+' (math) and '-' (math/hyphen)
+       4.3 ASCII compatible marks ('!', '^', ...)
+         Non-alpha-numeric < 0x7F except for [[+-<=>']]
+         small compatibility equivalents -> itself, wide
+
+       4.3 other marks
+         FIXME: how to identify them?
          some Punctuations: InitialQuote/FinalQuote/Open/Close/Connector
          some OtherSymbols: 2400-2424
          3003, 3006, 2D0, 10FB
          (not Quotation_Mark property in PropList.txt ; 22, 27)
 
          byte area MathSymbol: 2B,3C,3D,3E,AB,B1,BB,D7,F7 except for AC
-         MathSymbol (2044, 208A, 208C, 207A, 207C)
+         some MathSymbol (2044, 208A, 208C, 207A, 207C)
          OtherLetter (1C0-1C2)
          2200-22FF MathSymbol except for 221E (INF. ; regarded as a number)
 
 
        6 Arrows and Box drawings
          09 02 .. 09 7C : 2300-237A
+                       only primary differences
          09 BC ... 09 FE : 25A0-AB, 25E7-EB, 25AC-B5, 25EC-EF, 25B6-B9,
                        25BC-C3, 25BA-25BB, 25C4-25D8, 25E6, 25DA-25E5
                        21*,25*,26*,27*
 
          This ordering has nothing to do with European Ordering Rules (EOR).
 
-       10 (F) greek letters
-         0F: 386-3F2
-         10: 400-4E9 exc. 482-486
-         11: 531-586 exc. 559-55F
-         12: 5D0-5F2
-         13: 621-64A, 670-6D3, 6D5
+       10 culture dependent letters (general)
+         0F: 386-3F2 ... Greek and Coptic
+               386-3CF: 0F 02 - 0F 19 (consider primary equivalents)
+               3D0-3EF: 0F 40 - 0F 54
+         10: 400-4E9 ... Cyrillic.
+               For 400-45F and 4B1, they are mostly UCA DUCET order.
+               After that 460-481 follows, by codepoint.
+               (490-4FF except for 4B1 and Cyrillic supplementary are unused.)
+         11: 531-586 ... Armenian.
+               Simply sorted by codepoint (handle case).
+         12: 5D0-5F2 ... Hebrew
+               Codepoint order (handle case).
+         13: 621-6D5 plus 670 (NonSpacingMark) ... Arabic
+
          14: 901-963 exc. 93C-93D 950-954
          15: 982-9FA exc. NonSpacingMark DecimalDigitNumber OtherNumber
          16: A05-A74 exc. A3C A4D A66-A71
          17: A81-AE0 exc. ABC-ABD
-         18: 
 
        ...
 
 
           3400-4DB5. Ordered, considering case/width equivalents.
 
-       20 (FF FF 01 01 01 01 00) Some supplemental Japanese/Arabic marks
+       20 (FF FF 01 01 01 01 00) Some Japanese/Arabic extenders
+          Actually FE7C and FE7D are not extenders in Unicode (PropList.txt)
 
           3005, 3031, 3032, 309D, 309E, 30FC, 30FD, 30FE, FE7C, FE7D, FF70
 
        - by UnicodeCategory -
 
-       DashPunctuation         1 1 1 1 (no exception)
+       DashPunctuation         6 (no exception)
        DecimalDigitNumber      C (no exception)
        EnclosingMark           1 E (no exception)
        Format                  7 (only 70F)
 
        OtherNumber             C(<3192), 9E-A7 (3124<)
 
-       Control                 1 1 1 1 except for 9-D (7)
+       Control                 6 except for 9-D (7)
        FinalQuotePunctuation   7 except for BB (8)
        InitialQuotePunctuation 7 except for AB (8)
        ClosePunctuation        7 except for 232A (9)
        ConnectorPunctuation    7 except for FF65, 30FB, 2040 (A)
 
        OtherLetter             1, 7, 8 (1C0-1C2), C, 12-FF
-       MathSymbol              8, 9, 1 1 1 1, 7, A, C
+       MathSymbol              8, 9, 6, 7, A, C
        OtherSymbol             7, 9, A, C, E, F, <22, 52<
        CurrencySymbol          A except for FF69,24,FF04 (7) and 9F2,9F3 (15)
 
        TitlecaseLetter         E (no exception)
        UppercaseLetter         E,F,10,11,21 except for 1BC (C)
        ModifierLetter          1, 7, E, 1F, FF
-       ModifierSymbol          1 1 1 1, 1, 7
-       NonSpacingMark          1 1 1 1, 1, 13-1F
+       ModifierSymbol          1, 6, 7
+       NonSpacingMark          1, 6, 13-1F
        OtherPunctuation        1, 7, A, 1F
        SpacingCombiningMark    1, 14-22
 
        (UCD) is informative (it's informative but not normative to us)
        http://www.unicode.org/Public/UNIDATA/UCD.html
 
+       A decent char-by-char explanation is available here:
+       http://www.fileformat.info/info/unicode/
+
        Wine uses UCA default element table, but has windows-like character
        filterings support in their LCMapString implementation:
        http://cvs.winehq.com/cvsweb/wine/dlls/kernel/locale.c
index 460f40c84994954f00414ce8b2a9ccccf7c4aa42..9d5184714d243ae5f6ae45582d932888da6761f5 100644 (file)
@@ -10,7 +10,7 @@ namespace Mono.Globalization.Unicode
                {
                        switch (i) {
                        case 0:
-                       // No idea why each of those is ignored.
+                       // No idea why they are ignored.
                        case 0x2df: case 0x387:
                        case 0x3d7: case 0x3d8: case 0x3d9:
                        case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
@@ -28,9 +28,14 @@ namespace Mono.Globalization.Unicode
                        case 0xfffc: case 0xfffd:
                                return true;
                        // exceptional characters filtered by the 
-                       // following conditions (no idea why though).
-                       case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9:
-                       case 0x70f: case 0x3036: case 0x303f:
+                       // following conditions. Originally those exceptional
+                       // ranges are incorrect (they should not be ignored)
+                       // and most of those characters are unfortunately in
+                       // those ranges.
+                       case 0x4d8: case 0x4d9:
+                       case 0x4e8: case 0x4e9:
+                       case 0x70f:
+                       case 0x3036: case 0x303f:
                        case 0x337b: case 0xfb1e:
                                return false;
                        }
@@ -364,84 +369,101 @@ namespace Mono.Globalization.Unicode
                        return Normalization.ToWidthInsensitive (i);
                }
 
-               #region Level 3 properties (Case/Width)
+               #region Utilities
 
-               public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
+               public static void GetPrimaryWeight (char c, bool variable,
+                       out byte category, out byte value)
                {
-                       // Korean
-                       if (0x1100 <= c && c <= 0x11F9)
-                               return 2;
-                       if (0xFFA0 <= c && c <= 0xFFDC)
-                               return 4;
-                       if (0x3130 <= c && c <= 0x3164)
-                               return 5;
-                       // numbers
-                       if (0x2776 <= c && c <= 0x277F)
-                               return 4;
-                       if (0x2780 <= c && c <= 0x2789)
-                               return 8;
-                       if (0x2776 <= c && c <= 0x2793)
-                               return 0xC;
-                       if (0x2160 <= c && c <= 0x216F)
-                               return 0x10;
-                       if (0x2181 <= c && c <= 0x2182)
-                               return 0x10;
-                       // Arabic
-                       if (0x2135 <= c && c <= 0x2138)
-                               return 4;
-                       if (0xFE80 <= c && c <= 0xFE8E)
-                               return GetArabicFormInPresentationB (c);
-
-                       // actually I dunno the reason why they have weights.
-                       switch (c) {
-                       case 0x01BC:
-                               return 0x10;
-                       case 0x06A9:
-                               return 0x20;
-                       case 0x06AA:
-                               return 0x28;
-                       }
+               }
 
-                       byte ret = 0;
+               public static string GetExpansion (char c)
+               {
                        switch (c) {
-                       case 0x03C2:
-                       case 0x2104:
-                       case 0x212B:
-                               ret |= 8;
-                               break;
-                       case 0xFE42:
-                               ret |= 0xC;
-                               break;
-                       }
-
-                       // misc
-                       switch (GetNormalizationType (c)) {
-                       case 1: // <full>
-                               ret |= 1;
-                               break;
-                       case 2: // <sub>
-                               ret |= 1;
-                               break;
-                       case 3: // <super>
-                               ret |= 0xE;
-                               break;
+                       case '\u00C6':
+                               return "AE";
+                       case '\u00DE':
+                               return "TH";
+                       case '\u00DF':
+                               return "ss";
+                       case '\u00E6':
+                               return "ae";
+                       case '\u00FE':
+                               return "th";
+                       case '\u0132':
+                               return "IJ";
+                       case '\u0133':
+                               return "ij";
+                       case '\u0152':
+                               return "OE";
+                       case '\u0153':
+                               return "oe";
+                       case '\u01C4':
+                               return "DZ\u030C"; // surprisingly Windows works fine here
+                       case '\u01C5':
+                               return "Dz\u030C";
+                       case '\u01C6':
+                               return "dz\u030C";
+                       case '\u01C7':
+                               return "LJ";
+                       case '\u01C8':
+                               return "Lj";
+                       case '\u01C9':
+                               return "lj";
+                       case '\u01CA':
+                               return "NJ";
+                       case '\u01CB':
+                               return "Nj";
+                       case '\u01CC':
+                               return "nj";
+                       case '\u01E2':
+                               return "A\u0304E\u0304"; // LAMESPEC: should be \u00C6\u0304
+                       case '\u01E3':
+                               return "a\u0304e\u0304"; // LAMESPEC: should be \u00E6\u0304
+                       case '\u01F1':
+                               return "DZ";
+                       case '\u01F2':
+                               return "Dz";
+                       case '\u01F3':
+                               return "dz";
+                       case '\u01FC':
+                               return "A\u0301E\u0301"; // LAMESPEC: should be \u00C6\u0301
+                       case '\u01FD':
+                               return "a\u0301e\u0301"; // LAMESPEC: should be \u00E6\u0301
+                       case '\u05F0':
+                               return "\u05D5\u05D5";
+                       case '\u05F1':
+                               return "\u05D5\u05D9";
+                       case '\u05F2':
+                               return "\u05D9\u05D9";
+                       case '\uFB00':
+                               return "ff";
+                       case '\uFB01':
+                               return "fi";
+                       case '\uFB02':
+                               return "fl";
                        }
-                       if (IsSmallCapital (c)) // grep "SMALL CAPITAL"
-                               ret |= 8;
-                       if (IsUppercase (c)) // DerivedCoreProperties
-                               ret |= 0x10;
-
-                       return ret;
+//                     if ('\u1113' <= c && c <= '\u115F') Korean Jamo
+//                             return true;
+                       return null;
                }
-
-               // TODO: implement GetArabicFormInRepresentationD(),
-               // GetNormalizationType(), IsSmallCapital() and IsUppercase().
-               // (They can be easily to be generated.)
-
                #endregion
 
+
                #region Level 4 properties (Kana)
 
+               // Returns true when c lies in U+3041-U+30FF or in
+               // U+FF60-U+FF9E, false for every other character.
+               public static bool HasSpecialWeight (char c)
+               {
+                       if (c < '\u3041')
+                               return false;
+                       else if (c < '\u3100')
+                               return true;
+                       else if (c < '\uFF60')
+                               return false;
+                       else if (c < '\uFF9F')
+                               return true;
+                       // This used to be "return true", which made the
+                       // branch above pointless and reported every
+                       // character from U+FF9F upwards as special.
+                       return false;
+               }
+
                public static byte GetJapaneseDashType (char c)
                {
                        switch (c) {
@@ -497,7 +519,25 @@ namespace Mono.Globalization.Unicode
                        return false;
                }
 
-               #endregion\
+               #endregion
+
+
+               // 0 means no primary weight. 6 means variable weight
+               // For expanded character the value is 0.
+               // Those arrays will be split into blocks (<3400 and >F800)
+               byte [] categories;
+               byte [] level1;
+               byte [] level2;
+               byte [] level3;
+               // level 4 is computed.
+
+               // public static bool HasSpecialWeight (char c)
+               // { return level1 [(int) c] == 6; }
+
+               //
+               // Maybe autogenerated code or icall to fill array runs here
+               //
        }
 }
 
+
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
new file mode 100644 (file)
index 0000000..59c672a
--- /dev/null
@@ -0,0 +1,380 @@
+//
+//
+// There are two kinds of sort keys: those that are computed and those that are
+// laid out as an indexed array. Computed sort keys are:
+//
+//     - CJK, which largely vary depending on LCID (namely kr,jp,zh-CHS,zh-TW)
+//     - Surrogate
+//     - PrivateUse
+//
+// Also, for composite characters it should prepare a different index table.
+//
+// Except for them, it should use precomputed index array.
+//
+
+//
+// * sortkey getter signature
+//
+//     int GetSortKey (string s, int index, byte [] buf)
+//     Stores sort key for corresponding character element into buf and
+//     returns the length of the consumed _source_ character element in s.
+//
+// * character length to consume; default implementation
+//
+//     If there is a diacritic after the base character, they are consumed
+//     and they are considered as a part of the character element.
+//
+
+using System;
+using System.Collections;
+using System.Globalization;
+
+namespace Mono.Globalization.Unicode
+{
+       internal class MSCompatSortKeyTableGenerator
+       {
+               // Entry point: creates one generator instance and runs it.
+               public static void Main ()
+               {
+                       MSCompatSortKeyTableGenerator generator =
+                               new MSCompatSortKeyTableGenerator ();
+                       generator.Run ();
+               }
+
+               // Next weight value to hand out, per sort key category.
+               // The category is a byte used directly as the index, so
+               // all 256 values need a slot; with the previous size of
+               // 255 an index of 0xFF would have been out of range.
+               byte [] fillIndex = new byte [256]; // by category
+               // One entry per UTF-16 code unit.
+               CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
+
+               // Characters that Run () maps to an all-zero entry.
+               char [] specialIgnore = new char [] {
+                       '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
+                       '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
+                       };
+
+               // FIXME: need more love (as always)
+               // NOTE(review): alphabets has 29 entries but alphaWeights
+               // has only 28, so indexing alphaWeights by position in
+               // alphabets (as Run () does) goes out of range on the
+               // last letter. The missing weight cannot be derived from
+               // this file -- confirm against the collation notes.
+               char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
+                       'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
+                       'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+                       '\u0292', '\u01BE', '\u0298'};
+               byte [] alphaWeights = new byte [] {2, 9, 0xA, 0x1A, 0x21,
+                       0x23, 0x25, 0x2C, 0x32, 0x35, 0x36, 0x48, 0x51, 0x70,
+                       0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99, 0x9F, 0xA2, 0xA4,
+                       0xA6, 0xA9, 0xAA, 0xB3, 0xB4};
+
+
+               // Fills "map" section by section: specially ignored
+               // characters, variable weights (category 06), part of the
+               // nonspacing marks (01), whitespace and ASCII marks (07),
+               // numbers (0C), Latin alphabets (0E) and Greek, Cyrillic,
+               // Armenian and Hebrew letters (0F-12). Each section sets
+               // fillIndex for its category before adding characters.
+               public void Run ()
+               {
+                       #region Specially ignored // 01
+                       // This will raise "Defined" flag up.
+                       foreach (char c in specialIgnore)
+                               map [(int) c] = new CharMapEntry (0, 0, 0);
+                       #endregion
+
+
+                       #region Variable weights
+                       // Controls : 06 03 - 06 3D
+                       fillIndex [6] = 3;
+                       for (int i = 0; i < 65536; i++) {
+                               char c = (char) i;
+                               UnicodeCategory uc = Char.GetUnicodeCategory (c);
+                               if (uc == UnicodeCategory.Control &&
+                                       !Char.IsWhiteSpace (c))
+                                       AddCharMap (c, 6, true);
+                       }
+
+                       // Apostrophe 06 80
+                       // The weight is hexadecimal 80 (it was written as
+                       // decimal 80), and the full-width apostrophe is
+                       // U+FF07 (U+FF63 is a halfwidth corner bracket).
+                       map ['\''] = new CharMapEntry (6, 0x80, 1);
+                       map ['\uFF07'] = new CharMapEntry (6, 0x80, 1); // full
+
+                       // Hyphen/Dash : 06 81 - 06 90
+                       fillIndex [6] = 0x81;
+                       for (int i = 0; i < 65536; i++) {
+                               if (Char.GetUnicodeCategory ((char) i)
+                                       == UnicodeCategory.DashPunctuation)
+                                       AddCharMapGroup ((char) i, 6, true, true);
+                       }
+
+                       // Arabic variable weight chars 06 A0 -
+                       fillIndex [6] = 0xA0;
+                       // vowels
+                       for (int i = 0x64B; i <= 0x650; i++)
+                               AddCharMapGroup ((char) i, 6, true, true);
+                       // sukun
+                       AddCharMapGroup ('\u0652', 6, false, true);
+                       // shadda
+                       AddCharMapGroup ('\u0651', 6, false, true);
+                       #endregion
+
+
+                       #region Nonspacing marks // 01
+                       // FIXME: 01 03 - 01 B6 ... annoyance :(
+
+                       // Combining diacritical marks: 01 DC -
+
+                       // LAMESPEC: It should not stop at '\u20E1'. There are
+                       // a few more characters (that however results in 
+                       // overflow of level 2 unless we start before 0xDD).
+                       fillIndex [1] = 0xDC;
+                       for (int i = 0x20d0; i <= 0x20e1; i++)
+                               AddCharMap ((char) i, 1, true);
+                       #endregion
+
+
+                       #region Whitespaces // 07 03 -
+                       fillIndex [7] = 0x3;
+                       AddCharMapGroup (' ', 7, false, true);
+                       AddCharMap ('\u00A0', 7, true);
+                       for (int i = 9; i <= 0xD; i++)
+                               AddCharMap ((char) i, 7, true);
+                       for (int i = 0x2000; i <= 0x200B; i++)
+                               AddCharMap ((char) i, 7, true);
+                       AddCharMapGroup ('\u2028', 7, false, true);
+                       AddCharMapGroup ('\u2029', 7, false, true);
+
+                       // LAMESPEC: Windows developers seem to have thought 
+                       // that those characters are kind of whitespaces,
+                       // while they aren't.
+                       AddCharMapGroup ('\u2422', 7, false, true); // blank symbol
+                       AddCharMapGroup ('\u2423', 7, false, true); // open box
+                       #endregion
+
+
+                       #region ASCII non-alphanumeric // 07
+                       // non-alphanumeric ASCII except for: + - < = > '
+                       for (int i = 0x21; i < 0x7F; i++) {
+                               if (Char.IsLetterOrDigit ((char) i)
+                                       || "+-<=>'".IndexOf ((char) i) >= 0)
+                                       continue; // they are not added here.
+                               AddCharMapGroup ((char) i, 7, false, true);
+                       }
+                       #endregion
+
+
+                       // FIXME: for 07 xx we need more love.
+
+
+                       #region Numbers // 0C 02 - 0C E1
+                       // Numbers live in category 0C (the code used 9,
+                       // contradicting the range stated above).
+                       fillIndex [0xC] = 2;
+
+                       // 9F8 : Bengali "one less than the denominator"
+                       AddCharMap ('\u09F8', 0xC, true);
+
+                       ArrayList numbers = new ArrayList ();
+                       for (int i = 0; i < 65536; i++)
+                               if (Char.IsNumber ((char) i))
+                                       numbers.Add (i);
+
+                       ArrayList numberValues = new ArrayList ();
+                       // GetDecimalValue () returns int; box it as decimal
+                       // because the comparer and the loop below unbox the
+                       // value with a (decimal) cast, which throws
+                       // InvalidCastException on a boxed int.
+                       foreach (int i in numbers)
+                               numberValues.Add (new DictionaryEntry (i, (decimal) CharUnicodeInfo.GetDecimalValue ((char) i)));
+                       numberValues.Sort (DictionaryValueComparer.Instance);
+                       decimal prevValue = -1;
+                       foreach (DictionaryEntry de in numberValues) {
+                               decimal currValue = (decimal) de.Value;
+                               // a new numeric value gets the next weight
+                               if (prevValue < currValue) {
+                                       prevValue = currValue;
+                                       fillIndex [0xC] += 1;
+                               }
+                               AddCharMap ((char) ((int) de.Key), 0xC, false);
+                       }
+
+                       // 221E: infinity
+                       fillIndex [0xC] = 0xFF;
+                       AddCharMap ('\u221E', 0xC, true);
+                       #endregion
+
+
+                       #region Latin alphabets
+                       for (int i = 0; i < alphabets.Length; i++) {
+                               AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
+                       }
+                       #endregion
+
+                       #region Letters
+
+                       // Greek and Coptic
+                       fillIndex [0xF] = 02;
+                       for (int i = 0x0380; i < 0x03CF; i++)
+                               if (Char.IsLetter ((char) i))
+                                       AddLetterMap ((char) i, 0xF, true);
+                       fillIndex [0xF] = 0x40;
+                       for (int i = 0x03D0; i < 0x0400; i++)
+                               if (Char.IsLetter ((char) i))
+                                       AddLetterMap ((char) i, 0xF, true);
+
+                       // Cyrillic - UCA order w/ some modification
+                       fillIndex [0x10] = 0x3;
+                       // FIXME: For \u0400-\u045F we need "ordered Cyrillic"
+                       // table which is mostly from UCA DUCET.
+                       for (int i = 0; i < orderedCyrillic.Length; i++) {
+                               char c = orderedCyrillic [i];
+                               if (Char.IsLetter (c)) {
+                                       AddLetterMap (c, 0x10, false);
+                                       fillIndex [0x10] += 3;
+                               }
+                       }
+                       // NOTE(review): the upper bounds here and in the
+                       // Armenian loop are exclusive, so U+0481 and U+0586
+                       // are skipped although Collation-notes.txt lists
+                       // 460-481 and 531-586 -- confirm which is intended.
+                       for (int i = 0x0460; i < 0x0481; i++) {
+                               if (Char.IsLetter ((char) i)) {
+                                       AddLetterMap ((char) i, 0x10, false);
+                                       fillIndex [0x10] += 3;
+                               }
+                       }
+
+                       // Armenian
+                       fillIndex [0x11] = 0x3;
+                       for (int i = 0x0531; i < 0x0586; i++)
+                               if (Char.IsLetter ((char) i))
+                                       AddLetterMap ((char) i, 0x11, true);
+
+                       // Hebrew
+                       fillIndex [0x12] = 0x3;
+                       for (int i = 0x05D0; i < 0x05FF; i++)
+                               if (Char.IsLetter ((char) i))
+                                       AddLetterMap ((char) i, 0x12, true);
+
+                       // Arabic
+
+                       #endregion
+               }
+
+               // Placeholder for mapping a Latin letter to its weight in
+               // the given category; not written yet, so every call throws
+               // NotImplementedException.
+               private void AddAlphaMap (char c, byte category, byte alphaWeight)
+               {
+                       throw new NotImplementedException ();
+               }
+
+               // Orders DictionaryEntry items by their Value, read as a
+               // decimal number. Keys are not looked at. Use the shared
+               // Instance; the constructor is private.
+               class DictionaryValueComparer : IComparer
+               {
+                       public static readonly DictionaryValueComparer Instance
+                               = new DictionaryValueComparer ();
+
+                       private DictionaryValueComparer ()
+                       {
+                       }
+
+                       // Returns the sign of (o1.Value - o2.Value). Both
+                       // arguments must be boxed DictionaryEntry values.
+                       public /*static*/ int Compare (object o1, object o2)
+                       {
+                               DictionaryEntry e1 = (DictionaryEntry) o1;
+                               DictionaryEntry e2 = (DictionaryEntry) o2;
+                               // Convert rather than unbox: a value may arrive
+                               // boxed as int (e.g. the result of
+                               // CharUnicodeInfo.GetDecimalValue), and a plain
+                               // (decimal) cast on a boxed int throws
+                               // InvalidCastException.
+                               // FIXME: in case of 0, compare decomposition categories
+                               return Decimal.Compare (
+                                       Convert.ToDecimal (e1.Value),
+                                       Convert.ToDecimal (e2.Value));
+                       }
+               }
+
+               // Adds c together with its width variants, in this order:
+               // the <small> form (if any; always advances the fill index),
+               // c itself (advances the index only if updateIndexForSelf),
+               // then the <full> form (if any) through a recursive call
+               // that does not advance the index for the full form itself.
+               // "tail" selects the *Tail lookups on MSCompatGenerated.
+               // A lookup result of char.MinValue means "no such variant".
+               private void AddCharMapGroup (char c, byte category, bool tail, bool updateIndexForSelf)
+               {
+                       char small;
+                       if (tail)
+                               small = MSCompatGenerated.ToSmallFormTail (c);
+                       else
+                               small = MSCompatGenerated.ToSmallForm (c);
+                       if (small != char.MinValue)
+                               AddCharMap (small, category, true);
+
+                       AddCharMap (c, category, updateIndexForSelf);
+
+                       char wide;
+                       if (tail)
+                               wide = MSCompatGenerated.ToFullWidthTail (c);
+                       else
+                               wide = MSCompatGenerated.ToFullWidth (c);
+                       if (wide != char.MinValue)
+                               AddCharMapGroup (wide, category, tail, false);
+               }
+
+               // Stores the entry for c using the category's current fill
+               // index and, when increment is set, advances that index.
+               // For category 1 (the nonspacing marks section of Run ())
+               // the fill index goes into the third CharMapEntry argument
+               // and the second is fixed at 1; for every other category
+               // it is the other way round.
+               private void AddCharMap (char c, byte category, bool increment)
+               {
+                       byte weight = fillIndex [category];
+                       // The previous code tested "category != 1" for the
+                       // third argument, so category 1 always got (1, 1, 1)
+                       // and never used its fill index, while every other
+                       // category got the fill index in both positions.
+                       if (category == 1)
+                               map [(int) c] = new CharMapEntry (category, 1, weight);
+                       else
+                               map [(int) c] = new CharMapEntry (category, weight, 1);
+                       if (increment)
+                               fillIndex [category] += 1;
+               }
+
+               #region Level 3 properties (Case/Width)
+
+               // Returns the raw level 3 (case/width) weight for c. Callers add
+               // 2 to obtain the actual sortkey value.
+               //
+               // Characters matched by the fixed ranges and special cases at
+               // the top get a constant weight (or, for U+FE80..U+FE8E, the
+               // value of MSCompatGenerated.GetArabicFormInPresentationB) and
+               // return immediately. For every other character the weight is
+               // the bitwise OR of flags taken from a few per-character
+               // exceptions, the normalization type, and the small-capital and
+               // uppercase properties.
+               public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
+               {
+                       // Korean
+                       if ('\u1100' <= c && c <= '\u11F9')
+                               return 2;
+                       if ('\uFFA0' <= c && c <= '\uFFDC')
+                               return 4;
+                       if ('\u3130' <= c && c <= '\u3164')
+                               return 5;
+                       // numbers
+                       if ('\u2776' <= c && c <= '\u277F')
+                               return 4;
+                       if ('\u2780' <= c && c <= '\u2789')
+                               return 8;
+                       // Lower bound was written as U+2776; U+2776..U+2789 are
+                       // already handled above, so only U+278A..U+2793 ever
+                       // reached this test.
+                       if ('\u278A' <= c && c <= '\u2793')
+                               return 0xC;
+                       if ('\u2160' <= c && c <= '\u216F')
+                               return 0x18;
+                       if ('\u2181' <= c && c <= '\u2182')
+                               return 0x18;
+                       // Arabic (U+2135..U+2138 are the ALEF..DALET letterlike
+                       // symbols; U+FE80..U+FE8E are in Arabic Presentation Forms-B)
+                       if ('\u2135' <= c && c <= '\u2138')
+                               return 4;
+                       if ('\uFE80' <= c && c <= '\uFE8E')
+                               return MSCompatGenerated.GetArabicFormInPresentationB (c);
+
+                       // actually I dunno the reason why they have weights.
+                       switch (c) {
+                       case '\u01BC':
+                               return 0x10;
+                       case '\u06A9':
+                               return 0x20;
+                       case '\u06AA':
+                               return 0x28;
+                       }
+
+                       // From here on the weight is accumulated as flags.
+                       byte ret = 0;
+                       switch (c) {
+                       case '\u03C2':
+                       case '\u2104':
+                       case '\u212B':
+                               ret |= 8;
+                               break;
+                       case '\uFE42':
+                               ret |= 0xC;
+                               break;
+                       }
+
+                       // misc
+                       switch (MSCompatGenerated.GetNormalizationType (c)) {
+                       case 1: // <full>
+                               ret |= 1;
+                               break;
+                       case 2: // <sub>
+                               ret |= 2;
+                               break;
+                       case 3: // <super>
+                               ret |= 0xE;
+                               break;
+                       }
+                       if (MSCompatGenerated.IsSmallCapital (c)) // grep "SMALL CAPITAL"
+                               ret |= 8;
+                       if (MSCompatGenerated.IsUppercase (c)) // DerivedCoreProperties
+                               ret |= 0x10;
+
+                       return ret;
+               }
+
+               // TODO: implement GetArabicFormInPresentationB(),
+               // GetNormalizationType(), IsSmallCapital() and IsUppercase().
+               // (They can easily be generated.)
+
+               #endregion
+
+       }
+
+       // Immutable per-character record: a category plus level 1 and
+       // level 2 values. Defined is true only for entries created through
+       // the three-argument constructor; a default-initialized entry has
+       // Defined == false and all other fields zero.
+       internal struct CharMapEntry
+       {
+               public readonly byte Category;
+               public readonly byte Level1;
+               public readonly byte Level2; // It is always single byte.
+               public readonly bool Defined;
+
+               // Creates a defined entry holding the given values as-is.
+               public CharMapEntry (byte category, byte level1, byte level2)
+               {
+                       this.Defined = true;
+                       this.Category = category;
+                       this.Level1 = level1;
+                       this.Level2 = level2;
+               }
+       }
+}