2005-05-25 Atsushi Enomoto <atsushi@ximian.com>

author Atsushi Eno <atsushieno@gmail.com>

Wed, 25 May 2005 16:42:33 +0000 (16:42 -0000)

committer Atsushi Eno <atsushieno@gmail.com>

Wed, 25 May 2005 16:42:33 +0000 (16:42 -0000)
author Atsushi Eno <atsushieno@gmail.com>
Wed, 25 May 2005 16:42:33 +0000 (16:42 -0000)
committer Atsushi Eno <atsushieno@gmail.com>
Wed, 25 May 2005 16:42:33 +0000 (16:42 -0000)
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog

index c25faaf39a8c286d6d39bdfe8ad56ffa3d9d6aa4..e98eda9db396423bc60d32c26271d03d4d532b17 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
@@ -1,3 +1,12 @@
+2005-05-25  Atsushi Enomoto  <atsushi@ximian.com>
+
+       * Collation-notes.txt : more info. Started letter sortkey analysis
+         (some of other stuff are really non-understandable right now.)
+       * create-mscompat-collation-table.cs : table generator proof-of-
+         concept source (not compilable).
+       * MSCompatUnicodeTable.cs : moved some code to the new source.
+         Some more fixes.
+
  2005-05-20  Atsushi Enomoto  <atsushi@ximian.com>
  
         * Collation-notes.txt : started level 2 weight analysis.
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt

index b1dc50b64d2ce004be45d164046322d568bf5cd1..61d78e9ae2938761870d15f371f2eb8adb7bf43c 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
@@ -210,12 +210,22 @@
  
  **** level 2
  
+       <del>
         For Japanese voice marks, it just sums the count up.
  
         There also seems special rule for Thai (E01-E4F) e.g. E47 works like
         Japanese voice marks.
  
         For other letters, there will be a table.
+       </del>
+
+       It looks like all level 2 keys are just accumulated, however without
+       considering overflow. It sometimes makes sense (e.g. diaeresis and
+       acute) but it causes many conflicts (e.g. "A\u0308\u0301" and "\u1EA6"
+       are incorrectly regarded as equal).
+
+       Anyway, since a Japanese voice mark has a level 2 value of 1, the
+       result just looked like the count of voice marks.
  
  **** level 3
  
@@ -404,7 +414,7 @@
  
                 <primary category 0E : diacritics>
                 Characters in non "0E" category are out of scope.
-               They could be grepped in UnicodeData.txt.
+               They can be grepped in UnicodeData.txt.
                 -0E: acute
                 -0F: grave
                 -10: dot above
@@ -535,6 +545,7 @@
  
         1 specially ignored ones (Japanese, Tamil, Thai)
  
+               IdentifyBy: constants
                 Unicode: 3099-309C, BCD, E47, E4C, FF9E, FF9F
                 SortKey: 01 01 01 01 00
  
@@ -546,21 +557,25 @@
         2.1 control characters (specified as such in Unicode), except for
         whitespaces (0009-000D).
  
+               ProcessAfter: 4.1
+               IdentifyBy: UnicodeCategory.Control
                 Unicode: 0001-000F minus 0009-000D, 007F-009F
-               SortKey: 06 80 07 06 03 00 - 06 80 07 06 3D 00
+               SortKey: 06 03 - 06 3D
  
         2.2 Apostrophe
+               IdentifyBy: constant
                 Unicode: 0027,FF07 (')
-               SortKey: 06 80 (and nonspace equivalent)
+               SortKey: 06 80 (and width insensitive equivalents)
  
         2.3  minus sign, hyphen, dash
           minus signs: FE63, 207B (super), 208B (sub), 002D, FF0D (full-width)
           hyphens: 00AD (soft), 2010, 2011 (nonbreaking) ... Unicode HYPHEN?
           dashes, horizontal bars: FE58 ... UnicodeCategory.DashPunctuation
  
+               IdentifyBy: UnicodeCategory.DashPunctuation
                 SortKey: 06 81 - 06 90 (and nonspace equivalents)
  
-       2.4 Arabic spacing and equivalents (64B-651, FE70-FE7F)
+       2.4 Arabic spacing and equivalents (64B-652, FE70-FE7F)
           They are part of nonspacing mark, but not equal.
  
                 SortKey: 06 A0 - 06 A7 (and nonspace equivalents)
@@ -571,7 +586,7 @@
           (i.e. < 128) nor those equivalents
  
           NonSpacingMark which is ignorable (IsIgnorableNonSpacing())
-         // 30D, CD5-CD6, ABD, 2B9-2C1, 2C8, 2CB-2CD, 591-5C2. NonSpacingMark in
+         // 30D, CD5-CD6, ABD, 2B9-2C5, 2C8, 2CB-2CD, 591-5C2. NonSpacingMark in
           // 981-A3C. A4D, A70, A71, ABC ...
  
           TODO: I need more insight to write table generator.
@@ -596,6 +611,11 @@
           If in "discriminatory mode", those tables could be still provided
           as to be compatible to Windows.
  
+         Additionally, there seem to be some bugs around Modifier letter
+         collection. For example, 2C6 should be a nonspacing diacritical
+         character, but it is regarded as a primary character. The same applies
+         to Mandarin tone marks (2C9-2CB), and there are plenty of such characters.
+
         4 space separators and some kind of marks
  
         4.1 whitespaces, paragraph separator etc.
@@ -607,8 +627,12 @@
         
           SortKey : 07 19 - 07 1A
  
-       4.3 other marks ('!', '^', ...)
-         Non-alpha-numeric < 0x7F except for '+' (math) and '-' (math/hyphen)
+       4.3 ASCII compatible marks ('!', '^', ...)
+         Non-alpha-numeric < 0x7F except for [[+-<=>']]
+         small compatibility equivalents -> itself, wide
+
+       4.3 other marks
+         FIXME: how to identify them?
           some Punctuations: InitialQuote/FinalQuote/Open/Close/Connector
           some OtherSymbols: 2400-2424
           3003, 3006, 2D0, 10FB
@@ -622,7 +646,7 @@
           (not Quotation_Mark property in PropList.txt ; 22, 27)
  
           byte area MathSymbol: 2B,3C,3D,3E,AB,B1,BB,D7,F7 except for AC
-         MathSymbol (2044, 208A, 208C, 207A, 207C)
+         some MathSymbol (2044, 208A, 208C, 207A, 207C)
           OtherLetter (1C0-1C2)
           2200-22FF MathSymbol except for 221E (INF. ; regarded as a number)
  
@@ -630,6 +654,7 @@
  
         6 Arrows and Box drawings
           09 02 .. 09 7C : 2300-237A
+                       only primary differences
           09 BC ... 09 FE : 25A0-AB, 25E7-EB, 25AC-B5, 25EC-EF, 25B6-B9,
                         25BC-C3, 25BA-25BB, 25C4-25D8, 25E6, 25DA-25E5
                         21*,25*,26*,27*
@@ -674,17 +699,24 @@
  
           This ordering is nothing to do with European Ordering Rules (EOR).
  
-       10 (F) greek letters
-         0F: 386-3F2
-         10: 400-4E9 exc. 482-486
-         11: 531-586 exc. 559-55F
-         12: 5D0-5F2
-         13: 621-64A, 670-6D3, 6D5
+       10 culture dependent letters (general)
+         0F: 386-3F2 ... Greek and Coptic
+               386-3CF: 0F 02 - 0F 19 (consider primary equivalents)
+               3D0-3EF: 0F 40 - 0F 54
+         10: 400-4E9 ... Cyrillic.
+               For 400-45F and 4B1, they are mostly UCA DUCET order.
+               After that 460-481 follows, by codepoint.
+               (490-4FF except for 4B1 and Cyrillic supplementary are unused.)
+         11: 531-586 ... Armenian.
+               Simply sorted by codepoint (handle case).
+         12: 5D0-5F2 ... Hebrew
+               Codepoint order (handle case).
+         13: 621-6D5 plus 670 (NonSpacingMark) ... Arabic
+
           14: 901-963 exc. 93C-93D 950-954
           15: 982-9FA exc. NonSpacingMark DecimalDigitNumber OtherNumber
           16: A05-A74 exc. A3C A4D A66-A71
           17: A81-AE0 exc. ABC-ABD
-         18: 
  
         ...
  
@@ -744,13 +776,14 @@
  
            3400-4DB5. Ordered, considering case/width equivalents.
  
-       20 (FF FF 01 01 01 01 00) Some supplemental Japanese/Arabic marks
+       20 (FF FF 01 01 01 01 00) Some Japanese/Arabic extenders
+          Actually FE7C and FE7D are not extenders in Unicode (PropList.txt)
  
            3005, 3031, 3032, 309D, 309E, 30FC, 30FD, 30FE, FE7C, FE7D, FF70
  
         - by UnicodeCategory -
  
-       DashPunctuation         1 1 1 1 (no exception)
+       DashPunctuation         6 (no exception)
         DecimalDigitNumber      C (no exception)
         EnclosingMark           1 E (no exception)
         Format                  7 (only 70F)
@@ -763,7 +796,7 @@
  
         OtherNumber             C(<3192), 9E-A7 (3124<)
  
-       Control                 1 1 1 1 except for 9-D (7)
+       Control                 6 except for 9-D (7)
         FinalQuotePunctuation   7 except for BB (8)
         InitialQuotePunctuation 7 except for AB (8)
         ClosePunctuation        7 except for 232A (9)
@@ -771,7 +804,7 @@
         ConnectorPunctuation    7 except for FF65, 30FB, 2040 (A)
  
         OtherLetter             1, 7, 8 (1C0-1C2), C, 12-FF
-       MathSymbol              8, 9, 1 1 1 1, 7, A, C
+       MathSymbol              8, 9, 6, 7, A, C
         OtherSymbol             7, 9, A, C, E, F, <22, 52<
         CurrencySymbol          A except for FF69,24,FF04 (7) and 9F2,9F3 (15)
  
@@ -779,8 +812,8 @@
         TitlecaseLetter         E (no exception)
         UppercaseLetter         E,F,10,11,21 except for 1BC (C)
         ModifierLetter          1, 7, E, 1F, FF
-       ModifierSymbol          1 1 1 1, 1, 7
-       NonSpacingMark          1 1 1 1, 1, 13-1F
+       ModifierSymbol          1, 6, 7
+       NonSpacingMark          1, 6, 13-1F
         OtherPunctuation        1, 7, A, 1F
         SpacingCombiningMark    1, 14-22
  
@@ -1038,6 +1071,9 @@
         (UCD) is informative (it's informative but not normative to us)
         http://www.unicode.org/Public/UNIDATA/UCD.html
  
+       A decent char-by-char explanation is available here:
+       http://www.fileformat.info/info/unicode/
+
         Wine uses UCA default element table, but has windows-like character
         filterings support in their LCMapString implementation:
         http://cvs.winehq.com/cvsweb/wine/dlls/kernel/locale.c
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs b/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs

index 460f40c84994954f00414ce8b2a9ccccf7c4aa42..9d5184714d243ae5f6ae45582d932888da6761f5 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs
@@ -10,7 +10,7 @@ namespace Mono.Globalization.Unicode
                 {
                         switch (i) {
                         case 0:
-                       // No idea why each of those is ignored.
+                       // No idea why they are ignored.
                         case 0x2df: case 0x387:
                         case 0x3d7: case 0x3d8: case 0x3d9:
                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
@@ -28,9 +28,14 @@ namespace Mono.Globalization.Unicode
                         case 0xfffc: case 0xfffd:
                                 return true;
                         // exceptional characters filtered by the 
-                       // following conditions (no idea why though).
-                       case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9:
-                       case 0x70f: case 0x3036: case 0x303f:
+                       // following conditions. Originally those exceptional
+                       // ranges are incorrect (they should not be ignored)
+                       // and most of those characters are unfortunately in
+                       // those ranges.
+                       case 0x4d8: case 0x4d9:
+                       case 0x4e8: case 0x4e9:
+                       case 0x70f:
+                       case 0x3036: case 0x303f:
                         case 0x337b: case 0xfb1e:
                                 return false;
                         }
@@ -364,84 +369,101 @@ namespace Mono.Globalization.Unicode
                         return Normalization.ToWidthInsensitive (i);
                 }
  
-               #region Level 3 properties (Case/Width)
+               #region Utilities
  
-               public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
+               public static void GetPrimaryWeight (char c, bool variable,
+                       out byte category, out byte value)
                 {
-                       // Korean
-                       if (0x1100 <= c && c <= 0x11F9)
-                               return 2;
-                       if (0xFFA0 <= c && c <= 0xFFDC)
-                               return 4;
-                       if (0x3130 <= c && c <= 0x3164)
-                               return 5;
-                       // numbers
-                       if (0x2776 <= c && c <= 0x277F)
-                               return 4;
-                       if (0x2780 <= c && c <= 0x2789)
-                               return 8;
-                       if (0x2776 <= c && c <= 0x2793)
-                               return 0xC;
-                       if (0x2160 <= c && c <= 0x216F)
-                               return 0x10;
-                       if (0x2181 <= c && c <= 0x2182)
-                               return 0x10;
-                       // Arabic
-                       if (0x2135 <= c && c <= 0x2138)
-                               return 4;
-                       if (0xFE80 <= c && c <= 0xFE8E)
-                               return GetArabicFormInPresentationB (c);
-
-                       // actually I dunno the reason why they have weights.
-                       switch (c) {
-                       case 0x01BC:
-                               return 0x10;
-                       case 0x06A9:
-                               return 0x20;
-                       case 0x06AA:
-                               return 0x28;
-                       }
+               }
  
-                       byte ret = 0;
+               public static string GetExpansion (char c)
+               {
                         switch (c) {
-                       case 0x03C2:
-                       case 0x2104:
-                       case 0x212B:
-                               ret |= 8;
-                               break;
-                       case 0xFE42:
-                               ret |= 0xC;
-                               break;
-                       }
-
-                       // misc
-                       switch (GetNormalizationType (c)) {
-                       case 1: // <full>
-                               ret |= 1;
-                               break;
-                       case 2: // <sub>
-                               ret |= 1;
-                               break;
-                       case 3: // <super>
-                               ret |= 0xE;
-                               break;
+                       case '\u00C6':
+                               return "AE";
+                       case '\u00DE':
+                               return "TH";
+                       case '\u00DF':
+                               return "ss";
+                       case '\u00E6':
+                               return "ae";
+                       case '\u00FE':
+                               return "th";
+                       case '\u0132':
+                               return "IJ";
+                       case '\u0133':
+                               return "ij";
+                       case '\u0152':
+                               return "OE";
+                       case '\u0153':
+                               return "oe";
+                       case '\u01C4':
+                               return "DZ\u030C"; // surprisingly Windows works fine here
+                       case '\u01C5':
+                               return "Dz\u030C";
+                       case '\u01C6':
+                               return "dz\u030C";
+                       case '\u01C7':
+                               return "LJ";
+                       case '\u01C8':
+                               return "Lj";
+                       case '\u01C9':
+                               return "lj";
+                       case '\u01CA':
+                               return "NJ";
+                       case '\u01CB':
+                               return "Nj";
+                       case '\u01CC':
+                               return "nj";
+                       case '\u01E2':
+                               return "A\u0304E\u0304"; // LAMESPEC: should be \u00C6\u0304
+                       case '\u01E3':
+                               return "a\u0304e\u0304"; // LAMESPEC: should be \u00E6\u0304
+                       case '\u01F1':
+                               return "DZ";
+                       case '\u01F2':
+                               return "Dz";
+                       case '\u01F3':
+                               return "dz";
+                       case '\u01FC':
+                               return "A\u0301E\u0301"; // LAMESPEC: should be \u00C6\u0301
+                       case '\u01FD':
+                               return "a\u0301e\u0301"; // LAMESPEC: should be \u00E6\u0301
+                       case '\u05F0':
+                               return "\u05D5\u05D5";
+                       case '\u05F1':
+                               return "\u05D5\u05D9";
+                       case '\u05F2':
+                               return "\u05D9\u05D9";
+                       case '\uFB00':
+                               return "ff";
+                       case '\uFB01':
+                               return "fi";
+                       case '\uFB02':
+                               return "fl";
                         }
-                       if (IsSmallCapital (c)) // grep "SMALL CAPITAL"
-                               ret |= 8;
-                       if (IsUppercase (c)) // DerivedCoreProperties
-                               ret |= 0x10;
-
-                       return ret;
+//                     if ('\u1113' <= c && c <= '\u115F') Korean Jamo
+//                             return true;
+                       return null;
                 }
-
-               // TODO: implement GetArabicFormInRepresentationD(),
-               // GetNormalizationType(), IsSmallCapital() and IsUppercase().
-               // (They can be easily to be generated.)
-
                 #endregion
  
+
                 #region Level 4 properties (Kana)
  
+               // Returns true when c lies in one of the two half-open ranges
+               // [U+3041, U+3100) or [U+FF60, U+FF9F), and false otherwise.
+               // The previous version fell through to "return true", so every
+               // character at or above U+FF60 was reported as special and the
+               // upper bound check had no effect.
+               // NOTE(review): the ranges look like the kana and halfwidth kana
+               // areas; U+FF9F stays excluded exactly as the original bound
+               // wrote it - confirm whether U+FF9E should be excluded as well.
+               public static bool HasSpecialWeight (char c)
+               {
+                       if ('\u3041' <= c && c < '\u3100')
+                               return true;
+                       if ('\uFF60' <= c && c < '\uFF9F')
+                               return true;
+                       return false;
+               }
+
                 public static byte GetJapaneseDashType (char c)
                 {
                         switch (c) {
@@ -497,7 +519,25 @@ namespace Mono.Globalization.Unicode
                         return false;
                 }
  
-               #endregion\
+               #endregion
+
+
+               // 0 means no primary weight. 6 means variable weight
+               // For expanded character the value is 0.
+               // Those arrays will be split into blocks (<3400 and >F800)
+               byte [] categories;
+               byte [] level1;
+               byte [] level2;
+               byte [] level3;
+               // level 4 is computed.
+
+               // public static bool HasSpecialWeight (char c)
+               // { return level1 [(int) c] == 6; }
+
+               //
+               // Maybe autogenerated code or icall to fill array runs here
+               //
         }
  }
  
+
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

new file mode 100644 (file)

index 0000000..59c672a
--- /dev/null
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
@@ -0,0 +1,380 @@
+//
+//
+// There are two kinds of sort keys: those which are computed and those which
+// are laid out as an indexed array. Computed sort keys are:
+//
+//     - CJK, which largely vary depending on LCID (namely kr,jp,zh-CHS,zh-TW)
+//     - Surrogate
+//     - PrivateUse
+//
+// Also, for composite characters it should prepare different index table.
+//
+// Except for them, it should use precomputed index array.
+//
+
+//
+// * sortkey getter signature
+//
+//     int GetSortKey (string s, int index, byte [] buf)
+//     Stores sort key for corresponding character element into buf and
+//     returns the length of the consumed _source_ character element in s.
+//
+// * character length to consume; default implementation
+//
+//     If there is a diacritic after the base character, they are consumed
+//     and they are considered as a part of the character element.
+//
+
+using System;
+using System.Collections;
+using System.Globalization;
+
+namespace Mono.Globalization.Unicode
+{
+       internal class MSCompatSortKeyTableGenerator
+       {
+               // Program entry point: creates a generator instance and runs it.
+               public static void Main ()
+               {
+                       MSCompatSortKeyTableGenerator generator =
+                               new MSCompatSortKeyTableGenerator ();
+                       generator.Run ();
+               }
+
+               // Running weight value per category; AddCharMap reads and
+               // increments fillIndex [category]. Sized 256 so that every
+               // possible byte category (0x00-0xFF) is a valid index.
+               byte [] fillIndex = new byte [256]; // by category
+               // One entry per UTF-16 code unit.
+               CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
+
+               // Characters registered with an all-zero entry at the start of Run.
+               char [] specialIgnore = new char [] {
+                       '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
+                       '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
+                       };
+
+               // FIXME: need more love (as always)
+               // FIXME: alphabets has 29 entries while alphaWeights has only 28,
+               // so the loop in Run that indexes both arrays by the same position
+               // runs out of range on the last entry. One weight is missing.
+               char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
+                       'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
+                       'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+                       '\u0292', '\u01BE', '\u0298'};
+               byte [] alphaWeights = new byte [] {2, 9, 0xA, 0x1A, 0x21,
+                       0x23, 0x25, 0x2C, 0x32, 0x35, 0x36, 0x48, 0x51, 0x70,
+                       0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99, 0x9F, 0xA2, 0xA4,
+                       0xA6, 0xA9, 0xAA, 0xB3, 0xB4};
+
+
+               // Fills the character map region by region. Each region first
+               // resets fillIndex for its category and then registers characters
+               // through AddCharMap / AddCharMapGroup / AddLetterMap.
+               // The order of the regions matters because later assignments to
+               // map [] overwrite earlier ones.
+               public void Run ()
+               {
+                       UnicodeCategory uc;
+
+                       #region Specially ignored // 01
+                       // This will raise "Defined" flag up.
+                       foreach (char c in specialIgnore)
+                               map [(int) c] = new CharMapEntry (0, 0, 0);
+                       #endregion
+
+
+                       #region Variable weights
+                       // Controls : 06 03 - 06 3D
+                       fillIndex [6] = 3;
+                       for (int i = 0; i < 65536; i++) {
+                               char c = (char) i;
+                               uc = Char.GetUnicodeCategory (c);
+                               if (uc == UnicodeCategory.Control &&
+                                       !Char.IsWhiteSpace (c))
+                                       AddCharMap (c, 6, true);
+                       }
+
+                       // Apostrophe 06 80
+                       // (the weight is hexadecimal 0x80; hyphens follow at 0x81)
+                       map ['\''] = new CharMapEntry (6, 0x80, 1);
+                       map ['\uFF07'] = new CharMapEntry (6, 0x80, 1); // full
+
+                       // Hyphen/Dash : 06 81 - 06 90
+                       fillIndex [6] = 0x81;
+                       for (int i = 0; i < 65536; i++) {
+                               if (Char.GetUnicodeCategory ((char) i)
+                                       == UnicodeCategory.DashPunctuation)
+                                       AddCharMapGroup ((char) i, 6, true, true);
+                       }
+
+                       // Arabic variable weight chars 06 A0 -
+                       fillIndex [6] = 0xA0;
+                       // vowels
+                       for (int i = 0x64B; i <= 0x650; i++)
+                               AddCharMapGroup ((char) i, 6, true, true);
+                       // sukun
+                       AddCharMapGroup ('\u0652', 6, false, true);
+                       // shadda
+                       AddCharMapGroup ('\u0651', 6, false, true);
+                       #endregion
+
+
+                       #region Nonspacing marks // 01
+                       // FIXME: 01 03 - 01 B6 ... annoyance :(
+
+                       // Combining diacritical marks: 01 DC -
+
+                       // LAMESPEC: It should not stop at '\u20E1'. There are
+                       // a few more characters (that however results in 
+                       // overflow of level 2 unless we start before 0xDD).
+                       fillIndex [1] = 0xDC;
+                       for (int i = 0x20d0; i <= 0x20e1; i++)
+                               AddCharMap ((char) i, 1, true);
+                       #endregion
+
+
+                       #region Whitespaces // 07 03 -
+                       fillIndex [7] = 0x3;
+                       AddCharMapGroup (' ', 7, false, true);
+                       AddCharMap ('\u00A0', 7, true);
+                       for (int i = 9; i <= 0xD; i++)
+                               AddCharMap ((char) i, 7, true);
+                       for (int i = 0x2000; i <= 0x200B; i++)
+                               AddCharMap ((char) i, 7, true);
+                       AddCharMapGroup ('\u2028', 7, false, true);
+                       AddCharMapGroup ('\u2029', 7, false, true);
+
+                       // LAMESPEC: Windows developers seem to have thought 
+                       // that those characters are kind of whitespaces,
+                       // while they aren't.
+                       AddCharMapGroup ('\u2422', 7, false, true); // blank symbol
+                       AddCharMapGroup ('\u2423', 7, false, true); // open box
+                       #endregion
+
+
+                       #region ASCII non-alphanumeric // 07
+                       // non-alphanumeric ASCII except for: + - < = > '
+                       for (int i = 0x21; i < 0x7F; i++) {
+                               if (Char.IsLetterOrDigit ((char) i)
+                                       || "+-<=>'".IndexOf ((char) i) >= 0)
+                                       continue; // they are not added here.
+                               AddCharMapGroup ((char) i, 7, false, true);
+                       }
+                       #endregion
+
+
+                       // FIXME: for 07 xx we need more love.
+
+
+                       #region Numbers // 0C 02 - 0C E1
+                       // Numbers belong to category 0C (not 09, which holds
+                       // arrows and box drawings).
+                       fillIndex [0xC] = 2;
+
+                       // 9F8 : Bengali "one less than the denominator"
+                       AddCharMap ('\u09F8', 0xC, true);
+
+                       ArrayList numbers = new ArrayList ();
+                       for (int i = 0; i < 65536; i++)
+                               if (Char.IsNumber ((char) i))
+                                       numbers.Add (i);
+
+                       ArrayList numberValues = new ArrayList ();
+                       foreach (int i in numbers)
+                               numberValues.Add (new DictionaryEntry (i, CharUnicodeInfo.GetDecimalValue ((char) i)));
+                       numberValues.Sort (DictionaryValueComparer.Instance);
+                       // Characters with the same numeric value share one
+                       // weight: the index only advances when the value grows.
+                       decimal prevValue = -1;
+                       foreach (DictionaryEntry de in numberValues) {
+                               decimal currValue = (decimal) de.Value;
+                               if (prevValue < currValue) {
+                                       prevValue = currValue;
+                                       fillIndex [0xC] += 1;
+                               }
+                               AddCharMap ((char) ((int) de.Key), 0xC, false);
+                       }
+
+                       // 221E: infinity
+                       fillIndex [0xC] = 0xFF;
+                       AddCharMap ('\u221E', 0xC, true);
+                       #endregion
+
+
+                       #region Latin alphabets
+                       for (int i = 0; i < alphabets.Length; i++) {
+                               AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
+                       }
+                       #endregion
+
+                       #region Letters
+
+                       // Greek and Coptic
+                       fillIndex [0xF] = 02;
+                       for (int i = 0x0380; i < 0x03CF; i++)
+                               if (Char.IsLetter ((char) i))
+                                       AddLetterMap ((char) i, 0xF, true);
+                       fillIndex [0xF] = 0x40;
+                       for (int i = 0x03D0; i < 0x0400; i++)
+                               if (Char.IsLetter ((char) i))
+                                       AddLetterMap ((char) i, 0xF, true);
+
+                       // Cyrillic - UCA order w/ some modification
+                       fillIndex [0x10] = 0x3;
+                       // FIXME: For \u0400-\u045F we need "ordered Cyrillic"
+                       // table which is mostly from UCA DUCET.
+                       for (int i = 0; i < orderedCyrillic.Length; i++) {
+                               char c = orderedCyrillic [i];
+                               if (Char.IsLetter (c)) {
+                                       AddLetterMap (c, 0x10, false);
+                                       fillIndex [0x10] += 3;
+                               }
+                       }
+                       for (int i = 0x0460; i < 0x0481; i++) {
+                               if (Char.IsLetter ((char) i)) {
+                                       AddLetterMap ((char) i, 0x10, false);
+                                       fillIndex [0x10] += 3;
+                               }
+                       }
+
+                       // Armenian
+                       fillIndex [0x11] = 0x3;
+                       for (int i = 0x0531; i < 0x0586; i++)
+                               if (Char.IsLetter ((char) i))
+                                       AddLetterMap ((char) i, 0x11, true);
+
+                       // Hebrew
+                       fillIndex [0x12] = 0x3;
+                       for (int i = 0x05D0; i < 0x05FF; i++)
+                               if (Char.IsLetter ((char) i))
+                                       AddLetterMap ((char) i, 0x12, true);
+
+                       // Arabic
+
+                       #endregion
+               }
+
+               // Placeholder for registering a Latin alphabet character with the
+               // given category and weight. Not implemented yet: it always throws
+               // NotImplementedException, so the "Latin alphabets" loop in Run
+               // cannot complete.
+               private void AddAlphaMap (char c, byte category, byte alphaWeight)
+               {
+                       throw new NotImplementedException ();
+               }
+
+               // Singleton IComparer that orders DictionaryEntry items by their
+               // Value, which must be a boxed decimal. Keys are not looked at.
+               class DictionaryValueComparer : IComparer
+               {
+                       public static readonly DictionaryValueComparer Instance
+                               = new DictionaryValueComparer ();
+
+                       // Only reachable through Instance.
+                       private DictionaryValueComparer ()
+                       {
+                       }
+
+                       public /*static*/ int Compare (object o1, object o2)
+                       {
+                               DictionaryEntry left = (DictionaryEntry) o1;
+                               DictionaryEntry right = (DictionaryEntry) o2;
+                               decimal leftValue = (decimal) left.Value;
+                               decimal rightValue = (decimal) right.Value;
+                               // FIXME: in case of 0, compare decomposition categories
+                               return leftValue.CompareTo (rightValue);
+                       }
+               }
+
+               // Registers c together with its width variants, in this order:
+               // the <small> form (if any), c itself, then the <full> form (if
+               // any) as a group of its own. When tail is true the *Tail lookup
+               // helpers are used. A lookup result of char.MinValue means
+               // "no such variant".
+               private void AddCharMapGroup (char c, byte category, bool tail, bool updateIndexForSelf)
+               {
+                       // <small> : always advances the index
+                       char small;
+                       if (tail)
+                               small = MSCompatGenerated.ToSmallFormTail (c);
+                       else
+                               small = MSCompatGenerated.ToSmallForm (c);
+                       if (small != char.MinValue)
+                               AddCharMap (small, category, true);
+
+                       // the character itself
+                       AddCharMap (c, category, updateIndexForSelf);
+
+                       // <full> : handled recursively, never advancing the
+                       // index for the full-width character itself
+                       char full;
+                       if (tail)
+                               full = MSCompatGenerated.ToFullWidthTail (c);
+                       else
+                               full = MSCompatGenerated.ToFullWidth (c);
+                       if (full != char.MinValue)
+                               AddCharMapGroup (full, category, tail, false);
+               }
+
+               // Stores the map entry for c in the given category, using that
+               // category's current fill index as its weight, and advances the
+               // fill index by one when increment is true.
+               //
+               // Exactly one of the two levels carries the fill index:
+               //   category 1      -> Level1 = 1,          Level2 = fill index
+               //   other category  -> Level1 = fill index, Level2 = 1
+               private void AddCharMap (char c, byte category, bool increment)
+               {
+                       // Both conditionals test "category == 1" so the branches are
+                       // visibly mirrored; with mismatched tests the two levels
+                       // would always end up with the same value.
+                       map [(int) c] = new CharMapEntry (category,
+                               category == 1 ? (byte) 1 : fillIndex [category],
+                               category == 1 ? fillIndex [category] : (byte) 1);
+                       if (increment)
+                               fillIndex [category] += 1;
+               }
+
+               #region Level 3 properties (Case/Width)
+
+               // Returns the raw level 3 (case/width) weight for c; callers add 2
+               // to get the value that goes into the sortkey.
+               //
+               // The fixed ranges and single characters at the top return
+               // immediately. For everything else the weight is built by OR-ing
+               // flags for a few special characters, the normalization type
+               // reported by MSCompatGenerated, small capitals and uppercase.
+               public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
+               {
+                       // Korean
+                       if ('\u1100' <= c && c <= '\u11F9')
+                               return 2;
+                       if ('\uFFA0' <= c && c <= '\uFFDC')
+                               return 4;
+                       if ('\u3130' <= c && c <= '\u3164')
+                               return 5;
+                       // numbers
+                       if ('\u2776' <= c && c <= '\u277F')
+                               return 4;
+                       if ('\u2780' <= c && c <= '\u2789')
+                               return 8;
+                       // Everything up to U+2789 has already returned above, so
+                       // this range effectively starts at U+278A.
+                       if ('\u278A' <= c && c <= '\u2793')
+                               return 0xC;
+                       if ('\u2160' <= c && c <= '\u216F')
+                               return 0x18;
+                       if ('\u2181' <= c && c <= '\u2182')
+                               return 0x18;
+                       // Arabic
+                       // NOTE(review): U+2135..U+2138 lie in the Letterlike Symbols
+                       // block rather than an Arabic block - confirm this grouping.
+                       if ('\u2135' <= c && c <= '\u2138')
+                               return 4;
+                       if ('\uFE80' <= c && c <= '\uFE8E')
+                               return MSCompatGenerated.GetArabicFormInPresentationB (c);
+
+                       // actually I dunno the reason why they have weights.
+                       switch (c) {
+                       case '\u01BC':
+                               return 0x10;
+                       case '\u06A9':
+                               return 0x20;
+                       case '\u06AA':
+                               return 0x28;
+                       }
+
+                       // From here on the weight is accumulated as bit flags.
+                       byte ret = 0;
+                       switch (c) {
+                       case '\u03C2':
+                       case '\u2104':
+                       case '\u212B':
+                               ret |= 8;
+                               break;
+                       case '\uFE42':
+                               ret |= 0xC;
+                               break;
+                       }
+
+                       // misc
+                       switch (MSCompatGenerated.GetNormalizationType (c)) {
+                       case 1: // <full>
+                               ret |= 1;
+                               break;
+                       case 2: // <sub>
+                               ret |= 2;
+                               break;
+                       case 3: // <super>
+                               ret |= 0xE;
+                               break;
+                       }
+                       if (MSCompatGenerated.IsSmallCapital (c)) // grep "SMALL CAPITAL"
+                               ret |= 8;
+                       if (MSCompatGenerated.IsUppercase (c)) // DerivedCoreProperties
+                               ret |= 0x10;
+
+                       return ret;
+               }
+
+               // TODO: implement GetArabicFormInPresentationB(),
+               // GetNormalizationType(), IsSmallCapital() and IsUppercase().
+               // (They can easily be generated.)
+
+               #endregion
+
+       }
+
+       // Immutable record of the weights assigned to one character.
+       // A default-initialized value has Defined == false; only the
+       // three-argument constructor sets it to true.
+       internal struct CharMapEntry
+       {
+               public readonly byte Category;
+               public readonly byte Level1;
+               public readonly byte Level2; // It is always single byte.
+               public readonly bool Defined;
+
+               // Creates a defined entry with the given category and weights.
+               public CharMapEntry (byte category, byte level1, byte level2)
+               {
+                       this.Defined = true;
+                       this.Category = category;
+                       this.Level1 = level1;
+                       this.Level2 = level2;
+               }
+       }
+}
author	Atsushi Eno <atsushieno@gmail.com>
	Wed, 25 May 2005 16:42:33 +0000 (16:42 -0000)
committer	Atsushi Eno <atsushieno@gmail.com>
	Wed, 25 May 2005 16:42:33 +0000 (16:42 -0000)
mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs	[new file with mode: 0644]	patch \| blob