From: Atsushi Eno <atsushieno@gmail.com>
Date: Wed, 25 May 2005 16:42:33 +0000 (-0000)
Subject: 2005-05-25  Atsushi Enomoto  <atsushi@ximian.com>
X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=commitdiff_plain;h=5bd7cf358da3a8d27e43fdd361f7542f04883a38;hp=e8295e94cf86519d79ee3125f452b6c20597ff85;p=mono.git

2005-05-25  Atsushi Enomoto  <atsushi@ximian.com>

	* Collation-notes.txt : more info. Started letter sortkey analysis
	  (some of other stuff are really non-understandable right now.)
	* create-mscompat-collation-table.cs : table generator proof-of-
	  concept source (not compilable).
	* MSCompatUnicodeTable.cs : moved some code to the new source.
	  Some more fixes.


svn path=/branches/atsushi/mcs/; revision=45005
---

diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
index c25faaf39a8..e98eda9db39 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
@@ -1,3 +1,12 @@
+2005-05-25  Atsushi Enomoto  <atsushi@ximian.com>
+
+	* Collation-notes.txt : more info. Started letter sortkey analysis
+	  (some of other stuff are really non-understandable right now.)
+	* create-mscompat-collation-table.cs : table generator proof-of-
+	  concept source (not compilable).
+	* MSCompatUnicodeTable.cs : moved some code to the new source.
+	  Some more fixes.
+
 2005-05-20  Atsushi Enomoto  <atsushi@ximian.com>
 
 	* Collation-notes.txt : started level 2 weight analysis.
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
index b1dc50b64d2..61d78e9ae29 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
@@ -210,12 +210,22 @@
 
 **** level 2
 
+	<del>
 	For Japanese voice marks, it just sums the count up.
 
 	There also seems special rule for Thai (E01-E4F) e.g. E47 works like
 	Japanese voice marks.
 
 	For other letters, there will be a table.
+	</del>
+
+	It looks like all level 2 keys are just accumulated, however without
+	considering overflow. It sometimes makes sense (e.g. diaeresis and
+	acute) but it causes many conflicts (e.g. "A\u0308\u0301" and "\u1EA6"
+	are incorrectly regarded as equal).
+
+	Anyway, since a Japanese voice mark has a level 2 value of 1, it just
+	looked like the number of voice marks was being summed up.
 
 **** level 3
 
@@ -404,7 +414,7 @@
 
 		<primary category 0E : diacritics>
 		Characters in non "0E" category are out of scope.
-		They could be grepped in UnicodeData.txt.
+		They can be grepped in UnicodeData.txt.
 		-0E: acute
 		-0F: grave
 		-10: dot above
@@ -535,6 +545,7 @@
 
 	1 specially ignored ones (Japanese, Tamil, Thai)
 
+		IdentifyBy: constants
 		Unicode: 3099-309C, BCD, E47, E4C, FF9E, FF9F
 		SortKey: 01 01 01 01 00
 
@@ -546,21 +557,25 @@
 	2.1 control characters (specified as such in Unicode), except for
 	whitespaces (0009-000D).
 
+		ProcessAfter: 4.1
+		IdentifyBy: UnicodeCategory.Control
 		Unicode: 0001-000F minus 0009-000D, 007F-009F
-		SortKey: 06 80 07 06 03 00 - 06 80 07 06 3D 00
+		SortKey: 06 03 - 06 3D
 
 	2.2 Apostrophe
+		IdentifyBy: constant
 		Unicode: 0027,FF07 (')
-		SortKey: 06 80 (and nonspace equivalent)
+		SortKey: 06 80 (and width insensitive equivalents)
 
 	2.3  minus sign, hyphen, dash
 	  minus signs: FE63, 207B (super), 208B (sub), 002D, 00FD (full-width)
 	  hyphens: 00AD (soft), 2010, 2011 (nonbreaking) ... Unicode HYPHEN?
 	  dashes, horizontal bars: FE58 ... UnicodeCategory.DashPunctuation
 
+		IdentifyBy: UnicodeCategory.DashPunctuation
 		SortKey: 06 81 - 06 90 (and nonspace equivalents)
 
-	2.4 Arabic spacing and equivalents (64B-651, FE70-FE7F)
+	2.4 Arabic spacing and equivalents (64B-652, FE70-FE7F)
 	  They are part of nonspacing mark, but not equal.
 
 		SortKey: 06 A0 - 06 A7 (and nonspace equivalents)
@@ -571,7 +586,7 @@
 	  (i.e. < 128) nor those equivalents
 
 	  NonSpacingMark which is ignorable (IsIgnorableNonSpacing())
-	  // 30D, CD5-CD6, ABD, 2B9-2C1, 2C8, 2CB-2CD, 591-5C2. NonSpacingMark in
+	  // 30D, CD5-CD6, ABD, 2B9-2C5, 2C8, 2CB-2CD, 591-5C2. NonSpacingMark in
 	  // 981-A3C. A4D, A70, A71, ABC ...
 
 	  TODO: I need more insight to write table generator.
@@ -596,6 +611,11 @@
 	  If in "discriminatory mode", those tables could be still provided
 	  as to be compatible to Windows.
 
+	  Additionally there seem to be some bugs around Modifier letter collection.
+	  For example, 2C6 should be a nonspacing diacritical character but it
+	  is regarded as a primary character. The same applies to Mandarin
+	  tone marks (2C9-2CB) (and there are plenty of such characters).
+
 	4 space separators and some kind of marks
 
 	4.1 whitespaces, paragraph separator etc.
@@ -607,8 +627,12 @@
 	
 	  SortKey : 07 19 - 07 1A
 
-	4.3 other marks ('!', '^', ...)
-	  Non-alpha-numeric < 0x7F except for '+' (math) and '-' (math/hyphen)
+	4.3 ASCII compatible marks ('!', '^', ...)
+	  Non-alpha-numeric < 0x7F except for [[+-<=>']]
+	  small compatibility equivalents -> itself, wide
+
+	4.3 other marks
+	  FIXME: how to identify them?
 	  some Punctuations: InitialQuote/FinalQuote/Open/Close/Connector
 	  some OtherSymbols: 2400-2424
 	  3003, 3006, 2D0, 10FB
@@ -622,7 +646,7 @@
 	  (not Quotation_Mark property in PropList.txt ; 22, 27)
 
 	  byte area MathSymbol: 2B,3C,3D,3E,AB,B1,BB,D7,F7 except for AC
-	  MathSymbol (2044, 208A, 208C, 207A, 207C)
+	  some MathSymbol (2044, 208A, 208C, 207A, 207C)
 	  OtherLetter (1C0-1C2)
 	  2200-22FF MathSymbol except for 221E (INF. ; regarded as a number)
 
@@ -630,6 +654,7 @@
 
 	6 Arrows and Box drawings
 	  09 02 .. 09 7C : 2300-237A
+			only primary differences
 	  09 BC ... 09 FE : 25A0-AB, 25E7-EB, 25AC-B5, 25EC-EF, 25B6-B9,
 			25BC-C3, 25BA-25BB, 25C4-25D8, 25E6, 25DA-25E5
 			21*,25*,26*,27*
@@ -674,17 +699,24 @@
 
 	  This ordering is nothing to do with European Ordering Rules (EOR).
 
-	10 (F) greek letters
-	  0F: 386-3F2
-	  10: 400-4E9 exc. 482-486
-	  11: 531-586 exc. 559-55F
-	  12: 5D0-5F2
-	  13: 621-64A, 670-6D3, 6D5
+	10 culture dependent letters (general)
+	  0F: 386-3F2 ... Greek and Coptic
+		386-3CF: 0F 02 - 0F 19 (consider primary equivalents)
+		3D0-3EF: 0F 40 - 0F 54
+	  10: 400-4E9 ... Cyrillic.
+		For 400-45F and 4B1, they are mostly UCA DUCET order.
+		After that 460-481 follows, by codepoint.
+		(490-4FF except for 4B1 and Cyrillic supplementary are unused.)
+	  11: 531-586 ... Armenian.
+		Simply sorted by codepoint (handle case).
+	  12: 5D0-5F2 ... Hebrew
+		Codepoint order (handle case).
+	  13: 621-6D5 plus 670 (NonSpacingMark) ... Arabic
+
 	  14: 901-963 exc. 93C-93D 950-954
 	  15: 982-9FA exc. NonSpacingMark DecimalDigitNumber OtherNumber
 	  16: A05-A74 exc. A3C A4D A66-A71
 	  17: A81-AE0 exc. ABC-ABD
-	  18: 
 
 	...
 
@@ -744,13 +776,14 @@
 
 	   3400-4DB5. Ordered, considering case/width equivalents.
 
-	20 (FF FF 01 01 01 01 00) Some supplemental Japanese/Arabic marks
+	20 (FF FF 01 01 01 01 00) Some Japanese/Arabic extenders
+	   Actually FE7C and FE7D are not extenders in Unicode (PropList.txt)
 
 	   3005, 3031, 3032, 309D, 309E, 30FC, 30FD, 30FE, FE7C, FE7D, FF70
 
 	- by UnicodeCategory -
 
-	DashPunctuation		1 1 1 1 (no exception)
+	DashPunctuation		6 (no exception)
 	DecimalDigitNumber	C (no exception)
 	EnclosingMark		1 E (no exception)
 	Format			7 (only 70F)
@@ -763,7 +796,7 @@
 
 	OtherNumber		C(<3192), 9E-A7 (3124<)
 
-	Control			1 1 1 1 except for 9-D (7)
+	Control			6 except for 9-D (7)
 	FinalQuotePunctuation	7 except for BB (8)
 	InitialQuotePunctuation	7 except for AB (8)
 	ClosePunctuation	7 except for 232A (9)
@@ -771,7 +804,7 @@
 	ConnectorPunctuation	7 except for FF65, 30FB, 2040 (A)
 
 	OtherLetter		1, 7, 8 (1C0-1C2), C, 12-FF
-	MathSymbol		8, 9, 1 1 1 1, 7, A, C
+	MathSymbol		8, 9, 6, 7, A, C
 	OtherSymbol		7, 9, A, C, E, F, <22, 52<
 	CurrencySymbol		A except for FF69,24,FF04 (7) and 9F2,9F3 (15)
 
@@ -779,8 +812,8 @@
 	TitlecaseLetter		E (no exception)
 	UppercaseLetter		E,F,10,11,21 except for 1BC (C)
 	ModifierLetter		1, 7, E, 1F, FF
-	ModifierSymbol		1 1 1 1, 1, 7
-	NonSpacingMark		1 1 1 1, 1, 13-1F
+	ModifierSymbol		1, 6, 7
+	NonSpacingMark		1, 6, 13-1F
 	OtherPunctuation	1, 7, A, 1F
 	SpacingCombiningMark	1, 14-22
 
@@ -1038,6 +1071,9 @@
 	(UCD) is informative (it's informative but not normative to us)
 	http://www.unicode.org/Public/UNIDATA/UCD.html
 
+	A decent char-by-char explanation is available here:
+	http://www.fileformat.info/info/unicode/
+
 	Wine uses UCA default element table, but has windows-like character
 	filterings support in their LCMapString implementation:
 	http://cvs.winehq.com/cvsweb/wine/dlls/kernel/locale.c
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs b/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs
index 460f40c8499..9d5184714d2 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs
@@ -10,7 +10,7 @@ namespace Mono.Globalization.Unicode
 		{
 			switch (i) {
 			case 0:
-			// No idea why each of those is ignored.
+			// No idea why they are ignored.
 			case 0x2df: case 0x387:
 			case 0x3d7: case 0x3d8: case 0x3d9:
 			case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
@@ -28,9 +28,14 @@ namespace Mono.Globalization.Unicode
 			case 0xfffc: case 0xfffd:
 				return true;
 			// exceptional characters filtered by the 
-			// following conditions (no idea why though).
-			case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9:
-			case 0x70f: case 0x3036: case 0x303f:
+			// following conditions. Originally those exceptional
+			// ranges are incorrect (they should not be ignored)
+			// and most of those characters are unfortunately in
+			// those ranges.
+			case 0x4d8: case 0x4d9:
+			case 0x4e8: case 0x4e9:
+			case 0x70f:
+			case 0x3036: case 0x303f:
 			case 0x337b: case 0xfb1e:
 				return false;
 			}
@@ -364,84 +369,101 @@ namespace Mono.Globalization.Unicode
 			return Normalization.ToWidthInsensitive (i);
 		}
 
-		#region Level 3 properties (Case/Width)
+		#region Utilities
 
-		public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
+		public static void GetPrimaryWeight (char c, bool variable,
+			out byte category, out byte value)
 		{
-			// Korean
-			if (0x1100 <= c && c <= 0x11F9)
-				return 2;
-			if (0xFFA0 <= c && c <= 0xFFDC)
-				return 4;
-			if (0x3130 <= c && c <= 0x3164)
-				return 5;
-			// numbers
-			if (0x2776 <= c && c <= 0x277F)
-				return 4;
-			if (0x2780 <= c && c <= 0x2789)
-				return 8;
-			if (0x2776 <= c && c <= 0x2793)
-				return 0xC;
-			if (0x2160 <= c && c <= 0x216F)
-				return 0x10;
-			if (0x2181 <= c && c <= 0x2182)
-				return 0x10;
-			// Arabic
-			if (0x2135 <= c && c <= 0x2138)
-				return 4;
-			if (0xFE80 <= c && c <= 0xFE8E)
-				return GetArabicFormInPresentationB (c);
-
-			// actually I dunno the reason why they have weights.
-			switch (c) {
-			case 0x01BC:
-				return 0x10;
-			case 0x06A9:
-				return 0x20;
-			case 0x06AA:
-				return 0x28;
-			}
+		}
 
-			byte ret = 0;
+		public static string GetExpansion (char c)
+		{
 			switch (c) {
-			case 0x03C2:
-			case 0x2104:
-			case 0x212B:
-				ret |= 8;
-				break;
-			case 0xFE42:
-				ret |= 0xC;
-				break;
-			}
-
-			// misc
-			switch (GetNormalizationType (c)) {
-			case 1: // <full>
-				ret |= 1;
-				break;
-			case 2: // <sub>
-				ret |= 1;
-				break;
-			case 3: // <super>
-				ret |= 0xE;
-				break;
+			case '\u00C6':
+				return "AE";
+			case '\u00DE':
+				return "TH";
+			case '\u00DF':
+				return "ss";
+			case '\u00E6':
+				return "ae";
+			case '\u00FE':
+				return "th";
+			case '\u0132':
+				return "IJ";
+			case '\u0133':
+				return "ij";
+			case '\u0152':
+				return "OE";
+			case '\u0153':
+				return "oe";
+			case '\u01C4':
+				return "DZ\u030C"; // surprisingly Windows works fine here
+			case '\u01C5':
+				return "Dz\u030C";
+			case '\u01C6':
+				return "dz\u030C";
+			case '\u01C7':
+				return "LJ";
+			case '\u01C8':
+				return "Lj";
+			case '\u01C9':
+				return "lj";
+			case '\u01CA':
+				return "NJ";
+			case '\u01CB':
+				return "Nj";
+			case '\u01CC':
+				return "nj";
+			case '\u01E2':
+				return "A\u0304E\u0304"; // LAMESPEC: should be \u00C6\u0304
+			case '\u01E3':
+				return "a\u0304e\u0304"; // LAMESPEC: should be \u00E6\u0304
+			case '\u01F1':
+				return "DZ";
+			case '\u01F2':
+				return "Dz";
+			case '\u01F3':
+				return "dz";
+			case '\u01FC':
+				return "A\u0301E\u0301"; // LAMESPEC: should be \u00C6\u0301
+			case '\u01FD':
+				return "a\u0301e\u0301"; // LAMESPEC: should be \u00E6\u0301
+			case '\u05F0':
+				return "\u05D5\u05D5";
+			case '\u05F1':
+				return "\u05D5\u05D9";
+			case '\u05F2':
+				return "\u05D9\u05D9";
+			case '\uFB00':
+				return "ff";
+			case '\uFB01':
+				return "fi";
+			case '\uFB02':
+				return "fl";
 			}
-			if (IsSmallCapital (c)) // grep "SMALL CAPITAL"
-				ret |= 8;
-			if (IsUppercase (c)) // DerivedCoreProperties
-				ret |= 0x10;
-
-			return ret;
+//			if ('\u1113' <= c && c <= '\u115F') Korean Jamo
+//				return true;
+			return null;
 		}
-
-		// TODO: implement GetArabicFormInRepresentationD(),
-		// GetNormalizationType(), IsSmallCapital() and IsUppercase().
-		// (They can be easily to be generated.)
-
 		#endregion
 
+
 		#region Level 4 properties (Kana)
 
+		public static bool HasSpecialWeight (char c) // true only for U+3041-U+30FF and U+FF60-U+FF9E
+		{
+			if (c < '\u3041')
+				return false;
+			if (c < '\u3100')
+				return true;
+			if (c < '\uFF60')
+				return false;
+			// The fall-through used to be "return true", which made the
+			// FF9F bound meaningless and flagged everything up to FFFF.
+			return c < '\uFF9F';
+		}
+
 		public static byte GetJapaneseDashType (char c)
 		{
 			switch (c) {
@@ -497,7 +519,25 @@ namespace Mono.Globalization.Unicode
 			return false;
 		}
 
-		#endregion\
+		#endregion
+
+
+		// 0 means no primary weight. 6 means variable weight
+		// For expanded character the value is 0.
+		// Those arrays will be split into blocks (<3400 and >F800)
+		byte [] categories;
+		byte [] level1;
+		byte [] level2;
+		byte [] level3;
+		// level 4 is computed.
+
+		// public static bool HasSpecialWeight (char c)
+		// { return level1 [(int) c] == 6; }
+
+		//
+		// Maybe autogenerated code or icall to fill array runs here
+		//
 	}
 }
 
+
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
new file mode 100644
index 00000000000..59c672aa88b
--- /dev/null
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs
@@ -0,0 +1,380 @@
+//
+//
+// There are two kinds of sort keys: those which are computed and those which
+// are laid out as an indexed array. Computed sort keys are:
+//
+//	- CJK, which largely vary depending on LCID (namely kr,jp,zh-CHS,zh-TW)
+//	- Surrogate
+//	- PrivateUse
+//
+// Also, for composite characters it should prepare a different index table.
+//
+// Except for them, it should use precomputed index array.
+//
+
+//
+// * sortkey getter signature
+//
+//	int GetSortKey (string s, int index, byte [] buf)
+//	Stores sort key for corresponding character element into buf and
+//	returns the length of the consumed _source_ character element in s.
+//
+// * character length to consume; default implementation
+//
+//	If there is a diacritic after the base character, they are consumed
+//	and they are considered as a part of the character element.
+//
+
+using System;
+using System.Collections;
+using System.Globalization;
+
+namespace Mono.Globalization.Unicode
+{
+	internal class MSCompatSortKeyTableGenerator
+	{
+		public static void Main () // entry point of the table generator
+		{
+			new MSCompatSortKeyTableGenerator ().Run (); // build one generator and run it once
+		}
+
+		byte [] fillIndex = new byte [256]; // next weight per category; 256 slots so category FF is addressable
+		CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
+
+		char [] specialIgnore = new char [] {
+			'\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
+			'\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
+			};
+
+		// FIXME: need more love (as always): 29 letters below but only 28 weights.
+		char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
+			'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
+			'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+			'\u0292', '\u01BE', '\u0298'};
+		byte [] alphaWeights = new byte [] {2, 9, 0xA, 0x1A, 0x21,
+			0x23, 0x25, 0x2C, 0x32, 0x35, 0x36, 0x48, 0x51, 0x70,
+			0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99, 0x9F, 0xA2, 0xA4,
+			0xA6, 0xA9, 0xAA, 0xB3, 0xB4};
+
+
+		public void Run ()
+		{
+			UnicodeCategory uc;
+			// Fills "map" region by region; fillIndex [n] is the next weight handed out for category n.
+			#region Specially ignored // 01
+			// This will raise "Defined" flag up.
+			foreach (char c in specialIgnore)
+				map [(int) c] = new CharMapEntry (0, 0, 0);
+			#endregion
+
+
+			#region Variable weights
+			// Controls : 06 03 - 06 3D
+			fillIndex [6] = 3;
+			for (int i = 0; i < 65536; i++) {
+				char c = (char) i;
+				uc = Char.GetUnicodeCategory (c);
+				if (uc == UnicodeCategory.Control &&
+					!Char.IsWhiteSpace (c))
+					AddCharMap (c, 6, true);
+			}
+
+			// Apostrophe 06 80
+			map ['\''] = new CharMapEntry (6, 0x80, 1); // hexadecimal 80, as in the sort key
+			map ['\uFF07'] = new CharMapEntry (6, 0x80, 1); // full-width apostrophe
+
+			// Hyphen/Dash : 06 81 - 06 90
+			fillIndex [6] = 0x81;
+			for (int i = 0; i < 65536; i++) {
+				if (Char.GetUnicodeCategory ((char) i)
+					== UnicodeCategory.DashPunctuation)
+					AddCharMapGroup ((char) i, 6, true, true);
+			}
+
+			// Arabic variable weight chars 06 A0 -
+			fillIndex [6] = 0xA0;
+			// vowels
+			for (int i = 0x64B; i <= 0x650; i++)
+				AddCharMapGroup ((char) i, 6, true, true);
+			// sukun
+			AddCharMapGroup ('\u0652', 6, false, true);
+			// shadda
+			AddCharMapGroup ('\u0651', 6, false, true);
+			#endregion
+
+
+			#region Nonspacing marks // 01
+			// FIXME: 01 03 - 01 B6 ... annoyance :(
+
+			// Combining diacritical marks: 01 DC -
+
+			// LAMESPEC: It should not stop at '\u20E1'. There are
+			// a few more characters (that however results in
+			// overflow of level 2 unless we start before 0xDD).
+			fillIndex [1] = 0xDC;
+			for (int i = 0x20d0; i <= 0x20e1; i++)
+				AddCharMap ((char) i, 1, true);
+			#endregion
+
+
+			#region Whitespaces // 07 03 -
+			fillIndex [7] = 0x3;
+			AddCharMapGroup (' ', 7, false, true);
+			AddCharMap ('\u00A0', 7, true);
+			for (int i = 9; i <= 0xD; i++)
+				AddCharMap ((char) i, 7, true);
+			for (int i = 0x2000; i <= 0x200B; i++)
+				AddCharMap ((char) i, 7, true);
+			AddCharMapGroup ('\u2028', 7, false, true);
+			AddCharMapGroup ('\u2029', 7, false, true);
+
+			// LAMESPEC: Windows developers seem to have thought
+			// that those characters are kind of whitespaces,
+			// while they aren't.
+			AddCharMapGroup ('\u2422', 7, false, true); // blank symbol
+			AddCharMapGroup ('\u2423', 7, false, true); // open box
+			#endregion
+
+
+			#region ASCII non-alphanumeric // 07
+			// non-alphanumeric ASCII except for: + - < = > '
+			for (int i = 0x21; i < 0x7F; i++) {
+				if (Char.IsLetterOrDigit ((char) i)
+					|| "+-<=>'".IndexOf ((char) i) >= 0)
+					continue; // they are not added here.
+				AddCharMapGroup ((char) i, 7, false, true);
+			}
+			#endregion
+
+
+			// FIXME: for 07 xx we need more love.
+
+
+			#region Numbers // 0C 02 - 0C E1
+			fillIndex [0xC] = 2; // numbers live in category 0C, not 09
+
+			// 9F8 : Bengali "one less than the denominator"
+			AddCharMap ('\u09F8', 0xC, true);
+
+			ArrayList numbers = new ArrayList ();
+			for (int i = 0; i < 65536; i++)
+				if (Char.IsNumber ((char) i))
+					numbers.Add (i);
+
+			ArrayList numberValues = new ArrayList ();
+			foreach (int i in numbers)
+				numberValues.Add (new DictionaryEntry (i, CharUnicodeInfo.GetDecimalValue ((char) i)));
+			numberValues.Sort (DictionaryValueComparer.Instance);
+			decimal prevValue = -1;
+			foreach (DictionaryEntry de in numberValues) {
+				decimal currValue = (decimal) de.Value;
+				if (prevValue < currValue) {
+					prevValue = currValue;
+					fillIndex [0xC] += 1; // a new numeric value starts a new weight
+				}
+				AddCharMap ((char) ((int) de.Key), 0xC, false);
+			}
+
+			// 221E: infinity
+			fillIndex [0xC] = 0xFF;
+			AddCharMap ('\u221E', 0xC, true);
+			#endregion
+
+
+			#region Latin alphabets
+			for (int i = 0; i < alphabets.Length; i++) {
+				AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
+			}
+			#endregion
+
+			#region Letters
+
+			// Greek and Coptic
+			fillIndex [0xF] = 02;
+			for (int i = 0x0380; i < 0x03CF; i++)
+				if (Char.IsLetter ((char) i))
+					AddLetterMap ((char) i, 0xF, true);
+			fillIndex [0xF] = 0x40;
+			for (int i = 0x03D0; i < 0x0400; i++)
+				if (Char.IsLetter ((char) i))
+					AddLetterMap ((char) i, 0xF, true);
+
+			// Cyrillic - UCA order w/ some modification
+			fillIndex [0x10] = 0x3;
+			// FIXME: For \u0400-\u045F we need "ordered Cyrillic"
+			// table which is mostly from UCA DUCET.
+			for (int i = 0; i < orderedCyrillic.Length; i++) {
+				char c = orderedCyrillic [i];
+				if (Char.IsLetter (c)) {
+					AddLetterMap (c, 0x10, false);
+					fillIndex [0x10] += 3;
+				}
+			}
+			for (int i = 0x0460; i <= 0x0481; i++) { // inclusive: 0481 is the last letter of the run
+				if (Char.IsLetter ((char) i)) {
+					AddLetterMap ((char) i, 0x10, false);
+					fillIndex [0x10] += 3;
+				}
+			}
+
+			// Armenian
+			fillIndex [0x11] = 0x3;
+			for (int i = 0x0531; i <= 0x0586; i++) // inclusive: 0586 is the last Armenian letter
+				if (Char.IsLetter ((char) i))
+					AddLetterMap ((char) i, 0x11, true);
+
+			// Hebrew
+			fillIndex [0x12] = 0x3;
+			for (int i = 0x05D0; i < 0x05FF; i++)
+				if (Char.IsLetter ((char) i))
+					AddLetterMap ((char) i, 0x12, true);
+
+			// Arabic
+
+			#endregion
+		}
+
+		private void AddAlphaMap (char c, byte category, byte alphaWeight) // placeholder: body not written yet
+		{
+			throw new NotImplementedException (); // thrown unconditionally, whatever the arguments
+		}
+
+		class DictionaryValueComparer : IComparer // orders DictionaryEntry items by their decimal Value
+		{
+			private DictionaryValueComparer ()
+			{
+			}
+
+			public static readonly DictionaryValueComparer Instance
+				= new DictionaryValueComparer ();
+
+			public /*static*/ int Compare (object o1, object o2)
+			{
+				DictionaryEntry left = (DictionaryEntry) o1;
+				DictionaryEntry right = (DictionaryEntry) o2;
+				decimal leftValue = (decimal) left.Value, rightValue = (decimal) right.Value;
+				// FIXME: in case of 0, compare decomposition categories
+				return Decimal.Compare (leftValue, rightValue);
+			}
+		}
+
+		private void AddCharMapGroup (char c, byte category, bool tail, bool updateIndexForSelf)
+		{
+			// The <small> form, when there is one, goes first and always advances the index.
+			char small = tail
+				? MSCompatGenerated.ToSmallFormTail (c)
+				: MSCompatGenerated.ToSmallForm (c);
+			if (small != char.MinValue)
+				AddCharMap (small, category, true);
+			// Then the character itself; the caller decides whether it advances the index.
+			AddCharMap (c, category, updateIndexForSelf);
+			// Last the <full> form, recursively, without advancing the index for itself.
+			char full = tail
+				? MSCompatGenerated.ToFullWidthTail (c)
+				: MSCompatGenerated.ToFullWidth (c);
+			if (full != char.MinValue)
+				AddCharMapGroup (full, category, tail, false);
+		}
+
+		private void AddCharMap (char c, byte category, bool increment) // maps c to the current weight of its category
+		{
+			map [(int) c] = new CharMapEntry (category,
+				category == 1 ? (byte) 1 : fillIndex [category], // level 1 is fixed to 1 for category 1
+				category == 1 ? fillIndex [category] : (byte) 1); // category 1 varies level 2 instead (test was inverted)
+			if (increment)
+				fillIndex [category] += 1;
+		}
+
+		#region Level 3 properties (Case/Width)
+
+		public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
+		{
+			// Korean (fixed weights per range; char literals are now properly closed)
+			if ('\u1100' <= c && c <= '\u11F9')
+				return 2;
+			if ('\uFFA0' <= c && c <= '\uFFDC')
+				return 4;
+			if ('\u3130' <= c && c <= '\u3164')
+				return 5;
+			// numbers
+			if ('\u2776' <= c && c <= '\u277F')
+				return 4;
+			if ('\u2780' <= c && c <= '\u2789')
+				return 8;
+			if ('\u2776' <= c && c <= '\u2793') // only 278A-2793 get here; the rest returned above
+				return 0xC;
+			if ('\u2160' <= c && c <= '\u216F')
+				return 0x18;
+			if ('\u2181' <= c && c <= '\u2182')
+				return 0x18;
+			// Arabic
+			if ('\u2135' <= c && c <= '\u2138')
+				return 4;
+			if ('\uFE80' <= c && c <= '\uFE8E')
+				return MSCompatGenerated.GetArabicFormInPresentationB (c);
+
+			// actually I dunno the reason why they have weights.
+			switch (c) {
+			case '\u01BC':
+				return 0x10;
+			case '\u06A9':
+				return 0x20;
+			case '\u06AA':
+				return 0x28;
+			}
+
+			byte ret = 0; // from here on the weight is OR-ed together from independent flags
+			switch (c) {
+			case '\u03C2':
+			case '\u2104':
+			case '\u212B':
+				ret |= 8;
+				break;
+			case '\uFE42':
+				ret |= 0xC;
+				break;
+			}
+
+			// misc
+			switch (MSCompatGenerated.GetNormalizationType (c)) {
+			case 1: // <full>
+				ret |= 1;
+				break;
+			case 2: // <sub>
+				ret |= 2;
+				break;
+			case 3: // <super>
+				ret |= 0xE;
+				break;
+			}
+			if (MSCompatGenerated.IsSmallCapital (c)) // grep "SMALL CAPITAL"
+				ret |= 8;
+			if (MSCompatGenerated.IsUppercase (c)) // DerivedCoreProperties
+				ret |= 0x10;
+
+			return ret;
+		}
+
+		// TODO: implement GetArabicFormInPresentationB(),
+		// GetNormalizationType(), IsSmallCapital() and IsUppercase().
+		// (They can easily be generated.)
+
+		#endregion
+
+	}
+
+	internal struct CharMapEntry // one table slot: category plus level 1 and level 2 weights
+	{
+		public readonly byte Category;
+		public readonly byte Level1;
+		public readonly byte Level2; // It is always single byte.
+		public readonly bool Defined; // false in a default (never constructed) entry
+
+		public CharMapEntry (byte category, byte level1, byte level2)
+		{
+			this.Defined = true;
+			this.Category = category;
+			this.Level1 = level1;
+			this.Level2 = level2;
+		}
+	}
+}