2005-07-19 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
1 //
2 //
3 // There are two kinds of sort keys: those which are computed and those which
4 // are laid out as an indexed array. Computed sort keys are:
5 //
6 //      - Surrogate
7 //      - PrivateUse
8 //
9 // Also, for composite characters it should prepare a different index table.
10 //
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
13 //
14
15 //
16 // * sortkey getter signature
17 //
18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
19 //      Stores sort key for corresponding character element into buf and
20 //      returns the length of the consumed _source_ character element in s.
21 //
22 // * character length to consume
23 //
24 //      If there are characters whose primary weight is 0, they are consumed
25 //      and considered as a part of the character element.
26 //
27 #define Binary
28
29 using System;
30 using System.IO;
31 using System.Collections;
32 using System.Globalization;
33 using System.Text;
34 using System.Xml;
35
36 namespace Mono.Globalization.Unicode
37 {
38         internal class MSCompatSortKeyTableGenerator
39         {
40                 public static void Main (string [] args)
41                 {
42                         new MSCompatSortKeyTableGenerator ().Run (args);
43                 }
44
45                 const int DecompositionWide = 1; // fixed. These Decomposition* codes are the values stored per code point in decompType.
46                 const int DecompositionSub = 2; // fixed
47                 const int DecompositionSmall = 3; // NOTE(review): names look like the compatibility formatting tags of UnicodeData.txt - confirm where they are assigned
48                 const int DecompositionIsolated = 4;
49                 const int DecompositionInitial = 5;
50                 const int DecompositionFinal = 6;
51                 const int DecompositionMedial = 7;
52                 const int DecompositionNoBreak = 8;
53                 const int DecompositionVertical = 9;
54                 const int DecompositionFraction = 0xA;
55                 const int DecompositionFont = 0xB;
56                 const int DecompositionSuper = 0xC; // fixed
57                 const int DecompositionFull = 0xE; // declared before Narrow (0xD): declaration order differs from numeric order here
58                 const int DecompositionNarrow = 0xD; // Serialize () copies Narrow/Wide/Super/Sub mappings into widthCompat
59                 const int DecompositionCircle = 0xF;
60                 const int DecompositionSquare = 0x10;
61                 const int DecompositionCompat = 0x11;
62                 const int DecompositionCanonical = 0x12;
63
64                 TextWriter Result = Console.Out; // sink for the generated C# source text (the Serialize* methods write array declarations to it)
65
66                 byte [] fillIndex = new byte [256]; // by category
67                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1]; // one entry per UTF-16 code unit; Serialize () reads Category, Level1 and Level2 from it
68
69                 char [] specialIgnore = new char [] {
70                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
72                         };
73
74                 // FIXME: need more love (as always)
75                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F', // 29 entries in total
76                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78                         '\u0292', '\u01BE', '\u0298'};
79                 byte [] alphaWeights = new byte [] { // 29 entries, same count as alphabets - presumably index-parallel, TODO confirm against the code that consumes them
80                         2, 9, 0xA, 0x1A, 0x21,
81                         0x23, 0x25, 0x2C, 0x32, 0x35,
82                         0x36, 0x48, 0x51, 0x70, 0x7C,
83                         0x7E, 0x89, 0x8A, 0x91, 0x99,
84                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85                         0xA9, 0xAA, 0xB3, 0xB4};
86
87                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88                 bool [] isUppercase = new bool [char.MaxValue + 1];
89
90                 byte [] decompType = new byte [char.MaxValue + 1]; // Decomposition* code per code point; switched on by Serialize ()
91                 int [] decompIndex = new int [char.MaxValue + 1]; // per code point index into decompValues (Serialize () reads decompValues [decompIndex [i]])
92                 int [] decompLength = new int [char.MaxValue + 1];
93                 int [] decompValues; // has no initializer here; must be assigned before Serialize () indexes it
94                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
95
96                 byte [] diacritical = new byte [char.MaxValue + 1];
97
98                 string [] diacritics = new string [] { // 90 character-name fragments; diacriticWeights has the same number of entries - presumably index-parallel, TODO confirm against the lookup code
99                         // LATIN, CYRILLIC etc.
100                         "UPTURN", "DOUBLE-STRUCK",
101                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
102                         "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
103                         "WITH ACUTE;", "WITH GRAVE;",
104                         //
105                         "WITH DOT ABOVE;", " MIDDLE DOT;",
106                         "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
107                         "WITH DIALYTIKA;",
108                         "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
109                         "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
110                         "WITH OGONEK;", "WITH CEDILLA;",
111                         //
112                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
113                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
114                         "STROKE OVERLAY",
115                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
116                         " DIAERESIS AND GRAVE;",
117                         " BREVE AND ACUTE;",
118                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
119                         " MACRON AND ACUTE;",
120                         " MACRON AND GRAVE;",
121                         //
122                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
123                         " RING ABOVE AND ACUTE",
124                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
125                         " CIRCUMFLEX AND TILDE",
126                         " TILDE AND DIAERESIS",
127                         " STROKE AND ACUTE",
128                         " BREVE AND TILDE",
129                         " CEDILLA AND BREVE",
130                         " OGONEK AND MACRON",
131                         //
132                         "WITH OVERLINE",
133                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
134                         " DOUBLE GRAVE",
135                         " INVERTED BREVE",
136                         "ROMAN NUMERAL",
137                         " PRECEDED BY APOSTROPHE",
138                         "WITH HORN;",
139                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
140                         " PALATAL HOOK",
141                         " DOT BELOW;",
142                         " RETROFLEX;", "DIAERESIS BELOW",
143                         " RING BELOW",
144                         //
145                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
146                         " BREVE BELOW;", " HORN AND GRAVE",
147                         " TILDE BELOW",
148                         " TOPBAR",
149                         " DOT BELOW AND DOT ABOVE",
150                         " RIGHT HALF RING", " HORN AND TILDE",
151                         " CIRCUMFLEX AND DOT BELOW",
152                         " BREVE AND DOT BELOW",
153                         " DOT BELOW AND MACRON",
154                         " TONE TWO",
155                         " HORN AND HOOK ABOVE",
156                         " HORN AND DOT",
157                         // CIRCLED, PARENTHESIZED and so on
158                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
159                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
160                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
161                         };
162                 byte [] diacriticWeights = new byte [] { // 90 weights, grouped by the same "//" separators as diacritics above; keep both arrays the same length when editing
163                         // LATIN.
164                         3, 3, 5, 5, 5,
165                         0xE, 0xF,
166                         0xE, 0xF,
167                         //
168                         0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
169                         0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
170                         //
171                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
172                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
173                         //
174                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
175                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
176                         //
177                         0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
178                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
179                         //
180                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
181                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
182                         0x87, 0x95, 0xAA,
183                         // CIRCLED, PARENTHESIZED and so on.
184                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
185                         0xF3, 0xF3, 0xF3
186                         };
187
188                 int [] numberSecondaryWeightBounds = new int [] { // 28 ascending code point values; NOTE(review): they read like (start, end) pairs of digit ranges - confirm with the consuming code
189                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
190                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
191                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
192                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
193                         0xE50, 0xE60, 0xED0, 0xEE0
194                         };
195
196                 char [] orderedGurmukhi; // the four ordered* arrays below have no initializer here; they must be assigned elsewhere before use
197                 char [] orderedGujarati;
198                 char [] orderedGeorgian;
199                 char [] orderedThaana;
200
201                 static readonly char [] orderedTamilConsonants = new char [] { // 22 code points listed in the intended sort order
202                         // based on traditional Tamil consonants, except for
203                         // Grantha (where Microsoft breaks traditionalism).
204                         // http://www.angelfire.com/empire/thamizh/padanGaL
205                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
206                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
207                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
208                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
209                         '\u0BB7', '\u0BB9'};
210
211                 // cp -> character name (only for some characters)
212                 ArrayList sortableCharNames = new ArrayList ();
213
214                 // cp -> arrow value (int)
215                 ArrayList arrowValues = new ArrayList ();
216
217                 // cp -> box value (int)
218                 ArrayList boxValues = new ArrayList ();
219
220                 // cp -> level1 value
221                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
222
223                 // letterName -> cp
224                 Hashtable arabicNameMap = new Hashtable ();
225
226                 // cp -> Hashtable [decompType] -> cp
227                 Hashtable nfkdMap = new Hashtable ();
228
229                 // Latin letter -> ArrayList [int]
230                 Hashtable latinMap = new Hashtable ();
231
232                 ArrayList jisJapanese = new ArrayList ();
233                 ArrayList nonJisJapanese = new ArrayList ();
234
235                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00]; the cjk* tables are compressed in Serialize () and then written by SerializeCJK ()
236                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
237                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
238                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
239                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00]; the only byte-typed CJK table
240
241                 byte [] ignorableFlags = new byte [char.MaxValue + 1]; // per code point flags; first table compressed and written by Serialize ()
242
243                 static double [] unicodeAge = new double [char.MaxValue + 1]; // NOTE(review): presumably filled by ParseDerivedAge () - confirm
244
245                 ArrayList tailorings = new ArrayList (); // Tailoring objects, enumerated by SerializeTailorings ()
246
247                 void Run (string [] args)
248                 {
249                         string dirname = args.Length == 0 ? "downloaded" : args [0];
250                         ParseSources (dirname);
251                         Console.Error.WriteLine ("parse done.");
252
253                         ModifyParsedValues ();
254                         GenerateCore ();
255                         Console.Error.WriteLine ("generation done.");
256                         Serialize ();
257                         Console.Error.WriteLine ("serialization done.");
258 /*
259 StreamWriter sw = new StreamWriter ("agelog.txt");
260 for (int i = 0; i < char.MaxValue; i++) {
261 bool shouldBe = false;
262 switch (Char.GetUnicodeCategory ((char) i)) {
263 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
264         shouldBe = true; break;
265 }
266 if (unicodeAge [i] >= 3.1)
267         shouldBe = true;
268 //if (IsIgnorable (i) != shouldBe)
269 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
270 }
271 sw.Close ();
272 */
273                 }
274
275                 byte [] CompressArray (byte [] source, CodePointIndexer i)
276                 {
277                         return (byte []) CodePointIndexer.CompressArray  (
278                                 source, typeof (byte), i);
279                 }
280
281                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
282                 {
283                         return (ushort []) CodePointIndexer.CompressArray  (
284                                 source, typeof (ushort), i);
285                 }
286
287                 void Serialize ()
288                 {
289                         // Tailorings
290                         SerializeTailorings ();
291
292                         byte [] categories = new byte [map.Length];
293                         byte [] level1 = new byte [map.Length];
294                         byte [] level2 = new byte [map.Length];
295                         byte [] level3 = new byte [map.Length];
296                         ushort [] widthCompat = new ushort [map.Length];
297                         for (int i = 0; i < map.Length; i++) {
298                                 categories [i] = map [i].Category;
299                                 level1 [i] = map [i].Level1;
300                                 level2 [i] = map [i].Level2;
301                                 level3 [i] = ComputeLevel3Weight ((char) i);
302                                 // For Japanese Half-width characters, don't
303                                 // map widthCompat. It is IgnoreKanaType that
304                                 // handles those width differences.
305                                 if (0xFF6D <= i && i <= 0xFF9D)
306                                         continue;
307                                 switch (decompType [i]) {
308                                 case DecompositionNarrow:
309                                 case DecompositionWide:
310                                 case DecompositionSuper:
311                                 case DecompositionSub:
312                                         // they are always 1 char
313                                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
314                                         break;
315                                 }
316                         }
317
318                         // compress
319                         ignorableFlags = CompressArray (ignorableFlags,
320                                 MSCompatUnicodeTableUtil.Ignorable);
321                         categories = CompressArray (categories,
322                                 MSCompatUnicodeTableUtil.Category);
323                         level1 = CompressArray (level1, 
324                                 MSCompatUnicodeTableUtil.Level1);
325                         level2 = CompressArray (level2, 
326                                 MSCompatUnicodeTableUtil.Level2);
327                         level3 = CompressArray (level3, 
328                                 MSCompatUnicodeTableUtil.Level3);
329                         widthCompat = (ushort []) CodePointIndexer.CompressArray (
330                                 widthCompat, typeof (ushort),
331                                 MSCompatUnicodeTableUtil.WidthCompat);
332                         cjkCHS = CompressArray (cjkCHS,
333                                 MSCompatUnicodeTableUtil.CjkCHS);
334                         cjkCHT = CompressArray (cjkCHT,
335                                 MSCompatUnicodeTableUtil.Cjk);
336                         cjkJA = CompressArray (cjkJA,
337                                 MSCompatUnicodeTableUtil.Cjk);
338                         cjkKO = CompressArray (cjkKO,
339                                 MSCompatUnicodeTableUtil.Cjk);
340                         cjkKOlv2 = CompressArray (cjkKOlv2,
341                                 MSCompatUnicodeTableUtil.Cjk);
342
343                         // Ignorables
344                         Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
345 #if Binary
346                         MemoryStream ms = new MemoryStream ();
347                         BinaryWriter binary = new BinaryWriter (ms);
348                         binary.Write (ignorableFlags.Length);
349 #endif
350                         for (int i = 0; i < ignorableFlags.Length; i++) {
351                                 byte value = ignorableFlags [i];
352                                 if (value < 10)
353                                         Result.Write ("{0},", value);
354                                 else
355                                         Result.Write ("0x{0:X02},", value);
356 #if Binary
357                                 binary.Write (value);
358 #endif
359                                 if ((i & 0xF) == 0xF)
360                                         Result.WriteLine ("// {0:X04}", i - 0xF);
361                         }
362                         Result.WriteLine ("};");
363                         Result.WriteLine ();
364
365                         // Primary category
366                         Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
367 #if Binary
368                         binary.Write (categories.Length);
369 #endif
370                         for (int i = 0; i < categories.Length; i++) {
371                                 byte value = categories [i];
372                                 if (value < 10)
373                                         Result.Write ("{0},", value);
374                                 else
375                                         Result.Write ("0x{0:X02},", value);
376 #if Binary
377                                 binary.Write (value);
378 #endif
379                                 if ((i & 0xF) == 0xF)
380                                         Result.WriteLine ("// {0:X04}", i - 0xF);
381                         }
382                         Result.WriteLine ("};");
383                         Result.WriteLine ();
384
385                         // Primary weight value
386                         Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
387 #if Binary
388                         binary.Write (level1.Length);
389 #endif
390                         for (int i = 0; i < level1.Length; i++) {
391                                 byte value = level1 [i];
392                                 if (value < 10)
393                                         Result.Write ("{0},", value);
394                                 else
395                                         Result.Write ("0x{0:X02},", value);
396 #if Binary
397                                 binary.Write (value);
398 #endif
399                                 if ((i & 0xF) == 0xF)
400                                         Result.WriteLine ("// {0:X04}", i - 0xF);
401                         }
402                         Result.WriteLine ("};");
403                         Result.WriteLine ();
404
405                         // Secondary weight
406                         Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
407 #if Binary
408                         binary.Write (level2.Length);
409 #endif
410                         for (int i = 0; i < level2.Length; i++) {
411                                 byte value = level2 [i];
412                                 if (value < 10)
413                                         Result.Write ("{0},", value);
414                                 else
415                                         Result.Write ("0x{0:X02},", value);
416 #if Binary
417                                 binary.Write (value);
418 #endif
419                                 if ((i & 0xF) == 0xF)
420                                         Result.WriteLine ("// {0:X04}", i - 0xF);
421                         }
422                         Result.WriteLine ("};");
423                         Result.WriteLine ();
424
425                         // Thirtiary weight
426                         Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
427 #if Binary
428                         binary.Write (level3.Length);
429 #endif
430                         for (int i = 0; i < level3.Length; i++) {
431                                 byte value = level3 [i];
432                                 if (value < 10)
433                                         Result.Write ("{0},", value);
434                                 else
435                                         Result.Write ("0x{0:X02},", value);
436 #if Binary
437                                 binary.Write (value);
438 #endif
439                                 if ((i & 0xF) == 0xF)
440                                         Result.WriteLine ("// {0:X04}", i - 0xF);
441                         }
442                         Result.WriteLine ("};");
443                         Result.WriteLine ();
444
445                         // Width insensitivity mappings
446                         // (for now it is more lightweight than dumping the
447                         // entire NFKD table).
448                         Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
449 #if Binary
450                         binary.Write (widthCompat.Length);
451 #endif
452                         for (int i = 0; i < widthCompat.Length; i++) {
453                                 ushort value = widthCompat [i];
454                                 if (value < 10)
455                                         Result.Write ("{0},", value);
456                                 else
457                                         Result.Write ("0x{0:X02},", value);
458 #if Binary
459                                 binary.Write (value);
460 #endif
461                                 if ((i & 0xF) == 0xF)
462                                         Result.WriteLine ("// {0:X04}", i - 0xF);
463                         }
464                         Result.WriteLine ("};");
465                         Result.WriteLine ();
466 #if Binary
467                         using (FileStream fs = File.Create ("../collation.core.bin")) {
468                                 byte [] array = ms.ToArray ();
469                                 fs.Write (array, 0, array.Length);
470                         }
471 #endif
472
473                         // CJK
474                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
475                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
476                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
477                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
478                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
479                 }
480
481                 void SerializeCJK (string name, ushort [] cjk, int max)
482                 {
483                         int offset = 0;//char.MaxValue - cjk.Length;
484                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
485 #if Binary
486                         MemoryStream ms = new MemoryStream ();
487                         BinaryWriter binary = new BinaryWriter (ms);
488                         binary.Write (cjk.Length);
489 #endif
490                         for (int i = 0; i < cjk.Length; i++) {
491                                 if (i + offset == max)
492                                         break;
493                                 ushort value = cjk [i];
494                                 if (value < 10)
495                                         Result.Write ("{0},", value);
496                                 else
497                                         Result.Write ("0x{0:X04},", value);
498 #if Binary
499                                 binary.Write (value);
500 #endif
501                                 if ((i & 0xF) == 0xF)
502                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
503                         }
504                         Result.WriteLine ("};");
505                         Result.WriteLine ();
506 #if Binary
507                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
508                                 byte [] array = ms.ToArray ();
509                                 fs.Write (array, 0, array.Length);
510                         }
511 #endif
512                 }
513
514                 void SerializeCJK (string name, byte [] cjk, int max)
515                 {
516                         int offset = 0;//char.MaxValue - cjk.Length;
517                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
518 #if Binary
519                         MemoryStream ms = new MemoryStream ();
520                         BinaryWriter binary = new BinaryWriter (ms);
521 #endif
522                         for (int i = 0; i < cjk.Length; i++) {
523                                 if (i + offset == max)
524                                         break;
525                                 byte value = cjk [i];
526                                 if (value < 10)
527                                         Result.Write ("{0},", value);
528                                 else
529                                         Result.Write ("0x{0:X02},", value);
530 #if Binary
531                                 binary.Write (value);
532 #endif
533                                 if ((i & 0xF) == 0xF)
534                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
535                         }
536                         Result.WriteLine ("};");
537                         Result.WriteLine ();
538 #if Binary
539                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
540                                 byte [] array = ms.ToArray ();
541                                 fs.Write (array, 0, array.Length);
542                         }
543 #endif
544                 }
545
546                 void SerializeTailorings ()
547                 {
548                         Hashtable indexes = new Hashtable ();
549                         Hashtable counts = new Hashtable ();
550                         Result.WriteLine ("static char [] tailorings = new char [] {");
551                         int count = 0;
552 #if Binary
553                         MemoryStream ms = new MemoryStream ();
554                         BinaryWriter binary = new BinaryWriter (ms);
555 #endif
556                         foreach (Tailoring t in tailorings) {
557                                 if (t.Alias != 0)
558                                         continue;
559                                 Result.Write ("/*{0}*/", t.LCID);
560                                 indexes.Add (t.LCID, count);
561                                 char [] values = t.ItemToCharArray ();
562                                 counts.Add (t.LCID, values.Length);
563                                 foreach (char c in values) {
564                                         Result.Write ("'\\x{0:X}', ", (int) c);
565                                         if (++count % 16 == 0)
566                                                 Result.WriteLine (" // {0:X04}", count - 16);
567 #if Binary
568                                         binary.Write ((ushort) c);
569 #endif
570                                 }
571                         }
572                         Result.WriteLine ("};");
573
574                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
575 #if Binary
576                         byte [] rawdata = ms.ToArray ();
577                         ms = new MemoryStream ();
578                         binary = new BinaryWriter (ms);
579                         binary.Write (tailorings.Count);
580 #endif
581                         foreach (Tailoring t in tailorings) {
582                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
583                                 if (!indexes.ContainsKey (target)) {
584                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
585                                         continue;
586                                 }
587                                 int idx = (int) indexes [target];
588                                 int cnt = (int) counts [target];
589                                 bool french = t.FrenchSort;
590                                 if (t.Alias != 0)
591                                         foreach (Tailoring t2 in tailorings)
592                                                 if (t2.LCID == t.LCID)
593                                                         french = t2.FrenchSort;
594                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
595 #if Binary
596                                 binary.Write (t.LCID);
597                                 binary.Write (idx);
598                                 binary.Write (cnt);
599                                 binary.Write (french);
600 #endif
601                         }
602                         Result.WriteLine ("};");
603 #if Binary
604                         binary.Write ((byte) 0xFF);
605                         binary.Write ((byte) 0xFF);
606                         binary.Write (rawdata.Length / 2);
607                         binary.Write (rawdata, 0, rawdata.Length);
608
609
610                         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
611                                 byte [] array = ms.ToArray ();
612                                 fs.Write (array, 0, array.Length);
613                         }
614 #endif
615                 }
616
617                 #region Parse
618
619                 void ParseSources (string dirname)
620                 {
621                         string unidata =
622                                 dirname + "/UnicodeData.txt";
623                         string derivedCoreProps = 
624                                 dirname + "/DerivedCoreProperties.txt";
625                         string scripts = 
626                                 dirname + "/Scripts.txt";
627                         string cp932 = 
628                                 dirname + "/CP932.TXT";
629                         string derivedAge = 
630                                 dirname + "/DerivedAge.txt";
631                         string chXML = dirname + "/common/collation/zh.xml";
632                         string jaXML = dirname + "/common/collation/ja.xml";
633                         string koXML = dirname + "/common/collation/ko.xml";
634
635                         ParseDerivedAge (derivedAge);
636
637                         FillIgnorables ();
638
639                         ParseJISOrder (cp932); // in prior to ParseUnidata()
640                         ParseUnidata (unidata);
641                         ModifyUnidata ();
642                         ParseDerivedCoreProperties (derivedCoreProps);
643                         ParseScripts (scripts);
644                         ParseCJK (chXML, jaXML, koXML);
645
646                         ParseTailorings ("mono-tailoring-source.txt");
647                 }
648
649                 void ParseTailorings (string filename)
650                 {
651                         Tailoring t = null;
652                         int line = 0;
653                         using (StreamReader sr = new StreamReader (filename)) {
654                                 try {
655                                         while (sr.Peek () >= 0) {
656                                                 line++;
657                                                 ProcessTailoringLine (ref t,
658                                                         sr.ReadLine ().Trim ());
659                                         }
660                                 } catch (Exception) {
661                                         Console.Error.WriteLine ("ERROR at line {0}", line);
662                                         throw;
663                                 }
664                         }
665                 }
666
667                 // For now this is enough.
668                 string ParseTailoringSourceValue (string s)
669                 {
670                         StringBuilder sb = new StringBuilder ();
671                         for (int i = 0; i < s.Length; i++) {
672                                 if (s.StartsWith ("\\u")) {
673                                         sb.Append ((char) int.Parse (
674                                                 s.Substring (2, 4), NumberStyles.HexNumber),
675                                                 1);
676                                         i += 5;
677                                 }
678                         else
679                                 sb.Append (s [i]);
680                         }
681                         return sb.ToString ();
682                 }
683
684                 void ProcessTailoringLine (ref Tailoring t, string s)
685                 {
686                         int idx = s.IndexOf ('#');
687                         if (idx > 0)
688                                 s = s.Substring (0, idx).Trim ();
689                         if (s.Length == 0 || s [0] == '#')
690                                 return;
691                         if (s [0] == '@') {
692                                 idx = s.IndexOf ('=');
693                                 if (idx > 0)
694                                         t = new Tailoring (
695                                                 int.Parse (s.Substring (1, idx - 1)),
696                                                 int.Parse (s.Substring (idx + 1)));
697                                 else
698                                         t = new Tailoring (int.Parse (s.Substring (1)));
699                                 tailorings.Add (t);
700                                 return;
701                         }
702                         if (s.StartsWith ("*FrenchSort")) {
703                                 t.FrenchSort = true;
704                                 return;
705                         }
706                         string d = "*Diacritical";
707                         if (s.StartsWith (d)) {
708                                 idx = s.IndexOf ("->");
709                                 t.AddDiacriticalMap (
710                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
711                                                 NumberStyles.HexNumber),
712                                         byte.Parse (s.Substring (idx + 2).Trim (),
713                                                 NumberStyles.HexNumber));
714                                 return;
715                         }
716                         idx = s.IndexOf (':');
717                         if (idx > 0) {
718                                 string source = s.Substring (0, idx).Trim ();
719                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
720                                 byte [] b = new byte [4];
721                                 for (int i = 0; i < 4; i++) {
722                                         if (l [i] == "*")
723                                                 b [i] = 0;
724                                         else
725                                                 b [i] = byte.Parse (l [i],
726                                                         NumberStyles.HexNumber);
727                                 }
728                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
729                                         b);
730                         }
731                         idx = s.IndexOf ('=');
732                         if (idx > 0)
733                                 t.AddReplacementMap (
734                                         ParseTailoringSourceValue (
735                                                 s.Substring (0, idx).Trim ()),
736                                         ParseTailoringSourceValue (
737                                                 s.Substring (idx + 1).Trim ()));
738                 }
739
740                 void ParseDerivedAge (string filename)
741                 {
742                         using (StreamReader file =
743                                 new StreamReader (filename)) {
744                                 while (file.Peek () >= 0) {
745                                         string s = file.ReadLine ();
746                                         int idx = s.IndexOf ('#');
747                                         if (idx >= 0)
748                                                 s = s.Substring (0, idx);
749                                         idx = s.IndexOf (';');
750                                         if (idx < 0)
751                                                 continue;
752
753                                         string cpspec = s.Substring (0, idx);
754                                         idx = cpspec.IndexOf ("..");
755                                         NumberStyles nf = NumberStyles.HexNumber |
756                                                 NumberStyles.AllowTrailingWhite;
757                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
758                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
759                                         string value = s.Substring (cpspec.Length + 1).Trim ();
760
761                                         // FIXME: use index
762                                         if (cp > char.MaxValue)
763                                                 continue;
764
765                                         double v = double.Parse (value);
766                                         for (int i = cp; i <= cpEnd; i++)
767                                                 unicodeAge [i] = v;
768                                 }
769                         }
770                         unicodeAge [0] = double.MaxValue; // never be supported
771                 }
772
773                 void ParseUnidata (string filename)
774                 {
775                         ArrayList decompValues = new ArrayList ();
776                         using (StreamReader unidata =
777                                 new StreamReader (filename)) {
778                                 for (int line = 1; unidata.Peek () >= 0; line++) {
779                                         try {
780                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
781                                         } catch (Exception) {
782                                                 Console.Error.WriteLine ("**** At line " + line);
783                                                 throw;
784                                         }
785                                 }
786                         }
787                         this.decompValues = (int [])
788                                 decompValues.ToArray (typeof (int));
789                 }
790
791                 char previousLatinTarget = char.MinValue; // Letter most recently chosen as the Latin target from a character name in ProcessUnidataLine (stored as MinValue when that letter is 'Z'); reused as the target for LATIN entries that yield no target of their own.
792                 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; // One counter per letter, indexed by (letter - 'A'); ProcessUnidataLine increments it and assigns (counter + 0x7C) as the diacritical weight of a mapped character that has none yet.
793
794                 void ProcessUnidataLine (string s, ArrayList decompValues)
795                 {
796                         int idx = s.IndexOf ('#');
797                         if (idx >= 0)
798                                 s = s.Substring (0, idx);
799                         idx = s.IndexOf (';');
800                         if (idx < 0)
801                                 return;
802                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
803                         string [] values = s.Substring (idx + 1).Split (';');
804
805                         // FIXME: use index
806                         if (cp > char.MaxValue)
807                                 return;
808                         if (IsIgnorable (cp))
809                                 return;
810
811                         string name = values [0];
812
813                         // SPECIAL CASE: rename some characters for diacritical
814                         // remapping. FIXME: why are they different?
815                         // FIXME: it's still not working.
816                         if (cp == 0x018B || cp == 0x018C)
817                                 name = name.Replace ("TOPBAR", "STROKE");
818
819                         // isSmallCapital
820                         if (s.IndexOf ("SMALL CAPITAL") > 0)
821                                 isSmallCapital [cp] = true;
822
823                         // latin mapping by character name
824                         if (s.IndexOf ("LATIN") >= 0) {
825                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
826                                 int offset = lidx + 15;
827                                 if (lidx < 0) {
828                                         lidx = s.IndexOf ("LETTER TURNED ");
829                                         offset = lidx + 14;
830                                 }
831                                 if (lidx < 0) {
832                                         lidx = s.IndexOf ("LETTER CAPITAL ");
833                                         offset = lidx + 15;
834                                 }
835                                 if (lidx < 0) {
836                                         lidx = s.IndexOf ("LETTER SCRIPT ");
837                                         offset = lidx + 14;
838                                 }
839                                 if (lidx < 0) {
840                                         lidx = s.IndexOf ("LETTER ");
841                                         offset = lidx + 7;
842                                 }
843                                 char c = lidx > 0 ? s [offset] : char.MinValue;
844                                 char n = s [offset + 1];
845                                 char target = char.MinValue;
846                                 if ('A' <= c && c <= 'Z' &&
847                                         (n == ' ') || n == ';') {
848                                         target = c;
849                                         // FIXME: After 'Z', I cannot reset this state.
850                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
851                                 }
852
853                                 if (s.Substring (offset).StartsWith ("ALPHA"))
854                                         target = 'A';
855                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
856                                         target = 'B';
857                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
858                                         target = 'C';
859                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
860                                         target = 'E';
861                                 else if (s.Substring (offset).StartsWith ("ENG"))
862                                         target = 'N';
863                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
864                                         target = 'O';
865                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
866                                         target = 'R';
867                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
868                                         target = 'S';
869                                 else if (s.Substring (offset).StartsWith ("ESH"))
870                                         target = 'S';
871
872                                 // For remaining IPA chars, direct mapping is
873                                 // much faster.
874                                 switch (cp) {
875                                 case 0x0299: target = 'B'; break;
876                                 case 0x029A: target = 'E'; break;
877                                 case 0x029B: target = 'G'; break;
878                                 case 0x029C: target = 'H'; break;
879                                 case 0x029D: target = 'J'; break;
880                                 case 0x029E: target = 'K'; break;
881                                 case 0x029F: target = 'L'; break;
882                                 case 0x02A0: target = 'Q'; break;
883                                 case 0x02A7: target = 'T'; break;
884                                 case 0x02A8: target = 'T'; break;
885                                 }
886
887                                 if (target == char.MinValue)
888                                         target = previousLatinTarget;
889
890                                 if (target != char.MinValue) {
891                                         ArrayList entry = (ArrayList) latinMap [target];
892                                         if (entry == null) {
893                                                 entry = new ArrayList ();
894                                                 latinMap [target] = entry;
895                                         }
896                                         entry.Add (cp);
897                                         // FIXME: This secondary weight is hack.
898                                         // They are here because they must not
899                                         // be identical to the corresponding
900                                         // ASCII latins.
901                                         if (c != target && diacritical [cp] == 0) {
902                                                 diacriticalOffset [c - 'A']++;
903                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
904                                         }
905                                 }
906                         }
907
908                         // Arrow names
909                         if (0x2000 <= cp && cp < 0x3000) {
910                                 int value = 0;
911                                 // SPECIAL CASES. FIXME: why?
912                                 switch (cp) {
913                                 case 0x21C5: value = -1; break; // E2
914                                 case 0x261D: value = 1; break;
915                                 case 0x27A6: value = 3; break;
916                                 case 0x21B0: value = 7; break;
917                                 case 0x21B1: value = 3; break;
918                                 case 0x21B2: value = 7; break;
919                                 case 0x21B4: value = 5; break;
920                                 case 0x21B5: value = 7; break;
921                                 case 0x21B9: value = -1; break; // E1
922                                 case 0x21CF: value = 7; break;
923                                 case 0x21D0: value = 3; break;
924                                 }
925                                 string [] arrowTargets = new string [] {
926                                         "",
927                                         "UPWARDS",
928                                         "NORTH EAST",
929                                         "RIGHTWARDS",
930                                         "SOUTH EAST",
931                                         "DOWNWARDS",
932                                         "SOUTH WEST",
933                                         "LEFTWARDS",
934                                         "NORTH WEST",
935                                         };
936                                 if (value == 0)
937                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
938                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
939                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
940                                                         s.IndexOf (" OVER") < 0
941                                                 )
942                                                         value = i;
943                                 if (value > 0)
944                                         arrowValues.Add (new DictionaryEntry (
945                                                 cp, value));
946                         }
947
948                         // Box names
949                         if (0x2500 <= cp && cp < 0x2600) {
950                                 int value = 0;
951                                 // flags:
952                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
953                                 // [h,rl] [r] [l]
954                                 // [v,ud] [u] [d]
955                                 // [dr] [dl] [ur] [ul]
956                                 // [vr,udr] [vl,vdl]
957                                 // [hd,rld] [hu,rlu]
958                                 // [hv,udrl,rlv,udh]
959                                 ArrayList flags = new ArrayList (new int [] {
960                                         32, 8 + 4, 8, 4,
961                                         16, 1 + 2, 1, 2,
962                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
963                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
964                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
965                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
966                                         });
967                                 byte [] offsets = new byte [] {
968                                         0, 0, 1, 2,
969                                         3, 3, 4, 5,
970                                         6, 7, 8, 9,
971                                         10, 10, 11, 11,
972                                         12, 12, 13, 13,
973                                         14, 14, 14, 14};
974                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
975                                         int flag = 0;
976                                         if (s.IndexOf (" UP") >= 0)
977                                                 flag |= 1;
978                                         if (s.IndexOf (" DOWN") >= 0)
979                                                 flag |= 2;
980                                         if (s.IndexOf (" RIGHT") >= 0)
981                                                 flag |= 4;
982                                         if (s.IndexOf (" LEFT") >= 0)
983                                                 flag |= 8;
984                                         if (s.IndexOf (" VERTICAL") >= 0)
985                                                 flag |= 16;
986                                         if (s.IndexOf (" HORIZONTAL") >= 0)
987                                                 flag |= 32;
988
989                                         int fidx = flags.IndexOf (flag);
990                                         value = fidx < 0 ? fidx : offsets [fidx];
991                                 } else if (s.IndexOf ("BLOCK") >= 0) {
992                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
993                                                 value = 0x12;
994                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
995                                                 value = 0x13;
996                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
997                                                 value = 0x14;
998                                         else if (s.IndexOf ("HALF") >= 0)
999                                                 value = 0x15;
1000                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1001                                                 value = 0x16;
1002                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
1003                                                 value = 0x17;
1004                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1005                                                 value = 0x18;
1006                                         else
1007                                                 value = 0x19;
1008                                 }
1009                                 else if (s.IndexOf ("SHADE") >= 0)
1010                                         value = 0x19;
1011                                 else if (s.IndexOf ("SQUARE") >= 0)
1012                                         value = 0xBC - 0xE5;
1013                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1014                                         value = 0xBE - 0xE5;
1015                                 else if (s.IndexOf ("RECTANGLE") >= 0)
1016                                         value = 0xBD - 0xE5;
1017                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1018                                         value = 0xBF - 0xE5;
1019                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
1020                                         if (s.IndexOf ("UP-POINTING") >= 0)
1021                                                 value = 0xC0 - 0xE5;
1022                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1023                                                 value = 0xC1 - 0xE5;
1024                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1025                                                 value = 0xC2 - 0xE5;
1026                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1027                                                 value = 0xC3 - 0xE5;
1028                                 }
1029                                 else if (s.IndexOf ("POINTER") >= 0) {
1030                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1031                                                 value = 0xC4 - 0xE5;
1032                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1033                                                 value = 0xC5 - 0xE5;
1034                                 }
1035                                 else if (s.IndexOf ("DIAMOND") >= 0)
1036                                         value = 0xC6 - 0xE5;
1037                                 else if (s.IndexOf ("FISHEYE") >= 0)
1038                                         value = 0xC7 - 0xE5;
1039                                 else if (s.IndexOf ("LOZENGE") >= 0)
1040                                         value = 0xC8 - 0xE5;
1041                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1042                                         value = 0xC9 - 0xE5;
1043                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1044                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1045                                                 value = 0xCA - 0xE5;
1046                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1047                                                 value = 0xCB - 0xE5;
1048                                         else
1049                                                 value = 0xC9 - 0xE5;
1050                                 }
1051                                 if (0x25DA <= cp && cp <= 0x25E5)
1052                                         value = 0xCD + cp - 0x25DA - 0xE5;
1053
1054                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1055                                 switch (cp) {
1056                                 case 0x2571: value = 0xF; break;
1057                                 case 0x2572: value = 0x10; break;
1058                                 case 0x2573: value = 0x11; break;
1059                                 }
1060                                 if (value != 0)
1061                                         boxValues.Add (new DictionaryEntry (
1062                                                 cp, value));
1063                         }
1064
1065                         // For some characters store the name and sort later
1066                         // to determine sorting.
1067                         if (0x2100 <= cp && cp <= 0x213F &&
1068                                 Char.IsSymbol ((char) cp))
1069                                 sortableCharNames.Add (
1070                                         new DictionaryEntry (cp, name));
1071                         else if (0x3380 <= cp && cp <= 0x33DD)
1072                                 sortableCharNames.Add (new DictionaryEntry (
1073                                         cp, name.Substring (7)));
1074
1075                         if (Char.GetUnicodeCategory ((char) cp) ==
1076                                 UnicodeCategory.MathSymbol) {
1077                                 if (name.StartsWith ("CIRCLED "))
1078                                         diacritical [cp] = 0xEE;
1079                                 if (name.StartsWith ("SQUARED "))
1080                                         diacritical [cp] = 0xEF;
1081                         }
1082
1083                         // diacritical weights by character name
1084 if (diacritics.Length != diacriticWeights.Length)
1085 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1086                         for (int d = 0; d < diacritics.Length; d++) {
1087                                 if (s.IndexOf (diacritics [d]) > 0) {
1088                                         diacritical [cp] += diacriticWeights [d];
1089                                         if (s.IndexOf ("COMBINING") >= 0)
1090                                                 diacritical [cp] -= (byte) 2;
1091                                         continue;
1092                                 }
1093                                 // also process "COMBINING blah" here
1094                                 // For now it is limited to cp < 0x0370
1095 //                              if (cp < 0x0300 || cp >= 0x0370)
1096 //                                      continue;
1097                                 string tmp = diacritics [d].TrimEnd (';');
1098                                 if (tmp.IndexOf ("WITH ") == 0)
1099                                         tmp = tmp.Substring (4);
1100                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1101                                 if (name == tmp) {
1102                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1103                                         break;
1104                                 }
1105 //if (name == tmp)
1106 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1107                         }
1108                         // Two-step grep required for it.
1109                         if (s.IndexOf ("FULL STOP") > 0 &&
1110                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1111                                 diacritical [cp] |= 0xF4;
1112                         if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1113                                 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1114                                         s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1115
1116                         // Arabic letter name
1117                         if (0x0621 <= cp && cp <= 0x064A &&
1118                                 Char.GetUnicodeCategory ((char) cp)
1119                                 == UnicodeCategory.OtherLetter) {
1120                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1121                                 switch (cp) {
1122                                 case 0x0621:
1123                                 case 0x0624:
1124                                 case 0x0626:
1125                                         // hamza, waw, yeh ... special cases.
1126                                         value = 0x07;
1127                                         break;
1128                                 case 0x0649:
1129                                 case 0x064A:
1130                                         value = 0x77; // special cases.
1131                                         break;
1132                                 default:
1133                                         // Get primary letter name i.e.
1134                                         // XXX part of ARABIC LETTER XXX yyy
1135                                         // e.g. that of "TEH MARBUTA" is "TEH".
1136                                         string letterName =
1137                                                 (cp == 0x0640) ?
1138                                                 // 0x0640 is special: it does
1139                                                 // not start with ARABIC LETTER
1140                                                 name :
1141                                                 name.Substring (14);
1142                                         int tmpIdx = letterName.IndexOf (' ');
1143                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1144 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1145                                         if (arabicNameMap.ContainsKey (letterName))
1146                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1147                                         else
1148                                                 arabicNameMap [letterName] = cp;
1149                                         break;
1150                                 }
1151                                 arabicLetterPrimaryValues [cp] = value;
1152                         }
1153
1154                         // Japanese square letter
1155                         if (0x3300 <= cp && cp <= 0x3357)
1156                                 if (!ExistsJIS (cp))
1157                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1158
1159                         // normalizationType
1160                         string decomp = values [4];
1161                         idx = decomp.IndexOf ('<');
1162                         if (idx >= 0) {
1163                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1164                                 case "full":
1165                                         decompType [cp] = DecompositionFull;
1166                                         break;
1167                                 case "sub":
1168                                         decompType [cp] = DecompositionSub;
1169                                         break;
1170                                 case "super":
1171                                         decompType [cp] = DecompositionSuper;
1172                                         break;
1173                                 case "small":
1174                                         decompType [cp] = DecompositionSmall;
1175                                         break;
1176                                 case "isolated":
1177                                         decompType [cp] = DecompositionIsolated;
1178                                         break;
1179                                 case "initial":
1180                                         decompType [cp] = DecompositionInitial;
1181                                         break;
1182                                 case "final":
1183                                         decompType [cp] = DecompositionFinal;
1184                                         break;
1185                                 case "medial":
1186                                         decompType [cp] = DecompositionMedial;
1187                                         break;
1188                                 case "noBreak":
1189                                         decompType [cp] = DecompositionNoBreak;
1190                                         break;
1191                                 case "compat":
1192                                         decompType [cp] = DecompositionCompat;
1193                                         break;
1194                                 case "fraction":
1195                                         decompType [cp] = DecompositionFraction;
1196                                         break;
1197                                 case "font":
1198                                         decompType [cp] = DecompositionFont;
1199                                         break;
1200                                 case "circle":
1201                                         decompType [cp] = DecompositionCircle;
1202                                         break;
1203                                 case "square":
1204                                         decompType [cp] = DecompositionSquare;
1205                                         break;
1206                                 case "wide":
1207                                         decompType [cp] = DecompositionWide;
1208                                         break;
1209                                 case "narrow":
1210                                         decompType [cp] = DecompositionNarrow;
1211                                         break;
1212                                 case "vertical":
1213                                         decompType [cp] = DecompositionVertical;
1214                                         break;
1215                                 default:
1216                                         throw new Exception ("Support NFKD type : " + decomp);
1217                                 }
1218                         }
1219                         else
1220                                 decompType [cp] = DecompositionCanonical;
1221                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1222                         if (decomp.Length > 0) {
1223
1224                                 string [] velems = decomp.Split (' ');
1225                                 int didx = decompValues.Count;
1226                                 decompIndex [cp] = didx;
1227                                 foreach (string v in velems)
1228                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1229                                 decompLength [cp] = velems.Length;
1230
1231                                 // [decmpType] -> this_cp
1232                                 int targetCP = (int) decompValues [didx];
1233                                 // for "(x)" it specially maps to 'x' .
1234                                 // FIXME: check if it is sane
1235                                 if (velems.Length == 3 &&
1236                                         (int) decompValues [didx] == '(' &&
1237                                         (int) decompValues [didx + 2] == ')')
1238                                         targetCP = (int) decompValues [didx + 1];
1239                                 // special: 0x215F "1/"
1240                                 else if (cp == 0x215F)
1241                                         targetCP = '1';
1242                                 else if (velems.Length > 1 &&
1243                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1244                                         // skip them, except for CJK ideograph compat
1245                                         targetCP = 0;
1246
1247                                 if (targetCP != 0) {
1248                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1249                                         if (entry == null) {
1250                                                 entry = new Hashtable ();
1251                                                 nfkdMap [targetCP] = entry;
1252                                         }
1253                                         entry [(byte) decompType [cp]] = cp;
1254                                 }
1255                         }
1256                         // numeric values
1257                         if (values [5].Length > 0)
1258                                 decimalValue [cp] = decimal.Parse (values [5]);
1259                         else if (values [6].Length > 0)
1260                                 decimalValue [cp] = decimal.Parse (values [6]);
1261                         else if (values [7].Length > 0) {
1262                                 string decstr = values [7];
1263                                 idx = decstr.IndexOf ('/');
1264                                 if (cp == 0x215F) // special. "1/"
1265                                         decimalValue [cp] = 0x1;
1266                                 else if (idx > 0)
1267                                         // m/n
1268                                         decimalValue [cp] = 
1269                                                 decimal.Parse (decstr.Substring (0, idx))
1270                                                 / decimal.Parse (decstr.Substring (idx + 1));
1271                                 else if (decstr [0] == '(' &&
1272                                         decstr [decstr.Length - 1] == ')')
1273                                         // (n)
1274                                         decimalValue [cp] =
1275                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1276                                 else if (decstr [decstr.Length - 1] == '.')
1277                                         // n.
1278                                         decimalValue [cp] =
1279                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1280                                 else
1281                                         decimalValue [cp] = decimal.Parse (decstr);
1282                         }
1283                 }
1284
1285                 void ParseDerivedCoreProperties (string filename)
1286                 {
                             // Reads the given file line by line and hands every
                             // line to ProcessDerivedCorePropLine (IsUppercase data).
                             // When a line fails, its 1-based number is written to
                             // stderr and the exception is rethrown.
                             using (StreamReader reader = new StreamReader (filename)) {
                                     int lineNumber = 1;
                                     while (reader.Peek () >= 0) {
                                             try {
                                                     string text = reader.ReadLine ();
                                                     ProcessDerivedCorePropLine (text);
                                             } catch (Exception) {
                                                     Console.Error.WriteLine ("**** At line " + lineNumber);
                                                     throw;
                                             }
                                             lineNumber++;
                                     }
                             }
1299                 }
1300
1301                 void ProcessDerivedCorePropLine (string s)
1302                 {
                             // Handles one "XXXX[..YYYY] ; Property # comment" line.
                             // Only the "Uppercase" property is recorded (into
                             // isUppercase). Lines without ';' and entries whose
                             // first code point is above U+FFFF are skipped.
                             int hashPos = s.IndexOf ('#');
                             string content = hashPos < 0 ? s : s.Substring (0, hashPos);
                             int semicolon = content.IndexOf (';');
                             if (semicolon < 0)
                                     return;

                             // Left of ';' is a single code point or a "first..last" range.
                             string range = content.Substring (0, semicolon);
                             NumberStyles hex = NumberStyles.HexNumber |
                                     NumberStyles.AllowTrailingWhite;
                             int dots = range.IndexOf ("..");
                             int first, last;
                             if (dots < 0) {
                                     first = int.Parse (range, hex);
                                     last = first;
                             } else {
                                     first = int.Parse (range.Substring (0, dots), hex);
                                     last = int.Parse (range.Substring (dots + 2), hex);
                             }
                             string property = content.Substring (semicolon + 1).Trim ();

                             // FIXME: use index
                             if (first > char.MaxValue)
                                     return;
                             if (property != "Uppercase")
                                     return;

                             for (int code = first; code <= last; code++)
                                     isUppercase [code] = true;
1327                 }
1328
1329                 void ParseScripts (string filename)
1330                 {
                             // Collects the non-ignorable characters (U+FFFF and
                             // below) that the given file assigns to the Gurmukhi,
                             // Gujarati, Georgian and Thaana scripts, sorts each set
                             // with UCAComparer and stores the results in the
                             // ordered* arrays.
                             ArrayList gurmukhiChars = new ArrayList ();
                             ArrayList gujaratiChars = new ArrayList ();
                             ArrayList georgianChars = new ArrayList ();
                             ArrayList thaanaChars = new ArrayList ();

                             using (StreamReader reader = new StreamReader (filename)) {
                                     while (reader.Peek () >= 0) {
                                             // Strip the trailing comment, then split at ';'.
                                             string text = reader.ReadLine ();
                                             int hashPos = text.IndexOf ('#');
                                             if (hashPos >= 0)
                                                     text = text.Substring (0, hashPos);
                                             int semicolon = text.IndexOf (';');
                                             if (semicolon < 0)
                                                     continue;

                                             // Left of ';' is a code point or a "first..last" range.
                                             string range = text.Substring (0, semicolon);
                                             NumberStyles hex = NumberStyles.HexNumber |
                                                     NumberStyles.AllowTrailingWhite;
                                             int dots = range.IndexOf ("..");
                                             int first, last;
                                             if (dots < 0) {
                                                     first = int.Parse (range, hex);
                                                     last = first;
                                             } else {
                                                     first = int.Parse (range.Substring (0, dots), hex);
                                                     last = int.Parse (range.Substring (dots + 2), hex);
                                             }
                                             string script = text.Substring (semicolon + 1).Trim ();

                                             // FIXME: use index
                                             if (first > char.MaxValue)
                                                     continue;

                                             // Pick the list for this script; other scripts are skipped.
                                             ArrayList target;
                                             switch (script) {
                                             case "Gurmukhi": target = gurmukhiChars; break;
                                             case "Gujarati": target = gujaratiChars; break;
                                             case "Georgian": target = georgianChars; break;
                                             case "Thaana": target = thaanaChars; break;
                                             default: continue;
                                             }
                                             for (int code = first; code <= last; code++)
                                                     if (!IsIgnorable (code))
                                                             target.Add ((char) code);
                                     }
                             }

                             gurmukhiChars.Sort (UCAComparer.Instance);
                             gujaratiChars.Sort (UCAComparer.Instance);
                             georgianChars.Sort (UCAComparer.Instance);
                             thaanaChars.Sort (UCAComparer.Instance);
                             orderedGurmukhi = (char []) gurmukhiChars.ToArray (typeof (char));
                             orderedGujarati = (char []) gujaratiChars.ToArray (typeof (char));
                             orderedGeorgian = (char []) georgianChars.ToArray (typeof (char));
                             orderedThaana = (char []) thaanaChars.ToArray (typeof (char));
1391                 }
1392
1393                 void ParseJISOrder (string filename)
1394                 {
                             // Feeds every line of the given file to
                             // ProcessJISOrderLine. On any failure the 1-based
                             // number of the line being processed is written to
                             // stderr and the exception is rethrown.
                             int lineNumber = 1;
                             try {
                                     using (StreamReader reader = new StreamReader (filename)) {
                                             while (reader.Peek () >= 0) {
                                                     ProcessJISOrderLine (reader.ReadLine ());
                                                     lineNumber++;
                                             }
                                     }
                             } catch (Exception) {
                                     Console.Error.WriteLine ("---- line {0}", lineNumber);
                                     throw;
                             }
1406                 }
1407
1408                 char [] ws = new char [] {'\t', ' '};
1409
1410                 void ProcessJISOrderLine (string s)
1411                 {
                             // Parses one "0xJJJJ<ws>0xUUUU" mapping line (JIS code,
                             // then Unicode code point, both hexadecimal) and appends
                             // it to jisJapanese. Everything from '#' on is a comment.
                             // Lines that are blank, or that hold only one column
                             // once the comment is removed, are skipped.
1412                         int idx = s.IndexOf ('#');
1413                         if (idx >= 0)
                                     s = s.Substring (0, idx);
                             // Trim even when the line has no comment. Otherwise a
                             // whitespace-only or indented line makes IndexOfAny
                             // return 0 and Substring (2, idx - 2) below throws.
                             s = s.Trim ();
1415                         if (s.Length == 0)
1416                                 return;
1417                         idx = s.IndexOfAny (ws);
1418                         if (idx < 0)
1419                                 return;
1420                         // They start with "0x" so cut them out.
1421                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1422                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1423                         jisJapanese.Add (new JISCharacter (cp, jis));
1424                 }
1425
1426                 void ParseCJK (string zhXML, string jaXML, string koXML)
1427                 {
                             // Fills the per-culture CJK weight tables cjkCHS, cjkCHT,
                             // cjkJA and cjkKO (plus cjkKOlv2), each indexed by code point.
                             //   zhXML - LDML file; its 'pinyin' rules feed cjkCHS and
                             //           its 'stroke' rules feed cjkCHT.
                             //   jaXML - not read by this method; the Japanese order is
                             //           taken from jisJapanese instead.
                             //   koXML - LDML file whose reset rules feed cjkKO/cjkKOlv2.
1428                         XmlDocument doc = new XmlDocument ();
                             // No resolver: external resources referenced by the XML
                             // are not fetched while loading.
1429                         doc.XmlResolver = null;
1430                         int v;
1431                         string s;
1432                         string category;
1433                         int offset;
1434                         ushort [] arr;
1435
1436                         // Chinese Simplified
                             // Characters listed in the pinyin rule get consecutive
                             // values starting at 0x8008, in the order they appear.
1437                         category = "chs";
1438                         arr = cjkCHS;
                             // offset is always 0 here, so arrays are indexed by the
                             // code point itself.
1439                         offset = 0;//char.MaxValue - arr.Length;
1440                         doc.Load (zhXML);
1441                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1442                         v = 0x8008;
1443                         foreach (char c in s) {
                                     // Characters below U+3100 are reported and skipped
                                     // without consuming a value.
1444                                 if (c < '\u3100')
1445                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1446                                 else {
1447                                         arr [(int) c - offset] = (ushort) v++;
                                             // Never hand out a value whose low byte is 00 or 01.
                                             // NOTE(review): presumably those bytes are reserved
                                             // in sort keys - confirm.
1448                                         if (v % 256 == 0)
1449                                                 v += 2;
1450                                 }
1451                         }
1452
1453                         // Chinese Traditional
                             // Same scheme as above, using the stroke rule of the
                             // document already loaded from zhXML; values start at
                             // 0x8002 and characters below U+4E00 are skipped.
1454                         category = "cht";
1455                         arr = cjkCHT;
1456                         offset = 0;//char.MaxValue - arr.Length;
1457                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1458                         v = 0x8002;
1459                         foreach (char c in s) {
1460                                 if (c < '\u4E00')
1461                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1462                                 else {
1463                                         arr [(int) c - offset] = (ushort) v++;
1464                                         if (v % 256 == 0)
1465                                                 v += 2;
1466                                 }
1467                         }
1468
1469                         // Japanese
                             // Order comes from the entries of jisJapanese rather
                             // than from an XML file.
1470                         category = "ja";
1471                         arr = cjkJA;
1472                         offset = 0;//char.MaxValue - arr.Length;
1473
1474                         // SPECIAL CASES
                             // These five get fixed values below the 0x8008 start of
                             // the running counter.
1475                         arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1476                         arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1477                         arr [0x337E] = 0x8005;
1478                         arr [0x337D] = 0x8006;
1479                         arr [0x337C] = 0x8007;
1480
1481                         v = 0x8008;
1482                         foreach (JISCharacter jc in jisJapanese) {
                                     // Only entries with a JIS code of 0x8800 or above are used.
                                     // NOTE(review): presumably the start of the kanji area - confirm.
1483                                 if (jc.JIS < 0x8800)
1484                                         continue;
1485                                 char c = (char) jc.CP;
1486
                                     // Characters below U+4E00 are skipped silently.
1487                                 if (c < '\u4E00')
1488                                         // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1489                                         continue;
1490                                 else {
1491                                         arr [(int) c - offset] = (ushort) v++;
1492                                         if (v % 256 == 0)
1493                                                 v += 2;
1494
1495                                         // SPECIAL CASES:
                                             // c itself has received its value above; for these
                                             // characters the scan below is skipped (the trailing
                                             // comments name the related compatibility character).
1496                                         if (c == '\u662D') // U+337C
1497                                                 continue;
1498                                         if (c == '\u5927') // U+337D
1499                                                 continue;
1500                                         if (c == '\u5E73') // U+337B
1501                                                 continue;
1502                                         if (c == '\u660E') // U+337E
1503                                                 continue;
1504                                         if (c == '\u9686') // U+F9DC
1505                                                 continue;
1506
1507                                         // FIXME: there are still remaining
1508                                         // characters after U+FA0C.
                                             // Scan every code point below U+FA0D and give the next
                                             // values to those related to c through decomposition.
                                             // This full scan runs once per accepted JIS entry.
1509 //                                      for (int k = 0; k < char.MaxValue; k++) {
1510                                         for (int k = 0; k < '\uFA0D'; k++) {
                                                     // decompIndex 0 is treated as "no decomposition".
1511                                                 if (decompIndex [k] == 0 || IsIgnorable (k))
1512                                                         continue;
                                                     // Matches when the first decomposition element is c,
                                                     // or when the decomposition has three elements and
                                                     // the second one is c (&& binds tighter than ||).
1513                                                 if (decompValues [decompIndex [k]] == c /*&&
1514                                                         decompLength [k] == 1*/ ||
1515                                                         decompLength [k] == 3 &&
1516                                                         decompValues [decompIndex [k] + 1] == c) {
1517                                                         arr [k - offset] = (ushort) v++;
1518                                                         if (v % 256 == 0)
1519                                                                 v += 2;
1520                                                 }
1521                                         }
1522                                 }
1523                         }
1524
1525                         // Korean
1526                         // Korean weight is somewhat complex. It first shifts
1527                         // Hangul category from 52-x to 80-x (they are anyways
1528                         // computed). CJK ideographs are placed at secondary
1529                         // weight, like XX YY 01 zz 01, where XX and YY are
1530                         // corresponding "reset" value and zz is 41,43,45...
1531                         //
1532                         // Unlike chs,cht and ja, Korean value is a combined
1533                         // ushort which is computed as category
1534                         //
1535                         category = "ko";
1536                         arr = cjkKO;
1537                         offset = 0;//char.MaxValue - arr.Length;
1538                         doc.Load (koXML);
1539                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                                     // The node right after each <reset> is expected to be
                                     // an element listing the characters placed after it.
1540                                 XmlElement sc = (XmlElement) reset.NextSibling;
1541                                 // compute "category" and "level 1" for the
1542                                 // target "reset" Hangul syllable
1543                                 char rc = reset.InnerText [0];
                                     // ri: 1-based offset of the syllable from U+AC00.
                                     // p packs ri / 254 into the high byte and
                                     // ri % 254 + 2 (i.e. 2..255) into the low byte.
1544                                 int ri = ((int) rc - 0xAC00) + 1;
1545                                 ushort p = (ushort)
1546                                         ((ri / 254) * 256 + (ri % 254) + 2);
1547                                 // Place the characters after the target.
                                     // All of them share p in cjkKO; cjkKOlv2 tells them
                                     // apart with 0x41, 0x43, 0x45, ... in listing order.
1548                                 s = sc.InnerText;
1549                                 v = 0x41;
1550                                 foreach (char c in s) {
1551                                         arr [(int) c - offset] = p;
1552                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1553                                         v += 2;
1554                                 }
1555                         }
1556                 }
1557
1558                 #endregion
1559
1560                 #region Generation
1561
1562                 void FillIgnorables ()
1563                 {
                             // For every assigned code point up to U+FFFF, records in
                             // ignorableFlags which predicates hold:
                             //   bit 1 - IsIgnorable
                             //   bit 2 - IsIgnorableSymbol
                             //   bit 4 - IsIgnorableNonSpacing
                             // Unassigned code points are left untouched.
                             for (int cp = 0; cp <= char.MaxValue; cp++) {
                                     UnicodeCategory category =
                                             Char.GetUnicodeCategory ((char) cp);
                                     if (category != UnicodeCategory.OtherNotAssigned) {
                                             if (IsIgnorable (cp))
                                                     ignorableFlags [cp] |= 1;
                                             if (IsIgnorableSymbol (cp))
                                                     ignorableFlags [cp] |= 2;
                                             if (IsIgnorableNonSpacing (cp))
                                                     ignorableFlags [cp] |= 4;
                                     }
                             }
1575                 }
1576
1577                 void ModifyUnidata ()
1578                 {
                             // Applies hand-written corrections to the decomposition
                             // tables (decompType / decompIndex / decompLength /
                             // decompValues) and to a few diacritical weights.
                             // Statement order matters where one entry borrows the
                             // storage of another (see U+FA0C below).

1579                         // Modify some decomposition equivalence
                             // U+FE31 and U+FE32 lose their decomposition entirely:
                             // type, index and length are all reset to 0.
1580                         decompType [0xFE31] = 0;
1581                         decompIndex [0xFE31] = 0;
1582                         decompLength [0xFE31] = 0;
1583                         decompType [0xFE32] = 0;
1584                         decompIndex [0xFE32] = 0;
1585                         decompLength [0xFE32] = 0;
1586
1587                         // Korean parens numbers
                             // U+3200..U+321C get diacritical weight 0xA and
                             // U+3260..U+327B get 0xC.
1588                         for (int i = 0x3200; i <= 0x321C; i++)
1589                                 diacritical [i] = 0xA;
1590                         for (int i = 0x3260; i <= 0x327B; i++)
1591                                 diacritical [i] = 0xC;
1592
1593                         // LAMESPEC: these remappings should not be done.
1594                         // Windows has incorrect CJK compat mappings.
                             // Each line overwrites the first decomposition element
                             // of the character. For U+323B and U+3238 the length is
                             // also cut down to a single element.
1595                         decompValues [decompIndex [0x32A9]] = 0x91AB;
1596                         decompLength [0x323B] = 1;
1597                         decompValues [decompIndex [0x323B]] = 0x5B78;
1598                         decompValues [decompIndex [0x32AB]] = 0x5B78;
1599                         decompValues [decompIndex [0x32A2]] = 0x5BEB;
1600                         decompLength [0x3238] = 1;
1601                         decompValues [decompIndex [0x3238]] = 0x52DE;
1602                         decompValues [decompIndex [0x3298]] = 0x52DE;
1603
1604                         // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
                             // U+FA0C takes over the decompValues slot of U+F929 and
                             // is made to decompose to U+5140. Only afterwards is
                             // U+F929's own entry cleared, so these four statements
                             // must stay in this order.
1605                         decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1606                         decompValues [decompIndex [0xFA0C]] = 0x5140;
1607                         decompLength [0xFA0C] = 1;
1608                         decompIndex [0xF929] = decompLength [0xF929] = 0;
1609
                             // First decomposition element of U+F92C becomes U+90DE.
1610                         decompValues [decompIndex [0xF92C]] = 0x90DE;
1611                 }
1612
1613                 void ModifyParsedValues ()
1614                 {
                             // Post-processes parsed data in three steps: fixed
                             // diacritical weights for some Cyrillic letters,
                             // secondary weights for numbers, and renaming or
                             // removal of some entries in sortableCharNames.

                             // Cyrillic diacritical weights. They seem to be based
                             // on old character names, so they are set directly.
                             // Layout: first code point of an adjacent pair,
                             // followed by the weight both code points receive.
                             int [] cyrillicWeights = new int [] {
                                     0x0496, 7, 0x0498, 0x1A, 0x049A, 0x17, 0x049C, 9,
                                     0x049E, 4, 0x04A0, 0xA, 0x04A2, 7, 0x04A4, 8};
                             for (int k = 0; k < cyrillicWeights.Length; k += 2) {
                                     int first = cyrillicWeights [k];
                                     diacritical [first] = diacritical [first + 1] =
                                             (byte) cyrillicWeights [k + 1];
                             }

                             // Number secondary weights: numberSecondaryWeightBounds
                             // holds [start, end) pairs; the n-th pair gives weight
                             // 0x38 + n to every number character in its range.
                             int [] bounds = numberSecondaryWeightBounds;
                             byte numberWeight = 0x38;
                             int pair = 0;
                             while (pair < bounds.Length) {
                                     int limit = bounds [pair + 1];
                                     for (int code = bounds [pair]; code < limit; code++) {
                                             if (!Char.IsNumber ((char) code))
                                                     continue;
                                             diacritical [code] = numberWeight;
                                     }
                                     pair += 2;
                                     numberWeight++;
                             }

                             // Update the name part of named characters, and remove
                             // some of them. The position advances only when the
                             // current entry is kept.
                             int pos = 0;
                             while (pos < sortableCharNames.Count) {
                                     DictionaryEntry entry =
                                             (DictionaryEntry) sortableCharNames [pos];
                                     int cp = (int) entry.Key;
                                     string renamed = null;
                                     bool drop = false;
                                     switch (cp) {
                                     case 0x2101: renamed = "A_1"; break;
                                     case 0x33C3: renamed = "A_2"; break;
                                     case 0x2105: renamed = "C_1"; break;
                                     case 0x2106: renamed = "C_2"; break;
                                     case 0x211E: renamed = "R1"; break;
                                     case 0x211F: renamed = "R2"; break;
                                     case 0x2103:
                                     case 0x2109:
                                     case 0x2116:
                                     case 0x2117:
                                     case 0x2118:
                                     case 0x2125:
                                     case 0x2127:
                                     case 0x2129:
                                     case 0x212E:
                                     case 0x2132:
                                             drop = true;
                                             break;
                                     }
                                     if (drop) {
                                             sortableCharNames.RemoveAt (pos);
                                             continue;
                                     }
                                     if (renamed != null)
                                             sortableCharNames [pos] =
                                                     new DictionaryEntry (cp, renamed);
                                     pos++;
                             }
1667                 }
1668
1669                 void GenerateCore ()
1670                 {
1671                         UnicodeCategory uc;
1672
1673                         #region Specially ignored // 01
1674                         // This will raise "Defined" flag up.
1675                         // FIXME: Check If it is really fine. Actually for
1676                         // Japanese voice marks this code does remapping.
1677                         foreach (char c in specialIgnore)
1678                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1679                         #endregion
1680
1681                         #region Extenders (FF FF)
1682                         fillIndex [0xFF] = 0xFF;
1683                         char [] specialBiggest = new char [] {
1684                                 '\u3005', '\u3031', '\u3032', '\u309D',
1685                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1686                                 '\uFE7C', '\uFE7D', '\uFF70'};
1687                         foreach (char c in specialBiggest)
1688                                 AddCharMap (c, 0xFF, 0);
1689                         #endregion
1690
1691                         #region Variable weights
1692                         // Controls : 06 03 - 06 3D
1693                         fillIndex [0x6] = 3;
1694                         for (int i = 0; i < 65536; i++) {
1695                                 if (IsIgnorable (i))
1696                                         continue;
1697                                 char c = (char) i;
1698                                 uc = Char.GetUnicodeCategory (c);
1699                                 // NEL is whitespace but not ignored here.
1700                                 if (uc == UnicodeCategory.Control &&
1701                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1702                                         AddCharMap (c, 6, 1);
1703                         }
1704
1705                         // Apostrophe 06 80
1706                         fillIndex [0x6] = 0x80;
1707                         AddCharMap ('\'', 6, 0);
1708                         AddCharMap ('\uFF07', 6, 1);
1709                         AddCharMap ('\uFE63', 6, 1);
1710
1711                         // SPECIAL CASE: fill FE32 here, prior to it being added
1712                         // at 2013. Windows does not always respect NFKD.
1713                         map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1714
1715                         // Hyphen/Dash : 06 81 - 06 90
1716                         for (int i = 0; i < char.MaxValue; i++) {
1717                                 if (!IsIgnorable (i) &&
1718                                         Char.GetUnicodeCategory ((char) i) ==
1719                                         UnicodeCategory.DashPunctuation) {
1720                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1721                                         if (i == 0x2011) {
1722                                                 // SPECIAL: add 2027 and 2043
1723                                                 // Maybe they are regarded as the
1724                                                 // same hyphens in "central"
1725                                                 // position.
1726                                                 AddCharMap ('\u2027', 6, 1);
1727                                                 AddCharMap ('\u2043', 6, 1);
1728                                         }
1729                                 }
1730                         }
1731                         // They are regarded as primarily equivalent to '-'
1732                         map [0x208B] = new CharMapEntry (6, 0x82, 0);
1733                         map [0x207B] = new CharMapEntry (6, 0x82, 0);
1734                         map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1735
1736                         // Arabic variable weight chars 06 A0 -
1737                         fillIndex [6] = 0xA0;
1738                         // vowels
1739                         for (int i = 0x64B; i <= 0x650; i++)
1740                                 AddArabicCharMap ((char) i);
1741                         // sukun
1742                         AddCharMapGroup ('\u0652', 6, 1, 0);
1743                         // shadda
1744                         AddCharMapGroup ('\u0651', 6, 1, 0);
1745                         #endregion
1746
1747
1748                         #region Nonspacing marks // 01
1749                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1750
1751                         // Combining diacritical marks: 01 DC -
1752
1753                         fillIndex [0x1] = 0x41;
1754                         for (int i = 0x030E; i <= 0x0326; i++)
1755                                 if (!IsIgnorable (i))
1756                                         AddCharMap ((char) i, 0x1, 1);
1757                         for (int i = 0x0329; i <= 0x0334; i++)
1758                                 if (!IsIgnorable (i))
1759                                         AddCharMap ((char) i, 0x1, 1);
1760                         fillIndex [0x1]++;
1761                         for (int i = 0x0339; i <= 0x0341; i++)
1762                                 if (!IsIgnorable (i))
1763                                         AddCharMap ((char) i, 0x1, 1);
1764                         fillIndex [0x1] = 0x74;
1765                         for (int i = 0x0346; i <= 0x0348; i++)
1766                                 if (!IsIgnorable (i))
1767                                         AddCharMap ((char) i, 0x1, 1);
1768                         for (int i = 0x02BE; i <= 0x02BF; i++)
1769                                 if (!IsIgnorable (i))
1770                                         AddCharMap ((char) i, 0x1, 1);
1771                         for (int i = 0x02C1; i <= 0x02C5; i++)
1772                                 if (!IsIgnorable (i))
1773                                         AddCharMap ((char) i, 0x1, 1);
1774                         for (int i = 0x02CE; i <= 0x02CF; i++)
1775                                 if (!IsIgnorable (i))
1776                                         AddCharMap ((char) i, 0x1, 1);
1777                         fillIndex [0x1]++;
1778                         for (int i = 0x02D1; i <= 0x02D3; i++)
1779                                 if (!IsIgnorable (i))
1780                                         AddCharMap ((char) i, 0x1, 1);
1781                         AddCharMap ('\u02DE', 0x1, 1);
1782                         for (int i = 0x02E4; i <= 0x02E9; i++)
1783                                 if (!IsIgnorable (i))
1784                                         AddCharMap ((char) i, 0x1, 1);
1785
1786                         // FIXME: needs more love here (it should eliminate
1787                         // all the hacky code above).
1788                         for (int i = 0x0300; i < 0x0370; i++)
1789                                 if (!IsIgnorable (i) && diacritical [i] != 0
1790                                         /* especiall here*/ && !map [i].Defined)
1791                                         map [i] = new CharMapEntry (
1792                                                 0x1, 0x1, diacritical [i]);
1793
1794                         // Cyrillic and Armenian nonspacing mark
1795                         fillIndex [0x1] = 0x94;
1796                         for (int i = 0x400; i < 0x580; i++)
1797                                 if (!IsIgnorable (i) &&
1798                                         Char.GetUnicodeCategory ((char) i) ==
1799                                         UnicodeCategory.NonSpacingMark)
1800                                         AddCharMap ((char) i, 1, 1);
1801
1802                         fillIndex [0x1] = 0x8D;
1803                         // syriac dotted nonspacing marks (1)
1804                         AddCharMap ('\u0740', 0x1, 1);
1805                         AddCharMap ('\u0741', 0x1, 1);
1806                         AddCharMap ('\u0742', 0x1, 1);
1807                         // syriac oblique nonspacing marks
1808                         AddCharMap ('\u0747', 0x1, 1);
1809                         AddCharMap ('\u0748', 0x1, 1);
1810                         // syriac dotted nonspacing marks (2)
1811                         fillIndex [0x1] = 0x94; // this reset is mandatory
1812                         AddCharMap ('\u0732', 0x1, 1);
1813                         AddCharMap ('\u0735', 0x1, 1);
1814                         AddCharMap ('\u0738', 0x1, 1);
1815                         AddCharMap ('\u0739', 0x1, 1);
1816                         AddCharMap ('\u073C', 0x1, 1);
1817                         // SPECIAL CASES: superscripts
1818                         AddCharMap ('\u073F', 0x1, 1);
1819                         AddCharMap ('\u0711', 0x1, 1);
1820                         // syriac "DOTS"
1821                         for (int i = 0x0743; i <= 0x0746; i++)
1822                                 AddCharMap ((char) i, 0x1, 1);
1823                         for (int i = 0x0730; i <= 0x0780; i++)
1824                                 if (!map [i].Defined &&
1825                                         Char.GetUnicodeCategory ((char) i) ==
1826                                         UnicodeCategory.NonSpacingMark)
1827                                         AddCharMap ((char) i, 0x1, 1);
1828
1829                         // LAMESPEC: It should not stop at '\u20E1'. There are
1830                         // a few more characters (which, however, result in
1831                         // overflow of level 2 unless we start before 0xDD).
1832                         fillIndex [0x1] = 0xDD;
1833                         for (int i = 0x20D0; i <= 0x20DC; i++)
1834                                 AddCharMap ((char) i, 0x1, 1);
1835                         fillIndex [0x1] = 0xEC;
1836                         for (int i = 0x20DD; i <= 0x20E1; i++)
1837                                 AddCharMap ((char) i, 0x1, 1);
1838                         fillIndex [0x1] = 0x7;
1839                         for (int i = 0x302A; i <= 0x302D; i++)
1840                                 AddCharMap ((char) i, 0x1, 1);
1841                         fillIndex [0x1] = 0x50; // I wonder how they are sorted
1842                         for (int i = 0x02D4; i <= 0x02D7; i++)
1843                                 AddCharMap ((char) i, 0x1, 1);
1844
1845                         // They are not part of Nonspacing marks, but have
1846                         // only diacritical weight.
1847                         for (int i = 0x3099; i <= 0x309C; i++)
1848                                 map [i] = new CharMapEntry (1, 1, 1);
1849                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
1850                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
1851                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1852                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1853                         for (int i = 0x30FC; i <= 0x30FE; i++)
1854                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1855
1856                         #endregion
1857
1858
1859                         #region Whitespaces // 07 03 -
1860                         fillIndex [0x7] = 0x2;
1861                         AddCharMap (' ', 0x7, 2);
1862                         AddCharMap ('\u00A0', 0x7, 1);
1863                         for (int i = 9; i <= 0xD; i++)
1864                                 AddCharMap ((char) i, 0x7, 1);
1865                         for (int i = 0x2000; i <= 0x200B; i++)
1866                                 AddCharMap ((char) i, 0x7, 1);
1867
1868                         fillIndex [0x7] = 0x17;
1869                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1870                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1871
1872                         // Characters which are used to represent layout control.
1873                         // LAMESPEC: Windows developers seem to have thought
1874                         // that those characters are a kind of whitespace,
1875                         // while they aren't.
1876                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1877                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1878                         #endregion
1879
1880                         // category 09 - continued symbols from 08
1881                         fillIndex [0x9] = 2;
1882                         // misc tech mark
1883                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1884                                 AddCharMap ((char) cp, 0x9, 1, 0);
1885
1886                         // arrows
1887                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1888                         foreach (DictionaryEntry de in arrowValues) {
1889                                 int idx = (int) de.Value;
1890                                 int cp = (int) de.Key;
1891                                 if (map [cp].Defined)
1892                                         continue;
1893                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1894                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1895                                 arrowLv2 [idx]++;
1896                         }
1897                         // boxes
1898                         byte [] boxLv2 = new byte [128];
1899                         for (int i = 0; i < boxLv2.Length; i++)
1900                                 boxLv2 [i] = 3;
1901                         foreach (DictionaryEntry de in boxValues) {
1902                                 int cp = (int) de.Key;
1903                                 int off = (int) de.Value;
1904                                 if (map [cp].Defined)
1905                                         continue;
1906                                 if (off < 0) {
1907                                         fillIndex [0x9] = (byte) (0xE5 + off);
1908                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1909                                 }
1910                                 else {
1911                                         fillIndex [0x9] = (byte) (0xE5 + off);
1912                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1913                                 }
1914                         }
1915                         // Some special characters (slanted)
1916                         fillIndex [0x9] = 0xF4;
1917                         AddCharMap ('\u2571', 0x9, 3);
1918                         AddCharMap ('\u2572', 0x9, 3);
1919                         AddCharMap ('\u2573', 0x9, 3);
1920
1921                         // FIXME: implement 0A
1922                         #region Symbols
1923                         fillIndex [0xA] = 2;
1924                         // byte currency symbols
1925                         for (int cp = 0; cp < 0x100; cp++) {
1926                                 uc = Char.GetUnicodeCategory ((char) cp);
1927                                 if (!IsIgnorable (cp) &&
1928                                         uc == UnicodeCategory.CurrencySymbol &&
1929                                         cp != '$' ||
1930                                         cp == 0xAC)
1931                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1932                         }
1933                         // byte other symbols
1934                         for (int cp = 0; cp < 0x100; cp++) {
1935                                 if (cp == 0xA6)
1936                                         continue; // SPECIAL: skip FIXME: why?
1937                                 uc = Char.GetUnicodeCategory ((char) cp);
1938                                 if (!IsIgnorable (cp) &&
1939                                         uc == UnicodeCategory.OtherSymbol ||
1940                                         cp == '\u00B5' || cp == '\u00B7')
1941                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1942                         }
1943                         // U+30FB here
1944                         AddCharMapGroup ('\u30FB', 0xA, 1, 0);
1945
1946                         for (int cp = 0x2020; cp <= 0x2031; cp++)
1947                                 if (Char.IsPunctuation ((char) cp))
1948                                         AddCharMap ((char) cp, 0xA, 1, 0);
1949                         // SPECIAL CASES: why?
1950                         AddCharMap ('\u203B', 0xA, 1, 0);
1951                         AddCharMap ('\u2040', 0xA, 1, 0);
1952                         AddCharMap ('\u2041', 0xA, 1, 0);
1953                         AddCharMap ('\u2042', 0xA, 1, 0);
1954
1955                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1956                                 AddCharMap ((char) cp, 0xA, 1, 0);
1957                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1958                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1959                                 AddCharMap ((char) cp, 0xA, 1, 0);
1960                         // Dingbats
1961                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1962                                 if (Char.IsSymbol ((char) cp))
1963                                         AddCharMap ((char) cp, 0xA, 1, 0);
1964                         // OCR
1965                         for (int i = 0x2440; i < 0x2460; i++)
1966                                 AddCharMap ((char) i, 0xA, 1, 0);
1967
1968                         #endregion
1969
1970                         #region Numbers // 0C 02 - 0C E1
1971                         fillIndex [0xC] = 2;
1972
1973                         // 9F8 : Bengali "one less than the denominator"
1974                         AddCharMap ('\u09F8', 0xC, 1, 0x3C);
1975
1976                         ArrayList numbers = new ArrayList ();
1977                         for (int i = 0; i < 65536; i++)
1978                                 if (!IsIgnorable (i) &&
1979                                         Char.IsNumber ((char) i) &&
1980                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1981                                         numbers.Add (i);
1982
1983                         ArrayList numberValues = new ArrayList ();
1984                         foreach (int i in numbers)
1985                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1986                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1987
1988 //foreach (DictionaryEntry de in numberValues)
1989 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1990
1991                         decimal prevValue = -1;
1992                         foreach (DictionaryEntry de in numberValues) {
1993                                 int cp = (int) de.Key;
1994                                 decimal currValue = (decimal) de.Value;
1995                                 bool addnew = false;
1996                                 if (prevValue < currValue &&
1997                                         prevValue - (int) prevValue == 0 &&
1998                                         prevValue >= 1) {
1999
2000                                         addnew = true;
2001                                         // Process Hangzhou and Roman numbers
2002
2003                                         // There are some SPECIAL cases.
2004                                         if (currValue != 4) // no increment for 4
2005                                                 fillIndex [0xC]++;
2006
2007                                         int xcp;
2008                                         if (currValue <= 10) {
2009                                                 xcp = (int) prevValue + 0x2170 - 1;
2010                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2011                                                 xcp = (int) prevValue + 0x2160 - 1;
2012                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2013                                                 fillIndex [0xC] += 2;
2014                                                 xcp = (int) prevValue + 0x3021 - 1;
2015                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2016                                                 fillIndex [0xC]++;
2017                                         }
2018                                         else if (currValue == 11)
2019                                                 fillIndex [0xC]++;
2020                                 }
2021                                 if (prevValue < currValue)
2022                                         prevValue = currValue;
2023                                 if (map [cp].Defined)
2024                                         continue;
2025                                 // Hangzhou and Roman numerals are added later
2026                                 // (code is above)
2027                                 else if (0x3021 <= cp && cp < 0x302A
2028                                         || 0x2160 <= cp && cp < 0x216A
2029                                         || 0x2170 <= cp && cp < 0x217A)
2030                                         continue;
2031
2032                                 if (cp ==  0x215B) // FIXME: why?
2033                                         fillIndex [0xC] += 2;
2034                                 else if (cp == 0x3021) // FIXME: why?
2035                                         fillIndex [0xC]++;
2036                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2037                                 if (addnew || cp <= '9') {
2038                                         int mod = (int) currValue - 1;
2039                                         int xcp;
2040                                         if (1 <= currValue && currValue <= 10) {
2041                                                 xcp = mod + 0x2776;
2042                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2043                                                 xcp = mod + 0x2780;
2044                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2045                                                 xcp = mod + 0x278A;
2046                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2047                                         }
2048                                         if (1 <= currValue && currValue <= 20) {
2049                                                 xcp = mod + 0x2460;
2050                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2051                                                 xcp = mod + 0x2474;
2052                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2053                                                 xcp = mod + 0x2488;
2054                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2055                                         }
2056                                 }
2057
2058                                 if (cp != 0x09E7 && cp != 0x09EA)
2059                                         fillIndex [0xC]++;
2060
2061                                 // Add special cases that are not regarded as 
2062                                 // numbers in UnicodeCategory speak.
2063                                 if (cp == '5') {
2064                                         // TONE FIVE
2065                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2066                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2067                                 }
2068                                 else if (cp == '6') // FIXME: why?
2069                                         fillIndex [0xC]++;
2070                         }
2071
2072                         // 221E: infinity
2073                         fillIndex [0xC] = 0xFF;
2074                         AddCharMap ('\u221E', 0xC, 1);
2075                         #endregion
2076
2077                         #region Letters and NonSpacing Marks (general)
2078
2079                         // ASCII Latin alphabets
2080                         for (int i = 0; i < alphabets.Length; i++)
2081                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2082
2083                         // non-ASCII Latin alphabets
2084                         // FIXME: there are no such characters that are placed
2085                         // *after* "alphabets" array items. This is nothing
2086                         // more than a hack that creates dummy weights for
2087                         // primary characters.
2088                         for (int i = 0x0080; i < 0x0300; i++) {
2089                                 if (!Char.IsLetter ((char) i))
2090                                         continue;
2091                                 // Latin letters which have an NFKD mapping are
2092                                 // not added as independent primary characters.
2093                                 if (decompIndex [i] != 0)
2094                                         continue;
2095                                 // SPECIAL CASES:
2096                                 // 1. some letters have primarily
2097                                 //    equivalent ASCII letters.
2098                                 // 2. some have independent primary weights,
2099                                 //    but inside the a-to-z range.
2100                                 // 3. there are some expanded characters that
2101                                 //    are not part of Unicode Standard NFKD.
2102                                 // 4. some characters are letters per IsLetter
2103                                 //    but not in sortkeys (maybe a Unicode version
2104                                 //    difference caused it).
2105                                 switch (i) {
2106                                 // 1. skipping them does not make sense
2107 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2108 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2109 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2110 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2111 //                              case 0x19B: case 0x19C:
2112                                 // 2. skipping them does not make sense
2113 //                              case 0x14A: // Ng
2114 //                              case 0x14B: // ng
2115                                 // 3.
2116                                 case 0xC6: // AE
2117                                 case 0xE6: // ae
2118                                 case 0xDE: // Icelandic Thorn
2119                                 case 0xFE: // Icelandic Thorn
2120                                 case 0xDF: // German ss
2121                                 case 0xFF: // German ss
2122                                 // 4.
2123                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2124                                 // not classified yet
2125 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2126 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2127 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2128 //                              case 0x1DD:
2129                                         continue;
2130                                 }
2131                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2132                         }
2133
2134                         // Greek and Coptic
2135                         fillIndex [0xF] = 02;
2136                         for (int i = 0x0380; i < 0x0390; i++)
2137                                 if (Char.IsLetter ((char) i))
2138                                         AddLetterMap ((char) i, 0xF, 1);
2139                         fillIndex [0xF] = 02;
2140                         for (int i = 0x0391; i < 0x03CF; i++)
2141                                 if (Char.IsLetter ((char) i))
2142                                         AddLetterMap ((char) i, 0xF, 1);
2143                         fillIndex [0xF] = 0x40;
2144                         for (int i = 0x03D0; i < 0x0400; i++)
2145                                 if (Char.IsLetter ((char) i))
2146                                         AddLetterMap ((char) i, 0xF, 1);
2147
2148                         // Cyrillic.
2149                         // Cyrillic letters are sorted like Latin letters, i.e.
2150                         // with culture-specific letters placed within the
2151                         // standard Cyrillic sequence.
2152                         //
2153                         // We can't use UCA here; it has different sorting.
2154                         char [] orderedCyrillic = new char [] {
2155                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2156                                 '\u0452', // DJE for Serbocroatian
2157                                 '\u0435',
2158                                 '\u0454', // IE for Ukrainian
2159                                 '\u0436', '\u0437',
2160                                 '\u0455', // DZE
2161                                 '\u0438',
2162                                 '\u0456', // Byelorussian-Ukrainian I
2163                                 '\u0457', // YI
2164                                 '\u0439',
2165                                 '\u0458', // JE
2166                                 '\u043A', '\u043B',
2167                                 '\u0459', // LJE
2168                                 '\u043C', '\u043D',
2169                                 '\u045A', // NJE
2170                                 '\u043E',
2171                                 // 4E9 goes here.
2172                                 '\u043F', '\u0440', '\u0441', '\u0442',
2173                                 '\u045B', // TSHE for Serbocroatian
2174                                 '\u0443',
2175                                 '\u045E', // Short U for Byelorussian
2176                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2177                                 '\u0444', '\u0445', '\u0446', '\u0447',
2178                                 '\u045F', // DZHE
2179                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2180                                 '\u044D', '\u044E', '\u044F'};
2181
2182                         // For some characters, here is a map to basic Cyrillic
2183                         // letters. See UnicodeData.txt character names for
2184                         // the sources. Here I simply declare an equivalence array.
2185                         // Its entries are mapped from U+490 (,491) onward,
2186                         // skipping small letters.
2187                         char [] cymap_src = new char [] {
2188                                 '\u0433', '\u0433', '\u0433', '\u0436',
2189                                 '\u0437', '\u043A', '\u043A', '\u043A',
2190                                 '\u043A', '\u043D', '\u043D', '\u043F',
2191                                 '\u0445', '\u0441', '\u0442', '\u0443',
2192                                 '\u0443', '\u0445', '\u0446', '\u0447',
2193                                 '\u0447', '\u0432', '\u0435', '\u0435',
2194                                 '\u0406', '\u0436', '\u043A', '\u043D',
2195                                 '\u0447', '\u0435'};
2196
2197                         fillIndex [0x10] = 0x8D;
2198                         for (int i = 0x0460; i < 0x0481; i++) {
2199                                 if (Char.IsLetter ((char) i)) {
2200                                         if (i == 0x0476)
2201                                                 // U+476/477 have the same
2202                                                 // primary weight as U+474/475.
2203                                                 fillIndex [0x10] -= 3;
2204                                         AddLetterMap ((char) i, 0x10, 3);
2205                                 }
2206                         }
2207
2208                         fillIndex [0x10] = 0x6;
2209                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2210                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2211                                 if (!IsIgnorable ((int) c) &&
2212                                         Char.IsLetter (c) &&
2213                                         !map [c].Defined) {
2214                                         AddLetterMap (c, 0x10, 0);
2215                                         fillIndex [0x10] += 3;
2216                                 }
2217                         }
2218
2219                         for (int i = 0; i < cymap_src.Length; i++) {
2220                                 char c = cymap_src [i];
2221                                 fillIndex [0x10] = map [c].Level1;
2222                                 int c2 = 0x0490 + i * 2;
2223                                 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2224                         }
2225
2226                         // Armenian
2227                         fillIndex [0x11] = 0x3;
2228                         fillIndex [0x1] = 0x98;
2229                         for (int i = 0x0531; i < 0x0586; i++) {
2230                                 if (i == 0x0559 || i == 0x55A)
2231                                         AddCharMap ((char) i, 1, 1);
2232                                 if (Char.IsLetter ((char) i))
2233                                         AddLetterMap ((char) i, 0x11, 1);
2234                         }
2235
2236                         // Hebrew
2237                         // -Letters
2238                         fillIndex [0x12] = 0x2;
2239                         for (int i = 0x05D0; i < 0x05FF; i++)
2240                                 if (Char.IsLetter ((char) i))
2241                                         AddLetterMap ((char) i, 0x12, 1);
2242                         // -Accents
2243                         fillIndex [0x1] = 0x3;
2244                         for (int i = 0x0591; i <= 0x05C2; i++) {
2245                                 if (i == 0x05A3 || i == 0x05BB)
2246                                         fillIndex [0x1]++;
2247                                 if (i != 0x05BE)
2248                                         AddCharMap ((char) i, 0x1, 1);
2249                         }
2250
2251                         // Arabic
2252                         fillIndex [0x1] = 0x8E;
2253                         fillIndex [0x13] = 0x3;
2254                         for (int i = 0x0621; i <= 0x064A; i++) {
2255                                 // Abjad
2256                                 if (Char.GetUnicodeCategory ((char) i)
2257                                         != UnicodeCategory.OtherLetter) {
2258                                         // FIXME: arabic nonspacing marks are
2259                                         // in different order.
2260                                         AddCharMap ((char) i, 0x1, 1);
2261                                         continue;
2262                                 }
2263 //                              map [i] = new CharMapEntry (0x13,
2264 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2265                                 fillIndex [0x13] = 
2266                                         (byte) arabicLetterPrimaryValues [i];
2267                                 byte formDiacritical = 8; // default
2268                                 // SPECIAL CASES:
2269                                 switch (i) {
2270                                 case 0x0622: formDiacritical = 9; break;
2271                                 case 0x0623: formDiacritical = 0xA; break;
2272                                 case 0x0624: formDiacritical = 5; break;
2273                                 case 0x0625: formDiacritical = 0xB; break;
2274                                 case 0x0626: formDiacritical = 7; break;
2275                                 case 0x0649: formDiacritical = 5; break;
2276                                 case 0x064A: formDiacritical = 7; break;
2277                                 }
2278                                 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2279                         }
2280                         for (int i = 0x0670; i < 0x0673; i++)
2281                                 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2282                         fillIndex [0x13] = 0x84;
2283                         for (int i = 0x0674; i < 0x06D6; i++)
2284                                 if (Char.IsLetter ((char) i))
2285                                         AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2286
2287                         // Devanagari
2288                         // FIXME: it does seem straight codepoint mapping.
2289                         fillIndex [0x14] = 04;
2290                         for (int i = 0x0901; i < 0x0905; i++)
2291                                 if (!IsIgnorable (i))
2292                                         AddLetterMap ((char) i, 0x14, 2);
2293                         fillIndex [0x14] = 0xB;
2294                         for (int i = 0x0905; i < 0x093A; i++) {
2295                                 if (i == 0x0928)
2296                                         AddCharMap ('\u0929', 0x14, 0, 8);
2297                                 if (i == 0x0930)
2298                                         AddCharMap ('\u0931', 0x14, 0, 8);
2299                                 if (i == 0x0933)
2300                                         AddCharMap ('\u0934', 0x14, 0, 8);
2301                                 if (Char.IsLetter ((char) i))
2302                                         AddLetterMap ((char) i, 0x14, 4);
2303                                 if (i == 0x090B)
2304                                         AddCharMap ('\u0960', 0x14, 4);
2305                                 if (i == 0x090C)
2306                                         AddCharMap ('\u0961', 0x14, 4);
2307                         }
2308                         fillIndex [0x14] = 0xDA;
2309                         for (int i = 0x093E; i < 0x0945; i++)
2310                                 if (!IsIgnorable (i))
2311                                         AddLetterMap ((char) i, 0x14, 2);
2312                         fillIndex [0x14] = 0xEC;
2313                         for (int i = 0x0945; i < 0x094F; i++)
2314                                 if (!IsIgnorable (i))
2315                                         AddLetterMap ((char) i, 0x14, 2);
2316
2317                         // Bengali
2318                         // -Letters
2319                         fillIndex [0x15] = 02;
2320                         for (int i = 0x0980; i < 0x9FF; i++) {
2321                                 if (IsIgnorable (i))
2322                                         continue;
2323                                 if (i == 0x09E0)
2324                                         fillIndex [0x15] = 0x3B;
2325                                 switch (Char.GetUnicodeCategory ((char) i)) {
2326                                 case UnicodeCategory.NonSpacingMark:
2327                                 case UnicodeCategory.DecimalDigitNumber:
2328                                 case UnicodeCategory.OtherNumber:
2329                                         continue;
2330                                 }
2331                                 AddLetterMap ((char) i, 0x15, 1);
2332                         }
2333                         // -Signs
2334                         fillIndex [0x1] = 0x3;
2335                         for (int i = 0x0981; i < 0x0A00; i++)
2336                                 if (Char.GetUnicodeCategory ((char) i) ==
2337                                         UnicodeCategory.NonSpacingMark)
2338                                         AddCharMap ((char) i, 0x1, 1);
2339
2340                         // Gurmukhi. orderedGurmukhi is from UCA
2341                         // FIXME: it does not look equivalent to UCA.
2342                         fillIndex [0x16] = 04;
2343                         fillIndex [0x1] = 3;
2344                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2345                                 char c = orderedGurmukhi [i];
2346                                 if (IsIgnorable ((int) c))
2347                                         continue;
2348                                 if (IsIgnorableNonSpacing (c)) {
2349                                         AddLetterMap (c, 0x1, 1);
2350                                         continue;
2351                                 }
2352                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2353                                         '\u0A66' <= c && c <= '\u0A71')
2354                                         continue;
2355                                 // SPECIAL CASES
2356                                 byte shift = 4;
2357                                 switch (c) {
2358                                 case '\u0A33': case '\u0A36': case '\u0A16':
2359                                 case '\u0A17': case '\u0A5B': case '\u0A5E':
2360                                         shift = 0;
2361                                         break;
2362                                 }
2363                                 if (c == '\u0A3E') // Skip
2364                                         fillIndex [0x16] = 0xC0;
2365                                 AddLetterMap (c, 0x16, shift);
2366                         }
2367
2368                         // Gujarati. orderedGujarati is from UCA
2369                         fillIndex [0x17] = 0x4;
2370                         // nonspacing marks
2371                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2372                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2373                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2374                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2375                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2376                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2377                         // letters go first.
2378                         for (int i = 0; i < orderedGujarati.Length; i++) {
2379                                 // SPECIAL CASE
2380                                 char c = orderedGujarati [i];
2381                                 if (Char.IsLetter (c)) {
2382                                         // SPECIAL CASES
2383                                         if (c == '\u0AB3' || c == '\u0A32')
2384                                                 continue;
2385                                         if (c == '\u0A33') {
2386                                                 AddCharMap ('\u0A32', 0x17, 0);
2387                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2388                                                 continue;
2389                                         }
2390                                         if (c == '\u0A8B')
2391                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2392                                         AddCharMap (c, 0x17, 4);
2393
2394                                         if (c == '\u0AB9')
2395                                                 AddCharMap ('\u0AB3', 0x17, 6);
2396                                 }
2397                         }
2398                         // non-letters
2399                         byte gujaratiShift = 4;
2400                         fillIndex [0x17] = 0xC0;
2401                         for (int i = 0; i < orderedGujarati.Length; i++) {
2402                                 char c = orderedGujarati [i];
2403                                 if (fillIndex [0x17] == 0xCC)
2404                                         gujaratiShift = 3;
2405                                 if (!Char.IsLetter (c)) {
2406                                         // SPECIAL CASES
2407                                         if (c == '\u0A82')
2408                                                 AddCharMap ('\u0A81', 0x17, 2);
2409                                         if (c == '\u0AC2')
2410                                                 fillIndex [0x17]++;
2411                                         AddLetterMap (c, 0x17, gujaratiShift);
2412                                 }
2413                         }
2414
2415                         // Oriya
2416                         fillIndex [0x1] = 03;
2417                         fillIndex [0x18] = 02;
2418                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2419                                 switch (Char.GetUnicodeCategory ((char) i)) {
2420                                 case UnicodeCategory.NonSpacingMark:
2421                                 case UnicodeCategory.DecimalDigitNumber:
2422                                         AddLetterMap ((char) i, 0x1, 1);
2423                                         continue;
2424                                 }
2425                                 AddLetterMap ((char) i, 0x18, 1);
2426                         }
2427
2428                         // Tamil
2429                         fillIndex [0x19] = 2;
2430                         AddCharMap ('\u0BD7', 0x19, 0);
2431                         fillIndex [0x19] = 0xA;
2432                         // vowels
2433                         for (int i = 0x0B82; i <= 0x0B94; i++)
2434                                 if (!IsIgnorable ((char) i))
2435                                         AddCharMap ((char) i, 0x19, 2);
2436                         // special vowel
2437                         fillIndex [0x19] = 0x28;
2438                         // The array for Tamil consonants is a constant.
2439                         // Windows has almost the same sequence as TAM from
2440                         // tamilnet, but differs slightly in Grantha.
2441                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2442                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2443                         // combining marks
2444                         fillIndex [0x19] = 0x82;
2445                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2446                                 if (Char.GetUnicodeCategory ((char) i) ==
2447                                         UnicodeCategory.SpacingCombiningMark
2448                                         || i == 0x0BC0)
2449                                         AddLetterMap ((char) i, 0x19, 2);
2450
2451                         // Telugu
2452                         fillIndex [0x1A] = 0x4;
2453                         for (int i = 0x0C00; i < 0x0C62; i++) {
2454                                 if (i == 0x0C55 || i == 0x0C56)
2455                                         continue; // skip
2456                                 AddCharMap ((char) i, 0x1A, 3);
2457                                 char supp = (i == 0x0C0B) ? '\u0C60':
2458                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2459                                 if (supp == char.MinValue)
2460                                         continue;
2461                                 AddCharMap (supp, 0x1A, 3);
2462                         }
2463
2464                         // Kannada
2465                         fillIndex [0x1B] = 4;
2466                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2467                                 if (i == 0x0CD5 || i == 0x0CD6)
2468                                         continue; // ignore
2469                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2470                                         continue; // shift after 0xCB9
2471                                 AddCharMap ((char) i, 0x1B, 3);
2472                                 if (i == 0x0CB9) {
2473                                         // SPECIAL CASES: but why?
2474                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2475                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2476                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2477                                 }
2478                                 if (i == 0x0CB2)
2479                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2480                         }
2481                         
2482                         // Malayalam
2483                         fillIndex [0x1C] = 2;
2484                         fillIndex [0x1] = 3;
2485                         for (int i = 0x0D02; i < 0x0D61; i++) {
2486                                 // FIXME: I avoided MSCompatUnicodeTable usage
2487                                 // here (it results in recursion). So check if
2488                                 // using NonSpacingMark makes sense or not.
2489                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2490 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2491                                         AddCharMap ((char) i, 0x1C, 1);
2492                                 else if (!IsIgnorable ((char) i))
2493                                         AddCharMap ((char) i, 1, 1);
2494                         }
2495
2496                         // Thai ... note that it breaks 0x1E wall after E2B!
2497                         // Also, all Thai characters have level 2 value 3.
2498                         fillIndex [0x1E] = 2;
2499                         fillIndex [0x1] = 3;
2500                         for (int i = 0xE40; i <= 0xE44; i++)
2501                                 AddCharMap ((char) i, 0x1E, 1, 3);
2502                         for (int i = 0xE01; i < 0xE2B; i++)
2503                                 AddCharMap ((char) i, 0x1E, 6, 3);
2504                         fillIndex [0x1F] = 5;
2505                         for (int i = 0xE2B; i < 0xE30; i++)
2506                                 AddCharMap ((char) i, 0x1F, 6, 3);
2507                         fillIndex [0x1F] = 0x1E;
2508                         for (int i = 0xE30; i < 0xE3B; i++)
2509                                 AddCharMap ((char) i, 0x1F, 1, 3);
2510                         // Some Thai characters remain.
2511                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2512                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2513                         foreach (char c in specialThai)
2514                                 AddCharMap (c, 0x1F, 1, 3);
2515
2516                         for (int i = 0xE00; i < 0xE80; i++)
2517                                 if (Char.GetUnicodeCategory ((char) i) ==
2518                                         UnicodeCategory.NonSpacingMark)
2519                                         AddCharMap ((char) i, 1, 1);
2520
2521                         // Lao
2522                         fillIndex [0x1F] = 2;
2523                         fillIndex [0x1] = 3;
2524                         for (int i = 0xE80; i < 0xEDF; i++) {
2525                                 if (IsIgnorable ((char) i))
2526                                         continue;
2527                                 else if (Char.IsLetter ((char) i))
2528                                         AddCharMap ((char) i, 0x1F, 1);
2529                                 else if (Char.GetUnicodeCategory ((char) i) ==
2530                                         UnicodeCategory.NonSpacingMark)
2531                                         AddCharMap ((char) i, 1, 1);
2532                         }
2533
2534                         // Georgian. orderedGeorgian is from UCA DUCET.
2535                         fillIndex [0x21] = 5;
2536                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2537                                 char c = orderedGeorgian [i];
2538                                 if (map [(int) c].Defined)
2539                                         continue;
2540                                 AddCharMap (c, 0x21, 0);
2541                                 if (c < '\u10F6')
2542                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2543                                 fillIndex [0x21] += 5;
2544                         }
2545
2546                         // Japanese Kana.
2547                         fillIndex [0x22] = 2;
2548                         int kanaOffset = 0x3041;
2549                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2550
2551                         for (int gyo = 0; gyo < 9; gyo++) {
2552                                 for (int dan = 0; dan < 5; dan++) {
2553                                         if (gyo == 7 && dan % 2 == 1) {
2554                                                 // 'ya'-gyo
2555                                                 fillIndex [0x22]++;
2556                                                 kanaOffset -= 2; // There is no space for yi and ye.
2557                                                 continue;
2558                                         }
2559                                         int cp = kanaOffset + dan * kanaLines [gyo];
2560                                         // small lines (a-gyo, ya-gyo)
2561                                         if (gyo == 0 || gyo == 7) {
2562                                                 AddKanaMap (cp, 1); // small
2563                                                 AddKanaMap (cp + 1, 1);
2564                                         }
2565                                         else
2566                                                 AddKanaMap (cp, kanaLines [gyo]);
2567                                         fillIndex [0x22]++;
2568
2569                                         if (cp == 0x30AB) {
2570                                                 // add small 'ka' (before normal one)
2571                                                 AddKanaMap (0x30F5, 1);
2572                                                 kanaOffset++;
2573                                         }
2574                                         if (cp == 0x30B1) {
2575                                                 // add small 'ke' (before normal one)
2576                                                 AddKanaMap (0x30F6, 1);
2577                                                 kanaOffset++;
2578                                         }
2579                                         if (cp == 0x3061) {
2580                                                 // add small 'Tsu' (before normal one)
2581                                                 AddKanaMap (0x3063, 1);
2582                                                 kanaOffset++;
2583                                         }
2584                                 }
2585                                 fillIndex [0x22] += 3;
2586                                 kanaOffset += 5 * kanaLines [gyo];
2587                         }
2588
2589                         // Wa-gyo is mostly special-cased, so I just add its entries manually.
2590                         AddLetterMap ((char) 0x308E, 0x22, 0);
2591                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2592                         AddLetterMap ((char) 0x308F, 0x22, 0);
2593                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2594                         fillIndex [0x22]++;
2595                         AddLetterMap ((char) 0x3090, 0x22, 0);
2596                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2597                         fillIndex [0x22] += 2;
2598                         // no "Wu" in Japanese.
2599                         AddLetterMap ((char) 0x3091, 0x22, 0);
2600                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2601                         fillIndex [0x22]++;
2602                         AddLetterMap ((char) 0x3092, 0x22, 0);
2603                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2604                         // Nn
2605                         fillIndex [0x22] = 0x80;
2606                         AddLetterMap ((char) 0x3093, 0x22, 0);
2607                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2608
2609                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2610                                 map [0x30A6].Level1, 3);// voiced hiragana U
2611                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2612                                 map [0x30A6].Level1, 3);// voiced katakana U
2613
2614                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2615                                 map [0x30AB].Level1, 0);// small katakana Ka
2616                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2617                                 map [0x30B1].Level1, 0);// small katakana Ke
2618                         // voiced Wa lines
2619                         for (int i = 0x30F7; i < 0x30FB; i++)
2620                                 map [i] = new CharMapEntry (map [i - 8].Category,
2621                                         map [i - 8].Level1,
2622                                         3);
2623
2624                         // JIS Japanese square chars.
2625                         fillIndex [0x22] = 0x97;
2626                         jisJapanese.Sort (JISComparer.Instance);
2627                         foreach (JISCharacter j in jisJapanese)
2628                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2629                                         AddCharMap ((char) j.CP, 0x22, 1);
2630                         // non-JIS Japanese square chars.
2631                         nonJisJapanese.Sort (NonJISComparer.Instance);
2632                         foreach (NonJISCharacter j in nonJisJapanese)
2633                                 AddCharMap ((char) j.CP, 0x22, 1);
2634
2635                         // Bopomofo
2636                         fillIndex [0x23] = 0x02;
2637                         for (int i = 0x3105; i <= 0x312C; i++)
2638                                 AddCharMap ((char) i, 0x23, 1);
2639
2640                         // Estrangela: ancient Syriac
2641                         fillIndex [0x24] = 0x0B;
2642                         // FIXME: is 0x71E really alternative form?
2643                         ArrayList syriacAlternatives = new ArrayList (
2644                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2645                         for (int i = 0x0710; i <= 0x072C; i++) {
2646                                 if (i == 0x0711) // NonSpacingMark
2647                                         continue;
2648                                 if (syriacAlternatives.Contains (i))
2649                                         continue;
2650                                 AddCharMap ((char) i, 0x24, 4);
2651                                 // FIXME: why?
2652                                 if (i == 0x721)
2653                                         fillIndex [0x24]++;
2654                         }
2655                         foreach (int cp in syriacAlternatives)
2656                                 map [cp] = new CharMapEntry (0x24,
2657                                         (byte) (map [cp - 1].Level1 + 2),
2658                                         0);
2659                         // FIXME: Syriac NonSpacingMark should go here.
2660
2661                         // Thaana
2662                         // FIXME: it turned out that it does not look like UCA
2663                         fillIndex [0x24] = 0x6E;
2664                         fillIndex [0x1] = 0xAC;
2665                         for (int i = 0; i < orderedThaana.Length; i++) {
2666                                 char c = orderedThaana [i];
2667                                 if (IsIgnorableNonSpacing ((int) c))
2668                                         AddCharMap (c, 1, 1);
2669                                 AddCharMap (c, 0x24, 2);
2670                                 if (c == '\u0782') // SPECIAL CASE: why?
2671                                         fillIndex [0x24] += 2;
2672                         }
2673                         #endregion
2674
2675                         // FIXME: Add more culture-specific letters (that are
2676                         // not supported in Windows collation) here.
2677
2678                         // Surrogate ... they are computed.
2679
2680                         #region Hangul
2681                         // Hangul.
2682                         //
2683                         // Unlike UCA, the Windows Hangul sequence mixes Jongseong
2684                         // into the Choseong sequence, as well as Jungseong,
2685                         // adjusted so that the same base character gets the
2686                         // same primary weight. So it is impossible to compute
2687                         // those sort keys.
2688                         //
2689                         // Here I introduce an ordered sequence of mixed
2690                         // 'commands' and 'characters' that is similar to
2691                         // LDML text:
2692                         //      - ',' increases primary weight.
2693                         //      - [A B] means a range, increasing index
2694                         //      - {A B} means a range, without increasing index
2695                         //      - '=' is no operation (it means the characters 
2696                         //        of both sides have the same weight).
2697                         //      - '>' inserts a Hangul Syllable block that 
2698                         //        contains 0x251 characters.
2699                         //      - '<' decreases the index
2700                         //      - '0'-'9' means skip count
2701                         //      - whitespaces are ignored
2702                         //
2703
2704                         string hangulSequence =
2705                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2706                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2707                         + "<{\u1113 \u1116}, \u3165,"
2708                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2709                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2710                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2711                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2712                                 + "[\u11D1 \u11D2], \u11B2,"
2713                                 + "[\u11D3 \u11D5], \u11B3,"
2714                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2715                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2716                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2717                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2718                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2719                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2720                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2721                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2722                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2723                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2724                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2725                         + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2726                                 + "\u11F1,, \u11F2,,,"
2727                                 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2728                         + "<\u114D, \u110D,,  >"
2729                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2730                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2731                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2732                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2733                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2734                                 + "[\u11F5 \u11F8]"
2735                         ;
2736
2737                         byte hangulCat = 0x52;
2738                         fillIndex [hangulCat] = 0x2;
2739
2740                         int syllableBlock = 0;
2741                         for (int n = 0; n < hangulSequence.Length; n++) {
2742                                 char c = hangulSequence [n];
2743                                 int start, end;
2744                                 if (Char.IsWhiteSpace (c))
2745                                         continue;
2746                                 switch (c) {
2747                                 case '=':
2748                                         break; // NOP
2749                                 case ',':
2750                                         IncrementSequentialIndex (ref hangulCat);
2751                                         break;
2752                                 case '<':
2753                                         if (fillIndex [hangulCat] == 2)
2754                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2755                                         fillIndex [hangulCat]--;
2756                                         break;
2757                                 case '>':
2758                                         IncrementSequentialIndex (ref hangulCat);
2759                                         for (int l = 0; l < 0x15; l++)
2760                                                 for (int v = 0; v < 0x1C; v++) {
2761                                                         AddCharMap (
2762                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2763                                                         IncrementSequentialIndex (ref hangulCat);
2764                                                 }
2765                                         syllableBlock++;
2766                                         break;
2767                                 case '[':
2768                                         start = hangulSequence [n + 1];
2769                                         end = hangulSequence [n + 3];
2770                                         for (int i = start; i <= end; i++) {
2771                                                 AddCharMap ((char) i, hangulCat, 0);
2772                                                 if (end > i)
2773                                                         IncrementSequentialIndex (ref hangulCat);
2774                                         }
2775                                         n += 4; // consumes 5 characters for this operation
2776                                         break;
2777                                 case '{':
2778                                         start = hangulSequence [n + 1];
2779                                         end = hangulSequence [n + 3];
2780                                         for (int i = start; i <= end; i++)
2781                                                 AddCharMap ((char) i, hangulCat, 0);
2782                                         n += 4; // consumes 5 characters for this operation
2783                                         break;
2784                                 default:
2785                                         AddCharMap (c, hangulCat, 0);
2786                                         break;
2787                                 }
2788                         }
2789
2790                         // Some Jamo NFKD.
2791                         for (int i = 0x3200; i < 0x3300; i++) {
2792                                 if (IsIgnorable (i) || map [i].Defined)
2793                                         continue;
2794                                 int ch = 0;
2795                                 // w/ bracket
2796                                 if (decompLength [i] == 4 &&
2797                                         decompValues [decompIndex [i]] == '(')
2798                                         ch = decompIndex [i] + 1;
2799                                 // circled
2800                                 else if (decompLength [i] == 2 &&
2801                                         decompValues [decompIndex [i] + 1] == '\u1161')
2802                                         ch = decompIndex [i];
2803                                 else if (decompLength [i] == 1)
2804                                         ch = decompIndex [i];
2805                                 else
2806                                         continue;
2807                                 ch = decompValues [ch];
2808                                 if (ch < 0x1100 || 0x1200 < ch &&
2809                                         ch < 0xAC00 || 0xD800 < ch)
2810                                         continue;
2811
2812                                 // SPECIAL CASE ?
2813                                 int offset = i < 0x3260 ? 1 : 0;
2814                                 if (0x326E <= i && i <= 0x3273)
2815                                         offset = 1;
2816
2817                                 map [i] = new CharMapEntry (map [ch].Category,
2818                                         (byte) (map [ch].Level1 + offset),
2819                                         map [ch].Level2);
2820 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2821                         }
2822
2823
2824                         #endregion
2825
2826                         // Letterlike characters and CJK compatibility square
2827                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2828                         int [] counts = new int ['Z' - 'A' + 1];
2829                         char [] namedChars = new char [sortableCharNames.Count];
2830                         int nCharNames = 0;
2831                         foreach (DictionaryEntry de in sortableCharNames) {
2832                                 counts [((string) de.Value) [0] - 'A']++;
2833                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2834                         }
2835                         nCharNames = 0; // reset
2836                         for (int a = 0; a < counts.Length; a++) {
2837                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2838                                 for (int i = 0; i < counts [a]; i++)
2839 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2840                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2841                         }
2842
2843                         // CJK unified ideograph.
2844                         byte cjkCat = 0x9E;
2845                         fillIndex [cjkCat] = 0x2;
2846                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2847                                 if (!IsIgnorable (cp))
2848                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2849                         // CJK Extensions goes here.
2850                         // LAMESPEC: With this Windows style CJK layout, it is
2851                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2852                         // 0x9FBB can never be added w/o breaking compat.
2853                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2854                                 if (!IsIgnorable (cp))
2855                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2856
2857                         // PrivateUse ... computed.
2858                         // remaining Surrogate ... computed.
2859
2860                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2861                         // non-alphanumeric ASCII except for: + - < = > '
2862                         for (int i = 0x21; i < 0x7F; i++) {
2863                                 if (Char.IsLetterOrDigit ((char) i)
2864                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2865                                         continue; // they are not added here.
2866                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2867                                 // Insert 3001 after ',' and 3002 after '.'
2868                                 if (i == 0x2C)
2869                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2870                                 else if (i == 0x2E)
2871                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2872                                 else if (i == 0x3A)
2873                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2874                         }
2875                         #endregion
2876
2877                         #region 07 - Punctuations and something else
2878                         for (int i = 0xA0; i < char.MaxValue; i++) {
2879                                 if (IsIgnorable (i))
2880                                         continue;
2881
2882                                 // FIXME: actually those reset should not be 
2883                                 // done but here I put for easy goal.
2884                                 if (i == 0x0700)
2885                                         fillIndex [0x7] = 0xE2;
2886                                 if (i == 0x2016)
2887                                         fillIndex [0x7] = 0x77;
2888
2889                                 // SPECIAL CASES:
2890                                 switch (i) {
2891                                 case 0xAB: // 08
2892                                 case 0xB7: // 0A
2893                                 case 0xBB: // 08
2894                                 case 0x02B9: // 01
2895                                 case 0x02BA: // 01
2896                                 case 0x2329: // 09
2897                                 case 0x232A: // 09
2898                                         continue;
2899                                 }
2900
2901                                 switch (Char.GetUnicodeCategory ((char) i)) {
2902                                 case UnicodeCategory.OtherPunctuation:
2903                                 case UnicodeCategory.ClosePunctuation:
2904                                 case UnicodeCategory.OpenPunctuation:
2905                                 case UnicodeCategory.ConnectorPunctuation:
2906                                 case UnicodeCategory.InitialQuotePunctuation:
2907                                 case UnicodeCategory.FinalQuotePunctuation:
2908                                 case UnicodeCategory.ModifierSymbol:
2909                                         // SPECIAL CASES: // 0xA
2910                                         if (0x2020 <= i && i <= 0x2031)
2911                                                 continue;
2912                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2913                                         break;
2914                                 default:
2915                                         if (i == 0xA6 || i == 0x1C3) // SPECIAL CASE. FIXME: why?
2916                                                 goto case UnicodeCategory.OtherPunctuation;
2917                                         break;
2918                                 }
2919                         }
2920                         // Control pictures
2921                         // FIXME: it should not need to reset level 1, but
2922                         // it's for easy goal.
2923                         fillIndex [0x7] = 0xB6;
2924                         for (int i = 0x2400; i <= 0x2421; i++)
2925                                 AddCharMap ((char) i, 0x7, 1, 0);
2926
2927                         // Actually 3008-301F and FE33-FE5D are mixed, so
2928                         // it's somewhat countable, but not as a whole. Thus
2929                         // manual remapping is quicker.
2930                         fillIndex [0x7] = 0x8D;
2931                         int [] cjkCompatMarks1 = new int [] {
2932                                 0xFE33, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C};
2933                         int [] cjkCompatMarks2 = new int [] {
2934                                 0xFE34, 0xFE3F, 0xFE40, 0xFE3D, 0xFE3E, 0xFE41,
2935                                 0xFE42, 0xFE43, 0xFE44, 0xFE3B, 0xFE3C/*FE5D*/,
2936                                 0xFE39/*FE5E*/, 0xFE3A};
2937                         for (int i = 0; i < cjkCompatMarks1.Length; i++)
2938                                 map [cjkCompatMarks1 [i]] = new CharMapEntry (
2939                                         0x7, fillIndex [0x7]++, 0);
2940                         for (int i = 0; i < cjkCompatMarks2.Length; i++) {
2941                                 map [cjkCompatMarks2 [i]] = new CharMapEntry (
2942                                         0x7, fillIndex [0x7], 0);
2943                                 fillIndex [0x7] += 2;
2944                                 switch (cjkCompatMarks2 [i]) {
2945                                 case 0xFE3C:
2946                                         map [0xFE5D] = new CharMapEntry (
2947                                                 0x7, fillIndex [0x7]++, 0);
2948                                         break;
2949                                 case 0xFE39:
2950                                         map [0xFE5D] = new CharMapEntry (
2951                                                 0x7, fillIndex [0x7]++, 0);
2952                                         break;
2953                                 }
2954                         }
2955
2956                         fillIndex [0x7] = 0x93;
2957                         for (int i = 0x3008; i <= 0x3011; i++) {
2958                                 map [i] = new CharMapEntry (0x7,
2959                                         fillIndex [0x7], 0);
2960                                 fillIndex [0x7] += 2;
2961                         }
2962                         fillIndex [0x7] += 3;
2963                         map [0x3014] = new CharMapEntry (0x7, fillIndex [0x7], 0);
2964                         fillIndex [0x7] += 3;
2965                         map [0x3015] = new CharMapEntry (0x7, fillIndex [0x7], 0);
2966                         fillIndex [0x7] += 2;
2967                         for (int i = 0x3016; i < 0x301F; i++)
2968                                 map [i] = new CharMapEntry (0x7,
2969                                         fillIndex [0x7]++, 0);
2970
2971                         #endregion
2972
2973                         // FIXME: for 07 xx we need more love.
2974
2975                         // Characters w/ diacritical marks (NFKD)
2976                         for (int i = 0; i <= char.MaxValue; i++) {
2977                                 if (map [i].Defined || IsIgnorable (i))
2978                                         continue;
2979                                 if (decompIndex [i] == 0)
2980                                         continue;
2981
2982                                 int start = decompIndex [i];
2983                                 int primaryChar = decompValues [start];
2984                                 int secondary = 0;
2985                                 bool skip = false;
2986                                 int length = decompLength [i];
2987                                 // special processing for parenthesized ones.
2988                                 if (length == 3 &&
2989                                         decompValues [start] == '(' &&
2990                                         decompValues [start + 2] == ')') {
2991                                         primaryChar = decompValues [start + 1];
2992                                         length = 1;
2993                                 }
2994
2995                                 if (map [primaryChar].Level1 == 0)
2996                                         continue;
2997
2998                                 for (int l = 1; l < length; l++) {
2999                                         int c = decompValues [start + l];
3000                                         if (map [c].Level1 != 0)
3001                                                 skip = true;
3002                                         secondary += diacritical [c];
3003                                 }
3004                                 if (skip)
3005                                         continue;
3006                                 map [i] = new CharMapEntry (
3007                                         map [primaryChar].Category,
3008                                         map [primaryChar].Level1,
3009                                         (byte) secondary);
3010                                 
3011                         }
3012
3013                         // category 08 - symbols
3014                         fillIndex [0x8] = 2;
3015                         // Here Windows mapping is not straightforward. It is
3016                         // not based on computation but seems manual sorting.
3017                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
3018                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
3019                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
3020                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
3021                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
3022                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
3023                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
3024                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
3025                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
3026                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
3027                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
3028                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
3029                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
3030
3031                         for (int cp = 0; cp < 0x2300; cp++) {
3032                                 if (cp == 0xAC) // SPECIAL CASE: skip
3033                                         continue;
3034                                 if (cp == 0x200) {
3035                                         cp = 0x2200; // skip to 2200
3036                                         fillIndex [0x8] = 0x21;
3037                                 }
3038                                 if (cp == 0x2295)
3039                                         fillIndex [0x8] = 0x3;
3040                                 if (cp == 0x22B2)
3041                                         fillIndex [0x8] = 0xB9;
3042                                 if (!map [cp].Defined &&
3043 //                                      Char.GetUnicodeCategory ((char) cp) ==
3044 //                                      UnicodeCategory.MathSymbol)
3045                                         Char.IsSymbol ((char) cp))
3046                                         AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
3047                                 // SPECIAL CASES: no idea why Windows sorts as such
3048                                 switch (cp) {
3049                                 case 0x3E:
3050                                         AddCharMap ('\u227B', 0x8, 1, 0);
3051                                         AddCharMap ('\u22B1', 0x8, 1, 0);
3052                                         break;
3053                                 case 0xB1:
3054                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
3055                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
3056                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
3057                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
3058                                         break;
3059                                 case 0xF7:
3060                                         AddCharMap ('\u01C0', 0x8, 1, 0);
3061                                         AddCharMap ('\u01C1', 0x8, 1, 0);
3062                                         AddCharMap ('\u01C2', 0x8, 1, 0);
3063                                         break;
3064                                 }
3065                         }
3066
3067                         #region Level2 adjustment
3068                         // Arabic Hamzah
3069                         diacritical [0x624] = 0x5;
3070                         diacritical [0x626] = 0x7;
3071                         diacritical [0x622] = 0x9;
3072                         diacritical [0x623] = 0xA;
3073                         diacritical [0x625] = 0xB;
3074                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
3075                         diacritical [0x64A] = 0x7; // Yaa'
3076
3077                         for (int i = 0; i < char.MaxValue; i++) {
3078                                 byte mod = 0;
3079                                 byte cat = map [i].Category;
3080                                 switch (cat) {
3081                                 case 0xE: // Latin diacritics
3082                                 case 0x22: // Japanese: circled characters
3083                                         mod = diacritical [i];
3084                                         break;
3085                                 case 0x13: // Arabic
3086                                         if (diacritical [i] == 0 && i >= 0xFE8D)
3087                                                 mod = 0x8; // default for arabic
3088                                         break;
3089                                 }
3090                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
3091                                         mod = diacritical [i];
3092                                 if (mod > 0)
3093                                         map [i] = new CharMapEntry (
3094                                                 cat, map [i].Level1, mod);
3095                         }
3096                         #endregion
3097
3098                         // FIXME: this is halfly hack but those NonSpacingMark 
3099                         // characters and still undefined are likely to
3100                         // be nonspacing.
3101                         for (int i = 0; i < char.MaxValue; i++) {
3102                                 if (map [i].Defined ||
3103                                         IsIgnorable (i))
3104                                         continue;
3105                                 switch (i) {
3106                                 // SPECIAL CASES.
3107                                 case 0x02B9:
3108                                 case 0x02BA:
3109                                         break;
3110                                 default:
3111                                         if (Char.GetUnicodeCategory ((char) i) !=
3112                                         UnicodeCategory.NonSpacingMark)
3113                                                 continue;
3114                                         break;
3115                                 }
3116                                 if (diacritical [i] != 0)
3117                                         map [i] = new CharMapEntry (1, 1, diacritical [i]);
3118                                 else
3119                                         AddCharMap ((char) i, 1, 1);
3120                         }
3121
3122                         // FIXME: this is hack but those Symbol characters
3123                         // are likely to fall into 0xA category.
3124                         for (int i = 0; i < char.MaxValue; i++)
3125                                 if (!map [i].Defined &&
3126                                         !IsIgnorable (i) &&
3127                                         Char.IsSymbol ((char) i))
3128                                         AddCharMap ((char) i, 0xA, 1);
3129                 }
3130
// Advances the fill index of the category referenced by hangulCat by one.
// When that index wraps around to zero, hangulCat itself moves on to the
// next category and the fill index of that category restarts at 0x2.
private void IncrementSequentialIndex (ref byte hangulCat)
{
        fillIndex [hangulCat] += 1;
        if (fillIndex [hangulCat] != 0)
                return; // no wrap-around: stay in the current category

        hangulCat += 1;
        fillIndex [hangulCat] = 0x2;
}
3139
// Pins the fill index of the category to alphaWeight, then maps c through
// AddLetterMap() followed by every code point that latinMap lists for c.
private void AddAlphaMap (char c, byte category, byte alphaWeight)
{
        fillIndex [category] = alphaWeight;
        AddLetterMap (c, category, 0);

        // latinMap may hold no list for c; in that case only c is mapped.
        ArrayList related = latinMap [c] as ArrayList;
        if (related != null) {
                foreach (int codePoint in related)
                        AddLetterMap ((char) codePoint, category, 0);
        }
}
3153
// Maps `voices` consecutive code points starting at i, each together with
// the code point 0x60 above it, into category 0x22 without moving the
// fill index. The first pair gets level 2 weight 0; the pair at
// offset n (n > 0) gets n + 2.
private void AddKanaMap (int i, byte voices)
{
        for (int offset = 0; offset < voices; offset++) {
                byte level2 = 0;
                if (offset > 0)
                        level2 = (byte) (offset + 2);

                char hiragana = (char) (i + offset);
                AddLetterMapCore (hiragana, 0x22, 0, level2, false);
                // The Katakana counterpart sits 0x60 above the Hiragana.
                AddLetterMapCore ((char) (hiragana + 0x60), 0x22, 0, level2, false);
        }
}
3165
// Convenience form of AddLetterMapCore(): passes 0 as level2 and
// true for deferLevel2.
private void AddLetterMap (char c, byte category, byte updateCount)
{
        const byte level2 = 0;
        const bool deferLevel2 = true;
        AddLetterMapCore (c, category, updateCount, level2, deferLevel2);
}
3170
// Maps a letter and its related forms into the given category:
//   1. the result of ToSmallForm(), when it differs from c, goes through
//      AddCharMapGroup() with updateCount (<small> updates the index);
//   2. the invariant-culture lowercase form, when it differs from c and
//      is not mapped yet, is handled recursively with an update count
//      of 0;
//   3. c itself is added through AddCharMapGroup() unless it is ignorable
//      or already mapped, and only in that case the fill index of the
//      category is advanced by updateCount.
private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
        char smallForm = ToSmallForm (c);
        if (smallForm != c)
                AddCharMapGroup (smallForm, category, updateCount, level2, deferLevel2);

        char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
        if (lower != c && !map [(int) lower].Defined)
                AddLetterMapCore (lower, category, 0, level2, deferLevel2);

        if (!IsIgnorable ((int) c) && !map [(int) c].Defined) {
                AddCharMapGroup (c, category, 0, level2, deferLevel2);
                fillIndex [category] += updateCount;
        }
}
3189
// Shorthand for the four-argument AddCharMap() with alt = 0.
private bool AddCharMap (char c, byte category, byte increment)
{
        const byte alt = 0;
        return AddCharMap (c, category, increment, alt);
}
3194                 
// Creates the map entry for c unless c is ignorable or already mapped.
// For every category except 1 the current fill index becomes the level 1
// weight and alt the level 2 weight; for category 1 the two are swapped.
// After an entry is created the fill index of the category is advanced
// by increment. Returns true when an entry was created, false otherwise.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
        int cp = (int) c;
        if (IsIgnorable (cp) || map [cp].Defined)
                return false; // do nothing

        byte level1, level2;
        if (category == 1) {
                level1 = alt;
                level2 = fillIndex [category];
        } else {
                level1 = fillIndex [category];
                level2 = alt;
        }

        map [cp] = new CharMapEntry (category, level1, level2);
        fillIndex [category] += increment;
        return true;
}
3205
//
// AddCharMapGroup() fills the table in the order below
// ("+" marks a point where the weight goes up):
//      (<small> +)
//      the character itself
//      <fraction>
//      <full> | <super> | <sub>
//      <circle> | <wide> (| <narrow>)
//      +
//      (<vertical> +)
//
// level2 is fixed (it is never incremented).
//
// The decomposition kinds listed here are the ones that are added at the
// same index as the character itself, in this order.
int [] sameWeightItems = {
        DecompositionFraction,
        DecompositionFull, DecompositionSuper, DecompositionSub,
        DecompositionCircle, DecompositionWide, DecompositionNarrow,
};
// Shorthand for the five-argument AddCharMapGroup() with deferLevel2
// switched off, so level2 is used exactly as passed in.
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
        const bool deferLevel2 = false;
        AddCharMapGroup (c, category, updateCount, level2, deferLevel2);
}
3231
// Maps c together with the compatibility variants that nfkdMap records
// for it. Nothing happens when c is already mapped. Otherwise AddCharMap()
// is called in this order:
//   - the <small> variant, with increment updateCount;
//   - c itself, with increment 0;
//   - every variant kind listed in sameWeightItems, with increment 0;
//   - (the fill index of the category is advanced by updateCount here)
//   - the <vertical> variant, with increment updateCount.
// When deferLevel2 is set, the level2 argument is not used as passed:
// it is taken from the diacritical table instead (see the individual
// steps below).
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
        if (map [(int) c].Defined)
                return;

        if (deferLevel2)
                level2 = diacritical [(int) c];

        // Fetch the <small> and <vertical> variants up front;
        // char.MinValue stands for "no such variant".
        char smallForm = char.MinValue;
        char verticalForm = char.MinValue;
        Hashtable variants = (Hashtable) nfkdMap [(int) c];
        bool hasVariants = variants != null;
        if (hasVariants) {
                object boxed = variants [(byte) DecompositionSmall];
                if (boxed != null)
                        smallForm = (char) ((int) boxed);
                boxed = variants [(byte) DecompositionVertical];
                if (boxed != null)
                        verticalForm = (char) ((int) boxed);
        }

        // <small> comes first and updates the index.
        if (smallForm != char.MinValue) {
                if (deferLevel2 && level2 == 0)
                        level2 = diacritical [smallForm];
                AddCharMap (smallForm, category, updateCount, level2);
        }

        // The character itself.
        AddCharMap (c, category, 0, level2);

        // Variants that are added without moving the index. With
        // deferLevel2 each one gets its own value from the diacritical table.
        for (int k = 0; hasVariants && k < sameWeightItems.Length; k++) {
                object boxed = variants [(byte) sameWeightItems [k]];
                if (boxed == null)
                        continue;
                int variant = (int) boxed;
                if (deferLevel2)
                        level2 = diacritical [variant];
                AddCharMap ((char) variant, category, 0, level2);
        }

        // The index is updated here.
        fillIndex [category] += updateCount;

        // <vertical> follows the index update.
        if (verticalForm != char.MinValue) {
                if (deferLevel2 && level2 == 0)
                        level2 = diacritical [verticalForm];
                AddCharMap (verticalForm, category, updateCount, level2);
        }
}
3282
3283                 private void AddCharMapCJK (char c, ref byte category)
3284                 {
                             // Registers c for the given category, then advances the
                             // sequential index. category is passed by ref so that
                             // IncrementSequentialIndex can change it.
3285                         AddCharMap (c, category, 0, 0);
3286                         IncrementSequentialIndex (ref category);
3287
                             // Windows leaves position 9E F9 unused (reason unknown): when
                             // the new position is exactly that one, step past it as well.
3288                         if (category != 0x9E || fillIndex [category] != 0xF9)
3289                                 return;
3290                         IncrementSequentialIndex (ref category);
3291                 }
3292
3293                 private void AddCharMapGroupCJK (char c, ref byte category)
3294                 {
                             // Adds c through AddCharMapCJK, then a few hard-coded
                             // companions for specific characters, then every character
                             // found in nfkdMap [c] for decomposition type keys 0..0x12
                             // (with the exclusions noted below). Each addition goes
                             // through AddCharMapCJK, so each one advances the sequential
                             // index.
3295                         AddCharMapCJK (c, ref category);
3296
3297                         // LAMESPEC: see the note on the switch at the end of this
                             // method. These characters are placed here, right after the
                             // listed characters, and are skipped in the generic loop.
3298                         if (c == '\u5B78') {
3299                                 AddCharMapCJK ('\u32AB', ref category);
3300                                 AddCharMapCJK ('\u323B', ref category);
3301                         }
3302                         if (c == '\u52DE') {
3303                                 AddCharMapCJK ('\u3298', ref category);
3304                                 AddCharMapCJK ('\u3238', ref category);
3305                         }
3306                         if (c == '\u5BEB')
3307                                 AddCharMapCJK ('\u32A2', ref category);
3308                         if (c == '\u91AB')
3309                                 // This mapping order in particular makes no
3310                                 // apparent sense.
3311                                 AddCharMapCJK ('\u32A9', ref category);
3312
3313                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3314                         if (nfkd == null)
3315                                 return;
                             // Walk the decomposition type keys in ascending order.
3316                         for (byte weight = 0; weight <= 0x12; weight++) {
3317                                 object wv = nfkd [weight];
3318                                 if (wv == null)
3319                                         continue;
3320                                 int w = (int) wv;
3321
3322                                 // Special: U+F900..U+FAD9 are ignored in this area.
3323                                 // FIXME: check if it is sane
3324                                 if (0xF900 <= w && w <= 0xFAD9)
3325                                         continue;
3326                                 // LAMESPEC: on Windows some of the CJK characters
3327                                 // in 3200-32B0 are incorrectly mapped. They
3328                                 // mix Chinese and Japanese Kanji when
3329                                 // ordering those characters. The ones below
                                     // are handled by the hard-coded cases above.
3330                                 switch (w) {
3331                                 case 0x32A2: case 0x3298: case 0x3238:
3332                                 case 0x32A9: case 0x323B: case 0x32AB:
3333                                         continue;
3334                                 }
3335
3336                                 AddCharMapCJK ((char) w, ref category);
3337                         }
3338                 }
3339
3340                 // For now this is used only for category 0x7.
3341                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3342                 {
                             // Adds, in this order: the <small> form of c, c itself, every
                             // code point whose single-value <compat> decomposition is c,
                             // and the <vertical> form of c. Every AddCharMap call here is
                             // passed updateCount.
3343                         char small = char.MinValue;
3344                         char vertical = char.MinValue;
3345                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3346                         if (nfkd != null) {
3347                                 object smv = nfkd [(byte) DecompositionSmall];
3348                                 if (smv != null)
3349                                         small = (char) ((int) smv);
3350                                 object vv = nfkd [(byte) DecompositionVertical];
3351                                 if (vv != null)
3352                                         vertical = (char) ((int) vv);
3353                         }
3354
3355                         // The <small> form goes first. Note that it uses the
                             // three-argument AddCharMap overload, so level2 is not passed.
3356                         if (small != char.MinValue)
3357                                 // SPECIAL CASE excluded (FIXME: why?)
3358                                 if (small != '\u2024')
3359                                         AddCharMap (small, category, updateCount);
3360
3361                         // c itself
3362                         AddCharMap (c, category, updateCount, level2);
3363
3364                         // nfkdMap is problematic when two or more characters have
3365                         // an NFKD mapping to the same character, so iterate over all
                             // code points instead. The loop bound excludes U+FFFF.
3366                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3367                                 if (decompLength [c2] == 1 &&
3368                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
                                             // Only <compat> decompositions are added.
3369                                         switch (decompType [c2]) {
3370                                         case DecompositionCompat:
3371                                                 AddCharMap ((char) c2, category, updateCount, level2);
3372                                                 break;
3373                                         }
3374                                 }
3375                         }
3376
                             // The <vertical> form goes last.
3377                         if (vertical != char.MinValue)
3378                                 // SPECIAL CASE excluded (FIXME: why?)
3379                                 if (vertical != '\uFE33' && vertical != '\uFE34')
3380                                         AddCharMap (vertical, category, updateCount, level2);
3381                 }
3382
3383                 private void AddArabicCharMap (char c)
3384                 {
3385                         // Every AddCharMap call below uses category 6, an update count
3386                         // of 0 and level 2 weight 0; fillIndex [6] moves on by one at the end.
3387                         byte category = 6, level2 = 0, updateCount = 1;
3388                         int target = (int) c;
3389
3390                         // The character itself comes first.
3391                         AddCharMap (c, category, 0, level2);
3392
3393                         // nfkdMap is problematic when two or more characters have an
3394                         // NFKD mapping to the same character, so scan all code points
3395                         // and take those whose last decomposition value equals c.
3396                         for (int cp = 0; cp < char.MaxValue; cp++) {
3397                                 int len = decompLength [cp];
3398                                 if (len != 0 && (int) (decompValues [decompIndex [cp] + len - 1]) == target)
3399                                         AddCharMap ((char) cp, category, 0, level2);
3400                         }
3401
3402                         fillIndex [category] += updateCount;
3403                 }
3404
3405                 char ToSmallForm (char c)
3406                 {
                             // Delegates to ToDecomposed with DecompositionSmall and
                             // tail == false: yields the first decomposition value of c when
                             // its decomposition type is <small>, and c itself otherwise.
3407                         return ToDecomposed (c, DecompositionSmall, false);
3408                 }
3409
3410                 char ToDecomposed (char c, byte d, bool tail)
3411                 {
                             // Returns c unchanged unless its decomposition type equals d.
                             // Otherwise returns one value of its decomposition: the last
                             // one when tail is true, the first one when it is false.
3412                         int cp = (int) c;
3413                         if (decompType [cp] != d)
3414                                 return c;
3415                         // Offset of the wanted value inside the decomposition run.
3416                         int offset = tail ? decompLength [cp] - 1 : 0;
3417                         return (char) decompValues [decompIndex [cp] + offset];
3418                 }
3419
3420                 bool ExistsJIS (int cp)
3421                 {
                             // True when some entry of jisJapanese has CP equal to cp.
                             // The scan stops at the first match.
3422                         bool found = false;
3423                         foreach (JISCharacter entry in jisJapanese)
3424                                 if (entry.CP == cp) { found = true; break; }
3425                         return found;
3426                 }
3427
3428                 #endregion
3429
3430                 #region Level 3 properties (Case/Width)
3431
3432                 private byte ComputeLevel3Weight (char c)
3433                 {
                             // A raw weight of 0 is returned as is; any other raw weight is
                             // shifted up by 2 to give the value used in the sort key.
3434                         byte raw = ComputeLevel3WeightRaw (c);
3435                         return raw == 0 ? raw : (byte) (raw + 2);
3436                 }
3437
3438                 private byte ComputeLevel3WeightRaw (char c) // raw value; ComputeLevel3Weight adds 2 to non-zero results for the sortkey value
3439                 {
                             // Computes the level 3 (case/width) weight of c before the +2
                             // adjustment. The checks are ordered: the first matching rule
                             // returns, and only characters that match none of the fixed
                             // rules reach the flag-combining section at the end.

3440                         // CJK compat
3441                         if ('\u3192' <= c && c <= '\u319F')
3442                                 return 0;
3443
3444                         // They have <narrow> NFKD mapping, and on Windows
3445                         // those narrow characters are regarded as "normal",
3446                         // thus those characters themselves are regarded as
3447                         // "wide". grep "<narrow>" and you can pick them up
3448                         // (ignoring Kana, Hangul etc.)
3449                         switch (c) {
3450                         case '\u3002':
3451                         case '\u300C':
3452                         case '\u300D':
3453                         case '\u3001':
3454                         case '\u30FB':
3455                         case '\u2502':
3456                         case '\u2190':
3457                         case '\u2191':
3458                         case '\u2192':
3459                         case '\u2193':
3460                         case '\u25A0':
3461                         case '\u25CB':
3462                                 return 1;
3463                         }
3464                         // Korean
3465                         if ('\u11A8' <= c && c <= '\u11F9')
3466                                 return 2;
3467                         if ('\uFFA0' <= c && c <= '\uFFDC')
3468                                 return 4;
3469                         if ('\u3130' <= c && c <= '\u3164')
3470                                 return 5;
3471                         if ('\u3165' <= c && c <= '\u318E')
3472                                 return 4;
3473                         // Georgian Capital letters
3474                         if ('\u10A0' <= c && c <= '\u10C5')
3475                                 return 0x10;
3476                         // numbers
3477                         if ('\u2776' <= c && c <= '\u277F')
3478                                 return 4;
3479                         if ('\u2780' <= c && c <= '\u2789')
3480                                 return 8;
                             // U+2776..U+2789 already returned above, so this test only
                             // ever matches U+278A..U+2793.
3481                         if ('\u2776' <= c && c <= '\u2793')
3482                                 return 0xC;
3483                         if ('\u2160' <= c && c <= '\u216F')
3484                                 return 0x10;
3485                         if ('\u2181' <= c && c <= '\u2182')
3486                                 return 0x18;
3487                         // Arabic
                             // NOTE(review): U+2135..U+2138 look like the letterlike
                             // ALEF..DALET symbols rather than Arabic -- confirm the label.
3488                         if ('\u2135' <= c && c <= '\u2138')
3489                                 return 4;
3490                         if ('\uFE80' <= c && c < '\uFF00') {
3491                                 // 2(Isolated)/8(Final)/0x18(Medial). Any other
                                     // decomposition type falls through to the rules below.
3492                                 switch (decompType [(int) c]) {
3493                                 case DecompositionIsolated:
3494                                         return 2;
3495                                 case DecompositionFinal:
3496                                         return 8;
3497                                 case DecompositionMedial:
3498                                         return 0x18;
3499                                 }
3500                         }
3501
3502                         // The reason why these characters have weights is unknown.
3503                         switch (c) {
3504                         case '\u01BC':
3505                                 return 0x10;
3506                         case '\u06A9':
3507                                 return 0x20;
3508                         case '\u06AA':
3509                                 return 0x28;
3510                         // Gurmukhi
3511                         case '\u0A39':
3512                         case '\u0A59':
3513                         case '\u0A5A':
3514                         case '\u0A5B':
3515                         case '\u0A5E':
3516                                 return 0x10;
3517                         }
3518
                             // From here on the result is built by OR-ing flags together.
3519                         byte ret = 0;
3520                         switch (c) {
3521                         case '\u03C2':
3522                         case '\u2104':
3523                         case '\u212B':
3524                                 ret |= 8;
3525                                 break;
3526                         case '\uFE42':
3527                                 ret |= 0xC;
3528                                 break;
3529                         }
3530
3531                         // misc: for these three decomposition types the numeric
                             // value of the type itself is OR-ed into the weight.
3532                         switch (decompType [(int) c]) {
3533                         case DecompositionWide: // <wide>
3534                         case DecompositionSub: // <sub>
3535                         case DecompositionSuper: // <super>
3536                                 ret |= decompType [(int) c];
3537                                 break;
3538                         }
3539                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3540                                 ret |= 8;
3541                         if (isUppercase [(int) c]) // DerivedCoreProperties
3542                                 ret |= 0x10;
3543
3544                         return ret;
3545                 }
3546
3547                 #endregion
3548
3549                 #region IsIgnorable
3550 /*
3551                 static bool IsIgnorable (int i)
3552                 {
3553                         if (unicodeAge [i] >= 3.1)
3554                                 return true;
3555                         switch (char.GetUnicodeCategory ((char) i)) {
3556                         case UnicodeCategory.OtherNotAssigned:
3557                         case UnicodeCategory.Format:
3558                                 return true;
3559                         }
3560                         return false;
3561                 }
3562 */
3563
3564                 // FIXME: In the future use DerivedAge.txt to examine character
3565                 // versions and set those that have a version higher than
3566                 // 1.0 as ignorable.
3567                 static bool IsIgnorable (int i)
3568                 {
                             // Decides whether code point i is ignorable. Evaluated in
                             // order: an explicit per-code-point switch (which can force
                             // either answer), then a list of ranges that are ignorable,
                             // then the Unicode category of the character.
3569                         switch (i) {
3570                         case 0:
3571                         // I guess these characters were added between
3572                         // Unicode 1.0 (LCMapString) and Unicode 3.1
3573                         // (UnicodeCategory), so they used to be
3574                         // something like OtherNotAssigned as of Unicode 1.1.
3575                         case 0x2df: case 0x387:
3576                         case 0x3d7: case 0x3d8: case 0x3d9:
3577                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3578                         case 0x400: case 0x40d: case 0x450: case 0x45d:
3579                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
3580                         case 0x653: case 0x654: case 0x655: case 0x66d:
3581                         case 0xb56:
3582                         case 0x1e9b: case 0x202f: case 0x20ad:
3583                         case 0x20ae: case 0x20af:
3584                         case 0x20e2: case 0x20e3:
3585                         case 0x2139: case 0x213a: case 0x2183:
3586                         case 0x2425: case 0x2426: case 0x2619:
3587                         case 0x2670: case 0x2671: case 0x3007:
3588                         case 0x3190: case 0x3191:
3589                         case 0xfffc: case 0xfffd:
3590                                 return true;
3591                         // Exceptions to the conditions that follow: these
3592                         // characters must NOT be ignored. The ranges below are
3593                         // strictly speaking incorrect (they should not be ignored
3594                         // wholesale), and most of these characters unfortunately
3595                         // fall inside them, so they are excluded here first.
3596                         case 0x4d8: case 0x4d9:
3597                         case 0x4e8: case 0x4e9:
3598                         case 0x70F:
3599                         case 0x3036: case 0x303f:
3600                         case 0x337b: case 0xfb1e:
3601                                 return false;
3602                         }
3603
3604                         if (
3605                                 // The whole Sinhala characters.
3606                                 0x0D82 <= i && i <= 0x0DF4
3607                                 // The whole Tibetan characters.
3608                                 || 0x0F00 <= i && i <= 0x0FD1
3609                                 // The whole Myanmar characters.
3610                                 || 0x1000 <= i && i <= 0x1059
3611                                 // The whole Ethiopic, Cherokee,
3612                                 // Canadian Syllabics, Ogham, Runic,
3613                                 // Tagalog, Hanunoo, Philippine,
3614                                 // Buhid, Tagbanwa, Khmer and Mongolian
3615                                 // characters.
3616                                 || 0x1200 <= i && i <= 0x1DFF
3617                                 // Greek extension characters.
3618                                 || 0x1F00 <= i && i <= 0x1FFF
3619                                 // The whole Braille characters.
3620                                 || 0x2800 <= i && i <= 0x28FF
3621                                 // CJK radical characters.
3622                                 || 0x2E80 <= i && i <= 0x2EF3
3623                                 // Kangxi radical characters.
3624                                 || 0x2F00 <= i && i <= 0x2FD5
3625                                 // Ideographic description characters.
3626                                 || 0x2FF0 <= i && i <= 0x2FFB
3627                                 // Bopomofo letter and final
3628                                 || 0x31A0 <= i && i <= 0x31B7
3629                                 // White square with quadrant characters.
3630                                 || 0x25F0 <= i && i <= 0x25F7
3631                                 // Ideographic telegraph symbols.
3632                                 || 0x32C0 <= i && i <= 0x32CB
3633                                 || 0x3358 <= i && i <= 0x3370
3634                                 || 0x33E0 <= i && i <= 0x33FF
3635                                 // The whole YI characters.
3636                                 || 0xA000 <= i && i <= 0xA48C
3637                                 || 0xA490 <= i && i <= 0xA4C6
3638                                 // Armenian small ligatures
3639                                 || 0xFB13 <= i && i <= 0xFB17
3640                                 // Hebrew, Arabic, variation selectors.
3641                                 || 0xFB1D <= i && i <= 0xFE2F
3642                                 // Arabic ligatures.
3643                                 || 0xFEF5 <= i && i <= 0xFEFC
3644                                 // FIXME: why are they excluded?
3645                                 || 0x01F6 <= i && i <= 0x01F9
3646                                 || 0x0218 <= i && i <= 0x0233
3647                                 || 0x02A9 <= i && i <= 0x02AD
3648                                 || 0x02EA <= i && i <= 0x02EE
3649                                 || 0x0349 <= i && i <= 0x036F
3650                                 || 0x0488 <= i && i <= 0x048F
3651                                 || 0x04D0 <= i && i <= 0x04FF
3652                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3653                                 || 0x06D6 <= i && i <= 0x06ED
3654                                 || 0x06FA <= i && i <= 0x06FE
3655                                 || 0x2048 <= i && i <= 0x204D
3656                                 || 0x20e4 <= i && i <= 0x20ea
3657                                 || 0x213C <= i && i <= 0x214B
3658                                 || 0x21EB <= i && i <= 0x21FF
3659                                 || 0x22F2 <= i && i <= 0x22FF
3660                                 || 0x237B <= i && i <= 0x239A
3661                                 || 0x239B <= i && i <= 0x23CF
3662                                 || 0x24EB <= i && i <= 0x24FF
3663                                 || 0x2596 <= i && i <= 0x259F
3664                                 || 0x25F8 <= i && i <= 0x25FF
3665                                 || 0x2672 <= i && i <= 0x2689
3666                                 || 0x2768 <= i && i <= 0x2775
3667                                 || 0x27d0 <= i && i <= 0x27ff
3668                                 || 0x2900 <= i && i <= 0x2aff
3669                                 || 0x3033 <= i && i <= 0x303F
3670                                 || 0x31F0 <= i && i <= 0x31FF
3671                                 || 0x3250 <= i && i <= 0x325F
3672                                 || 0x32B1 <= i && i <= 0x32BF
3673                                 || 0x3371 <= i && i <= 0x337B
3674                                 || 0xFA30 <= i && i <= 0xFA6A
3675                         )
3676                                 return true;
3677
                             // Everything else is decided by the Unicode category alone.
3678                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3679                         switch (uc) {
3680                         case UnicodeCategory.PrivateUse:
3681                         case UnicodeCategory.Surrogate:
3682                                 return false;
3683                         // ignored by nature
3684                         case UnicodeCategory.Format:
3685                         case UnicodeCategory.OtherNotAssigned:
3686                                 return true;
3687                         default:
3688                                 return false;
3689                         }
3690                 }
3691
3692                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3693
3694                 /*
3695                 public static void Main ()
3696                 {
3697                         for (int i = 0; i <= char.MaxValue; i++)
3698                                 Dump (i, IsIgnorable (i));
3699                 }
3700
3701                 static void Dump (int i, bool ignore)
3702                 {
3703                         switch (Char.GetUnicodeCategory ((char) i)) {
3704                         case UnicodeCategory.PrivateUse:
3705                         case UnicodeCategory.Surrogate:
3706                                 return; // check nothing
3707                         }
3708
3709                         string s1 = "";
3710                         string s2 = new string ((char) i, 10);
3711                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3712                         if ((ret == 0) == ignore)
3713                                 return;
3714                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3715                 }
3716                 */
3717                 #endregion // IsIgnorable
3718
3719                 #region IsIgnorableSymbol
3720                 static bool IsIgnorableSymbol (int i)
3721                 {
                             // Decides whether code point i counts as an ignorable symbol.
                             // Anything IsIgnorable accepts is accepted here too. After
                             // that come an explicit per-code-point switch (which can
                             // force either answer), a few ranges, and finally rules based
                             // on the Unicode category and the Char.Is* predicates.
3722                         if (IsIgnorable (i))
3723                                 return true;
3724
3725                         switch (i) {
3726                         // *Letter
3727                         case 0x00b5: case 0x01C0: case 0x01C1:
3728                         case 0x01C2: case 0x01C3: case 0x01F6:
3729                         case 0x01F7: case 0x01F8: case 0x01F9:
3730                         case 0x02D0: case 0x02EE: case 0x037A:
3731                         case 0x03D7: case 0x03F3:
3732                         case 0x0400: case 0x040d:
3733                         case 0x0450: case 0x045d:
3734                         case 0x048C: case 0x048D:
3735                         case 0x048E: case 0x048F:
3736                         case 0x0587: case 0x0640: case 0x06E5:
3737                         case 0x06E6: case 0x06FA: case 0x06FB:
3738                         case 0x06FC: case 0x093D: case 0x0950:
3739                         case 0x1E9B: case 0x2139: case 0x3006:
3740                         case 0x3033: case 0x3034: case 0x3035:
3741                         case 0xFE7E: case 0xFE7F:
3742                         // OtherNumber
3743                         case 0x16EE: case 0x16EF: case 0x16F0:
3744                         // LetterNumber
3745                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3746                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3747                         case 0x3038: // HANGZHOU NUMERAL TEN
3748                         case 0x3039: // HANGZHOU NUMERAL TWENTY
3749                         case 0x303a: // HANGZHOU NUMERAL THIRTY
3750                         // OtherSymbol
3751                         case 0x2117:
3752                         case 0x327F:
3753                                 return true;
                             // The cases from here to "return false" are explicitly
                             // NOT ignorable symbols.
3754                         // ModifierSymbol
3755                         case 0x02B9: case 0x02BA: case 0x02C2:
3756                         case 0x02C3: case 0x02C4: case 0x02C5:
3757                         case 0x02C8: case 0x02CC: case 0x02CD:
3758                         case 0x02CE: case 0x02CF: case 0x02D2:
3759                         case 0x02D3: case 0x02D4: case 0x02D5:
3760                         case 0x02D6: case 0x02D7: case 0x02DE:
3761                         case 0x02E5: case 0x02E6: case 0x02E7:
3762                         case 0x02E8: case 0x02E9:
3763                         case 0x309B: case 0x309C:
3764                         // OtherPunctuation
3765                         case 0x055A: // Armenian Apos
3766                         case 0x05C0: // Hebrew Punct
3767                         case 0x0E4F: // Thai FONGMAN
3768                         case 0x0E5A: // Thai ANGKHANKHU
3769                         case 0x0E5B: // Thai KHOMUT
3770                         // CurrencySymbol
3771                         case 0x09F2: // Bengali Rupee Mark
3772                         case 0x09F3: // Bengali Rupee Sign
3773                         // MathSymbol
3774                         case 0x221e: // INF.
3775                         // OtherSymbol
3776                         case 0x0482:
3777                         case 0x09FA:
3778                         case 0x0B70:
3779                                 return false;
3780                         }
3781
3782                         // *Letter (the last two ranges apply only when NET_2_0 is defined)
3783                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3784 #if NET_2_0
3785                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3786                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3787 #endif
3788                         )
3789                                 return true;
3790
3791                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3792                         switch (uc) {
3793                         case UnicodeCategory.Surrogate:
3794                                 return false; // inconsistent
3795
                             // Marks and private-use characters are not ignorable symbols,
                             // except for the Arabic range tested below.
3796                         case UnicodeCategory.SpacingCombiningMark:
3797                         case UnicodeCategory.EnclosingMark:
3798                         case UnicodeCategory.NonSpacingMark:
3799                         case UnicodeCategory.PrivateUse:
3800                                 // NonSpacingMark
3801                                 if (0x064B <= i && i <= 0x0652) // Arabic
3802                                         return true;
3803                                 return false;
3804
3805                         case UnicodeCategory.Format:
3806                         case UnicodeCategory.OtherNotAssigned:
3807                                 return true;
3808
3809                         default:
                                     // "use" means: inside one of the ranges below and not
                                     // a letter or digit. Such characters are kept.
3810                                 bool use = false;
3811                                 // OtherSymbols
3812                                 if (
3813                                         // latin in a circle
3814                                         0x249A <= i && i <= 0x24E9
3815                                         || 0x2100 <= i && i <= 0x2132
3816                                         // Japanese
3817                                         || 0x3196 <= i && i <= 0x31A0
3818                                         // Korean
3819                                         || 0x3200 <= i && i <= 0x321C
3820                                         // Chinese/Japanese
3821                                         || 0x322A <= i && i <= 0x3243
3822                                         // CJK
3823                                         || 0x3260 <= i && i <= 0x32B0
3824                                         || 0x32D0 <= i && i <= 0x3357
3825                                         || 0x337B <= i && i <= 0x33DD
3826                                 )
3827                                         use = !Char.IsLetterOrDigit ((char) i);
3828                                 if (use)
3829                                         return false;
3830
3831                                 // This "Digit" rule is a mystery.
3832                                 // It filters some symbols out.
3833                                 if (Char.IsLetterOrDigit ((char) i))
3834                                         return false;
3835                                 if (Char.IsNumber ((char) i))
3836                                         return false;
                                     // Control, separator, punctuation and symbol
                                     // characters are the ignorable ones.
3837                                 if (Char.IsControl ((char) i)
3838                                         || Char.IsSeparator ((char) i)
3839                                         || Char.IsPunctuation ((char) i))
3840                                         return true;
3841                                 if (Char.IsSymbol ((char) i))
3842                                         return true;
3843
3844                                 // FIXME: should check more
3845                                 return false;
3846                         }
3847                 }
3848
3849                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3850 /*
3851                 public static void Main ()
3852                 {
3853                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3854                         for (int i = 0; i <= char.MaxValue; i++) {
3855                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3856                                 if (uc == UnicodeCategory.Surrogate)
3857                                         continue;
3858
3859                                 bool ret = IsIgnorableSymbol (i);
3860
3861                                 string s1 = "TEST ";
3862                                 string s2 = "TEST " + (char) i;
3863
3864                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3865
3866                                 if (ret != (result == 0))
3867                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3868                                                 ret ? "should not ignore" :
3869                                                         "should ignore",
3870                                                 i,(char) i, uc);
3871                         }
3872                 }
3873 */
3874                 #endregion
3875
3876                 #region NonSpacing
3877                 static bool IsIgnorableNonSpacing (int i)
3878                 {
3879                         if (IsIgnorable (i))
3880                                 return true;
3881
3882                         switch (i) {
3883                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3884                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3885                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3886                                 return true;
3887                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3888                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3889                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3890                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3891                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3892                         case 0x0CCD: case 0x0E4E:
3893                                 return false;
3894                         }
3895
3896                         if (0x02b9 <= i && i <= 0x02c5
3897                                 || 0x02cc <= i && i <= 0x02d7
3898                                 || 0x02e4 <= i && i <= 0x02ef
3899                                 || 0x20DD <= i && i <= 0x20E0
3900                         )
3901                                 return true;
3902
3903                         if (0x064B <= i && i <= 0x00652
3904                                 || 0x0941 <= i && i <= 0x0948
3905                                 || 0x0AC1 <= i && i <= 0x0ACD
3906                                 || 0x0C3E <= i && i <= 0x0C4F
3907                                 || 0x0E31 <= i && i <= 0x0E3F
3908                         )
3909                                 return false;
3910
3911                         return Char.GetUnicodeCategory ((char) i) ==
3912                                 UnicodeCategory.NonSpacingMark;
3913                 }
3914
3915                 // We can reuse IsIgnorableSymbol testcode 
3916                 // for IsIgnorableNonSpacing.
3917                 #endregion
3918         }
3919
3920         struct CharMapEntry
3921         {
3922                 public byte Category;
3923                 public byte Level1;
3924                 public byte Level2; // It is always single byte.
3925                 public bool Defined;
3926
3927                 public CharMapEntry (byte category, byte level1, byte level2)
3928                 {
3929                         Category = category;
3930                         Level1 = level1;
3931                         Level2 = level2;
3932                         Defined = true;
3933                 }
3934         }
3935
3936         class JISCharacter
3937         {
3938                 public readonly int CP;
3939                 public readonly int JIS;
3940
3941                 public JISCharacter (int cp, int cpJIS)
3942                 {
3943                         CP = cp;
3944                         JIS = cpJIS;
3945                 }
3946         }
3947
3948         class JISComparer : IComparer
3949         {
3950                 public static readonly JISComparer Instance =
3951                         new JISComparer ();
3952
3953                 public int Compare (object o1, object o2)
3954                 {
3955                         JISCharacter j1 = (JISCharacter) o1;
3956                         JISCharacter j2 = (JISCharacter) o2;
3957                         return j1.JIS - j2.JIS;
3958                 }
3959         }
3960
3961         class NonJISCharacter
3962         {
3963                 public readonly int CP;
3964                 public readonly string Name;
3965
3966                 public NonJISCharacter (int cp, string name)
3967                 {
3968                         CP = cp;
3969                         Name = name;
3970                 }
3971         }
3972
3973         class NonJISComparer : IComparer
3974         {
3975                 public static readonly NonJISComparer Instance =
3976                         new NonJISComparer ();
3977
3978                 public int Compare (object o1, object o2)
3979                 {
3980                         NonJISCharacter j1 = (NonJISCharacter) o1;
3981                         NonJISCharacter j2 = (NonJISCharacter) o2;
3982                         return string.CompareOrdinal (j1.Name, j2.Name);
3983                 }
3984         }
3985
3986         class DecimalDictionaryValueComparer : IComparer
3987         {
3988                 public static readonly DecimalDictionaryValueComparer Instance
3989                         = new DecimalDictionaryValueComparer ();
3990
3991                 private DecimalDictionaryValueComparer ()
3992                 {
3993                 }
3994
3995                 public int Compare (object o1, object o2)
3996                 {
3997                         DictionaryEntry e1 = (DictionaryEntry) o1;
3998                         DictionaryEntry e2 = (DictionaryEntry) o2;
3999                         // FIXME: in case of 0, compare decomposition categories
4000                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
4001                         if (ret != 0)
4002                                 return ret;
4003                         int i1 = (int) e1.Key;
4004                         int i2 = (int) e2.Key;
4005                         return i1 - i2;
4006                 }
4007         }
4008
4009         class StringDictionaryValueComparer : IComparer
4010         {
4011                 public static readonly StringDictionaryValueComparer Instance
4012                         = new StringDictionaryValueComparer ();
4013
4014                 private StringDictionaryValueComparer ()
4015                 {
4016                 }
4017
4018                 public int Compare (object o1, object o2)
4019                 {
4020                         DictionaryEntry e1 = (DictionaryEntry) o1;
4021                         DictionaryEntry e2 = (DictionaryEntry) o2;
4022                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
4023                         if (ret != 0)
4024                                 return ret;
4025                         int i1 = (int) e1.Key;
4026                         int i2 = (int) e2.Key;
4027                         return i1 - i2;
4028                 }
4029         }
4030
4031         class UCAComparer : IComparer
4032         {
4033                 public static readonly UCAComparer Instance
4034                         = new UCAComparer ();
4035
4036                 private UCAComparer ()
4037                 {
4038                 }
4039
4040                 public int Compare (object o1, object o2)
4041                 {
4042                         char i1 = (char) o1;
4043                         char i2 = (char) o2;
4044
4045                         int l1 = CollationElementTable.GetSortKeyCount (i1);
4046                         int l2 = CollationElementTable.GetSortKeyCount (i2);
4047                         int l = l1 > l2 ? l2 : l1;
4048
4049                         for (int i = 0; i < l; i++) {
4050                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4051                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
4052                                 int v = k1.Primary - k2.Primary;
4053                                 if (v != 0)
4054                                         return v;
4055                                 v = k1.Secondary - k2.Secondary;
4056                                 if (v != 0)
4057                                         return v;
4058                                 v = k1.Thirtiary - k2.Thirtiary;
4059                                 if (v != 0)
4060                                         return v;
4061                                 v = k1.Quarternary - k2.Quarternary;
4062                                 if (v != 0)
4063                                         return v;
4064                         }
4065                         return l1 - l2;
4066                 }
4067         }
4068
4069         class Tailoring
4070         {
4071                 int lcid;
4072                 int alias;
4073                 bool frenchSort;
4074                 ArrayList items = new ArrayList ();
4075
4076                 public Tailoring (int lcid)
4077                         : this (lcid, 0)
4078                 {
4079                 }
4080
4081                 public Tailoring (int lcid, int alias)
4082                 {
4083                         this.lcid = lcid;
4084                         this.alias = alias;
4085                 }
4086
4087                 public int LCID {
4088                         get { return lcid; }
4089                 }
4090
4091                 public int Alias {
4092                         get { return alias; }
4093                 }
4094
4095                 public bool FrenchSort {
4096                         get { return frenchSort; }
4097                         set { frenchSort = value; }
4098                 }
4099
4100                 public void AddDiacriticalMap (byte target, byte replace)
4101                 {
4102                         items.Add (new DiacriticalMap (target, replace));
4103                 }
4104
4105                 public void AddSortKeyMap (string source, byte [] sortkey)
4106                 {
4107                         items.Add (new SortKeyMap (source, sortkey));
4108                 }
4109
4110                 public void AddReplacementMap (string source, string replace)
4111                 {
4112                         items.Add (new ReplacementMap (source, replace));
4113                 }
4114
4115                 public char [] ItemToCharArray ()
4116                 {
4117                         ArrayList al = new ArrayList ();
4118                         foreach (ITailoringMap m in items)
4119                                 al.AddRange (m.ToCharArray ());
4120                         return al.ToArray (typeof (char)) as char [];
4121                 }
4122
4123                 interface ITailoringMap
4124                 {
4125                         char [] ToCharArray ();
4126                 }
4127
4128                 class DiacriticalMap : ITailoringMap
4129                 {
4130                         public readonly byte Target;
4131                         public readonly byte Replace;
4132
4133                         public DiacriticalMap (byte target, byte replace)
4134                         {
4135                                 Target = target;
4136                                 Replace = replace;
4137                         }
4138
4139                         public char [] ToCharArray ()
4140                         {
4141                                 char [] ret = new char [3];
4142                                 ret [0] = (char) 02; // kind:DiacriticalMap
4143                                 ret [1] = (char) Target;
4144                                 ret [2] = (char) Replace;
4145                                 return ret;
4146                         }
4147                 }
4148
4149                 class SortKeyMap : ITailoringMap
4150                 {
4151                         public readonly string Source;
4152                         public readonly byte [] SortKey;
4153
4154                         public SortKeyMap (string source, byte [] sortkey)
4155                         {
4156                                 Source = source;
4157                                 SortKey = sortkey;
4158                         }
4159
4160                         public char [] ToCharArray ()
4161                         {
4162                                 char [] ret = new char [Source.Length + 7];
4163                                 ret [0] = (char) 01; // kind:SortKeyMap
4164                                 for (int i = 0; i < Source.Length; i++)
4165                                         ret [i + 1] = Source [i];
4166                                 // null terminate
4167                                 for (int i = 0; i < 4; i++)
4168                                         ret [i + Source.Length + 2] = (char) SortKey [i];
4169                                 return ret;
4170                         }
4171                 }
4172
4173                 class ReplacementMap : ITailoringMap
4174                 {
4175                         public readonly string Source;
4176                         public readonly string Replace;
4177
4178                         public ReplacementMap (string source, string replace)
4179                         {
4180                                 Source = source;
4181                                 Replace = replace;
4182                         }
4183
4184                         public char [] ToCharArray ()
4185                         {
4186                                 char [] ret = new char [Source.Length + Replace.Length + 3];
4187                                 ret [0] = (char) 03; // kind:ReplaceMap
4188                                 int pos = 1;
4189                                 for (int i = 0; i < Source.Length; i++)
4190                                         ret [pos++] = Source [i];
4191                                 // null terminate
4192                                 pos++;
4193                                 for (int i = 0; i < Replace.Length; i++)
4194                                         ret [pos++] = Replace [i];
4195                                 // null terminate
4196                                 return ret;
4197                         }
4198                 }
4199         }
4200 }