2005-07-27 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
1 //
2 //
3 // There are two kinds of sort keys: those which are computed and those which
4 // are laid out as an indexed array. Computed sort keys are:
5 //
6 //      - Surrogate
7 //      - PrivateUse
8 //
9 // Also, for composite characters it should prepare a different index table.
10 //
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
13 //
14
15 //
16 // * sortkey getter signature
17 //
18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
19 //      Stores sort key for corresponding character element into buf and
20 //      returns the length of the consumed _source_ character element in s.
21 //
22 // * character length to consume
23 //
24 //      If there are characters whose primary weight is 0, they are consumed
25 //      and considered as a part of the character element.
26 //
27 #define Binary
28
29 using System;
30 using System.IO;
31 using System.Collections;
32 using System.Globalization;
33 using System.Text;
34 using System.Xml;
35
36 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
37
38 namespace Mono.Globalization.Unicode
39 {
40         internal class MSCompatSortKeyTableGenerator
41         {
// Program entry point: builds one generator instance and hands it the
// command-line arguments unchanged.
public static void Main (string [] args)
{
        MSCompatSortKeyTableGenerator generator =
                new MSCompatSortKeyTableGenerator ();
        generator.Run (args);
}
46
// Numeric codes for the kinds of character decomposition, listed in
// ascending order of value.  The commented-out widthCompat code in
// Serialize() compares decompType [] entries against these codes.
// NOTE(review): the names look like the compatibility tags used in
// UnicodeData.txt (<wide>, <sub>, ...) -- confirm against the parser.
// Entries marked "fixed" carry the original author's annotation;
// presumably those values must not be renumbered.
const int DecompositionWide = 0x01; // fixed
const int DecompositionSub = 0x02; // fixed
const int DecompositionSmall = 0x03;
const int DecompositionIsolated = 0x04;
const int DecompositionInitial = 0x05;
const int DecompositionFinal = 0x06;
const int DecompositionMedial = 0x07;
const int DecompositionNoBreak = 0x08;
const int DecompositionVertical = 0x09;
const int DecompositionFraction = 0x0A;
const int DecompositionFont = 0x0B;
const int DecompositionSuper = 0x0C; // fixed
const int DecompositionNarrow = 0x0D;
const int DecompositionFull = 0x0E;
const int DecompositionCircle = 0x0F;
const int DecompositionSquare = 0x10;
const int DecompositionCompat = 0x11;
const int DecompositionCanonical = 0x12;
65
66                 TextWriter CSResult = Console.Out;
67                 TextWriter CResult = TextWriter.Null;
68
69                 byte [] fillIndex = new byte [256]; // by category
70                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
71
72                 char [] specialIgnore = new char [] {
73                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
74                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
75                         };
76
77                 // FIXME: need more love (as always)
78                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
79                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
80                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
81                         '\u0292', '\u01BE', '\u0298'};
82                 byte [] alphaWeights = new byte [] {
83                         2, 9, 0xA, 0x1A, 0x21,
84                         0x23, 0x25, 0x2C, 0x32, 0x35,
85                         0x36, 0x48, 0x51, 0x70, 0x7C,
86                         0x7E, 0x89, 0x8A, 0x91, 0x99,
87                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
88                         0xA9, 0xAA, 0xB3, 0xB4};
89
90                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
91                 bool [] isUppercase = new bool [char.MaxValue + 1];
92
93                 byte [] decompType = new byte [char.MaxValue + 1];
94                 int [] decompIndex = new int [char.MaxValue + 1];
95                 int [] decompLength = new int [char.MaxValue + 1];
96                 int [] decompValues;
97                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
98
99                 byte [] diacritical = new byte [char.MaxValue + 1];
100
101                 string [] diacritics = new string [] {
102                         // LATIN, CYRILLIC etc.
103                         "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
104                         "ABKHASIAN",
105                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
106                         "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
107                         "WITH ACUTE;", "WITH GRAVE;",
108                         //
109                         "WITH DOT ABOVE;", " MIDDLE DOT;",
110                         "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
111                         "WITH DIALYTIKA;",
112                         "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
113                         "DIALYTIKA TONOS", "DIALYTIKA AND TONOS",
114                         "ABKHASIAN CHE WITH DESCENDER",
115                         "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
116                         "WITH OGONEK;", "WITH CEDILLA;",
117                         //
118                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
119                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
120                         "STROKE OVERLAY",
121                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
122                         " DIAERESIS AND GRAVE;",
123                         " BREVE AND ACUTE;",
124                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
125                         " MACRON AND ACUTE;",
126                         " MACRON AND GRAVE;",
127                         //
128                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
129                         " RING ABOVE AND ACUTE",
130                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
131                         " CIRCUMFLEX AND TILDE",
132                         " TILDE AND DIAERESIS",
133                         " STROKE AND ACUTE",
134                         " BREVE AND TILDE",
135                         " CEDILLA AND BREVE",
136                         " OGONEK AND MACRON",
137                         // 0x40
138                         "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE",
139                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
140                         " DOUBLE GRAVE",
141                         " INVERTED BREVE",
142                         "ROMAN NUMERAL",
143                         " PRECEDED BY APOSTROPHE",
144                         "WITH HORN;",
145                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
146                         " PALATAL HOOK",
147                         " DOT BELOW;",
148                         " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK",
149                         " RING BELOW", "LOW VERTICAL LINE",
150                         //
151                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
152                         " BREVE BELOW;", " HORN AND GRAVE",
153                         " LOW MACRON",
154                         " TILDE BELOW",
155                         " TOPBAR",
156                         " DOT BELOW AND DOT ABOVE",
157                         " RIGHT HALF RING", " HORN AND TILDE",
158                         " CIRCUMFLEX AND DOT BELOW",
159                         " BREVE AND DOT BELOW",
160                         " DOT BELOW AND MACRON",
161                         " TONE TWO",
162                         " HORN AND HOOK ABOVE",
163                         " HORN AND DOT",
164                         // CIRCLED, PARENTHESIZED and so on
165                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
166                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
167                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
168                         };
169                 byte [] diacriticWeights = new byte [] {
170                         // LATIN.
171                         3, 3, 3, 5, 5, 5, 5,
172                         0xE, 0xF,
173                         0xE, 0xF,
174                         //
175                         0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
176                         0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
177                         //
178                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
179                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
180                         //
181                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
182                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
183                         //
184                         0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
185                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59,
186                         0x5A, 0x5A,
187                         //
188                         0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68,
189                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
190                         0x87, 0x95, 0xAA,
191                         // CIRCLED, PARENTHESIZED and so on.
192                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
193                         0xF3, 0xF3, 0xF3
194                         };
195
196                 int [] numberSecondaryWeightBounds = new int [] {
197                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
198                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
199                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
200                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
201                         0xE50, 0xE60, 0xED0, 0xEE0
202                         };
203
204                 char [] orderedGurmukhi;
205                 char [] orderedGujarati;
206                 char [] orderedGeorgian;
207                 char [] orderedThaana;
208
209                 static readonly char [] orderedTamilConsonants = new char [] {
210                         // based on traditional Tamil consonants, except for
211                         // Grantha (where Microsoft breaks traditionalism).
212                         // http://www.angelfire.com/empire/thamizh/padanGaL
213                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
214                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
215                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
216                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
217                         '\u0BB7', '\u0BB9'};
218
219                 // cp -> character name (only for some characters)
220                 ArrayList sortableCharNames = new ArrayList ();
221
222                 // cp -> arrow value (int)
223                 ArrayList arrowValues = new ArrayList ();
224
225                 // cp -> box value (int)
226                 ArrayList boxValues = new ArrayList ();
227
228                 // cp -> level1 value
229                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
230
231                 // letterName -> cp
232                 Hashtable arabicNameMap = new Hashtable ();
233
234                 // cp -> Hashtable [decompType] -> cp
235                 Hashtable nfkdMap = new Hashtable ();
236
237                 // Latin letter -> ArrayList [int]
238                 Hashtable latinMap = new Hashtable ();
239
240                 ArrayList jisJapanese = new ArrayList ();
241                 ArrayList nonJisJapanese = new ArrayList ();
242
243                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
244                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
245                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
246                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
247                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
248
249                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
250
251                 static double [] unicodeAge = new double [char.MaxValue + 1];
252
253                 ArrayList tailorings = new ArrayList ();
254
// Drives the whole table generation:
//   1. parses the source data found in the directory named by args [0]
//      ("downloaded" when no argument is given),
//   2. post-processes the parsed values and generates the core tables,
//   3. serializes the result; the C header goes to "collation-tables.h"
//      in the current directory.
// A progress message is written to standard error after each stage.
void Run (string [] args)
{
        string dirname = args.Length == 0 ? "downloaded" : args [0];
        ParseSources (dirname);
        Console.Error.WriteLine ("parse done.");

        ModifyParsedValues ();
        GenerateCore ();
        Console.Error.WriteLine ("generation done.");

        CResult = new StreamWriter ("collation-tables.h", false);
        try {
                Serialize ();
        } finally {
                // Close (and thereby flush) the header even when
                // serialization throws, so that the file handle is
                // not leaked and buffered output is not lost.
                CResult.Close ();
        }
        Console.Error.WriteLine ("serialization done.");
/*
StreamWriter sw = new StreamWriter ("agelog.txt");
for (int i = 0; i < char.MaxValue; i++) {
bool shouldBe = false;
switch (Char.GetUnicodeCategory ((char) i)) {
case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
        shouldBe = true; break;
}
if (unicodeAge [i] >= 3.1)
        shouldBe = true;
//if (IsIgnorable (i) != shouldBe)
sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
}
sw.Close ();
*/
}
284
// Typed convenience wrapper: asks CodePointIndexer.CompressArray() to
// compress a byte table with the given indexer and returns the result
// as byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
        object compressed = CodePointIndexer.CompressArray (
                source, typeof (byte), i);
        return (byte []) compressed;
}
290
// Typed convenience wrapper: asks CodePointIndexer.CompressArray() to
// compress a ushort table with the given indexer and returns the result
// as ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
        object compressed = CodePointIndexer.CompressArray (
                source, typeof (ushort), i);
        return (ushort []) compressed;
}
296
// No-op placeholder: the body is empty, so the argument is discarded.
// NOTE(review): no caller is visible in this part of the file -- confirm
// whether this stub is still needed.
void WriteByte (byte value)
{
}
301
// Writes every generated table to the outputs:
//   - C# source text on CSResult,
//   - C header text on CResult,
//   - when Binary is defined, the binary resource "../collation.core.bin"
//     (resource version followed by the five core tables, each prefixed
//     with its length), plus one file per CJK table via SerializeCJK().
//
// The compression step replaces the ignorableFlags and cjk* fields with
// their compressed forms, so running this method twice on one instance
// would compress already-compressed data.
void Serialize ()
{
        // Tailorings
        SerializeTailorings ();

        byte [] categories = new byte [map.Length];
        byte [] level1 = new byte [map.Length];
        byte [] level2 = new byte [map.Length];
        byte [] level3 = new byte [map.Length];
// widthCompat is now removed from the mapping table.
// If it turned out that it is still required, grep this source and uncomment
// widthCompat related lines. FIXME: remove those lines in the future.
//      ushort [] widthCompat = new ushort [map.Length];
        for (int i = 0; i < map.Length; i++) {
                categories [i] = map [i].Category;
                level1 [i] = map [i].Level1;
                level2 [i] = map [i].Level2;
                level3 [i] = ComputeLevel3Weight ((char) i);
/*
                // For Japanese Half-width characters, don't
                // map widthCompat. It is IgnoreKanaType that
                // handles those width differences.
                if (0xFF6D <= i && i <= 0xFF9D)
                        continue;
                switch (decompType [i]) {
                case DecompositionNarrow:
                case DecompositionWide:
                case DecompositionSuper:
                case DecompositionSub:
                        // they are always 1 char
                        widthCompat [i] = (ushort) decompValues [decompIndex [i]];
                        break;
                }
*/
        }

        // compress
        ignorableFlags = CompressArray (ignorableFlags,
                UUtil.Ignorable);
        categories = CompressArray (categories, UUtil.Category);
        level1 = CompressArray (level1, UUtil.Level1);
        level2 = CompressArray (level2, UUtil.Level2);
        level3 = CompressArray (level3, UUtil.Level3);
//      widthCompat = (ushort []) CodePointIndexer.CompressArray (
//              widthCompat, typeof (ushort), UUtil.WidthCompat);
        cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
        cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
        cjkJA = CompressArray (cjkJA, UUtil.Cjk);
        cjkKO = CompressArray (cjkKO, UUtil.Cjk);
        cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);

        // The binary resource starts with the resource version; each
        // table then contributes its length followed by its bytes.
        // The writer stays null when Binary is not defined.
        BinaryWriter binary = null;
#if Binary
        MemoryStream ms = new MemoryStream ();
        binary = new BinaryWriter (ms);
        binary.Write (UUtil.ResourceVersion);
#endif

        // Ignorables
        // (The C initializer used to be left without its closing "};".)
        SerializeIndexedByteTable (
                "static const guint8* collation_table_ignorableFlags [] = {",
                "static readonly byte [] ignorableFlagsArr = new byte [] {",
                ignorableFlags, UUtil.Ignorable, "};", binary);

        // Primary category
        SerializeIndexedByteTable (
                "static const guint8* collation_table_category [] = {",
                "static readonly byte [] categoriesArr = new byte [] {",
                categories, UUtil.Category, "};", binary);

        // Primary weight value
        SerializeIndexedByteTable (
                "static const guint8* collation_table_level1 [] = {",
                "static readonly byte [] level1Arr = new byte [] {",
                level1, UUtil.Level1, "0};", binary);

        // Secondary weight
        SerializeIndexedByteTable (
                "static const guint8* collation_table_level2 [] = {",
                "static readonly byte [] level2Arr = new byte [] {",
                level2, UUtil.Level2, "0};", binary);

        // Tertiary weight
        SerializeIndexedByteTable (
                "static const guint8* collation_table_level3 [] = {",
                "static readonly byte [] level3Arr = new byte [] {",
                level3, UUtil.Level3, "0};", binary);

/*
        // Width insensitivity mappings
        // (for now it is more lightweight than dumping the
        // entire NFKD table).
        CResult.WriteLine ("static const guint16* widthCompat [] = {");
        CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {");
#if Binary
        binary.Write (widthCompat.Length);
#endif
        for (int i = 0; i < widthCompat.Length; i++) {
                ushort value = widthCompat [i];
                if (value < 10)
                        CSResult.Write ("{0},", value);
                else
                        CSResult.Write ("0x{0:X02},", value);
                CResult.Write ("{0},", value);
#if Binary
                binary.Write (value);
#endif
                if ((i & 0xF) == 0xF) {
                        CSResult.WriteLine ("// {0:X04}",
                                UUtil.WidthCompat.ToCodePoint (i - 0xF));
                        CResult.WriteLine ();
                }
        }
        CResult.WriteLine ("0};");
        CSResult.WriteLine ("};");
        CSResult.WriteLine ();
*/

#if Binary
        using (FileStream fs = File.Create ("../collation.core.bin")) {
                byte [] array = ms.ToArray ();
                fs.Write (array, 0, array.Length);
        }
#endif

        // CJK
        SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
        SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
        SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
        SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
        SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
}

// Emits one compressed byte table to all outputs.
//
//   cDeclaration  - opening line of the C array, written verbatim to CResult
//   csDeclaration - opening line of the C# array, written verbatim to CSResult
//   table         - the bytes to emit
//   indexer       - maps a table index back to a code point; used only for
//                   the "// XXXX" comment that ends every 16-value row of
//                   the C# output
//   cTerminator   - closing line of the C array ("};" or "0};")
//   binary        - receives the table length (Int32) followed by the raw
//                   bytes; null disables binary output
//
// Values below 10 are printed in decimal in the C# output, all others as
// two-digit hex; the C output always uses decimal.
void SerializeIndexedByteTable (string cDeclaration,
        string csDeclaration, byte [] table, CodePointIndexer indexer,
        string cTerminator, BinaryWriter binary)
{
        CResult.WriteLine (cDeclaration);
        CSResult.WriteLine (csDeclaration);
        if (binary != null)
                binary.Write (table.Length);

        for (int i = 0; i < table.Length; i++) {
                byte value = table [i];
                if (value < 10)
                        CSResult.Write ("{0},", value);
                else
                        CSResult.Write ("0x{0:X02},", value);
                CResult.Write ("{0},", value);
                if (binary != null)
                        binary.Write (value);
                // End of a 16-value row: annotate the C# row with the
                // code point of its first element and break the C row.
                if ((i & 0xF) == 0xF) {
                        CSResult.WriteLine ("// {0:X04}",
                                indexer.ToCodePoint (i - 0xF));
                        CResult.WriteLine ();
                }
        }

        CResult.WriteLine (cTerminator);
        CSResult.WriteLine ("};");
        CSResult.WriteLine ();
}
529
// Serializes a 16-bit CJK table as two consecutive runs of bytes: first
// the high byte of every entry ("category"), then the low byte of every
// entry ("level 1").  Text goes to CResult / CSResult; when Binary is
// defined the same bytes, preceded by the resource version and the entry
// count, are written to "../collation.<name>.bin".
//
// max_unused is not consulted: the whole table is always emitted.
void SerializeCJK (string name, ushort [] cjk, int max_unused)
{
        int entryCount = cjk.Length;

        CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, entryCount);
        CSResult.WriteLine ("const int {0}ArrLength = {1};", name, entryCount);

        CResult.WriteLine ("static const guint8* collation_table_collation_cjk_{0} [] = {{", name);
        CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
#if Binary
        MemoryStream buffer = new MemoryStream ();
        BinaryWriter binary = new BinaryWriter (buffer);
        binary.Write (UUtil.ResourceVersion);
        binary.Write (entryCount); // the actual size is *2.
#endif
        // pass 0 emits the category (high byte),
        // pass 1 emits level 1 (low byte).
        for (int pass = 0; pass < 2; pass++) {
                int shift = pass == 0 ? 8 : 0;
                for (int idx = 0; idx < entryCount; idx++) {
                        byte value = (byte) ((cjk [idx] >> shift) & 0xFF);
                        CSResult.Write (value < 10 ? "{0}," : "0x{0:X02},", value);
                        CResult.Write ("{0},", value);
#if Binary
                        binary.Write (value);
#endif
                        bool endOfRow = (idx & 0xF) == 0xF;
                        if (!endOfRow)
                                continue;
                        // Annotate each 16-value row with the index
                        // of its first element.
                        CSResult.WriteLine ("// {0:X04}", idx - 0xF);
                        CResult.WriteLine ();
                }
        }

        CResult.WriteLine ("0};");
        CSResult.WriteLine ("};");
        CSResult.WriteLine ();
#if Binary
        string path = "../collation." + name + ".bin";
        using (FileStream output = File.Create (path)) {
                byte [] payload = buffer.ToArray ();
                output.Write (payload, 0, payload.Length);
        }
#endif
}
591
// Serializes an 8-bit CJK table.  Entries are emitted from index 0 up
// to, but not including, index max (or the end of the table, whichever
// comes first).  Text goes to CResult / CSResult; when Binary is defined
// the same bytes, preceded by the resource version only (no length),
// are written to "../collation.<name>.bin".
void SerializeCJK (string name, byte [] cjk, int max)
{
        CResult.WriteLine ("static const guint8* collation_table_collation_cjk_{0} [] = {{", name);
        CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
#if Binary
        MemoryStream buffer = new MemoryStream ();
        BinaryWriter binary = new BinaryWriter (buffer);
        binary.Write (UUtil.ResourceVersion);
#endif
        // Stop at the end of the table or on reaching index max.
        for (int idx = 0; idx < cjk.Length && idx != max; idx++) {
                byte value = cjk [idx];
                CSResult.Write (value < 10 ? "{0}," : "0x{0:X02},", value);
                CResult.Write ("{0},", value);
#if Binary
                binary.Write (value);
#endif
                bool endOfRow = (idx & 0xF) == 0xF;
                if (!endOfRow)
                        continue;
                // Annotate each 16-value row with the index of its
                // first element.
                CSResult.WriteLine ("// {0:X04}", idx - 0xF);
                CResult.WriteLine ();
        }
        CResult.WriteLine ("0};");
        CSResult.WriteLine ("};");
        CSResult.WriteLine ();
#if Binary
        string path = "../collation." + name + ".bin";
        using (FileStream output = File.Create (path)) {
                byte [] payload = buffer.ToArray ();
                output.Write (payload, 0, payload.Length);
        }
#endif
}
628
629                 void SerializeTailorings ()
630                 {
631                         Hashtable indexes = new Hashtable ();
632                         Hashtable counts = new Hashtable ();
633                         CResult.WriteLine ("static const guint16*collation_table_tailoring = {");
634                         CSResult.WriteLine ("static char [] tailorings = new char [] {");
635                         int count = 0;
636 #if Binary
637                         MemoryStream ms = new MemoryStream ();
638                         BinaryWriter binary = new BinaryWriter (ms);
639                         // Here we don't need to output resource version.
640                         // This is cached.
641 #endif
642                         foreach (Tailoring t in tailorings) {
643                                 if (t.Alias != 0)
644                                         continue;
645                                 CResult.Write ("/*{0}*/", t.LCID);
646                                 CSResult.Write ("/*{0}*/", t.LCID);
647                                 indexes.Add (t.LCID, count);
648                                 char [] values = t.ItemToCharArray ();
649                                 counts.Add (t.LCID, values.Length);
650                                 foreach (char c in values) {
651                                         CSResult.Write ("'\\x{0:X}', ", (int) c);
652                                         CResult.Write ("{0},", (int) c);
653                                         if (++count % 16 == 0) {
654                                                 CSResult.WriteLine (" // {0:X04}", count - 16);
655                                                 CResult.WriteLine ();
656                                         }
657 #if Binary
658                                         binary.Write ((ushort) c);
659 #endif
660                                 }
661                         }
662                         CResult.WriteLine ("0};");
663                         CSResult.WriteLine ("};");
664
665                         CResult.WriteLine ("static const int collation_tailoring_count = {0};", tailorings.Count);
666                         CResult.WriteLine ("static const int* collation_tailoring_infos = {");
667                         CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
668 #if Binary
669                         byte [] rawdata = ms.ToArray ();
670                         ms = new MemoryStream ();
671                         binary = new BinaryWriter (ms);
672                         binary.Write (UUtil.ResourceVersion);
673                         binary.Write (tailorings.Count);
674 #endif
675                         foreach (Tailoring t in tailorings) {
676                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
677                                 if (!indexes.ContainsKey (target)) {
678                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
679                                         continue;
680                                 }
681                                 int idx = (int) indexes [target];
682                                 int cnt = (int) counts [target];
683                                 bool french = t.FrenchSort;
684                                 if (t.Alias != 0)
685                                         foreach (Tailoring t2 in tailorings)
686                                                 if (t2.LCID == t.LCID)
687                                                         french = t2.FrenchSort;
688                                 CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
689                                 CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0);
690 #if Binary
691                                 binary.Write (t.LCID);
692                                 binary.Write (idx);
693                                 binary.Write (cnt);
694                                 binary.Write (french);
695 #endif
696                         }
697                         CResult.WriteLine ("0};");
698                         CSResult.WriteLine ("};");
699 #if Binary
700                         binary.Write ((byte) 0xFF);
701                         binary.Write ((byte) 0xFF);
702                         binary.Write (rawdata.Length / 2);
703                         binary.Write (rawdata, 0, rawdata.Length);
704
705
706                         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
707                                 byte [] array = ms.ToArray ();
708                                 fs.Write (array, 0, array.Length);
709                         }
710 #endif
711                 }
712
713                 #region Parse
714
715                 void ParseSources (string dirname)
716                 {
717                         string unidata =
718                                 dirname + "/UnicodeData.txt";
719                         string derivedCoreProps = 
720                                 dirname + "/DerivedCoreProperties.txt";
721                         string scripts = 
722                                 dirname + "/Scripts.txt";
723                         string cp932 = 
724                                 dirname + "/CP932.TXT";
725                         string derivedAge = 
726                                 dirname + "/DerivedAge.txt";
727                         string chXML = dirname + "/common/collation/zh.xml";
728                         string jaXML = dirname + "/common/collation/ja.xml";
729                         string koXML = dirname + "/common/collation/ko.xml";
730
731                         ParseDerivedAge (derivedAge);
732
733                         FillIgnorables ();
734
735                         ParseJISOrder (cp932); // in prior to ParseUnidata()
736                         ParseUnidata (unidata);
737                         ModifyUnidata ();
738                         ParseDerivedCoreProperties (derivedCoreProps);
739                         ParseScripts (scripts);
740                         ParseCJK (chXML, jaXML, koXML);
741
742                         ParseTailorings ("mono-tailoring-source.txt");
743                 }
744
745                 void ParseTailorings (string filename)
746                 {
747                         Tailoring t = null;
748                         int line = 0;
749                         using (StreamReader sr = new StreamReader (filename)) {
750                                 try {
751                                         while (sr.Peek () >= 0) {
752                                                 line++;
753                                                 ProcessTailoringLine (ref t,
754                                                         sr.ReadLine ().Trim ());
755                                         }
756                                 } catch (Exception) {
757                                         Console.Error.WriteLine ("ERROR at line {0}", line);
758                                         throw;
759                                 }
760                         }
761                 }
762
763                 // For now this is enough.
764                 string ParseTailoringSourceValue (string s)
765                 {
766                         StringBuilder sb = new StringBuilder ();
767                         for (int i = 0; i < s.Length; i++) {
768                                 if (i + 5 < s.Length &&
769                                         s [i] == '\\' && s [i + 1] == 'u') {
770                                         sb.Append (
771                                                 (char) int.Parse (
772                                                         s.Substring (i + 2, 4),
773                                                         NumberStyles.HexNumber),
774                                                 1);
775                                         i += 5;
776                                 }
777                                 else
778                                         sb.Append (s [i]);
779                         }
780                         return sb.ToString ();
781                 }
782
783                 void ProcessTailoringLine (ref Tailoring t, string s)
784                 {
785                         int idx = s.IndexOf ('#');
786                         if (idx > 0)
787                                 s = s.Substring (0, idx).Trim ();
788                         if (s.Length == 0 || s [0] == '#')
789                                 return;
790                         if (s [0] == '@') {
791                                 idx = s.IndexOf ('=');
792                                 if (idx > 0)
793                                         t = new Tailoring (
794                                                 int.Parse (s.Substring (1, idx - 1)),
795                                                 int.Parse (s.Substring (idx + 1)));
796                                 else
797                                         t = new Tailoring (int.Parse (s.Substring (1)));
798                                 tailorings.Add (t);
799                                 return;
800                         }
801                         if (s.StartsWith ("*FrenchSort")) {
802                                 t.FrenchSort = true;
803                                 return;
804                         }
805                         string d = "*Diacritical";
806                         if (s.StartsWith (d)) {
807                                 idx = s.IndexOf ("->");
808                                 t.AddDiacriticalMap (
809                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
810                                                 NumberStyles.HexNumber),
811                                         byte.Parse (s.Substring (idx + 2).Trim (),
812                                                 NumberStyles.HexNumber));
813                                 return;
814                         }
815                         idx = s.IndexOf (':');
816                         if (idx > 0) {
817                                 string source = s.Substring (0, idx).Trim ();
818                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
819                                 byte [] b = new byte [4];
820                                 for (int i = 0; i < 4; i++) {
821                                         if (l [i] == "*")
822                                                 b [i] = 0;
823                                         else
824                                                 b [i] = byte.Parse (l [i],
825                                                         NumberStyles.HexNumber);
826                                 }
827                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
828                                         b);
829                         }
830                         idx = s.IndexOf ('=');
831                         if (idx > 0)
832                                 t.AddReplacementMap (
833                                         ParseTailoringSourceValue (
834                                                 s.Substring (0, idx).Trim ()),
835                                         ParseTailoringSourceValue (
836                                                 s.Substring (idx + 1).Trim ()));
837                 }
838
839                 void ParseDerivedAge (string filename)
840                 {
841                         using (StreamReader file =
842                                 new StreamReader (filename)) {
843                                 while (file.Peek () >= 0) {
844                                         string s = file.ReadLine ();
845                                         int idx = s.IndexOf ('#');
846                                         if (idx >= 0)
847                                                 s = s.Substring (0, idx);
848                                         idx = s.IndexOf (';');
849                                         if (idx < 0)
850                                                 continue;
851
852                                         string cpspec = s.Substring (0, idx);
853                                         idx = cpspec.IndexOf ("..");
854                                         NumberStyles nf = NumberStyles.HexNumber |
855                                                 NumberStyles.AllowTrailingWhite;
856                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
857                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
858                                         string value = s.Substring (cpspec.Length + 1).Trim ();
859
860                                         // FIXME: use index
861                                         if (cp > char.MaxValue)
862                                                 continue;
863
864                                         double v = double.Parse (value);
865                                         for (int i = cp; i <= cpEnd; i++)
866                                                 unicodeAge [i] = v;
867                                 }
868                         }
869                         unicodeAge [0] = double.MaxValue; // never be supported
870                 }
871
872                 void ParseUnidata (string filename)
873                 {
874                         ArrayList decompValues = new ArrayList ();
875                         using (StreamReader unidata =
876                                 new StreamReader (filename)) {
877                                 for (int line = 1; unidata.Peek () >= 0; line++) {
878                                         try {
879                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
880                                         } catch (Exception) {
881                                                 Console.Error.WriteLine ("**** At line " + line);
882                                                 throw;
883                                         }
884                                 }
885                         }
886                         this.decompValues = (int [])
887                                 decompValues.ToArray (typeof (int));
888                 }
889
890                 char previousLatinTarget = char.MinValue;
891                 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
892
893                 void ProcessUnidataLine (string s, ArrayList decompValues)
894                 {
895                         int idx = s.IndexOf ('#');
896                         if (idx >= 0)
897                                 s = s.Substring (0, idx);
898                         idx = s.IndexOf (';');
899                         if (idx < 0)
900                                 return;
901                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
902                         string [] values = s.Substring (idx + 1).Split (';');
903
904                         // FIXME: use index
905                         if (cp > char.MaxValue)
906                                 return;
907                         if (IsIgnorable (cp))
908                                 return;
909
910                         string name = values [0];
911
912                         // SPECIAL CASE: rename some characters for diacritical
913                         // remapping. FIXME: why are they different?
914                         // FIXME: it's still not working.
915                         if (cp == 0x018B || cp == 0x018C)
916                                 name = name.Replace ("TOPBAR", "STROKE");
917
918                         // isSmallCapital
919                         if (s.IndexOf ("SMALL CAPITAL") > 0)
920                                 isSmallCapital [cp] = true;
921
922                         // latin mapping by character name
923                         if (s.IndexOf ("LATIN") >= 0) {
924                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
925                                 int offset = lidx + 15;
926                                 if (lidx < 0) {
927                                         lidx = s.IndexOf ("LETTER TURNED ");
928                                         offset = lidx + 14;
929                                 }
930                                 if (lidx < 0) {
931                                         lidx = s.IndexOf ("LETTER CAPITAL ");
932                                         offset = lidx + 15;
933                                 }
934                                 if (lidx < 0) {
935                                         lidx = s.IndexOf ("LETTER SCRIPT ");
936                                         offset = lidx + 14;
937                                 }
938                                 if (lidx < 0) {
939                                         lidx = s.IndexOf ("LETTER ");
940                                         offset = lidx + 7;
941                                 }
942                                 char c = lidx > 0 ? s [offset] : char.MinValue;
943                                 char n = s [offset + 1];
944                                 char target = char.MinValue;
945                                 if ('A' <= c && c <= 'Z' &&
946                                         (n == ' ') || n == ';') {
947                                         target = c;
948                                         // FIXME: After 'Z', I cannot reset this state.
949                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
950                                 }
951
952                                 if (s.Substring (offset).StartsWith ("ALPHA"))
953                                         target = 'A';
954                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
955                                         target = 'B';
956                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
957                                         target = 'C';
958                                 else if (s.Substring (offset).StartsWith ("ETH"))
959                                         target = 'D';
960                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
961                                         target = 'E';
962                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
963                                         target = 'O';
964                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
965                                         target = 'R';
966                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
967                                         target = 'S';
968                                 else if (s.Substring (offset).StartsWith ("ESH"))
969                                         target = 'S';
970                                 else if (s.Substring (offset).StartsWith ("OUNCE"))
971                                         target = 'Z';
972
973                                 // For remaining IPA chars, direct mapping is
974                                 // much faster.
975                                 switch (cp) {
976                                 case 0x0166: case 0x0167:
977                                         // Though they are 'T', they have different weight
978                                         target = char.MinValue; break;
979                                 case 0x0299: target = 'B'; break;
980                                 case 0x029A: target = 'E'; break;
981                                 case 0x029B: target = 'G'; break;
982                                 case 0x029C: target = 'H'; break;
983                                 case 0x029D: target = 'J'; break;
984                                 case 0x029E: target = 'K'; break;
985                                 case 0x029F: target = 'L'; break;
986                                 case 0x02A0: target = 'Q'; break;
987                                 case 0x02A7: target = 'T'; break;
988                                 case 0x02A8: target = 'T'; break;
989                                 }
990
991                                 if (target == char.MinValue)
992                                         target = previousLatinTarget;
993
994                                 if (target != char.MinValue) {
995                                         ArrayList entry = (ArrayList) latinMap [target];
996                                         if (entry == null) {
997                                                 entry = new ArrayList ();
998                                                 latinMap [target] = entry;
999                                         }
1000                                         entry.Add (cp);
1001                                         // FIXME: This secondary weight is hack.
1002                                         // They are here because they must not
1003                                         // be identical to the corresponding
1004                                         // ASCII latins.
1005                                         if (c != target && diacritical [cp] == 0) {
1006                                                 diacriticalOffset [c - 'A']++;
1007                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
1008                                         }
1009                                 }
1010                         }
1011
1012                         // Arrow names
1013                         if (0x2000 <= cp && cp < 0x3000) {
1014                                 int value = 0;
1015                                 // SPECIAL CASES. FIXME: why?
1016                                 switch (cp) {
1017                                 case 0x21C5: value = -1; break; // E2
1018                                 case 0x261D: value = 1; break;
1019                                 case 0x27A6: value = 3; break;
1020                                 case 0x21B0: value = 7; break;
1021                                 case 0x21B1: value = 3; break;
1022                                 case 0x21B2: value = 7; break;
1023                                 case 0x21B4: value = 5; break;
1024                                 case 0x21B5: value = 7; break;
1025                                 case 0x21B9: value = -1; break; // E1
1026                                 case 0x21CF: value = 7; break;
1027                                 case 0x21D0: value = 3; break;
1028                                 }
1029                                 string [] arrowTargets = new string [] {
1030                                         "",
1031                                         "UPWARDS",
1032                                         "NORTH EAST",
1033                                         "RIGHTWARDS",
1034                                         "SOUTH EAST",
1035                                         "DOWNWARDS",
1036                                         "SOUTH WEST",
1037                                         "LEFTWARDS",
1038                                         "NORTH WEST",
1039                                         "LEFT RIGHT",
1040                                         "UP DOWN",
1041                                         };
1042                                 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
1043                                         s.IndexOf ("LEFTWARDS") >= 0)
1044                                         value = 0xE1 - 0xD8;
1045                                 else if (s.IndexOf ("UPWARDS") >= 0 &&
1046                                         s.IndexOf ("DOWNWARDS") >= 0)
1047                                         value = 0xE2 - 0xD8;
1048                                 else if (s.IndexOf ("ARROW") >= 0 &&
1049                                         s.IndexOf ("COMBINING") < 0 &&
1050                                         s.IndexOf ("CLOCKWISE") >= 0)
1051                                         value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
1052                                 if (value == 0)
1053                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
1054                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
1055                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
1056                                                         s.IndexOf (" OVER") < 0
1057                                                 )
1058                                                         value = i;
1059                                 if (value > 0)
1060                                         arrowValues.Add (new DictionaryEntry (
1061                                                 cp, value));
1062                         }
1063
1064                         // Box names
1065                         if (0x2500 <= cp && cp < 0x2600) {
1066                                 int value = int.MinValue;
1067                                 // flags:
1068                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
1069                                 // [h,rl] [r] [l]
1070                                 // [v,ud] [u] [d]
1071                                 // [dr] [dl] [ur] [ul]
1072                                 // [vr,udr] [vl,vdl]
1073                                 // [hd,rld] [hu,rlu]
1074                                 // [hv,udrl,rlv,udh]
1075                                 ArrayList flags = new ArrayList (new int [] {
1076                                         32, 8 + 4, 8, 4,
1077                                         16, 1 + 2, 1, 2,
1078                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
1079                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
1080                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
1081                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
1082                                         });
1083                                 byte [] offsets = new byte [] {
1084                                         0, 0, 1, 2,
1085                                         3, 3, 4, 5,
1086                                         6, 7, 8, 9,
1087                                         10, 10, 11, 11,
1088                                         12, 12, 13, 13,
1089                                         14, 14, 14, 14};
1090                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
1091                                         int flag = 0;
1092                                         if (s.IndexOf (" UP") >= 0)
1093                                                 flag |= 1;
1094                                         if (s.IndexOf (" DOWN") >= 0)
1095                                                 flag |= 2;
1096                                         if (s.IndexOf (" RIGHT") >= 0)
1097                                                 flag |= 4;
1098                                         if (s.IndexOf (" LEFT") >= 0)
1099                                                 flag |= 8;
1100                                         if (s.IndexOf (" VERTICAL") >= 0)
1101                                                 flag |= 16;
1102                                         if (s.IndexOf (" HORIZONTAL") >= 0)
1103                                                 flag |= 32;
1104
1105                                         int fidx = flags.IndexOf (flag);
1106                                         if (fidx >= 0)
1107                                                 value = offsets [fidx];
1108                                 } else if (s.IndexOf ("BLOCK") >= 0) {
1109                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
1110                                                 value = 0x12;
1111                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
1112                                                 value = 0x13;
1113                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1114                                                 value = 0x14;
1115                                         else if (s.IndexOf ("HALF") >= 0)
1116                                                 value = 0x15;
1117                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1118                                                 value = 0x16;
1119                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
1120                                                 value = 0x17;
1121                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1122                                                 value = 0x18;
1123                                         else
1124                                                 value = 0x19;
1125                                 }
1126                                 else if (s.IndexOf ("SHADE") >= 0)
1127                                         value = 0x19;
1128                                 else if (s.IndexOf ("SQUARE") >= 0)
1129                                         value = 0xBC - 0xE5;
1130                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1131                                         value = 0xBE - 0xE5;
1132                                 else if (s.IndexOf ("RECTANGLE") >= 0)
1133                                         value = 0xBD - 0xE5;
1134                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1135                                         value = 0xBF - 0xE5;
1136                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
1137                                         if (s.IndexOf ("UP-POINTING") >= 0)
1138                                                 value = 0xC0 - 0xE5;
1139                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1140                                                 value = 0xC1 - 0xE5;
1141                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1142                                                 value = 0xC2 - 0xE5;
1143                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1144                                                 value = 0xC3 - 0xE5;
1145                                 }
1146                                 else if (s.IndexOf ("POINTER") >= 0) {
1147                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1148                                                 value = 0xC4 - 0xE5;
1149                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1150                                                 value = 0xC5 - 0xE5;
1151                                 }
1152                                 else if (s.IndexOf ("DIAMOND") >= 0)
1153                                         value = 0xC6 - 0xE5;
1154                                 else if (s.IndexOf ("FISHEYE") >= 0)
1155                                         value = 0xC7 - 0xE5;
1156                                 else if (s.IndexOf ("LOZENGE") >= 0)
1157                                         value = 0xC8 - 0xE5;
1158                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1159                                         value = 0xC9 - 0xE5;
1160                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1161                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1162                                                 value = 0xCA - 0xE5;
1163                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1164                                                 value = 0xCB - 0xE5;
1165                                         else
1166                                                 value = 0xC9 - 0xE5;
1167                                 }
1168                                 else if (s.IndexOf ("BULLET") >= 0)
1169                                         value = 0xCC - 0xE5;
1170                                 if (0x25DA <= cp && cp <= 0x25E5)
1171                                         value = 0xCD + cp - 0x25DA - 0xE5;
1172
1173                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1174                                 switch (cp) {
1175                                 case 0x2571: value = 0xF; break;
1176                                 case 0x2572: value = 0x10; break;
1177                                 case 0x2573: value = 0x11; break;
1178                                 }
1179                                 if (value != int.MinValue)
1180                                         boxValues.Add (new DictionaryEntry (
1181                                                 cp, value));
1182                         }
1183
1184                         // For some characters store the name and sort later
1185                         // to determine sorting.
1186                         if (0x2100 <= cp && cp <= 0x213F &&
1187                                 Char.IsSymbol ((char) cp))
1188                                 sortableCharNames.Add (
1189                                         new DictionaryEntry (cp, name));
1190                         else if (0x3380 <= cp && cp <= 0x33DD)
1191                                 sortableCharNames.Add (new DictionaryEntry (
1192                                         cp, name.Substring (7)));
1193
1194                         if (Char.GetUnicodeCategory ((char) cp) ==
1195                                 UnicodeCategory.MathSymbol) {
1196                                 if (name.StartsWith ("CIRCLED "))
1197                                         diacritical [cp] = 0xEE;
1198                                 if (name.StartsWith ("SQUARED "))
1199                                         diacritical [cp] = 0xEF;
1200                         }
1201
1202                         // diacritical weights by character name
1203 if (diacritics.Length != diacriticWeights.Length)
1204 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1205                         for (int d = diacritics.Length - 1; d >= 0; d--) {
1206                                 if (s.IndexOf (diacritics [d]) > 0) {
1207                                         diacritical [cp] += diacriticWeights [d];
1208                                         if (s.IndexOf ("COMBINING") >= 0)
1209                                                 diacritical [cp] -= (byte) 2;
1210                                         break;
1211                                 }
1212                                 // also process "COMBINING blah" here
1213                                 // For now it is limited to cp < 0x0370
1214 //                              if (cp < 0x0300 || cp >= 0x0370)
1215 //                                      continue;
1216                                 string tmp = diacritics [d].TrimEnd (';');
1217                                 if (tmp.IndexOf ("WITH ") == 0)
1218                                         tmp = tmp.Substring (4);
1219                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1220                                 if (name == tmp) {
1221                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1222                                         break;
1223                                 }
1224 //if (name == tmp)
1225 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1226                         }
1227                         // Two-step grep required for it.
1228                         if (s.IndexOf ("FULL STOP") > 0 &&
1229                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1230                                 diacritical [cp] |= 0xF4;
1231                         if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1232                                 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1233                                         s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1234
1235                         // Arabic letter name
1236                         if (0x0621 <= cp && cp <= 0x064A &&
1237                                 Char.GetUnicodeCategory ((char) cp)
1238                                 == UnicodeCategory.OtherLetter) {
1239                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1240                                 switch (cp) {
1241                                 case 0x0621:
1242                                 case 0x0624:
1243                                 case 0x0626:
1244                                         // hamza, waw, yeh ... special cases.
1245                                         value = 0x07;
1246                                         break;
1247                                 case 0x0649:
1248                                 case 0x064A:
1249                                         value = 0x77; // special cases.
1250                                         break;
1251                                 default:
1252                                         // Get primary letter name i.e.
1253                                         // XXX part of ARABIC LETTER XXX yyy
1254                                         // e.g. that of "TEH MARBUTA" is "TEH".
1255                                         string letterName =
1256                                                 (cp == 0x0640) ?
1257                                                 // 0x0640 is special: it does
1258                                                 // not start with ARABIC LETTER
1259                                                 name :
1260                                                 name.Substring (14);
1261                                         int tmpIdx = letterName.IndexOf (' ');
1262                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1263 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1264                                         if (arabicNameMap.ContainsKey (letterName))
1265                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1266                                         else
1267                                                 arabicNameMap [letterName] = cp;
1268                                         break;
1269                                 }
1270                                 arabicLetterPrimaryValues [cp] = value;
1271                         }
1272
1273                         // Japanese square letter
1274                         if (0x3300 <= cp && cp <= 0x3357)
1275                                 if (!ExistsJIS (cp))
1276                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1277
1278                         // normalizationType
1279                         string decomp = values [4];
1280                         idx = decomp.IndexOf ('<');
1281                         if (idx >= 0) {
1282                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1283                                 case "full":
1284                                         decompType [cp] = DecompositionFull;
1285                                         break;
1286                                 case "sub":
1287                                         decompType [cp] = DecompositionSub;
1288                                         break;
1289                                 case "super":
1290                                         decompType [cp] = DecompositionSuper;
1291                                         break;
1292                                 case "small":
1293                                         decompType [cp] = DecompositionSmall;
1294                                         break;
1295                                 case "isolated":
1296                                         decompType [cp] = DecompositionIsolated;
1297                                         break;
1298                                 case "initial":
1299                                         decompType [cp] = DecompositionInitial;
1300                                         break;
1301                                 case "final":
1302                                         decompType [cp] = DecompositionFinal;
1303                                         break;
1304                                 case "medial":
1305                                         decompType [cp] = DecompositionMedial;
1306                                         break;
1307                                 case "noBreak":
1308                                         decompType [cp] = DecompositionNoBreak;
1309                                         break;
1310                                 case "compat":
1311                                         decompType [cp] = DecompositionCompat;
1312                                         break;
1313                                 case "fraction":
1314                                         decompType [cp] = DecompositionFraction;
1315                                         break;
1316                                 case "font":
1317                                         decompType [cp] = DecompositionFont;
1318                                         break;
1319                                 case "circle":
1320                                         decompType [cp] = DecompositionCircle;
1321                                         break;
1322                                 case "square":
1323                                         decompType [cp] = DecompositionSquare;
1324                                         break;
1325                                 case "wide":
1326                                         decompType [cp] = DecompositionWide;
1327                                         break;
1328                                 case "narrow":
1329                                         decompType [cp] = DecompositionNarrow;
1330                                         break;
1331                                 case "vertical":
1332                                         decompType [cp] = DecompositionVertical;
1333                                         break;
1334                                 default:
1335                                         throw new Exception ("Support NFKD type : " + decomp);
1336                                 }
1337                         }
1338                         else
1339                                 decompType [cp] = DecompositionCanonical;
1340                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1341                         if (decomp.Length > 0) {
1342
1343                                 string [] velems = decomp.Split (' ');
1344                                 int didx = decompValues.Count;
1345                                 decompIndex [cp] = didx;
1346                                 foreach (string v in velems)
1347                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1348                                 decompLength [cp] = velems.Length;
1349
1350                                 // [decmpType] -> this_cp
1351                                 int targetCP = (int) decompValues [didx];
1352                                 // for "(x)" it specially maps to 'x' .
1353                                 // FIXME: check if it is sane
1354                                 if (velems.Length == 3 &&
1355                                         (int) decompValues [didx] == '(' &&
1356                                         (int) decompValues [didx + 2] == ')')
1357                                         targetCP = (int) decompValues [didx + 1];
1358                                 // special: 0x215F "1/"
1359                                 else if (cp == 0x215F)
1360                                         targetCP = '1';
1361                                 else if (velems.Length > 1 &&
1362                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1363                                         // skip them, except for CJK ideograph compat
1364                                         targetCP = 0;
1365
1366                                 if (targetCP != 0) {
1367                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1368                                         if (entry == null) {
1369                                                 entry = new Hashtable ();
1370                                                 nfkdMap [targetCP] = entry;
1371                                         }
1372                                         entry [(byte) decompType [cp]] = cp;
1373                                 }
1374                         }
1375                         // numeric values
1376                         if (values [5].Length > 0)
1377                                 decimalValue [cp] = decimal.Parse (values [5]);
1378                         else if (values [6].Length > 0)
1379                                 decimalValue [cp] = decimal.Parse (values [6]);
1380                         else if (values [7].Length > 0) {
1381                                 string decstr = values [7];
1382                                 idx = decstr.IndexOf ('/');
1383                                 if (cp == 0x215F) // special. "1/"
1384                                         decimalValue [cp] = 0x1;
1385                                 else if (idx > 0)
1386                                         // m/n
1387                                         decimalValue [cp] = 
1388                                                 decimal.Parse (decstr.Substring (0, idx))
1389                                                 / decimal.Parse (decstr.Substring (idx + 1));
1390                                 else if (decstr [0] == '(' &&
1391                                         decstr [decstr.Length - 1] == ')')
1392                                         // (n)
1393                                         decimalValue [cp] =
1394                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1395                                 else if (decstr [decstr.Length - 1] == '.')
1396                                         // n.
1397                                         decimalValue [cp] =
1398                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1399                                 else
1400                                         decimalValue [cp] = decimal.Parse (decstr);
1401                         }
1402                 }
1403
1404                 void ParseDerivedCoreProperties (string filename)
1405                 {
1406                         // IsUppercase
1407                         using (StreamReader file =
1408                                 new StreamReader (filename)) {
1409                                 for (int line = 1; file.Peek () >= 0; line++) {
1410                                         try {
1411                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1412                                         } catch (Exception) {
1413                                                 Console.Error.WriteLine ("**** At line " + line);
1414                                                 throw;
1415                                         }
1416                                 }
1417                         }
1418                 }
1419
1420                 void ProcessDerivedCorePropLine (string s)
1421                 {
1422                         int idx = s.IndexOf ('#');
1423                         if (idx >= 0)
1424                                 s = s.Substring (0, idx);
1425                         idx = s.IndexOf (';');
1426                         if (idx < 0)
1427                                 return;
1428                         string cpspec = s.Substring (0, idx);
1429                         idx = cpspec.IndexOf ("..");
1430                         NumberStyles nf = NumberStyles.HexNumber |
1431                                 NumberStyles.AllowTrailingWhite;
1432                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1433                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1434                         string value = s.Substring (cpspec.Length + 1).Trim ();
1435
1436                         // FIXME: use index
1437                         if (cp > char.MaxValue)
1438                                 return;
1439
1440                         switch (value) {
1441                         case "Uppercase":
1442                                 for (int x = cp; x <= cpEnd; x++)
1443                                         isUppercase [x] = true;
1444                                 break;
1445                         }
1446                 }
1447
1448                 void ParseScripts (string filename)
1449                 {
1450                         ArrayList gurmukhi = new ArrayList ();
1451                         ArrayList gujarati = new ArrayList ();
1452                         ArrayList georgian = new ArrayList ();
1453                         ArrayList thaana = new ArrayList ();
1454
1455                         using (StreamReader file =
1456                                 new StreamReader (filename)) {
1457                                 while (file.Peek () >= 0) {
1458                                         string s = file.ReadLine ();
1459                                         int idx = s.IndexOf ('#');
1460                                         if (idx >= 0)
1461                                                 s = s.Substring (0, idx);
1462                                         idx = s.IndexOf (';');
1463                                         if (idx < 0)
1464                                                 continue;
1465
1466                                         string cpspec = s.Substring (0, idx);
1467                                         idx = cpspec.IndexOf ("..");
1468                                         NumberStyles nf = NumberStyles.HexNumber |
1469                                                 NumberStyles.AllowTrailingWhite;
1470                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1471                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1472                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1473
1474                                         // FIXME: use index
1475                                         if (cp > char.MaxValue)
1476                                                 continue;
1477
1478                                         switch (value) {
1479                                         case "Gurmukhi":
1480                                                 for (int x = cp; x <= cpEnd; x++)
1481                                                         if (!IsIgnorable (x))
1482                                                                 gurmukhi.Add ((char) x);
1483                                                 break;
1484                                         case "Gujarati":
1485                                                 for (int x = cp; x <= cpEnd; x++)
1486                                                         if (!IsIgnorable (x))
1487                                                                 gujarati.Add ((char) x);
1488                                                 break;
1489                                         case "Georgian":
1490                                                 for (int x = cp; x <= cpEnd; x++)
1491                                                         if (!IsIgnorable (x))
1492                                                                 georgian.Add ((char) x);
1493                                                 break;
1494                                         case "Thaana":
1495                                                 for (int x = cp; x <= cpEnd; x++)
1496                                                         if (!IsIgnorable (x))
1497                                                                 thaana.Add ((char) x);
1498                                                 break;
1499                                         }
1500                                 }
1501                         }
1502                         gurmukhi.Sort (UCAComparer.Instance);
1503                         gujarati.Sort (UCAComparer.Instance);
1504                         georgian.Sort (UCAComparer.Instance);
1505                         thaana.Sort (UCAComparer.Instance);
1506                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1507                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1508                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1509                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1510                 }
1511
1512                 void ParseJISOrder (string filename)
1513                 {
1514                         int line = 1;
1515                         try {
1516                                 using (StreamReader file =
1517                                         new StreamReader (filename)) {
1518                                         for (;file.Peek () >= 0; line++)
1519                                                 ProcessJISOrderLine (file.ReadLine ());
1520                                 }
1521                         } catch (Exception) {
1522                                 Console.Error.WriteLine ("---- line {0}", line);
1523                                 throw;
1524                         }
1525                 }
1526
1527                 char [] ws = new char [] {'\t', ' '};
1528
1529                 void ProcessJISOrderLine (string s)
1530                 {
1531                         int idx = s.IndexOf ('#');
1532                         if (idx >= 0)
1533                                 s = s.Substring (0, idx).Trim ();
1534                         if (s.Length == 0)
1535                                 return;
1536                         idx = s.IndexOfAny (ws);
1537                         if (idx < 0)
1538                                 return;
1539                         // They start with "0x" so cut them out.
1540                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1541                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1542                         jisJapanese.Add (new JISCharacter (cp, jis));
1543                 }
1544
1545                 void ParseCJK (string zhXML, string jaXML, string koXML)
1546                 {
1547                         XmlDocument doc = new XmlDocument ();
1548                         doc.XmlResolver = null;
1549                         int v;
1550                         string s;
1551                         string category;
1552                         int offset;
1553                         ushort [] arr;
1554
1555                         // Chinese Simplified
1556                         category = "chs";
1557                         arr = cjkCHS;
1558                         offset = 0;//char.MaxValue - arr.Length;
1559                         doc.Load (zhXML);
1560                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1561                         v = 0x8008;
1562                         foreach (char c in s) {
1563                                 if (c < '\u3100')
1564                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1565                                 else {
1566                                         arr [(int) c - offset] = (ushort) v++;
1567                                         if (v % 256 == 0)
1568                                                 v += 2;
1569                                 }
1570                         }
1571
1572                         // Chinese Traditional
1573                         category = "cht";
1574                         arr = cjkCHT;
1575                         offset = 0;//char.MaxValue - arr.Length;
1576                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1577                         v = 0x8002;
1578                         foreach (char c in s) {
1579                                 if (c < '\u4E00')
1580                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1581                                 else {
1582                                         arr [(int) c - offset] = (ushort) v++;
1583                                         if (v % 256 == 0)
1584                                                 v += 2;
1585                                 }
1586                         }
1587
1588                         // Japanese
1589                         category = "ja";
1590                         arr = cjkJA;
1591                         offset = 0;//char.MaxValue - arr.Length;
1592
1593                         // SPECIAL CASES
1594                         arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1595                         arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1596                         arr [0x337E] = 0x8005;
1597                         arr [0x337D] = 0x8006;
1598                         arr [0x337C] = 0x8007;
1599
1600                         v = 0x8008;
1601                         foreach (JISCharacter jc in jisJapanese) {
1602                                 if (jc.JIS < 0x8800)
1603                                         continue;
1604                                 char c = (char) jc.CP;
1605
1606                                 if (c < '\u4E00')
1607                                         // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1608                                         continue;
1609                                 else {
1610                                         arr [(int) c - offset] = (ushort) v++;
1611                                         if (v % 256 == 0)
1612                                                 v += 2;
1613
1614                                         // SPECIAL CASES:
1615                                         if (c == '\u662D') // U+337C
1616                                                 continue;
1617                                         if (c == '\u5927') // U+337D
1618                                                 continue;
1619                                         if (c == '\u5E73') // U+337B
1620                                                 continue;
1621                                         if (c == '\u660E') // U+337E
1622                                                 continue;
1623                                         if (c == '\u9686') // U+F9DC
1624                                                 continue;
1625
1626                                         // FIXME: there are still remaining
1627                                         // characters after U+FA0C.
1628 //                                      for (int k = 0; k < char.MaxValue; k++) {
1629                                         for (int k = 0; k < '\uFA0D'; k++) {
1630                                                 if (decompIndex [k] == 0 || IsIgnorable (k))
1631                                                         continue;
1632                                                 if (decompValues [decompIndex [k]] == c /*&&
1633                                                         decompLength [k] == 1*/ ||
1634                                                         decompLength [k] == 3 &&
1635                                                         decompValues [decompIndex [k] + 1] == c) {
1636                                                         arr [k - offset] = (ushort) v++;
1637                                                         if (v % 256 == 0)
1638                                                                 v += 2;
1639                                                 }
1640                                         }
1641                                 }
1642                         }
1643
1644                         // Korean
1645                         // Korean weight is somewhat complex. It first shifts
1646                         // Hangul category from 52-x to 80-x (they are anyways
1647                         // computed). CJK ideographs are placed at secondary
1648                         // weight, like XX YY 01 zz 01, where XX and YY are
1649                         // corresponding "reset" value and zz is 41,43,45...
1650                         //
1651                         // Unlike chs,cht and ja, Korean value is a combined
1652                         // ushort which is computed as category
1653                         //
1654                         category = "ko";
1655                         arr = cjkKO;
1656                         offset = 0;//char.MaxValue - arr.Length;
1657                         doc.Load (koXML);
1658                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1659                                 XmlElement sc = (XmlElement) reset.NextSibling;
1660                                 // compute "category" and "level 1" for the 
1661                                 // target "reset" Hangle syllable
1662                                 char rc = reset.InnerText [0];
1663                                 int ri = ((int) rc - 0xAC00) + 1;
1664                                 ushort p = (ushort)
1665                                         ((ri / 254) * 256 + (ri % 254) + 2);
1666                                 // Place the characters after the target.
1667                                 s = sc.InnerText;
1668                                 v = 0x41;
1669                                 foreach (char c in s) {
1670                                         arr [(int) c - offset] = p;
1671                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1672                                         v += 2;
1673                                 }
1674                         }
1675                 }
1676
1677                 #endregion
1678
1679                 #region Generation
1680
1681                 void FillIgnorables ()
1682                 {
1683                         for (int i = 0; i <= char.MaxValue; i++) {
1684                                 if (Char.GetUnicodeCategory ((char) i) ==
1685                                         UnicodeCategory.OtherNotAssigned)
1686                                         continue;
1687                                 if (IsIgnorable (i))
1688                                         ignorableFlags [i] |= 1;
1689                                 if (IsIgnorableSymbol (i))
1690                                         ignorableFlags [i] |= 2;
1691                                 if (IsIgnorableNonSpacing (i))
1692                                         ignorableFlags [i] |= 4;
1693                         }
1694                 }
1695
1696                 void ModifyUnidata ()
1697                 {
1698                         ArrayList decompValues = new ArrayList (this.decompValues);
1699
1700                         // Hebrew uppercase letters.
1701                         foreach (int i in new int []
1702                                 {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6})
1703                                 isUppercase [i] = true;
1704
1705
1706                         // Modify some decomposition equivalence
1707                         for (int i = 0xFE31; i <= 0xFE34; i++) {
1708                                 decompType [i] = 0;
1709                                 decompIndex [i] = 0;
1710                                 decompLength [i] = 0;
1711                         }
1712                         decompType [0x037E] = 0;
1713                         decompIndex [0x037E] = 0;
1714                         decompLength [0x037E] = 0;
1715
1716                         // Hangzhou numbers
1717                         for (int i = 0x3021; i <= 0x3029; i++)
1718                                 diacritical [i] = 0x4E;
1719                         // Korean parens numbers
1720                         for (int i = 0x3200; i <= 0x321C; i++)
1721                                 diacritical [i] = 0xA;
1722                         for (int i = 0x3260; i <= 0x327B; i++)
1723                                 diacritical [i] = 0xC;
1724
1725                         // LAMESPEC: these remapping should not be done.
1726                         // Windows have incorrect CJK compat mappings.
1727                         decompValues [decompIndex [0x32A9]] = 0x91AB;
1728                         decompLength [0x323B] = 1;
1729                         decompValues [decompIndex [0x323B]] = 0x5B78;
1730                         decompValues [decompIndex [0x32AB]] = 0x5B78;
1731                         decompValues [decompIndex [0x32A2]] = 0x5BEB;
1732                         decompLength [0x3238] = 1;
1733                         decompValues [decompIndex [0x3238]] = 0x52DE;
1734                         decompValues [decompIndex [0x3298]] = 0x52DE;
1735
1736                         // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
1737                         decompIndex [0xFA0C] = decompValues.Count;
1738                         decompValues.Add ((int) 0x5140);
1739                         decompLength [0xFA0C] = 1;
1740                         decompIndex [0xF929] = decompLength [0xF929] = 0;
1741
1742                         decompValues [decompIndex [0xF92C]] = 0x90DE;
1743
1744                         decompIndex [0x2125] = decompValues.Count;
1745                         decompValues.Add ((int) 0x005A);
1746                         decompLength [0x2125] = 1;
1747                         decompType [0x2125] = DecompositionFont;
1748
1749                         this.decompValues = decompValues.ToArray (typeof (int)) as int [];
1750                 }
1751
1752                 void ModifyParsedValues ()
1753                 {
1754                         // Sometimes STROKE don't work fine
1755                         diacritical [0xD8] = diacritical [0xF8] = 0x21;
1756                         diacritical [0x141] = diacritical [0x142] = 0x1F;
1757                         // FIXME: why?
1758                         diacritical [0xAA] = diacritical [0xBA] = 3;
1759                         diacritical [0xD0] = diacritical [0xF0] = 0x68;
1760                         diacritical [0x131] = 3;
1761                         diacritical [0x138] = 3;
1762                         // TOPBAR does not work as an identifier for the weight
1763                         diacritical [0x182] = diacritical [0x183] = 0x68; // B
1764                         diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D
1765                         // TONE TWO
1766                         diacritical [0x1A7] = diacritical [0x1A8] = 0x87;
1767                         // TONE SIX
1768                         diacritical [0x184] = diacritical [0x185] = 0x87;
1769                         // OPEN E
1770                         diacritical [0x190] = diacritical [0x25B] = 0x7B;
1771                         // There are many letters w/ diacritical weight 0x7B
1772                         diacritical [0x0192] = diacritical [0x0194] =
1773                         diacritical [0x0195] = diacritical [0x0196] =
1774                         diacritical [0x019C] = diacritical [0x019E] =
1775                         diacritical [0x01A6] = diacritical [0x01B1] =
1776                         diacritical [0x01B2] = diacritical [0x01BF] = 0x7B;
1777                         // ... as well as 0x7C
1778                         diacritical [0x01A2] = diacritical [0x01A3] = 0x7C;
1779
1780                         // <font> NFKD characters seem to have diacritical
1781                         // weight as 3,4,5... but the order does not look
1782                         // by codepoint and I have no idea how they are sorted.
1783                         diacritical [0x210E] = 3;
1784                         diacritical [0x210F] = 0x68;
1785                         diacritical [0x2110] = 4;
1786                         diacritical [0x2111] = 5;
1787                         diacritical [0x2112] = 4;
1788                         diacritical [0x2113] = 4;
1789                         diacritical [0x211B] = 4;
1790                         diacritical [0x211C] = 5;
1791
1792                         // some cyrillic diacritical weight. They seem to be
1793                         // based on old character names, so it's quicker to
1794                         // set them directly here.
1795                         // FIXME: they are by mostly unknown reason
1796                         diacritical [0x0496] = diacritical [0x0497] = 7;
1797                         diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1798                         diacritical [0x049A] = diacritical [0x049B] = 0x17;
1799                         diacritical [0x049C] = diacritical [0x049D] = 9;
1800                         diacritical [0x049E] = diacritical [0x049F] = 4;
1801                         diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1802                         diacritical [0x04A2] = diacritical [0x04A3] = 7;
1803                         diacritical [0x04A4] = diacritical [0x04A5] = 8;
1804                         diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA?
1805                         diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2
1806                         diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U?
1807                         diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC
1808                         diacritical [0x04B4] = diacritical [0x04B5] = 3;
1809                         diacritical [0x04B6] = 8;
1810                         diacritical [0x04B7] = 7;
1811                         diacritical [0x04B8] = diacritical [0x04B9] = 9;
1812                         diacritical [0x04BA] = diacritical [0x04BB] = 9;
1813
1814                         // number, secondary weights
1815                         byte weight = 0x38;
1816                         int [] numarr = numberSecondaryWeightBounds;
1817                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1818                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1819                                         if (Char.IsNumber ((char) cp))
1820                                                 diacritical [cp] = weight;
1821
1822                         // Gurmukhi special letters' diacritical weight
1823                         for (int i = 0x0A50; i < 0x0A60; i++)
1824                                 diacritical [i] = 4;
1825                         // Oriya special letters' diacritical weight
1826                         for (int i = 0x0B5C; i < 0x0B60; i++)
1827                                 diacritical [i] = 6;
1828
1829                         // Update name part of named characters
1830                         for (int i = 0; i < sortableCharNames.Count; i++) {
1831                                 DictionaryEntry de =
1832                                         (DictionaryEntry) sortableCharNames [i];
1833                                 int cp = (int) de.Key;
1834                                 string renamed = null;
1835                                 switch (cp) {
1836                                 case 0x2101: renamed = "A_1"; break;
1837                                 case 0x33C3: renamed = "A_2"; break;
1838                                 case 0x2105: renamed = "C_1"; break;
1839                                 case 0x2106: renamed = "C_2"; break;
1840                                 case 0x211E: renamed = "R1"; break;
1841                                 case 0x211F: renamed = "R2"; break;
1842                                 // Remove some of them!
1843                                 case 0x2103:
1844                                 case 0x2109:
1845                                 case 0x2116:
1846                                 case 0x2117:
1847                                 case 0x2118:
1848                                 case 0x2125:
1849                                 case 0x2127:
1850                                 case 0x2129:
1851                                 case 0x212E:
1852                                 case 0x2132:
1853                                         sortableCharNames.RemoveAt (i);
1854                                         i--;
1855                                         continue;
1856                                 }
1857                                 if (renamed != null)
1858                                         sortableCharNames [i] =
1859                                                 new DictionaryEntry (cp, renamed);
1860                         }
1861                 }
1862
1863                 void GenerateCore ()
1864                 {
1865                         UnicodeCategory uc;
1866
1867                         #region Specially ignored // 01
1868                         // This will raise "Defined" flag up.
1869                         // FIXME: Check If it is really fine. Actually for
1870                         // Japanese voice marks this code does remapping.
1871                         foreach (char c in specialIgnore)
1872                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1873                         #endregion
1874
1875                         #region Extenders (FF FF)
1876                         fillIndex [0xFF] = 0xFF;
1877                         char [] specialBiggest = new char [] {
1878                                 '\u3005', '\u3031', '\u3032', '\u309D',
1879                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1880                                 '\uFE7C', '\uFE7D', '\uFF70'};
1881                         foreach (char c in specialBiggest)
1882                                 AddCharMap (c, 0xFF, 0);
1883                         #endregion
1884
1885                         #region Variable weights
1886                         // Controls : 06 03 - 06 3D
1887                         fillIndex [0x6] = 3;
1888                         for (int i = 0; i < 65536; i++) {
1889                                 if (IsIgnorable (i))
1890                                         continue;
1891                                 char c = (char) i;
1892                                 uc = Char.GetUnicodeCategory (c);
1893                                 // NEL is whitespace but not ignored here.
1894                                 if (uc == UnicodeCategory.Control &&
1895                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1896                                         AddCharMap (c, 6, 1);
1897                         }
1898
1899                         // Apostrophe 06 80
1900                         fillIndex [0x6] = 0x80;
1901                         AddCharMap ('\'', 6, 0);
1902                         AddCharMap ('\uFF07', 6, 1);
1903                         AddCharMap ('\uFE63', 6, 1);
1904
1905                         // SPECIAL CASE: fill FE32 here in prior to be added
1906                         // at 2013. Windows does not always respect NFKD.
1907                         map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1908
1909                         // Hyphen/Dash : 06 81 - 06 90
1910                         for (int i = 0; i < char.MaxValue; i++) {
1911                                 if (!IsIgnorable (i) &&
1912                                         Char.GetUnicodeCategory ((char) i) ==
1913                                         UnicodeCategory.DashPunctuation) {
1914                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1915                                         if (i == 0x2011) {
1916                                                 // SPECIAL: add 2027 and 2043
1917                                                 // Maybe they are regarded the 
1918                                                 // same hyphens in "central"
1919                                                 // position.
1920                                                 AddCharMap ('\u2027', 6, 1);
1921                                                 AddCharMap ('\u2043', 6, 1);
1922                                         }
1923                                 }
1924                         }
1925                         // They are regarded as primarily equivalent to '-'
1926                         map [0x208B] = new CharMapEntry (6, 0x82, 0);
1927                         map [0x207B] = new CharMapEntry (6, 0x82, 0);
1928                         map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1929
1930                         // Arabic variable weight chars 06 A0 -
1931                         fillIndex [6] = 0xA0;
1932                         // vowels
1933                         for (int i = 0x64B; i <= 0x650; i++)
1934                                 AddArabicCharMap ((char) i, 6, 1, 0);
1935                         // sukun
1936                         AddCharMapGroup ('\u0652', 6, 1, 0);
1937                         // shadda
1938                         AddCharMapGroup ('\u0651', 6, 1, 0);
1939                         #endregion
1940
1941
1942                         #region Nonspacing marks // 01
1943                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1944
1945                         // Combining diacritical marks: 01 DC -
1946
1947                         fillIndex [0x1] = 0x41;
1948                         for (int i = 0x030E; i <= 0x0326; i++)
1949                                 if (!IsIgnorable (i))
1950                                         AddCharMap ((char) i, 0x1, 1);
1951                         for (int i = 0x0329; i <= 0x0334; i++)
1952                                 if (!IsIgnorable (i))
1953                                         AddCharMap ((char) i, 0x1, 1);
1954                         fillIndex [0x1]++;
1955                         for (int i = 0x0339; i <= 0x0341; i++)
1956                                 if (!IsIgnorable (i))
1957                                         AddCharMap ((char) i, 0x1, 1);
1958                         fillIndex [0x1] = 0x74;
1959                         for (int i = 0x0346; i <= 0x0348; i++)
1960                                 if (!IsIgnorable (i))
1961                                         AddCharMap ((char) i, 0x1, 1);
1962                         for (int i = 0x02BE; i <= 0x02BF; i++)
1963                                 if (!IsIgnorable (i))
1964                                         AddCharMap ((char) i, 0x1, 1);
1965                         for (int i = 0x02C1; i <= 0x02C5; i++)
1966                                 if (!IsIgnorable (i))
1967                                         AddCharMap ((char) i, 0x1, 1);
1968                         for (int i = 0x02CE; i <= 0x02CF; i++)
1969                                 if (!IsIgnorable (i))
1970                                         AddCharMap ((char) i, 0x1, 1);
1971                         fillIndex [0x1]++;
1972                         for (int i = 0x02D1; i <= 0x02D3; i++)
1973                                 if (!IsIgnorable (i))
1974                                         AddCharMap ((char) i, 0x1, 1);
1975                         AddCharMap ('\u02DE', 0x1, 1);
1976                         for (int i = 0x02E4; i <= 0x02E9; i++)
1977                                 if (!IsIgnorable (i))
1978                                         AddCharMap ((char) i, 0x1, 1);
1979
1980
1981                         // FIXME: needs more love here (it should eliminate
1982                         // all the hacky code above).
1983                         for (int i = 0x0300; i < 0x0370; i++)
1984                                 if (!IsIgnorable (i) && diacritical [i] != 0
1985                                         && !map [i].Defined)
1986                                         map [i] = new CharMapEntry (
1987                                                 0x1, 0x1, diacritical [i]);
1988
1989                         // Cyrillic and Armenian nonspacing mark
1990                         fillIndex [0x1] = 0x94;
1991                         for (int i = 0x400; i < 0x580; i++)
1992                                 if (!IsIgnorable (i) &&
1993                                         Char.GetUnicodeCategory ((char) i) ==
1994                                         UnicodeCategory.NonSpacingMark)
1995                                         AddCharMap ((char) i, 1, 1);
1996
1997                         fillIndex [0x1] = 0x8D;
1998                         // syriac dotted nonspacing marks (1)
1999                         AddCharMap ('\u0740', 0x1, 1);
2000                         AddCharMap ('\u0741', 0x1, 1);
2001                         AddCharMap ('\u0742', 0x1, 1);
2002                         // syriac oblique nonspacing marks
2003                         AddCharMap ('\u0747', 0x1, 1);
2004                         AddCharMap ('\u0748', 0x1, 1);
2005                         // syriac dotted nonspacing marks (2)
2006                         fillIndex [0x1] = 0x94; // this reset is mandatory
2007                         AddCharMap ('\u0732', 0x1, 1);
2008                         AddCharMap ('\u0735', 0x1, 1);
2009                         AddCharMap ('\u0738', 0x1, 1);
2010                         AddCharMap ('\u0739', 0x1, 1);
2011                         AddCharMap ('\u073C', 0x1, 1);
2012                         // SPECIAL CASES: superscripts
2013                         AddCharMap ('\u073F', 0x1, 1);
2014                         AddCharMap ('\u0711', 0x1, 1);
2015                         // syriac "DOTS"
2016                         for (int i = 0x0743; i <= 0x0746; i++)
2017                                 AddCharMap ((char) i, 0x1, 1);
2018                         for (int i = 0x0730; i <= 0x0780; i++)
2019                                 if (!map [i].Defined &&
2020                                         Char.GetUnicodeCategory ((char) i) ==
2021                                         UnicodeCategory.NonSpacingMark)
2022                                         AddCharMap ((char) i, 0x1, 1);
2023
2024                         // LAMESPEC: It should not stop at '\u20E1'. There are
2025                         // a few more characters (that however results in 
2026                         // overflow of level 2 unless we start before 0xDD).
2027                         fillIndex [0x1] = 0xDD;
2028                         for (int i = 0x20D0; i <= 0x20DC; i++)
2029                                 AddCharMap ((char) i, 0x1, 1);
2030                         fillIndex [0x1] = 0xEC;
2031                         for (int i = 0x20DD; i <= 0x20E1; i++)
2032                                 AddCharMap ((char) i, 0x1, 1);
2033                         fillIndex [0x1] = 0x4;
2034                         AddCharMap ('\u0CD5', 0x1, 1);
2035                         AddCharMap ('\u0CD6', 0x1, 1);
2036                         AddCharMap ('\u093C', 0x1, 1);
2037                         for (int i = 0x302A; i <= 0x302D; i++)
2038                                 AddCharMap ((char) i, 0x1, 1);
2039                         AddCharMap ('\u0C55', 0x1, 1);
2040                         AddCharMap ('\u0C56', 0x1, 1);
2041
2042                         fillIndex [0x1] = 0x50; // I wonder how they are sorted
2043                         for (int i = 0x02D4; i <= 0x02D7; i++)
2044                                 AddCharMap ((char) i, 0x1, 1);
2045
2046                         // They are not part of Nonspacing marks, but have
2047                         // only diacritical weight.
2048                         for (int i = 0x3099; i <= 0x309C; i++)
2049                                 map [i] = new CharMapEntry (1, 1, 1);
2050                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
2051                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
2052                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
2053                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
2054                         for (int i = 0x30FC; i <= 0x30FE; i++)
2055                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
2056
2057                         fillIndex [0x1] = 0xA;
2058                         for (int i = 0x0951; i <= 0x0954; i++)
2059                                 AddCharMap ((char) i, 0x1, 2);
2060
2061                         #endregion
2062
2063
2064                         #region Whitespaces // 07 03 -
2065                         fillIndex [0x7] = 0x2;
2066                         AddCharMap (' ', 0x7, 2);
2067                         AddCharMap ('\u00A0', 0x7, 1);
2068                         for (int i = 9; i <= 0xD; i++)
2069                                 AddCharMap ((char) i, 0x7, 1);
2070                         for (int i = 0x2000; i <= 0x200B; i++)
2071                                 AddCharMap ((char) i, 0x7, 1);
2072
2073                         fillIndex [0x7] = 0x17;
2074                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
2075                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
2076
2077                         // Characters which used to represent layout control.
2078                         // LAMESPEC: Windows developers seem to have thought 
2079                         // that those characters are kind of whitespaces,
2080                         // while they aren't.
2081                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
2082                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
2083
2084                         #endregion
2085
2086                         // category 09 - continued symbols from 08
2087                         fillIndex [0x9] = 2;
2088                         // misc tech mark
2089                         for (int cp = 0x2300; cp <= 0x237A; cp++)
2090                                 AddCharMap ((char) cp, 0x9, 1, 0);
2091
2092                         // arrows
2093                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
2094                         foreach (DictionaryEntry de in arrowValues) {
2095                                 int idx = (int) de.Value;
2096                                 int cp = (int) de.Key;
2097                                 if (map [cp].Defined)
2098                                         continue;
2099                                 fillIndex [0x9] = (byte) (0xD8 + idx);
2100                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
2101                                 arrowLv2 [idx]++;
2102                         }
2103                         // boxes
2104                         byte [] boxLv2 = new byte [128];
2105                         // 0-63 will be used for those offsets are positive,
2106                         // and 64-127 are for negative ones.
2107                         for (int i = 0; i < boxLv2.Length; i++)
2108                                 boxLv2 [i] = 3;
2109                         foreach (DictionaryEntry de in boxValues) {
2110                                 int cp = (int) de.Key;
2111                                 int off = (int) de.Value;
2112                                 if (map [cp].Defined)
2113                                         continue;
2114                                 if (off < 0) {
2115                                         fillIndex [0x9] = (byte) (0xE5 + off);
2116                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
2117                                 }
2118                                 else {
2119                                         fillIndex [0x9] = (byte) (0xE5 + off);
2120                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
2121                                 }
2122                         }
2123                         // Some special characters (slanted)
2124                         fillIndex [0x9] = 0xF4;
2125                         AddCharMap ('\u2571', 0x9, 3);
2126                         AddCharMap ('\u2572', 0x9, 3);
2127                         AddCharMap ('\u2573', 0x9, 3);
2128
2129                         // FIXME: implement 0A
2130                         #region Symbols
2131                         fillIndex [0xA] = 2;
2132                         // byte currency symbols
2133                         for (int cp = 0; cp < 0x100; cp++) {
2134                                 uc = Char.GetUnicodeCategory ((char) cp);
2135                                 if (!IsIgnorable (cp) &&
2136                                         uc == UnicodeCategory.CurrencySymbol &&
2137                                         cp != '$')
2138                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
2139                         }
2140                         // byte other symbols
2141                         for (int cp = 0; cp < 0x100; cp++) {
2142                                 if (cp == 0xA6)
2143                                         continue; // SPECIAL: skip FIXME: why?
2144                                 uc = Char.GetUnicodeCategory ((char) cp);
2145                                 if (!IsIgnorable (cp) &&
2146                                         uc == UnicodeCategory.OtherSymbol ||
2147                                         cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
2148                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
2149                         }
2150                         // U+30FB here
2151                         AddCharMapGroup ('\u30FB', 0xA, 1, 0);
2152
2153                         for (int cp = 0x2020; cp <= 0x2031; cp++)
2154                                 if (Char.IsPunctuation ((char) cp))
2155                                         AddCharMap ((char) cp, 0xA, 1, 0);
2156                         // SPECIAL CASES: why?
2157                         AddCharMap ('\u203B', 0xA, 1, 0);
2158                         AddCharMap ('\u2040', 0xA, 1, 0);
2159                         AddCharMap ('\u2041', 0xA, 1, 0);
2160                         AddCharMap ('\u2042', 0xA, 1, 0);
2161
2162                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
2163                                 AddCharMap ((char) cp, 0xA, 1, 0);
2164
2165                         // 3004 is skipped at first...
2166                         for (int cp = 0x3010; cp <= 0x3040; cp++)
2167                                 if (Char.IsSymbol ((char) cp))
2168                                         AddCharMap ((char) cp, 0xA, 1, 0);
2169                         // SPECIAL CASES: added here
2170                         AddCharMap ('\u3004', 0xA, 1, 0);
2171                         AddCharMap ('\u327F', 0xA, 1, 0);
2172
2173                         for (int cp = 0x2600; cp <= 0x2613; cp++)
2174                                 AddCharMap ((char) cp, 0xA, 1, 0);
2175                         // Dingbats
2176                         for (int cp = 0x2620; cp <= 0x2770; cp++)
2177                                 if (Char.IsSymbol ((char) cp))
2178                                         AddCharMap ((char) cp, 0xA, 1, 0);
2179                         // OCR
2180                         for (int i = 0x2440; i < 0x2460; i++)
2181                                 AddCharMap ((char) i, 0xA, 1, 0);
2182
2183                         // SPECIAL CASES: why?
2184                         AddCharMap ('\u0E3F', 0xA, 1, 0);
2185                         AddCharMap ('\u2117', 0xA, 1, 0);
2186                         AddCharMap ('\u20AC', 0xA, 1, 0);
2187                         #endregion
2188
2189                         #region Numbers // 0C 02 - 0C E1
2190                         fillIndex [0xC] = 2;
2191
2192                         // 9F8 : Bengali "one less than the denominator"
2193                         AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2194
2195                         ArrayList numbers = new ArrayList ();
2196                         for (int i = 0; i < 65536; i++)
2197                                 if (!IsIgnorable (i) &&
2198                                         Char.IsNumber ((char) i) &&
2199                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2200                                         numbers.Add (i);
2201
2202                         ArrayList numberValues = new ArrayList ();
2203                         foreach (int i in numbers)
2204                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2205                         // SPECIAL CASE: Cyrillic Thousand sign
2206                         numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2207                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2208
2209 //foreach (DictionaryEntry de in numberValues)
2210 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2211
2212                         // FIXME: fillIndex adjustment lines are too
2213                         // complicated. It must be simpler.
2214                         decimal prevValue = -1;
2215                         foreach (DictionaryEntry de in numberValues) {
2216                                 int cp = (int) de.Key;
2217                                 decimal currValue = (decimal) de.Value;
2218                                 bool addnew = false;
2219                                 if (prevValue < currValue &&
2220                                         prevValue - (int) prevValue == 0 &&
2221                                         prevValue >= 1) {
2222
2223                                         addnew = true;
2224                                         // Process Hangzhou and Roman numbers
2225
2226                                         // There are some SPECIAL cases.
2227                                         if (currValue != 4) // no increment for 4
2228                                                 fillIndex [0xC]++;
2229
2230                                         int xcp;
2231                                         if (currValue <= 13) {
2232                                                 if (currValue == 4)
2233                                                         fillIndex [0xC]++;
2234                                                 // SPECIAL CASE
2235                                                 if (currValue == 11)
2236                                                         AddCharMap ('\u0BF0', 0xC, 1);
2237                                                 xcp = (int) prevValue + 0x2160 - 1;
2238                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2239                                                 xcp = (int) prevValue + 0x2170 - 1;
2240                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2241                                                 fillIndex [0xC]++;
2242                                         }
2243                                         if (currValue < 12)
2244                                                 fillIndex [0xC]++;
2245                                         if (currValue <= 10) {
2246                                                 xcp = (int) prevValue + 0x3021 - 1;
2247                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2248                                                 fillIndex [0xC]++;
2249                                         }
2250                                 }
2251                                 if (prevValue < currValue)
2252                                         prevValue = currValue;
2253                                 if (map [cp].Defined)
2254                                         continue;
2255                                 // Hangzhou and Roman numerals are added later
2256                                 // (by the code at the top of this loop)
2257                                 if (0x3021 <= cp && cp < 0x302A
2258                                         || 0x2160 <= cp && cp < 0x216C
2259                                         || 0x2170 <= cp && cp < 0x217C)
2260                                         continue;
2261
2262                                 if (cp == 0x215B) // FIXME: why?
2263                                         fillIndex [0xC] += 2;
2264                                 else if (cp == 0x3021) // FIXME: why?
2265                                         fillIndex [0xC]++;
2266                                 if (addnew || cp <= '9') {
2267                                         int mod = (int) currValue - 1;
2268                                         int xcp;
2269                                         if (1 <= currValue && currValue <= 11) {
2270                                                 xcp = mod + 0x2776;
2271                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2272                                                 xcp = mod + 0x2780;
2273                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2274                                                 xcp = mod + 0x278A;
2275                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2276                                         }
2277                                         if (1 <= currValue && currValue <= 20) {
2278                                                 xcp = mod + 0x2460;
2279                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2280                                                 xcp = mod + 0x2474;
2281                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2282                                                 xcp = mod + 0x2488;
2283                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2284                                         }
2285                                 }
2286                                 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2287                                         fillIndex [0xC]++;
2288                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2289
2290                                 switch (cp) {
2291                                 // Maybe Bengali digit numbers do not increase
2292                                 // indexes, but 0x09E6 does.
2293                                 case 0x09E7: case 0x09E8: case 0x09E9:
2294                                 case 0x09EA:
2295                                 // SPECIAL CASES
2296                                 case 0x0BF0: case 0x2180: case 0x2181:
2297                                         break;
2298                                 // SPECIAL CASE
2299                                 case 0x0BF1:
2300                                         fillIndex [0xC]++;
2301                                         break;
2302                                 default:
2303                                         if (currValue < 11 || currValue == 1000)
2304                                                 fillIndex [0xC]++;
2305                                         break;
2306                                 }
2307
2308                                 // Add special cases that are not regarded as 
2309                                 // numbers in UnicodeCategory speak.
2310                                 if (cp == '5') {
2311                                         // TONE FIVE
2312                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2313                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2314                                 }
2315                                 else if (cp == '2' || cp == '6') // FIXME: why?
2316                                         fillIndex [0xC]++;
2317                         }
2318
2319                         // 221E: infinity
2320                         fillIndex [0xC] = 0xFF;
2321                         AddCharMap ('\u221E', 0xC, 1);
2322                         #endregion
2323
2324                         #region Letters and NonSpacing Marks (general)
2325
2326                         // ASCII Latin alphabets
2327                         for (int i = 0; i < alphabets.Length; i++)
2328                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2329
2330                         // non-ASCII Latin alphabets
2331                         // FIXME: there are no such characters that are placed
2332                         // *after* "alphabets" array items. This is nothing
2333                         // more than a hack that creates dummy weights for
2334                         // primary characters.
2335                         for (int i = 0x0080; i < 0x0300; i++) {
2336                                 if (!Char.IsLetter ((char) i))
2337                                         continue;
2338                                 // Latin letters that have an NFKD decomposition are
2339                                 // not added as independent primary characters.
2340                                 if (decompIndex [i] != 0)
2341                                         continue;
2342                                 // SPECIAL CASES:
2343                                 // 1.some alphabets have primarily
2344                                 //   equivalent ASCII alphabets.
2345                                 // 2.some have independent primary weights,
2346                                 //   but inside a-to-z range.
2347                                 // 3.there are some expanded characters that
2348                                 //   are not part of Unicode Standard NFKD.
2349                                 // 4. some characters are letters per IsLetter
2350                                 //   but not in sortkeys (maybe a Unicode version
2351                                 //   difference caused it).
2352                                 switch (i) {
2353                                 // 1. skipping them does not make sense
2354 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2355 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2356 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2357 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2358 //                              case 0x19B: case 0x19C:
2359                                 // 2. skipping them does not make sense
2360 //                              case 0x14A: // Ng
2361 //                              case 0x14B: // ng
2362                                 // 3.
2363                                 case 0xC6: // AE
2364                                 case 0xE6: // ae
2365                                 case 0xDE: // Icelandic Thorn
2366                                 case 0xFE: // Icelandic Thorn
2367                                 case 0xDF: // German ss
2368                                 case 0xFF: // German ss
2369                                 // 4.
2370                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2371                                 // not classified yet
2372 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2373 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2374 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2375 //                              case 0x1DD:
2376                                         continue;
2377                                 }
2378                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2379                         }
2380
2381                         // Greek and Coptic
2382                         fillIndex [0xF] = 2;
2383                         for (int i = 0x0391; i < 0x03AA; i++)
2384                                 if (i != 0x03A2)
2385                                         AddCharMap ((char) i, 0xF, 1);
2386                         fillIndex [0xF] = 2;
2387                         for (int i = 0x03B1; i < 0x03CA; i++)
2388                                 if (i != 0x03C2)
2389                                         AddCharMap ((char) i, 0xF, 1);
2390                         // Final Sigma
2391                         map [0x03C2] = new CharMapEntry (0xF,
2392                                 map [0x03C3].Level1, map [0x03C3].Level2);
2393
2394                         fillIndex [0xF] = 0x40;
2395                         for (int i = 0x03DA; i < 0x03F0; i++)
2396                                 AddCharMap ((char) i, 0xF,
2397                                         (byte) (i % 2 == 0 ? 0 : 2));
2398
2399                         // NFKD
2400                         for (int i = 0x0386; i <= 0x0400; i++)
2401                                 FillLetterNFKD (i, true, true);
2402
2403                         // Cyrillic.
2404                         // Cyrillic letters are sorted like Latin letters, i.e.
2405                         // with culture-specific letters placed within the
2406                         // standard Cyrillic sequence.
2407                         //
2408                         // We can't use UCA here; it has different sorting.
2409                         char [] orderedCyrillic = new char [] {
2410                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2411                                 '\u0452', // DJE for Serbocroatian
2412                                 '\u0435',
2413                                 '\u0454', // IE for Ukrainian
2414                                 '\u0436', '\u0437',
2415                                 '\u0455', // DZE
2416                                 '\u0438',
2417                                 '\u0456', // Byelorussian-Ukrainian I
2418                                 '\u0457', // YI
2419                                 '\u0439',
2420                                 '\u0458', // JE
2421                                 '\u043A', '\u043B',
2422                                 '\u0459', // LJE
2423                                 '\u043C', '\u043D',
2424                                 '\u045A', // NJE
2425                                 '\u043E',
2426                                 // 4E9 goes here.
2427                                 '\u043F', '\u0440', '\u0441', '\u0442',
2428                                 '\u045B', // TSHE for Serbocroatian
2429                                 '\u0443',
2430                                 '\u045E', // Short U for Byelorussian
2431                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2432                                 '\u0444', '\u0445', '\u0446', '\u0447',
2433                                 '\u045F', // DZHE
2434                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2435                                 '\u044D', '\u044E', '\u044F'};
2436
2437                         // Here is a map from some characters to basic Cyrillic
2438                         // letters. See UnicodeData.txt character names for
2439                         // the sources. Here I simply declare an equiv. array.
2440                         // The entries are mapped from U+0490 (, U+0491) onward,
2441                         // skipping small letters.
2442                         char [] cymap_src = new char [] {
2443                                 '\u0433', '\u0433', '\u0433', '\u0436',
2444                                 '\u0437', '\u043A', '\u043A', '\u043A',
2445                                 '\u043A', '\u043D', '\u043D', '\u043F',
2446                                 '\u0445', '\u0441', '\u0442', '\u0443',
2447                                 '\u0443', '\u0445', '\u0446', '\u0447',
2448                                 '\u0447', '\u0432', '\u0435', '\u0435',
2449                                 '\u0406', '\u0436', '\u043A', '\u043D',
2450                                 '\u0447', '\u0435'};
2451
2452                         fillIndex [0x10] = 0x8D;
2453                         for (int i = 0x0460; i < 0x0481; i++) {
2454                                 if (Char.IsLetter ((char) i)) {
2455                                         if (i == 0x0476)
2456                                                 // U+476/477 have the same
2457                                                 // primary weight as U+474/475.
2458                                                 fillIndex [0x10] -= 3;
2459                                         AddLetterMap ((char) i, 0x10, 3);
2460                                 }
2461                         }
2462
2463                         fillIndex [0x10] = 0x6;
2464                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2465                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2466                                 if (!IsIgnorable ((int) c) &&
2467                                         Char.IsLetter (c) &&
2468                                         !map [c].Defined) {
2469                                         AddLetterMap (c, 0x10, 0);
2470                                         fillIndex [0x10] += 3;
2471                                 }
2472                         }
2473
2474                         // NFKD
2475                         for (int i = 0x0401; i <= 0x045F; i++)
2476                                 FillLetterNFKD (i, false, false);
2477
2478                         for (int i = 0; i < cymap_src.Length; i++) {
2479                                 char c = cymap_src [i];
2480                                 fillIndex [0x10] = map [c].Level1;
2481                                 int c2 = 0x0490 + i * 2;
2482                                 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2483                         }
2484
2485                         // Armenian
2486                         fillIndex [0x11] = 0x3;
2487                         fillIndex [0x1] = 0x98;
2488                         for (int i = 0x0531; i < 0x0586; i++) {
2489                                 if (i == 0x0559 || i == 0x55A)
2490                                         AddCharMap ((char) i, 1, 1);
2491                                 if (Char.IsLetter ((char) i))
2492                                         AddLetterMap ((char) i, 0x11, 1);
2493                         }
2494
2495                         // Hebrew
2496                         // -Letters
2497                         fillIndex [0x12] = 0x2;
2498                         for (int i = 0x05D0; i < 0x05FF; i++)
2499                                 if (Char.IsLetter ((char) i)) {
2500                                         if (isUppercase [i]) {
2501                                                 fillIndex [0x12]--;
2502                                                 AddLetterMap ((char) i, 0x12, 2);
2503                                         }
2504                                         else
2505                                                 AddLetterMap ((char) i, 0x12, 1);
2506                                 }
2507                         // -Accents
2508                         fillIndex [0x1] = 0x3;
2509                         for (int i = 0x0591; i <= 0x05C2; i++) {
2510                                 if (i == 0x05A3 || i == 0x05BB)
2511                                         fillIndex [0x1]++;
2512                                 if (i != 0x05BE)
2513                                         AddCharMap ((char) i, 0x1, 1);
2514                         }
2515
2516                         // Arabic
2517                         fillIndex [0x1] = 0x8E;
2518                         fillIndex [0x13] = 0x3;
2519                         for (int i = 0x0621; i <= 0x064A; i++) {
2520                                 // Abjad
2521                                 if (Char.GetUnicodeCategory ((char) i)
2522                                         != UnicodeCategory.OtherLetter) {
2523                                         // FIXME: arabic nonspacing marks are
2524                                         // in different order.
2525                                         AddCharMap ((char) i, 0x1, 1);
2526                                         continue;
2527                                 }
2528 //                              map [i] = new CharMapEntry (0x13,
2529 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2530                                 fillIndex [0x13] = 
2531                                         (byte) arabicLetterPrimaryValues [i];
2532                                 byte formDiacritical = 8; // default
2533                                 // SPECIAL CASES:
2534                                 switch (i) {
2535                                 case 0x0622: formDiacritical = 9; break;
2536                                 case 0x0623: formDiacritical = 0xA; break;
2537                                 case 0x0624: formDiacritical = 5; break;
2538                                 case 0x0625: formDiacritical = 0xB; break;
2539                                 case 0x0626: formDiacritical = 7; break;
2540                                 case 0x0649: formDiacritical = 5; break;
2541                                 case 0x064A: formDiacritical = 7; break;
2542                                 }
2543 //                              AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2544                                 AddArabicCharMap ((char) i, 0x13, 1, formDiacritical);
2545                         }
2546                         for (int i = 0x0670; i < 0x0673; i++)
2547                                 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2548                         fillIndex [0x13] = 0x84;
2549                         for (int i = 0x0674; i < 0x06D6; i++)
2550                                 if (Char.IsLetter ((char) i))
2551                                         AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2552
2553                         // Devanagari
2554
2555                         // FIXME: this could be fixed in more decent way
2556                         for (int i = 0x0958; i <= 0x095F; i++)
2557                                 diacritical [i] = 8;
2558
2559                         // FIXME: this does seem to be a straight codepoint mapping.
2560                         fillIndex [0x14] = 04;
2561                         for (int i = 0x0901; i < 0x0905; i++)
2562                                 if (!IsIgnorable (i))
2563                                         AddLetterMap ((char) i, 0x14, 2);
2564                         fillIndex [0x14] = 0xB;
2565                         for (int i = 0x0905; i < 0x093A; i++) {
2566                                 if (i == 0x0928)
2567                                         AddCharMap ('\u0929', 0x14, 0, 8);
2568                                 if (i == 0x0930)
2569                                         AddCharMap ('\u0931', 0x14, 0, 8);
2570                                 if (i == 0x0933)
2571                                         AddCharMap ('\u0934', 0x14, 0, 8);
2572                                 if (Char.IsLetter ((char) i))
2573                                         AddLetterMap ((char) i, 0x14, 4);
2574                                 if (i == 0x090B)
2575                                         AddCharMap ('\u0960', 0x14, 4);
2576                                 if (i == 0x090C)
2577                                         AddCharMap ('\u0961', 0x14, 4);
2578                         }
2579                         fillIndex [0x14] = 0xDA;
2580                         for (int i = 0x093E; i < 0x0945; i++)
2581                                 if (!IsIgnorable (i))
2582                                         AddLetterMap ((char) i, 0x14, 2);
2583                         fillIndex [0x14] = 0xEC;
2584                         for (int i = 0x0945; i < 0x094F; i++)
2585                                 if (!IsIgnorable (i))
2586                                         AddLetterMap ((char) i, 0x14, 2);
2587
2588                         // Bengali
2589                         // -Letters
2590                         fillIndex [0x15] = 02;
2591                         for (int i = 0x0980; i < 0x9FF; i++) {
2592                                 if (IsIgnorable (i))
2593                                         continue;
2594                                 if (i == 0x09E0)
2595                                         fillIndex [0x15] = 0x3B;
2596                                 switch (Char.GetUnicodeCategory ((char) i)) {
2597                                 case UnicodeCategory.NonSpacingMark:
2598                                 case UnicodeCategory.DecimalDigitNumber:
2599                                 case UnicodeCategory.OtherNumber:
2600                                         continue;
2601                                 }
2602                                 AddLetterMap ((char) i, 0x15, 1);
2603                         }
2604                         // -Signs
2605                         fillIndex [0x1] = 0x3;
2606                         for (int i = 0x0981; i < 0x0A00; i++)
2607                                 if (Char.GetUnicodeCategory ((char) i) ==
2608                                         UnicodeCategory.NonSpacingMark)
2609                                         AddCharMap ((char) i, 0x1, 1);
2610
2611                         // Gurmukhi. orderedGurmukhi is from UCA
2612                         // FIXME: it does not look equivalent to UCA.
2613                         fillIndex [0x16] = 04;
2614                         fillIndex [0x1] = 3;
2615                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2616                                 char c = orderedGurmukhi [i];
2617                                 if (IsIgnorable ((int) c))
2618                                         continue;
2619                                 if (IsIgnorableNonSpacing (c)) {
2620                                         AddLetterMap (c, 0x1, 1);
2621                                         continue;
2622                                 }
2623                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2624                                         '\u0A66' <= c && c <= '\u0A71')
2625                                         continue;
2626                                 // SPECIAL CASES
2627                                 byte shift = 4;
2628                                 switch (c) {
2629                                 case '\u0A33': case '\u0A36': case '\u0A16':
2630                                 case '\u0A17': case '\u0A5B': case '\u0A5E':
2631                                         shift = 0;
2632                                         break;
2633                                 }
2634                                 if (c == '\u0A3E') // Skip
2635                                         fillIndex [0x16] = 0xC0;
2636                                 AddLetterMap (c, 0x16, shift);
2637                         }
2638
2639                         // Gujarati. orderedGujarati is from UCA
2640                         fillIndex [0x17] = 0x4;
2641                         // nonspacing marks
2642                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2643                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2644                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2645                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2646                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2647                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2648                         // letters go first.
2649                         for (int i = 0; i < orderedGujarati.Length; i++) {
2650                                 // SPECIAL CASE
2651                                 char c = orderedGujarati [i];
2652                                 if (Char.IsLetter (c)) {
2653                                         // SPECIAL CASES
2654                                         if (c == '\u0AB3' || c == '\u0A32')
2655                                                 continue;
2656                                         if (c == '\u0A33') {
2657                                                 AddCharMap ('\u0A32', 0x17, 0);
2658                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2659                                                 continue;
2660                                         }
2661                                         if (c == '\u0A8B')
2662                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2663                                         AddCharMap (c, 0x17, 4);
2664
2665                                         if (c == '\u0AB9')
2666                                                 AddCharMap ('\u0AB3', 0x17, 6);
2667                                 }
2668                         }
2669                         // non-letters
2670                         byte gujaratiShift = 4;
2671                         fillIndex [0x17] = 0xC0;
2672                         for (int i = 0; i < orderedGujarati.Length; i++) {
2673                                 char c = orderedGujarati [i];
2674                                 if (fillIndex [0x17] == 0xCC)
2675                                         gujaratiShift = 3;
2676                                 if (!Char.IsLetter (c)) {
2677                                         // SPECIAL CASES
2678                                         if (c == '\u0A82')
2679                                                 AddCharMap ('\u0A81', 0x17, 2);
2680                                         if (c == '\u0AC2')
2681                                                 fillIndex [0x17]++;
2682                                         AddLetterMap (c, 0x17, gujaratiShift);
2683                                 }
2684                         }
2685
2686                         // Oriya
2687                         fillIndex [0x1] = 03;
2688                         fillIndex [0x18] = 02;
2689                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2690                                 switch (Char.GetUnicodeCategory ((char) i)) {
2691                                 case UnicodeCategory.NonSpacingMark:
2692                                 case UnicodeCategory.DecimalDigitNumber:
2693                                         AddLetterMap ((char) i, 0x1, 1);
2694                                         continue;
2695                                 }
2696                                 AddLetterMapCore ((char) i, 0x18, 1, 0, true);
2697                         }
2698
2699                         // Tamil
2700                         fillIndex [0x19] = 2;
2701                         AddCharMap ('\u0BD7', 0x19, 0);
2702                         fillIndex [0x19] = 0xA;
2703                         // vowels
2704                         for (int i = 0x0B82; i <= 0x0B94; i++)
2705                                 if (!IsIgnorable ((char) i))
2706                                         AddCharMap ((char) i, 0x19, 2);
2707                         // special vowel
2708                         fillIndex [0x19] = 0x28;
2709                         // The array for Tamil consonants is a constant.
2710                         // Windows has a sequence almost identical to TAM from
2711                         // tamilnet, but slightly different in Grantha.
2712                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2713                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2714                         // combining marks
2715                         fillIndex [0x19] = 0x82;
2716                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2717                                 if (Char.GetUnicodeCategory ((char) i) ==
2718                                         UnicodeCategory.SpacingCombiningMark
2719                                         || i == 0x0BC0)
2720                                         AddLetterMap ((char) i, 0x19, 2);
2721
2722                         // Telugu
2723                         fillIndex [0x1A] = 0x4;
2724                         for (int i = 0x0C00; i < 0x0C62; i++) {
2725                                 if (i == 0x0C55 || i == 0x0C56)
2726                                         continue; // skip
2727                                 AddCharMap ((char) i, 0x1A, 3);
2728                                 char supp = (i == 0x0C0B) ? '\u0C60':
2729                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2730                                 if (supp == char.MinValue)
2731                                         continue;
2732                                 AddCharMap (supp, 0x1A, 3);
2733                         }
2734
2735                         // Kannada
2736                         fillIndex [0x1B] = 4;
2737                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2738                                 if (i == 0x0CD5 || i == 0x0CD6)
2739                                         continue; // ignore
2740                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2741                                         continue; // shift after 0xCB9
2742                                 AddCharMap ((char) i, 0x1B, 3);
2743                                 if (i == 0x0CB9) {
2744                                         // SPECIAL CASES: but why?
2745                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2746                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2747                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2748                                 }
2749                                 if (i == 0x0CB2)
2750                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2751                         }
2752                         
2753                         // Malayalam
2754                         fillIndex [0x1C] = 2;
2755                         fillIndex [0x1] = 3;
2756                         for (int i = 0x0D02; i < 0x0D61; i++) {
2757                                 // FIXME: I avoided MSCompatUnicodeTable usage
2758                                 // here (it results in recursion). So check if
2759                                 // using NonSpacingMark makes sense or not.
2760                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2761 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2762                                         AddCharMap ((char) i, 0x1C, 1);
2763                                 else if (!IsIgnorable ((char) i))
2764                                         AddCharMap ((char) i, 1, 1);
2765                         }
2766
2767                         // Thai ... note that it breaks 0x1E wall after E2B!
2768                         // Also, all Thai characters have level 2 value 3.
2769                         fillIndex [0x1E] = 2;
2770                         fillIndex [0x1] = 3;
2771                         for (int i = 0xE40; i <= 0xE44; i++)
2772                                 AddCharMap ((char) i, 0x1E, 1, 3);
2773                         for (int i = 0xE01; i < 0xE2B; i++)
2774                                 AddCharMap ((char) i, 0x1E, 6, 3);
2775                         fillIndex [0x1F] = 5;
2776                         for (int i = 0xE2B; i < 0xE30; i++)
2777                                 AddCharMap ((char) i, 0x1F, 6, 3);
2778                         fillIndex [0x1F] = 0x1E;
2779                         for (int i = 0xE30; i < 0xE3B; i++)
2780                                 AddCharMap ((char) i, 0x1F, 1, 3);
2781                         // some Thai characters remain.
2782                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2783                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2784                         foreach (char c in specialThai)
2785                                 AddCharMap (c, 0x1F, 1, 3);
2786
2787                         for (int i = 0xE00; i < 0xE80; i++)
2788                                 if (Char.GetUnicodeCategory ((char) i) ==
2789                                         UnicodeCategory.NonSpacingMark)
2790                                         AddCharMap ((char) i, 1, 1);
2791
2792                         // Lao
2793                         fillIndex [0x1F] = 2;
2794                         fillIndex [0x1] = 3;
2795                         for (int i = 0xE80; i < 0xEDF; i++) {
2796                                 if (IsIgnorable ((char) i))
2797                                         continue;
2798                                 else if (Char.IsLetter ((char) i))
2799                                         AddCharMap ((char) i, 0x1F, 1);
2800                                 else if (Char.GetUnicodeCategory ((char) i) ==
2801                                         UnicodeCategory.NonSpacingMark)
2802                                         AddCharMap ((char) i, 1, 1);
2803                         }
2804
2805                         // Georgian. orderedGeorgian is from UCA DUCET.
2806                         fillIndex [0x21] = 5;
2807                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2808                                 char c = orderedGeorgian [i];
2809                                 if (map [(int) c].Defined)
2810                                         continue;
2811                                 AddCharMap (c, 0x21, 0);
2812                                 if (c < '\u10F6')
2813                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2814                                 fillIndex [0x21] += 5;
2815                         }
2816
2817                         // Japanese Kana.
2818                         fillIndex [0x22] = 2;
2819                         int kanaOffset = 0x3041;
2820                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2821
2822                         for (int gyo = 0; gyo < 9; gyo++) {
2823                                 for (int dan = 0; dan < 5; dan++) {
2824                                         if (gyo == 7 && dan % 2 == 1) {
2825                                                 // 'ya'-gyo
2826                                                 fillIndex [0x22]++;
2827                                                 kanaOffset -= 2; // There is no space for yi and ye.
2828                                                 continue;
2829                                         }
2830                                         int cp = kanaOffset + dan * kanaLines [gyo];
2831                                         // small lines (a-gyo, ya-gyo)
2832                                         if (gyo == 0 || gyo == 7) {
2833                                                 AddKanaMap (cp, 1); // small
2834                                                 AddKanaMap (cp + 1, 1);
2835                                         }
2836                                         else
2837                                                 AddKanaMap (cp, kanaLines [gyo]);
2838                                         fillIndex [0x22]++;
2839
2840                                         if (cp == 0x30AB) {
2841                                                 // add small 'ka' (before normal one)
2842                                                 AddKanaMap (0x30F5, 1);
2843                                                 kanaOffset++;
2844                                         }
2845                                         if (cp == 0x30B1) {
2846                                                 // add small 'ke' (before normal one)
2847                                                 AddKanaMap (0x30F6, 1);
2848                                                 kanaOffset++;
2849                                         }
2850                                         if (cp == 0x3061) {
2851                                                 // add small 'Tsu' (before normal one)
2852                                                 AddKanaMap (0x3063, 1);
2853                                                 kanaOffset++;
2854                                         }
2855                                 }
2856                                 fillIndex [0x22] += 3;
2857                                 kanaOffset += 5 * kanaLines [gyo];
2858                         }
2859
2860                         // Wa-gyo is mostly irregular, so it is added manually.
2861                         AddLetterMap ((char) 0x308E, 0x22, 0);
2862                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2863                         AddLetterMap ((char) 0x308F, 0x22, 0);
2864                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2865                         fillIndex [0x22]++;
2866                         AddLetterMap ((char) 0x3090, 0x22, 0);
2867                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2868                         fillIndex [0x22] += 2;
2869                         // no "Wu" in Japanese.
2870                         AddLetterMap ((char) 0x3091, 0x22, 0);
2871                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2872                         fillIndex [0x22]++;
2873                         AddLetterMap ((char) 0x3092, 0x22, 0);
2874                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2875                         // Nn
2876                         fillIndex [0x22] = 0x80;
2877                         AddLetterMap ((char) 0x3093, 0x22, 0);
2878                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2879
2880                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2881                                 map [0x30A6].Level1, 3);// voiced hiragana U
2882                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2883                                 map [0x30A6].Level1, 3);// voiced katakana U
2884
2885                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2886                                 map [0x30AB].Level1, 0);// small katakana Ka
2887                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2888                                 map [0x30B1].Level1, 0);// small katakana Ke
2889                         // voiced Wa lines
2890                         for (int i = 0x30F7; i < 0x30FB; i++)
2891                                 map [i] = new CharMapEntry (map [i - 8].Category,
2892                                         map [i - 8].Level1,
2893                                         3);
2894
2895                         // JIS Japanese square chars.
2896                         fillIndex [0x22] = 0x97;
2897                         jisJapanese.Sort (JISComparer.Instance);
2898                         foreach (JISCharacter j in jisJapanese)
2899                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2900                                         AddCharMap ((char) j.CP, 0x22, 1);
2901                         // non-JIS Japanese square chars.
2902                         nonJisJapanese.Sort (NonJISComparer.Instance);
2903                         foreach (NonJISCharacter j in nonJisJapanese)
2904                                 AddCharMap ((char) j.CP, 0x22, 1);
2905
2906                         // Bopomofo
2907                         fillIndex [0x23] = 0x02;
2908                         for (int i = 0x3105; i <= 0x312C; i++)
2909                                 AddCharMap ((char) i, 0x23, 1);
2910
2911                         // Estrangela: ancient Syriac
2912                         fillIndex [0x24] = 0x0B;
2913                         // FIXME: is 0x71E really alternative form?
2914                         ArrayList syriacAlternatives = new ArrayList (
2915                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2916                         for (int i = 0x0710; i <= 0x072C; i++) {
2917                                 if (i == 0x0711) // NonSpacingMark
2918                                         continue;
2919                                 if (syriacAlternatives.Contains (i))
2920                                         continue;
2921                                 AddCharMap ((char) i, 0x24, 4);
2922                                 // FIXME: why?
2923                                 if (i == 0x721)
2924                                         fillIndex [0x24]++;
2925                         }
2926                         foreach (int cp in syriacAlternatives)
2927                                 map [cp] = new CharMapEntry (0x24,
2928                                         (byte) (map [cp - 1].Level1 + 2),
2929                                         0);
2930                         // FIXME: Syriac NonSpacingMark should go here.
2931
2932                         // Thaana
2933                         // FIXME: it turned out that it does not look like UCA
2934                         fillIndex [0x24] = 0x6E;
2935                         fillIndex [0x1] = 0xAC;
2936                         for (int i = 0; i < orderedThaana.Length; i++) {
2937                                 char c = orderedThaana [i];
2938                                 if (IsIgnorableNonSpacing ((int) c))
2939                                         AddCharMap (c, 1, 1);
2940                                 AddCharMap (c, 0x24, 2);
2941                                 if (c == '\u0782') // SPECIAL CASE: why?
2942                                         fillIndex [0x24] += 2;
2943                         }
2944                         #endregion
2945
2946                         // FIXME: Add more culture-specific letters (that are
2947                         // not supported in Windows collation) here.
2948
2949                         // Surrogate ... they are computed.
2950
2951                         #region Hangul
2952                         // Hangul.
2953                         //
2954                         // Unlike UCA, the Windows Hangul sequence mixes
2955                         // Jongseong with the Choseong sequence as well as
2956                         // Jungseong, adjusted so that the same base character
2957                         // gets the same primary weight. It is therefore
2958                         // impossible to compute those sort keys.
2959                         //
2960                         // Here I introduce an ordered sequence of mixed
2961                         // 'commands' and 'characters' that is similar to
2962                         // LDML text:
2963                         //      - ',' increases primary weight.
2964                         //      - [A B] means a range, increasing index
2965                         //      - {A B} means a range, without increasing index
2966                         //      - '=' is no operation (it means the characters 
2967                         //        of both sides have the same weight).
2968                         //      - '>' inserts a Hangul Syllable block of
2969                         //        0x15 * 0x1C (= 0x24C) characters.
2970                         //      - '<' decreases the index
2971                         //      - '0'-'9' means skip count
2972                         //      - whitespaces are ignored
2973                         //
2974
2975                         string hangulSequence =
2976                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2977                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2978                         + "<{\u1113 \u1116}, \u3165,"
2979                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2980                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2981                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2982                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2983                                 + "[\u11D1 \u11D2], \u11B2,"
2984                                 + "[\u11D3 \u11D5], \u11B3,"
2985                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2986                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2987                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2988                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2989                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2990                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2991                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2992                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2993                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2994                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2995                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2996                         + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2997                                 + "\u11F1,, \u11F2,,,"
2998                                 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2999                         + "<\u114D, \u110D,,  >"
3000                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
3001                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
3002                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
3003                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
3004                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
3005                                 + "[\u11F5 \u11F8]"
3006                         ;
3007
3008                         byte hangulCat = 0x52;
3009                         fillIndex [hangulCat] = 0x2;
3010
3011                         int syllableBlock = 0;
3012                         for (int n = 0; n < hangulSequence.Length; n++) {
3013                                 char c = hangulSequence [n];
3014                                 int start, end;
3015                                 if (Char.IsWhiteSpace (c))
3016                                         continue;
3017                                 switch (c) {
3018                                 case '=':
3019                                         break; // NOP
3020                                 case ',':
3021                                         IncrementSequentialIndex (ref hangulCat);
3022                                         break;
3023                                 case '<':
3024                                         if (fillIndex [hangulCat] == 2)
3025                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
3026                                         fillIndex [hangulCat]--;
3027                                         break;
3028                                 case '>':
3029                                         IncrementSequentialIndex (ref hangulCat);
3030                                         for (int l = 0; l < 0x15; l++)
3031                                                 for (int v = 0; v < 0x1C; v++) {
3032                                                         AddCharMap (
3033                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
3034                                                         IncrementSequentialIndex (ref hangulCat);
3035                                                 }
3036                                         syllableBlock++;
3037                                         break;
3038                                 case '[':
3039                                         start = hangulSequence [n + 1];
3040                                         end = hangulSequence [n + 3];
3041                                         for (int i = start; i <= end; i++) {
3042                                                 AddCharMap ((char) i, hangulCat, 0);
3043                                                 if (end > i)
3044                                                         IncrementSequentialIndex (ref hangulCat);
3045                                         }
3046                                         n += 4; // consumes 5 characters for this operation
3047                                         break;
3048                                 case '{':
3049                                         start = hangulSequence [n + 1];
3050                                         end = hangulSequence [n + 3];
3051                                         for (int i = start; i <= end; i++)
3052                                                 AddCharMap ((char) i, hangulCat, 0);
3053                                         n += 4; // consumes 5 characters for this operation
3054                                         break;
3055                                 default:
3056                                         AddCharMap (c, hangulCat, 0);
3057                                         break;
3058                                 }
3059                         }
3060
3061                         // Some Jamo NFKD.
3062                         for (int i = 0x3200; i < 0x3300; i++) {
3063                                 if (IsIgnorable (i) || map [i].Defined)
3064                                         continue;
3065                                 int ch = 0;
3066                                 // w/ bracket
3067                                 if (decompLength [i] == 4 &&
3068                                         decompValues [decompIndex [i]] == '(')
3069                                         ch = decompIndex [i] + 1;
3070                                 // circled
3071                                 else if (decompLength [i] == 2 &&
3072                                         decompValues [decompIndex [i] + 1] == '\u1161')
3073                                         ch = decompIndex [i];
3074                                 else if (decompLength [i] == 1)
3075                                         ch = decompIndex [i];
3076                                 else
3077                                         continue;
3078                                 ch = decompValues [ch];
3079                                 if (ch < 0x1100 || 0x1200 < ch &&
3080                                         ch < 0xAC00 || 0xD800 < ch)
3081                                         continue;
3082
3083                                 // SPECIAL CASE ?
3084                                 int offset = i < 0x3260 ? 1 : 0;
3085                                 if (0x326E <= i && i <= 0x3273)
3086                                         offset = 1;
3087
3088                                 map [i] = new CharMapEntry (map [ch].Category,
3089                                         (byte) (map [ch].Level1 + offset),
3090                                         map [ch].Level2);
3091 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
3092                         }
3093
3094
3095                         #endregion
3096
3097                         // Letterlike characters and CJK compatibility square
3098                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
3099                         int [] counts = new int ['Z' - 'A' + 1];
3100                         char [] namedChars = new char [sortableCharNames.Count];
3101                         int nCharNames = 0;
3102                         foreach (DictionaryEntry de in sortableCharNames) {
3103                                 counts [((string) de.Value) [0] - 'A']++;
3104                                 namedChars [nCharNames++] = (char) ((int) de.Key);
3105                         }
3106                         nCharNames = 0; // reset
3107                         for (int a = 0; a < counts.Length; a++) {
3108                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
3109                                 for (int i = 0; i < counts [a]; i++)
3110 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
3111                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
3112                         }
3113
3114                         // CJK unified ideograph.
3115                         byte cjkCat = 0x9E;
3116                         fillIndex [cjkCat] = 0x2;
3117                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
3118                                 if (!IsIgnorable (cp))
3119                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
3120                         // CJK Extensions go here.
3121                         // LAMESPEC: With this Windows-style CJK layout, it is
3122                         // impossible to add more CJK ideographs, i.e. 0x9FA6-
3123                         // 0x9FBB can never be added without breaking compat.
3124                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
3125                                 if (!IsIgnorable (cp))
3126                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
3127
3128                         // PrivateUse ... computed.
3129                         // remaining Surrogate ... computed.
3130
3131                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
3132                         // non-alphanumeric ASCII except for: + - < = > '
3133                         for (int i = 0x21; i < 0x7F; i++) {
3134                                 // SPECIAL CASE: 02C6 appears to be treated as
3135                                 // equivalent to '^', which does not conform
3136                                 // to the Unicode standard character database.
3137                                 if (i == 0x005B)
3138                                         AddCharMap ('\u2045', 0x7, 0, 0x1C);
3139                                 if (i == 0x005D)
3140                                         AddCharMap ('\u2046', 0x7, 0, 0x1C);
3141                                 if (i == 0x005E)
3142                                         AddCharMap ('\u02C6', 0x7, 0, 3);
3143                                 if (i == 0x0060)
3144                                         AddCharMap ('\u02CB', 0x7, 0, 3);
3145
3146                                 if (Char.IsLetterOrDigit ((char) i)
3147                                         || "+-<=>'".IndexOf ((char) i) >= 0)
3148                                         continue; // they are not added here.
3149
3150                                 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3151                                 // Insert 3001 after ',' and 3002 after '.'
3152                                 if (i == 0x2C)
3153                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
3154                                 else if (i == 0x2E)
3155                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
3156                                 else if (i == 0x3A)
3157                                         AddCharMap ('\uFE30', 0x7, 1, 0);
3158                         }
3159                         #endregion
3160
3161                         #region 07 - Punctuations and something else
3162                         for (int i = 0xA0; i < char.MaxValue; i++) {
3163                                 if (IsIgnorable (i))
3164                                         continue;
3165
3166                                 // FIXME: these resets should not actually be
3167                                 // necessary; they are here only as a shortcut.
3168                                 if (i == 0x05C3)
3169                                         fillIndex [0x7]++;
3170                                 if (i == 0x0700)
3171                                         fillIndex [0x7] = 0xE2;
3172                                 if (i == 0x2016)
3173                                         fillIndex [0x7] = 0x77;
3174                                 if (i == 0x3008)
3175                                         fillIndex [0x7] = 0x93;
3176
3177                                 if (0x02C8 <= i && i <= 0x02CD)
3178                                         continue; // nonspacing marks
3179
3180                                 // SPECIAL CASE: maybe they could be allocated
3181                                 // dummy NFKD mapping and no special processing
3182                                 // would be required here.
3183                                 if (i == 0x00AF)
3184                                         AddCharMap ('\u02C9', 0x7, 0, 3);
3185                                 if (i == 0x00B4)
3186                                         AddCharMap ('\u02CA', 0x7, 0, 3);
3187                                 if (i == 0x02C7)
3188                                         AddCharMap ('\u02D8', 0x7, 0, 3);
3189
3190                                 // SPECIAL CASES:
3191                                 switch (i) {
3192                                 case 0xAB: // 08
3193                                 case 0xB7: // 0A
3194                                 case 0xBB: // 08
3195                                 case 0x02B9: // 01
3196                                 case 0x02BA: // 01
3197                                 case 0x2329: // 09
3198                                 case 0x232A: // 09
3199                                         continue;
3200                                 }
3201
3202                                 switch (Char.GetUnicodeCategory ((char) i)) {
3203                                 case UnicodeCategory.OtherPunctuation:
3204                                 case UnicodeCategory.ClosePunctuation:
3205                                 case UnicodeCategory.OpenPunctuation:
3206                                 case UnicodeCategory.ConnectorPunctuation:
3207                                 case UnicodeCategory.InitialQuotePunctuation:
3208                                 case UnicodeCategory.FinalQuotePunctuation:
3209                                 case UnicodeCategory.ModifierSymbol:
3210                                         // SPECIAL CASES: // 0xA
3211                                         if (0x2020 <= i && i <= 0x2031)
3212                                                 continue;
3213                                         if (i == 0x3003) // added later
3214                                                 continue;
3215                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3216                                         break;
3217                                 default:
3218                                         if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3219                                                 goto case UnicodeCategory.OtherPunctuation;
3220                                         break;
3221                                 }
3222                         }
3223
3224                         // Control pictures
3225                         // FIXME: resetting level 1 should not be necessary,
3226                         // but it is done here as a shortcut.
3227                         fillIndex [0x7] = 0xB6;
3228                         for (int i = 0x2400; i <= 0x2424; i++)
3229                                 AddCharMap ((char) i, 0x7, 1, 0);
3230
3231                         // FIXME: what are they?
3232                         AddCharMap ('\u3003', 0x7, 1);
3233                         AddCharMap ('\u3006', 0x7, 1);
3234                         AddCharMap ('\u02D0', 0x7, 1);
3235                         AddCharMap ('\u10FB', 0x7, 1);
3236                         AddCharMap ('\u0950', 0x7, 1);
3237                         AddCharMap ('\u093D', 0x7, 1);
3238                         AddCharMap ('\u0964', 0x7, 1);
3239                         AddCharMap ('\u0965', 0x7, 1);
3240                         AddCharMap ('\u0970', 0x7, 1);
3241
3242                         #endregion
3243
3244                         #region category 08 - symbols
3245                         fillIndex [0x8] = 2;
3246                         // The Windows mapping here is not straightforward. It
3247                         // is not computed; it seems to have been sorted manually.
3248                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
3249                         AddCharMapGroup ('\u2212', 0x8, 1); // minus
3250                         AddCharMapGroup ('\u229D', 0x8, 1); // minus
3251                         AddCharMapGroup ('\u2297', 0x8, 1); // mul
3252                         AddCharMapGroup ('\u2044', 0x8, 1); // div
3253                         AddCharMapGroup ('\u2215', 0x8, 0); // div
3254                         AddCharMapGroup ('\u2298', 0x8, 1); // div slash
3255                         AddCharMapGroup ('\u2217', 0x8, 0); // mul
3256                         AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper
3257                         AddCharMapGroup ('\u2218', 0x8, 0); // ring
3258                         AddCharMapGroup ('\u229A', 0x8, 1); // ring
3259                         AddCharMapGroup ('\u2219', 0x8, 0); // bullet
3260                         AddCharMapGroup ('\u2299', 0x8, 1); // dot oper
3261                         AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus
3262                         AddCharMapGroup ('\u003C', 0x8, 1); // <
3263                         AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation
3264                         AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation
3265
3266                         for (int cp = 0; cp < 0x2300; cp++) {
3267                                 if (cp == 0xAC) // SPECIAL CASE: skip
3268                                         continue;
3269                                 if (cp == 0x200) {
3270                                         cp = 0x2200; // skip to 2200
3271                                         fillIndex [0x8] = 0x21;
3272                                 }
3273                                 if (cp == 0x2295)
3274                                         fillIndex [0x8] = 0x3;
3275                                 if (cp == 0x22A2)
3276                                         fillIndex [0x8] = 0xAB;
3277                                 if (cp == 0x22B2)
3278                                         fillIndex [0x8] = 0xB9;
3279                                 if (!map [cp].Defined &&
3280 //                                      Char.GetUnicodeCategory ((char) cp) ==
3281 //                                      UnicodeCategory.MathSymbol)
3282                                         Char.IsSymbol ((char) cp))
3283                                         AddCharMapGroup ((char) cp, 0x8, 1);
3284                                 // SPECIAL CASES: no idea why Windows sorts as such
3285                                 switch (cp) {
3286                                 case 0x3E:
3287                                         AddCharMap ('\u227B', 0x8, 1, 0);
3288                                         AddCharMap ('\u22B1', 0x8, 1, 0);
3289                                         break;
3290                                 case 0xB1:
3291                                         AddCharMapGroup ('\u00AB', 0x8, 1);
3292                                         AddCharMapGroup ('\u226A', 0x8, 1);
3293                                         AddCharMapGroup ('\u00BB', 0x8, 1);
3294                                         AddCharMapGroup ('\u226B', 0x8, 1);
3295                                         break;
3296                                 case 0xF7:
3297                                         AddCharMap ('\u01C0', 0x8, 1, 0);
3298                                         AddCharMap ('\u01C1', 0x8, 1, 0);
3299                                         AddCharMap ('\u01C2', 0x8, 1, 0);
3300                                         break;
3301                                 }
3302                         }
3303                         #endregion
3304
3305                         #region Hack!
3306
3307                         // Characters w/ diacritical marks (NFKD)
3308                         for (int i = 0; i <= char.MaxValue; i++) {
3309                                 if (map [i].Defined || IsIgnorable (i))
3310                                         continue;
3311                                 if (decompIndex [i] == 0)
3312                                         continue;
3313
3314                                 int start = decompIndex [i];
3315                                 int primaryChar = decompValues [start];
3316                                 int secondary = diacritical [i];
3317                                 bool skip = false;
3318                                 int length = decompLength [i];
3319                                 // special processing for parenthesized ones.
3320                                 if (length == 3 &&
3321                                         decompValues [start] == '(' &&
3322                                         decompValues [start + 2] == ')') {
3323                                         primaryChar = decompValues [start + 1];
3324                                         length = 1;
3325                                 }
3326
3327                                 if (map [primaryChar].Level1 == 0)
3328                                         continue;
3329
3330                                 for (int l = 1; l < length; l++) {
3331                                         int c = decompValues [start + l];
3332                                         if (map [c].Level1 != 0)
3333                                                 skip = true;
3334                                         secondary += diacritical [c];
3335                                 }
3336                                 if (skip)
3337                                         continue;
3338                                 map [i] = new CharMapEntry (
3339                                         map [primaryChar].Category,
3340                                         map [primaryChar].Level1,
3341                                         (byte) secondary);
3342                                 
3343                         }
3344
3345                         // Diacritical weight adjustment
3346
3347                         // Arabic Hamzah
3348                         diacritical [0x624] = 0x5;
3349                         diacritical [0x626] = 0x7;
3350                         diacritical [0x622] = 0x9;
3351                         diacritical [0x623] = 0xA;
3352                         diacritical [0x625] = 0xB;
3353                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
3354                         diacritical [0x64A] = 0x7; // Yaa'
3355
3356                         for (int i = 0; i < char.MaxValue; i++) {
3357                                 byte mod = 0;
3358                                 byte cat = map [i].Category;
3359                                 switch (cat) {
3360                                 case 0xE: // Latin diacritics
3361                                 case 0x22: // Japanese: circled characters
3362                                         mod = diacritical [i];
3363                                         break;
3364                                 case 0x13: // Arabic
3365                                         if (i == 0x0621)
3366                                                 break; // 0
3367                                         if (diacritical [i] == 0 && decompLength [i] != 0)
3368                                                 diacritical [i] = map [decompValues [decompIndex [i]]].Level2;
3369                                         if (diacritical [i] == 0 && i >= 0xFE8D)
3370                                                 mod = 0x8; // default for arabic
3371                                         break;
3372                                 }
3373                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
3374                                         mod = diacritical [i];
3375                                 if (mod > 0)
3376                                         map [i] = new CharMapEntry (
3377                                                 cat, map [i].Level1, mod);
3378                         }
3379
3380                         // FIXME: this is halfly hack but those NonSpacingMark 
3381                         // characters and still undefined are likely to
3382                         // be nonspacing.
3383                         for (int i = 0; i < char.MaxValue; i++) {
3384                                 if (map [i].Defined ||
3385                                         IsIgnorable (i))
3386                                         continue;
3387                                 switch (i) {
3388                                 // SPECIAL CASES.
3389                                 case 0x02B9:
3390                                 case 0x02BA:
3391                                         break;
3392                                 default:
3393                                         if (Char.GetUnicodeCategory ((char) i) !=
3394                                         UnicodeCategory.NonSpacingMark)
3395                                                 continue;
3396                                         break;
3397                                 }
3398                                 if (diacritical [i] != 0)
3399                                         map [i] = new CharMapEntry (1, 1, diacritical [i]);
3400                                 else
3401                                         AddCharMap ((char) i, 1, 1);
3402                         }
3403
3404                         #endregion
3405                 }
3406
3407                 TextInfo ti = CultureInfo.InvariantCulture.TextInfo;
3408
3409                 private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap)
3410                 {
3411                         if (map [i].Defined)
3412                                 return;
3413                         int up = (int) ti.ToUpper ((char) i);
3414                         if (checkUpper && map [up].Category == 0xF) {
3415                                 if (i == up)
3416                                         return;
3417                                 FillLetterNFKD (up, checkUpper, greekRemap);
3418                                 map [i] = new CharMapEntry (0xF,
3419                                         map [up].Level1,
3420                                         map [up].Level2);
3421                         } else {
3422                                 int idx = decompIndex [i];
3423                                 if (idx == 0)
3424                                         return;
3425                                 int primary = decompValues [decompIndex [i]];
3426                                 FillLetterNFKD (primary, checkUpper, greekRemap);
3427
3428                                 int lv2 = map [primary].Level2;
3429                                 byte off = 0;
3430                                 for (int l = 1; l < decompLength [i]; l++) {
3431                                         int tmp = decompValues [idx + l];
3432                                         if (map [tmp].Category != 1)
3433                                                 return;
3434                                         if (greekRemap && map [tmp].Level2 == 0xC)
3435                                                 off += 3;
3436                                         else
3437                                                 off += map [tmp].Level2;
3438                                 }
3439                                 if (off > 0) {
3440                                         if (lv2 == 0)
3441                                                 lv2 += 2;
3442                                         lv2 += off;
3443                                 }
3444                                 map [i] = new CharMapEntry (
3445                                         map [primary].Category,
3446                                         map [primary].Level1,
3447                                         (byte) lv2);
3448                         }
3449                 }
3450
3451                 private void IncrementSequentialIndex (ref byte hangulCat)
3452                 {
3453                         fillIndex [hangulCat]++;
3454                         if (fillIndex [hangulCat] == 0) { // overflown
3455                                 hangulCat++;
3456                                 fillIndex [hangulCat] = 0x2;
3457                         }
3458                 }
3459
3460                 // Reset fillIndex to fixed value and call AddLetterMap().
3461                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3462                 {
3463                         fillIndex [category] = alphaWeight;
3464                         AddLetterMap (c, category, 0);
3465
3466                         ArrayList al = latinMap [c] as ArrayList;
3467                         if (al == null)
3468                                 return;
3469
3470                         foreach (int cp in al)
3471                                 AddLetterMap ((char) cp, category, 0);
3472                 }
3473
3474                 private void AddKanaMap (int i, byte voices)
3475                 {
3476                         for (byte b = 0; b < voices; b++) {
3477                                 char c = (char) (i + b);
3478                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
3479                                 // Hiragana
3480                                 AddLetterMapCore (c, 0x22, 0, arg, false);
3481                                 // Katakana
3482                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
3483                         }
3484                 }
3485
3486                 private void AddLetterMap (char c, byte category, byte updateCount)
3487                 {
3488                         AddLetterMapCore (c, category, updateCount, 0, true);
3489                 }
3490
3491                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3492                 {
3493                         char c2;
3494                         // <small> updates index
3495                         c2 = ToSmallForm (c);
3496                         if (c2 != c)
3497                                 AddCharMapGroup (c2, category, updateCount, level2, deferLevel2);
3498                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3499                         if (c2 != c && !map [(int) c2].Defined)
3500                                 AddLetterMapCore (c2, category, 0, level2, deferLevel2);
3501                         bool doUpdate = true;
3502                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3503                                 doUpdate = false;
3504                         else
3505                                 AddCharMapGroup (c, category, 0, level2, deferLevel2);
3506                         if (doUpdate)
3507                                 fillIndex [category] += updateCount;
3508                 }
3509
3510                 private bool AddCharMap (char c, byte category, byte increment)
3511                 {
3512                         return AddCharMap (c, category, increment, 0);
3513                 }
3514                 
3515                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3516                 {
3517                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3518                                 return false; // do nothing
3519                         map [(int) c] = new CharMapEntry (category,
3520                                 category == 1 ? alt : fillIndex [category],
3521                                 category == 1 ? fillIndex [category] : alt);
3522                         fillIndex [category] += increment;
3523                         return true;
3524                 }
3525
3526                 //
3527                 // Adds characters to table in the order below 
3528                 // (+ increases weight):
3529                 //      (<small> +)
3530                 //      itself
3531                 //      <fraction>
3532                 //      <full> | <super> | <sub>
3533                 //      <circle> | <wide> (| <narrow>)
3534                 //      +
3535                 //      (vertical +)
3536                 //
3537                 // level2 is fixed (does not increase).
3538                 int [] sameWeightItems = new int [] {
3539                         DecompositionFraction,
3540                         DecompositionFull,
3541                         DecompositionSuper,
3542                         DecompositionSub,
3543                         DecompositionCircle,
3544                         DecompositionWide,
3545                         DecompositionNarrow,
3546                         };
3547                 private void AddCharMapGroup (char c, byte category, byte updateCount)
3548                 {
3549                         AddCharMapGroup (c, category, updateCount, 0, true);
3550                 }
3551
3552                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3553                 {
3554                         AddCharMapGroup (c, category, updateCount, level2, false);
3555                 }
3556
3557                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3558                 {
3559                         if (map [(int) c].Defined)
3560                                 return;
3561
3562                         if (deferLevel2)
3563                                 level2 = diacritical [(int) c];
3564
3565                         char small = char.MinValue;
3566                         char vertical = char.MinValue;
3567                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3568                         if (nfkd != null) {
3569                                 object smv = nfkd [(byte) DecompositionSmall];
3570                                 if (smv != null)
3571                                         small = (char) ((int) smv);
3572                                 object vv = nfkd [(byte) DecompositionVertical];
3573                                 if (vv != null)
3574                                         vertical = (char) ((int) vv);
3575                         }
3576
3577                         // <small> updates index
3578                         if (small != char.MinValue) {
3579                                 if (level2 == 0 && deferLevel2)
3580                                         level2 = diacritical [small];
3581                                 AddCharMap (small, category, updateCount, level2);
3582                         }
3583
3584                         // itself
3585                         AddCharMap (c, category, 0, level2);
3586
3587                         if (nfkd != null) {
3588                                 foreach (int weight in sameWeightItems) {
3589                                         object wv = nfkd [(byte) weight];
3590                                         if (wv != null) {
3591                                                 if (deferLevel2)
3592                                                         level2 = diacritical [(int) wv];
3593                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
3594                                         }
3595                                 }
3596                         }
3597
3598                         // update index here.
3599                         fillIndex [category] += updateCount;
3600
3601                         if (vertical != char.MinValue) {
3602                                 if (level2 == 0 && deferLevel2)
3603                                         level2 = diacritical [vertical];
3604                                 AddCharMap (vertical, category, updateCount, level2);
3605                         }
3606                 }
3607
3608                 private void AddCharMapCJK (char c, ref byte category)
3609                 {
3610                         AddCharMap (c, category, 0, 0);
3611                         IncrementSequentialIndex (ref category);
3612
3613                         // Special. I wonder why but Windows skips 9E F9.
3614                         if (category == 0x9E && fillIndex [category] == 0xF9)
3615                                 IncrementSequentialIndex (ref category);
3616                 }
3617
3618                 private void AddCharMapGroupCJK (char c, ref byte category)
3619                 {
3620                         AddCharMapCJK (c, ref category);
3621
3622                         // LAMESPEC: see below.
3623                         if (c == '\u5B78') {
3624                                 AddCharMapCJK ('\u32AB', ref category);
3625                                 AddCharMapCJK ('\u323B', ref category);
3626                         }
3627                         if (c == '\u52DE') {
3628                                 AddCharMapCJK ('\u3298', ref category);
3629                                 AddCharMapCJK ('\u3238', ref category);
3630                         }
3631                         if (c == '\u5BEB')
3632                                 AddCharMapCJK ('\u32A2', ref category);
3633                         if (c == '\u91AB')
3634                                 // Especially this mapping order totally does
3635                                 // not make sense to me.
3636                                 AddCharMapCJK ('\u32A9', ref category);
3637
3638                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3639                         if (nfkd == null)
3640                                 return;
3641                         for (byte weight = 0; weight <= 0x12; weight++) {
3642                                 object wv = nfkd [weight];
3643                                 if (wv == null)
3644                                         continue;
3645                                 int w = (int) wv;
3646
3647                                 // Special: they are ignored in this area.
3648                                 // FIXME: check if it is sane
3649                                 if (0xF900 <= w && w <= 0xFAD9)
3650                                         continue;
3651                                 // LAMESPEC: on Windows some of CJK characters
3652                                 // in 3200-32B0 are incorrectly mapped. They
3653                                 // mix Chinise and Japanese Kanji when
3654                                 // ordering those characters.
3655                                 switch (w) {
3656                                 case 0x32A2: case 0x3298: case 0x3238:
3657                                 case 0x32A9: case 0x323B: case 0x32AB:
3658                                         continue;
3659                                 }
3660
3661                                 AddCharMapCJK ((char) w, ref category);
3662                         }
3663                 }
3664
3665                 // For now it is only for 0x7 category.
3666                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3667                 {
3668                         if (map [(int) c].Defined)
3669                                 return;
3670
3671                         bool updateWeight = false;
3672                         // Process in advance (lower primary weight)
3673                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3674                                 if (!map [c2].Defined &&
3675                                         decompLength [c2] == 1 &&
3676                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3677                                         switch (decompType [c2]) {
3678                                         case DecompositionSmall:
3679                                                 updateWeight = true;
3680                                                 AddCharMap ((char) c2, category,
3681                                                         0, level2);
3682                                                 break;
3683                                         }
3684                                 }
3685                         }
3686                         if (updateWeight)
3687                                 fillIndex [category] = (byte)
3688                                         (fillIndex [category] + updateCount);
3689
3690                         // Identical weight
3691                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3692                                 if (!map [c2].Defined &&
3693                                         decompLength [c2] == 1 &&
3694                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3695                                         switch (decompType [c2]) {
3696                                         case DecompositionSub:
3697                                         case DecompositionSuper:
3698                                         case DecompositionWide:
3699                                         case DecompositionNarrow:
3700                                                 AddCharMap ((char) c2, category,
3701                                                         0, level2);
3702                                                 break;
3703                                         }
3704                                 }
3705                         }
3706
3707                         // itself
3708                         AddCharMap (c, category, updateCount, level2);
3709
3710                         // Since nfkdMap is problematic to have two or more
3711                         // NFKD to an identical character, here I iterate all.
3712                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3713                                 if (!map [c2].Defined &&
3714                                         decompLength [c2] == 1 &&
3715                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3716                                         switch (decompType [c2]) {
3717                                         case DecompositionWide:
3718                                         case DecompositionNarrow:
3719                                         case DecompositionSmall:
3720                                         case DecompositionSub:
3721                                         case DecompositionSuper:
3722                                                 continue;
3723                                         default:
3724                                                 AddCharMap ((char) c2, category, updateCount, level2);
3725                                                 break;
3726                                         }
3727                                 }
3728                         }
3729                 }
3730
3731                 private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2)
3732                 {
3733                         // itself
3734                         AddCharMap (c, category, 0, level2);
3735
3736                         // Since nfkdMap is problematic to have two or more
3737                         // NFKD to an identical character, here I iterate all.
3738                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3739                                 if (decompLength [c2] == 0)
3740                                         continue;
3741                                 int idx = decompIndex [c2] + decompLength [c2] - 1;
3742                                 if ((int) (decompValues [idx]) == (int) c)
3743                                         AddCharMap ((char) c2, category,
3744                                                 0, level2);
3745                         }
3746                         fillIndex [category] += updateCount;
3747                 }
3748
3749                 char ToSmallForm (char c)
3750                 {
3751                         return ToDecomposed (c, DecompositionSmall, false);
3752                 }
3753
3754                 char ToDecomposed (char c, byte d, bool tail)
3755                 {
3756                         if (decompType [(int) c] != d)
3757                                 return c;
3758                         int idx = decompIndex [(int) c];
3759                         if (tail)
3760                                 idx += decompLength [(int) c] - 1;
3761                         return (char) decompValues [idx];
3762                 }
3763
3764                 bool ExistsJIS (int cp)
3765                 {
3766                         foreach (JISCharacter j in jisJapanese)
3767                                 if (j.CP == cp)
3768                                         return true;
3769                         return false;
3770                 }
3771
3772                 #endregion
3773
3774                 #region Level 3 properties (Case/Width)
3775
3776                 private byte ComputeLevel3Weight (char c)
3777                 {
3778                         byte b = ComputeLevel3WeightRaw (c);
3779                         return b > 0 ? (byte) (b + 2) : b;
3780                 }
3781
3782                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3783                 {
3784                         // CJK compat
3785                         if ('\u3192' <= c && c <= '\u319F')
3786                                 return 0;
3787
3788                         // They have <narrow> NFKD mapping, and on Windows
3789                         // those narrow characters are regarded as "normal",
3790                         // thus those characters themselves are regarded as
3791                         // "wide". grep "<narrow>" and you can pick them up
3792                         // (ignoring Kana, Hangul etc.)
3793                         switch (c) {
3794                         case '\u3002':
3795                         case '\u300C':
3796                         case '\u300D':
3797                         case '\u3001':
3798                         case '\u30FB':
3799                         case '\u2502':
3800                         case '\u2190':
3801                         case '\u2191':
3802                         case '\u2192':
3803                         case '\u2193':
3804                         case '\u25A0':
3805                         case '\u25CB':
3806                                 return 1;
3807                         }
3808                         // Korean
3809                         if ('\u11A8' <= c && c <= '\u11F9')
3810                                 return 2;
3811                         if ('\uFFA0' <= c && c <= '\uFFDC')
3812                                 return 4;
3813                         if ('\u3130' <= c && c <= '\u3164')
3814                                 return 5;
3815                         if ('\u3165' <= c && c <= '\u318E')
3816                                 return 4;
3817                         // Georgian Capital letters
3818                         if ('\u10A0' <= c && c <= '\u10C5')
3819                                 return 0x10;
3820                         // numbers
3821                         if ('\u2776' <= c && c <= '\u277F')
3822                                 return 4;
3823                         if ('\u2780' <= c && c <= '\u2789')
3824                                 return 8;
3825                         if ('\u2776' <= c && c <= '\u2793')
3826                                 return 0xC;
3827                         if ('\u2160' <= c && c <= '\u216F')
3828                                 return 0x10;
3829                         if ('\u2181' <= c && c <= '\u2182')
3830                                 return 0x10;
3831                         // Arabic
3832                         if ('\u2135' <= c && c <= '\u2138')
3833                                 return 4;
3834                         // I believe that Windows has a bug on setting level 3
3835                         // weight here. NFKD results in different values.
3836                         if ('\uFE80' < c && c < '\uFF00') {
3837                                 // 2(Isolated)/8(Final)/0x18(Medial)
3838                                 switch (decompType [(int) c]) {
3839                                 case DecompositionIsolated:
3840                                         return 0; // 2;
3841                                 case DecompositionFinal:
3842                                         return 8;
3843                                 case DecompositionMedial:
3844                                         return 0x18;
3845                                 case DecompositionInitial:
3846                                         return 0x10;
3847                                 }
3848                         }
3849
3850                         // I have no idea why those symbols have level 3 weight
3851                         if (c == '\u2104' || c == '\u212B')
3852                                 return 0x18;
3853                         if ('\u211E' <= c && c <= '\u212B')
3854                                 return 0x10;
3855
3856                         // actually I dunno the reason why they have weights.
3857                         switch (c) {
3858                         case '\u01BC':
3859                                 return 0x10;
3860                         case '\u06A9':
3861                                 return 0x20;
3862                         case '\u06AA':
3863                                 return 0x28;
3864                         // Gurmukhi
3865                         case '\u0A39':
3866                         case '\u0A59':
3867                         case '\u0A5A':
3868                         case '\u0A5B':
3869                         case '\u0A5E':
3870                                 return 0x10;
3871                         }
3872
3873                         byte ret = 0;
3874                         switch (c) {
3875                         case '\u03C2':
3876                         case '\u212B':
3877                                 ret = 8;
3878                                 break;
3879                         case '\uFE42':
3880                                 ret = 0xA;
3881                                 break;
3882                         }
3883
3884                         // misc
3885                         switch (decompType [(int) c]) {
3886                         case DecompositionWide: // <wide>
3887                         case DecompositionSub: // <sub>
3888                         case DecompositionSuper: // <super>
3889                                 ret |= decompType [(int) c];
3890                                 break;
3891                         }
3892                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3893                                 ret |= 8;
3894                         if (isUppercase [(int) c]) // DerivedCoreProperties
3895                                 ret |= 0x10;
3896
3897                         return ret;
3898                 }
3899
3900                 #endregion
3901
3902                 #region IsIgnorable
3903 /*
3904                 static bool IsIgnorable (int i)
3905                 {
3906                         if (unicodeAge [i] >= 3.1)
3907                                 return true;
3908                         switch (char.GetUnicodeCategory ((char) i)) {
3909                         case UnicodeCategory.OtherNotAssigned:
3910                         case UnicodeCategory.Format:
3911                                 return true;
3912                         }
3913                         return false;
3914                 }
3915 */
3916
3917                 // FIXME: In the future use DerivedAge.txt to examine character
3918                 // versions and set those ones that have higher version than
3919                 // 1.0 as ignorable.
3920                 static bool IsIgnorable (int i)
3921                 {
3922                         switch (i) {
3923                         case 0:
3924                         // I guess, those characters are added between
3925                         // Unicode 1.0 (LCMapString) and Unicode 3.1
3926                         // (UnicodeCategory), so they used to be 
3927                         // something like OtherNotAssigned as of Unicode 1.1.
3928                         case 0x2df: case 0x387:
3929                         case 0x3d7: case 0x3d8: case 0x3d9:
3930                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3931                         case 0x400: case 0x40d: case 0x450: case 0x45d:
3932                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
3933                         case 0x653: case 0x654: case 0x655: case 0x66d:
3934                         case 0xb56:
3935                         case 0x1e9b: case 0x202f: case 0x20ad:
3936                         case 0x20ae: case 0x20af:
3937                         case 0x20e2: case 0x20e3:
3938                         case 0x2139: case 0x213a: case 0x2183:
3939                         case 0x2425: case 0x2426: case 0x2619:
3940                         case 0x2670: case 0x2671: case 0x3007:
3941                         case 0x3190: case 0x3191:
3942                         case 0xfffc: case 0xfffd:
3943                                 return true;
3944                         // exceptional characters filtered by the 
3945                         // following conditions. Originally those exceptional
3946                         // ranges are incorrect (they should not be ignored)
3947                         // and most of those characters are unfortunately in
3948                         // those ranges.
3949                         case 0x4d8: case 0x4d9:
3950                         case 0x4e8: case 0x4e9:
3951                         case 0x70F:
3952                         case 0x3036: case 0x303f:
3953                         case 0x337b: case 0xfb1e:
3954                                 return false;
3955                         }
3956
3957                         if (
3958                                 // The whole Sinhala characters.
3959                                 0x0D82 <= i && i <= 0x0DF4
3960                                 // The whole Tibetan characters.
3961                                 || 0x0F00 <= i && i <= 0x0FD1
3962                                 // The whole Myanmar characters.
3963                                 || 0x1000 <= i && i <= 0x1059
3964                                 // The whole Etiopic, Cherokee, 
3965                                 // Canadian Syllablic, Ogham, Runic,
3966                                 // Tagalog, Hanunoo, Philippine,
3967                                 // Buhid, Tagbanwa, Khmer and Mongorian
3968                                 // characters.
3969                                 || 0x1200 <= i && i <= 0x1DFF
3970                                 // Greek extension characters.
3971                                 || 0x1F00 <= i && i <= 0x1FFF
3972                                 // The whole Braille characters.
3973                                 || 0x2800 <= i && i <= 0x28FF
3974                                 // CJK radical characters.
3975                                 || 0x2E80 <= i && i <= 0x2EF3
3976                                 // Kangxi radical characters.
3977                                 || 0x2F00 <= i && i <= 0x2FD5
3978                                 // Ideographic description characters.
3979                                 || 0x2FF0 <= i && i <= 0x2FFB
3980                                 // Bopomofo letter and final
3981                                 || 0x31A0 <= i && i <= 0x31B7
3982                                 // White square with quadrant characters.
3983                                 || 0x25F0 <= i && i <= 0x25F7
3984                                 // Ideographic telegraph symbols.
3985                                 || 0x32C0 <= i && i <= 0x32CB
3986                                 || 0x3358 <= i && i <= 0x3370
3987                                 || 0x33E0 <= i && i <= 0x33FF
3988                                 // The whole YI characters.
3989                                 || 0xA000 <= i && i <= 0xA48C
3990                                 || 0xA490 <= i && i <= 0xA4C6
3991                                 // American small ligatures
3992                                 || 0xFB13 <= i && i <= 0xFB17
3993                                 // hebrew, arabic, variation selector.
3994                                 || 0xFB1D <= i && i <= 0xFE2F
3995                                 // Arabic ligatures.
3996                                 || 0xFEF5 <= i && i <= 0xFEFC
3997                                 // FIXME: why are they excluded?
3998                                 || 0x01F6 <= i && i <= 0x01F9
3999                                 || 0x0218 <= i && i <= 0x0233
4000                                 || 0x02A9 <= i && i <= 0x02AD
4001                                 || 0x02EA <= i && i <= 0x02EE
4002                                 || 0x0349 <= i && i <= 0x036F
4003                                 || 0x0488 <= i && i <= 0x048F
4004                                 || 0x04D0 <= i && i <= 0x04FF
4005                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
4006                                 || 0x06D6 <= i && i <= 0x06ED
4007                                 || 0x06FA <= i && i <= 0x06FE
4008                                 || 0x2048 <= i && i <= 0x204D
4009                                 || 0x20e4 <= i && i <= 0x20ea
4010                                 || 0x213C <= i && i <= 0x214B
4011                                 || 0x21EB <= i && i <= 0x21FF
4012                                 || 0x22F2 <= i && i <= 0x22FF
4013                                 || 0x237B <= i && i <= 0x239A
4014                                 || 0x239B <= i && i <= 0x23CF
4015                                 || 0x24EB <= i && i <= 0x24FF
4016                                 || 0x2596 <= i && i <= 0x259F
4017                                 || 0x25F8 <= i && i <= 0x25FF
4018                                 || 0x2672 <= i && i <= 0x2689
4019                                 || 0x2768 <= i && i <= 0x2775
4020                                 || 0x27d0 <= i && i <= 0x27ff
4021                                 || 0x2900 <= i && i <= 0x2aff
4022                                 || 0x3033 <= i && i <= 0x303F
4023                                 || 0x31F0 <= i && i <= 0x31FF
4024                                 || 0x3250 <= i && i <= 0x325F
4025                                 || 0x32B1 <= i && i <= 0x32BF
4026                                 || 0x3371 <= i && i <= 0x337B
4027                                 || 0xFA30 <= i && i <= 0xFA6A
4028                         )
4029                                 return true;
4030
4031                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4032                         switch (uc) {
4033                         case UnicodeCategory.PrivateUse:
4034                         case UnicodeCategory.Surrogate:
4035                                 return false;
4036                         // ignored by nature
4037                         case UnicodeCategory.Format:
4038                         case UnicodeCategory.OtherNotAssigned:
4039                                 return true;
4040                         default:
4041                                 return false;
4042                         }
4043                 }
4044
4045                 // To check IsIgnorable sanity, try the driver below under MS.NET.
4046
4047                 /*
4048                 public static void Main ()
4049                 {
4050                         for (int i = 0; i <= char.MaxValue; i++)
4051                                 Dump (i, IsIgnorable (i));
4052                 }
4053
4054                 static void Dump (int i, bool ignore)
4055                 {
4056                         switch (Char.GetUnicodeCategory ((char) i)) {
4057                         case UnicodeCategory.PrivateUse:
4058                         case UnicodeCategory.Surrogate:
4059                                 return; // check nothing
4060                         }
4061
4062                         string s1 = "";
4063                         string s2 = new string ((char) i, 10);
4064                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
4065                         if ((ret == 0) == ignore)
4066                                 return;
4067                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
4068                 }
4069                 */
4070                 #endregion // IsIgnorable
4071
4072                 #region IsIgnorableSymbol
4073                 static bool IsIgnorableSymbol (int i)
4074                 {
4075                         if (IsIgnorable (i))
4076                                 return true;
4077
4078                         switch (i) {
4079                         // *Letter
4080                         case 0x00b5: case 0x01C0: case 0x01C1:
4081                         case 0x01C2: case 0x01C3: case 0x01F6:
4082                         case 0x01F7: case 0x01F8: case 0x01F9:
4083                         case 0x02D0: case 0x02EE: case 0x037A:
4084                         case 0x03D7: case 0x03F3:
4085                         case 0x0400: case 0x040d:
4086                         case 0x0450: case 0x045d:
4087                         case 0x048C: case 0x048D:
4088                         case 0x048E: case 0x048F:
4089                         case 0x0587: case 0x0640: case 0x06E5:
4090                         case 0x06E6: case 0x06FA: case 0x06FB:
4091                         case 0x06FC: case 0x093D: case 0x0950:
4092                         case 0x1E9B: case 0x2139: case 0x3006:
4093                         case 0x3033: case 0x3034: case 0x3035:
4094                         case 0xFE7E: case 0xFE7F:
4095                         // OtherNumber
4096                         case 0x16EE: case 0x16EF: case 0x16F0:
4097                         // LetterNumber
4098                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
4099                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
4100                         case 0x3038: // HANGZHOU NUMERAL TEN
4101                         case 0x3039: // HANGZHOU NUMERAL TWENTY
4102                         case 0x303a: // HANGZHOU NUMERAL THIRTY
4103                         // OtherSymbol
4104                         case 0x2117:
4105                         case 0x327F:
4106                                 return true;
4107                         // ModifierSymbol
4108                         case 0x02B9: case 0x02BA: case 0x02C2:
4109                         case 0x02C3: case 0x02C4: case 0x02C5:
4110                         case 0x02C8: case 0x02CC: case 0x02CD:
4111                         case 0x02CE: case 0x02CF: case 0x02D2:
4112                         case 0x02D3: case 0x02D4: case 0x02D5:
4113                         case 0x02D6: case 0x02D7: case 0x02DE:
4114                         case 0x02E5: case 0x02E6: case 0x02E7:
4115                         case 0x02E8: case 0x02E9:
4116                         case 0x309B: case 0x309C:
4117                         // OtherPunctuation
4118                         case 0x055A: // American Apos
4119                         case 0x05C0: // Hebrew Punct
4120                         case 0x0E4F: // Thai FONGMAN
4121                         case 0x0E5A: // Thai ANGKHANKHU
4122                         case 0x0E5B: // Thai KHOMUT
4123                         // CurencySymbol
4124                         case 0x09F2: // Bengali Rupee Mark
4125                         case 0x09F3: // Bengali Rupee Sign
4126                         // MathSymbol
4127                         case 0x221e: // INF.
4128                         // OtherSymbol
4129                         case 0x0482:
4130                         case 0x09FA:
4131                         case 0x0B70:
4132                                 return false;
4133                         }
4134
4135                         // *Letter
4136                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
4137 #if NET_2_0
4138                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
4139                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
4140 #endif
4141                         )
4142                                 return true;
4143
4144                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4145                         switch (uc) {
4146                         case UnicodeCategory.Surrogate:
4147                                 return false; // inconsistent
4148
4149                         case UnicodeCategory.SpacingCombiningMark:
4150                         case UnicodeCategory.EnclosingMark:
4151                         case UnicodeCategory.NonSpacingMark:
4152                         case UnicodeCategory.PrivateUse:
4153                                 // NonSpacingMark
4154                                 if (0x064B <= i && i <= 0x0652) // Arabic
4155                                         return true;
4156                                 return false;
4157
4158                         case UnicodeCategory.Format:
4159                         case UnicodeCategory.OtherNotAssigned:
4160                                 return true;
4161
4162                         default:
4163                                 bool use = false;
4164                                 // OtherSymbols
4165                                 if (
4166                                         // latin in a circle
4167                                         0x249A <= i && i <= 0x24E9
4168                                         || 0x2100 <= i && i <= 0x2132
4169                                         // Japanese
4170                                         || 0x3196 <= i && i <= 0x31A0
4171                                         // Korean
4172                                         || 0x3200 <= i && i <= 0x321C
4173                                         // Chinese/Japanese
4174                                         || 0x322A <= i && i <= 0x3243
4175                                         // CJK
4176                                         || 0x3260 <= i && i <= 0x32B0
4177                                         || 0x32D0 <= i && i <= 0x3357
4178                                         || 0x337B <= i && i <= 0x33DD
4179                                 )
4180                                         use = !Char.IsLetterOrDigit ((char) i);
4181                                 if (use)
4182                                         return false;
4183
4184                                 // This "Digit" rule is mystery.
4185                                 // It filters some symbols out.
4186                                 if (Char.IsLetterOrDigit ((char) i))
4187                                         return false;
4188                                 if (Char.IsNumber ((char) i))
4189                                         return false;
4190                                 if (Char.IsControl ((char) i)
4191                                         || Char.IsSeparator ((char) i)
4192                                         || Char.IsPunctuation ((char) i))
4193                                         return true;
4194                                 if (Char.IsSymbol ((char) i))
4195                                         return true;
4196
4197                                 // FIXME: should check more
4198                                 return false;
4199                         }
4200                 }
4201
4202                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
4203 /*
4204                 public static void Main ()
4205                 {
4206                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
4207                         for (int i = 0; i <= char.MaxValue; i++) {
4208                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4209                                 if (uc == UnicodeCategory.Surrogate)
4210                                         continue;
4211
4212                                 bool ret = IsIgnorableSymbol (i);
4213
4214                                 string s1 = "TEST ";
4215                                 string s2 = "TEST " + (char) i;
4216
4217                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
4218
4219                                 if (ret != (result == 0))
4220                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
4221                                                 ret ? "should not ignore" :
4222                                                         "should ignore",
4223                                                 i,(char) i, uc);
4224                         }
4225                 }
4226 */
4227                 #endregion
4228
4229                 #region NonSpacing
4230                 static bool IsIgnorableNonSpacing (int i)
4231                 {
4232                         if (IsIgnorable (i))
4233                                 return true;
4234
4235                         switch (i) {
4236                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
4237                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
4238                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
4239                                 return true;
4240                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
4241                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
4242                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
4243                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
4244                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
4245                         case 0x0CCD: case 0x0E4E:
4246                                 return false;
4247                         }
4248
4249                         if (0x02b9 <= i && i <= 0x02c5
4250                                 || 0x02cc <= i && i <= 0x02d7
4251                                 || 0x02e4 <= i && i <= 0x02ef
4252                                 || 0x20DD <= i && i <= 0x20E0
4253                         )
4254                                 return true;
4255
4256                         if (0x064B <= i && i <= 0x00652
4257                                 || 0x0941 <= i && i <= 0x0948
4258                                 || 0x0AC1 <= i && i <= 0x0ACD
4259                                 || 0x0C3E <= i && i <= 0x0C4F
4260                                 || 0x0E31 <= i && i <= 0x0E3F
4261                         )
4262                                 return false;
4263
4264                         return Char.GetUnicodeCategory ((char) i) ==
4265                                 UnicodeCategory.NonSpacingMark;
4266                 }
4267
4268                 // We can reuse IsIgnorableSymbol testcode 
4269                 // for IsIgnorableNonSpacing.
4270                 #endregion
4271         }
4272
4273         struct CharMapEntry
4274         {
4275                 public byte Category;
4276                 public byte Level1;
4277                 public byte Level2; // It is always single byte.
4278                 public bool Defined;
4279
4280                 public CharMapEntry (byte category, byte level1, byte level2)
4281                 {
4282                         Category = category;
4283                         Level1 = level1;
4284                         Level2 = level2;
4285                         Defined = true;
4286                 }
4287         }
4288
4289         class JISCharacter
4290         {
4291                 public readonly int CP;
4292                 public readonly int JIS;
4293
4294                 public JISCharacter (int cp, int cpJIS)
4295                 {
4296                         CP = cp;
4297                         JIS = cpJIS;
4298                 }
4299         }
4300
4301         class JISComparer : IComparer
4302         {
4303                 public static readonly JISComparer Instance =
4304                         new JISComparer ();
4305
4306                 public int Compare (object o1, object o2)
4307                 {
4308                         JISCharacter j1 = (JISCharacter) o1;
4309                         JISCharacter j2 = (JISCharacter) o2;
4310                         return j1.JIS - j2.JIS;
4311                 }
4312         }
4313
4314         class NonJISCharacter
4315         {
4316                 public readonly int CP;
4317                 public readonly string Name;
4318
4319                 public NonJISCharacter (int cp, string name)
4320                 {
4321                         CP = cp;
4322                         Name = name;
4323                 }
4324         }
4325
4326         class NonJISComparer : IComparer
4327         {
4328                 public static readonly NonJISComparer Instance =
4329                         new NonJISComparer ();
4330
4331                 public int Compare (object o1, object o2)
4332                 {
4333                         NonJISCharacter j1 = (NonJISCharacter) o1;
4334                         NonJISCharacter j2 = (NonJISCharacter) o2;
4335                         return string.CompareOrdinal (j1.Name, j2.Name);
4336                 }
4337         }
4338
4339         class DecimalDictionaryValueComparer : IComparer
4340         {
4341                 public static readonly DecimalDictionaryValueComparer Instance
4342                         = new DecimalDictionaryValueComparer ();
4343
4344                 private DecimalDictionaryValueComparer ()
4345                 {
4346                 }
4347
4348                 public int Compare (object o1, object o2)
4349                 {
4350                         DictionaryEntry e1 = (DictionaryEntry) o1;
4351                         DictionaryEntry e2 = (DictionaryEntry) o2;
4352                         // FIXME: in case of 0, compare decomposition categories
4353                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
4354                         if (ret != 0)
4355                                 return ret;
4356                         int i1 = (int) e1.Key;
4357                         int i2 = (int) e2.Key;
4358                         return i1 - i2;
4359                 }
4360         }
4361
4362         class StringDictionaryValueComparer : IComparer
4363         {
4364                 public static readonly StringDictionaryValueComparer Instance
4365                         = new StringDictionaryValueComparer ();
4366
4367                 private StringDictionaryValueComparer ()
4368                 {
4369                 }
4370
4371                 public int Compare (object o1, object o2)
4372                 {
4373                         DictionaryEntry e1 = (DictionaryEntry) o1;
4374                         DictionaryEntry e2 = (DictionaryEntry) o2;
4375                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
4376                         if (ret != 0)
4377                                 return ret;
4378                         int i1 = (int) e1.Key;
4379                         int i2 = (int) e2.Key;
4380                         return i1 - i2;
4381                 }
4382         }
4383
4384         class UCAComparer : IComparer
4385         {
4386                 public static readonly UCAComparer Instance
4387                         = new UCAComparer ();
4388
4389                 private UCAComparer ()
4390                 {
4391                 }
4392
4393                 public int Compare (object o1, object o2)
4394                 {
4395                         char i1 = (char) o1;
4396                         char i2 = (char) o2;
4397
4398                         int l1 = CollationElementTable.GetSortKeyCount (i1);
4399                         int l2 = CollationElementTable.GetSortKeyCount (i2);
4400                         int l = l1 > l2 ? l2 : l1;
4401
4402                         for (int i = 0; i < l; i++) {
4403                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4404                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
4405                                 int v = k1.Primary - k2.Primary;
4406                                 if (v != 0)
4407                                         return v;
4408                                 v = k1.Secondary - k2.Secondary;
4409                                 if (v != 0)
4410                                         return v;
4411                                 v = k1.Thirtiary - k2.Thirtiary;
4412                                 if (v != 0)
4413                                         return v;
4414                                 v = k1.Quarternary - k2.Quarternary;
4415                                 if (v != 0)
4416                                         return v;
4417                         }
4418                         return l1 - l2;
4419                 }
4420         }
4421
// Collects the tailoring entries registered for one LCID and serializes
// them, in insertion order, into a single flat char array.
//
// Serialized entry layouts (one char per slot, '\0' is the terminator):
//
//      SortKeyMap     : 01, source chars, '\0', 4 sort key values, '\0'
//      DiacriticalMap : 02, target, replace
//      ReplacementMap : 03, source chars, '\0', replace chars, '\0'
class Tailoring
{
        // Tag values written as the first char of each serialized entry.
        const char KindSortKeyMap = (char) 01;
        const char KindDiacriticalMap = (char) 02;
        const char KindReplacementMap = (char) 03;

        // Number of leading SortKey bytes emitted by a SortKeyMap entry.
        const int EmittedSortKeyBytes = 4;

        readonly int lcid;
        readonly int alias;
        bool frenchSort;

        // ITailoringMap instances, kept in the order they were added.
        readonly ArrayList maps = new ArrayList ();

        // Creates a tailoring for lcid whose Alias is 0.
        public Tailoring (int lcid)
                : this (lcid, 0)
        {
        }

        // Creates a tailoring for lcid with the given alias value.
        public Tailoring (int lcid, int alias)
        {
                this.alias = alias;
                this.lcid = lcid;
        }

        // The LCID passed to the constructor.
        public int LCID {
                get { return this.lcid; }
        }

        // The alias passed to the constructor (0 when none was given).
        public int Alias {
                get { return this.alias; }
        }

        // Read/write flag; false until explicitly set.
        public bool FrenchSort {
                get { return this.frenchSort; }
                set { this.frenchSort = value; }
        }

        // Appends a DiacriticalMap entry (target -> replace).
        public void AddDiacriticalMap (byte target, byte replace)
        {
                ITailoringMap entry = new DiacriticalMap (target, replace);
                maps.Add (entry);
        }

        // Appends a SortKeyMap entry. Only the first four bytes of
        // sortkey are used when the entry is serialized.
        public void AddSortKeyMap (string source, byte [] sortkey)
        {
                ITailoringMap entry = new SortKeyMap (source, sortkey);
                maps.Add (entry);
        }

        // Appends a ReplacementMap entry (source -> replace).
        public void AddReplacementMap (string source, string replace)
        {
                ITailoringMap entry = new ReplacementMap (source, replace);
                maps.Add (entry);
        }

        // Returns the concatenation of every entry's serialized form,
        // in insertion order. The result is empty when nothing was added.
        public char [] ItemToCharArray ()
        {
                // First pass: serialize each entry and sum up the sizes.
                char [][] chunks = new char [maps.Count][];
                int total = 0;
                for (int i = 0; i < chunks.Length; i++) {
                        chunks [i] = ((ITailoringMap) maps [i]).ToCharArray ();
                        total += chunks [i].Length;
                }

                // Second pass: lay the chunks out back to back.
                char [] result = new char [total];
                int offset = 0;
                foreach (char [] chunk in chunks) {
                        Array.Copy (chunk, 0, result, offset, chunk.Length);
                        offset += chunk.Length;
                }
                return result;
        }

        // A tailoring entry that can serialize itself to chars.
        interface ITailoringMap
        {
                char [] ToCharArray ();
        }

        // Entry made of a (target, replace) byte pair.
        class DiacriticalMap : ITailoringMap
        {
                public readonly byte Target;
                public readonly byte Replace;

                public DiacriticalMap (byte target, byte replace)
                {
                        Target = target;
                        Replace = replace;
                }

                // Layout: tag, target, replace.
                public char [] ToCharArray ()
                {
                        return new char [] {
                                KindDiacriticalMap,
                                (char) Target,
                                (char) Replace
                        };
                }
        }

        // Entry that pairs a source string with sort key bytes.
        class SortKeyMap : ITailoringMap
        {
                public readonly string Source;
                public readonly byte [] SortKey;

                public SortKeyMap (string source, byte [] sortkey)
                {
                        Source = source;
                        SortKey = sortkey;
                }

                // Layout: tag, source chars, '\0', SortKey [0..3], '\0'.
                public char [] ToCharArray ()
                {
                        int sourceLength = Source.Length;
                        // tag (1) + source + terminator (1) +
                        // sort key (4) + one spare zero slot at the end
                        char [] buf = new char [sourceLength + 7];
                        buf [0] = KindSortKeyMap;
                        Source.CopyTo (0, buf, 1, sourceLength);
                        // buf [sourceLength + 1] keeps its default '\0',
                        // which terminates the source string.
                        int dest = sourceLength + 2;
                        for (int k = 0; k < EmittedSortKeyBytes; k++)
                                buf [dest++] = (char) SortKey [k];
                        return buf;
                }
        }

        // Entry that pairs a source string with a replacement string.
        class ReplacementMap : ITailoringMap
        {
                public readonly string Source;
                public readonly string Replace;

                public ReplacementMap (string source, string replace)
                {
                        Source = source;
                        Replace = replace;
                }

                // Layout: tag, source chars, '\0', replace chars, '\0'.
                public char [] ToCharArray ()
                {
                        int sourceLength = Source.Length;
                        int replaceLength = Replace.Length;
                        // tag (1) + source + '\0' + replace + '\0'
                        char [] buf = new char [sourceLength + replaceLength + 3];
                        buf [0] = KindReplacementMap;
                        Source.CopyTo (0, buf, 1, sourceLength);
                        // Skip one slot after the source so it stays '\0';
                        // the last slot is likewise never written.
                        Replace.CopyTo (0, buf, sourceLength + 2, replaceLength);
                        return buf;
                }
        }
}
4553 }