2005-07-19 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
1 //
2 //
3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
5 //
6 //      - Surrogate
7 //      - PrivateUse
8 //
9 // Also, a different index table should be prepared for composite characters.
10 //
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
13 //
14
15 //
16 // * sortkey getter signature
17 //
18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
19 //      Stores sort key for corresponding character element into buf and
20 //      returns the length of the consumed _source_ character element in s.
21 //
22 // * character length to consume
23 //
24 //      If there are characters whose primary weight is 0, they are consumed
25 //      and considered as a part of the character element.
26 //
27 #define Binary
28
29 using System;
30 using System.IO;
31 using System.Collections;
32 using System.Globalization;
33 using System.Text;
34 using System.Xml;
35
36 namespace Mono.Globalization.Unicode
37 {
38         internal class MSCompatSortKeyTableGenerator
39         {
// Program entry point: creates a generator instance and hands it the
// command line arguments unchanged.
public static void Main (string [] args)
{
        MSCompatSortKeyTableGenerator generator =
                new MSCompatSortKeyTableGenerator ();
        generator.Run (args);
}
44
45                 const int DecompositionWide = 1; // fixed
46                 const int DecompositionSub = 2; // fixed
47                 const int DecompositionSmall = 3;
48                 const int DecompositionIsolated = 4;
49                 const int DecompositionInitial = 5;
50                 const int DecompositionFinal = 6;
51                 const int DecompositionMedial = 7;
52                 const int DecompositionNoBreak = 8;
53                 const int DecompositionVertical = 9;
54                 const int DecompositionFraction = 0xA;
55                 const int DecompositionFont = 0xB;
56                 const int DecompositionSuper = 0xC; // fixed
57                 const int DecompositionFull = 0xE;
58                 const int DecompositionNarrow = 0xD;
59                 const int DecompositionCircle = 0xF;
60                 const int DecompositionSquare = 0x10;
61                 const int DecompositionCompat = 0x11;
62                 const int DecompositionCanonical = 0x12;
63
64                 TextWriter Result = Console.Out;
65
66                 byte [] fillIndex = new byte [256]; // by category
67                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
68
69                 char [] specialIgnore = new char [] {
70                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
72                         };
73
74                 // FIXME: need more love (as always)
75                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78                         '\u0292', '\u01BE', '\u0298'};
79                 byte [] alphaWeights = new byte [] {
80                         2, 9, 0xA, 0x1A, 0x21,
81                         0x23, 0x25, 0x2C, 0x32, 0x35,
82                         0x36, 0x48, 0x51, 0x70, 0x7C,
83                         0x7E, 0x89, 0x8A, 0x91, 0x99,
84                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85                         0xA9, 0xAA, 0xB3, 0xB4};
86
87                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88                 bool [] isUppercase = new bool [char.MaxValue + 1];
89
90                 byte [] decompType = new byte [char.MaxValue + 1];
91                 int [] decompIndex = new int [char.MaxValue + 1];
92                 int [] decompLength = new int [char.MaxValue + 1];
93                 int [] decompValues;
94                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
95
96                 byte [] diacritical = new byte [char.MaxValue + 1];
97
98                 string [] diacritics = new string [] {
99                         // LATIN, CYRILLIC etc.
100                         "UPTURN", "DOUBLE-STRUCK",
101                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
102                         "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
103                         "WITH ACUTE;", "WITH GRAVE;",
104                         //
105                         "WITH DOT ABOVE;", " MIDDLE DOT;",
106                         "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
107                         "WITH DIALYTIKA;",
108                         "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
109                         "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
110                         "WITH OGONEK;", "WITH CEDILLA;",
111                         //
112                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
113                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
114                         "STROKE OVERLAY",
115                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
116                         " DIAERESIS AND GRAVE;",
117                         " BREVE AND ACUTE;",
118                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
119                         " MACRON AND ACUTE;",
120                         " MACRON AND GRAVE;",
121                         //
122                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
123                         " RING ABOVE AND ACUTE",
124                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
125                         " CIRCUMFLEX AND TILDE",
126                         " TILDE AND DIAERESIS",
127                         " STROKE AND ACUTE",
128                         " BREVE AND TILDE",
129                         " CEDILLA AND BREVE",
130                         " OGONEK AND MACRON",
131                         //
132                         "WITH OVERLINE",
133                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
134                         " DOUBLE GRAVE",
135                         " INVERTED BREVE",
136                         "ROMAN NUMERAL",
137                         " PRECEDED BY APOSTROPHE",
138                         "WITH HORN;",
139                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
140                         " PALATAL HOOK",
141                         " DOT BELOW;",
142                         " RETROFLEX;", "DIAERESIS BELOW",
143                         " RING BELOW",
144                         //
145                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
146                         " BREVE BELOW;", " HORN AND GRAVE",
147                         " TILDE BELOW",
148                         " TOPBAR",
149                         " DOT BELOW AND DOT ABOVE",
150                         " RIGHT HALF RING", " HORN AND TILDE",
151                         " CIRCUMFLEX AND DOT BELOW",
152                         " BREVE AND DOT BELOW",
153                         " DOT BELOW AND MACRON",
154                         " TONE TWO",
155                         " HORN AND HOOK ABOVE",
156                         " HORN AND DOT",
157                         // CIRCLED, PARENTHESIZED and so on
158                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
159                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
160                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
161                         };
162                 byte [] diacriticWeights = new byte [] {
163                         // LATIN.
164                         3, 3, 5, 5, 5,
165                         0xE, 0xF,
166                         0xE, 0xF,
167                         //
168                         0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
169                         0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
170                         //
171                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
172                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
173                         //
174                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
175                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
176                         //
177                         0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
178                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
179                         //
180                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
181                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
182                         0x87, 0x95, 0xAA,
183                         // CIRCLED, PARENTHESIZED and so on.
184                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
185                         0xF3, 0xF3, 0xF3
186                         };
187
188                 int [] numberSecondaryWeightBounds = new int [] {
189                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
190                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
191                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
192                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
193                         0xE50, 0xE60, 0xED0, 0xEE0
194                         };
195
196                 char [] orderedGurmukhi;
197                 char [] orderedGujarati;
198                 char [] orderedGeorgian;
199                 char [] orderedThaana;
200
201                 static readonly char [] orderedTamilConsonants = new char [] {
202                         // based on traditional Tamil consonants, except for
203                         // Grantha (where Microsoft breaks traditionalism).
204                         // http://www.angelfire.com/empire/thamizh/padanGaL
205                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
206                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
207                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
208                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
209                         '\u0BB7', '\u0BB9'};
210
211                 // cp -> character name (only for some characters)
212                 ArrayList sortableCharNames = new ArrayList ();
213
214                 // cp -> arrow value (int)
215                 ArrayList arrowValues = new ArrayList ();
216
217                 // cp -> box value (int)
218                 ArrayList boxValues = new ArrayList ();
219
220                 // cp -> level1 value
221                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
222
223                 // letterName -> cp
224                 Hashtable arabicNameMap = new Hashtable ();
225
226                 // cp -> Hashtable [decompType] -> cp
227                 Hashtable nfkdMap = new Hashtable ();
228
229                 // Latin letter -> ArrayList [int]
230                 Hashtable latinMap = new Hashtable ();
231
232                 ArrayList jisJapanese = new ArrayList ();
233                 ArrayList nonJisJapanese = new ArrayList ();
234
235                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
236                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
237                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
238                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
239                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
240
241                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
242
243                 static double [] unicodeAge = new double [char.MaxValue + 1];
244
245                 ArrayList tailorings = new ArrayList ();
246
// Drives the whole generation: parses the source data, post-processes and
// generates the tables, then serializes them. A progress message is written
// to stderr after each phase.
//
// args [0], when present, is the directory that holds the source data files;
// without arguments the directory "downloaded" is used.
void Run (string [] args)
{
        string dirname;
        if (args.Length > 0)
                dirname = args [0];
        else
                dirname = "downloaded";

        ParseSources (dirname);
        Console.Error.WriteLine ("parse done.");

        ModifyParsedValues ();
        GenerateCore ();
        Console.Error.WriteLine ("generation done.");

        Serialize ();
        Console.Error.WriteLine ("serialization done.");

        // Disabled debugging aid: dumps per-character age/ignorable
        // information to agelog.txt.
/*
StreamWriter sw = new StreamWriter ("agelog.txt");
for (int i = 0; i < char.MaxValue; i++) {
bool shouldBe = false;
switch (Char.GetUnicodeCategory ((char) i)) {
case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
        shouldBe = true; break;
}
if (unicodeAge [i] >= 3.1)
        shouldBe = true;
//if (IsIgnorable (i) != shouldBe)
sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
}
sw.Close ();
*/
}
274
// Typed convenience wrapper: passes a byte array through
// CodePointIndexer.CompressArray with the given indexer and casts the
// result back to byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
        object compressed = CodePointIndexer.CompressArray (
                source, typeof (byte), i);
        return (byte []) compressed;
}
280
// Typed convenience wrapper: passes a ushort array through
// CodePointIndexer.CompressArray with the given indexer and casts the
// result back to ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
        object compressed = CodePointIndexer.CompressArray (
                source, typeof (ushort), i);
        return (ushort []) compressed;
}
286
// Serializes everything that was generated.
//
// The tailorings are written first (SerializeTailorings), then the core
// tables (ignorable flags, category, level 1-3 weights and the width
// compatibility map) and finally the CJK tables (SerializeCJK).
//
// Every core table is emitted as C# array source text on Result. When the
// Binary symbol is defined, the same tables are additionally written, each
// prefixed with its 32-bit length, to "../collation.core.bin".
void Serialize ()
{
        // Tailorings
        SerializeTailorings ();

        // Flatten the per-character map entries into one array per field.
        byte [] categories = new byte [map.Length];
        byte [] level1 = new byte [map.Length];
        byte [] level2 = new byte [map.Length];
        byte [] level3 = new byte [map.Length];
        ushort [] widthCompat = new ushort [map.Length];
        for (int i = 0; i < map.Length; i++) {
                categories [i] = map [i].Category;
                level1 [i] = map [i].Level1;
                level2 [i] = map [i].Level2;
                level3 [i] = ComputeLevel3Weight ((char) i);
                // For Japanese Half-width characters, don't
                // map widthCompat. It is IgnoreKanaType that
                // handles those width differences.
                if (0xFF6D <= i && i <= 0xFF9D)
                        continue;
                switch (decompType [i]) {
                case DecompositionNarrow:
                case DecompositionWide:
                case DecompositionSuper:
                case DecompositionSub:
                        // they are always 1 char
                        widthCompat [i] = (ushort) decompValues [decompIndex [i]];
                        break;
                }
        }

        // compress
        ignorableFlags = CompressArray (ignorableFlags,
                MSCompatUnicodeTableUtil.Ignorable);
        categories = CompressArray (categories,
                MSCompatUnicodeTableUtil.Category);
        level1 = CompressArray (level1,
                MSCompatUnicodeTableUtil.Level1);
        level2 = CompressArray (level2,
                MSCompatUnicodeTableUtil.Level2);
        level3 = CompressArray (level3,
                MSCompatUnicodeTableUtil.Level3);
        widthCompat = CompressArray (widthCompat,
                MSCompatUnicodeTableUtil.WidthCompat);
        cjkCHS = CompressArray (cjkCHS,
                MSCompatUnicodeTableUtil.CjkCHS);
        cjkCHT = CompressArray (cjkCHT,
                MSCompatUnicodeTableUtil.Cjk);
        cjkJA = CompressArray (cjkJA,
                MSCompatUnicodeTableUtil.Cjk);
        cjkKO = CompressArray (cjkKO,
                MSCompatUnicodeTableUtil.Cjk);
        cjkKOlv2 = CompressArray (cjkKOlv2,
                MSCompatUnicodeTableUtil.Cjk);

        // The binary writer stays null when Binary is not defined; the
        // table writers below then produce the text output only.
        BinaryWriter binary = null;
#if Binary
        MemoryStream ms = new MemoryStream ();
        binary = new BinaryWriter (ms);
#endif

        // Ignorables
        WriteCoreTable ("ignorableFlags", ignorableFlags, binary);
        // Primary category
        WriteCoreTable ("categories", categories, binary);
        // Primary weight value
        WriteCoreTable ("level1", level1, binary);
        // Secondary weight
        WriteCoreTable ("level2", level2, binary);
        // Tertiary weight
        WriteCoreTable ("level3", level3, binary);
        // Width insensitivity mappings
        // (for now it is more lightweight than dumping the
        // entire NFKD table).
        WriteCoreTable ("widthCompat", widthCompat, binary);

#if Binary
        using (FileStream fs = File.Create ("../collation.core.bin")) {
                byte [] array = ms.ToArray ();
                fs.Write (array, 0, array.Length);
        }
#endif

        // CJK
        SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
        SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
        SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
        SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
        SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
}

// Writes one byte table as an "internal static readonly byte []" initializer
// on Result: values below 10 in decimal, all others as two-digit hex, with a
// trailing comment giving the start index after every 16 values. When binary
// is not null, the table length (32-bit) followed by each byte is written to
// it as well.
void WriteCoreTable (string name, byte [] table, BinaryWriter binary)
{
        Result.WriteLine ("internal static readonly byte [] {0} = new byte [] {{", name);
        if (binary != null)
                binary.Write (table.Length);
        for (int i = 0; i < table.Length; i++) {
                byte value = table [i];
                if (value < 10)
                        Result.Write ("{0},", value);
                else
                        Result.Write ("0x{0:X02},", value);
                if (binary != null)
                        binary.Write (value);
                if ((i & 0xF) == 0xF)
                        Result.WriteLine ("// {0:X04}", i - 0xF);
        }
        Result.WriteLine ("};");
        Result.WriteLine ();
}

// Same as the byte overload, but for a ushort table: the text output is an
// "internal static readonly ushort []" initializer and each value is written
// to binary as a 16-bit quantity.
void WriteCoreTable (string name, ushort [] table, BinaryWriter binary)
{
        Result.WriteLine ("internal static readonly ushort [] {0} = new ushort [] {{", name);
        if (binary != null)
                binary.Write (table.Length);
        for (int i = 0; i < table.Length; i++) {
                ushort value = table [i];
                if (value < 10)
                        Result.Write ("{0},", value);
                else
                        Result.Write ("0x{0:X02},", value);
                if (binary != null)
                        binary.Write (value);
                if ((i & 0xF) == 0xF)
                        Result.WriteLine ("// {0:X04}", i - 0xF);
        }
        Result.WriteLine ("};");
        Result.WriteLine ();
}
480
// Dumps a ushort CJK table under the given name.
//
// The values are written to Result as a "static ushort []" initializer
// (decimal below 10, four-digit hex otherwise), stopping early if the index
// reaches max. When Binary is defined, the array length (32-bit) followed by
// the same values is also stored in "../collation.<name>.bin".
void SerializeCJK (string name, ushort [] cjk, int max)
{
        Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
#if Binary
        MemoryStream buffer = new MemoryStream ();
        BinaryWriter writer = new BinaryWriter (buffer);
        writer.Write (cjk.Length);
#endif
        for (int pos = 0; pos < cjk.Length && pos != max; pos++) {
                ushort weight = cjk [pos];
                string format = weight < 10 ? "{0}," : "0x{0:X04},";
                Result.Write (format, weight);
#if Binary
                writer.Write (weight);
#endif
                // Close every row of 16 values with its start index.
                if (pos % 16 == 15)
                        Result.WriteLine ("// {0:X04}", pos - 15);
        }
        Result.WriteLine ("};");
        Result.WriteLine ();
#if Binary
        byte [] bytes = buffer.ToArray ();
        using (FileStream output = File.Create (String.Format ("../collation.{0}.bin", name)))
                output.Write (bytes, 0, bytes.Length);
#endif
}
513
// Dumps a byte CJK table under the given name.
//
// The values are written to Result as a "static byte []" initializer
// (decimal below 10, two-digit hex otherwise), stopping early if the index
// reaches max. When Binary is defined, the same values are also stored in
// "../collation.<name>.bin".
//
// NOTE(review): unlike the ushort overload, no length prefix is written to
// the binary file here - confirm that the reader expects this.
void SerializeCJK (string name, byte [] cjk, int max)
{
        Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
#if Binary
        MemoryStream buffer = new MemoryStream ();
        BinaryWriter writer = new BinaryWriter (buffer);
#endif
        for (int pos = 0; pos < cjk.Length && pos != max; pos++) {
                byte weight = cjk [pos];
                string format = weight < 10 ? "{0}," : "0x{0:X02},";
                Result.Write (format, weight);
#if Binary
                writer.Write (weight);
#endif
                // Close every row of 16 values with its start index.
                if (pos % 16 == 15)
                        Result.WriteLine ("// {0:X04}", pos - 15);
        }
        Result.WriteLine ("};");
        Result.WriteLine ();
#if Binary
        byte [] bytes = buffer.ToArray ();
        using (FileStream output = File.Create (String.Format ("../collation.{0}.bin", name)))
                output.Write (bytes, 0, bytes.Length);
#endif
}
545
// Serializes the culture-specific tailorings.
//
// Text output (on Result):
//   - "tailorings": the character data of every non-alias tailoring,
//     concatenated in list order.
//   - "tailoringInfos": one TailoringInfo (LCID, start index, count,
//     FrenchSort flag) per tailoring. An alias (Alias != 0) reuses the index
//     and count of the tailoring it points to.
//
// Binary output (only when Binary is defined), "../collation.tailoring.bin":
//   the number of tailorings, then LCID / index / count / FrenchSort for each
//   of them, then two 0xFF bytes, the number of 16-bit items in the character
//   data, and finally the character data itself.
//
// Throws Exception when an alias refers to an LCID that has no definition.
void SerializeTailorings ()
{
        Hashtable indexes = new Hashtable ();
        Hashtable counts = new Hashtable ();
        Result.WriteLine ("static char [] tailorings = new char [] {");
        int count = 0;
#if Binary
        MemoryStream ms = new MemoryStream ();
        BinaryWriter binary = new BinaryWriter (ms);
#endif
        foreach (Tailoring t in tailorings) {
                // Aliases carry no data of their own.
                if (t.Alias != 0)
                        continue;
                Result.Write ("/*{0}*/", t.LCID);
                // Remember where this LCID's data starts and how long it is.
                indexes.Add (t.LCID, count);
                char [] values = t.ItemToCharArray ();
                counts.Add (t.LCID, values.Length);
                foreach (char c in values) {
                        Result.Write ("'\\x{0:X}', ", (int) c);
                        if (++count % 16 == 0)
                                Result.WriteLine (" // {0:X04}", count - 16);
#if Binary
                        binary.Write ((ushort) c);
#endif
                }
        }
        Result.WriteLine ("};");

        Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
#if Binary
        // Keep the character data aside; it is appended after the info
        // records in a fresh stream.
        byte [] rawdata = ms.ToArray ();
        ms = new MemoryStream ();
        binary = new BinaryWriter (ms);
        binary.Write (tailorings.Count);
#endif
        foreach (Tailoring t in tailorings) {
                int target = t.Alias != 0 ? t.Alias : t.LCID;
                // The header count written above covers every tailoring, so
                // an unresolved alias must abort instead of being skipped.
                if (!indexes.ContainsKey (target))
                        throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
                int idx = (int) indexes [target];
                int cnt = (int) counts [target];
                bool french = t.FrenchSort;
                // NOTE(review): this matches entries with the same LCID as
                // t itself (not the alias target), so it normally just
                // re-reads t.FrenchSort - confirm whether t.Alias was meant.
                if (t.Alias != 0)
                        foreach (Tailoring t2 in tailorings)
                                if (t2.LCID == t.LCID)
                                        french = t2.FrenchSort;
                Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
#if Binary
                binary.Write (t.LCID);
                binary.Write (idx);
                binary.Write (cnt);
                binary.Write (french);
#endif
        }
        Result.WriteLine ("};");
#if Binary
        binary.Write ((byte) 0xFF);
        binary.Write ((byte) 0xFF);
        // Each character was stored as a ushort, hence two bytes per item.
        binary.Write (rawdata.Length / 2);
        binary.Write (rawdata, 0, rawdata.Length);

        using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
                byte [] array = ms.ToArray ();
                fs.Write (array, 0, array.Length);
        }
#endif
}
616
617                 #region Parse
618
// Runs all parsers over the source data found below dirname, in the order
// they depend on each other, and finally reads the tailoring definitions from
// "mono-tailoring-source.txt" (looked up relative to the current directory,
// not dirname).
void ParseSources (string dirname)
{
        ParseDerivedAge (dirname + "/DerivedAge.txt");

        FillIgnorables ();

        // The JIS order has to be parsed prior to ParseUnidata().
        ParseJISOrder (dirname + "/CP932.TXT");
        ParseUnidata (dirname + "/UnicodeData.txt");
        ModifyUnidata ();
        ParseDerivedCoreProperties (
                dirname + "/DerivedCoreProperties.txt");
        ParseScripts (dirname + "/Scripts.txt");
        ParseCJK (
                dirname + "/common/collation/zh.xml",
                dirname + "/common/collation/ja.xml",
                dirname + "/common/collation/ko.xml");

        ParseTailorings ("mono-tailoring-source.txt");
}
648
// Reads the tailoring source file and hands each trimmed line to
// ProcessTailoringLine(), which keeps track of the tailoring currently
// being filled through the ref argument.
//
// When a line fails, its 1-based line number is written to stderr and the
// original exception is rethrown unchanged.
void ParseTailorings (string filename)
{
        int lineNumber = 0;
        Tailoring current = null;
        using (StreamReader reader = new StreamReader (filename)) {
                try {
                        while (true) {
                                if (reader.Peek () < 0)
                                        break;
                                lineNumber++;
                                string text = reader.ReadLine ().Trim ();
                                ProcessTailoringLine (ref current, text);
                        }
                } catch (Exception) {
                        Console.Error.WriteLine ("ERROR at line {0}", lineNumber);
                        throw;
                }
        }
}
666
// Decodes a value taken from the tailoring source: every "\uXXXX" escape
// (backslash, 'u', four hex digits) is replaced by the UTF-16 code unit it
// names; all other characters are copied as they are.
//
// Escapes are recognized at any position and any number of them may
// appear. A backslash that is not followed by 'u' plus at least four more
// characters is copied through literally. Four characters that are not
// hex digits make int.Parse throw FormatException.
string ParseTailoringSourceValue (string s)
{
        StringBuilder sb = new StringBuilder (s.Length);
        for (int i = 0; i < s.Length; i++) {
                // A complete escape occupies s [i] .. s [i + 5].
                bool isEscape = s [i] == '\\' &&
                        i + 5 < s.Length &&
                        s [i + 1] == 'u';
                if (isEscape) {
                        sb.Append ((char) int.Parse (
                                s.Substring (i + 2, 4),
                                NumberStyles.HexNumber,
                                CultureInfo.InvariantCulture));
                        // Skip the rest of the escape; the loop's own
                        // increment steps over the sixth character.
                        i += 5;
                }
                else
                        sb.Append (s [i]);
        }
        return sb.ToString ();
}
683
// Interprets one line of the tailoring source and applies it to the
// tailoring currently being built (t).
//
// Recognized forms, tested in this order:
//   (empty) / #...          ignored; a '#' after column 0 starts a comment
//   @N  or  @N=M            creates a new Tailoring from the decimal
//                           number(s), appends it to tailorings and makes
//                           it the current one
//   *FrenchSort             sets FrenchSort on the current tailoring
//   *Diacritical XX -> YY   maps one hex byte to another
//   source : a b c d        four hex bytes ('*' stands for 0) registered
//                           as the sort key of source
//   source = replacement    registers a replacement mapping
//
// Every form except '@' dereferences t, so the first directive of the
// file is expected to be an '@' line.
void ProcessTailoringLine (ref Tailoring t, string s)
{
        // Cut a trailing comment. A '#' in column 0 is left alone here and
        // rejected by the switch below.
        int pos = s.IndexOf ('#');
        if (pos > 0)
                s = s.Substring (0, pos).Trim ();
        if (s.Length == 0)
                return;

        switch (s [0]) {
        case '#':
                return;
        case '@':
                pos = s.IndexOf ('=');
                t = pos > 0 ?
                        new Tailoring (
                                int.Parse (s.Substring (1, pos - 1)),
                                int.Parse (s.Substring (pos + 1))) :
                        new Tailoring (int.Parse (s.Substring (1)));
                tailorings.Add (t);
                return;
        }

        if (s.StartsWith ("*FrenchSort")) {
                t.FrenchSort = true;
                return;
        }

        const string diacriticalTag = "*Diacritical";
        if (s.StartsWith (diacriticalTag)) {
                int arrow = s.IndexOf ("->");
                string mapFrom = s.Substring (diacriticalTag.Length,
                        arrow - diacriticalTag.Length).Trim ();
                string mapTo = s.Substring (arrow + 2).Trim ();
                t.AddDiacriticalMap (
                        byte.Parse (mapFrom, NumberStyles.HexNumber),
                        byte.Parse (mapTo, NumberStyles.HexNumber));
                return;
        }

        int colon = s.IndexOf (':');
        if (colon > 0) {
                string source = s.Substring (0, colon).Trim ();
                string [] fields = s.Substring (colon + 1).Trim ().Split (' ');
                byte [] weights = new byte [4];
                for (int i = 0; i < weights.Length; i++)
                        weights [i] = fields [i] == "*" ?
                                (byte) 0 :
                                byte.Parse (fields [i], NumberStyles.HexNumber);
                t.AddSortKeyMap (ParseTailoringSourceValue (source),
                        weights);
        }

        // NOTE(review): the sort-key branch above does not return, so a
        // line holding both ':' and '=' (neither in column 0) registers a
        // sort key AND a replacement. Confirm that this is intended.
        int equal = s.IndexOf ('=');
        if (equal > 0) {
                string original = s.Substring (0, equal).Trim ();
                string replacement = s.Substring (equal + 1).Trim ();
                t.AddReplacementMap (
                        ParseTailoringSourceValue (original),
                        ParseTailoringSourceValue (replacement));
        }
}
739
// Fills unicodeAge from a DerivedAge.txt style file: for every code point
// listed, the age value found on its line is stored at that index.
//
// A data line is "XXXX ; age" or "XXXX..YYYY ; age" (hex code points).
// Text after '#' is a comment and lines without ';' are skipped.
// Ranges that start above char.MaxValue are ignored.
//
// All numbers are parsed with the invariant culture: the file always uses
// '.' as the decimal separator, so the result must not depend on the
// locale of the machine running the generator.
//
// Throws whatever StreamReader or the numeric parsers throw for an
// unreadable file or a malformed line.
void ParseDerivedAge (string filename)
{
        // HexNumber already allows surrounding white space; the extra flag
        // only spells out that "XXXX " (with the padding before ';') is
        // expected.
        NumberStyles nf = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;

        using (StreamReader file =
                new StreamReader (filename)) {
                while (file.Peek () >= 0) {
                        string s = file.ReadLine ();
                        int idx = s.IndexOf ('#');
                        if (idx >= 0)
                                s = s.Substring (0, idx);
                        idx = s.IndexOf (';');
                        if (idx < 0)
                                continue;

                        string cpspec = s.Substring (0, idx);
                        idx = cpspec.IndexOf ("..");
                        int cp = int.Parse (
                                idx < 0 ? cpspec : cpspec.Substring (0, idx),
                                nf, CultureInfo.InvariantCulture);
                        int cpEnd = idx < 0 ? cp : int.Parse (
                                cpspec.Substring (idx + 2),
                                nf, CultureInfo.InvariantCulture);
                        string value = s.Substring (cpspec.Length + 1).Trim ();

                        // FIXME: use index
                        if (cp > char.MaxValue)
                                continue;

                        double v = double.Parse (value,
                                CultureInfo.InvariantCulture);

                        // Apply the same upper bound to the end of the
                        // range as the guard above applies to its start.
                        int last = Math.Min (cpEnd, (int) char.MaxValue);
                        for (int i = cp; i <= last; i++)
                                unicodeAge [i] = v;
                }
        }
        unicodeAge [0] = double.MaxValue; // never be supported
}
772
// Feeds every line of the UnicodeData.txt style file to
// ProcessUnidataLine(), then stores the values it collected in the
// decompValues field as an int array.
//
// If a line cannot be processed, its 1-based line number is written to
// stderr and the exception is rethrown unchanged.
void ParseUnidata (string filename)
{
        ArrayList collected = new ArrayList ();
        using (StreamReader reader = new StreamReader (filename)) {
                int lineNumber = 1;
                while (reader.Peek () >= 0) {
                        try {
                                ProcessUnidataLine (reader.ReadLine (), collected);
                        } catch (Exception) {
                                Console.Error.WriteLine ("**** At line " + lineNumber);
                                throw;
                        }
                        lineNumber++;
                }
        }
        this.decompValues = (int []) collected.ToArray (typeof (int));
}
790
791                 char previousLatinTarget = char.MinValue; // State kept across ProcessUnidataLine() calls: the last plain A-Z letter seen in a LATIN name (reset to MinValue after 'Z'); used as the fallback target when a name yields none.
792                 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; // One counter per letter A-Z (index: letter - 'A'); ProcessUnidataLine() increments it and adds 0x7C to get a secondary weight.
793
794                 void ProcessUnidataLine (string s, ArrayList decompValues)
795                 {
796                         int idx = s.IndexOf ('#');
797                         if (idx >= 0)
798                                 s = s.Substring (0, idx);
799                         idx = s.IndexOf (';');
800                         if (idx < 0)
801                                 return;
802                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
803                         string [] values = s.Substring (idx + 1).Split (';');
804
805                         // FIXME: use index
806                         if (cp > char.MaxValue)
807                                 return;
808                         if (IsIgnorable (cp))
809                                 return;
810
811                         string name = values [0];
812
813                         // SPECIAL CASE: rename some characters for diacritical
814                         // remapping. FIXME: why are they different?
815                         // FIXME: it's still not working.
816                         if (cp == 0x018B || cp == 0x018C)
817                                 name = name.Replace ("TOPBAR", "STROKE");
818
819                         // isSmallCapital
820                         if (s.IndexOf ("SMALL CAPITAL") > 0)
821                                 isSmallCapital [cp] = true;
822
823                         // latin mapping by character name
824                         if (s.IndexOf ("LATIN") >= 0) {
825                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
826                                 int offset = lidx + 15;
827                                 if (lidx < 0) {
828                                         lidx = s.IndexOf ("LETTER TURNED ");
829                                         offset = lidx + 14;
830                                 }
831                                 if (lidx < 0) {
832                                         lidx = s.IndexOf ("LETTER CAPITAL ");
833                                         offset = lidx + 15;
834                                 }
835                                 if (lidx < 0) {
836                                         lidx = s.IndexOf ("LETTER SCRIPT ");
837                                         offset = lidx + 14;
838                                 }
839                                 if (lidx < 0) {
840                                         lidx = s.IndexOf ("LETTER ");
841                                         offset = lidx + 7;
842                                 }
843                                 char c = lidx > 0 ? s [offset] : char.MinValue;
844                                 char n = s [offset + 1];
845                                 char target = char.MinValue;
846                                 if ('A' <= c && c <= 'Z' &&
847                                         (n == ' ') || n == ';') {
848                                         target = c;
849                                         // FIXME: After 'Z', I cannot reset this state.
850                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
851                                 }
852
853                                 if (s.Substring (offset).StartsWith ("ALPHA"))
854                                         target = 'A';
855                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
856                                         target = 'B';
857                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
858                                         target = 'C';
859                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
860                                         target = 'E';
861                                 else if (s.Substring (offset).StartsWith ("ENG"))
862                                         target = 'N';
863                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
864                                         target = 'O';
865                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
866                                         target = 'R';
867                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
868                                         target = 'S';
869                                 else if (s.Substring (offset).StartsWith ("ESH"))
870                                         target = 'S';
871
872                                 // For remaining IPA chars, direct mapping is
873                                 // much faster.
874                                 switch (cp) {
875                                 case 0x0299: target = 'B'; break;
876                                 case 0x029A: target = 'E'; break;
877                                 case 0x029B: target = 'G'; break;
878                                 case 0x029C: target = 'H'; break;
879                                 case 0x029D: target = 'J'; break;
880                                 case 0x029E: target = 'K'; break;
881                                 case 0x029F: target = 'L'; break;
882                                 case 0x02A0: target = 'Q'; break;
883                                 case 0x02A7: target = 'T'; break;
884                                 case 0x02A8: target = 'T'; break;
885                                 }
886
887                                 if (target == char.MinValue)
888                                         target = previousLatinTarget;
889
890                                 if (target != char.MinValue) {
891                                         ArrayList entry = (ArrayList) latinMap [target];
892                                         if (entry == null) {
893                                                 entry = new ArrayList ();
894                                                 latinMap [target] = entry;
895                                         }
896                                         entry.Add (cp);
897                                         // FIXME: This secondary weight is hack.
898                                         // They are here because they must not
899                                         // be identical to the corresponding
900                                         // ASCII latins.
901                                         if (c != target && diacritical [cp] == 0) {
902                                                 diacriticalOffset [c - 'A']++;
903                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
904                                         }
905                                 }
906                         }
907
908                         // Arrow names
909                         if (0x2000 <= cp && cp < 0x3000) {
910                                 int value = 0;
911                                 // SPECIAL CASES. FIXME: why?
912                                 switch (cp) {
913                                 case 0x21C5: value = -1; break; // E2
914                                 case 0x261D: value = 1; break;
915                                 case 0x27A6: value = 3; break;
916                                 case 0x21B0: value = 7; break;
917                                 case 0x21B1: value = 3; break;
918                                 case 0x21B2: value = 7; break;
919                                 case 0x21B4: value = 5; break;
920                                 case 0x21B5: value = 7; break;
921                                 case 0x21B9: value = -1; break; // E1
922                                 case 0x21CF: value = 7; break;
923                                 case 0x21D0: value = 3; break;
924                                 }
925                                 string [] arrowTargets = new string [] {
926                                         "",
927                                         "UPWARDS",
928                                         "NORTH EAST",
929                                         "RIGHTWARDS",
930                                         "SOUTH EAST",
931                                         "DOWNWARDS",
932                                         "SOUTH WEST",
933                                         "LEFTWARDS",
934                                         "NORTH WEST",
935                                         "LEFT RIGHT",
936                                         "UP DOWN",
937                                         };
938                                 if (value == 0)
939                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++) {
940                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
941                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
942                                                         s.IndexOf (" OVER") < 0
943                                                 )
944                                                         value = i;
945                                                 else if (s.IndexOf ("RIGHTWARDS") > 0 &&
946                                                         s.IndexOf ("LEFTWARDS") > 0)
947                                                         value = 0xE1 - 0xD8;
948                                                 else if (s.IndexOf ("UPWARDS") > 0 &&
949                                                         s.IndexOf ("DOWNWARDS") > 0)
950                                                         value = 0xE2 - 0xD8;
951                                         }
952                                 if (value > 0)
953                                         arrowValues.Add (new DictionaryEntry (
954                                                 cp, value));
955                         }
956
957                         // Box names
958                         if (0x2500 <= cp && cp < 0x2600) {
959                                 int value = 0;
960                                 // flags:
961                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
962                                 // [h,rl] [r] [l]
963                                 // [v,ud] [u] [d]
964                                 // [dr] [dl] [ur] [ul]
965                                 // [vr,udr] [vl,vdl]
966                                 // [hd,rld] [hu,rlu]
967                                 // [hv,udrl,rlv,udh]
968                                 ArrayList flags = new ArrayList (new int [] {
969                                         32, 8 + 4, 8, 4,
970                                         16, 1 + 2, 1, 2,
971                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
972                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
973                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
974                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
975                                         });
976                                 byte [] offsets = new byte [] {
977                                         0, 0, 1, 2,
978                                         3, 3, 4, 5,
979                                         6, 7, 8, 9,
980                                         10, 10, 11, 11,
981                                         12, 12, 13, 13,
982                                         14, 14, 14, 14};
983                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
984                                         int flag = 0;
985                                         if (s.IndexOf (" UP") >= 0)
986                                                 flag |= 1;
987                                         if (s.IndexOf (" DOWN") >= 0)
988                                                 flag |= 2;
989                                         if (s.IndexOf (" RIGHT") >= 0)
990                                                 flag |= 4;
991                                         if (s.IndexOf (" LEFT") >= 0)
992                                                 flag |= 8;
993                                         if (s.IndexOf (" VERTICAL") >= 0)
994                                                 flag |= 16;
995                                         if (s.IndexOf (" HORIZONTAL") >= 0)
996                                                 flag |= 32;
997
998                                         int fidx = flags.IndexOf (flag);
999                                         value = fidx < 0 ? fidx : offsets [fidx];
1000                                 } else if (s.IndexOf ("BLOCK") >= 0) {
1001                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
1002                                                 value = 0x12;
1003                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
1004                                                 value = 0x13;
1005                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1006                                                 value = 0x14;
1007                                         else if (s.IndexOf ("HALF") >= 0)
1008                                                 value = 0x15;
1009                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1010                                                 value = 0x16;
1011                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
1012                                                 value = 0x17;
1013                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1014                                                 value = 0x18;
1015                                         else
1016                                                 value = 0x19;
1017                                 }
1018                                 else if (s.IndexOf ("SHADE") >= 0)
1019                                         value = 0x19;
1020                                 else if (s.IndexOf ("SQUARE") >= 0)
1021                                         value = 0xBC - 0xE5;
1022                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1023                                         value = 0xBE - 0xE5;
1024                                 else if (s.IndexOf ("RECTANGLE") >= 0)
1025                                         value = 0xBD - 0xE5;
1026                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1027                                         value = 0xBF - 0xE5;
1028                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
1029                                         if (s.IndexOf ("UP-POINTING") >= 0)
1030                                                 value = 0xC0 - 0xE5;
1031                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1032                                                 value = 0xC1 - 0xE5;
1033                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1034                                                 value = 0xC2 - 0xE5;
1035                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1036                                                 value = 0xC3 - 0xE5;
1037                                 }
1038                                 else if (s.IndexOf ("POINTER") >= 0) {
1039                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1040                                                 value = 0xC4 - 0xE5;
1041                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1042                                                 value = 0xC5 - 0xE5;
1043                                 }
1044                                 else if (s.IndexOf ("DIAMOND") >= 0)
1045                                         value = 0xC6 - 0xE5;
1046                                 else if (s.IndexOf ("FISHEYE") >= 0)
1047                                         value = 0xC7 - 0xE5;
1048                                 else if (s.IndexOf ("LOZENGE") >= 0)
1049                                         value = 0xC8 - 0xE5;
1050                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1051                                         value = 0xC9 - 0xE5;
1052                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1053                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1054                                                 value = 0xCA - 0xE5;
1055                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1056                                                 value = 0xCB - 0xE5;
1057                                         else
1058                                                 value = 0xC9 - 0xE5;
1059                                 }
1060                                 if (0x25DA <= cp && cp <= 0x25E5)
1061                                         value = 0xCD + cp - 0x25DA - 0xE5;
1062
1063                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1064                                 switch (cp) {
1065                                 case 0x2571: value = 0xF; break;
1066                                 case 0x2572: value = 0x10; break;
1067                                 case 0x2573: value = 0x11; break;
1068                                 }
1069                                 if (value != 0)
1070                                         boxValues.Add (new DictionaryEntry (
1071                                                 cp, value));
1072                         }
1073
1074                         // For some characters store the name and sort later
1075                         // to determine sorting.
1076                         if (0x2100 <= cp && cp <= 0x213F &&
1077                                 Char.IsSymbol ((char) cp))
1078                                 sortableCharNames.Add (
1079                                         new DictionaryEntry (cp, name));
1080                         else if (0x3380 <= cp && cp <= 0x33DD)
1081                                 sortableCharNames.Add (new DictionaryEntry (
1082                                         cp, name.Substring (7)));
1083
1084                         if (Char.GetUnicodeCategory ((char) cp) ==
1085                                 UnicodeCategory.MathSymbol) {
1086                                 if (name.StartsWith ("CIRCLED "))
1087                                         diacritical [cp] = 0xEE;
1088                                 if (name.StartsWith ("SQUARED "))
1089                                         diacritical [cp] = 0xEF;
1090                         }
1091
1092                         // diacritical weights by character name
1093 if (diacritics.Length != diacriticWeights.Length)
1094 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1095                         for (int d = 0; d < diacritics.Length; d++) {
1096                                 if (s.IndexOf (diacritics [d]) > 0) {
1097                                         diacritical [cp] += diacriticWeights [d];
1098                                         if (s.IndexOf ("COMBINING") >= 0)
1099                                                 diacritical [cp] -= (byte) 2;
1100                                         continue;
1101                                 }
1102                                 // also process "COMBINING blah" here
1103                                 // For now it is limited to cp < 0x0370
1104 //                              if (cp < 0x0300 || cp >= 0x0370)
1105 //                                      continue;
1106                                 string tmp = diacritics [d].TrimEnd (';');
1107                                 if (tmp.IndexOf ("WITH ") == 0)
1108                                         tmp = tmp.Substring (4);
1109                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1110                                 if (name == tmp) {
1111                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1112                                         break;
1113                                 }
1114 //if (name == tmp)
1115 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1116                         }
1117                         // Two-step grep required for it.
1118                         if (s.IndexOf ("FULL STOP") > 0 &&
1119                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1120                                 diacritical [cp] |= 0xF4;
1121                         if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1122                                 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1123                                         s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1124
1125                         // Arabic letter name
1126                         if (0x0621 <= cp && cp <= 0x064A &&
1127                                 Char.GetUnicodeCategory ((char) cp)
1128                                 == UnicodeCategory.OtherLetter) {
1129                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1130                                 switch (cp) {
1131                                 case 0x0621:
1132                                 case 0x0624:
1133                                 case 0x0626:
1134                                         // hamza, waw, yeh ... special cases.
1135                                         value = 0x07;
1136                                         break;
1137                                 case 0x0649:
1138                                 case 0x064A:
1139                                         value = 0x77; // special cases.
1140                                         break;
1141                                 default:
1142                                         // Get primary letter name i.e.
1143                                         // XXX part of ARABIC LETTER XXX yyy
1144                                         // e.g. that of "TEH MARBUTA" is "TEH".
1145                                         string letterName =
1146                                                 (cp == 0x0640) ?
1147                                                 // 0x0640 is special: it does
1148                                                 // not start with ARABIC LETTER
1149                                                 name :
1150                                                 name.Substring (14);
1151                                         int tmpIdx = letterName.IndexOf (' ');
1152                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1153 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1154                                         if (arabicNameMap.ContainsKey (letterName))
1155                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1156                                         else
1157                                                 arabicNameMap [letterName] = cp;
1158                                         break;
1159                                 }
1160                                 arabicLetterPrimaryValues [cp] = value;
1161                         }
1162
1163                         // Japanese square letter
1164                         if (0x3300 <= cp && cp <= 0x3357)
1165                                 if (!ExistsJIS (cp))
1166                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1167
1168                         // normalizationType
1169                         string decomp = values [4];
1170                         idx = decomp.IndexOf ('<');
1171                         if (idx >= 0) {
1172                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1173                                 case "full":
1174                                         decompType [cp] = DecompositionFull;
1175                                         break;
1176                                 case "sub":
1177                                         decompType [cp] = DecompositionSub;
1178                                         break;
1179                                 case "super":
1180                                         decompType [cp] = DecompositionSuper;
1181                                         break;
1182                                 case "small":
1183                                         decompType [cp] = DecompositionSmall;
1184                                         break;
1185                                 case "isolated":
1186                                         decompType [cp] = DecompositionIsolated;
1187                                         break;
1188                                 case "initial":
1189                                         decompType [cp] = DecompositionInitial;
1190                                         break;
1191                                 case "final":
1192                                         decompType [cp] = DecompositionFinal;
1193                                         break;
1194                                 case "medial":
1195                                         decompType [cp] = DecompositionMedial;
1196                                         break;
1197                                 case "noBreak":
1198                                         decompType [cp] = DecompositionNoBreak;
1199                                         break;
1200                                 case "compat":
1201                                         decompType [cp] = DecompositionCompat;
1202                                         break;
1203                                 case "fraction":
1204                                         decompType [cp] = DecompositionFraction;
1205                                         break;
1206                                 case "font":
1207                                         decompType [cp] = DecompositionFont;
1208                                         break;
1209                                 case "circle":
1210                                         decompType [cp] = DecompositionCircle;
1211                                         break;
1212                                 case "square":
1213                                         decompType [cp] = DecompositionSquare;
1214                                         break;
1215                                 case "wide":
1216                                         decompType [cp] = DecompositionWide;
1217                                         break;
1218                                 case "narrow":
1219                                         decompType [cp] = DecompositionNarrow;
1220                                         break;
1221                                 case "vertical":
1222                                         decompType [cp] = DecompositionVertical;
1223                                         break;
1224                                 default:
1225                                         throw new Exception ("Support NFKD type : " + decomp);
1226                                 }
1227                         }
1228                         else
1229                                 decompType [cp] = DecompositionCanonical;
1230                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1231                         if (decomp.Length > 0) {
1232
1233                                 string [] velems = decomp.Split (' ');
1234                                 int didx = decompValues.Count;
1235                                 decompIndex [cp] = didx;
1236                                 foreach (string v in velems)
1237                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1238                                 decompLength [cp] = velems.Length;
1239
1240                                 // [decmpType] -> this_cp
1241                                 int targetCP = (int) decompValues [didx];
1242                                 // for "(x)" it specially maps to 'x' .
1243                                 // FIXME: check if it is sane
1244                                 if (velems.Length == 3 &&
1245                                         (int) decompValues [didx] == '(' &&
1246                                         (int) decompValues [didx + 2] == ')')
1247                                         targetCP = (int) decompValues [didx + 1];
1248                                 // special: 0x215F "1/"
1249                                 else if (cp == 0x215F)
1250                                         targetCP = '1';
1251                                 else if (velems.Length > 1 &&
1252                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1253                                         // skip them, except for CJK ideograph compat
1254                                         targetCP = 0;
1255
1256                                 if (targetCP != 0) {
1257                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1258                                         if (entry == null) {
1259                                                 entry = new Hashtable ();
1260                                                 nfkdMap [targetCP] = entry;
1261                                         }
1262                                         entry [(byte) decompType [cp]] = cp;
1263                                 }
1264                         }
1265                         // numeric values
1266                         if (values [5].Length > 0)
1267                                 decimalValue [cp] = decimal.Parse (values [5]);
1268                         else if (values [6].Length > 0)
1269                                 decimalValue [cp] = decimal.Parse (values [6]);
1270                         else if (values [7].Length > 0) {
1271                                 string decstr = values [7];
1272                                 idx = decstr.IndexOf ('/');
1273                                 if (cp == 0x215F) // special. "1/"
1274                                         decimalValue [cp] = 0x1;
1275                                 else if (idx > 0)
1276                                         // m/n
1277                                         decimalValue [cp] = 
1278                                                 decimal.Parse (decstr.Substring (0, idx))
1279                                                 / decimal.Parse (decstr.Substring (idx + 1));
1280                                 else if (decstr [0] == '(' &&
1281                                         decstr [decstr.Length - 1] == ')')
1282                                         // (n)
1283                                         decimalValue [cp] =
1284                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1285                                 else if (decstr [decstr.Length - 1] == '.')
1286                                         // n.
1287                                         decimalValue [cp] =
1288                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1289                                 else
1290                                         decimalValue [cp] = decimal.Parse (decstr);
1291                         }
1292                 }
1293
// Reads the file named by filename one line at a time and passes each
// line to ProcessDerivedCorePropLine. When handling a line throws, the
// 1-based number of that line is written to stderr and the exception is
// rethrown unchanged.
void ParseDerivedCoreProperties (string filename)
{
        int lineNumber = 0;
        using (StreamReader reader = new StreamReader (filename)) {
                while (reader.Peek () >= 0) {
                        // Count before processing so the number reported
                        // on failure is that of the offending line.
                        lineNumber++;
                        try {
                                ProcessDerivedCorePropLine (reader.ReadLine ());
                        } catch (Exception) {
                                Console.Error.WriteLine ("**** At line " + lineNumber);
                                throw;
                        }
                }
        }
}
1309
// Handles one line of the form "XXXX ; Property" or "XXXX..YYYY ; Property"
// (hex code points, optional trailing "# comment"). Lines without ';' are
// ignored. For the "Uppercase" property every code point in the range is
// flagged in isUppercase; other properties are ignored.
void ProcessDerivedCorePropLine (string s)
{
        // Cut off the trailing comment, if there is one.
        int hashPos = s.IndexOf ('#');
        if (hashPos >= 0)
                s = s.Substring (0, hashPos);

        int semicolonPos = s.IndexOf (';');
        if (semicolonPos < 0)
                return;

        // Left of ';' is either a single code point or "first..last".
        string range = s.Substring (0, semicolonPos);
        NumberStyles hexStyle = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;
        int dotsPos = range.IndexOf ("..");
        int first;
        int last;
        if (dotsPos < 0) {
                first = int.Parse (range, hexStyle);
                last = first;
        } else {
                first = int.Parse (range.Substring (0, dotsPos), hexStyle);
                last = int.Parse (range.Substring (dotsPos + 2), hexStyle);
        }
        string property = s.Substring (range.Length + 1).Trim ();

        // FIXME: use index
        // Only ranges that start within the char range are handled.
        if (first > char.MaxValue)
                return;

        if (property == "Uppercase") {
                for (int cp = first; cp <= last; cp++)
                        isUppercase [cp] = true;
        }
}
1337
// Reads "XXXX[..YYYY] ; ScriptName # comment" lines from the file named by
// filename and collects the non-ignorable characters of the Gurmukhi,
// Gujarati, Georgian and Thaana scripts. Each collection is then sorted
// with UCAComparer.Instance and stored in the matching ordered* array.
void ParseScripts (string filename)
{
        ArrayList gurmukhiChars = new ArrayList ();
        ArrayList gujaratiChars = new ArrayList ();
        ArrayList georgianChars = new ArrayList ();
        ArrayList thaanaChars = new ArrayList ();

        NumberStyles hexStyle = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;

        using (StreamReader reader = new StreamReader (filename)) {
                while (reader.Peek () >= 0) {
                        string text = reader.ReadLine ();

                        // Strip the trailing comment; skip lines with no ';'.
                        int pos = text.IndexOf ('#');
                        if (pos >= 0)
                                text = text.Substring (0, pos);
                        pos = text.IndexOf (';');
                        if (pos < 0)
                                continue;

                        // Left of ';' is a single code point or "first..last".
                        string range = text.Substring (0, pos);
                        int dotsPos = range.IndexOf ("..");
                        int first;
                        int last;
                        if (dotsPos < 0) {
                                first = int.Parse (range, hexStyle);
                                last = first;
                        } else {
                                first = int.Parse (range.Substring (0, dotsPos), hexStyle);
                                last = int.Parse (range.Substring (dotsPos + 2), hexStyle);
                        }
                        string script = text.Substring (range.Length + 1).Trim ();

                        // FIXME: use index
                        if (first > char.MaxValue)
                                continue;

                        // Pick the list for this script; other scripts are skipped.
                        ArrayList target;
                        switch (script) {
                        case "Gurmukhi":
                                target = gurmukhiChars;
                                break;
                        case "Gujarati":
                                target = gujaratiChars;
                                break;
                        case "Georgian":
                                target = georgianChars;
                                break;
                        case "Thaana":
                                target = thaanaChars;
                                break;
                        default:
                                continue;
                        }

                        for (int cp = first; cp <= last; cp++) {
                                if (IsIgnorable (cp))
                                        continue;
                                target.Add ((char) cp);
                        }
                }
        }

        gurmukhiChars.Sort (UCAComparer.Instance);
        gujaratiChars.Sort (UCAComparer.Instance);
        georgianChars.Sort (UCAComparer.Instance);
        thaanaChars.Sort (UCAComparer.Instance);

        orderedGurmukhi = (char []) gurmukhiChars.ToArray (typeof (char));
        orderedGujarati = (char []) gujaratiChars.ToArray (typeof (char));
        orderedGeorgian = (char []) georgianChars.ToArray (typeof (char));
        orderedThaana = (char []) thaanaChars.ToArray (typeof (char));
}
1401
// Feeds every line of the file named by filename to ProcessJISOrderLine.
// On any exception (including failure to open the file) the current
// 1-based line number is written to stderr and the exception is rethrown.
void ParseJISOrder (string filename)
{
        int lineNumber = 1;
        try {
                using (StreamReader reader = new StreamReader (filename)) {
                        while (reader.Peek () >= 0) {
                                ProcessJISOrderLine (reader.ReadLine ());
                                // Advance only after the line was handled, so
                                // a failure reports the line that caused it.
                                lineNumber++;
                        }
                }
        } catch (Exception) {
                Console.Error.WriteLine ("---- line {0}", lineNumber);
                throw;
        }
}
1416
1417                 char [] ws = new char [] {'\t', ' '}; // column separators (tab, space) that ProcessJISOrderLine searches for
1418
// Handles one line of the JIS order table: two "0x"-prefixed hex numbers
// separated by tab or space (the JIS code, then the Unicode code point),
// optionally followed by a "# comment". The pair is appended to
// jisJapanese. Blank lines, comment-only lines and lines with a single
// column are ignored.
//
// Throws whatever int.Parse / Substring throw when a column is not a
// "0x"-prefixed hex number.
void ProcessJISOrderLine (string s)
{
        int idx = s.IndexOf ('#');
        if (idx >= 0)
                s = s.Substring (0, idx);
        // Trim unconditionally. Previously the line was trimmed only when
        // it contained '#', so a whitespace-only or indented line without
        // a comment produced idx == 0 below and Substring (2, idx - 2)
        // threw ArgumentOutOfRangeException.
        s = s.Trim ();
        if (s.Length == 0)
                return;
        // First separator marks the end of the JIS column.
        idx = s.IndexOfAny (ws);
        if (idx < 0)
                return;
        // They start with "0x" so cut them out.
        int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
        int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
        jisJapanese.Add (new JISCharacter (cp, jis));
}
1434
1435                 void ParseCJK (string zhXML, string jaXML, string koXML)
1436                 {
                             // Fills the per-culture CJK weight tables cjkCHS, cjkCHT,
                             // cjkJA, cjkKO and cjkKOlv2. All of them are indexed
                             // directly by code point (offset is always 0 here).
                             //   zhXML: XML file (root element "ldml") whose 'pinyin'
                             //          and 'stroke' collation rules feed the two
                             //          Chinese tables.
                             //   jaXML: not referenced anywhere in this method; the
                             //          Japanese table is built from jisJapanese.
                             //   koXML: XML file whose reset rules feed the Korean
                             //          tables.
1437                         XmlDocument doc = new XmlDocument ();
                             // With a null resolver XmlDocument does not fetch external
                             // resources (e.g. a DTD) referenced by the loaded files.
1438                         doc.XmlResolver = null;
1439                         int v;
1440                         string s;
1441                         string category;
1442                         int offset;
1443                         ushort [] arr;
1444
1445                         // Chinese Simplified
                             // Characters of the pinyin rule text are numbered in the
                             // order they appear, starting at 0x8008.
1446                         category = "chs";
1447                         arr = cjkCHS;
1448                         offset = 0;//char.MaxValue - arr.Length;
1449                         doc.Load (zhXML);
1450                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1451                         v = 0x8008;
1452                         foreach (char c in s) {
                                     // Characters below U+3100 are only reported; they get
                                     // no entry and do not advance v.
1453                                 if (c < '\u3100')
1454                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1455                                 else {
1456                                         arr [(int) c - offset] = (ushort) v++;
                                             // When the low byte reaches 00, jump to 02, so no
                                             // assigned value ends in 00 or 01 (presumably those
                                             // bytes are reserved in sort keys - TODO confirm).
1457                                         if (v % 256 == 0)
1458                                                 v += 2;
1459                                 }
1460                         }
1461
1462                         // Chinese Traditional
                             // Reads the 'stroke' rules from the document already loaded
                             // from zhXML (there is no second Load call). Numbering
                             // starts at 0x8002 and characters below U+4E00 are skipped.
1463                         category = "cht";
1464                         arr = cjkCHT;
1465                         offset = 0;//char.MaxValue - arr.Length;
1466                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1467                         v = 0x8002;
1468                         foreach (char c in s) {
1469                                 if (c < '\u4E00')
1470                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1471                                 else {
1472                                         arr [(int) c - offset] = (ushort) v++;
1473                                         if (v % 256 == 0)
1474                                                 v += 2;
1475                                 }
1476                         }
1477
1478                         // Japanese
                             // Built from the jisJapanese list (in list order), not from
                             // an XML file.
1479                         category = "ja";
1480                         arr = cjkJA;
1481                         offset = 0;//char.MaxValue - arr.Length;
1482
1483                         // SPECIAL CASES
                             // Fixed weights placed below 0x8008, where the sequential
                             // numbering starts.
1484                         arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1485                         arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1486                         arr [0x337E] = 0x8005;
1487                         arr [0x337D] = 0x8006;
1488                         arr [0x337C] = 0x8007;
1489
1490                         v = 0x8008;
1491                         foreach (JISCharacter jc in jisJapanese) {
                                     // Entries with a JIS code below 0x8800 are skipped
                                     // (presumably the non-kanji rows - TODO confirm).
1492                                 if (jc.JIS < 0x8800)
1493                                         continue;
1494                                 char c = (char) jc.CP;
1495
1496                                 if (c < '\u4E00')
1497                                         // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1498                                         continue;
1499                                 else {
1500                                         arr [(int) c - offset] = (ushort) v++;
1501                                         if (v % 256 == 0)
1502                                                 v += 2;
1503
1504                                         // SPECIAL CASES:
                                             // For these base characters the scan for
                                             // decomposable characters below is skipped; the
                                             // trailing comment names the related code point.
1505                                         if (c == '\u662D') // U+337C
1506                                                 continue;
1507                                         if (c == '\u5927') // U+337D
1508                                                 continue;
1509                                         if (c == '\u5E73') // U+337B
1510                                                 continue;
1511                                         if (c == '\u660E') // U+337E
1512                                                 continue;
1513                                         if (c == '\u9686') // U+F9DC
1514                                                 continue;
1515
1516                                         // FIXME: there are still remaining
1517                                         // characters after U+FA0C.
                                             // Give the next weights to every code point below
                                             // U+FA0D whose decomposition refers to c. This is a
                                             // full linear scan for each JIS character.
1518 //                                      for (int k = 0; k < char.MaxValue; k++) {
1519                                         for (int k = 0; k < '\uFA0D'; k++) {
                                                     // decompIndex [k] == 0 is treated as "no
                                                     // decomposition recorded for k".
1520                                                 if (decompIndex [k] == 0 || IsIgnorable (k))
1521                                                         continue;
                                                     // && binds tighter than ||: matches when the
                                                     // first decomposition value is c, or when the
                                                     // decomposition has length 3 and its second
                                                     // value is c (presumably the "(x)" form).
1522                                                 if (decompValues [decompIndex [k]] == c /*&&
1523                                                         decompLength [k] == 1*/ ||
1524                                                         decompLength [k] == 3 &&
1525                                                         decompValues [decompIndex [k] + 1] == c) {
1526                                                         arr [k - offset] = (ushort) v++;
1527                                                         if (v % 256 == 0)
1528                                                                 v += 2;
1529                                                 }
1530                                         }
1531                                 }
1532                         }
1533
1534                         // Korean
1535                         // Korean weight is somewhat complex. It first shifts
1536                         // Hangul category from 52-x to 80-x (they are anyways
1537                         // computed). CJK ideographs are placed at secondary
1538                         // weight, like XX YY 01 zz 01, where XX and YY are
1539                         // corresponding "reset" value and zz is 41,43,45...
1540                         //
1541                         // Unlike chs,cht and ja, Korean value is a combined
1542                         // ushort which is computed as category
1543                         //
1544                         category = "ko";
1545                         arr = cjkKO;
1546                         offset = 0;//char.MaxValue - arr.Length;
1547                         doc.Load (koXML);
1548                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                                     // Assumes the node right after each <reset> is the
                                     // element holding the characters to place after it.
1549                                 XmlElement sc = (XmlElement) reset.NextSibling;
1550                                 // compute "category" and "level 1" for the 
1551                                 // target "reset" Hangul syllable
1552                                 char rc = reset.InnerText [0];
                                     // ri is the 1-based position of rc counted from U+AC00;
                                     // p packs ri / 254 into the high byte and
                                     // ri % 254 + 2 into the low byte.
1553                                 int ri = ((int) rc - 0xAC00) + 1;
1554                                 ushort p = (ushort)
1555                                         ((ri / 254) * 256 + (ri % 254) + 2);
1556                                 // Place the characters after the target.
                                     // All of them share p in cjkKO and are told apart by
                                     // cjkKOlv2 values 0x41, 0x43, 0x45, ...
1557                                 s = sc.InnerText;
1558                                 v = 0x41;
1559                                 foreach (char c in s) {
1560                                         arr [(int) c - offset] = p;
1561                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1562                                         v += 2;
1563                                 }
1564                         }
1565                 }
1566
1567                 #endregion
1568
1569                 #region Generation
1570
// Populates ignorableFlags for every char value whose Unicode category is
// not OtherNotAssigned:
//   bit 1 - IsIgnorable
//   bit 2 - IsIgnorableSymbol
//   bit 4 - IsIgnorableNonSpacing
void FillIgnorables ()
{
        for (int cp = 0; cp <= char.MaxValue; cp++) {
                UnicodeCategory category = Char.GetUnicodeCategory ((char) cp);
                if (category != UnicodeCategory.OtherNotAssigned) {
                        if (IsIgnorable (cp))
                                ignorableFlags [cp] |= 1;
                        if (IsIgnorableSymbol (cp))
                                ignorableFlags [cp] |= 2;
                        if (IsIgnorableNonSpacing (cp))
                                ignorableFlags [cp] |= 4;
                }
        }
}
1585
1586                 void ModifyUnidata ()
1587                 {
                             // Applies hand-written overrides to the decomposition tables
                             // (decompType / decompIndex / decompLength / decompValues)
                             // and to the diacritical table. Statement order matters in
                             // the U+FA0C / U+F929 section near the end.
1588                         // Modify some decomposition equivalence
                             // U+FE31 and U+FE32 lose their decomposition: type, index
                             // and length are all reset to 0.
1589                         decompType [0xFE31] = 0;
1590                         decompIndex [0xFE31] = 0;
1591                         decompLength [0xFE31] = 0;
1592                         decompType [0xFE32] = 0;
1593                         decompIndex [0xFE32] = 0;
1594                         decompLength [0xFE32] = 0;
1595
1596                         // Korean parens numbers
1597                         for (int i = 0x3200; i <= 0x321C; i++)
1598                                 diacritical [i] = 0xA;
1599                         for (int i = 0x3260; i <= 0x327B; i++)
1600                                 diacritical [i] = 0xC;
1601
1602                         // LAMESPEC: these remapping should not be done.
1603                         // Windows have incorrect CJK compat mappings.
                             // Each line overwrites the first decomposition value of the
                             // indexed code point. Where decompLength is set to 1 first,
                             // the decomposition is also cut down to that single value.
                             // NOTE(review): if one of these code points had no parsed
                             // decomposition, its decompIndex would presumably be 0 and
                             // the write would land in slot 0 of decompValues - confirm
                             // that every code point listed here has an entry.
1604                         decompValues [decompIndex [0x32A9]] = 0x91AB;
1605                         decompLength [0x323B] = 1;
1606                         decompValues [decompIndex [0x323B]] = 0x5B78;
1607                         decompValues [decompIndex [0x32AB]] = 0x5B78;
1608                         decompValues [decompIndex [0x32A2]] = 0x5BEB;
1609                         decompLength [0x3238] = 1;
1610                         decompValues [decompIndex [0x3238]] = 0x52DE;
1611                         decompValues [decompIndex [0x3298]] = 0x52DE;
1612
1613                         // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
                             // Order matters: U+FA0C is first pointed at U+F929's slot,
                             // the slot is then overwritten with 0x5140, and only after
                             // that is U+F929 detached (index and length set to 0).
1614                         decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1615                         decompValues [decompIndex [0xFA0C]] = 0x5140;
1616                         decompLength [0xFA0C] = 1;
1617                         decompIndex [0xF929] = decompLength [0xF929] = 0;
1618
1619                         decompValues [decompIndex [0xF92C]] = 0x90DE;
1620                 }
1621
// Post-processes values gathered while parsing:
//   1. sets fixed diacritical weights for Cyrillic U+0496..U+04A5,
//   2. assigns secondary weights (0x38, 0x39, ...) to number characters
//      inside the ranges listed in numberSecondaryWeightBounds,
//   3. renames or drops selected entries of sortableCharNames.
void ModifyParsedValues ()
{
        // some cyrillic diacritical weight. They seem to be
        // based on old character names, so it's quicker to
        // set them directly here.
        // Layout: { first code point, weight, ... }. The weight applies to
        // the listed code point and to the one directly after it.
        int [] cyrillicWeights = new int [] {
                0x0496, 7,
                0x0498, 0x1A,
                0x049A, 0x17,
                0x049C, 9,
                0x049E, 4,
                0x04A0, 0xA,
                0x04A2, 7,
                0x04A4, 8};
        for (int k = 0; k < cyrillicWeights.Length; k += 2) {
                int first = cyrillicWeights [k];
                byte w = (byte) cyrillicWeights [k + 1];
                diacritical [first] = w;
                diacritical [first + 1] = w;
        }

        // number, secondary weights
        // The bounds array holds [start, end) pairs; each pair gets the
        // next weight, beginning with 0x38.
        byte weight = 0x38;
        int [] bounds = numberSecondaryWeightBounds;
        int pair = 0;
        while (pair < bounds.Length) {
                for (int cp = bounds [pair]; cp < bounds [pair + 1]; cp++) {
                        if (Char.IsNumber ((char) cp))
                                diacritical [cp] = weight;
                }
                pair += 2;
                weight++;
        }

        // Update name part of named characters
        int index = 0;
        while (index < sortableCharNames.Count) {
                DictionaryEntry entry =
                        (DictionaryEntry) sortableCharNames [index];
                int cp = (int) entry.Key;
                string renamed = null;
                bool remove = false;
                switch (cp) {
                case 0x2101: renamed = "A_1"; break;
                case 0x33C3: renamed = "A_2"; break;
                case 0x2105: renamed = "C_1"; break;
                case 0x2106: renamed = "C_2"; break;
                case 0x211E: renamed = "R1"; break;
                case 0x211F: renamed = "R2"; break;
                // Remove some of them!
                case 0x2103:
                case 0x2109:
                case 0x2116:
                case 0x2117:
                case 0x2118:
                case 0x2125:
                case 0x2127:
                case 0x2129:
                case 0x212E:
                case 0x2132:
                        remove = true;
                        break;
                }
                if (remove) {
                        // The next entry slides into this index, so do not
                        // advance.
                        sortableCharNames.RemoveAt (index);
                        continue;
                }
                if (renamed != null)
                        sortableCharNames [index] =
                                new DictionaryEntry (cp, renamed);
                index++;
        }
}
1677
1678                 void GenerateCore ()
1679                 {
1680                         UnicodeCategory uc;
1681
1682                         #region Specially ignored // 01
1683                         // This will raise the "Defined" flag.
1684                         // FIXME: Check if it is really fine. Actually, for
1685                         // Japanese voice marks this code does remapping.
1686                         foreach (char c in specialIgnore)
1687                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1688                         #endregion
1689
1690                         #region Extenders (FF FF)
1691                         fillIndex [0xFF] = 0xFF;
1692                         char [] specialBiggest = new char [] {
1693                                 '\u3005', '\u3031', '\u3032', '\u309D',
1694                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1695                                 '\uFE7C', '\uFE7D', '\uFF70'};
1696                         foreach (char c in specialBiggest)
1697                                 AddCharMap (c, 0xFF, 0);
1698                         #endregion
1699
1700                         #region Variable weights
1701                         // Controls : 06 03 - 06 3D
1702                         fillIndex [0x6] = 3;
1703                         for (int i = 0; i < 65536; i++) {
1704                                 if (IsIgnorable (i))
1705                                         continue;
1706                                 char c = (char) i;
1707                                 uc = Char.GetUnicodeCategory (c);
1708                                 // NEL is whitespace but not ignored here.
1709                                 if (uc == UnicodeCategory.Control &&
1710                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1711                                         AddCharMap (c, 6, 1);
1712                         }
1713
1714                         // Apostrophe 06 80
1715                         fillIndex [0x6] = 0x80;
1716                         AddCharMap ('\'', 6, 0);
1717                         AddCharMap ('\uFF07', 6, 1);
1718                         AddCharMap ('\uFE63', 6, 1);
1719
1720                         // SPECIAL CASE: fill FE32 here prior to it being added
1721                         // at 2013. Windows does not always respect NFKD.
1722                         map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1723
1724                         // Hyphen/Dash : 06 81 - 06 90
1725                         for (int i = 0; i < char.MaxValue; i++) {
1726                                 if (!IsIgnorable (i) &&
1727                                         Char.GetUnicodeCategory ((char) i) ==
1728                                         UnicodeCategory.DashPunctuation) {
1729                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1730                                         if (i == 0x2011) {
1731                                                 // SPECIAL: add 2027 and 2043
1732                                                 // Maybe they are regarded as the
1733                                                 // same kind of hyphen in "central"
1734                                                 // position.
1735                                                 AddCharMap ('\u2027', 6, 1);
1736                                                 AddCharMap ('\u2043', 6, 1);
1737                                         }
1738                                 }
1739                         }
1740                         // They are regarded as primarily equivalent to '-'
1741                         map [0x208B] = new CharMapEntry (6, 0x82, 0);
1742                         map [0x207B] = new CharMapEntry (6, 0x82, 0);
1743                         map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1744
1745                         // Arabic variable weight chars 06 A0 -
1746                         fillIndex [6] = 0xA0;
1747                         // vowels
1748                         for (int i = 0x64B; i <= 0x650; i++)
1749                                 AddArabicCharMap ((char) i);
1750                         // sukun
1751                         AddCharMapGroup ('\u0652', 6, 1, 0);
1752                         // shadda
1753                         AddCharMapGroup ('\u0651', 6, 1, 0);
1754                         #endregion
1755
1756
1757                         #region Nonspacing marks // 01
1758                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1759
1760                         // Combining diacritical marks: 01 DC -
1761
1762                         fillIndex [0x1] = 0x41;
1763                         for (int i = 0x030E; i <= 0x0326; i++)
1764                                 if (!IsIgnorable (i))
1765                                         AddCharMap ((char) i, 0x1, 1);
1766                         for (int i = 0x0329; i <= 0x0334; i++)
1767                                 if (!IsIgnorable (i))
1768                                         AddCharMap ((char) i, 0x1, 1);
1769                         fillIndex [0x1]++;
1770                         for (int i = 0x0339; i <= 0x0341; i++)
1771                                 if (!IsIgnorable (i))
1772                                         AddCharMap ((char) i, 0x1, 1);
1773                         fillIndex [0x1] = 0x74;
1774                         for (int i = 0x0346; i <= 0x0348; i++)
1775                                 if (!IsIgnorable (i))
1776                                         AddCharMap ((char) i, 0x1, 1);
1777                         for (int i = 0x02BE; i <= 0x02BF; i++)
1778                                 if (!IsIgnorable (i))
1779                                         AddCharMap ((char) i, 0x1, 1);
1780                         for (int i = 0x02C1; i <= 0x02C5; i++)
1781                                 if (!IsIgnorable (i))
1782                                         AddCharMap ((char) i, 0x1, 1);
1783                         for (int i = 0x02CE; i <= 0x02CF; i++)
1784                                 if (!IsIgnorable (i))
1785                                         AddCharMap ((char) i, 0x1, 1);
1786                         fillIndex [0x1]++;
1787                         for (int i = 0x02D1; i <= 0x02D3; i++)
1788                                 if (!IsIgnorable (i))
1789                                         AddCharMap ((char) i, 0x1, 1);
1790                         AddCharMap ('\u02DE', 0x1, 1);
1791                         for (int i = 0x02E4; i <= 0x02E9; i++)
1792                                 if (!IsIgnorable (i))
1793                                         AddCharMap ((char) i, 0x1, 1);
1794
1795                         // FIXME: needs more love here (it should eliminate
1796                         // all the hacky code above).
1797                         for (int i = 0x0300; i < 0x0370; i++)
1798                                 if (!IsIgnorable (i) && diacritical [i] != 0
1799                                         /* especiall here*/ && !map [i].Defined)
1800                                         map [i] = new CharMapEntry (
1801                                                 0x1, 0x1, diacritical [i]);
1802
1803                         // Cyrillic and Armenian nonspacing mark
1804                         fillIndex [0x1] = 0x94;
1805                         for (int i = 0x400; i < 0x580; i++)
1806                                 if (!IsIgnorable (i) &&
1807                                         Char.GetUnicodeCategory ((char) i) ==
1808                                         UnicodeCategory.NonSpacingMark)
1809                                         AddCharMap ((char) i, 1, 1);
1810
1811                         fillIndex [0x1] = 0x8D;
1812                         // syriac dotted nonspacing marks (1)
1813                         AddCharMap ('\u0740', 0x1, 1);
1814                         AddCharMap ('\u0741', 0x1, 1);
1815                         AddCharMap ('\u0742', 0x1, 1);
1816                         // syriac oblique nonspacing marks
1817                         AddCharMap ('\u0747', 0x1, 1);
1818                         AddCharMap ('\u0748', 0x1, 1);
1819                         // syriac dotted nonspacing marks (2)
1820                         fillIndex [0x1] = 0x94; // this reset is mandatory
1821                         AddCharMap ('\u0732', 0x1, 1);
1822                         AddCharMap ('\u0735', 0x1, 1);
1823                         AddCharMap ('\u0738', 0x1, 1);
1824                         AddCharMap ('\u0739', 0x1, 1);
1825                         AddCharMap ('\u073C', 0x1, 1);
1826                         // SPECIAL CASES: superscripts
1827                         AddCharMap ('\u073F', 0x1, 1);
1828                         AddCharMap ('\u0711', 0x1, 1);
1829                         // syriac "DOTS"
1830                         for (int i = 0x0743; i <= 0x0746; i++)
1831                                 AddCharMap ((char) i, 0x1, 1);
1832                         for (int i = 0x0730; i <= 0x0780; i++)
1833                                 if (!map [i].Defined &&
1834                                         Char.GetUnicodeCategory ((char) i) ==
1835                                         UnicodeCategory.NonSpacingMark)
1836                                         AddCharMap ((char) i, 0x1, 1);
1837
1838                         // LAMESPEC: It should not stop at '\u20E1'. There are
1839                         // a few more characters (which, however, result in
1840                         // overflow of level 2 unless we start before 0xDD).
1841                         fillIndex [0x1] = 0xDD;
1842                         for (int i = 0x20D0; i <= 0x20DC; i++)
1843                                 AddCharMap ((char) i, 0x1, 1);
1844                         fillIndex [0x1] = 0xEC;
1845                         for (int i = 0x20DD; i <= 0x20E1; i++)
1846                                 AddCharMap ((char) i, 0x1, 1);
1847                         fillIndex [0x1] = 0x7;
1848                         for (int i = 0x302A; i <= 0x302D; i++)
1849                                 AddCharMap ((char) i, 0x1, 1);
1850                         fillIndex [0x1] = 0x50; // I wonder how they are sorted
1851                         for (int i = 0x02D4; i <= 0x02D7; i++)
1852                                 AddCharMap ((char) i, 0x1, 1);
1853
1854                         // They are not part of Nonspacing marks, but have
1855                         // only diacritical weight.
1856                         for (int i = 0x3099; i <= 0x309C; i++)
1857                                 map [i] = new CharMapEntry (1, 1, 1);
1858                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
1859                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
1860                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1861                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1862                         for (int i = 0x30FC; i <= 0x30FE; i++)
1863                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1864
1865                         #endregion
1866
1867
1868                         #region Whitespaces // 07 03 -
1869                         fillIndex [0x7] = 0x2;
1870                         AddCharMap (' ', 0x7, 2);
1871                         AddCharMap ('\u00A0', 0x7, 1);
1872                         for (int i = 9; i <= 0xD; i++)
1873                                 AddCharMap ((char) i, 0x7, 1);
1874                         for (int i = 0x2000; i <= 0x200B; i++)
1875                                 AddCharMap ((char) i, 0x7, 1);
1876
1877                         fillIndex [0x7] = 0x17;
1878                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1879                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1880
1881                         // Characters which are used to represent layout control.
1882                         // LAMESPEC: Windows developers seem to have thought
1883                         // that those characters are a kind of whitespace,
1884                         // while they aren't.
1885                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1886                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1887                         #endregion
1888
1889                         // category 09 - continued symbols from 08
1890                         fillIndex [0x9] = 2;
1891                         // misc tech mark
1892                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1893                                 AddCharMap ((char) cp, 0x9, 1, 0);
1894
1895                         // arrows
1896                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
1897                         foreach (DictionaryEntry de in arrowValues) {
1898                                 int idx = (int) de.Value;
1899                                 int cp = (int) de.Key;
1900                                 if (map [cp].Defined)
1901                                         continue;
1902                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1903                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1904                                 arrowLv2 [idx]++;
1905                         }
1906                         // boxes
1907                         byte [] boxLv2 = new byte [128];
1908                         for (int i = 0; i < boxLv2.Length; i++)
1909                                 boxLv2 [i] = 3;
1910                         foreach (DictionaryEntry de in boxValues) {
1911                                 int cp = (int) de.Key;
1912                                 int off = (int) de.Value;
1913                                 if (map [cp].Defined)
1914                                         continue;
1915                                 if (off < 0) {
1916                                         fillIndex [0x9] = (byte) (0xE5 + off);
1917                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1918                                 }
1919                                 else {
1920                                         fillIndex [0x9] = (byte) (0xE5 + off);
1921                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1922                                 }
1923                         }
1924                         // Some special characters (slanted)
1925                         fillIndex [0x9] = 0xF4;
1926                         AddCharMap ('\u2571', 0x9, 3);
1927                         AddCharMap ('\u2572', 0x9, 3);
1928                         AddCharMap ('\u2573', 0x9, 3);
1929
1930                         // FIXME: implement 0A
1931                         #region Symbols
1932                         fillIndex [0xA] = 2;
1933                         // byte currency symbols
1934                         for (int cp = 0; cp < 0x100; cp++) {
1935                                 uc = Char.GetUnicodeCategory ((char) cp);
1936                                 if (!IsIgnorable (cp) &&
1937                                         uc == UnicodeCategory.CurrencySymbol &&
1938                                         cp != '$' ||
1939                                         cp == 0xAC)
1940                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1941                         }
1942                         // byte other symbols
1943                         for (int cp = 0; cp < 0x100; cp++) {
1944                                 if (cp == 0xA6)
1945                                         continue; // SPECIAL: skip FIXME: why?
1946                                 uc = Char.GetUnicodeCategory ((char) cp);
1947                                 if (!IsIgnorable (cp) &&
1948                                         uc == UnicodeCategory.OtherSymbol ||
1949                                         cp == '\u00B5' || cp == '\u00B7')
1950                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1951                         }
1952                         // U+30FB here
1953                         AddCharMapGroup ('\u30FB', 0xA, 1, 0);
1954
1955                         for (int cp = 0x2020; cp <= 0x2031; cp++)
1956                                 if (Char.IsPunctuation ((char) cp))
1957                                         AddCharMap ((char) cp, 0xA, 1, 0);
1958                         // SPECIAL CASES: why?
1959                         AddCharMap ('\u203B', 0xA, 1, 0);
1960                         AddCharMap ('\u2040', 0xA, 1, 0);
1961                         AddCharMap ('\u2041', 0xA, 1, 0);
1962                         AddCharMap ('\u2042', 0xA, 1, 0);
1963
1964                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1965                                 AddCharMap ((char) cp, 0xA, 1, 0);
1966                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1967                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1968                                 AddCharMap ((char) cp, 0xA, 1, 0);
1969                         // Dingbats
1970                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1971                                 if (Char.IsSymbol ((char) cp))
1972                                         AddCharMap ((char) cp, 0xA, 1, 0);
1973                         // OCR
1974                         for (int i = 0x2440; i < 0x2460; i++)
1975                                 AddCharMap ((char) i, 0xA, 1, 0);
1976
1977                         #endregion
1978
1979                         #region Numbers // 0C 02 - 0C E1
1980                         fillIndex [0xC] = 2;
1981
1982                         // 9F8 : Bengali "one less than the denominator"
1983                         AddCharMap ('\u09F8', 0xC, 1, 0x3C);
1984
1985                         ArrayList numbers = new ArrayList ();
1986                         for (int i = 0; i < 65536; i++)
1987                                 if (!IsIgnorable (i) &&
1988                                         Char.IsNumber ((char) i) &&
1989                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1990                                         numbers.Add (i);
1991
1992                         ArrayList numberValues = new ArrayList ();
1993                         foreach (int i in numbers)
1994                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1995                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1996
1997 //foreach (DictionaryEntry de in numberValues)
1998 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1999
2000                         decimal prevValue = -1;
2001                         foreach (DictionaryEntry de in numberValues) {
2002                                 int cp = (int) de.Key;
2003                                 decimal currValue = (decimal) de.Value;
2004                                 bool addnew = false;
2005                                 if (prevValue < currValue &&
2006                                         prevValue - (int) prevValue == 0 &&
2007                                         prevValue >= 1) {
2008
2009                                         addnew = true;
2010                                         // Process Hangzhou and Roman numbers
2011
2012                                         // There are some SPECIAL cases.
2013                                         if (currValue != 4) // no increment for 4
2014                                                 fillIndex [0xC]++;
2015
2016                                         int xcp;
2017                                         if (currValue <= 10) {
2018                                                 xcp = (int) prevValue + 0x2170 - 1;
2019                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2020                                                 xcp = (int) prevValue + 0x2160 - 1;
2021                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2022                                                 fillIndex [0xC] += 2;
2023                                                 xcp = (int) prevValue + 0x3021 - 1;
2024                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2025                                                 fillIndex [0xC]++;
2026                                         }
2027                                         else if (currValue == 11)
2028                                                 fillIndex [0xC]++;
2029                                 }
2030                                 if (prevValue < currValue)
2031                                         prevValue = currValue;
2032                                 if (map [cp].Defined)
2033                                         continue;
2034                                 // Hangzhou and Roman numbers are added later
2035                                 // (code is above)
2036                                 else if (0x3021 <= cp && cp < 0x302A
2037                                         || 0x2160 <= cp && cp < 0x216A
2038                                         || 0x2170 <= cp && cp < 0x217A)
2039                                         continue;
2040
2041                                 if (cp ==  0x215B) // FIXME: why?
2042                                         fillIndex [0xC] += 2;
2043                                 else if (cp == 0x3021) // FIXME: why?
2044                                         fillIndex [0xC]++;
2045                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2046                                 if (addnew || cp <= '9') {
2047                                         int mod = (int) currValue - 1;
2048                                         int xcp;
2049                                         if (1 <= currValue && currValue <= 10) {
2050                                                 xcp = mod + 0x2776;
2051                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2052                                                 xcp = mod + 0x2780;
2053                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2054                                                 xcp = mod + 0x278A;
2055                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2056                                         }
2057                                         if (1 <= currValue && currValue <= 20) {
2058                                                 xcp = mod + 0x2460;
2059                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2060                                                 xcp = mod + 0x2474;
2061                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2062                                                 xcp = mod + 0x2488;
2063                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2064                                         }
2065                                 }
2066
2067                                 if (cp != 0x09E7 && cp != 0x09EA)
2068                                         fillIndex [0xC]++;
2069
2070                                 // Add special cases that are not regarded as 
2071                                 // numbers in UnicodeCategory speak.
2072                                 if (cp == '5') {
2073                                         // TONE FIVE
2074                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2075                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2076                                 }
2077                                 else if (cp == '6') // FIXME: why?
2078                                         fillIndex [0xC]++;
2079                         }
2080
2081                         // 221E: infinity
2082                         fillIndex [0xC] = 0xFF;
2083                         AddCharMap ('\u221E', 0xC, 1);
2084                         #endregion
2085
2086                         #region Letters and NonSpacing Marks (general)
2087
2088                         // ASCII Latin alphabets
2089                         for (int i = 0; i < alphabets.Length; i++)
2090                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2091
2092                         // non-ASCII Latin alphabets
2093                         // FIXME: there are no such characters that are placed
2094                         // *after* "alphabets" array items. This is nothing
2095                         // more than a hack that creates dummy weights for
2096                         // primary characters.
2097                         for (int i = 0x0080; i < 0x0300; i++) {
2098                                 if (!Char.IsLetter ((char) i))
2099                                         continue;
2100                                 // Latin letters which have an NFKD mapping are
2101                                 // not added as independent primary characters.
2102                                 if (decompIndex [i] != 0)
2103                                         continue;
2104                                 // SPECIAL CASES:
2105                                 // 1.some alphabets have primarily
2106                                 //   equivalent ASCII alphabets.
2107                                 // 2.some have independent primary weights,
2108                                 //   but inside a-to-z range.
2109                                 // 3.there are some expanded characters that
2110                                 //   are not part of Unicode Standard NFKD.
2111                                 // 4. some characters are letter in IsLetter
2112                                 //   but not in sortkeys (maybe unicode version
2113                                 //   difference caused it).
2114                                 switch (i) {
2115                                 // 1. skipping them does not make sense
2116 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2117 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2118 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2119 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2120 //                              case 0x19B: case 0x19C:
2121                                 // 2. skipping them does not make sense
2122 //                              case 0x14A: // Ng
2123 //                              case 0x14B: // ng
2124                                 // 3.
2125                                 case 0xC6: // AE
2126                                 case 0xE6: // ae
2127                                 case 0xDE: // Icelandic Thorn
2128                                 case 0xFE: // Icelandic Thorn
2129                                 case 0xDF: // German ss
2130                                 case 0xFF: // German ss
2131                                 // 4.
2132                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2133                                 // not classified yet
2134 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2135 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2136 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2137 //                              case 0x1DD:
2138                                         continue;
2139                                 }
2140                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2141                         }
2142
2143                         // Greek and Coptic
2144                         fillIndex [0xF] = 02;
2145                         for (int i = 0x0380; i < 0x0390; i++)
2146                                 if (Char.IsLetter ((char) i))
2147                                         AddLetterMap ((char) i, 0xF, 1);
2148                         fillIndex [0xF] = 02;
2149                         for (int i = 0x0391; i < 0x03CF; i++)
2150                                 if (Char.IsLetter ((char) i))
2151                                         AddLetterMap ((char) i, 0xF, 1);
2152                         fillIndex [0xF] = 0x40;
2153                         for (int i = 0x03D0; i < 0x0400; i++)
2154                                 if (Char.IsLetter ((char) i))
2155                                         AddLetterMap ((char) i, 0xF, 1);
2156
2157                         // Cyrillic.
2158                         // Cyrillic letters are sorted like Latin letters i.e. 
2159                         // containing culture-specific letters between the
2160                         // standard Cyrillic sequence.
2161                         //
2162                         // We can't use UCA here; it has different sorting.
2163                         char [] orderedCyrillic = new char [] {
2164                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2165                                 '\u0452', // DJE for Serbocroatian
2166                                 '\u0435',
2167                                 '\u0454', // IE for Ukrainian
2168                                 '\u0436', '\u0437',
2169                                 '\u0455', // DZE
2170                                 '\u0438',
2171                                 '\u0456', // Byelorussian-Ukrainian I
2172                                 '\u0457', // YI
2173                                 '\u0439',
2174                                 '\u0458', // JE
2175                                 '\u043A', '\u043B',
2176                                 '\u0459', // LJE
2177                                 '\u043C', '\u043D',
2178                                 '\u045A', // NJE
2179                                 '\u043E',
2180                                 // 4E9 goes here.
2181                                 '\u043F', '\u0440', '\u0441', '\u0442',
2182                                 '\u045B', // TSHE for Serbocroatian
2183                                 '\u0443',
2184                                 '\u045E', // Short U for Byelorussian
2185                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2186                                 '\u0444', '\u0445', '\u0446', '\u0447',
2187                                 '\u045F', // DZHE
2188                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2189                                 '\u044D', '\u044E', '\u044F'};
2190
2191                         // For some characters, here is a map to basic Cyrillic
2192                         // letters. See UnicodeData.txt character names for
2193                         // the sources. Here I simply declare an equivalence array.
2194                         // Its entries correspond, in order, to U+490, U+492, ...
2195                         // (every second code point, skipping small letters).
2196                         char [] cymap_src = new char [] {
2197                                 '\u0433', '\u0433', '\u0433', '\u0436',
2198                                 '\u0437', '\u043A', '\u043A', '\u043A',
2199                                 '\u043A', '\u043D', '\u043D', '\u043F',
2200                                 '\u0445', '\u0441', '\u0442', '\u0443',
2201                                 '\u0443', '\u0445', '\u0446', '\u0447',
2202                                 '\u0447', '\u0432', '\u0435', '\u0435',
2203                                 '\u0406', '\u0436', '\u043A', '\u043D',
2204                                 '\u0447', '\u0435'};
2205
2206                         fillIndex [0x10] = 0x8D;
2207                         for (int i = 0x0460; i < 0x0481; i++) {
2208                                 if (Char.IsLetter ((char) i)) {
2209                                         if (i == 0x0476)
2210                                                 // U+476/477 have the same
2211                                                 // primary weight as U+474/475.
2212                                                 fillIndex [0x10] -= 3;
2213                                         AddLetterMap ((char) i, 0x10, 3);
2214                                 }
2215                         }
2216
2217                         fillIndex [0x10] = 0x6;
2218                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2219                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2220                                 if (!IsIgnorable ((int) c) &&
2221                                         Char.IsLetter (c) &&
2222                                         !map [c].Defined) {
2223                                         AddLetterMap (c, 0x10, 0);
2224                                         fillIndex [0x10] += 3;
2225                                 }
2226                         }
2227
2228                         for (int i = 0; i < cymap_src.Length; i++) {
2229                                 char c = cymap_src [i];
2230                                 fillIndex [0x10] = map [c].Level1;
2231                                 int c2 = 0x0490 + i * 2;
2232                                 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2233                         }
2234
2235                         // Armenian
2236                         fillIndex [0x11] = 0x3;
2237                         fillIndex [0x1] = 0x98;
2238                         for (int i = 0x0531; i < 0x0586; i++) {
2239                                 if (i == 0x0559 || i == 0x55A)
2240                                         AddCharMap ((char) i, 1, 1);
2241                                 if (Char.IsLetter ((char) i))
2242                                         AddLetterMap ((char) i, 0x11, 1);
2243                         }
2244
2245                         // Hebrew
2246                         // -Letters
2247                         fillIndex [0x12] = 0x2;
2248                         for (int i = 0x05D0; i < 0x05FF; i++)
2249                                 if (Char.IsLetter ((char) i))
2250                                         AddLetterMap ((char) i, 0x12, 1);
2251                         // -Accents
2252                         fillIndex [0x1] = 0x3;
2253                         for (int i = 0x0591; i <= 0x05C2; i++) {
2254                                 if (i == 0x05A3 || i == 0x05BB)
2255                                         fillIndex [0x1]++;
2256                                 if (i != 0x05BE)
2257                                         AddCharMap ((char) i, 0x1, 1);
2258                         }
2259
2260                         // Arabic
2261                         fillIndex [0x1] = 0x8E;
2262                         fillIndex [0x13] = 0x3;
2263                         for (int i = 0x0621; i <= 0x064A; i++) {
2264                                 // Abjad
2265                                 if (Char.GetUnicodeCategory ((char) i)
2266                                         != UnicodeCategory.OtherLetter) {
2267                                         // FIXME: arabic nonspacing marks are
2268                                         // in different order.
2269                                         AddCharMap ((char) i, 0x1, 1);
2270                                         continue;
2271                                 }
2272 //                              map [i] = new CharMapEntry (0x13,
2273 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2274                                 fillIndex [0x13] = 
2275                                         (byte) arabicLetterPrimaryValues [i];
2276                                 byte formDiacritical = 8; // default
2277                                 // SPECIAL CASES:
2278                                 switch (i) {
2279                                 case 0x0622: formDiacritical = 9; break;
2280                                 case 0x0623: formDiacritical = 0xA; break;
2281                                 case 0x0624: formDiacritical = 5; break;
2282                                 case 0x0625: formDiacritical = 0xB; break;
2283                                 case 0x0626: formDiacritical = 7; break;
2284                                 case 0x0649: formDiacritical = 5; break;
2285                                 case 0x064A: formDiacritical = 7; break;
2286                                 }
2287                                 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2288                         }
2289                         for (int i = 0x0670; i < 0x0673; i++)
2290                                 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2291                         fillIndex [0x13] = 0x84;
2292                         for (int i = 0x0674; i < 0x06D6; i++)
2293                                 if (Char.IsLetter ((char) i))
2294                                         AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2295
2296                         // Devanagari
2297
2298                         // FIXME: this could be fixed in more decent way
2299                         for (int i = 0x0958; i <= 0x095F; i++)
2300                                 diacritical [i] = 8;
2301
2302                         // FIXME: this does seem to be a straight codepoint mapping.
2303                         fillIndex [0x14] = 04;
2304                         for (int i = 0x0901; i < 0x0905; i++)
2305                                 if (!IsIgnorable (i))
2306                                         AddLetterMap ((char) i, 0x14, 2);
2307                         fillIndex [0x14] = 0xB;
2308                         for (int i = 0x0905; i < 0x093A; i++) {
2309                                 if (i == 0x0928)
2310                                         AddCharMap ('\u0929', 0x14, 0, 8);
2311                                 if (i == 0x0930)
2312                                         AddCharMap ('\u0931', 0x14, 0, 8);
2313                                 if (i == 0x0933)
2314                                         AddCharMap ('\u0934', 0x14, 0, 8);
2315                                 if (Char.IsLetter ((char) i))
2316                                         AddLetterMap ((char) i, 0x14, 4);
2317                                 if (i == 0x090B)
2318                                         AddCharMap ('\u0960', 0x14, 4);
2319                                 if (i == 0x090C)
2320                                         AddCharMap ('\u0961', 0x14, 4);
2321                         }
2322                         fillIndex [0x14] = 0xDA;
2323                         for (int i = 0x093E; i < 0x0945; i++)
2324                                 if (!IsIgnorable (i))
2325                                         AddLetterMap ((char) i, 0x14, 2);
2326                         fillIndex [0x14] = 0xEC;
2327                         for (int i = 0x0945; i < 0x094F; i++)
2328                                 if (!IsIgnorable (i))
2329                                         AddLetterMap ((char) i, 0x14, 2);
2330
2331                         // Bengali
2332                         // -Letters
2333                         fillIndex [0x15] = 02;
2334                         for (int i = 0x0980; i < 0x9FF; i++) {
2335                                 if (IsIgnorable (i))
2336                                         continue;
2337                                 if (i == 0x09E0)
2338                                         fillIndex [0x15] = 0x3B;
2339                                 switch (Char.GetUnicodeCategory ((char) i)) {
2340                                 case UnicodeCategory.NonSpacingMark:
2341                                 case UnicodeCategory.DecimalDigitNumber:
2342                                 case UnicodeCategory.OtherNumber:
2343                                         continue;
2344                                 }
2345                                 AddLetterMap ((char) i, 0x15, 1);
2346                         }
2347                         // -Signs
2348                         fillIndex [0x1] = 0x3;
2349                         for (int i = 0x0981; i < 0x0A00; i++)
2350                                 if (Char.GetUnicodeCategory ((char) i) ==
2351                                         UnicodeCategory.NonSpacingMark)
2352                                         AddCharMap ((char) i, 0x1, 1);
2353
2354                         // Gurmukhi. orderedGurmukhi is from UCA
2355                         // FIXME: it does not look equivalent to UCA.
2356                         fillIndex [0x16] = 04;
2357                         fillIndex [0x1] = 3;
2358                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2359                                 char c = orderedGurmukhi [i];
2360                                 if (IsIgnorable ((int) c))
2361                                         continue;
2362                                 if (IsIgnorableNonSpacing (c)) {
2363                                         AddLetterMap (c, 0x1, 1);
2364                                         continue;
2365                                 }
2366                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2367                                         '\u0A66' <= c && c <= '\u0A71')
2368                                         continue;
2369                                 // SPECIAL CASES
2370                                 byte shift = 4;
2371                                 switch (c) {
2372                                 case '\u0A33': case '\u0A36': case '\u0A16':
2373                                 case '\u0A17': case '\u0A5B': case '\u0A5E':
2374                                         shift = 0;
2375                                         break;
2376                                 }
2377                                 if (c == '\u0A3E') // Skip
2378                                         fillIndex [0x16] = 0xC0;
2379                                 AddLetterMap (c, 0x16, shift);
2380                         }
2381
2382                         // Gujarati. orderedGujarati is from UCA
2383                         fillIndex [0x17] = 0x4;
2384                         // nonspacing marks
2385                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2386                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2387                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2388                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2389                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2390                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2391                         // letters go first.
2392                         for (int i = 0; i < orderedGujarati.Length; i++) {
2393                                 // SPECIAL CASE
2394                                 char c = orderedGujarati [i];
2395                                 if (Char.IsLetter (c)) {
2396                                         // SPECIAL CASES
2397                                         if (c == '\u0AB3' || c == '\u0A32')
2398                                                 continue;
2399                                         if (c == '\u0A33') {
2400                                                 AddCharMap ('\u0A32', 0x17, 0);
2401                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2402                                                 continue;
2403                                         }
2404                                         if (c == '\u0A8B')
2405                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2406                                         AddCharMap (c, 0x17, 4);
2407
2408                                         if (c == '\u0AB9')
2409                                                 AddCharMap ('\u0AB3', 0x17, 6);
2410                                 }
2411                         }
2412                         // non-letters
2413                         byte gujaratiShift = 4;
2414                         fillIndex [0x17] = 0xC0;
2415                         for (int i = 0; i < orderedGujarati.Length; i++) {
2416                                 char c = orderedGujarati [i];
2417                                 if (fillIndex [0x17] == 0xCC)
2418                                         gujaratiShift = 3;
2419                                 if (!Char.IsLetter (c)) {
2420                                         // SPECIAL CASES
2421                                         if (c == '\u0A82')
2422                                                 AddCharMap ('\u0A81', 0x17, 2);
2423                                         if (c == '\u0AC2')
2424                                                 fillIndex [0x17]++;
2425                                         AddLetterMap (c, 0x17, gujaratiShift);
2426                                 }
2427                         }
2428
2429                         // Oriya
2430                         fillIndex [0x1] = 03;
2431                         fillIndex [0x18] = 02;
2432                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2433                                 switch (Char.GetUnicodeCategory ((char) i)) {
2434                                 case UnicodeCategory.NonSpacingMark:
2435                                 case UnicodeCategory.DecimalDigitNumber:
2436                                         AddLetterMap ((char) i, 0x1, 1);
2437                                         continue;
2438                                 }
2439                                 AddLetterMap ((char) i, 0x18, 1);
2440                         }
2441
2442                         // Tamil
2443                         fillIndex [0x19] = 2;
2444                         AddCharMap ('\u0BD7', 0x19, 0);
2445                         fillIndex [0x19] = 0xA;
2446                         // vowels
2447                         for (int i = 0x0B82; i <= 0x0B94; i++)
2448                                 if (!IsIgnorable ((char) i))
2449                                         AddCharMap ((char) i, 0x19, 2);
2450                         // special vowel
2451                         fillIndex [0x19] = 0x28;
2452                         // The array for Tamil consonants is a constant.
2453                         // Windows uses a sequence very similar to TAM from
2454                         // tamilnet, but it differs a bit in Grantha.
2455                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2456                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2457                         // combining marks
2458                         fillIndex [0x19] = 0x82;
2459                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2460                                 if (Char.GetUnicodeCategory ((char) i) ==
2461                                         UnicodeCategory.SpacingCombiningMark
2462                                         || i == 0x0BC0)
2463                                         AddLetterMap ((char) i, 0x19, 2);
2464
2465                         // Telugu
2466                         fillIndex [0x1A] = 0x4;
2467                         for (int i = 0x0C00; i < 0x0C62; i++) {
2468                                 if (i == 0x0C55 || i == 0x0C56)
2469                                         continue; // skip
2470                                 AddCharMap ((char) i, 0x1A, 3);
2471                                 char supp = (i == 0x0C0B) ? '\u0C60':
2472                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2473                                 if (supp == char.MinValue)
2474                                         continue;
2475                                 AddCharMap (supp, 0x1A, 3);
2476                         }
2477
2478                         // Kannada
2479                         fillIndex [0x1B] = 4;
2480                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2481                                 if (i == 0x0CD5 || i == 0x0CD6)
2482                                         continue; // ignore
2483                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2484                                         continue; // shift after 0xCB9
2485                                 AddCharMap ((char) i, 0x1B, 3);
2486                                 if (i == 0x0CB9) {
2487                                         // SPECIAL CASES: but why?
2488                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2489                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2490                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2491                                 }
2492                                 if (i == 0x0CB2)
2493                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2494                         }
2495                         
2496                         // Malayalam
2497                         fillIndex [0x1C] = 2;
2498                         fillIndex [0x1] = 3;
2499                         for (int i = 0x0D02; i < 0x0D61; i++) {
2500                                 // FIXME: I avoided MSCompatUnicodeTable usage
2501                                 // here (it results in recursion). So check if
2502                                 // using NonSpacingMark makes sense or not.
2503                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2504 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2505                                         AddCharMap ((char) i, 0x1C, 1);
2506                                 else if (!IsIgnorable ((char) i))
2507                                         AddCharMap ((char) i, 1, 1);
2508                         }
2509
2510                         // Thai ... note that it breaks 0x1E wall after E2B!
2511                         // Also, all Thai characters have level 2 value 3.
2512                         fillIndex [0x1E] = 2;
2513                         fillIndex [0x1] = 3;
2514                         for (int i = 0xE40; i <= 0xE44; i++)
2515                                 AddCharMap ((char) i, 0x1E, 1, 3);
2516                         for (int i = 0xE01; i < 0xE2B; i++)
2517                                 AddCharMap ((char) i, 0x1E, 6, 3);
2518                         fillIndex [0x1F] = 5;
2519                         for (int i = 0xE2B; i < 0xE30; i++)
2520                                 AddCharMap ((char) i, 0x1F, 6, 3);
2521                         fillIndex [0x1F] = 0x1E;
2522                         for (int i = 0xE30; i < 0xE3B; i++)
2523                                 AddCharMap ((char) i, 0x1F, 1, 3);
2524                         // some Thai characters remain.
2525                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2526                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2527                         foreach (char c in specialThai)
2528                                 AddCharMap (c, 0x1F, 1, 3);
2529
2530                         for (int i = 0xE00; i < 0xE80; i++)
2531                                 if (Char.GetUnicodeCategory ((char) i) ==
2532                                         UnicodeCategory.NonSpacingMark)
2533                                         AddCharMap ((char) i, 1, 1);
2534
2535                         // Lao
2536                         fillIndex [0x1F] = 2;
2537                         fillIndex [0x1] = 3;
2538                         for (int i = 0xE80; i < 0xEDF; i++) {
2539                                 if (IsIgnorable ((char) i))
2540                                         continue;
2541                                 else if (Char.IsLetter ((char) i))
2542                                         AddCharMap ((char) i, 0x1F, 1);
2543                                 else if (Char.GetUnicodeCategory ((char) i) ==
2544                                         UnicodeCategory.NonSpacingMark)
2545                                         AddCharMap ((char) i, 1, 1);
2546                         }
2547
2548                         // Georgian. orderedGeorgian is from UCA DUCET.
2549                         fillIndex [0x21] = 5;
2550                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2551                                 char c = orderedGeorgian [i];
2552                                 if (map [(int) c].Defined)
2553                                         continue;
2554                                 AddCharMap (c, 0x21, 0);
2555                                 if (c < '\u10F6')
2556                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2557                                 fillIndex [0x21] += 5;
2558                         }
2559
2560                         // Japanese Kana.
2561                         fillIndex [0x22] = 2;
2562                         int kanaOffset = 0x3041;
2563                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2564
2565                         for (int gyo = 0; gyo < 9; gyo++) {
2566                                 for (int dan = 0; dan < 5; dan++) {
2567                                         if (gyo == 7 && dan % 2 == 1) {
2568                                                 // 'ya'-gyo
2569                                                 fillIndex [0x22]++;
2570                                                 kanaOffset -= 2; // There is no space for yi and ye.
2571                                                 continue;
2572                                         }
2573                                         int cp = kanaOffset + dan * kanaLines [gyo];
2574                                         // small lines (a-gyo, ya-gyo)
2575                                         if (gyo == 0 || gyo == 7) {
2576                                                 AddKanaMap (cp, 1); // small
2577                                                 AddKanaMap (cp + 1, 1);
2578                                         }
2579                                         else
2580                                                 AddKanaMap (cp, kanaLines [gyo]);
2581                                         fillIndex [0x22]++;
2582
2583                                         if (cp == 0x30AB) {
2584                                                 // add small 'ka' (before normal one)
2585                                                 AddKanaMap (0x30F5, 1);
2586                                                 kanaOffset++;
2587                                         }
2588                                         if (cp == 0x30B1) {
2589                                                 // add small 'ke' (before normal one)
2590                                                 AddKanaMap (0x30F6, 1);
2591                                                 kanaOffset++;
2592                                         }
2593                                         if (cp == 0x3061) {
2594                                                 // add small 'Tsu' (before normal one)
2595                                                 AddKanaMap (0x3063, 1);
2596                                                 kanaOffset++;
2597                                         }
2598                                 }
2599                                 fillIndex [0x22] += 3;
2600                                 kanaOffset += 5 * kanaLines [gyo];
2601                         }
2602
2603                         // Wa-gyo is rather special, so I just add it manually.
2604                         AddLetterMap ((char) 0x308E, 0x22, 0);
2605                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2606                         AddLetterMap ((char) 0x308F, 0x22, 0);
2607                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2608                         fillIndex [0x22]++;
2609                         AddLetterMap ((char) 0x3090, 0x22, 0);
2610                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2611                         fillIndex [0x22] += 2;
2612                         // no "Wu" in Japanese.
2613                         AddLetterMap ((char) 0x3091, 0x22, 0);
2614                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2615                         fillIndex [0x22]++;
2616                         AddLetterMap ((char) 0x3092, 0x22, 0);
2617                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2618                         // Nn
2619                         fillIndex [0x22] = 0x80;
2620                         AddLetterMap ((char) 0x3093, 0x22, 0);
2621                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2622
2623                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2624                                 map [0x30A6].Level1, 3);// voiced hiragana U
2625                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2626                                 map [0x30A6].Level1, 3);// voiced katakana U
2627
2628                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2629                                 map [0x30AB].Level1, 0);// small katakana Ka
2630                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2631                                 map [0x30B1].Level1, 0);// small katakana Ke
2632                         // voiced Wa lines
2633                         for (int i = 0x30F7; i < 0x30FB; i++)
2634                                 map [i] = new CharMapEntry (map [i - 8].Category,
2635                                         map [i - 8].Level1,
2636                                         3);
2637
2638                         // JIS Japanese square chars.
2639                         fillIndex [0x22] = 0x97;
2640                         jisJapanese.Sort (JISComparer.Instance);
2641                         foreach (JISCharacter j in jisJapanese)
2642                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2643                                         AddCharMap ((char) j.CP, 0x22, 1);
2644                         // non-JIS Japanese square chars.
2645                         nonJisJapanese.Sort (NonJISComparer.Instance);
2646                         foreach (NonJISCharacter j in nonJisJapanese)
2647                                 AddCharMap ((char) j.CP, 0x22, 1);
2648
2649                         // Bopomofo
2650                         fillIndex [0x23] = 0x02;
2651                         for (int i = 0x3105; i <= 0x312C; i++)
2652                                 AddCharMap ((char) i, 0x23, 1);
2653
2654                         // Estrangela: ancient Syriac
2655                         fillIndex [0x24] = 0x0B;
2656                         // FIXME: is 0x71E really alternative form?
2657                         ArrayList syriacAlternatives = new ArrayList (
2658                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2659                         for (int i = 0x0710; i <= 0x072C; i++) {
2660                                 if (i == 0x0711) // NonSpacingMark
2661                                         continue;
2662                                 if (syriacAlternatives.Contains (i))
2663                                         continue;
2664                                 AddCharMap ((char) i, 0x24, 4);
2665                                 // FIXME: why?
2666                                 if (i == 0x721)
2667                                         fillIndex [0x24]++;
2668                         }
2669                         foreach (int cp in syriacAlternatives)
2670                                 map [cp] = new CharMapEntry (0x24,
2671                                         (byte) (map [cp - 1].Level1 + 2),
2672                                         0);
2673                         // FIXME: Syriac NonSpacingMark should go here.
2674
2675                         // Thaana
2676                         // FIXME: it turned out that it does not look like UCA
2677                         fillIndex [0x24] = 0x6E;
2678                         fillIndex [0x1] = 0xAC;
2679                         for (int i = 0; i < orderedThaana.Length; i++) {
2680                                 char c = orderedThaana [i];
2681                                 if (IsIgnorableNonSpacing ((int) c))
2682                                         AddCharMap (c, 1, 1);
2683                                 AddCharMap (c, 0x24, 2);
2684                                 if (c == '\u0782') // SPECIAL CASE: why?
2685                                         fillIndex [0x24] += 2;
2686                         }
2687                         #endregion
2688
2689                         // FIXME: Add more culture-specific letters (that are
2690                         // not supported in Windows collation) here.
2691
2692                         // Surrogate ... they are computed.
2693
2694                         #region Hangul
2695                         // Hangul.
2696                         //
2697                         // Unlike UCA, the Windows Hangul sequence mixes
2698                         // Jongseong with the Choseong sequence as well as
2699                         // Jungseong, adjusted to have the same primary weight
2700                         // for the same base character. So it is impossible to
2701                         // compute those sort keys.
2702                         //
2703                         // Here I introduce an ordered sequence of mixed
2704                         // 'commands' and 'characters' that is similar to
2705                         // LDML text:
2706                         //      - ',' increases primary weight.
2707                         //      - [A B] means a range, increasing index
2708                         //      - {A B} means a range, without increasing index
2709                         //      - '=' is no operation (it means the characters 
2710                         //        of both sides have the same weight).
2711                         //      - '>' inserts a Hangul Syllable block that 
2712                         //        contains 0x251 characters.
2713                         //      - '<' decreases the index
2714                         //      - '0'-'9' means skip count
2715                         //      - whitespaces are ignored
2716                         //
2717
2718                         string hangulSequence =
2719                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2720                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2721                         + "<{\u1113 \u1116}, \u3165,"
2722                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2723                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2724                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2725                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2726                                 + "[\u11D1 \u11D2], \u11B2,"
2727                                 + "[\u11D3 \u11D5], \u11B3,"
2728                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2729                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2730                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2731                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2732                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2733                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2734                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2735                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2736                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2737                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2738                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2739                         + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2740                                 + "\u11F1,, \u11F2,,,"
2741                                 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2742                         + "<\u114D, \u110D,,  >"
2743                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2744                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2745                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2746                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2747                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2748                                 + "[\u11F5 \u11F8]"
2749                         ;
2750
2751                         byte hangulCat = 0x52;
2752                         fillIndex [hangulCat] = 0x2;
2753
2754                         int syllableBlock = 0;
2755                         for (int n = 0; n < hangulSequence.Length; n++) {
2756                                 char c = hangulSequence [n];
2757                                 int start, end;
2758                                 if (Char.IsWhiteSpace (c))
2759                                         continue;
2760                                 switch (c) {
2761                                 case '=':
2762                                         break; // NOP
2763                                 case ',':
2764                                         IncrementSequentialIndex (ref hangulCat);
2765                                         break;
2766                                 case '<':
2767                                         if (fillIndex [hangulCat] == 2)
2768                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2769                                         fillIndex [hangulCat]--;
2770                                         break;
2771                                 case '>':
2772                                         IncrementSequentialIndex (ref hangulCat);
2773                                         for (int l = 0; l < 0x15; l++)
2774                                                 for (int v = 0; v < 0x1C; v++) {
2775                                                         AddCharMap (
2776                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2777                                                         IncrementSequentialIndex (ref hangulCat);
2778                                                 }
2779                                         syllableBlock++;
2780                                         break;
2781                                 case '[':
2782                                         start = hangulSequence [n + 1];
2783                                         end = hangulSequence [n + 3];
2784                                         for (int i = start; i <= end; i++) {
2785                                                 AddCharMap ((char) i, hangulCat, 0);
2786                                                 if (end > i)
2787                                                         IncrementSequentialIndex (ref hangulCat);
2788                                         }
2789                                         n += 4; // consumes 5 characters for this operation
2790                                         break;
2791                                 case '{':
2792                                         start = hangulSequence [n + 1];
2793                                         end = hangulSequence [n + 3];
2794                                         for (int i = start; i <= end; i++)
2795                                                 AddCharMap ((char) i, hangulCat, 0);
2796                                         n += 4; // consumes 5 characters for this operation
2797                                         break;
2798                                 default:
2799                                         AddCharMap (c, hangulCat, 0);
2800                                         break;
2801                                 }
2802                         }
2803
2804                         // Some Jamo NFKD.
2805                         for (int i = 0x3200; i < 0x3300; i++) {
2806                                 if (IsIgnorable (i) || map [i].Defined)
2807                                         continue;
2808                                 int ch = 0;
2809                                 // w/ bracket
2810                                 if (decompLength [i] == 4 &&
2811                                         decompValues [decompIndex [i]] == '(')
2812                                         ch = decompIndex [i] + 1;
2813                                 // circled
2814                                 else if (decompLength [i] == 2 &&
2815                                         decompValues [decompIndex [i] + 1] == '\u1161')
2816                                         ch = decompIndex [i];
2817                                 else if (decompLength [i] == 1)
2818                                         ch = decompIndex [i];
2819                                 else
2820                                         continue;
2821                                 ch = decompValues [ch];
2822                                 if (ch < 0x1100 || 0x1200 < ch &&
2823                                         ch < 0xAC00 || 0xD800 < ch)
2824                                         continue;
2825
2826                                 // SPECIAL CASE ?
2827                                 int offset = i < 0x3260 ? 1 : 0;
2828                                 if (0x326E <= i && i <= 0x3273)
2829                                         offset = 1;
2830
2831                                 map [i] = new CharMapEntry (map [ch].Category,
2832                                         (byte) (map [ch].Level1 + offset),
2833                                         map [ch].Level2);
2834 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2835                         }
2836
2837
2838                         #endregion
2839
2840                         // Letterlike characters and CJK compatibility square
2841                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2842                         int [] counts = new int ['Z' - 'A' + 1];
2843                         char [] namedChars = new char [sortableCharNames.Count];
2844                         int nCharNames = 0;
2845                         foreach (DictionaryEntry de in sortableCharNames) {
2846                                 counts [((string) de.Value) [0] - 'A']++;
2847                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2848                         }
2849                         nCharNames = 0; // reset
2850                         for (int a = 0; a < counts.Length; a++) {
2851                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2852                                 for (int i = 0; i < counts [a]; i++)
2853 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2854                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2855                         }
2856
2857                         // CJK unified ideograph.
2858                         byte cjkCat = 0x9E;
2859                         fillIndex [cjkCat] = 0x2;
2860                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2861                                 if (!IsIgnorable (cp))
2862                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2863                         // CJK Extensions goes here.
2864                         // LAMESPEC: With this Windows style CJK layout, it is
2865                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2866                         // 0x9FBB can never be added w/o breaking compat.
2867                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2868                                 if (!IsIgnorable (cp))
2869                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2870
2871                         // PrivateUse ... computed.
2872                         // remaining Surrogate ... computed.
2873
2874                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2875                         // non-alphanumeric ASCII except for: + - < = > '
2876                         for (int i = 0x21; i < 0x7F; i++) {
2877                                 if (Char.IsLetterOrDigit ((char) i)
2878                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2879                                         continue; // they are not added here.
2880                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2881                                 // Insert 3001 after ',' and 3002 after '.'
2882                                 if (i == 0x2C)
2883                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2884                                 else if (i == 0x2E)
2885                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2886                                 else if (i == 0x3A)
2887                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2888                         }
2889                         #endregion
2890
2891                         #region 07 - Punctuations and something else
2892                         for (int i = 0xA0; i < char.MaxValue; i++) {
2893                                 if (IsIgnorable (i))
2894                                         continue;
2895
2896                                 // FIXME: actually those reset should not be 
2897                                 // done but here I put for easy goal.
2898                                 if (i == 0x0700)
2899                                         fillIndex [0x7] = 0xE2;
2900                                 if (i == 0x2016)
2901                                         fillIndex [0x7] = 0x77;
2902
2903                                 // SPECIAL CASES:
2904                                 switch (i) {
2905                                 case 0xAB: // 08
2906                                 case 0xB7: // 0A
2907                                 case 0xBB: // 08
2908                                 case 0x02B9: // 01
2909                                 case 0x02BA: // 01
2910                                 case 0x2329: // 09
2911                                 case 0x232A: // 09
2912                                         continue;
2913                                 }
2914
2915                                 switch (Char.GetUnicodeCategory ((char) i)) {
2916                                 case UnicodeCategory.OtherPunctuation:
2917                                 case UnicodeCategory.ClosePunctuation:
2918                                 case UnicodeCategory.OpenPunctuation:
2919                                 case UnicodeCategory.ConnectorPunctuation:
2920                                 case UnicodeCategory.InitialQuotePunctuation:
2921                                 case UnicodeCategory.FinalQuotePunctuation:
2922                                 case UnicodeCategory.ModifierSymbol:
2923                                         // SPECIAL CASES: // 0xA
2924                                         if (0x2020 <= i && i <= 0x2031)
2925                                                 continue;
2926                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2927                                         break;
2928                                 default:
2929                                         if (i == 0xA6 || i == 0x1C3) // SPECIAL CASE. FIXME: why?
2930                                                 goto case UnicodeCategory.OtherPunctuation;
2931                                         break;
2932                                 }
2933                         }
2934                         // Control pictures
2935                         // FIXME: it should not need to reset level 1, but
2936                         // it's for easy goal.
2937                         fillIndex [0x7] = 0xB6;
2938                         for (int i = 0x2400; i <= 0x2421; i++)
2939                                 AddCharMap ((char) i, 0x7, 1, 0);
2940
2941                         // Actually 3008-301F and FE33-FE5D are mixed, so
2942                         // it's somewhat countable, but not as a whole. Thus
2943                         // manual remapping is quicker.
2944                         fillIndex [0x7] = 0x8D;
2945                         int [] cjkCompatMarks1 = new int [] {
2946                                 0xFE33, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C};
2947                         int [] cjkCompatMarks2 = new int [] {
2948                                 0xFE34, 0xFE3F, 0xFE40, 0xFE3D, 0xFE3E, 0xFE41,
2949                                 0xFE42, 0xFE43, 0xFE44, 0xFE3B, 0xFE3C/*FE5D*/,
2950                                 0xFE39/*FE5E*/, 0xFE3A};
2951                         for (int i = 0; i < cjkCompatMarks1.Length; i++)
2952                                 map [cjkCompatMarks1 [i]] = new CharMapEntry (
2953                                         0x7, fillIndex [0x7]++, 0);
2954                         for (int i = 0; i < cjkCompatMarks2.Length; i++) {
2955                                 map [cjkCompatMarks2 [i]] = new CharMapEntry (
2956                                         0x7, fillIndex [0x7], 0);
2957                                 fillIndex [0x7] += 2;
2958                                 switch (cjkCompatMarks2 [i]) {
2959                                 case 0xFE3C:
2960                                         map [0xFE5D] = new CharMapEntry (
2961                                                 0x7, fillIndex [0x7]++, 0);
2962                                         break;
2963                                 case 0xFE39:
2964                                         map [0xFE5D] = new CharMapEntry (
2965                                                 0x7, fillIndex [0x7]++, 0);
2966                                         break;
2967                                 }
2968                         }
2969
2970                         fillIndex [0x7] = 0x93;
2971                         for (int i = 0x3008; i <= 0x3011; i++) {
2972                                 map [i] = new CharMapEntry (0x7,
2973                                         fillIndex [0x7], 0);
2974                                 fillIndex [0x7] += 2;
2975                         }
2976                         fillIndex [0x7] += 3;
2977                         map [0x3014] = new CharMapEntry (0x7, fillIndex [0x7], 0);
2978                         fillIndex [0x7] += 3;
2979                         map [0x3015] = new CharMapEntry (0x7, fillIndex [0x7], 0);
2980                         fillIndex [0x7] += 2;
2981                         for (int i = 0x3016; i < 0x301F; i++)
2982                                 map [i] = new CharMapEntry (0x7,
2983                                         fillIndex [0x7]++, 0);
2984
2985                         #endregion
2986
2987                         // FIXME: for 07 xx we need more love.
2988
2989                         // Characters w/ diacritical marks (NFKD)
2990                         for (int i = 0; i <= char.MaxValue; i++) {
2991                                 if (map [i].Defined || IsIgnorable (i))
2992                                         continue;
2993                                 if (decompIndex [i] == 0)
2994                                         continue;
2995
2996                                 int start = decompIndex [i];
2997                                 int primaryChar = decompValues [start];
2998                                 int secondary = diacritical [i];
2999                                 bool skip = false;
3000                                 int length = decompLength [i];
3001                                 // special processing for parenthesized ones.
3002                                 if (length == 3 &&
3003                                         decompValues [start] == '(' &&
3004                                         decompValues [start + 2] == ')') {
3005                                         primaryChar = decompValues [start + 1];
3006                                         length = 1;
3007                                 }
3008
3009                                 if (map [primaryChar].Level1 == 0)
3010                                         continue;
3011
3012                                 for (int l = 1; l < length; l++) {
3013                                         int c = decompValues [start + l];
3014                                         if (map [c].Level1 != 0)
3015                                                 skip = true;
3016                                         secondary += diacritical [c];
3017                                 }
3018                                 if (skip)
3019                                         continue;
3020                                 map [i] = new CharMapEntry (
3021                                         map [primaryChar].Category,
3022                                         map [primaryChar].Level1,
3023                                         (byte) secondary);
3024                                 
3025                         }
3026
3027                         // category 08 - symbols
3028                         fillIndex [0x8] = 2;
3029                         // Here Windows mapping is not straightforward. It is
3030                         // not based on computation but seems manual sorting.
3031                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
3032                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
3033                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
3034                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
3035                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
3036                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
3037                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
3038                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
3039                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
3040                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
3041                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
3042                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
3043                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
3044
3045                         for (int cp = 0; cp < 0x2300; cp++) {
3046                                 if (cp == 0xAC) // SPECIAL CASE: skip
3047                                         continue;
3048                                 if (cp == 0x200) {
3049                                         cp = 0x2200; // skip to 2200
3050                                         fillIndex [0x8] = 0x21;
3051                                 }
3052                                 if (cp == 0x2295)
3053                                         fillIndex [0x8] = 0x3;
3054                                 if (cp == 0x22B2)
3055                                         fillIndex [0x8] = 0xB9;
3056                                 if (!map [cp].Defined &&
3057 //                                      Char.GetUnicodeCategory ((char) cp) ==
3058 //                                      UnicodeCategory.MathSymbol)
3059                                         Char.IsSymbol ((char) cp))
3060                                         AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
3061                                 // SPECIAL CASES: no idea why Windows sorts as such
3062                                 switch (cp) {
3063                                 case 0x3E:
3064                                         AddCharMap ('\u227B', 0x8, 1, 0);
3065                                         AddCharMap ('\u22B1', 0x8, 1, 0);
3066                                         break;
3067                                 case 0xB1:
3068                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
3069                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
3070                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
3071                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
3072                                         break;
3073                                 case 0xF7:
3074                                         AddCharMap ('\u01C0', 0x8, 1, 0);
3075                                         AddCharMap ('\u01C1', 0x8, 1, 0);
3076                                         AddCharMap ('\u01C2', 0x8, 1, 0);
3077                                         break;
3078                                 }
3079                         }
3080
3081                         #region Level2 adjustment
3082                         // Arabic Hamzah
3083                         diacritical [0x624] = 0x5;
3084                         diacritical [0x626] = 0x7;
3085                         diacritical [0x622] = 0x9;
3086                         diacritical [0x623] = 0xA;
3087                         diacritical [0x625] = 0xB;
3088                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
3089                         diacritical [0x64A] = 0x7; // Yaa'
3090
3091                         for (int i = 0; i < char.MaxValue; i++) {
3092                                 byte mod = 0;
3093                                 byte cat = map [i].Category;
3094                                 switch (cat) {
3095                                 case 0xE: // Latin diacritics
3096                                 case 0x22: // Japanese: circled characters
3097                                         mod = diacritical [i];
3098                                         break;
3099                                 case 0x13: // Arabic
3100                                         if (diacritical [i] == 0 && i >= 0xFE8D)
3101                                                 mod = 0x8; // default for arabic
3102                                         break;
3103                                 }
3104                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
3105                                         mod = diacritical [i];
3106                                 if (mod > 0)
3107                                         map [i] = new CharMapEntry (
3108                                                 cat, map [i].Level1, mod);
3109                         }
3110                         #endregion
3111
3112                         // FIXME: this is halfly hack but those NonSpacingMark 
3113                         // characters and still undefined are likely to
3114                         // be nonspacing.
3115                         for (int i = 0; i < char.MaxValue; i++) {
3116                                 if (map [i].Defined ||
3117                                         IsIgnorable (i))
3118                                         continue;
3119                                 switch (i) {
3120                                 // SPECIAL CASES.
3121                                 case 0x02B9:
3122                                 case 0x02BA:
3123                                         break;
3124                                 default:
3125                                         if (Char.GetUnicodeCategory ((char) i) !=
3126                                         UnicodeCategory.NonSpacingMark)
3127                                                 continue;
3128                                         break;
3129                                 }
3130                                 if (diacritical [i] != 0)
3131                                         map [i] = new CharMapEntry (1, 1, diacritical [i]);
3132                                 else
3133                                         AddCharMap ((char) i, 1, 1);
3134                         }
3135
3136                         // FIXME: this is hack but those Symbol characters
3137                         // are likely to fall into 0xA category.
3138                         for (int i = 0; i < char.MaxValue; i++)
3139                                 if (!map [i].Defined &&
3140                                         !IsIgnorable (i) &&
3141                                         Char.IsSymbol ((char) i))
3142                                         AddCharMap ((char) i, 0xA, 1);
3143                 }
3144
// Advances the fill index of the category referred to by hangulCat
// by one. When the index comes back to zero (overflow), the caller's
// category variable is moved to the next category and that
// category's fill index is restarted at 2.
private void IncrementSequentialIndex (ref byte hangulCat)
{
        fillIndex [hangulCat]++;
        if (fillIndex [hangulCat] != 0)
                return; // no overflow: keep filling the same category

        // overflown: continue in the next category.
        hangulCat++;
        fillIndex [hangulCat] = 0x2;
}
3153
// Sets the fill index of the category to alphaWeight and maps c via
// AddLetterMap(). Every code point listed for c in latinMap (if any)
// is then mapped through AddLetterMap() as well.
private void AddAlphaMap (char c, byte category, byte alphaWeight)
{
        fillIndex [category] = alphaWeight;
        AddLetterMap (c, category, 0);

        ArrayList related = latinMap [c] as ArrayList;
        if (related != null) {
                foreach (int codePoint in related)
                        AddLetterMap ((char) codePoint, category, 0);
        }
}
3167
// Maps `voices` consecutive characters starting at code point i,
// together with the characters 0x60 above them, into category 0x22.
// The first character of the run gets level2 0; the character at
// offset n > 0 gets level2 n + 2.
private void AddKanaMap (int i, byte voices)
{
        for (int offset = 0; offset < voices; offset++) {
                byte level2 = 0;
                if (offset > 0)
                        level2 = (byte) (offset + 2);

                char hiragana = (char) (i + offset);
                // Hiragana
                AddLetterMapCore (hiragana, 0x22, 0, level2, false);
                // Katakana
                AddLetterMapCore ((char) (hiragana + 0x60), 0x22, 0, level2, false);
        }
}
3179
// Shorthand for AddLetterMapCore() with level2 = 0 and
// deferLevel2 = true.
private void AddLetterMap (char c, byte category, byte updateCount)
{
        const byte level2 = 0;
        const bool deferLevel2 = true;
        AddLetterMapCore (c, category, updateCount, level2, deferLevel2);
}
3184
// Maps a letter and its related forms into the given category:
//      1. the small form from ToSmallForm(), when it differs from c,
//         is added as a group using updateCount;
//      2. the invariant-culture lower case form, when it differs
//         from c and is not defined yet, is processed recursively
//         with an update count of 0;
//      3. c itself is added as a group, after which the fill index
//         of the category advances by updateCount. This step is
//         skipped when c is ignorable or already defined.
// level2 and deferLevel2 are handed to AddCharMapGroup() unchanged.
private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
        // <small> updates index
        char smallForm = ToSmallForm (c);
        if (smallForm != c)
                AddCharMapGroup (smallForm, category, updateCount, level2, deferLevel2);

        char lowerForm = Char.ToLower (c, CultureInfo.InvariantCulture);
        if (lowerForm != c && !map [(int) lowerForm].Defined)
                AddLetterMapCore (lowerForm, category, 0, level2, deferLevel2);

        // Nothing more to do for ignorable or already mapped characters.
        if (IsIgnorable ((int) c) || map [(int) c].Defined)
                return;

        AddCharMapGroup (c, category, 0, level2, deferLevel2);
        fillIndex [category] += updateCount;
}
3203
// Shorthand for the four-argument AddCharMap() with alt = 0.
// Returns that overload's result.
private bool AddCharMap (char c, byte category, byte increment)
{
        const byte alt = 0;
        return AddCharMap (c, category, increment, alt);
}
3208                 
// Creates the map entry for c unless c is ignorable or already
// defined, then advances the fill index of the category by
// `increment`.
//
// For category 1 the current fill index is stored as the third
// CharMapEntry argument and alt as the second; for every other
// category it is the other way round.
//
// Returns true when an entry was created, false when nothing was done.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
        int cp = (int) c;
        if (IsIgnorable (cp) || map [cp].Defined)
                return false; // do nothing

        byte current = fillIndex [category];
        byte second, third;
        if (category == 1) {
                second = alt;
                third = current;
        } else {
                second = current;
                third = alt;
        }

        map [cp] = new CharMapEntry (category, second, third);
        fillIndex [category] += increment;
        return true;
}
3219
//
// AddCharMapGroup() adds the members of a character group to the
// table in the order below ('+' marks where the weight increases):
//      (<small> +)
//      the character itself
//      <fraction>
//      <full> | <super> | <sub>
//      <circle> | <wide> (| <narrow>)
//      +
//      (<vertical> +)
//
// level2 is fixed (does not increase).
//
// The decomposition kinds listed here are the ones added right after
// the character itself, without increasing the weight in between.
int [] sameWeightItems = new int [] {
        DecompositionFraction,
        DecompositionFull, DecompositionSuper, DecompositionSub,
        DecompositionCircle, DecompositionWide, DecompositionNarrow,
        };
// Shorthand for the five-argument AddCharMapGroup() with
// deferLevel2 = false, i.e. the given level2 is used as passed.
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
        const bool deferLevel2 = false;
        AddCharMapGroup (c, category, updateCount, level2, deferLevel2);
}
3245
// Adds c and the related forms recorded for it in nfkdMap to the
// given category. Does nothing when c is already defined.
//
// Order of registration:
//      - the <small> form, advancing the fill index by updateCount;
//      - c itself;
//      - each form listed in sameWeightItems, at the same index;
//      - the fill index is advanced by updateCount;
//      - the <vertical> form, advancing the index by updateCount again.
//
// When deferLevel2 is set, the level2 argument is replaced by values
// from the diacritical table: first the one for c, then (only while
// level2 is still 0) the one for the small / vertical form, and
// unconditionally the one for each same-weight form.
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
        int cp = (int) c;
        if (map [cp].Defined)
                return;

        if (deferLevel2)
                level2 = diacritical [cp];

        Hashtable nfkd = (Hashtable) nfkdMap [cp];
        char small = GetNfkdFormOrMinValue (nfkd, DecompositionSmall);
        char vertical = GetNfkdFormOrMinValue (nfkd, DecompositionVertical);

        // <small> updates index
        if (small != char.MinValue) {
                if (deferLevel2 && level2 == 0)
                        level2 = diacritical [small];
                AddCharMap (small, category, updateCount, level2);
        }

        // itself
        AddCharMap (c, category, 0, level2);

        // forms that share the weight of the character itself
        if (nfkd != null) {
                for (int k = 0; k < sameWeightItems.Length; k++) {
                        object form = nfkd [(byte) sameWeightItems [k]];
                        if (form == null)
                                continue;
                        int formCp = (int) form;
                        if (deferLevel2)
                                level2 = diacritical [formCp];
                        AddCharMap ((char) formCp, category, 0, level2);
                }
        }

        // update index here.
        fillIndex [category] += updateCount;

        if (vertical != char.MinValue) {
                if (deferLevel2 && level2 == 0)
                        level2 = diacritical [vertical];
                AddCharMap (vertical, category, updateCount, level2);
        }
}

// Returns the character stored in nfkd under the given decomposition
// kind (keys are boxed bytes, values boxed ints), or char.MinValue
// when nfkd is null or holds no entry for that kind.
private static char GetNfkdFormOrMinValue (Hashtable nfkd, int kind)
{
        if (nfkd == null)
                return char.MinValue;
        object form = nfkd [(byte) kind];
        if (form == null)
                return char.MinValue;
        return (char) ((int) form);
}
3296
// Adds a single CJK character under category and then calls
// IncrementSequentialIndex (), which may change category (it is
// passed by reference).
private void AddCharMapCJK (char c, ref byte category)
{
        AddCharMap (c, category, 0, 0);
        IncrementSequentialIndex (ref category);

        // Special: Windows skips 9E F9 (reason unknown), so step once
        // more whenever the index has just landed on that slot.
        bool onSkippedSlot = category == 0x9E && fillIndex [category] == 0xF9;
        if (onSkippedSlot)
                IncrementSequentialIndex (ref category);
}
3306
// Adds CJK character c, followed by the characters recorded for it in
// nfkdMap (decomposition kinds 0 through 0x12, in that order).
private void AddCharMapGroupCJK (char c, ref byte category)
{
        AddCharMapCJK (c, ref category);

        // LAMESPEC: on Windows some of the CJK characters in 3200-32B0
        // are incorrectly mapped; Chinese and Japanese Kanji are mixed
        // when ordering them. Those characters are placed by hand right
        // after the ideograph Windows sorts them with.
        switch (c) {
        case '\u5B78':
                AddCharMapCJK ('\u32AB', ref category);
                AddCharMapCJK ('\u323B', ref category);
                break;
        case '\u52DE':
                AddCharMapCJK ('\u3298', ref category);
                AddCharMapCJK ('\u3238', ref category);
                break;
        case '\u5BEB':
                AddCharMapCJK ('\u32A2', ref category);
                break;
        case '\u91AB':
                // Especially this mapping order does not make any
                // sense, but it is what Windows does.
                AddCharMapCJK ('\u32A9', ref category);
                break;
        }

        Hashtable related = (Hashtable) nfkdMap [(int) c];
        if (related == null)
                return;

        // The table is keyed by boxed byte, so the loop counter has to
        // stay a byte.
        for (byte kind = 0; kind <= 0x12; kind++) {
                object boxed = related [kind];
                if (boxed == null)
                        continue;
                int cp = (int) boxed;

                // Special: they are ignored in this area.
                // FIXME: check if it is sane
                bool inSkippedArea = 0xF900 <= cp && cp <= 0xFAD9;

                // Already placed by hand above.
                bool placedByHand =
                        cp == 0x32A2 || cp == 0x3298 || cp == 0x3238 ||
                        cp == 0x32A9 || cp == 0x323B || cp == 0x32AB;

                if (inSkippedArea || placedByHand)
                        continue;

                AddCharMapCJK ((char) cp, ref category);
        }
}
3353
3354                 // For now it is only for 0x7 category.
// Adds, in this order and each with updateCount: the <small> form of c,
// c itself, every code point whose one-element decomposition of type
// DecompositionCompat is c, and finally the <vertical> form of c.
// The <small> form goes through the three-argument AddCharMap (), i.e.
// without level2.
private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
{
        // char.MinValue stands for "no such form".
        char smallForm = char.MinValue;
        char verticalForm = char.MinValue;
        Hashtable related = (Hashtable) nfkdMap [(int) c];
        if (related != null) {
                object boxedSmall = related [(byte) DecompositionSmall];
                if (boxedSmall != null)
                        smallForm = (char) ((int) boxedSmall);
                object boxedVertical = related [(byte) DecompositionVertical];
                if (boxedVertical != null)
                        verticalForm = (char) ((int) boxedVertical);
        }

        // <small> updates index.
        // SPECIAL CASE: U+2024 is excluded (FIXME: why?)
        if (smallForm != char.MinValue && smallForm != '\u2024')
                AddCharMap (smallForm, category, updateCount);

        // itself
        AddCharMap (c, category, updateCount, level2);

        // nfkdMap cannot hold two or more characters that decompose to
        // the same character, hence the scan over all code points.
        for (int cp = 0; cp < char.MaxValue; cp++) {
                if (decompLength [cp] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [cp]]) != (int) c)
                        continue;
                if (decompType [cp] == DecompositionCompat)
                        AddCharMap ((char) cp, category, updateCount, level2);
        }

        // SPECIAL CASE: U+FE33 and U+FE34 are excluded (FIXME: why?)
        if (verticalForm != char.MinValue
                && verticalForm != '\uFE33'
                && verticalForm != '\uFE34')
                AddCharMap (verticalForm, category, updateCount, level2);
}
3396
// Adds c under category 6 together with every code point whose
// decomposition sequence ends with c, then advances
// fillIndex [6] by one.
private void AddArabicCharMap (char c)
{
        byte arabicCategory = 6;
        byte step = 1;
        byte plainLevel2 = 0;

        // itself
        AddCharMap (c, arabicCategory, 0, plainLevel2);

        // nfkdMap cannot hold two or more characters that decompose to
        // the same character, hence the scan over all code points.
        for (int cp = 0; cp < char.MaxValue; cp++) {
                int length = decompLength [cp];
                if (length == 0)
                        continue;
                // Only the last element of the sequence is compared.
                int last = decompIndex [cp] + length - 1;
                if ((int) (decompValues [last]) == (int) c)
                        AddCharMap ((char) cp, arabicCategory, 0, plainLevel2);
        }

        fillIndex [arabicCategory] += step;
}
3418
// Returns the head of the decomposition of c when c has a <small>
// decomposition, otherwise c itself.
char ToSmallForm (char c)
{
        const bool useTail = false;
        return ToDecomposed (c, DecompositionSmall, useTail);
}
3423
// Returns c unchanged unless its decomposition type equals d. In that
// case the first element of its decomposition sequence is returned,
// or the last element when tail is set.
char ToDecomposed (char c, byte d, bool tail)
{
        int cp = (int) c;
        if (decompType [cp] == d) {
                int offset = tail ? decompLength [cp] - 1 : 0;
                return (char) decompValues [decompIndex [cp] + offset];
        }
        return c;
}
3433
// Tells whether any entry of jisJapanese has cp as its CP value
// (linear search).
bool ExistsJIS (int cp)
{
        bool found = false;
        foreach (JISCharacter entry in jisJapanese) {
                if (entry.CP == cp) {
                        found = true;
                        break;
                }
        }
        return found;
}
3441
3442                 #endregion
3443
3444                 #region Level 3 properties (Case/Width)
3445
// Level 3 weight as stored in the sort key: the raw weight plus 2,
// except that a raw weight of 0 stays 0.
private byte ComputeLevel3Weight (char c)
{
        byte raw = ComputeLevel3WeightRaw (c);
        if (raw == 0)
                return raw;
        return (byte) (raw + 2);
}
3451
// Computes the raw level 3 (case/width) weight of c. The value used
// in a sort key is this result plus 2 when it is non-zero; that
// adjustment is done by ComputeLevel3Weight ().
//
// The tests run top to bottom and the first match wins: fixed
// ranges and single characters return immediately, anything else
// ends up in the flag accumulation at the bottom.
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
        // CJK compat
        if ('\u3192' <= c && c <= '\u319F')
                return 0;

        // They have <narrow> NFKD mapping, and on Windows
        // those narrow characters are regarded as "normal",
        // thus those characters themselves are regarded as
        // "wide". grep "<narrow>" and you can pick them up
        // (ignoring Kana, Hangul etc.)
        switch (c) {
        case '\u3002':
        case '\u300C':
        case '\u300D':
        case '\u3001':
        case '\u30FB':
        case '\u2502':
        case '\u2190':
        case '\u2191':
        case '\u2192':
        case '\u2193':
        case '\u25A0':
        case '\u25CB':
                return 1;
        }
        // Korean
        if ('\u11A8' <= c && c <= '\u11F9')
                return 2;
        if ('\uFFA0' <= c && c <= '\uFFDC')
                return 4;
        if ('\u3130' <= c && c <= '\u3164')
                return 5;
        if ('\u3165' <= c && c <= '\u318E')
                return 4;
        // Georgian Capital letters
        if ('\u10A0' <= c && c <= '\u10C5')
                return 0x10;
        // numbers
        if ('\u2776' <= c && c <= '\u277F')
                return 4;
        if ('\u2780' <= c && c <= '\u2789')
                return 8;
        // The lower bound used to be written as U+2776, which
        // overlapped both ranges above and gave the right answer only
        // because they are tested first. U+278A is the first code
        // point that actually reached this test.
        if ('\u278A' <= c && c <= '\u2793')
                return 0xC;
        if ('\u2160' <= c && c <= '\u216F')
                return 0x10;
        if ('\u2181' <= c && c <= '\u2182')
                return 0x18;
        // Arabic
        if ('\u2135' <= c && c <= '\u2138')
                return 4;
        if ('\uFE80' <= c && c < '\uFF00') {
                // 2(Isolated)/8(Final)/0x18(Medial)
                // Any other decomposition type in this range falls
                // through to the tests below.
                switch (decompType [(int) c]) {
                case DecompositionIsolated:
                        return 2;
                case DecompositionFinal:
                        return 8;
                case DecompositionMedial:
                        return 0x18;
                }
        }

        // actually I dunno the reason why they have weights.
        switch (c) {
        case '\u01BC':
                return 0x10;
        case '\u06A9':
                return 0x20;
        case '\u06AA':
                return 0x28;
        // Gurmukhi
        case '\u0A39':
        case '\u0A59':
        case '\u0A5A':
        case '\u0A5B':
        case '\u0A5E':
                return 0x10;
        }

        // From here on the weight is assembled by OR-ing flags.
        byte ret = 0;
        switch (c) {
        case '\u03C2':
        case '\u2104':
        case '\u212B':
                ret |= 8;
                break;
        case '\uFE42':
                ret |= 0xC;
                break;
        }

        // misc: for these three kinds the decomposition type value
        // itself is the flag.
        switch (decompType [(int) c]) {
        case DecompositionWide: // <wide>
        case DecompositionSub: // <sub>
        case DecompositionSuper: // <super>
                ret |= decompType [(int) c];
                break;
        }
        if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
                ret |= 8;
        if (isUppercase [(int) c]) // DerivedCoreProperties
                ret |= 0x10;

        return ret;
}
3560
3561                 #endregion
3562
3563                 #region IsIgnorable
3564 /*
3565                 static bool IsIgnorable (int i)
3566                 {
3567                         if (unicodeAge [i] >= 3.1)
3568                                 return true;
3569                         switch (char.GetUnicodeCategory ((char) i)) {
3570                         case UnicodeCategory.OtherNotAssigned:
3571                         case UnicodeCategory.Format:
3572                                 return true;
3573                         }
3574                         return false;
3575                 }
3576 */
3577
3578                 // FIXME: In the future use DerivedAge.txt to examine character
3579                 // versions and set those ones that have higher version than
3580                 // 1.0 as ignorable.
static bool IsIgnorable (int i)
{
        // Decision order: explicit per-code-point answers, then the
        // range table below, then the Unicode category.
        switch (i) {
        case 0:
        // I guess, those characters are added between
        // Unicode 1.0 (LCMapString) and Unicode 3.1
        // (UnicodeCategory), so they used to be
        // something like OtherNotAssigned as of Unicode 1.1.
        case 0x2df: case 0x387:
        case 0x3d7: case 0x3d8: case 0x3d9:
        case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
        case 0x400: case 0x40d: case 0x450: case 0x45d:
        case 0x587: case 0x58a: case 0x5c4: case 0x640:
        case 0x653: case 0x654: case 0x655: case 0x66d:
        case 0xb56:
        case 0x1e9b: case 0x202f: case 0x20ad:
        case 0x20ae: case 0x20af:
        case 0x20e2: case 0x20e3:
        case 0x2139: case 0x213a: case 0x2183:
        case 0x2425: case 0x2426: case 0x2619:
        case 0x2670: case 0x2671: case 0x3007:
        case 0x3190: case 0x3191:
        case 0xfffc: case 0xfffd:
                return true;
        // Exceptions to the ranges in the table: these code points
        // lie inside (or next to) an ignorable range but must not be
        // ignored, so they are answered before the table is consulted.
        case 0x4d8: case 0x4d9:
        case 0x4e8: case 0x4e9:
        case 0x70F:
        case 0x3036: case 0x303f:
        case 0x337b: case 0xfb1e:
                return false;
        }

        // ignorableRanges holds inclusive (first, last) pairs.
        for (int n = 0; n < ignorableRanges.Length; n += 2) {
                if (ignorableRanges [n] <= i && i <= ignorableRanges [n + 1])
                        return true;
        }

        // Format and OtherNotAssigned are ignored by nature; every
        // other category (PrivateUse and Surrogate included) is not.
        UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
        return uc == UnicodeCategory.Format
                || uc == UnicodeCategory.OtherNotAssigned;
}

// Inclusive code point ranges that IsIgnorable () reports as ignorable,
// stored as consecutive (first, last) pairs.
static readonly int [] ignorableRanges = new int [] {
        // The whole Sinhala characters.
        0x0D82, 0x0DF4,
        // The whole Tibetan characters.
        0x0F00, 0x0FD1,
        // The whole Myanmar characters.
        0x1000, 0x1059,
        // The whole Ethiopic, Cherokee, Canadian Syllabic, Ogham,
        // Runic, Tagalog, Hanunoo, Philippine, Buhid, Tagbanwa,
        // Khmer and Mongolian characters.
        0x1200, 0x1DFF,
        // Greek extension characters.
        0x1F00, 0x1FFF,
        // The whole Braille characters.
        0x2800, 0x28FF,
        // CJK radical characters.
        0x2E80, 0x2EF3,
        // Kangxi radical characters.
        0x2F00, 0x2FD5,
        // Ideographic description characters.
        0x2FF0, 0x2FFB,
        // Bopomofo letter and final
        0x31A0, 0x31B7,
        // White square with quadrant characters.
        0x25F0, 0x25F7,
        // Ideographic telegraph symbols.
        0x32C0, 0x32CB,
        0x3358, 0x3370,
        0x33E0, 0x33FF,
        // The whole YI characters.
        0xA000, 0xA48C,
        0xA490, 0xA4C6,
        // Armenian small ligatures
        0xFB13, 0xFB17,
        // hebrew, arabic, variation selector.
        0xFB1D, 0xFE2F,
        // Arabic ligatures.
        0xFEF5, 0xFEFC,
        // FIXME: why are they excluded?
        0x01F6, 0x01F9,
        0x0218, 0x0233,
        0x02A9, 0x02AD,
        0x02EA, 0x02EE,
        0x0349, 0x036F,
        0x0488, 0x048F,
        0x04D0, 0x04FF,
        0x0500, 0x050F, // actually it matters only for 2.0
        0x06D6, 0x06ED,
        0x06FA, 0x06FE,
        0x2048, 0x204D,
        0x20e4, 0x20ea,
        0x213C, 0x214B,
        0x21EB, 0x21FF,
        0x22F2, 0x22FF,
        0x237B, 0x239A,
        0x239B, 0x23CF,
        0x24EB, 0x24FF,
        0x2596, 0x259F,
        0x25F8, 0x25FF,
        0x2672, 0x2689,
        0x2768, 0x2775,
        0x27d0, 0x27ff,
        0x2900, 0x2aff,
        0x3033, 0x303F,
        0x31F0, 0x31FF,
        0x3250, 0x325F,
        0x32B1, 0x32BF,
        0x3371, 0x337B,
        0xFA30, 0xFA6A,
};
3705
3706                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3707
3708                 /*
3709                 public static void Main ()
3710                 {
3711                         for (int i = 0; i <= char.MaxValue; i++)
3712                                 Dump (i, IsIgnorable (i));
3713                 }
3714
3715                 static void Dump (int i, bool ignore)
3716                 {
3717                         switch (Char.GetUnicodeCategory ((char) i)) {
3718                         case UnicodeCategory.PrivateUse:
3719                         case UnicodeCategory.Surrogate:
3720                                 return; // check nothing
3721                         }
3722
3723                         string s1 = "";
3724                         string s2 = new string ((char) i, 10);
3725                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3726                         if ((ret == 0) == ignore)
3727                                 return;
3728                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3729                 }
3730                 */
3731                 #endregion // IsIgnorable
3732
3733                 #region IsIgnorableSymbol
// Decides whether code point i counts as an ignorable symbol. Everything
// IsIgnorable () accepts is accepted here too; after that come
// explicit per-code-point answers, a few ranges, and finally rules
// based on the Unicode category and the Char.Is* predicates.
static bool IsIgnorableSymbol (int i)
{
        if (IsIgnorable (i))
                return true;

        switch (i) {
        // *Letter
        case 0x00b5: case 0x01C0: case 0x01C1:
        case 0x01C2: case 0x01C3: case 0x01F6:
        case 0x01F7: case 0x01F8: case 0x01F9:
        case 0x02D0: case 0x02EE: case 0x037A:
        case 0x03D7: case 0x03F3:
        case 0x0400: case 0x040d:
        case 0x0450: case 0x045d:
        case 0x048C: case 0x048D:
        case 0x048E: case 0x048F:
        case 0x0587: case 0x0640: case 0x06E5:
        case 0x06E6: case 0x06FA: case 0x06FB:
        case 0x06FC: case 0x093D: case 0x0950:
        case 0x1E9B: case 0x2139: case 0x3006:
        case 0x3033: case 0x3034: case 0x3035:
        case 0xFE7E: case 0xFE7F:
        // OtherNumber
        case 0x16EE: case 0x16EF: case 0x16F0:
        // LetterNumber
        case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
        case 0x3007: // IDEOGRAPHIC NUMBER ZERO
        case 0x3038: // HANGZHOU NUMERAL TEN
        case 0x3039: // HANGZHOU NUMERAL TWENTY
        case 0x303a: // HANGZHOU NUMERAL THIRTY
        // OtherSymbol
        case 0x2117:
        case 0x327F:
                return true;
        // ModifierSymbol
        case 0x02B9: case 0x02BA: case 0x02C2:
        case 0x02C3: case 0x02C4: case 0x02C5:
        case 0x02C8: case 0x02CC: case 0x02CD:
        case 0x02CE: case 0x02CF: case 0x02D2:
        case 0x02D3: case 0x02D4: case 0x02D5:
        case 0x02D6: case 0x02D7: case 0x02DE:
        case 0x02E5: case 0x02E6: case 0x02E7:
        case 0x02E8: case 0x02E9:
        case 0x309B: case 0x309C:
        // OtherPunctuation
        case 0x055A: // Armenian apostrophe
        case 0x05C0: // Hebrew Punct
        case 0x0E4F: // Thai FONGMAN
        case 0x0E5A: // Thai ANGKHANKHU
        case 0x0E5B: // Thai KHOMUT
        // CurrencySymbol
        case 0x09F2: // Bengali Rupee Mark
        case 0x09F3: // Bengali Rupee Sign
        // MathSymbol
        case 0x221e: // INF.
        // OtherSymbol
        case 0x0482:
        case 0x09FA:
        case 0x0B70:
                return false;
        }

        // *Letter
        if (0xFE70 <= i && i < 0xFE7C) // ARABIC LIGATURES B
                return true;
#if NET_2_0
        if (0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
                || 0xFA30 <= i && i < 0xFA70) // CJK COMPAT
                return true;
#endif

        char ch = (char) i;
        switch (Char.GetUnicodeCategory (ch)) {
        case UnicodeCategory.Surrogate:
                return false; // inconsistent

        case UnicodeCategory.SpacingCombiningMark:
        case UnicodeCategory.EnclosingMark:
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.PrivateUse:
                // Only the Arabic NonSpacingMark range is ignorable.
                return 0x064B <= i && i <= 0x0652;

        case UnicodeCategory.Format:
        case UnicodeCategory.OtherNotAssigned:
                return true;
        }

        // Every remaining category.

        // This "Digit" rule is mystery.
        // It filters some symbols out.
        if (Char.IsLetterOrDigit (ch))
                return false;

        // OtherSymbols that are kept (not ignored) when they are not
        // letters or digits.
        bool keptSymbolBlock =
                // latin in a circle
                0x249A <= i && i <= 0x24E9
                || 0x2100 <= i && i <= 0x2132
                // Japanese
                || 0x3196 <= i && i <= 0x31A0
                // Korean
                || 0x3200 <= i && i <= 0x321C
                // Chinese/Japanese
                || 0x322A <= i && i <= 0x3243
                // CJK
                || 0x3260 <= i && i <= 0x32B0
                || 0x32D0 <= i && i <= 0x3357
                || 0x337B <= i && i <= 0x33DD;
        if (keptSymbolBlock)
                return false;

        if (Char.IsNumber (ch))
                return false;

        // FIXME: should check more
        return Char.IsControl (ch)
                || Char.IsSeparator (ch)
                || Char.IsPunctuation (ch)
                || Char.IsSymbol (ch);
}
3862
3863                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3864 /*
3865                 public static void Main ()
3866                 {
3867                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3868                         for (int i = 0; i <= char.MaxValue; i++) {
3869                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3870                                 if (uc == UnicodeCategory.Surrogate)
3871                                         continue;
3872
3873                                 bool ret = IsIgnorableSymbol (i);
3874
3875                                 string s1 = "TEST ";
3876                                 string s2 = "TEST " + (char) i;
3877
3878                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3879
3880                                 if (ret != (result == 0))
3881                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3882                                                 ret ? "should not ignore" :
3883                                                         "should ignore",
3884                                                 i,(char) i, uc);
3885                         }
3886                 }
3887 */
3888                 #endregion
3889
3890                 #region NonSpacing
3891                 static bool IsIgnorableNonSpacing (int i)
3892                 {
3893                         if (IsIgnorable (i))
3894                                 return true;
3895
3896                         switch (i) {
3897                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3898                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3899                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3900                                 return true;
3901                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3902                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3903                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3904                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3905                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3906                         case 0x0CCD: case 0x0E4E:
3907                                 return false;
3908                         }
3909
3910                         if (0x02b9 <= i && i <= 0x02c5
3911                                 || 0x02cc <= i && i <= 0x02d7
3912                                 || 0x02e4 <= i && i <= 0x02ef
3913                                 || 0x20DD <= i && i <= 0x20E0
3914                         )
3915                                 return true;
3916
3917                         if (0x064B <= i && i <= 0x00652
3918                                 || 0x0941 <= i && i <= 0x0948
3919                                 || 0x0AC1 <= i && i <= 0x0ACD
3920                                 || 0x0C3E <= i && i <= 0x0C4F
3921                                 || 0x0E31 <= i && i <= 0x0E3F
3922                         )
3923                                 return false;
3924
3925                         return Char.GetUnicodeCategory ((char) i) ==
3926                                 UnicodeCategory.NonSpacingMark;
3927                 }
3928
3929                 // We can reuse IsIgnorableSymbol testcode 
3930                 // for IsIgnorableNonSpacing.
3931                 #endregion
3932         }
3933
3934         struct CharMapEntry
3935         {
3936                 public byte Category;
3937                 public byte Level1;
3938                 public byte Level2; // It is always single byte.
3939                 public bool Defined;
3940
3941                 public CharMapEntry (byte category, byte level1, byte level2)
3942                 {
3943                         Category = category;
3944                         Level1 = level1;
3945                         Level2 = level2;
3946                         Defined = true;
3947                 }
3948         }
3949
3950         class JISCharacter
3951         {
3952                 public readonly int CP;
3953                 public readonly int JIS;
3954
3955                 public JISCharacter (int cp, int cpJIS)
3956                 {
3957                         CP = cp;
3958                         JIS = cpJIS;
3959                 }
3960         }
3961
3962         class JISComparer : IComparer
3963         {
3964                 public static readonly JISComparer Instance =
3965                         new JISComparer ();
3966
3967                 public int Compare (object o1, object o2)
3968                 {
3969                         JISCharacter j1 = (JISCharacter) o1;
3970                         JISCharacter j2 = (JISCharacter) o2;
3971                         return j1.JIS - j2.JIS;
3972                 }
3973         }
3974
3975         class NonJISCharacter
3976         {
3977                 public readonly int CP;
3978                 public readonly string Name;
3979
3980                 public NonJISCharacter (int cp, string name)
3981                 {
3982                         CP = cp;
3983                         Name = name;
3984                 }
3985         }
3986
3987         class NonJISComparer : IComparer
3988         {
3989                 public static readonly NonJISComparer Instance =
3990                         new NonJISComparer ();
3991
3992                 public int Compare (object o1, object o2)
3993                 {
3994                         NonJISCharacter j1 = (NonJISCharacter) o1;
3995                         NonJISCharacter j2 = (NonJISCharacter) o2;
3996                         return string.CompareOrdinal (j1.Name, j2.Name);
3997                 }
3998         }
3999
4000         class DecimalDictionaryValueComparer : IComparer
4001         {
4002                 public static readonly DecimalDictionaryValueComparer Instance
4003                         = new DecimalDictionaryValueComparer ();
4004
4005                 private DecimalDictionaryValueComparer ()
4006                 {
4007                 }
4008
4009                 public int Compare (object o1, object o2)
4010                 {
4011                         DictionaryEntry e1 = (DictionaryEntry) o1;
4012                         DictionaryEntry e2 = (DictionaryEntry) o2;
4013                         // FIXME: in case of 0, compare decomposition categories
4014                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
4015                         if (ret != 0)
4016                                 return ret;
4017                         int i1 = (int) e1.Key;
4018                         int i2 = (int) e2.Key;
4019                         return i1 - i2;
4020                 }
4021         }
4022
4023         class StringDictionaryValueComparer : IComparer
4024         {
4025                 public static readonly StringDictionaryValueComparer Instance
4026                         = new StringDictionaryValueComparer ();
4027
4028                 private StringDictionaryValueComparer ()
4029                 {
4030                 }
4031
4032                 public int Compare (object o1, object o2)
4033                 {
4034                         DictionaryEntry e1 = (DictionaryEntry) o1;
4035                         DictionaryEntry e2 = (DictionaryEntry) o2;
4036                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
4037                         if (ret != 0)
4038                                 return ret;
4039                         int i1 = (int) e1.Key;
4040                         int i2 = (int) e2.Key;
4041                         return i1 - i2;
4042                 }
4043         }
4044
4045         class UCAComparer : IComparer
4046         {
4047                 public static readonly UCAComparer Instance
4048                         = new UCAComparer ();
4049
4050                 private UCAComparer ()
4051                 {
4052                 }
4053
4054                 public int Compare (object o1, object o2)
4055                 {
4056                         char i1 = (char) o1;
4057                         char i2 = (char) o2;
4058
4059                         int l1 = CollationElementTable.GetSortKeyCount (i1);
4060                         int l2 = CollationElementTable.GetSortKeyCount (i2);
4061                         int l = l1 > l2 ? l2 : l1;
4062
4063                         for (int i = 0; i < l; i++) {
4064                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4065                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
4066                                 int v = k1.Primary - k2.Primary;
4067                                 if (v != 0)
4068                                         return v;
4069                                 v = k1.Secondary - k2.Secondary;
4070                                 if (v != 0)
4071                                         return v;
4072                                 v = k1.Thirtiary - k2.Thirtiary;
4073                                 if (v != 0)
4074                                         return v;
4075                                 v = k1.Quarternary - k2.Quarternary;
4076                                 if (v != 0)
4077                                         return v;
4078                         }
4079                         return l1 - l2;
4080                 }
4081         }
4082
4083         class Tailoring
4084         {
4085                 int lcid;
4086                 int alias;
4087                 bool frenchSort;
4088                 ArrayList items = new ArrayList ();
4089
4090                 public Tailoring (int lcid)
4091                         : this (lcid, 0)
4092                 {
4093                 }
4094
4095                 public Tailoring (int lcid, int alias)
4096                 {
4097                         this.lcid = lcid;
4098                         this.alias = alias;
4099                 }
4100
4101                 public int LCID {
4102                         get { return lcid; }
4103                 }
4104
4105                 public int Alias {
4106                         get { return alias; }
4107                 }
4108
4109                 public bool FrenchSort {
4110                         get { return frenchSort; }
4111                         set { frenchSort = value; }
4112                 }
4113
4114                 public void AddDiacriticalMap (byte target, byte replace)
4115                 {
4116                         items.Add (new DiacriticalMap (target, replace));
4117                 }
4118
4119                 public void AddSortKeyMap (string source, byte [] sortkey)
4120                 {
4121                         items.Add (new SortKeyMap (source, sortkey));
4122                 }
4123
4124                 public void AddReplacementMap (string source, string replace)
4125                 {
4126                         items.Add (new ReplacementMap (source, replace));
4127                 }
4128
4129                 public char [] ItemToCharArray ()
4130                 {
4131                         ArrayList al = new ArrayList ();
4132                         foreach (ITailoringMap m in items)
4133                                 al.AddRange (m.ToCharArray ());
4134                         return al.ToArray (typeof (char)) as char [];
4135                 }
4136
4137                 interface ITailoringMap
4138                 {
4139                         char [] ToCharArray ();
4140                 }
4141
4142                 class DiacriticalMap : ITailoringMap
4143                 {
4144                         public readonly byte Target;
4145                         public readonly byte Replace;
4146
4147                         public DiacriticalMap (byte target, byte replace)
4148                         {
4149                                 Target = target;
4150                                 Replace = replace;
4151                         }
4152
4153                         public char [] ToCharArray ()
4154                         {
4155                                 char [] ret = new char [3];
4156                                 ret [0] = (char) 02; // kind:DiacriticalMap
4157                                 ret [1] = (char) Target;
4158                                 ret [2] = (char) Replace;
4159                                 return ret;
4160                         }
4161                 }
4162
4163                 class SortKeyMap : ITailoringMap
4164                 {
4165                         public readonly string Source;
4166                         public readonly byte [] SortKey;
4167
4168                         public SortKeyMap (string source, byte [] sortkey)
4169                         {
4170                                 Source = source;
4171                                 SortKey = sortkey;
4172                         }
4173
4174                         public char [] ToCharArray ()
4175                         {
4176                                 char [] ret = new char [Source.Length + 7];
4177                                 ret [0] = (char) 01; // kind:SortKeyMap
4178                                 for (int i = 0; i < Source.Length; i++)
4179                                         ret [i + 1] = Source [i];
4180                                 // null terminate
4181                                 for (int i = 0; i < 4; i++)
4182                                         ret [i + Source.Length + 2] = (char) SortKey [i];
4183                                 return ret;
4184                         }
4185                 }
4186
4187                 class ReplacementMap : ITailoringMap
4188                 {
4189                         public readonly string Source;
4190                         public readonly string Replace;
4191
4192                         public ReplacementMap (string source, string replace)
4193                         {
4194                                 Source = source;
4195                                 Replace = replace;
4196                         }
4197
4198                         public char [] ToCharArray ()
4199                         {
4200                                 char [] ret = new char [Source.Length + Replace.Length + 3];
4201                                 ret [0] = (char) 03; // kind:ReplaceMap
4202                                 int pos = 1;
4203                                 for (int i = 0; i < Source.Length; i++)
4204                                         ret [pos++] = Source [i];
4205                                 // null terminate
4206                                 pos++;
4207                                 for (int i = 0; i < Replace.Length; i++)
4208                                         ret [pos++] = Replace [i];
4209                                 // null terminate
4210                                 return ret;
4211                         }
4212                 }
4213         }
4214 }