2005-06-24 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
1 //
2 //
3 // There are two kinds of sort keys: those which are computed and those which
4 // are laid out as an indexed array. Computed sort keys are:
5 //
6 //      - Surrogate
7 //      - PrivateUse
8 //
9 // Also, for composite characters it should prepare a different index table.
10 //
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
13 //
14
15 //
16 // * sortkey getter signature
17 //
18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
19 //      Stores sort key for corresponding character element into buf and
20 //      returns the length of the consumed _source_ character element in s.
21 //
22 // * character length to consume
23 //
24 //      If there are characters whose primary weight is 0, they are consumed
25 //      and considered as a part of the character element.
26 //
27
28 using System;
29 using System.IO;
30 using System.Collections;
31 using System.Globalization;
32 using System.Text;
33 using System.Xml;
34
35 namespace Mono.Globalization.Unicode
36 {
37         internal class MSCompatSortKeyTableGenerator
38         {
39                 public static void Main (string [] args)
40                 {
41                         new MSCompatSortKeyTableGenerator ().Run (args);
42                 }
43
44                 const int DecompositionWide = 1; // fixed
45                 const int DecompositionSub = 2; // fixed
46                 const int DecompositionSmall = 3;
47                 const int DecompositionIsolated = 4;
48                 const int DecompositionInitial = 5;
49                 const int DecompositionFinal = 6;
50                 const int DecompositionMedial = 7;
51                 const int DecompositionNoBreak = 8;
52                 const int DecompositionVertical = 9;
53                 const int DecompositionFraction = 0xA;
54                 const int DecompositionFont = 0xB;
55                 const int DecompositionSuper = 0xC; // fixed
56                 const int DecompositionFull = 0xE;
57                 const int DecompositionNarrow = 0xD;
58                 const int DecompositionCircle = 0xF;
59                 const int DecompositionSquare = 0x10;
60                 const int DecompositionCompat = 0x11;
61                 const int DecompositionCanonical = 0x12;
62
63                 TextWriter Result = Console.Out;
64
65                 byte [] fillIndex = new byte [256]; // by category
66                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
67
68                 char [] specialIgnore = new char [] {
69                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
70                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
71                         };
72
73                 // FIXME: need more love (as always)
74                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
75                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
76                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
77                         '\u0292', '\u01BE', '\u0298'};
78                 byte [] alphaWeights = new byte [] {
79                         2, 9, 0xA, 0x1A, 0x21,
80                         0x23, 0x25, 0x2C, 0x32, 0x35,
81                         0x36, 0x48, 0x51, 0x70, 0x7C,
82                         0x7E, 0x89, 0x8A, 0x91, 0x99,
83                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
84                         0xA9, 0xAA, 0xB3, 0xB4};
85
86                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
87                 bool [] isUppercase = new bool [char.MaxValue + 1];
88
89                 byte [] decompType = new byte [char.MaxValue + 1];
90                 int [] decompIndex = new int [char.MaxValue + 1];
91                 int [] decompLength = new int [char.MaxValue + 1];
92                 int [] decompValues;
93                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
94
95                 byte [] diacritical = new byte [char.MaxValue + 1];
96
97                 string [] diacritics = new string [] {
98                         // LATIN
99                         "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
100                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
101                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
102                         " OGONEK;", " CEDILLA;",
103                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
104                         " STROKE;", " CIRCUMFLEX AND ACUTE;",
105                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
106                         " DIAERESIS AND GRAVE;",
107                         " BREVE AND ACUTE;",
108                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
109                         " MACRON AND ACUTE;",
110                         " MACRON AND GRAVE;",
111                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
112                         " RING ABOVE AND ACUTE",
113                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
114                         " CIRCUMFLEX AND TILDE",
115                         " TILDE AND DIAERESIS",
116                         " STROKE AND ACUTE",
117                         " BREVE AND TILDE",
118                         " CEDILLA AND BREVE",
119                         " OGONEK AND MACRON",
120                         " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
121                         " DOUBLE GRAVE;",
122                         " INVERTED BREVE",
123                         " PRECEDED BY APOSTROPHE",
124                         " HORN;",
125                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
126                         " PALATAL HOOK",
127                         " DOT BELOW;",
128                         " RETROFLEX;", "DIAERESIS BELOW",
129                         " RING BELOW",
130                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
131                         " BREVE BELOW;", " HORN AND GRAVE",
132                         " TILDE BELOW",
133                         " DOT BELOW AND DOT ABOVE",
134                         " RIGHT HALF RING", " HORN AND TILDE",
135                         " CIRCUMFLEX AND DOT BELOW",
136                         " BREVE AND DOT BELOW",
137                         " DOT BELOW AND MACRON",
138                         " HORN AND HOOK ABOVE",
139                         " HORN AND DOT",
140                         // CIRCLED, PARENTHESIZED and so on
141                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
142                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
143                         };
144                 byte [] diacriticWeights = new byte [] {
145                         // LATIN.
146                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
147                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
148                         0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
149                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
150                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
151                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
152                         0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
153                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
154                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 
155                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
156                         0x95, 0xAA,
157                         // CIRCLED, PARENTHESIZED and so on. NOTE(review): 8 weights here vs. 7 names in the matching group of diacritics -- confirm.
158                         0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
159                         };
160
161                 int [] numberSecondaryWeightBounds = new int [] {
162                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
163                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
164                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
165                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
166                         0xE50, 0xE60, 0xED0, 0xEE0
167                         };
168
169                 char [] orderedCyrillic;
170                 char [] orderedGurmukhi;
171                 char [] orderedGujarati;
172                 char [] orderedGeorgian;
173                 char [] orderedThaana;
174
175                 static readonly char [] orderedTamilConsonants = new char [] {
176                         // based on traditional Tamil consonants, except for
177                         // Grantha (where Microsoft breaks traditionalism).
178                         // http://www.angelfire.com/empire/thamizh/padanGaL
179                         '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
180                         '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
181                         '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
182                         '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
183                         '\u0BB9'};
184
185                 // cp -> character name (only for some characters)
186                 ArrayList sortableCharNames = new ArrayList ();
187
188                 // cp -> arrow value (int)
189                 ArrayList arrowValues = new ArrayList ();
190
191                 // cp -> box value (int)
192                 ArrayList boxValues = new ArrayList ();
193
194                 // cp -> level1 value
195                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
196                 Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
197
198                 // letterName -> cp
199                 Hashtable arabicNameMap = new Hashtable ();
200                 Hashtable cyrillicNameMap = new Hashtable ();
201
202                 // cp -> Hashtable [decompType] -> cp
203                 Hashtable nfkdMap = new Hashtable ();
204
205                 // Latin letter -> ArrayList [int]
206                 Hashtable latinMap = new Hashtable ();
207
208                 ArrayList jisJapanese = new ArrayList ();
209                 ArrayList nonJisJapanese = new ArrayList ();
210
211                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
212                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
213                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
214                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
215                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
216
217                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
218
219                 static double [] unicodeAge = new double [char.MaxValue + 1];
220
221                 ArrayList tailorings = new ArrayList ();
222
223                 void Run (string [] args)
224                 {
225                         string dirname = args.Length == 0 ? "downloaded" : args [0];
226                         ParseSources (dirname);
227                         Console.Error.WriteLine ("parse done.");
228
229                         ModifyParsedValues ();
230                         GenerateCore ();
231                         Console.Error.WriteLine ("generation done.");
232                         Serialize ();
233                         Console.Error.WriteLine ("serialization done.");
234 /*
235 StreamWriter sw = new StreamWriter ("agelog.txt");
236 for (int i = 0; i < char.MaxValue; i++) {
237 bool shouldBe = false;
238 switch (Char.GetUnicodeCategory ((char) i)) {
239 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
240         shouldBe = true; break;
241 }
242 if (unicodeAge [i] >= 3.1)
243         shouldBe = true;
244 //if (IsIgnorable (i) != shouldBe)
245 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
246 }
247 sw.Close ();
248 */
249                 }
250
251                 byte [] CompressArray (byte [] source, CodePointIndexer i)
252                 {
253                         return (byte []) CodePointIndexer.CompressArray  (
254                                 source, typeof (byte), i);
255                 }
256
257                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
258                 {
259                         return (ushort []) CodePointIndexer.CompressArray  (
260                                 source, typeof (ushort), i);
261                 }
262
263                 void Serialize ()
264                 {
265                         // Tailorings
266                         SerializeTailorings ();
267
268                         byte [] categories = new byte [map.Length];
269                         byte [] level1 = new byte [map.Length];
270                         byte [] level2 = new byte [map.Length];
271                         byte [] level3 = new byte [map.Length];
272                         int [] widthCompat = new int [map.Length];
273                         for (int i = 0; i < map.Length; i++) {
274                                 categories [i] = map [i].Category;
275                                 level1 [i] = map [i].Level1;
276                                 level2 [i] = map [i].Level2;
277                                 level3 [i] = ComputeLevel3Weight ((char) i);
278                                 switch (decompType [i]) {
279                                 case DecompositionNarrow:
280                                 case DecompositionWide:
281                                 case DecompositionSuper:
282                                 case DecompositionSub:
283                                         // they are always 1 char
284                                         widthCompat [i] = decompValues [decompIndex [i]];
285                                         break;
286                                 }
287                         }
288
289                         // compress
290                         ignorableFlags = CompressArray (ignorableFlags,
291                                 MSCompatUnicodeTableUtil.Ignorable);
292                         categories = CompressArray (categories,
293                                 MSCompatUnicodeTableUtil.Category);
294                         level1 = CompressArray (level1, 
295                                 MSCompatUnicodeTableUtil.Level1);
296                         level2 = CompressArray (level2, 
297                                 MSCompatUnicodeTableUtil.Level2);
298                         level3 = CompressArray (level3, 
299                                 MSCompatUnicodeTableUtil.Level3);
300                         widthCompat = (int []) CodePointIndexer.CompressArray (
301                                 widthCompat, typeof (int),
302                                 MSCompatUnicodeTableUtil.WidthCompat);
303                         cjkCHS = CompressArray (cjkCHS,
304                                 MSCompatUnicodeTableUtil.CjkCHS);
305                         cjkCHT = CompressArray (cjkCHT,
306                                 MSCompatUnicodeTableUtil.Cjk);
307                         cjkJA = CompressArray (cjkJA,
308                                 MSCompatUnicodeTableUtil.Cjk);
309                         cjkKO = CompressArray (cjkKO,
310                                 MSCompatUnicodeTableUtil.Cjk);
311                         cjkKOlv2 = CompressArray (cjkKOlv2,
312                                 MSCompatUnicodeTableUtil.Cjk);
313
314                         // Ignorables
315                         Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
316                         for (int i = 0; i < ignorableFlags.Length; i++) {
317                                 byte value = ignorableFlags [i];
318                                 if (value < 10)
319                                         Result.Write ("{0},", value);
320                                 else
321                                         Result.Write ("0x{0:X02},", value);
322                                 if ((i & 0xF) == 0xF)
323                                         Result.WriteLine ("// {0:X04}", i - 0xF);
324                         }
325                         Result.WriteLine ("};");
326                         Result.WriteLine ();
327
328                         // Primary category
329                         Result.WriteLine ("static byte [] categories = new byte [] {");
330                         for (int i = 0; i < categories.Length; i++) {
331                                 byte value = categories [i];
332                                 if (value < 10)
333                                         Result.Write ("{0},", value);
334                                 else
335                                         Result.Write ("0x{0:X02},", value);
336                                 if ((i & 0xF) == 0xF)
337                                         Result.WriteLine ("// {0:X04}", i - 0xF);
338                         }
339                         Result.WriteLine ("};");
340                         Result.WriteLine ();
341
342                         // Primary weight value
343                         Result.WriteLine ("static byte [] level1 = new byte [] {");
344                         for (int i = 0; i < level1.Length; i++) {
345                                 byte value = level1 [i];
346                                 if (value < 10)
347                                         Result.Write ("{0},", value);
348                                 else
349                                         Result.Write ("0x{0:X02},", value);
350                                 if ((i & 0xF) == 0xF)
351                                         Result.WriteLine ("// {0:X04}", i - 0xF);
352                         }
353                         Result.WriteLine ("};");
354                         Result.WriteLine ();
355
356                         // Secondary weight
357                         Result.WriteLine ("static byte [] level2 = new byte [] {");
358                         for (int i = 0; i < level2.Length; i++) {
359                                 int value = level2 [i];
360                                 if (value < 10)
361                                         Result.Write ("{0},", value);
362                                 else
363                                         Result.Write ("0x{0:X02},", value);
364                                 if ((i & 0xF) == 0xF)
365                                         Result.WriteLine ("// {0:X04}", i - 0xF);
366                         }
367                         Result.WriteLine ("};");
368                         Result.WriteLine ();
369
370                         // Thirtiary weight
371                         Result.WriteLine ("static byte [] level3 = new byte [] {");
372                         for (int i = 0; i < level3.Length; i++) {
373                                 byte value = level3 [i];
374                                 if (value < 10)
375                                         Result.Write ("{0},", value);
376                                 else
377                                         Result.Write ("0x{0:X02},", value);
378                                 if ((i & 0xF) == 0xF)
379                                         Result.WriteLine ("// {0:X04}", i - 0xF);
380                         }
381                         Result.WriteLine ("};");
382                         Result.WriteLine ();
383
384                         // Width insensitivity mappings
385                         // (for now it is more lightweight than dumping the
386                         // entire NFKD table).
387                         Result.WriteLine ("static int [] widthCompat = new int [] {");
388                         for (int i = 0; i < widthCompat.Length; i++) {
389                                 int value = widthCompat [i];
390                                 if (value < 10)
391                                         Result.Write ("{0},", value);
392                                 else
393                                         Result.Write ("0x{0:X02},", value);
394                                 if ((i & 0xF) == 0xF)
395                                         Result.WriteLine ("// {0:X04}", i - 0xF);
396                         }
397                         Result.WriteLine ("};");
398                         Result.WriteLine ();
399
400                         // CJK
401                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
402                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
403                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
404                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
405                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
406                 }
407
408                 void SerializeCJK (string name, ushort [] cjk, int max)
409                 {
410                         int offset = 0;//char.MaxValue - cjk.Length;
411                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
412                         for (int i = 0; i < cjk.Length; i++) {
413                                 if (i + offset == max)
414                                         break;
415                                 ushort value = cjk [i];
416                                 if (value < 10)
417                                         Result.Write ("{0},", value);
418                                 else
419                                         Result.Write ("0x{0:X04},", value);
420                                 if ((i & 0xF) == 0xF)
421                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
422                         }
423                         Result.WriteLine ("};");
424                         Result.WriteLine ();
425                 }
426
427                 void SerializeCJK (string name, byte [] cjk, int max)
428                 {
429                         int offset = 0;//char.MaxValue - cjk.Length;
430                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
431                         for (int i = 0; i < cjk.Length; i++) {
432                                 if (i + offset == max)
433                                         break;
434                                 byte value = cjk [i];
435                                 if (value < 10)
436                                         Result.Write ("{0},", value);
437                                 else
438                                         Result.Write ("0x{0:X02},", value);
439                                 if ((i & 0xF) == 0xF)
440                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
441                         }
442                         Result.WriteLine ("};");
443                         Result.WriteLine ();
444                 }
445
446                 void SerializeTailorings ()
447                 {
448                         Hashtable indexes = new Hashtable ();
449                         Hashtable counts = new Hashtable ();
450                         Result.WriteLine ("static char [] tailorings = new char [] {");
451                         int count = 0;
452                         foreach (Tailoring t in tailorings) {
453                                 if (t.Alias != 0)
454                                         continue;
455                                 Result.Write ("/*{0}*/", t.LCID);
456                                 indexes.Add (t.LCID, count);
457                                 char [] values = t.ItemToCharArray ();
458                                 counts.Add (t.LCID, values.Length);
459                                 foreach (char c in values) {
460                                         Result.Write ("'\\x{0:X}', ", (int) c);
461                                         if (++count % 16 == 0)
462                                                 Result.WriteLine (" // {0:X04}", count - 16);
463                                 }
464                         }
465                         Result.WriteLine ("};");
466
467                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
468                         foreach (Tailoring t in tailorings) {
469                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
470                                 if (!indexes.ContainsKey (target)) {
471                                         Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
472                                         continue;
473                                 }
474                                 int idx = (int) indexes [target];
475                                 int cnt = (int) counts [target];
476                                 bool french = t.FrenchSort;
477                                 if (t.Alias != 0)
478                                         foreach (Tailoring t2 in tailorings)
479                                                 if (t2.LCID == t.LCID)
480                                                         french = t2.FrenchSort;
481                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
482                         }
483                         Result.WriteLine ("};");
484                 }
485
486                 #region Parse
487
488                 void ParseSources (string dirname)
489                 {
490                         string unidata =
491                                 dirname + "/UnicodeData.txt";
492                         string derivedCoreProps = 
493                                 dirname + "/DerivedCoreProperties.txt";
494                         string scripts = 
495                                 dirname + "/Scripts.txt";
496                         string cp932 = 
497                                 dirname + "/CP932.TXT";
498                         string derivedAge = 
499                                 dirname + "/DerivedAge.txt";
500                         string chXML = dirname + "/common/collation/zh.xml";
501                         string jaXML = dirname + "/common/collation/ja.xml";
502                         string koXML = dirname + "/common/collation/ko.xml";
503
504                         ParseDerivedAge (derivedAge);
505
506                         FillIgnorables ();
507
508                         ParseJISOrder (cp932); // in prior to ParseUnidata()
509                         ParseUnidata (unidata);
510                         ParseDerivedCoreProperties (derivedCoreProps);
511                         ParseScripts (scripts);
512                         ParseCJK (chXML, jaXML, koXML);
513
514                         ParseTailorings ("mono-tailoring-source.txt");
515                 }
516
517                 void ParseTailorings (string filename)
518                 {
519                         Tailoring t = null;
520                         int line = 0;
521                         using (StreamReader sr = new StreamReader (filename)) {
522                                 try {
523                                         while (sr.Peek () >= 0) {
524                                                 line++;
525                                                 ProcessTailoringLine (ref t,
526                                                         sr.ReadLine ().Trim ());
527                                         }
528                                 } catch (Exception) {
529                                         Console.Error.WriteLine ("ERROR at line {0}", line);
530                                         throw;
531                                 }
532                         }
533                 }
534
535                 // For now this is enough.
536                 string ParseTailoringSourceValue (string s)
537                 {
538                         StringBuilder sb = new StringBuilder ();
539                         for (int i = 0; i < s.Length; i++) {
540                                 if (s.StartsWith ("\\u")) {
541                                         sb.Append ((char) int.Parse (
542                                                 s.Substring (2, 4), NumberStyles.HexNumber),
543                                                 1);
544                                         i += 5;
545                                 }
546                         else
547                                 sb.Append (s [i]);
548                         }
549                         return sb.ToString ();
550                 }
551
552                 void ProcessTailoringLine (ref Tailoring t, string s)
553                 {
554                         int idx = s.IndexOf ('#');
555                         if (idx > 0)
556                                 s = s.Substring (0, idx).Trim ();
557                         if (s.Length == 0 || s [0] == '#')
558                                 return;
559                         if (s [0] == '@') {
560                                 idx = s.IndexOf ('=');
561                                 if (idx > 0)
562                                         t = new Tailoring (
563                                                 int.Parse (s.Substring (1, idx - 1)),
564                                                 int.Parse (s.Substring (idx + 1)));
565                                 else
566                                         t = new Tailoring (int.Parse (s.Substring (1)));
567                                 tailorings.Add (t);
568                                 return;
569                         }
570                         if (s.StartsWith ("*FrenchSort")) {
571                                 t.FrenchSort = true;
572                                 return;
573                         }
574                         string d = "*Diacritical";
575                         if (s.StartsWith (d)) {
576                                 idx = s.IndexOf ("->");
577                                 t.AddDiacriticalMap (
578                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
579                                                 NumberStyles.HexNumber),
580                                         byte.Parse (s.Substring (idx + 2).Trim (),
581                                                 NumberStyles.HexNumber));
582                                 return;
583                         }
584                         idx = s.IndexOf (':');
585                         if (idx > 0) {
586                                 string source = s.Substring (0, idx).Trim ();
587                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
588                                 byte [] b = new byte [5];
589                                 for (int i = 0; i < 5; i++) {
590                                         if (l [i] == "*")
591                                                 b [i] = 0;
592                                         else
593                                                 b [i] = byte.Parse (l [i],
594                                                         NumberStyles.HexNumber);
595                                 }
596                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
597                                         b);
598                         }
599                         idx = s.IndexOf ('=');
600                         if (idx > 0)
601                                 t.AddReplacementMap (
602                                         ParseTailoringSourceValue (
603                                                 s.Substring (0, idx).Trim ()),
604                                         ParseTailoringSourceValue (
605                                                 s.Substring (idx + 1).Trim ()));
606                 }
607
608                 void ParseDerivedAge (string filename)
609                 {
610                         using (StreamReader file =
611                                 new StreamReader (filename)) {
612                                 while (file.Peek () >= 0) {
613                                         string s = file.ReadLine ();
614                                         int idx = s.IndexOf ('#');
615                                         if (idx >= 0)
616                                                 s = s.Substring (0, idx);
617                                         idx = s.IndexOf (';');
618                                         if (idx < 0)
619                                                 continue;
620
621                                         string cpspec = s.Substring (0, idx);
622                                         idx = cpspec.IndexOf ("..");
623                                         NumberStyles nf = NumberStyles.HexNumber |
624                                                 NumberStyles.AllowTrailingWhite;
625                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
626                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
627                                         string value = s.Substring (cpspec.Length + 1).Trim ();
628
629                                         // FIXME: use index
630                                         if (cp > char.MaxValue)
631                                                 continue;
632
633                                         double v = double.Parse (value);
634                                         for (int i = cp; i <= cpEnd; i++)
635                                                 unicodeAge [i] = v;
636                                 }
637                         }
638                         unicodeAge [0] = double.MaxValue; // never be supported
639                 }
640
641                 void ParseUnidata (string filename)
642                 {
643                         ArrayList decompValues = new ArrayList ();
644                         using (StreamReader unidata =
645                                 new StreamReader (filename)) {
646                                 for (int line = 1; unidata.Peek () >= 0; line++) {
647                                         try {
648                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
649                                         } catch (Exception) {
650                                                 Console.Error.WriteLine ("**** At line " + line);
651                                                 throw;
652                                         }
653                                 }
654                         }
655                         this.decompValues = (int [])
656                                 decompValues.ToArray (typeof (int));
657                 }
658                 
659                 void ProcessUnidataLine (string s, ArrayList decompValues)
660                 {
661                         int idx = s.IndexOf ('#');
662                         if (idx >= 0)
663                                 s = s.Substring (0, idx);
664                         idx = s.IndexOf (';');
665                         if (idx < 0)
666                                 return;
667                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
668                         string [] values = s.Substring (idx + 1).Split (';');
669
670                         // FIXME: use index
671                         if (cp > char.MaxValue)
672                                 return;
673                         if (IsIgnorable (cp))
674                                 return;
675
676                         string name = values [0];
677
678                         // isSmallCapital
679                         if (s.IndexOf ("SMALL CAPITAL") > 0)
680                                 isSmallCapital [cp] = true;
681
682                         // latin mapping by character name
683                         if (s.IndexOf ("LATIN") > 0) {
684                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
685                                 int offset = lidx + 15;
686                                 if (lidx < 0) {
687                                         lidx = s.IndexOf ("LETTER TURNED ");
688                                         offset = lidx + 14;
689                                 }
690                                 if (lidx < 0) {
691                                         lidx = s.IndexOf ("LETTER ");
692                                         offset = lidx + 7;
693                                 }
694                                 char c = lidx > 0 ? s [offset] : char.MinValue;
695                                 if ('A' <= c && c <= 'Z' &&
696                                         (s.Length == offset + 1 || s [offset + 1] == ' ')) {
697                                         ArrayList entry = (ArrayList) latinMap [c];
698                                         if (entry == null) {
699                                                 entry = new ArrayList ();
700                                                 latinMap [c] = entry;
701                                         }
702                                         entry.Add (cp);
703                                 }
704                         }
705
706                         // Arrow names
707                         if (0x2000 <= cp && cp < 0x3000) {
708                                 int value = 0;
709                                 // SPECIAL CASES. FIXME: why?
710                                 switch (cp) {
711                                 case 0x21C5: value = -1; break; // E2
712                                 case 0x261D: value = 1; break;
713                                 case 0x27A6: value = 3; break;
714                                 case 0x21B0: value = 7; break;
715                                 case 0x21B1: value = 3; break;
716                                 case 0x21B2: value = 7; break;
717                                 case 0x21B4: value = 5; break;
718                                 case 0x21B5: value = 7; break;
719                                 case 0x21B9: value = -1; break; // E1
720                                 case 0x21CF: value = 7; break;
721                                 case 0x21D0: value = 3; break;
722                                 }
723                                 string [] arrowTargets = new string [] {
724                                         "",
725                                         "UPWARDS",
726                                         "NORTH EAST",
727                                         "RIGHTWARDS",
728                                         "SOUTH EAST",
729                                         "DOWNWARDS",
730                                         "SOUTH WEST",
731                                         "LEFTWARDS",
732                                         "NORTH WEST",
733                                         };
734                                 if (value == 0)
735                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
736                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
737                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
738                                                         s.IndexOf (" OVER") < 0
739                                                 )
740                                                         value = i;
741                                 if (value > 0)
742                                         arrowValues.Add (new DictionaryEntry (
743                                                 cp, value));
744                         }
745
746                         // Box names
747                         if (0x2500 <= cp && cp < 0x25B0) {
748                                 int value = 0;
749                                 // flags:
750                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
751                                 // [h,rl] [r] [l]
752                                 // [v,ud] [u] [d]
753                                 // [dr] [dl] [ur] [ul]
754                                 // [vr,udr] [vl,vdl]
755                                 // [hd,rld] [hu,rlu]
756                                 // [hv,udrl,rlv,udh]
757                                 ArrayList flags = new ArrayList (new int [] {
758                                         32, 8 + 4, 8, 4,
759                                         16, 1 + 2, 1, 2,
760                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
761                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
762                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
763                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
764                                         });
765                                 byte [] offsets = new byte [] {
766                                         0, 0, 1, 2,
767                                         3, 3, 4, 5,
768                                         6, 7, 8, 9,
769                                         10, 10, 11, 11,
770                                         12, 12, 13, 13,
771                                         14, 14, 14, 14};
772                                 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
773                                         int flag = 0;
774                                         if (s.IndexOf (" UP") > 0)
775                                                 flag |= 1;
776                                         if (s.IndexOf (" DOWN") > 0)
777                                                 flag |= 2;
778                                         if (s.IndexOf (" RIGHT") > 0)
779                                                 flag |= 4;
780                                         if (s.IndexOf (" LEFT") > 0)
781                                                 flag |= 8;
782                                         if (s.IndexOf (" VERTICAL") > 0)
783                                                 flag |= 16;
784                                         if (s.IndexOf (" HORIZONTAL") > 0)
785                                                 flag |= 32;
786
787                                         int fidx = flags.IndexOf (flag);
788                                         value = fidx < 0 ? fidx : offsets [fidx];
789                                 } else if (s.IndexOf ("BLOCK") > 0) {
790                                         if (s.IndexOf ("ONE EIGHTH") > 0)
791                                                 value = 0x12;
792                                         else if (s.IndexOf ("ONE QUARTER") > 0)
793                                                 value = 0x13;
794                                         else if (s.IndexOf ("THREE EIGHTHS") > 0)
795                                                 value = 0x14;
796                                         else if (s.IndexOf ("HALF") > 0)
797                                                 value = 0x15;
798                                         else if (s.IndexOf ("FIVE EIGHTHS") > 0)
799                                                 value = 0x16;
800                                         else if (s.IndexOf ("THREE QUARTERS") > 0)
801                                                 value = 0x17;
802                                         else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
803                                                 value = 0x18;
804                                         else
805                                                 value = 0x19;
806                                 }
807                                 if (value >= 0)
808                                         boxValues.Add (new DictionaryEntry (
809                                                 cp, value));
810                         }
811
812                         // For some characters store the name and sort later
813                         // to determine sorting.
814                         if (0x2100 <= cp && cp <= 0x213F &&
815                                 Char.IsSymbol ((char) cp))
816                                 sortableCharNames.Add (
817                                         new DictionaryEntry (cp, values [0]));
818                         else if (0x3380 <= cp && cp <= 0x33DD)
819                                 sortableCharNames.Add (new DictionaryEntry (
820                                         cp, values [0].Substring (7)));
821
822                         // diacritical weights by character name
823                         for (int d = 0; d < diacritics.Length; d++)
824                                 if (s.IndexOf (diacritics [d]) > 0)
825                                         diacritical [cp] |= diacriticWeights [d];
826                         // Two-step grep required for it.
827                         if (s.IndexOf ("FULL STOP") > 0 &&
828                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
829                                 diacritical [cp] |= 0xF4;
830
831                         // Cyrillic letter name
832                         if (0x0430 <= cp && cp <= 0x0486 &&
833                                 Char.IsLetter ((char) cp)) {
834                                 byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
835                                 // Get primary letter name i.e.
836                                 // XXX part of CYRILLIC LETTER XXX yyy
837                                 // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
838                                 string letterName =
839                                         values [0].Substring (values [0].IndexOf ("LETTER ") + 7);
840                                 int tmpIdx = letterName.IndexOf (' ');
841                                 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
842 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
843                                 if (cyrillicNameMap.ContainsKey (letterName))
844                                         value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
845                                 else
846                                         cyrillicNameMap [letterName] = cp;
847
848                                 cyrillicLetterPrimaryValues [cp] = value;
849                         }
850
851                         // Arabic letter name
852                         if (0x0621 <= cp && cp <= 0x064A &&
853                                 Char.GetUnicodeCategory ((char) cp)
854                                 == UnicodeCategory.OtherLetter) {
855                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
856                                 switch (cp) {
857                                 case 0x0621:
858                                 case 0x0624:
859                                 case 0x0626:
860                                         // hamza, waw, yeh ... special cases.
861                                         value = 0x07;
862                                         break;
863                                 case 0x0649:
864                                 case 0x064A:
865                                         value = 0x77; // special cases.
866                                         break;
867                                 default:
868                                         // Get primary letter name i.e.
869                                         // XXX part of ARABIC LETTER XXX yyy
870                                         // e.g. that of "TEH MARBUTA" is "TEH".
871                                         string letterName =
872                                                 (cp == 0x0640) ?
873                                                 // 0x0640 is special: it does
874                                                 // not start with ARABIC LETTER
875                                                 values [0] :
876                                                 values [0].Substring (14);
877                                         int tmpIdx = letterName.IndexOf (' ');
878                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
879 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
880                                         if (arabicNameMap.ContainsKey (letterName))
881                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
882                                         else
883                                                 arabicNameMap [letterName] = cp;
884                                         break;
885                                 }
886                                 arabicLetterPrimaryValues [cp] = value;
887                         }
888
889                         // Japanese square letter
890                         if (0x3300 <= cp && cp <= 0x3357)
891                                 if (!ExistsJIS (cp))
892                                         nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
893
894                         // normalizationType
895                         string decomp = values [4];
896                         idx = decomp.IndexOf ('<');
897                         if (idx >= 0) {
898                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
899                                 case "full":
900                                         decompType [cp] = DecompositionFull;
901                                         break;
902                                 case "sub":
903                                         decompType [cp] = DecompositionSub;
904                                         break;
905                                 case "super":
906                                         decompType [cp] = DecompositionSuper;
907                                         break;
908                                 case "small":
909                                         decompType [cp] = DecompositionSmall;
910                                         break;
911                                 case "isolated":
912                                         decompType [cp] = DecompositionIsolated;
913                                         break;
914                                 case "initial":
915                                         decompType [cp] = DecompositionInitial;
916                                         break;
917                                 case "final":
918                                         decompType [cp] = DecompositionFinal;
919                                         break;
920                                 case "medial":
921                                         decompType [cp] = DecompositionMedial;
922                                         break;
923                                 case "noBreak":
924                                         decompType [cp] = DecompositionNoBreak;
925                                         break;
926                                 case "compat":
927                                         decompType [cp] = DecompositionCompat;
928                                         break;
929                                 case "fraction":
930                                         decompType [cp] = DecompositionFraction;
931                                         break;
932                                 case "font":
933                                         decompType [cp] = DecompositionFont;
934                                         break;
935                                 case "circle":
936                                         decompType [cp] = DecompositionCircle;
937                                         break;
938                                 case "square":
939                                         decompType [cp] = DecompositionSquare;
940                                         break;
941                                 case "wide":
942                                         decompType [cp] = DecompositionWide;
943                                         break;
944                                 case "narrow":
945                                         decompType [cp] = DecompositionNarrow;
946                                         break;
947                                 case "vertical":
948                                         decompType [cp] = DecompositionVertical;
949                                         break;
950                                 default:
951                                         throw new Exception ("Support NFKD type : " + decomp);
952                                 }
953                         }
954                         else
955                                 decompType [cp] = DecompositionCanonical;
956                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
957                         if (decomp.Length > 0) {
958
959                                 string [] velems = decomp.Split (' ');
960                                 int didx = decompValues.Count;
961                                 decompIndex [cp] = didx;
962                                 foreach (string v in velems)
963                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
964                                 decompLength [cp] = velems.Length;
965
966                                 // [decmpType] -> this_cp
967                                 int targetCP = (int) decompValues [didx];
968                                 // for "(x)" it specially maps to 'x' .
969                                 // FIXME: check if it is sane
970                                 if (velems.Length == 3 &&
971                                         (int) decompValues [didx] == '(' &&
972                                         (int) decompValues [didx + 2] == ')')
973                                         targetCP = (int) decompValues [didx + 1];
974                                 // special: 0x215F "1/"
975                                 else if (cp == 0x215F)
976                                         targetCP = '1';
977                                 else if (velems.Length > 1 &&
978                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
979                                         // skip them, except for CJK ideograph compat
980                                         targetCP = 0;
981
982                                 if (targetCP != 0) {
983                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
984                                         if (entry == null) {
985                                                 entry = new Hashtable ();
986                                                 nfkdMap [targetCP] = entry;
987                                         }
988                                         entry [(byte) decompType [cp]] = cp;
989                                 }
990                         }
991                         // numeric values
992                         if (values [5].Length > 0)
993                                 decimalValue [cp] = decimal.Parse (values [5]);
994                         else if (values [6].Length > 0)
995                                 decimalValue [cp] = decimal.Parse (values [6]);
996                         else if (values [7].Length > 0) {
997                                 string decstr = values [7];
998                                 idx = decstr.IndexOf ('/');
999                                 if (cp == 0x215F) // special. "1/"
1000                                         decimalValue [cp] = 0x1;
1001                                 else if (idx > 0)
1002                                         // m/n
1003                                         decimalValue [cp] = 
1004                                                 decimal.Parse (decstr.Substring (0, idx))
1005                                                 / decimal.Parse (decstr.Substring (idx + 1));
1006                                 else if (decstr [0] == '(' &&
1007                                         decstr [decstr.Length - 1] == ')')
1008                                         // (n)
1009                                         decimalValue [cp] =
1010                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1011                                 else if (decstr [decstr.Length - 1] == '.')
1012                                         // n.
1013                                         decimalValue [cp] =
1014                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1015                                 else
1016                                         decimalValue [cp] = decimal.Parse (decstr);
1017                         }
1018                 }
1019
1020                 void ParseDerivedCoreProperties (string filename)
1021                 {
1022                         // IsUppercase
1023                         using (StreamReader file =
1024                                 new StreamReader (filename)) {
1025                                 for (int line = 1; file.Peek () >= 0; line++) {
1026                                         try {
1027                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1028                                         } catch (Exception) {
1029                                                 Console.Error.WriteLine ("**** At line " + line);
1030                                                 throw;
1031                                         }
1032                                 }
1033                         }
1034                 }
1035
1036                 void ProcessDerivedCorePropLine (string s)
1037                 {
1038                         int idx = s.IndexOf ('#');
1039                         if (idx >= 0)
1040                                 s = s.Substring (0, idx);
1041                         idx = s.IndexOf (';');
1042                         if (idx < 0)
1043                                 return;
1044                         string cpspec = s.Substring (0, idx);
1045                         idx = cpspec.IndexOf ("..");
1046                         NumberStyles nf = NumberStyles.HexNumber |
1047                                 NumberStyles.AllowTrailingWhite;
1048                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1049                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1050                         string value = s.Substring (cpspec.Length + 1).Trim ();
1051
1052                         // FIXME: use index
1053                         if (cp > char.MaxValue)
1054                                 return;
1055
1056                         switch (value) {
1057                         case "Uppercase":
1058                                 for (int x = cp; x <= cpEnd; x++)
1059                                         isUppercase [x] = true;
1060                                 break;
1061                         }
1062                 }
1063
1064                 void ParseScripts (string filename)
1065                 {
1066                         ArrayList cyrillic = new ArrayList ();
1067                         ArrayList gurmukhi = new ArrayList ();
1068                         ArrayList gujarati = new ArrayList ();
1069                         ArrayList georgian = new ArrayList ();
1070                         ArrayList thaana = new ArrayList ();
1071
1072                         using (StreamReader file =
1073                                 new StreamReader (filename)) {
1074                                 while (file.Peek () >= 0) {
1075                                         string s = file.ReadLine ();
1076                                         int idx = s.IndexOf ('#');
1077                                         if (idx >= 0)
1078                                                 s = s.Substring (0, idx);
1079                                         idx = s.IndexOf (';');
1080                                         if (idx < 0)
1081                                                 continue;
1082
1083                                         string cpspec = s.Substring (0, idx);
1084                                         idx = cpspec.IndexOf ("..");
1085                                         NumberStyles nf = NumberStyles.HexNumber |
1086                                                 NumberStyles.AllowTrailingWhite;
1087                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1088                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1089                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1090
1091                                         // FIXME: use index
1092                                         if (cp > char.MaxValue)
1093                                                 continue;
1094
1095                                         switch (value) {
1096                                         case "Cyrillic":
1097                                                 for (int x = cp; x <= cpEnd; x++)
1098                                                         if (!IsIgnorable (x))
1099                                                                 cyrillic.Add ((char) x);
1100                                                 break;
1101                                         case "Gurmukhi":
1102                                                 for (int x = cp; x <= cpEnd; x++)
1103                                                         if (!IsIgnorable (x))
1104                                                                 gurmukhi.Add ((char) x);
1105                                                 break;
1106                                         case "Gujarati":
1107                                                 for (int x = cp; x <= cpEnd; x++)
1108                                                         if (!IsIgnorable (x))
1109                                                                 gujarati.Add ((char) x);
1110                                                 break;
1111                                         case "Georgian":
1112                                                 for (int x = cp; x <= cpEnd; x++)
1113                                                         if (!IsIgnorable (x))
1114                                                                 georgian.Add ((char) x);
1115                                                 break;
1116                                         case "Thaana":
1117                                                 for (int x = cp; x <= cpEnd; x++)
1118                                                         if (!IsIgnorable (x))
1119                                                                 thaana.Add ((char) x);
1120                                                 break;
1121                                         }
1122                                 }
1123                         }
1124                         cyrillic.Sort (UCAComparer.Instance);
1125                         gurmukhi.Sort (UCAComparer.Instance);
1126                         gujarati.Sort (UCAComparer.Instance);
1127                         georgian.Sort (UCAComparer.Instance);
1128                         thaana.Sort (UCAComparer.Instance);
1129                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1130                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1131                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1132                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1133                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1134                 }
1135
1136                 // Loads the JIS ordering file. Each data line reads "0x<jis> 0x<codepoint>"
1137                 // (hexadecimal, separated by a space); text after '#' is a comment.
1138                 // One JISCharacter per data line is appended to jisJapanese.
1139                 void ParseJISOrder (string filename)
1140                 {
1141                         using (StreamReader file = new StreamReader (filename)) {
1142                                 while (file.Peek () >= 0) {
1143                                         string s = file.ReadLine ();
1144                                         int idx = s.IndexOf ('#');
1145                                         s = (idx >= 0 ? s.Substring (0, idx) : s).Trim ();
1146                                         idx = s.IndexOf (' ');
1147                                         if (idx < 0) // also covers empty and comment-only lines
1148                                                 continue;
1149                                         // Both fields start with "0x", so cut that out. Substring takes
1150                                         // a length, not an end index: the JIS digits are idx - 2 chars.
1151                                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1152                                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1153                                         jisJapanese.Add (new JISCharacter (cp, jis));
1154                                 }
1155                         }
1156                 }
1157
1158                 // Fills the CJK weight tables from LDML collation documents:
1159                 //   zhXML - "pinyin" collation -> cjkCHS, "stroke" collation -> cjkCHT
1160                 //   jaXML - its collation rules -> cjkJA
1161                 //   koXML - reset/tailoring pairs -> cjkKO and cjkKOlv2
1162                 // The chs, cht and ja tables receive consecutive weights in rule order;
1163                 // Korean ideographs share the weight derived from their "reset" syllable.
1164                 // Every table is indexed directly by the character's UTF-16 code unit.
1165                 void ParseCJK (string zhXML, string jaXML, string koXML)
1166                 {
1167                         XmlDocument doc = new XmlDocument ();
1168                         doc.XmlResolver = null;
1169
1170                         // Chinese Simplified and Traditional are two collations of
1171                         // the same document.
1172                         doc.Load (zhXML);
1173                         AssignSequentialCJKWeights ("chs", cjkCHS, '\u3100', 0x8008,
1174                                 RequiredInnerText (doc, "/ldml/collations/collation[@type='pinyin']/rules/pc"));
1175                         AssignSequentialCJKWeights ("cht", cjkCHT, '\u4E00', 0x8002,
1176                                 RequiredInnerText (doc, "/ldml/collations/collation[@type='stroke']/rules/pc"));
1177
1178                         // Japanese
1179                         doc.Load (jaXML);
1180                         AssignSequentialCJKWeights ("ja", cjkJA, '\u4E00', 0x8008,
1181                                 RequiredInnerText (doc, "/ldml/collations/collation/rules/pc"));
1182
1183                         // Korean
1184                         // Korean weight is somewhat complex. It first shifts
1185                         // Hangul category from 52-x to 80-x (they are anyways
1186                         // computed). CJK ideographs are placed at secondary
1187                         // weight, like XX YY 01 zz 01, where XX and YY are
1188                         // corresponding "reset" value and zz is 41,43,45...
1189                         //
1190                         // Unlike chs,cht and ja, Korean value is a combined
1191                         // ushort which is computed as category
1192                         //
1193                         doc.Load (koXML);
1194                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1195                                 // The element following each "reset" holds the
1196                                 // characters placed after it.
1197                                 XmlElement tailored = (XmlElement) reset.NextSibling;
1198                                 // Compute the combined "category" / "level 1" value
1199                                 // of the target "reset" Hangul syllable: its 1-based
1200                                 // offset from U+AC00, packed 254 per high byte with
1201                                 // the low byte starting at 02.
1202                                 int ri = ((int) reset.InnerText [0] - 0xAC00) + 1;
1203                                 ushort primary = (ushort)
1204                                         ((ri / 254) * 256 + (ri % 254) + 2);
1205                                 // Every tailored character shares that value and gets
1206                                 // level 2 weights 41, 43, 45... in order.
1207                                 int lv2 = 0x41;
1208                                 foreach (char c in tailored.InnerText) {
1209                                         cjkKO [(int) c] = primary;
1210                                         cjkKOlv2 [(int) c] = (byte) lv2;
1211                                         lv2 += 2;
1212                                 }
1213                         }
1214                 }
1215
1216                 // Returns the text of the single node matched by xpath. A missing node
1217                 // is reported with the offending XPath rather than surfacing as a
1218                 // NullReferenceException.
1219                 static string RequiredInnerText (XmlDocument doc, string xpath)
1220                 {
1221                         XmlNode node = doc.SelectSingleNode (xpath);
1222                         if (node == null)
1223                                 throw new XmlException ("Collation data lacks the expected node: " + xpath);
1224                         return node.InnerText;
1225                 }
1226
1227                 // Hands out consecutive weights to the characters of ordered, in order,
1228                 // storing each at the character's code unit in table.
1229                 //   category    - label used only in the warning message
1230                 //   lowerBound  - characters below it are not stored: they are
1231                 //                 reported on stderr and do not consume a weight
1232                 //   firstWeight - weight of the first stored character
1233                 // After each assignment, a next weight ending in 00 is advanced by 2,
1234                 // so xx00 and xx01 are not handed out.
1235                 void AssignSequentialCJKWeights (string category, ushort [] table,
1236                         char lowerBound, int firstWeight, string ordered)
1237                 {
1238                         int weight = firstWeight;
1239                         foreach (char c in ordered) {
1240                                 if (c < lowerBound) {
1241                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, weight);
1242                                         continue;
1243                                 }
1244                                 table [(int) c] = (ushort) weight++;
1245                                 // Skip xx00 and xx01.
1246                                 if (weight % 256 == 0)
1247                                         weight += 2;
1248                         }
1249                 }
1250
1251                 #endregion
1252
1253                 #region Generation
1254
1255                 // For each assigned UTF-16 code unit, ORs into ignorableFlags: 1 when
1256                 // IsIgnorable, 2 when IsIgnorableSymbol, 4 when IsIgnorableNonSpacing.
1257                 void FillIgnorables ()
1258                 {
1259                         for (int cp = char.MinValue; cp <= char.MaxValue; cp++) {
1260                                 bool assigned = Char.GetUnicodeCategory ((char) cp) != UnicodeCategory.OtherNotAssigned;
1261                                 if (assigned && IsIgnorable (cp))
1262                                         ignorableFlags [cp] |= 1;
1263                                 if (assigned && IsIgnorableSymbol (cp))
1264                                         ignorableFlags [cp] |= 2;
1265                                 if (assigned && IsIgnorableNonSpacing (cp))
1266                                         ignorableFlags [cp] |= 4;
1267                         }
1268                 }
1269
1270                 // Applies hand-written corrections to the parsed tables: secondary
1271                 // weights of numbers, zeroed decomposition entries for U+FE31/U+FE32,
1272                 // the Korean parens numbers, and renamed/removed sortableCharNames entries.
1273                 void ModifyParsedValues ()
1274                 {
1275                         // number, secondary weights: every [start, end) pair of
1276                         // numberSecondaryWeightBounds takes the next weight from 0x38 on.
1277                         int [] bounds = numberSecondaryWeightBounds;
1278                         byte secondary = 0x38;
1279                         for (int lo = 0; lo < bounds.Length; lo += 2) {
1280                                 for (int cp = bounds [lo]; cp < bounds [lo + 1]; cp++)
1281                                         if (Char.IsNumber ((char) cp))
1282                                                 diacritical [cp] = secondary;
1283                                 secondary++;
1284                         }
1285
1286                         // Modify some decomposition equivalence
1287                         foreach (int cp in new int [] {0xFE31, 0xFE32}) {
1288                                 decompType [cp] = 0;
1289                                 decompIndex [cp] = 0;
1290                                 decompLength [cp] = 0;
1291                         }
1292
1293                         // Korean parens numbers
1294                         for (int cp = 0x3200; cp <= 0x321C; cp++)
1295                                 diacritical [cp] = 0xA;
1296                         for (int cp = 0x3260; cp <= 0x327B; cp++)
1297                                 diacritical [cp] = 0xC;
1298
1299                         // Update name part of named characters
1300                         for (int pos = 0; pos < sortableCharNames.Count; ) {
1301                                 int cp = (int) ((DictionaryEntry) sortableCharNames [pos]).Key;
1302                                 string renamed = null;
1303                                 bool remove = false;
1304                                 switch (cp) {
1305                                 case 0x2101: renamed = "A_1"; break;
1306                                 case 0x33C3: renamed = "A_2"; break;
1307                                 case 0x2105: renamed = "C_1"; break;
1308                                 case 0x2106: renamed = "C_2"; break;
1309                                 case 0x211E: renamed = "R1"; break;
1310                                 case 0x211F: renamed = "R2"; break;
1311                                 // Remove some of them!
1312                                 case 0x2103: case 0x2109: case 0x2116: case 0x2117:
1313                                 case 0x2118: case 0x2125: case 0x2127: case 0x2129:
1314                                 case 0x212E: case 0x2132:
1315                                         remove = true;
1316                                         break;
1317                                 }
1318                                 if (remove)
1319                                         sortableCharNames.RemoveAt (pos); // next entry slides into pos
1320                                 else {
1321                                         if (renamed != null)
1322                                                 sortableCharNames [pos] = new DictionaryEntry (cp, renamed);
1323                                         pos++;
1324                                 }
1325                         }
1326                 }
1327
1328                 void GenerateCore ()
1329                 {
1330                         UnicodeCategory uc;
1331
1332                         #region Specially ignored // 01
1333                         // This will raise "Defined" flag up.
1334                         foreach (char c in specialIgnore)
1335                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1336                         #endregion
1337
1338
1339                         #region Variable weights
1340                         // Controls : 06 03 - 06 3D
1341                         fillIndex [6] = 3;
1342                         for (int i = 0; i < 65536; i++) {
1343                                 if (IsIgnorable (i))
1344                                         continue;
1345                                 char c = (char) i;
1346                                 uc = Char.GetUnicodeCategory (c);
1347                                 // NEL is whitespace but not ignored here.
1348                                 if (uc == UnicodeCategory.Control &&
1349                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1350                                         AddCharMap (c, 6, 1);
1351                         }
1352
1353                         // Apostrophe 06 80
1354                         fillIndex [6] = 0x80;
1355                         AddCharMapGroup ('\'', 6, 1, 0);
1356                         AddCharMap ('\uFE63', 6, 1);
1357
1358                         // Hyphen/Dash : 06 81 - 06 90
1359                         for (int i = 0; i < char.MaxValue; i++) {
1360                                 if (!IsIgnorable (i) &&
1361                                         Char.GetUnicodeCategory ((char) i) ==
1362                                         UnicodeCategory.DashPunctuation) {
1363                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1364                                         if (i == 0x2011) {
1365                                                 // SPECIAL: add 2027 and 2043
1366                                                 // Maybe they are regarded the 
1367                                                 // same hyphens in "central"
1368                                                 // position.
1369                                                 AddCharMap ('\u2027', 6, 1);
1370                                                 AddCharMap ('\u2043', 6, 1);
1371                                         }
1372                                 }
1373                         }
1374
1375                         // Arabic variable weight chars 06 A0 -
1376                         fillIndex [6] = 0xA0;
1377                         // vowels
1378                         for (int i = 0x64B; i <= 0x650; i++)
1379                                 AddArabicCharMap ((char) i);
1380                         // sukun
1381                         AddCharMapGroup ('\u0652', 6, 1, 0);
1382                         // shadda
1383                         AddCharMapGroup ('\u0651', 6, 1, 0);
1384                         #endregion
1385
1386
1387                         #region Nonspacing marks // 01
1388                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1389
1390                         // Combining diacritical marks: 01 DC -
1391
1392                         fillIndex [0x1] = 0x41;
1393                         for (int i = 0x030E; i <= 0x0326; i++)
1394                                 if (!IsIgnorable (i))
1395                                         AddCharMap ((char) i, 0x1, 1);
1396                         for (int i = 0x0329; i <= 0x0334; i++)
1397                                 if (!IsIgnorable (i))
1398                                         AddCharMap ((char) i, 0x1, 1);
1399                         for (int i = 0x0339; i <= 0x0341; i++)
1400                                 if (!IsIgnorable (i))
1401                                         AddCharMap ((char) i, 0x1, 1);
1402                         fillIndex [0x1] = 0x72;
1403                         for (int i = 0x0346; i <= 0x0348; i++)
1404                                 if (!IsIgnorable (i))
1405                                         AddCharMap ((char) i, 0x1, 1);
1406                         for (int i = 0x02BE; i <= 0x02BF; i++)
1407                                 if (!IsIgnorable (i))
1408                                         AddCharMap ((char) i, 0x1, 1);
1409                         for (int i = 0x02C1; i <= 0x02C5; i++)
1410                                 if (!IsIgnorable (i))
1411                                         AddCharMap ((char) i, 0x1, 1);
1412                         for (int i = 0x02CE; i <= 0x02CF; i++)
1413                                 if (!IsIgnorable (i))
1414                                         AddCharMap ((char) i, 0x1, 1);
1415                         for (int i = 0x02D1; i <= 0x02D3; i++)
1416                                 if (!IsIgnorable (i))
1417                                         AddCharMap ((char) i, 0x1, 1);
1418                         AddCharMap ('\u02DE', 0x1, 1);
1419                         for (int i = 0x02E4; i <= 0x02E9; i++)
1420                                 if (!IsIgnorable (i))
1421                                         AddCharMap ((char) i, 0x1, 1);
1422
1423                         // LAMESPEC: It should not stop at '\u20E1'. There are
1424                         // a few more characters (that however results in 
1425                         // overflow of level 2 unless we start before 0xDD).
1426                         fillIndex [0x1] = 0xDC;
1427                         for (int i = 0x20d0; i <= 0x20e1; i++)
1428                                 AddCharMap ((char) i, 0x1, 1);
1429                         #endregion
1430
1431
1432                         #region Whitespaces // 07 03 -
1433                         fillIndex [0x7] = 0x2;
1434                         AddCharMap (' ', 0x7, 2);
1435                         AddCharMap ('\u00A0', 0x7, 1);
1436                         for (int i = 9; i <= 0xD; i++)
1437                                 AddCharMap ((char) i, 0x7, 1);
1438                         for (int i = 0x2000; i <= 0x200B; i++)
1439                                 AddCharMap ((char) i, 0x7, 1);
1440
1441                         fillIndex [0x7] = 0x17;
1442                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1443                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1444
1445                         // Characters which used to represent layout control.
1446                         // LAMESPEC: Windows developers seem to have thought 
1447                         // that those characters are kind of whitespaces,
1448                         // while they aren't.
1449                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1450                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1451                         #endregion
1452
1453                         // FIXME: 09 should be more complete.
1454                         fillIndex [0x9] = 2;
1455                         // misc tech mark
1456                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1457                                 AddCharMap ((char) cp, 0x9, 1, 0);
1458
1459                         // arrows
1460                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1461                         foreach (DictionaryEntry de in arrowValues) {
1462                                 int idx = (int) de.Value;
1463                                 int cp = (int) de.Key;
1464                                 if (map [cp].Defined)
1465                                         continue;
1466                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1467                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1468                                 arrowLv2 [idx]++;
1469                         }
1470                         // boxes
1471                         byte [] boxLv2 = new byte [128];
1472                         for (int i = 0; i < boxLv2.Length; i++)
1473                                 boxLv2 [i] = 3;
1474                         foreach (DictionaryEntry de in boxValues) {
1475                                 int cp = (int) de.Key;
1476                                 int idx = (int) de.Value;
1477                                 if (map [cp].Defined)
1478                                         continue;
1479                                 fillIndex [0x9] = (byte) (0xE5 + idx);
1480                                 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1481                                 boxLv2 [idx]++;
1482                         }
1483                         // Some special characters (slanted)
1484                         fillIndex [0x9] = 0xF4;
1485                         AddCharMap ('\u2571', 0x9, 3);
1486                         AddCharMap ('\u2572', 0x9, 3);
1487                         AddCharMap ('\u2573', 0x9, 3);
1488
1489                         // FIXME: implement 0A
1490                         #region Symbols
1491                         fillIndex [0xA] = 2;
1492                         // byte currency symbols
1493                         for (int cp = 0; cp < 0x100; cp++) {
1494                                 uc = Char.GetUnicodeCategory ((char) cp);
1495                                 if (!IsIgnorable (cp) &&
1496                                         uc == UnicodeCategory.CurrencySymbol &&
1497                                         cp != '$')
1498                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1499                         }
1500                         // byte other symbols
1501                         for (int cp = 0; cp < 0x100; cp++) {
1502                                 if (cp == 0xA6)
1503                                         continue; // SPECIAL: skip FIXME: why?
1504                                 uc = Char.GetUnicodeCategory ((char) cp);
1505                                 if (!IsIgnorable (cp) &&
1506                                         uc == UnicodeCategory.OtherSymbol)
1507                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1508                         }
1509
1510                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1511                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1512                                 AddCharMap ((char) cp, 0xA, 1, 0);
1513                         // Dingbats
1514                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1515                                 if (Char.IsSymbol ((char) cp))
1516                                         AddCharMap ((char) cp, 0xA, 1, 0);
1517                         // OCR
1518                         for (int i = 0x2440; i < 0x2460; i++)
1519                                 AddCharMap ((char) i, 0xA, 1, 0);
1520
1521                         #endregion
1522
1523                         #region Numbers // 0C 02 - 0C E1
1524                         fillIndex [0xC] = 2;
1525
1526                         // 9F8 : Bengali "one less than the denominator"
1527                         AddCharMap ('\u09F8', 0xC, 1);
1528
1529                         ArrayList numbers = new ArrayList ();
1530                         for (int i = 0; i < 65536; i++)
1531                                 if (!IsIgnorable (i) &&
1532                                         Char.IsNumber ((char) i) &&
1533                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1534                                         numbers.Add (i);
1535
1536                         ArrayList numberValues = new ArrayList ();
1537                         foreach (int i in numbers)
1538                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1539                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1540
1541 //foreach (DictionaryEntry de in numberValues)
1542 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1543
1544                         decimal prevValue = -1;
1545                         foreach (DictionaryEntry de in numberValues) {
1546                                 int cp = (int) de.Key;
1547                                 decimal currValue = (decimal) de.Value;
1548                                 bool addnew = false;
1549                                 if (prevValue < currValue &&
1550                                         prevValue - (int) prevValue == 0 &&
1551                                         prevValue >= 1) {
1552
1553                                         addnew = true;
1554                                         // Process Hangzhou and Roman numbers
1555
1556                                         // There are some SPECIAL cases.
1557                                         if (currValue != 4) // no increment for 4
1558                                                 fillIndex [0xC]++;
1559
1560                                         int xcp;
1561                                         xcp = (int) prevValue + 0x2170 - 1;
1562                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1563                                         xcp = (int) prevValue + 0x2160 - 1;
1564                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1565                                         fillIndex [0xC] += 2;
1566                                         xcp = (int) prevValue + 0x3021 - 1;
1567                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1568                                         fillIndex [0xC]++;
1569                                 }
1570                                 if (prevValue < currValue)
1571                                         prevValue = currValue;
1572                                 if (map [cp].Defined)
1573                                         continue;
1574                                 // HangZhou and Roman are add later 
1575                                 // (code is above)
1576                                 else if (0x3021 <= cp && cp < 0x302A
1577                                         || 0x2160 <= cp && cp < 0x216A
1578                                         || 0x2170 <= cp && cp < 0x217A)
1579                                         continue;
1580
1581                                 if (cp ==  0x215B) // FIXME: why?
1582                                         fillIndex [0xC] += 2;
1583                                 else if (cp == 0x3021) // FIXME: why?
1584                                         fillIndex [0xC]++;
1585                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1586
1587                                 if (addnew || cp <= '9') {
1588                                         int xcp;
1589                                         if (1 <= currValue && currValue <= 10) {
1590                                                 xcp = cp - 0x31 + 0x2776;
1591                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1592                                                 xcp = cp - 0x31 + 0x2780;
1593                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1594                                                 xcp = cp - 0x31 + 0x278A;
1595                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1596                                         }
1597                                         if (1 <= currValue && currValue <= 20) {
1598                                                 xcp = cp - 0x31 + 0x2460;
1599                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1600                                                 xcp = cp - 0x31 + 0x2474;
1601                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1602                                                 xcp = cp - 0x31 + 0x2488;
1603                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1604                                         }
1605                                 }
1606
1607                                 if (cp != 0x09E7 && cp != 0x09EA)
1608                                         fillIndex [0xC]++;
1609
1610                                 // Add special cases that are not regarded as 
1611                                 // numbers in UnicodeCategory speak.
1612                                 if (cp == '5') {
1613                                         // TONE FIVE
1614                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1615                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1616                                 }
1617                                 else if (cp == '6') // FIXME: why?
1618                                         fillIndex [0xC]++;
1619                         }
1620
1621                         // 221E: infinity
1622                         fillIndex [0xC] = 0xFF;
1623                         AddCharMap ('\u221E', 0xC, 1);
1624                         #endregion
1625
1626                         #region Letters and NonSpacing Marks (general)
1627
1628                         // ASCII Latin alphabets
1629                         for (int i = 0; i < alphabets.Length; i++)
1630                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1631
1632
1633                         // non-ASCII Latin alphabets
1634                         // FIXME: there is no such characters that are placed
1635                         // *after* "alphabets" array items. This is nothing
1636                         // more than a hack that creates dummy weight for
1637                         // primary characters.
1638                         for (int i = 0x0080; i < 0x0300; i++) {
1639                                 if (!Char.IsLetter ((char) i))
1640                                         continue;
1641                                 // For those Latin Letters which has NFKD are
1642                                 // not added as independent primary character.
1643                                 if (decompIndex [i] != 0)
1644                                         continue;
1645                                 // SPECIAL CASES:
1646                                 // 1.some alphabets have primarily
1647                                 //   equivalent ASCII alphabets.
1648                                 // 2.some have independent primary weights,
1649                                 //   but inside a-to-z range.
1650                                 // 3.there are some expanded characters that
1651                                 //   are not part of Unicode Standard NFKD.
1652                                 switch (i) {
1653                                 // 1. skipping them does not make sense
1654 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
1655 //                              case 0x184: case 0x185: case 0x186: case 0x189:
1656 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
1657 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
1658 //                              case 0x19B: case 0x19C:
1659                                 // 2. skipping them does not make sense
1660 //                              case 0x14A: // Ng
1661 //                              case 0x14B: // ng
1662                                 // 3.
1663                                 case 0xC6: // AE
1664                                 case 0xE6: // ae
1665                                 case 0xDE: // Icelandic Thorn
1666                                 case 0xFE: // Icelandic Thorn
1667                                 case 0xDF: // German ss
1668                                 case 0xFF: // German ss
1669                                 // not classified yet
1670 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1671 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1672 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1673 //                              case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1674 //                              case 0x1DD:
1675                                         continue;
1676                                 }
1677                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
1678                         }
1679
1680                         // Greek and Coptic
1681                         fillIndex [0xF] = 02;
1682                         for (int i = 0x0380; i < 0x0390; i++)
1683                                 if (Char.IsLetter ((char) i))
1684                                         AddLetterMap ((char) i, 0xF, 1);
1685                         fillIndex [0xF] = 02;
1686                         for (int i = 0x0391; i < 0x03CF; i++)
1687                                 if (Char.IsLetter ((char) i))
1688                                         AddLetterMap ((char) i, 0xF, 1);
1689                         fillIndex [0xF] = 0x40;
1690                         for (int i = 0x03D0; i < 0x0400; i++)
1691                                 if (Char.IsLetter ((char) i))
1692                                         AddLetterMap ((char) i, 0xF, 1);
1693
1694                         // Cyrillic - character name order
1695                         fillIndex [0x10] = 0x6;
1696 //*
1697 for (int i = 0; i < orderedCyrillic.Length; i++)
1698 Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
1699
1700                         // table which is mostly from UCA DUCET.
1701                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1702                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
1703                                 if (!IsIgnorable ((int) c) &&
1704                                         c <= '\u045C' &&
1705                                         Char.IsLetter (c)) {
1706                                         AddLetterMap (c, 0x10, 0);
1707                                         fillIndex [0x10] += 3;
1708                                 }
1709                         }
1710                         /*
1711                         for (int i = 0x0460; i < 0x0481; i++) {
1712                                 if (Char.IsLetter ((char) i)) {
1713                                         AddLetterMap ((char) i, 0x10, 0);
1714                                         fillIndex [0x10] += 3;
1715                                 }
1716                         }
1717                         */
1718 /*
1719                         for (int i = 0x0400; i <= 0x0486; i++) {
1720                                 if (!Char.IsLetter ((char) i)) {
1721 //                                      AddCharMap ((char) i, 0x1, 1);
1722                                         continue;
1723                                 }
1724                                 if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
1725                                         Console.Error.WriteLine ("no value for {0:x04}", i);
1726                                         continue;
1727                                 }
1728                                 fillIndex [0x10] = 
1729                                         (byte) cyrillicLetterPrimaryValues [i];
1730                                 AddLetterMap ((char) i, 0x10, 0);
1731                         }
1732 */
1733
1734                         // Armenian
1735                         fillIndex [0x11] = 0x3;
1736                         for (int i = 0x0531; i < 0x0586; i++)
1737                                 if (Char.IsLetter ((char) i))
1738                                         AddLetterMap ((char) i, 0x11, 1);
1739
1740                         // Hebrew
1741                         // -Letters
1742                         fillIndex [0x12] = 0x3;
1743                         for (int i = 0x05D0; i < 0x05FF; i++)
1744                                 if (Char.IsLetter ((char) i))
1745                                         AddLetterMap ((char) i, 0x12, 1);
1746                         // -Accents
1747                         fillIndex [0x1] = 0x3;
1748                         for (int i = 0x0591; i <= 0x05C2; i++)
1749                                 if (i != 0x05BE)
1750                                         AddCharMap ((char) i, 0x1, 1);
1751
1752                         // Arabic
1753                         fillIndex [0x1] = 0x8E;
1754                         fillIndex [0x13] = 0x3;
1755                         for (int i = 0x0621; i <= 0x064A; i++) {
1756                                 // Abjad
1757                                 if (Char.GetUnicodeCategory ((char) i)
1758                                         != UnicodeCategory.OtherLetter) {
1759                                         // FIXME: arabic nonspacing marks are
1760                                         // in different order.
1761                                         AddCharMap ((char) i, 0x1, 1);
1762                                         continue;
1763                                 }
1764 //                              map [i] = new CharMapEntry (0x13,
1765 //                                      (byte) arabicLetterPrimaryValues [i], 1);
1766                                 fillIndex [0x13] = 
1767                                         (byte) arabicLetterPrimaryValues [i];
1768                                 AddLetterMap ((char) i, 0x13, 0);
1769                         }
1770                         fillIndex [0x13] = 0x84;
1771                         for (int i = 0x0674; i < 0x06D6; i++)
1772                                 if (Char.IsLetter ((char) i))
1773                                         AddLetterMap ((char) i, 0x13, 1);
1774
1775                         // Devanagari
1776                         // FIXME: it does seem straight codepoint mapping.
1777                         fillIndex [0x14] = 04;
1778                         for (int i = 0x0901; i < 0x0905; i++)
1779                                 if (!IsIgnorable (i))
1780                                         AddLetterMap ((char) i, 0x14, 2);
1781                         fillIndex [0x14] = 0xB;
1782                         for (int i = 0x0905; i < 0x093A; i++)
1783                                 if (Char.IsLetter ((char) i))
1784                                         AddLetterMap ((char) i, 0x14, 4);
1785                         for (int i = 0x093E; i < 0x094F; i++)
1786                                 if (!IsIgnorable (i))
1787                                         AddLetterMap ((char) i, 0x14, 2);
1788
1789                         // Bengali
1790                         // -Letters
1791                         fillIndex [0x15] = 02;
1792                         for (int i = 0x0980; i < 0x9FF; i++) {
1793                                 if (IsIgnorable (i))
1794                                         continue;
1795                                 if (i == 0x09E0)
1796                                         fillIndex [0x15] = 0x3B;
1797                                 switch (Char.GetUnicodeCategory ((char) i)) {
1798                                 case UnicodeCategory.NonSpacingMark:
1799                                 case UnicodeCategory.DecimalDigitNumber:
1800                                 case UnicodeCategory.OtherNumber:
1801                                         continue;
1802                                 }
1803                                 AddLetterMap ((char) i, 0x15, 1);
1804                         }
1805                         // -Signs
1806                         fillIndex [0x1] = 0x3;
1807                         for (int i = 0x0981; i < 0x0A00; i++)
1808                                 if (Char.GetUnicodeCategory ((char) i) ==
1809                                         UnicodeCategory.NonSpacingMark)
1810                                         AddCharMap ((char) i, 0x1, 1);
1811
1812                         // Gurmukhi. orderedGurmukhi is from UCA
1813                         // FIXME: it does not look equivalent to UCA.
1814                         fillIndex [0x1] = 03;
1815                         fillIndex [0x16] = 02;
1816                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
1817                                 char c = orderedGurmukhi [i];
1818                                 if (IsIgnorable ((int) c))
1819                                         continue;
1820                                 if (!Char.IsLetter (c)) {
1821                                         AddLetterMap (c, 0x1, 1);
1822                                         continue;
1823                                 }
1824                                 if (c == '\u0A3C' || c == '\u0A4D' ||
1825                                         '\u0A66' <= c && c <= '\u0A71')
1826                                         continue;
1827                                 AddLetterMap (c, 0x16, 4);
1828                         }
1829
1830                         // Gujarati. orderedGujarati is from UCA
1831                         fillIndex [0x17] = 02;
1832                         for (int i = 0; i < orderedGujarati.Length; i++)
1833                                 AddLetterMap (orderedGujarati [i], 0x17, 4);
1834
1835                         // Oriya
1836                         fillIndex [0x18] = 02;
1837                         for (int i = 0x0B00; i < 0x0B7F; i++) {
1838                                 switch (Char.GetUnicodeCategory ((char) i)) {
1839                                 case UnicodeCategory.NonSpacingMark:
1840                                 case UnicodeCategory.DecimalDigitNumber:
1841                                         continue;
1842                                 }
1843                                 AddLetterMap ((char) i, 0x18, 1);
1844                         }
1845
1846                         // Tamil
1847                         fillIndex [0x19] = 2;
1848                         AddCharMap ('\u0BD7', 0x19, 0);
1849                         fillIndex [0x19] = 0xA;
1850                         // vowels
1851                         for (int i = 0x0BD7; i < 0x0B94; i++)
1852                                 if (Char.IsLetter ((char) i))
1853                                         AddCharMap ((char) i, 0x19, 2);
1854                         // special vowel
1855                         fillIndex [0x19] = 0x24;
1856                         AddCharMap ('\u0B94', 0x19, 0);
1857                         fillIndex [0x19] = 0x26;
1858                         // The array for Tamil consonants is a constant.
1859                         // Windows has a sequence almost similar to TAM from
1860                         // tamilnet, but slightly different in Grantha.
1861                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
1862                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1863                         // combining marks
1864                         fillIndex [0x19] = 0x82;
1865                         for (int i = 0x0BBE; i < 0x0BCD; i++)
1866                                 if (Char.GetUnicodeCategory ((char) i) ==
1867                                         UnicodeCategory.SpacingCombiningMark
1868                                         || i == 0x0BC0)
1869                                         AddLetterMap ((char) i, 0x19, 2);
1870
1871                         // Telugu
1872                         fillIndex [0x1A] = 0x4;
1873                         for (int i = 0x0C00; i < 0x0C62; i++) {
1874                                 if (i == 0x0C55 || i == 0x0C56)
1875                                         continue; // skip
1876                                 AddCharMap ((char) i, 0x1A, 3);
1877                                 char supp = (i == 0x0C0B) ? '\u0C60':
1878                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
1879                                 if (supp == char.MinValue)
1880                                         continue;
1881                                 AddCharMap (supp, 0x1A, 3);
1882                         }
1883
1884                         // Kannada
1885                         fillIndex [0x1B] = 4;
1886                         for (int i = 0x0C80; i < 0x0CE5; i++) {
1887                                 if (i == 0x0CD5 || i == 0x0CD6)
1888                                         continue; // ignore
1889                                 AddCharMap ((char) i, 0x1B, 3);
1890                         }
1891                         
1892                         // Malayalam
1893                         fillIndex [0x1C] = 2;
1894                         for (int i = 0x0D02; i < 0x0D61; i++)
1895                                 // FIXME: I avoided MSCompatUnicodeTable usage
1896                                 // here (it results in recursion). So check if
1897                                 // using NonSpacingMark makes sense or not.
1898                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1899 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1900                                         AddCharMap ((char) i, 0x1C, 1);
1901
1902                         // Thai ... note that it breaks 0x1E wall after E2B!
1903                         // Also, all Thai characters have level 2 value 3.
1904                         fillIndex [0x1E] = 2;
1905                         for (int i = 0xE44; i < 0xE48; i++)
1906                                 AddCharMap ((char) i, 0x1E, 1, 3);
1907                         for (int i = 0xE01; i < 0xE2B; i++)
1908                                 AddCharMap ((char) i, 0x1E, 6, 0);
1909                         fillIndex [0x1F] = 5;
1910                         for (int i = 0xE2B; i < 0xE30; i++)
1911                                 AddCharMap ((char) i, 0x1F, 6, 0);
1912                         for (int i = 0xE30; i < 0xE3B; i++)
1913                                 AddCharMap ((char) i, 0x1F, 1, 3);
1914                         // some Thai characters remain.
1915                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
1916                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1917                         foreach (char c in specialThai)
1918                                 AddCharMap (c, 0x1F, 1);
1919
1920                         // Lao
1921                         fillIndex [0x1F] = 2;
1922                         for (int i = 0xE80; i < 0xEDF; i++)
1923                                 if (Char.IsLetter ((char) i))
1924                                         AddCharMap ((char) i, 0x1F, 1);
1925
1926                         // Georgian. orderedGeorgian is from UCA DUCET.
1927                         fillIndex [0x21] = 5;
1928                         for (int i = 0; i < orderedGeorgian.Length; i++)
1929                                 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1930
1931                         // Japanese Kana.
1932                         fillIndex [0x22] = 2;
1933                         int kanaOffset = 0x3041;
1934                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1935
1936                         for (int gyo = 0; gyo < 9; gyo++) {
1937                                 for (int dan = 0; dan < 5; dan++) {
1938                                         if (gyo == 7 && dan % 2 == 1) {
1939                                                 // 'ya'-gyo
1940                                                 fillIndex [0x22]++;
1941                                                 kanaOffset -= 2; // There is no space for yi and ye.
1942                                                 continue;
1943                                         }
1944                                         int cp = kanaOffset + dan * kanaLines [gyo];
1945                                         // small lines (a-gyo, ya-gyo)
1946                                         if (gyo == 0 || gyo == 7) {
1947                                                 AddKanaMap (cp, 1); // small
1948                                                 AddKanaMap (cp + 1, 1);
1949                                         }
1950                                         else
1951                                                 AddKanaMap (cp, kanaLines [gyo]);
1952                                         fillIndex [0x22]++;
1953
1954                                         if (cp == 0x3061) {
1955                                                 // add small 'Tsu' (before normal one)
1956                                                 AddKanaMap (0x3063, 1);
1957                                                 kanaOffset++;
1958                                         }
1959                                 }
1960                                 fillIndex [0x22] += 3;
1961                                 kanaOffset += 5 * kanaLines [gyo];
1962                         }
1963
1964                         // Wa-gyo is almost special, so I just manually add.
1965                         AddLetterMap ((char) 0x308E, 0x22, 0);
1966                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1967                         AddLetterMap ((char) 0x308F, 0x22, 0);
1968                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1969                         fillIndex [0x22]++;
1970                         AddLetterMap ((char) 0x3090, 0x22, 0);
1971                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1972                         fillIndex [0x22] += 2;
1973                         // no "Wu" in Japanese.
1974                         AddLetterMap ((char) 0x3091, 0x22, 0);
1975                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1976                         fillIndex [0x22]++;
1977                         AddLetterMap ((char) 0x3092, 0x22, 0);
1978                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1979                         // Nn
1980                         fillIndex [0x22] = 0x80;
1981                         AddLetterMap ((char) 0x3093, 0x22, 0);
1982                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1983
1984                         // JIS Japanese square chars.
1985                         fillIndex [0x22] = 0x97;
1986                         jisJapanese.Sort (JISComparer.Instance);
1987                         foreach (JISCharacter j in jisJapanese)
1988                                 AddCharMap ((char) j.CP, 0x22, 1);
1989                         // non-JIS Japanese square chars.
1990                         nonJisJapanese.Sort (NonJISComparer.Instance);
1991                         foreach (NonJISCharacter j in nonJisJapanese)
1992                                 AddCharMap ((char) j.CP, 0x22, 1);
1993
1994                         // Bopomofo
1995                         fillIndex [0x23] = 0x02;
1996                         for (int i = 0x3105; i <= 0x312C; i++)
1997                                 AddCharMap ((char) i, 0x23, 1);
1998
1999                         // Estrangela: ancient Syriac
2000                         fillIndex [0x24] = 0x0B;
2001                         // FIXME: is 0x71E really alternative form?
2002                         ArrayList syriacAlternatives = new ArrayList (
2003                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2004                         for (int i = 0x0710; i <= 0x072C; i++) {
2005                                 if (i == 0x0711) // NonSpacingMark
2006                                         continue;
2007                                 if (syriacAlternatives.Contains (i))
2008                                         continue;
2009                                 AddCharMap ((char) i, 0x24, 4);
2010                                 // FIXME: why?
2011                                 if (i == 0x721)
2012                                         fillIndex [0x24]++;
2013                         }
2014                         foreach (int cp in syriacAlternatives)
2015                                 map [cp] = new CharMapEntry (0x24,
2016                                         (byte) (map [cp - 1].Level1 + 2),
2017                                         0);
2018
2019                         // Thaana
2020                         // FIXME: it turned out that it does not look like UCA
2021                         fillIndex [0x24] = 0x6E;
2022                         for (int i = 0; i < orderedThaana.Length; i++) {
2023                                 if (IsIgnorableNonSpacing (i))
2024                                         continue;
2025                                 AddCharMap (orderedThaana [i], 0x24, 2);
2026                         }
2027                         #endregion
2028
2029                         // FIXME: Add more culture-specific letters (that are
2030                         // not supported in Windows collation) here.
2031
2032                         // Surrogate ... they are computed.
2033
2034                         #region Hangul
2035                         // Hangul.
2036                         //
2037                         // Unlike UCA Windows Hangul sequence mixes Jongseong
2038                         // with Choseong sequence as well as Jungseong,
2039                         // adjusted to have the same primary weight for the
2040                         // same base character. So it is impossible to compute
2041                         // those sort keys.
2042                         //
2043                         // Here I introduce an ordered sequence of mixed
2044                         // 'commands' and 'characters' that is similar to
2045                         // LDML text:
2046                         //      - ',' increases primary weight.
2047                         //      - [A B] means a range, increasing index
2048                         //      - {A B} means a range, without increasing index
2049                         //      - '=' is no operation (it means the characters 
2050                         //        of both sides have the same weight).
2051                         //      - '>' inserts a Hangul Syllable block that 
2052                         //        contains 0x251 characters.
2053                         //      - '<' decreases the index
2054                         //      - '0'-'9' means skip count
2055                         //      - whitespaces are ignored
2056                         //
2057
2058                         string hangulSequence =
2059                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2060                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2061                         + "<{\u1113 \u1116}, \u3165,"
2062                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2063                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2064                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
2065                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2066                                 + "[\u11D1 \u11D2], \u11B2,"
2067                                 + "[\u11D3 \u11D5], \u11B3,"
2068                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2069                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2070                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2071                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2072                         + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
2073                                 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
2074                         + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
2075                                 + "\u11EA,, \u110A=\u11BB,,, >"
2076                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2077                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2078                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2079                                 + "\u11F1,, \u11F2,,,"
2080                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2081                         + "<\u114D, \u110D,,  >"
2082                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2083                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2084                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2085                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2086                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2087                                 + "[\u11F5 \u11F8]"
2088                         ;
2089
2090                         byte hangulCat = 0x52;
2091                         fillIndex [hangulCat] = 0x2;
2092
2093                         int syllableBlock = 0;
2094                         for (int n = 0; n < hangulSequence.Length; n++) {
2095                                 char c = hangulSequence [n];
2096                                 int start, end;
2097                                 if (Char.IsWhiteSpace (c))
2098                                         continue;
2099                                 switch (c) {
2100                                 case '=':
2101                                         break; // NOP
2102                                 case ',':
2103                                         IncrementSequentialIndex (ref hangulCat);
2104                                         break;
2105                                 case '<':
2106                                         if (fillIndex [hangulCat] == 2)
2107                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2108                                         fillIndex [hangulCat]--;
2109                                         break;
2110                                 case '>':
2111                                         IncrementSequentialIndex (ref hangulCat);
2112                                         for (int l = 0; l < 0x15; l++)
2113                                                 for (int v = 0; v < 0x1C; v++) {
2114                                                         AddCharMap (
2115                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2116                                                         IncrementSequentialIndex (ref hangulCat);
2117                                                 }
2118                                         syllableBlock++;
2119                                         break;
2120                                 case '[':
2121                                         start = hangulSequence [n + 1];
2122                                         end = hangulSequence [n + 3];
2123                                         for (int i = start; i <= end; i++) {
2124                                                 AddCharMap ((char) i, hangulCat, 0);
2125                                                 if (end > i)
2126                                                         IncrementSequentialIndex (ref hangulCat);
2127                                         }
2128                                         n += 4; // consumes 5 characters for this operation
2129                                         break;
2130                                 case '{':
2131                                         start = hangulSequence [n + 1];
2132                                         end = hangulSequence [n + 3];
2133                                         for (int i = start; i <= end; i++)
2134                                                 AddCharMap ((char) i, hangulCat, 0);
2135                                         n += 4; // consumes 5 characters for this operation
2136                                         break;
2137                                 default:
2138                                         AddCharMap (c, hangulCat, 0);
2139                                         break;
2140                                 }
2141                         }
2142
2143                         // Some Jamo NFKD.
2144                         for (int i = 0x3200; i < 0x3300; i++) {
2145                                 if (IsIgnorable (i) || map [i].Defined)
2146                                         continue;
2147                                 int ch = 0;
2148                                 // w/ bracket
2149                                 if (decompLength [i] == 4 &&
2150                                         decompValues [decompIndex [i]] == '(')
2151                                         ch = decompIndex [i] + 1;
2152                                 // circled
2153                                 else if (decompLength [i] == 2 &&
2154                                         decompValues [decompIndex [i] + 1] == '\u1161')
2155                                         ch = decompIndex [i];
2156                                 else if (decompLength [i] == 1)
2157                                         ch = decompIndex [i];
2158                                 else
2159                                         continue;
2160                                 ch = decompValues [ch];
2161                                 if (ch < 0x1100 || 0x1200 < ch &&
2162                                         ch < 0xAC00 || 0xD800 < ch)
2163                                         continue;
2164                                 map [i] = new CharMapEntry (map [ch].Category,
2165                                         (byte) (map [ch].Level1 + 1),
2166                                         map [ch].Level2);
2167 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2168                         }
2169
2170
2171                         #endregion
2172
2173                         // Letterlike characters and CJK compatibility square
2174                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2175                         int [] counts = new int ['Z' - 'A' + 1];
2176                         char [] namedChars = new char [sortableCharNames.Count];
2177                         int nCharNames = 0;
2178                         foreach (DictionaryEntry de in sortableCharNames) {
2179                                 counts [((string) de.Value) [0] - 'A']++;
2180                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2181                         }
2182                         nCharNames = 0; // reset
2183                         for (int a = 0; a < counts.Length; a++) {
2184                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2185                                 for (int i = 0; i < counts [a]; i++)
2186 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2187                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2188                         }
2189
2190                         // CJK unified ideograph.
2191                         byte cjkCat = 0x9E;
2192                         fillIndex [cjkCat] = 0x2;
2193                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2194                                 if (!IsIgnorable (cp))
2195                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2196                         // CJK Extensions goes here.
2197                         // LAMESPEC: With this Windows style CJK layout, it is
2198                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2199                         // 0x9FBB can never be added w/o breaking compat.
2200                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2201                                 if (!IsIgnorable (cp))
2202                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2203
2204                         // PrivateUse ... computed.
2205                         // remaining Surrogate ... computed.
2206
2207                         #region Special "biggest" area (FF FF)
2208                         fillIndex [0xFF] = 0xFF;
2209                         char [] specialBiggest = new char [] {
2210                                 '\u3005', '\u3031', '\u3032', '\u309D',
2211                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2212                                 '\uFE7C', '\uFE7D', '\uFF70'};
2213                         foreach (char c in specialBiggest)
2214                                 AddCharMap (c, 0xFF, 0);
2215                         #endregion
2216
2217                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2218                         // non-alphanumeric ASCII except for: + - < = > '
2219                         for (int i = 0x21; i < 0x7F; i++) {
2220                                 if (Char.IsLetterOrDigit ((char) i)
2221                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2222                                         continue; // they are not added here.
2223                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2224                                 // Insert 3001 after ',' and 3002 after '.'
2225                                 if (i == 0x2C)
2226                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2227                                 else if (i == 0x2E) {
2228                                         fillIndex [0x7]--;
2229                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2230                                 }
2231                                 else if (i == 0x3A)
2232                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2233                         }
2234                         #endregion
2235
2236                         #region 07 - Punctuations and something else
2237                         for (int i = 0xA0; i < char.MaxValue; i++) {
2238                                 if (IsIgnorable (i))
2239                                         continue;
2240
2241                                 // SPECIAL CASES:
2242                                 switch (i) {
2243                                 case 0xAB: // 08
2244                                 case 0xB7: // 0A
2245                                 case 0x2329: // 09
2246                                 case 0x232A: // 09
2247                                         continue;
2248                                 }
2249
2250                                 switch (Char.GetUnicodeCategory ((char) i)) {
2251                                 case UnicodeCategory.OtherPunctuation:
2252                                 case UnicodeCategory.ClosePunctuation:
2253                                 case UnicodeCategory.OpenPunctuation:
2254                                 case UnicodeCategory.InitialQuotePunctuation:
2255                                 case UnicodeCategory.FinalQuotePunctuation:
2256                                 case UnicodeCategory.ModifierSymbol:
2257                                         // SPECIAL CASES: // 0xA
2258                                         if (0x2020 <= i && i <= 0x2042)
2259                                                 continue;
2260                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2261                                         break;
2262                                 default:
2263                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2264                                                 goto case UnicodeCategory.OtherPunctuation;
2265                                         break;
2266                                 }
2267                         }
2268                         // Control pictures
2269                         for (int i = 0x2400; i <= 0x2421; i++)
2270                                 AddCharMap ((char) i, 0x7, 1, 0);
2271                         #endregion
2272
2273                         // FIXME: for 07 xx we need more love.
2274
2275                         // FIXME: 08 should be more complete.
2276                         fillIndex [0x8] = 2;
2277                         for (int cp = 0; cp < char.MaxValue; cp++)
2278                                 if (!map [cp].Defined &&
2279                                         Char.GetUnicodeCategory ((char) cp) ==
2280                                         UnicodeCategory.MathSymbol)
2281                                         AddCharMapGroup ((char) cp, 0x8, 1, 0);
2282
2283                         // Characters w/ diacritical marks (NFKD)
2284                         for (int i = 0; i <= char.MaxValue; i++) {
2285                                 if (map [i].Defined || IsIgnorable (i))
2286                                         continue;
2287                                 if (decompIndex [i] == 0)
2288                                         continue;
2289
2290                                 int start = decompIndex [i];
2291                                 int primaryChar = decompValues [start];
2292                                 int secondary = 0;
2293                                 bool skip = false;
2294                                 int length = decompLength [i];
2295                                 // special processing for parenthesized ones.
2296                                 if (length == 3 &&
2297                                         decompValues [start] == '(' &&
2298                                         decompValues [start + 2] == ')') {
2299                                         primaryChar = decompValues [start + 1];
2300                                         length = 1;
2301                                 }
2302
2303                                 if (map [primaryChar].Level1 == 0)
2304                                         continue;
2305
2306                                 for (int l = 1; l < length; l++) {
2307                                         int c = decompValues [start + l];
2308                                         if (map [c].Level1 != 0)
2309                                                 skip = true;
2310                                         secondary += diacritical [c];
2311                                 }
2312                                 if (skip)
2313                                         continue;
2314                                 map [i] = new CharMapEntry (
2315                                         map [primaryChar].Category,
2316                                         map [primaryChar].Level1,
2317                                         (byte) secondary);
2318                                 
2319                         }
2320
2321                         #region Level2 adjustment
2322                         // Arabic Hamzah
2323                         diacritical [0x624] = 0x5;
2324                         diacritical [0x626] = 0x7;
2325                         diacritical [0x622] = 0x9;
2326                         diacritical [0x623] = 0xA;
2327                         diacritical [0x625] = 0xB;
2328                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2329                         diacritical [0x64A] = 0x7; // Yaa'
2330
2331
2332                         for (int i = 0; i < char.MaxValue; i++) {
2333                                 byte mod = 0;
2334                                 byte cat = map [i].Category;
2335                                 switch (cat) {
2336                                 case 0xE: // Latin diacritics
2337                                 case 0x22: // Japanese: circled characters
2338                                         mod = diacritical [i];
2339                                         break;
2340                                 case 0x13: // Arabic
2341                                         if (diacritical [i] == 0)
2342                                                 mod = 0x8; // default for arabic
2343                                         break;
2344                                 }
2345                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2346                                         mod = diacritical [i];
2347                                 if (mod > 0)
2348                                         map [i] = new CharMapEntry (
2349                                                 cat, map [i].Level1, mod);
2350                         }
2351                         #endregion
2352
2353                         // FIXME: this is hack but those which are 
2354                         // NonSpacingMark characters and still undefined
2355                         // are likely to be nonspacing.
2356                         for (int i = 0; i < char.MaxValue; i++)
2357                                 if (!map [i].Defined &&
2358                                         !IsIgnorable (i) &&
2359                                         Char.GetUnicodeCategory ((char) i) ==
2360                                         UnicodeCategory.NonSpacingMark)
2361                                         AddCharMap ((char) i, 1, 1);
2362                 }
2363
2364                 private void IncrementSequentialIndex (ref byte hangulCat)
2365                 {
2366                         fillIndex [hangulCat]++;
2367                         if (fillIndex [hangulCat] == 0) { // overflown
2368                                 hangulCat++;
2369                                 fillIndex [hangulCat] = 0x2;
2370                         }
2371                 }
2372
2373                 // Reset fillIndex to fixed value and call AddLetterMap().
2374                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2375                 {
2376                         fillIndex [category] = alphaWeight;
2377                         AddLetterMap (c, category, 0);
2378
2379                         ArrayList al = latinMap [c] as ArrayList;
2380                         if (al == null)
2381                                 return;
2382
2383                         foreach (int cp in al)
2384                                 AddLetterMap ((char) cp, category, 0);
2385                 }
2386
2387                 private void AddKanaMap (int i, byte voices)
2388                 {
2389                         for (byte b = 0; b < voices; b++) {
2390                                 char c = (char) (i + b);
2391                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
2392                                 // Hiragana
2393                                 AddLetterMapCore (c, 0x22, 0, arg);
2394                                 // Katakana
2395                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
2396                         }
2397                 }
2398
2399                 private void AddLetterMap (char c, byte category, byte updateCount)
2400                 {
2401                         AddLetterMapCore (c, category, updateCount, 0);
2402                 }
2403
2404                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2405                 {
2406                         char c2;
2407                         // <small> updates index
2408                         c2 = ToSmallForm (c);
2409                         if (c2 != c)
2410                                 AddCharMapGroup (c2, category, updateCount, level2);
2411                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2412                         if (c2 != c && !map [(int) c2].Defined)
2413                                 AddLetterMapCore (c2, category, 0, level2);
2414                         bool doUpdate = true;
2415                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2416                                 doUpdate = false;
2417                         else
2418                                 AddCharMapGroup (c, category, 0, level2);
2419                         if (doUpdate)
2420                                 fillIndex [category] += updateCount;
2421                 }
2422
2423                 private bool AddCharMap (char c, byte category, byte increment)
2424                 {
2425                         return AddCharMap (c, category, increment, 0);
2426                 }
2427                 
2428                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2429                 {
2430                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2431                                 return false; // do nothing
2432                         map [(int) c] = new CharMapEntry (category,
2433                                 category == 1 ? alt : fillIndex [category],
2434                                 category == 1 ? fillIndex [category] : alt);
2435                         fillIndex [category] += increment;
2436                         return true;
2437                 }
2438
2439                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2440                 {
2441                         char c2 = ToSmallFormTail (c);
2442                         if (c2 != c)
2443                                 AddCharMap (c2, category, updateCount, 0);
2444                         // itself
2445                         AddCharMap (c, category, updateCount, 0);
2446                         // <full>
2447                         c2 = ToFullWidthTail (c);
2448                         if (c2 != c)
2449                                 AddCharMapGroupTail (c2, category, updateCount);
2450                 }
2451
2452                 //
2453                 // Adds characters to table in the order below 
2454                 // (+ increases weight):
2455                 //      (<small> +)
2456                 //      itself
2457                 //      <fraction>
2458                 //      <full> | <super> | <sub>
2459                 //      <circle> | <wide> (| <narrow>)
2460                 //      +
2461                 //      (vertical +)
2462                 //
2463                 // level2 is fixed (does not increase).
2464                 int [] sameWeightItems = new int [] {
2465                         DecompositionFraction,
2466                         DecompositionFull,
2467                         DecompositionSuper,
2468                         DecompositionSub,
2469                         DecompositionCircle,
2470                         DecompositionWide,
2471                         DecompositionNarrow,
2472                         };
2473                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2474                 {
2475                         if (map [(int) c].Defined)
2476                                 return;
2477
2478                         char small = char.MinValue;
2479                         char vertical = char.MinValue;
2480                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2481                         if (nfkd != null) {
2482                                 object smv = nfkd [(byte) DecompositionSmall];
2483                                 if (smv != null)
2484                                         small = (char) ((int) smv);
2485                                 object vv = nfkd [(byte) DecompositionVertical];
2486                                 if (vv != null)
2487                                         vertical = (char) ((int) vv);
2488                         }
2489
2490                         // <small> updates index
2491                         if (small != char.MinValue)
2492                                 AddCharMap (small, category, updateCount);
2493
2494                         // itself
2495                         AddCharMap (c, category, 0, level2);
2496
2497                         if (nfkd != null) {
2498                                 foreach (int weight in sameWeightItems) {
2499                                         object wv = nfkd [(byte) weight];
2500                                         if (wv != null)
2501                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
2502                                 }
2503                         }
2504
2505                         // update index here.
2506                         fillIndex [category] += updateCount;
2507
2508                         if (vertical != char.MinValue)
2509                                 AddCharMap (vertical, category, updateCount, level2);
2510                 }
2511
2512                 private void AddCharMapCJK (char c, ref byte category)
2513                 {
2514                         AddCharMap (c, category, 0, 0);
2515                         IncrementSequentialIndex (ref category);
2516
2517                         // Special. I wonder why but Windows skips 9E F9.
2518                         if (category == 0x9E && fillIndex [category] == 0xF9)
2519                                 IncrementSequentialIndex (ref category);
2520                 }
2521
2522                 private void AddCharMapGroupCJK (char c, ref byte category)
2523                 {
2524                         AddCharMapCJK (c, ref category);
2525
2526                         // LAMESPEC: see below.
2527                         if (c == '\u52DE') {
2528                                 AddCharMapCJK ('\u3298', ref category);
2529                                 AddCharMapCJK ('\u3238', ref category);
2530                         }
2531                         if (c == '\u5BEB')
2532                                 AddCharMapCJK ('\u32A2', ref category);
2533                         if (c == '\u91AB')
2534                                 // Especially this mapping order totally does
2535                                 // not make sense to me.
2536                                 AddCharMapCJK ('\u32A9', ref category);
2537
2538                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2539                         if (nfkd == null)
2540                                 return;
2541                         for (byte weight = 0; weight <= 0x12; weight++) {
2542                                 object wv = nfkd [weight];
2543                                 if (wv == null)
2544                                         continue;
2545                                 int w = (int) wv;
2546
2547                                 // Special: they are ignored in this area.
2548                                 // FIXME: check if it is sane
2549                                 if (0xF900 <= w && w <= 0xFAD9)
2550                                         continue;
2551                                 // LAMESPEC: on Windows some of CJK characters
2552                                 // in 3200-32B0 are incorrectly mapped. They
2553                                 // mix Chinise and Japanese Kanji when
2554                                 // ordering those characters.
2555                                 switch (w) {
2556                                 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2557                                         continue;
2558                                 }
2559
2560                                 AddCharMapCJK ((char) w, ref category);
2561                         }
2562                 }
2563
2564                 // For now it is only for 0x7 category.
2565                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2566                 {
2567                         char small = char.MinValue;
2568                         char vertical = char.MinValue;
2569                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2570                         if (nfkd != null) {
2571                                 object smv = nfkd [(byte) DecompositionSmall];
2572                                 if (smv != null)
2573                                         small = (char) ((int) smv);
2574                                 object vv = nfkd [(byte) DecompositionVertical];
2575                                 if (vv != null)
2576                                         vertical = (char) ((int) vv);
2577                         }
2578
2579                         // <small> updates index
2580                         if (small != char.MinValue)
2581                                 // SPECIAL CASE excluded (FIXME: why?)
2582                                 if (small != '\u2024')
2583                                         AddCharMap (small, category, updateCount);
2584
2585                         // itself
2586                         AddCharMap (c, category, updateCount, level2);
2587
2588                         // Since nfkdMap is problematic to have two or more
2589                         // NFKD to an identical character, here I iterate all.
2590                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2591                                 if (decompLength [c2] == 1 &&
2592                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
2593                                         switch (decompType [c2]) {
2594                                         case DecompositionCompat:
2595                                                 AddCharMap ((char) c2, category, updateCount, level2);
2596                                                 break;
2597                                         }
2598                                 }
2599                         }
2600
2601                         if (vertical != char.MinValue)
2602                                 // SPECIAL CASE excluded (FIXME: why?)
2603                                 if (vertical != '\uFE33' && vertical != '\uFE34')
2604                                         AddCharMap (vertical, category, updateCount, level2);
2605                 }
2606
2607                 private void AddArabicCharMap (char c)
2608                 {
2609                         byte category = 6;
2610                         byte updateCount = 1;
2611                         byte level2 = 0;
2612
2613                         // itself
2614                         AddCharMap (c, category, 0, level2);
2615
2616                         // Since nfkdMap is problematic to have two or more
2617                         // NFKD to an identical character, here I iterate all.
2618                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2619                                 if (decompLength [c2] == 0)
2620                                         continue;
2621                                 int idx = decompIndex [c2] + decompLength [c2] - 1;
2622                                 if ((int) (decompValues [idx]) == (int) c)
2623                                         AddCharMap ((char) c2, category,
2624                                                 0, level2);
2625                         }
2626                         fillIndex [category] += updateCount;
2627                 }
2628
2629                 char ToFullWidth (char c)
2630                 {
2631                         return ToDecomposed (c, DecompositionFull, false);
2632                 }
2633
2634                 char ToFullWidthTail (char c)
2635                 {
2636                         return ToDecomposed (c, DecompositionFull, true);
2637                 }
2638
2639                 char ToSmallForm (char c)
2640                 {
2641                         return ToDecomposed (c, DecompositionSmall, false);
2642                 }
2643
2644                 char ToSmallFormTail (char c)
2645                 {
2646                         return ToDecomposed (c, DecompositionSmall, true);
2647                 }
2648
2649                 char ToDecomposed (char c, byte d, bool tail)
2650                 {
2651                         if (decompType [(int) c] != d)
2652                                 return c;
2653                         int idx = decompIndex [(int) c];
2654                         if (tail)
2655                                 idx += decompLength [(int) c] - 1;
2656                         return (char) decompValues [idx];
2657                 }
2658
2659                 bool ExistsJIS (int cp)
2660                 {
2661                         foreach (JISCharacter j in jisJapanese)
2662                                 if (j.CP == cp)
2663                                         return true;
2664                         return false;
2665                 }
2666
2667                 #endregion
2668
2669                 #region Level 3 properties (Case/Width)
2670
2671                 private byte ComputeLevel3Weight (char c)
2672                 {
2673                         byte b = ComputeLevel3WeightRaw (c);
2674                         return b > 0 ? (byte) (b + 2) : b;
2675                 }
2676
2677                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2678                 {
2679                         // Korean
2680                         if ('\u11A8' <= c && c <= '\u11F9')
2681                                 return 2;
2682                         if ('\uFFA0' <= c && c <= '\uFFDC')
2683                                 return 4;
2684                         if ('\u3130' <= c && c <= '\u3164')
2685                                 return 5;
2686                         // numbers
2687                         if ('\u2776' <= c && c <= '\u277F')
2688                                 return 4;
2689                         if ('\u2780' <= c && c <= '\u2789')
2690                                 return 8;
2691                         if ('\u2776' <= c && c <= '\u2793')
2692                                 return 0xC;
2693                         if ('\u2160' <= c && c <= '\u216F')
2694                                 return 0x18;
2695                         if ('\u2181' <= c && c <= '\u2182')
2696                                 return 0x18;
2697                         // Arabic
2698                         if ('\u2135' <= c && c <= '\u2138')
2699                                 return 4;
2700                         if ('\uFE80' <= c && c < '\uFE8E') {
2701                                 // 2(Isolated)/8(Final)/0x18(Medial)
2702                                 switch (decompType [(int) c]) {
2703                                 case DecompositionIsolated:
2704                                         return 2;
2705                                 case DecompositionFinal:
2706                                         return 8;
2707                                 case DecompositionMedial:
2708                                         return 0x18;
2709                                 }
2710                         }
2711
2712                         // actually I dunno the reason why they have weights.
2713                         switch (c) {
2714                         case '\u01BC':
2715                                 return 0x10;
2716                         case '\u06A9':
2717                                 return 0x20;
2718                         case '\u06AA':
2719                                 return 0x28;
2720                         }
2721
2722                         byte ret = 0;
2723                         switch (c) {
2724                         case '\u03C2':
2725                         case '\u2104':
2726                         case '\u212B':
2727                                 ret |= 8;
2728                                 break;
2729                         case '\uFE42':
2730                                 ret |= 0xC;
2731                                 break;
2732                         }
2733
2734                         // misc
2735                         switch (decompType [(int) c]) {
2736                         case DecompositionWide: // <wide>
2737                         case DecompositionSub: // <sub>
2738                         case DecompositionSuper: // <super>
2739                                 ret |= decompType [(int) c];
2740                                 break;
2741                         }
2742                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2743                                 ret |= 8;
2744                         if (isUppercase [(int) c]) // DerivedCoreProperties
2745                                 ret |= 0x10;
2746
2747                         return ret;
2748                 }
2749
2750                 #endregion
2751
2752                 #region IsIgnorable
2753 /*
2754                 static bool IsIgnorable (int i)
2755                 {
2756                         if (unicodeAge [i] >= 3.1)
2757                                 return true;
2758                         switch (char.GetUnicodeCategory ((char) i)) {
2759                         case UnicodeCategory.OtherNotAssigned:
2760                         case UnicodeCategory.Format:
2761                                 return true;
2762                         }
2763                         return false;
2764                 }
2765 */
2766
2767                 // FIXME: In the future use DerivedAge.txt to examine character
2768                 // versions and set those ones that have higher version than
2769                 // 1.0 as ignorable.
2770                 static bool IsIgnorable (int i)
2771                 {
2772                         switch (i) {
2773                         case 0:
2774                         // I guess, those characters are added between
2775                         // Unicode 1.0 (LCMapString) and Unicode 3.1
2776                         // (UnicodeCategory), so they used to be 
2777                         // something like OtherNotAssigned as of Unicode 1.1.
2778                         case 0x2df: case 0x387:
2779                         case 0x3d7: case 0x3d8: case 0x3d9:
2780                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2781                         case 0x400: case 0x40d: case 0x450: case 0x45d:
2782                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
2783                         case 0x653: case 0x654: case 0x655: case 0x66d:
2784                         case 0xb56:
2785                         case 0x1e9b: case 0x202f: case 0x20ad:
2786                         case 0x20ae: case 0x20af:
2787                         case 0x20e2: case 0x20e3:
2788                         case 0x2139: case 0x213a: case 0x2183:
2789                         case 0x2425: case 0x2426: case 0x2619:
2790                         case 0x2670: case 0x2671: case 0x3007:
2791                         case 0x3190: case 0x3191:
2792                         case 0xfffc: case 0xfffd:
2793                                 return true;
2794                         // exceptional characters filtered by the 
2795                         // following conditions. Originally those exceptional
2796                         // ranges are incorrect (they should not be ignored)
2797                         // and most of those characters are unfortunately in
2798                         // those ranges.
2799                         case 0x4d8: case 0x4d9:
2800                         case 0x4e8: case 0x4e9:
2801                         case 0x3036: case 0x303f:
2802                         case 0x337b: case 0xfb1e:
2803                                 return false;
2804                         }
2805
2806                         if (
2807                                 // The whole Sinhala characters.
2808                                 0x0D82 <= i && i <= 0x0DF4
2809                                 // The whole Tibetan characters.
2810                                 || 0x0F00 <= i && i <= 0x0FD1
2811                                 // The whole Myanmar characters.
2812                                 || 0x1000 <= i && i <= 0x1059
2813                                 // The whole Etiopic, Cherokee, 
2814                                 // Canadian Syllablic, Ogham, Runic,
2815                                 // Tagalog, Hanunoo, Philippine,
2816                                 // Buhid, Tagbanwa, Khmer and Mongorian
2817                                 // characters.
2818                                 || 0x1200 <= i && i <= 0x1DFF
2819                                 // Greek extension characters.
2820                                 || 0x1F00 <= i && i <= 0x1FFF
2821                                 // The whole Braille characters.
2822                                 || 0x2800 <= i && i <= 0x28FF
2823                                 // CJK radical characters.
2824                                 || 0x2E80 <= i && i <= 0x2EF3
2825                                 // Kangxi radical characters.
2826                                 || 0x2F00 <= i && i <= 0x2FD5
2827                                 // Ideographic description characters.
2828                                 || 0x2FF0 <= i && i <= 0x2FFB
2829                                 // Bopomofo letter and final
2830                                 || 0x31A0 <= i && i <= 0x31B7
2831                                 // White square with quadrant characters.
2832                                 || 0x25F0 <= i && i <= 0x25F7
2833                                 // Ideographic telegraph symbols.
2834                                 || 0x32C0 <= i && i <= 0x32CB
2835                                 || 0x3358 <= i && i <= 0x3370
2836                                 || 0x33E0 <= i && i <= 0x33FF
2837                                 // The whole YI characters.
2838                                 || 0xA000 <= i && i <= 0xA48C
2839                                 || 0xA490 <= i && i <= 0xA4C6
2840                                 // American small ligatures
2841                                 || 0xFB13 <= i && i <= 0xFB17
2842                                 // hebrew, arabic, variation selector.
2843                                 || 0xFB1D <= i && i <= 0xFE2F
2844                                 // Arabic ligatures.
2845                                 || 0xFEF5 <= i && i <= 0xFEFC
2846                                 // FIXME: why are they excluded?
2847                                 || 0x01F6 <= i && i <= 0x01F9
2848                                 || 0x0218 <= i && i <= 0x0233
2849                                 || 0x02A9 <= i && i <= 0x02AD
2850                                 || 0x02EA <= i && i <= 0x02EE
2851                                 || 0x0349 <= i && i <= 0x036F
2852                                 || 0x0488 <= i && i <= 0x048F
2853                                 || 0x04D0 <= i && i <= 0x04FF
2854                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2855                                 || 0x06D6 <= i && i <= 0x06ED
2856                                 || 0x06FA <= i && i <= 0x06FE
2857                                 || 0x2048 <= i && i <= 0x204D
2858                                 || 0x20e4 <= i && i <= 0x20ea
2859                                 || 0x213C <= i && i <= 0x214B
2860                                 || 0x21EB <= i && i <= 0x21FF
2861                                 || 0x22F2 <= i && i <= 0x22FF
2862                                 || 0x237B <= i && i <= 0x239A
2863                                 || 0x239B <= i && i <= 0x23CF
2864                                 || 0x24EB <= i && i <= 0x24FF
2865                                 || 0x2596 <= i && i <= 0x259F
2866                                 || 0x25F8 <= i && i <= 0x25FF
2867                                 || 0x2672 <= i && i <= 0x2689
2868                                 || 0x2768 <= i && i <= 0x2775
2869                                 || 0x27d0 <= i && i <= 0x27ff
2870                                 || 0x2900 <= i && i <= 0x2aff
2871                                 || 0x3033 <= i && i <= 0x303F
2872                                 || 0x31F0 <= i && i <= 0x31FF
2873                                 || 0x3250 <= i && i <= 0x325F
2874                                 || 0x32B1 <= i && i <= 0x32BF
2875                                 || 0x3371 <= i && i <= 0x337B
2876                                 || 0xFA30 <= i && i <= 0xFA6A
2877                         )
2878                                 return true;
2879
2880                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2881                         switch (uc) {
2882                         case UnicodeCategory.PrivateUse:
2883                         case UnicodeCategory.Surrogate:
2884                                 return false;
2885                         // ignored by nature
2886                         case UnicodeCategory.Format:
2887                         case UnicodeCategory.OtherNotAssigned:
2888                                 return true;
2889                         default:
2890                                 return false;
2891                         }
2892                 }
2893
2894                 // To check IsIgnorable sanity, try the driver below under MS.NET.
2895
2896                 /*
2897                 public static void Main ()
2898                 {
2899                         for (int i = 0; i <= char.MaxValue; i++)
2900                                 Dump (i, IsIgnorable (i));
2901                 }
2902
2903                 static void Dump (int i, bool ignore)
2904                 {
2905                         switch (Char.GetUnicodeCategory ((char) i)) {
2906                         case UnicodeCategory.PrivateUse:
2907                         case UnicodeCategory.Surrogate:
2908                                 return; // check nothing
2909                         }
2910
2911                         string s1 = "";
2912                         string s2 = new string ((char) i, 10);
2913                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2914                         if ((ret == 0) == ignore)
2915                                 return;
2916                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2917                 }
2918                 */
2919                 #endregion // IsIgnorable
2920
2921                 #region IsIgnorableSymbol
2922                 static bool IsIgnorableSymbol (int i)
2923                 {
2924                         if (IsIgnorable (i))
2925                                 return true;
2926
2927                         switch (i) {
2928                         // *Letter
2929                         case 0x00b5: case 0x01C0: case 0x01C1:
2930                         case 0x01C2: case 0x01C3: case 0x01F6:
2931                         case 0x01F7: case 0x01F8: case 0x01F9:
2932                         case 0x02D0: case 0x02EE: case 0x037A:
2933                         case 0x03D7: case 0x03F3:
2934                         case 0x0400: case 0x040d:
2935                         case 0x0450: case 0x045d:
2936                         case 0x048C: case 0x048D:
2937                         case 0x048E: case 0x048F:
2938                         case 0x0587: case 0x0640: case 0x06E5:
2939                         case 0x06E6: case 0x06FA: case 0x06FB:
2940                         case 0x06FC: case 0x093D: case 0x0950:
2941                         case 0x1E9B: case 0x2139: case 0x3006:
2942                         case 0x3033: case 0x3034: case 0x3035:
2943                         case 0xFE7E: case 0xFE7F:
2944                         // OtherNumber
2945                         case 0x16EE: case 0x16EF: case 0x16F0:
2946                         // LetterNumber
2947                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2948                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2949                         case 0x3038: // HANGZHOU NUMERAL TEN
2950                         case 0x3039: // HANGZHOU NUMERAL TWENTY
2951                         case 0x303a: // HANGZHOU NUMERAL THIRTY
2952                         // OtherSymbol
2953                         case 0x2117:
2954                         case 0x327F:
2955                                 return true;
2956                         // ModifierSymbol
2957                         case 0x02B9: case 0x02BA: case 0x02C2:
2958                         case 0x02C3: case 0x02C4: case 0x02C5:
2959                         case 0x02C8: case 0x02CC: case 0x02CD:
2960                         case 0x02CE: case 0x02CF: case 0x02D2:
2961                         case 0x02D3: case 0x02D4: case 0x02D5:
2962                         case 0x02D6: case 0x02D7: case 0x02DE:
2963                         case 0x02E5: case 0x02E6: case 0x02E7:
2964                         case 0x02E8: case 0x02E9:
2965                         case 0x309B: case 0x309C:
2966                         // OtherPunctuation
2967                         case 0x055A: // American Apos
2968                         case 0x05C0: // Hebrew Punct
2969                         case 0x0E4F: // Thai FONGMAN
2970                         case 0x0E5A: // Thai ANGKHANKHU
2971                         case 0x0E5B: // Thai KHOMUT
2972                         // CurencySymbol
2973                         case 0x09F2: // Bengali Rupee Mark
2974                         case 0x09F3: // Bengali Rupee Sign
2975                         // MathSymbol
2976                         case 0x221e: // INF.
2977                         // OtherSymbol
2978                         case 0x0482:
2979                         case 0x09FA:
2980                         case 0x0B70:
2981                                 return false;
2982                         }
2983
2984                         // *Letter
2985                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2986 #if NET_2_0
2987                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2988                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2989 #endif
2990                         )
2991                                 return true;
2992
2993                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2994                         switch (uc) {
2995                         case UnicodeCategory.Surrogate:
2996                                 return false; // inconsistent
2997
2998                         case UnicodeCategory.SpacingCombiningMark:
2999                         case UnicodeCategory.EnclosingMark:
3000                         case UnicodeCategory.NonSpacingMark:
3001                         case UnicodeCategory.PrivateUse:
3002                                 // NonSpacingMark
3003                                 if (0x064B <= i && i <= 0x0652) // Arabic
3004                                         return true;
3005                                 return false;
3006
3007                         case UnicodeCategory.Format:
3008                         case UnicodeCategory.OtherNotAssigned:
3009                                 return true;
3010
3011                         default:
3012                                 bool use = false;
3013                                 // OtherSymbols
3014                                 if (
3015                                         // latin in a circle
3016                                         0x249A <= i && i <= 0x24E9
3017                                         || 0x2100 <= i && i <= 0x2132
3018                                         // Japanese
3019                                         || 0x3196 <= i && i <= 0x31A0
3020                                         // Korean
3021                                         || 0x3200 <= i && i <= 0x321C
3022                                         // Chinese/Japanese
3023                                         || 0x322A <= i && i <= 0x3243
3024                                         // CJK
3025                                         || 0x3260 <= i && i <= 0x32B0
3026                                         || 0x32D0 <= i && i <= 0x3357
3027                                         || 0x337B <= i && i <= 0x33DD
3028                                 )
3029                                         use = !Char.IsLetterOrDigit ((char) i);
3030                                 if (use)
3031                                         return false;
3032
3033                                 // This "Digit" rule is mystery.
3034                                 // It filters some symbols out.
3035                                 if (Char.IsLetterOrDigit ((char) i))
3036                                         return false;
3037                                 if (Char.IsNumber ((char) i))
3038                                         return false;
3039                                 if (Char.IsControl ((char) i)
3040                                         || Char.IsSeparator ((char) i)
3041                                         || Char.IsPunctuation ((char) i))
3042                                         return true;
3043                                 if (Char.IsSymbol ((char) i))
3044                                         return true;
3045
3046                                 // FIXME: should check more
3047                                 return false;
3048                         }
3049                 }
3050
3051                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3052 /*
3053                 public static void Main ()
3054                 {
3055                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3056                         for (int i = 0; i <= char.MaxValue; i++) {
3057                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3058                                 if (uc == UnicodeCategory.Surrogate)
3059                                         continue;
3060
3061                                 bool ret = IsIgnorableSymbol (i);
3062
3063                                 string s1 = "TEST ";
3064                                 string s2 = "TEST " + (char) i;
3065
3066                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3067
3068                                 if (ret != (result == 0))
3069                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3070                                                 ret ? "should not ignore" :
3071                                                         "should ignore",
3072                                                 i,(char) i, uc);
3073                         }
3074                 }
3075 */
3076                 #endregion
3077
3078                 #region NonSpacing
3079                 static bool IsIgnorableNonSpacing (int i)
3080                 {
3081                         if (IsIgnorable (i))
3082                                 return true;
3083
3084                         switch (i) {
3085                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3086                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3087                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3088                                 return true;
3089                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3090                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3091                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3092                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3093                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3094                         case 0x0CCD: case 0x0E4E:
3095                                 return false;
3096                         }
3097
3098                         if (0x02b9 <= i && i <= 0x02c5
3099                                 || 0x02cc <= i && i <= 0x02d7
3100                                 || 0x02e4 <= i && i <= 0x02ef
3101                                 || 0x20DD <= i && i <= 0x20E0
3102                         )
3103                                 return true;
3104
3105                         if (0x064B <= i && i <= 0x00652
3106                                 || 0x0941 <= i && i <= 0x0948
3107                                 || 0x0AC1 <= i && i <= 0x0ACD
3108                                 || 0x0C3E <= i && i <= 0x0C4F
3109                                 || 0x0E31 <= i && i <= 0x0E3F
3110                         )
3111                                 return false;
3112
3113                         return Char.GetUnicodeCategory ((char) i) ==
3114                                 UnicodeCategory.NonSpacingMark;
3115                 }
3116
3117                 // We can reuse IsIgnorableSymbol testcode 
3118                 // for IsIgnorableNonSpacing.
3119                 #endregion
3120         }
3121
3122         struct CharMapEntry
3123         {
3124                 public byte Category;
3125                 public byte Level1;
3126                 public byte Level2; // It is always single byte.
3127                 public bool Defined;
3128
3129                 public CharMapEntry (byte category, byte level1, byte level2)
3130                 {
3131                         Category = category;
3132                         Level1 = level1;
3133                         Level2 = level2;
3134                         Defined = true;
3135                 }
3136         }
3137
3138         class JISCharacter
3139         {
3140                 public readonly int CP;
3141                 public readonly int JIS;
3142
3143                 public JISCharacter (int cp, int cpJIS)
3144                 {
3145                         CP = cp;
3146                         JIS = cpJIS;
3147                 }
3148         }
3149
3150         class JISComparer : IComparer
3151         {
3152                 public static readonly JISComparer Instance =
3153                         new JISComparer ();
3154
3155                 public int Compare (object o1, object o2)
3156                 {
3157                         JISCharacter j1 = (JISCharacter) o1;
3158                         JISCharacter j2 = (JISCharacter) o2;
3159                         return j2.JIS - j1.JIS;
3160                 }
3161         }
3162
3163         class NonJISCharacter
3164         {
3165                 public readonly int CP;
3166                 public readonly string Name;
3167
3168                 public NonJISCharacter (int cp, string name)
3169                 {
3170                         CP = cp;
3171                         Name = name;
3172                 }
3173         }
3174
3175         class NonJISComparer : IComparer
3176         {
3177                 public static readonly NonJISComparer Instance =
3178                         new NonJISComparer ();
3179
3180                 public int Compare (object o1, object o2)
3181                 {
3182                         NonJISCharacter j1 = (NonJISCharacter) o1;
3183                         NonJISCharacter j2 = (NonJISCharacter) o2;
3184                         return string.CompareOrdinal (j1.Name, j2.Name);
3185                 }
3186         }
3187
3188         class DecimalDictionaryValueComparer : IComparer
3189         {
3190                 public static readonly DecimalDictionaryValueComparer Instance
3191                         = new DecimalDictionaryValueComparer ();
3192
3193                 private DecimalDictionaryValueComparer ()
3194                 {
3195                 }
3196
3197                 public int Compare (object o1, object o2)
3198                 {
3199                         DictionaryEntry e1 = (DictionaryEntry) o1;
3200                         DictionaryEntry e2 = (DictionaryEntry) o2;
3201                         // FIXME: in case of 0, compare decomposition categories
3202                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3203                         if (ret != 0)
3204                                 return ret;
3205                         int i1 = (int) e1.Key;
3206                         int i2 = (int) e2.Key;
3207                         return i1 - i2;
3208                 }
3209         }
3210
3211         class StringDictionaryValueComparer : IComparer
3212         {
3213                 public static readonly StringDictionaryValueComparer Instance
3214                         = new StringDictionaryValueComparer ();
3215
3216                 private StringDictionaryValueComparer ()
3217                 {
3218                 }
3219
3220                 public int Compare (object o1, object o2)
3221                 {
3222                         DictionaryEntry e1 = (DictionaryEntry) o1;
3223                         DictionaryEntry e2 = (DictionaryEntry) o2;
3224                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3225                         if (ret != 0)
3226                                 return ret;
3227                         int i1 = (int) e1.Key;
3228                         int i2 = (int) e2.Key;
3229                         return i1 - i2;
3230                 }
3231         }
3232
3233         class UCAComparer : IComparer
3234         {
3235                 public static readonly UCAComparer Instance
3236                         = new UCAComparer ();
3237
3238                 private UCAComparer ()
3239                 {
3240                 }
3241
3242                 public int Compare (object o1, object o2)
3243                 {
3244                         char i1 = (char) o1;
3245                         char i2 = (char) o2;
3246
3247                         int l1 = CollationElementTable.GetSortKeyCount (i1);
3248                         int l2 = CollationElementTable.GetSortKeyCount (i2);
3249                         int l = l1 > l2 ? l2 : l1;
3250
3251                         for (int i = 0; i < l; i++) {
3252                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3253                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3254                                 int v = k1.Primary - k2.Primary;
3255                                 if (v != 0)
3256                                         return v;
3257                                 v = k1.Secondary - k2.Secondary;
3258                                 if (v != 0)
3259                                         return v;
3260                                 v = k1.Thirtiary - k2.Thirtiary;
3261                                 if (v != 0)
3262                                         return v;
3263                                 v = k1.Quarternary - k2.Quarternary;
3264                                 if (v != 0)
3265                                         return v;
3266                         }
3267                         return l1 - l2;
3268                 }
3269         }
3270
3271         class Tailoring
3272         {
3273                 int lcid;
3274                 int alias;
3275                 bool frenchSort;
3276                 ArrayList items = new ArrayList ();
3277
3278                 public Tailoring (int lcid)
3279                         : this (lcid, 0)
3280                 {
3281                 }
3282
3283                 public Tailoring (int lcid, int alias)
3284                 {
3285                         this.lcid = lcid;
3286                         this.alias = alias;
3287                 }
3288
3289                 public int LCID {
3290                         get { return lcid; }
3291                 }
3292
3293                 public int Alias {
3294                         get { return alias; }
3295                 }
3296
3297                 public bool FrenchSort {
3298                         get { return frenchSort; }
3299                         set { frenchSort = value; }
3300                 }
3301
3302                 public void AddDiacriticalMap (byte target, byte replace)
3303                 {
3304                         items.Add (new DiacriticalMap (target, replace));
3305                 }
3306
3307                 public void AddSortKeyMap (string source, byte [] sortkey)
3308                 {
3309                         items.Add (new SortKeyMap (source, sortkey));
3310                 }
3311
3312                 public void AddReplacementMap (string source, string replace)
3313                 {
3314                         items.Add (new ReplacementMap (source, replace));
3315                 }
3316
3317                 public char [] ItemToCharArray ()
3318                 {
3319                         ArrayList al = new ArrayList ();
3320                         foreach (ITailoringMap m in items)
3321                                 al.AddRange (m.ToCharArray ());
3322                         return al.ToArray (typeof (char)) as char [];
3323                 }
3324
3325                 interface ITailoringMap
3326                 {
3327                         char [] ToCharArray ();
3328                 }
3329
3330                 class DiacriticalMap : ITailoringMap
3331                 {
3332                         public readonly byte Target;
3333                         public readonly byte Replace;
3334
3335                         public DiacriticalMap (byte target, byte replace)
3336                         {
3337                                 Target = target;
3338                                 Replace = replace;
3339                         }
3340
3341                         public char [] ToCharArray ()
3342                         {
3343                                 char [] ret = new char [3];
3344                                 ret [0] = (char) 02; // kind:DiacriticalMap
3345                                 ret [1] = (char) Target;
3346                                 ret [2] = (char) Replace;
3347                                 return ret;
3348                         }
3349                 }
3350
3351                 class SortKeyMap : ITailoringMap
3352                 {
3353                         public readonly string Source;
3354                         public readonly byte [] SortKey;
3355
3356                         public SortKeyMap (string source, byte [] sortkey)
3357                         {
3358                                 Source = source;
3359                                 SortKey = sortkey;
3360                         }
3361
3362                         public char [] ToCharArray ()
3363                         {
3364                                 char [] ret = new char [Source.Length + 7];
3365                                 ret [0] = (char) 01; // kind:SortKeyMap
3366                                 for (int i = 0; i < Source.Length; i++)
3367                                         ret [i + 1] = Source [i];
3368                                 // null terminate
3369                                 for (int i = 0; i < 5; i++)
3370                                         ret [i + Source.Length + 2] = (char) SortKey [i];
3371                                 return ret;
3372                         }
3373                 }
3374
3375                 class ReplacementMap : ITailoringMap
3376                 {
3377                         public readonly string Source;
3378                         public readonly string Replace;
3379
3380                         public ReplacementMap (string source, string replace)
3381                         {
3382                                 Source = source;
3383                                 Replace = replace;
3384                         }
3385
3386                         public char [] ToCharArray ()
3387                         {
3388                                 char [] ret = new char [Source.Length + Replace.Length + 3];
3389                                 ret [0] = (char) 03; // kind:ReplaceMap
3390                                 int pos = 1;
3391                                 for (int i = 0; i < Source.Length; i++)
3392                                         ret [pos++] = Source [i];
3393                                 // null terminate
3394                                 pos++;
3395                                 for (int i = 0; i < Replace.Length; i++)
3396                                         ret [pos++] = Replace [i];
3397                                 // null terminate
3398                                 return ret;
3399                         }
3400                 }
3401         }
3402 }