171074096c17f3f309c1e95ca097f5bd639907bc
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
1 //
2 //
// There are two kinds of sort keys: those which are computed and those which
// are laid out as an indexed array. Computed sort keys are:
5 //
6 //      - Surrogate
7 //      - PrivateUse
8 //
// Also, a different index table should be prepared for composite characters.
10 //
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
13 //
14
15 //
16 // * sortkey getter signature
17 //
18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
19 //      Stores sort key for corresponding character element into buf and
20 //      returns the length of the consumed _source_ character element in s.
21 //
22 // * character length to consume
23 //
24 //      If there are characters whose primary weight is 0, they are consumed
25 //      and considered as a part of the character element.
26 //
27
28 using System;
29 using System.IO;
30 using System.Collections;
31 using System.Globalization;
32 using System.Text;
33 using System.Xml;
34
35 namespace Mono.Globalization.Unicode
36 {
37         internal class MSCompatSortKeyTableGenerator
38         {
39                 public static void Main (string [] args)
40                 {
41                         new MSCompatSortKeyTableGenerator ().Run (args);
42                 }
43
                // Decomposition type identifiers. One of these values is kept
                // per code point in decompType [] (a byte array); Serialize ()
                // tests for Narrow, Wide, Super and Sub to fill widthCompat.
                // Note that Full (0xE) and Narrow (0xD) are declared out of
                // numeric order.
                const int DecompositionWide = 1; // fixed
                const int DecompositionSub = 2; // fixed
                const int DecompositionSmall = 3;
                const int DecompositionIsolated = 4;
                const int DecompositionInitial = 5;
                const int DecompositionFinal = 6;
                const int DecompositionMedial = 7;
                const int DecompositionNoBreak = 8;
                const int DecompositionVertical = 9;
                const int DecompositionFraction = 0xA;
                const int DecompositionFont = 0xB;
                const int DecompositionSuper = 0xC; // fixed
                const int DecompositionFull = 0xE;
                const int DecompositionNarrow = 0xD;
                const int DecompositionCircle = 0xF;
                const int DecompositionSquare = 0x10;
                const int DecompositionCompat = 0x11;
                const int DecompositionCanonical = 0x12;
62
                // Destination of the generated C# table source. Every
                // Serialize* method writes to it; it defaults to standard output.
                TextWriter Result = Console.Out;

                byte [] fillIndex = new byte [256]; // by category
                // One entry per UTF-16 code unit. Serialize () reads Category,
                // Level1 and Level2 from each entry.
                CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];

                char [] specialIgnore = new char [] {
                        '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
                        '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
                        };

                // FIXME: need more love (as always)
                // alphabets and alphaWeights both hold 29 entries; presumably
                // alphaWeights [i] is the weight assigned to alphabets [i] --
                // confirm at the use site.
                char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
                        'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
                        'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                        '\u0292', '\u01BE', '\u0298'};
                byte [] alphaWeights = new byte [] {
                        2, 9, 0xA, 0x1A, 0x21,
                        0x23, 0x25, 0x2C, 0x32, 0x35,
                        0x36, 0x48, 0x51, 0x70, 0x7C,
                        0x7E, 0x89, 0x8A, 0x91, 0x99,
                        0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
                        0xA9, 0xAA, 0xB3, 0xB4};

                bool [] isSmallCapital = new bool [char.MaxValue + 1];
                bool [] isUppercase = new bool [char.MaxValue + 1];

                // Decomposition data per code unit. decompType holds one of the
                // Decomposition* constants, and Serialize () reads
                // decompValues [decompIndex [cp]] as the (single) mapped value
                // for width/super/sub-script decompositions.
                byte [] decompType = new byte [char.MaxValue + 1];
                int [] decompIndex = new int [char.MaxValue + 1];
                int [] decompLength = new int [char.MaxValue + 1];
                int [] decompValues;
                decimal [] decimalValue = new decimal [char.MaxValue + 1];

                byte [] diacritical = new byte [char.MaxValue + 1];
96
                // Character-name fragments and, in diacriticWeights below, a
                // weight table laid out in the same grouping.
                // NOTE(review): presumably diacriticWeights [i] belongs to
                // diacritics [i]; the LATIN sections line up (67 entries each),
                // but the final section has 7 names against 8 weights (74 vs 75
                // entries overall) -- confirm whether a name is missing or a
                // weight is surplus.
                string [] diacritics = new string [] {
                        // LATIN
                        "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
                        "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
                        " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
                        " OGONEK;", " CEDILLA;",
                        " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
                        " STROKE;", " CIRCUMFLEX AND ACUTE;",
                        " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
                        " DIAERESIS AND GRAVE;",
                        " BREVE AND ACUTE;",
                        " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
                        " MACRON AND ACUTE;",
                        " MACRON AND GRAVE;",
                        " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
                        " RING ABOVE AND ACUTE",
                        " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
                        " CIRCUMFLEX AND TILDE",
                        " TILDE AND DIAERESIS",
                        " STROKE AND ACUTE",
                        " BREVE AND TILDE",
                        " CEDILLA AND BREVE",
                        " OGONEK AND MACRON",
                        " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
                        " DOUBLE GRAVE;",
                        " INVERTED BREVE",
                        " PRECEDED BY APOSTROPHE",
                        " HORN;",
                        " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
                        " PALATAL HOOK",
                        " DOT BELOW;",
                        " RETROFLEX;", "DIAERESIS BELOW",
                        " RING BELOW",
                        " CIRCUMFLEX BELOW", "HORN AND ACUTE",
                        " BREVE BELOW;", " HORN AND GRAVE",
                        " TILDE BELOW",
                        " DOT BELOW AND DOT ABOVE",
                        " RIGHT HALF RING", " HORN AND TILDE",
                        " CIRCUMFLEX AND DOT BELOW",
                        " BREVE AND DOT BELOW",
                        " DOT BELOW AND MACRON",
                        " HORN AND HOOK ABOVE",
                        " HORN AND DOT",
                        // CIRCLED, PARENTHESIZED and so on
                        "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
                        "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
                        };
                byte [] diacriticWeights = new byte [] {
                        // LATIN.
                        0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
                        0x17, 0x19, 0x1A, 0x1B, 0x1C,
                        0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
                        0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
                        0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
                        0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
                        0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
                        0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
                        0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 
                        0x69, 0x69, 0x6A, 0x6D, 0x6E,
                        0x95, 0xAA,
                        // CIRCLED, PARENTHESIZED and so on.
                        0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
                        };
160
                // 28 code point boundaries; presumably consecutive
                // (start, end) pairs of digit ranges -- confirm at the use site.
                int [] numberSecondaryWeightBounds = new int [] {
                        0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
                        0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
                        0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
                        0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
                        0xE50, 0xE60, 0xED0, 0xEE0
                        };

                // Per-script orderings; these are not initialized here.
                char [] orderedCyrillic;
                char [] orderedGurmukhi;
                char [] orderedGujarati;
                char [] orderedGeorgian;
                char [] orderedThaana;

                static readonly char [] orderedTamilConsonants = new char [] {
                        // based on traditional Tamil consonants, except for
                        // Grantha (where Microsoft breaks traditionalism).
                        // http://www.angelfire.com/empire/thamizh/padanGaL
                        '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
                        '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
                        '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
                        '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
                        '\u0BB9'};

                // cp -> character name (only for some characters)
                ArrayList sortableCharNames = new ArrayList ();

                // cp -> arrow value (int)
                ArrayList arrowValues = new ArrayList ();

                // cp -> box value (int)
                ArrayList boxValues = new ArrayList ();

                // cp -> level1 value
                Hashtable arabicLetterPrimaryValues = new Hashtable ();

                // letterName -> cp
                Hashtable arabicNameMap = new Hashtable ();

                // cp -> Hashtable [decompType] -> cp
                Hashtable nfkdMap = new Hashtable ();

                // Latin letter -> ArrayList [int]
                Hashtable latinMap = new Hashtable ();

                ArrayList jisJapanese = new ArrayList ();
                ArrayList nonJisJapanese = new ArrayList ();

                // CJK tables, each sized to cover every UTF-16 code unit (the
                // trailing commented-out offsets are leftovers of a smaller
                // sizing). Serialize () compresses them and writes them out.
                ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
                ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
                ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
                ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
                byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];

                // Compressed and written out by Serialize ().
                byte [] ignorableFlags = new byte [char.MaxValue + 1];

                // Unlike the tables above, this one is static.
                static double [] unicodeAge = new double [char.MaxValue + 1];

                // Tailoring objects, appended by ProcessTailoringLine () for each
                // '@' line and written out by SerializeTailorings ().
                ArrayList tailorings = new ArrayList ();
220
221                 void Run (string [] args)
222                 {
223                         string dirname = args.Length == 0 ? "downloaded" : args [0];
224                         ParseSources (dirname);
225                         Console.Error.WriteLine ("parse done.");
226
227                         ModifyParsedValues ();
228                         GenerateCore ();
229                         Console.Error.WriteLine ("generation done.");
230                         Serialize ();
231                         Console.Error.WriteLine ("serialization done.");
232 /*
233 StreamWriter sw = new StreamWriter ("agelog.txt");
234 for (int i = 0; i < char.MaxValue; i++) {
235 bool shouldBe = false;
236 switch (Char.GetUnicodeCategory ((char) i)) {
237 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
238         shouldBe = true; break;
239 }
240 if (unicodeAge [i] >= 3.1)
241         shouldBe = true;
242 //if (IsIgnorable (i) != shouldBe)
243 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
244 }
245 sw.Close ();
246 */
247                 }
248
249                 byte [] CompressArray (byte [] source, CodePointIndexer i)
250                 {
251                         return (byte []) CodePointIndexer.CompressArray  (
252                                 source, typeof (byte), i);
253                 }
254
255                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
256                 {
257                         return (ushort []) CodePointIndexer.CompressArray  (
258                                 source, typeof (ushort), i);
259                 }
260
261                 void Serialize ()
262                 {
263                         // Tailorings
264                         SerializeTailorings ();
265
266                         byte [] categories = new byte [map.Length];
267                         byte [] level1 = new byte [map.Length];
268                         byte [] level2 = new byte [map.Length];
269                         byte [] level3 = new byte [map.Length];
270                         int [] widthCompat = new int [map.Length];
271                         for (int i = 0; i < map.Length; i++) {
272                                 categories [i] = map [i].Category;
273                                 level1 [i] = map [i].Level1;
274                                 level2 [i] = map [i].Level2;
275                                 level3 [i] = ComputeLevel3Weight ((char) i);
276                                 switch (decompType [i]) {
277                                 case DecompositionNarrow:
278                                 case DecompositionWide:
279                                 case DecompositionSuper:
280                                 case DecompositionSub:
281                                         // they are always 1 char
282                                         widthCompat [i] = decompValues [decompIndex [i]];
283                                         break;
284                                 }
285                         }
286
287                         // compress
288                         ignorableFlags = CompressArray (ignorableFlags,
289                                 MSCompatUnicodeTableUtil.Ignorable);
290                         categories = CompressArray (categories,
291                                 MSCompatUnicodeTableUtil.Category);
292                         level1 = CompressArray (level1, 
293                                 MSCompatUnicodeTableUtil.Level1);
294                         level2 = CompressArray (level2, 
295                                 MSCompatUnicodeTableUtil.Level2);
296                         level3 = CompressArray (level3, 
297                                 MSCompatUnicodeTableUtil.Level3);
298                         widthCompat = (int []) CodePointIndexer.CompressArray (
299                                 widthCompat, typeof (int),
300                                 MSCompatUnicodeTableUtil.WidthCompat);
301                         cjkCHS = CompressArray (cjkCHS,
302                                 MSCompatUnicodeTableUtil.CjkCHS);
303                         cjkCHT = CompressArray (cjkCHT,
304                                 MSCompatUnicodeTableUtil.Cjk);
305                         cjkJA = CompressArray (cjkJA,
306                                 MSCompatUnicodeTableUtil.Cjk);
307                         cjkKO = CompressArray (cjkKO,
308                                 MSCompatUnicodeTableUtil.Cjk);
309                         cjkKOlv2 = CompressArray (cjkKOlv2,
310                                 MSCompatUnicodeTableUtil.Cjk);
311
312                         // Ignorables
313                         Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
314                         for (int i = 0; i < ignorableFlags.Length; i++) {
315                                 byte value = ignorableFlags [i];
316                                 if (value < 10)
317                                         Result.Write ("{0},", value);
318                                 else
319                                         Result.Write ("0x{0:X02},", value);
320                                 if ((i & 0xF) == 0xF)
321                                         Result.WriteLine ("// {0:X04}", i - 0xF);
322                         }
323                         Result.WriteLine ("};");
324                         Result.WriteLine ();
325
326                         // Primary category
327                         Result.WriteLine ("static byte [] categories = new byte [] {");
328                         for (int i = 0; i < categories.Length; i++) {
329                                 byte value = categories [i];
330                                 if (value < 10)
331                                         Result.Write ("{0},", value);
332                                 else
333                                         Result.Write ("0x{0:X02},", value);
334                                 if ((i & 0xF) == 0xF)
335                                         Result.WriteLine ("// {0:X04}", i - 0xF);
336                         }
337                         Result.WriteLine ("};");
338                         Result.WriteLine ();
339
340                         // Primary weight value
341                         Result.WriteLine ("static byte [] level1 = new byte [] {");
342                         for (int i = 0; i < level1.Length; i++) {
343                                 byte value = level1 [i];
344                                 if (value < 10)
345                                         Result.Write ("{0},", value);
346                                 else
347                                         Result.Write ("0x{0:X02},", value);
348                                 if ((i & 0xF) == 0xF)
349                                         Result.WriteLine ("// {0:X04}", i - 0xF);
350                         }
351                         Result.WriteLine ("};");
352                         Result.WriteLine ();
353
354                         // Secondary weight
355                         Result.WriteLine ("static byte [] level2 = new byte [] {");
356                         for (int i = 0; i < level2.Length; i++) {
357                                 int value = level2 [i];
358                                 if (value < 10)
359                                         Result.Write ("{0},", value);
360                                 else
361                                         Result.Write ("0x{0:X02},", value);
362                                 if ((i & 0xF) == 0xF)
363                                         Result.WriteLine ("// {0:X04}", i - 0xF);
364                         }
365                         Result.WriteLine ("};");
366                         Result.WriteLine ();
367
368                         // Thirtiary weight
369                         Result.WriteLine ("static byte [] level3 = new byte [] {");
370                         for (int i = 0; i < level3.Length; i++) {
371                                 byte value = level3 [i];
372                                 if (value < 10)
373                                         Result.Write ("{0},", value);
374                                 else
375                                         Result.Write ("0x{0:X02},", value);
376                                 if ((i & 0xF) == 0xF)
377                                         Result.WriteLine ("// {0:X04}", i - 0xF);
378                         }
379                         Result.WriteLine ("};");
380                         Result.WriteLine ();
381
382                         // Width insensitivity mappings
383                         // (for now it is more lightweight than dumping the
384                         // entire NFKD table).
385                         Result.WriteLine ("static int [] widthCompat = new int [] {");
386                         for (int i = 0; i < widthCompat.Length; i++) {
387                                 int value = widthCompat [i];
388                                 if (value < 10)
389                                         Result.Write ("{0},", value);
390                                 else
391                                         Result.Write ("0x{0:X02},", value);
392                                 if ((i & 0xF) == 0xF)
393                                         Result.WriteLine ("// {0:X04}", i - 0xF);
394                         }
395                         Result.WriteLine ("};");
396                         Result.WriteLine ();
397
398                         // CJK
399                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
400                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
401                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
402                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
403                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
404                 }
405
406                 void SerializeCJK (string name, ushort [] cjk, int max)
407                 {
408                         int offset = 0;//char.MaxValue - cjk.Length;
409                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
410                         for (int i = 0; i < cjk.Length; i++) {
411                                 if (i + offset == max)
412                                         break;
413                                 ushort value = cjk [i];
414                                 if (value < 10)
415                                         Result.Write ("{0},", value);
416                                 else
417                                         Result.Write ("0x{0:X04},", value);
418                                 if ((i & 0xF) == 0xF)
419                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
420                         }
421                         Result.WriteLine ("};");
422                         Result.WriteLine ();
423                 }
424
425                 void SerializeCJK (string name, byte [] cjk, int max)
426                 {
427                         int offset = 0;//char.MaxValue - cjk.Length;
428                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
429                         for (int i = 0; i < cjk.Length; i++) {
430                                 if (i + offset == max)
431                                         break;
432                                 byte value = cjk [i];
433                                 if (value < 10)
434                                         Result.Write ("{0},", value);
435                                 else
436                                         Result.Write ("0x{0:X02},", value);
437                                 if ((i & 0xF) == 0xF)
438                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
439                         }
440                         Result.WriteLine ("};");
441                         Result.WriteLine ();
442                 }
443
444                 void SerializeTailorings ()
445                 {
446                         Hashtable indexes = new Hashtable ();
447                         Hashtable counts = new Hashtable ();
448                         Result.WriteLine ("static char [] tailorings = new char [] {");
449                         int count = 0;
450                         foreach (Tailoring t in tailorings) {
451                                 if (t.Alias != 0)
452                                         continue;
453                                 Result.Write ("/*{0}*/", t.LCID);
454                                 indexes.Add (t.LCID, count);
455                                 char [] values = t.ItemToCharArray ();
456                                 counts.Add (t.LCID, values.Length);
457                                 foreach (char c in values) {
458                                         Result.Write ("'\\x{0:X}', ", (int) c);
459                                         if (++count % 16 == 0)
460                                                 Result.WriteLine (" // {0:X04}", count - 16);
461                                 }
462                         }
463                         Result.WriteLine ("};");
464
465                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
466                         foreach (Tailoring t in tailorings) {
467                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
468                                 if (!indexes.ContainsKey (target)) {
469                                         Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
470                                         continue;
471                                 }
472                                 int idx = (int) indexes [target];
473                                 int cnt = (int) counts [target];
474                                 bool french = t.FrenchSort;
475                                 if (t.Alias != 0)
476                                         foreach (Tailoring t2 in tailorings)
477                                                 if (t2.LCID == t.LCID)
478                                                         french = t2.FrenchSort;
479                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
480                         }
481                         Result.WriteLine ("};");
482                 }
483
484                 #region Parse
485
486                 void ParseSources (string dirname)
487                 {
488                         string unidata =
489                                 dirname + "/UnicodeData.txt";
490                         string derivedCoreProps = 
491                                 dirname + "/DerivedCoreProperties.txt";
492                         string scripts = 
493                                 dirname + "/Scripts.txt";
494                         string cp932 = 
495                                 dirname + "/CP932.TXT";
496                         string derivedAge = 
497                                 dirname + "/DerivedAge.txt";
498                         string chXML = dirname + "/common/collation/zh.xml";
499                         string jaXML = dirname + "/common/collation/ja.xml";
500                         string koXML = dirname + "/common/collation/ko.xml";
501
502                         ParseDerivedAge (derivedAge);
503
504                         FillIgnorables ();
505
506                         ParseJISOrder (cp932); // in prior to ParseUnidata()
507                         ParseUnidata (unidata);
508                         ParseDerivedCoreProperties (derivedCoreProps);
509                         ParseScripts (scripts);
510                         ParseCJK (chXML, jaXML, koXML);
511
512                         ParseTailorings ("mono-tailoring-source.txt");
513                 }
514
515                 void ParseTailorings (string filename)
516                 {
517                         Tailoring t = null;
518                         int line = 0;
519                         using (StreamReader sr = new StreamReader (filename)) {
520                                 try {
521                                         while (sr.Peek () >= 0) {
522                                                 line++;
523                                                 ProcessTailoringLine (ref t,
524                                                         sr.ReadLine ().Trim ());
525                                         }
526                                 } catch (Exception) {
527                                         Console.Error.WriteLine ("ERROR at line {0}", line);
528                                         throw;
529                                 }
530                         }
531                 }
532
533                 // For now this is enough.
534                 string ParseTailoringSourceValue (string s)
535                 {
536                         StringBuilder sb = new StringBuilder ();
537                         for (int i = 0; i < s.Length; i++) {
538                                 if (s.StartsWith ("\\u")) {
539                                         sb.Append ((char) int.Parse (
540                                                 s.Substring (2, 4), NumberStyles.HexNumber),
541                                                 1);
542                                         i += 5;
543                                 }
544                         else
545                                 sb.Append (s [i]);
546                         }
547                         return sb.ToString ();
548                 }
549
550                 void ProcessTailoringLine (ref Tailoring t, string s)
551                 {
552                         int idx = s.IndexOf ('#');
553                         if (idx > 0)
554                                 s = s.Substring (0, idx).Trim ();
555                         if (s.Length == 0 || s [0] == '#')
556                                 return;
557                         if (s [0] == '@') {
558                                 idx = s.IndexOf ('=');
559                                 if (idx > 0)
560                                         t = new Tailoring (
561                                                 int.Parse (s.Substring (1, idx - 1)),
562                                                 int.Parse (s.Substring (idx + 1)));
563                                 else
564                                         t = new Tailoring (int.Parse (s.Substring (1)));
565                                 tailorings.Add (t);
566                                 return;
567                         }
568                         if (s.StartsWith ("*FrenchSort")) {
569                                 t.FrenchSort = true;
570                                 return;
571                         }
572                         string d = "*Diacritical";
573                         if (s.StartsWith (d)) {
574                                 idx = s.IndexOf ("->");
575                                 t.AddDiacriticalMap (
576                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
577                                                 NumberStyles.HexNumber),
578                                         byte.Parse (s.Substring (idx + 2).Trim (),
579                                                 NumberStyles.HexNumber));
580                                 return;
581                         }
582                         idx = s.IndexOf (':');
583                         if (idx > 0) {
584                                 string source = s.Substring (0, idx).Trim ();
585                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
586                                 byte [] b = new byte [5];
587                                 for (int i = 0; i < 5; i++) {
588                                         if (l [i] == "*")
589                                                 b [i] = 0;
590                                         else
591                                                 b [i] = byte.Parse (l [i],
592                                                         NumberStyles.HexNumber);
593                                 }
594                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
595                                         b);
596                         }
597                         idx = s.IndexOf ('=');
598                         if (idx > 0)
599                                 t.AddReplacementMap (
600                                         ParseTailoringSourceValue (
601                                                 s.Substring (0, idx).Trim ()),
602                                         ParseTailoringSourceValue (
603                                                 s.Substring (idx + 1).Trim ()));
604                 }
605
606                 void ParseDerivedAge (string filename)
607                 {
608                         using (StreamReader file =
609                                 new StreamReader (filename)) {
610                                 while (file.Peek () >= 0) {
611                                         string s = file.ReadLine ();
612                                         int idx = s.IndexOf ('#');
613                                         if (idx >= 0)
614                                                 s = s.Substring (0, idx);
615                                         idx = s.IndexOf (';');
616                                         if (idx < 0)
617                                                 continue;
618
619                                         string cpspec = s.Substring (0, idx);
620                                         idx = cpspec.IndexOf ("..");
621                                         NumberStyles nf = NumberStyles.HexNumber |
622                                                 NumberStyles.AllowTrailingWhite;
623                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
624                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
625                                         string value = s.Substring (cpspec.Length + 1).Trim ();
626
627                                         // FIXME: use index
628                                         if (cp > char.MaxValue)
629                                                 continue;
630
631                                         for (int i = cp; i <= cpEnd; i++)
632                                                 unicodeAge [i] = double.Parse (value);
633                                 }
634                         }
635                         unicodeAge [0] = double.MaxValue; // never be supported
636                 }
637
638                 void ParseUnidata (string filename)
639                 {
640                         ArrayList decompValues = new ArrayList ();
641                         using (StreamReader unidata =
642                                 new StreamReader (filename)) {
643                                 for (int line = 1; unidata.Peek () >= 0; line++) {
644                                         try {
645                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
646                                         } catch (Exception) {
647                                                 Console.Error.WriteLine ("**** At line " + line);
648                                                 throw;
649                                         }
650                                 }
651                         }
652                         this.decompValues = (int [])
653                                 decompValues.ToArray (typeof (int));
654                 }
655                 
656                 void ProcessUnidataLine (string s, ArrayList decompValues)
657                 {
658                         int idx = s.IndexOf ('#');
659                         if (idx >= 0)
660                                 s = s.Substring (0, idx);
661                         idx = s.IndexOf (';');
662                         if (idx < 0)
663                                 return;
664                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
665                         string [] values = s.Substring (idx + 1).Split (';');
666
667                         // FIXME: use index
668                         if (cp > char.MaxValue)
669                                 return;
670                         if (IsIgnorable (cp))
671                                 return;
672
673                         string name = values [0];
674
675                         // isSmallCapital
676                         if (s.IndexOf ("SMALL CAPITAL") > 0)
677                                 isSmallCapital [cp] = true;
678
679                         // latin mapping by character name
680                         if (s.IndexOf ("LATIN") > 0) {
681                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
682                                 int offset = lidx + 15;
683                                 if (lidx < 0) {
684                                         lidx = s.IndexOf ("LETTER TURNED ");
685                                         offset = lidx + 14;
686                                 }
687                                 if (lidx < 0) {
688                                         lidx = s.IndexOf ("LETTER ");
689                                         offset = lidx + 7;
690                                 }
691                                 char c = lidx > 0 ? s [offset] : char.MinValue;
692                                 if ('A' <= c && c <= 'Z' &&
693                                         (s.Length == offset + 1 || s [offset + 1] == ' ')) {
694                                         ArrayList entry = (ArrayList) latinMap [c];
695                                         if (entry == null) {
696                                                 entry = new ArrayList ();
697                                                 latinMap [c] = entry;
698                                         }
699                                         entry.Add (cp);
700                                 }
701                         }
702
703                         // Arrow names
704                         if (0x2000 <= cp && cp < 0x3000) {
705                                 int value = 0;
706                                 // SPECIAL CASES. FIXME: why?
707                                 switch (cp) {
708                                 case 0x21C5: value = -1; break; // E2
709                                 case 0x261D: value = 1; break;
710                                 case 0x27A6: value = 3; break;
711                                 case 0x21B0: value = 7; break;
712                                 case 0x21B1: value = 3; break;
713                                 case 0x21B2: value = 7; break;
714                                 case 0x21B4: value = 5; break;
715                                 case 0x21B5: value = 7; break;
716                                 case 0x21B9: value = -1; break; // E1
717                                 case 0x21CF: value = 7; break;
718                                 case 0x21D0: value = 3; break;
719                                 }
720                                 string [] arrowTargets = new string [] {
721                                         "",
722                                         "UPWARDS",
723                                         "NORTH EAST",
724                                         "RIGHTWARDS",
725                                         "SOUTH EAST",
726                                         "DOWNWARDS",
727                                         "SOUTH WEST",
728                                         "LEFTWARDS",
729                                         "NORTH WEST",
730                                         };
731                                 if (value == 0)
732                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
733                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
734                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
735                                                         s.IndexOf (" OVER") < 0
736                                                 )
737                                                         value = i;
738                                 if (value > 0)
739                                         arrowValues.Add (new DictionaryEntry (
740                                                 cp, value));
741                         }
742
743                         // Box names
744                         if (0x2500 <= cp && cp < 0x25B0) {
745                                 int value = 0;
746                                 // flags:
747                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
748                                 // [h,rl] [r] [l]
749                                 // [v,ud] [u] [d]
750                                 // [dr] [dl] [ur] [ul]
751                                 // [vr,udr] [vl,vdl]
752                                 // [hd,rld] [hu,rlu]
753                                 // [hv,udrl,rlv,udh]
754                                 ArrayList flags = new ArrayList (new int [] {
755                                         32, 8 + 4, 8, 4,
756                                         16, 1 + 2, 1, 2,
757                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
758                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
759                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
760                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
761                                         });
762                                 byte [] offsets = new byte [] {
763                                         0, 0, 1, 2,
764                                         3, 3, 4, 5,
765                                         6, 7, 8, 9,
766                                         10, 10, 11, 11,
767                                         12, 12, 13, 13,
768                                         14, 14, 14, 14};
769                                 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
770                                         int flag = 0;
771                                         if (s.IndexOf (" UP") > 0)
772                                                 flag |= 1;
773                                         if (s.IndexOf (" DOWN") > 0)
774                                                 flag |= 2;
775                                         if (s.IndexOf (" RIGHT") > 0)
776                                                 flag |= 4;
777                                         if (s.IndexOf (" LEFT") > 0)
778                                                 flag |= 8;
779                                         if (s.IndexOf (" VERTICAL") > 0)
780                                                 flag |= 16;
781                                         if (s.IndexOf (" HORIZONTAL") > 0)
782                                                 flag |= 32;
783
784                                         int fidx = flags.IndexOf (flag);
785                                         value = fidx < 0 ? fidx : offsets [fidx];
786                                 } else if (s.IndexOf ("BLOCK") > 0) {
787                                         if (s.IndexOf ("ONE EIGHTH") > 0)
788                                                 value = 0x12;
789                                         else if (s.IndexOf ("ONE QUARTER") > 0)
790                                                 value = 0x13;
791                                         else if (s.IndexOf ("THREE EIGHTHS") > 0)
792                                                 value = 0x14;
793                                         else if (s.IndexOf ("HALF") > 0)
794                                                 value = 0x15;
795                                         else if (s.IndexOf ("FIVE EIGHTHS") > 0)
796                                                 value = 0x16;
797                                         else if (s.IndexOf ("THREE QUARTERS") > 0)
798                                                 value = 0x17;
799                                         else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
800                                                 value = 0x18;
801                                         else
802                                                 value = 0x19;
803                                 }
804                                 if (value >= 0)
805                                         boxValues.Add (new DictionaryEntry (
806                                                 cp, value));
807                         }
808
809                         // For some characters store the name and sort later
810                         // to determine sorting.
811                         if (0x2100 <= cp && cp <= 0x213F &&
812                                 Char.IsSymbol ((char) cp))
813                                 sortableCharNames.Add (
814                                         new DictionaryEntry (cp, values [0]));
815                         else if (0x3380 <= cp && cp <= 0x33DD)
816                                 sortableCharNames.Add (new DictionaryEntry (
817                                         cp, values [0].Substring (7)));
818
819                         // diacritical weights by character name
820                         for (int d = 0; d < diacritics.Length; d++)
821                                 if (s.IndexOf (diacritics [d]) > 0)
822                                         diacritical [cp] |= diacriticWeights [d];
823                         // Two-step grep required for it.
824                         if (s.IndexOf ("FULL STOP") > 0 &&
825                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
826                                 diacritical [cp] |= 0xF4;
827
828                         // Arabic letter name
829                         if (0x0621 <= cp && cp <= 0x064A &&
830                                 Char.GetUnicodeCategory ((char) cp)
831                                 == UnicodeCategory.OtherLetter) {
832                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
833                                 switch (cp) {
834                                 case 0x0621:
835                                 case 0x0624:
836                                 case 0x0626:
837                                         // hamza, waw, yeh ... special cases.
838                                         value = 0x07;
839                                         break;
840                                 case 0x0649:
841                                 case 0x064A:
842                                         value = 0x77; // special cases.
843                                         break;
844                                 default:
845                                         // Get primary letter name i.e.
846                                         // XXX part of ARABIC LETTER XXX yyy
847                                         // e.g. that of "TEH MARBUTA" is "TEH".
848                                         string letterName =
849                                                 (cp == 0x0640) ?
850                                                 // 0x0640 is special: it does
851                                                 // not start with ARABIC LETTER
852                                                 values [0] :
853                                                 values [0].Substring (14);
854                                         int tmpIdx = letterName.IndexOf (' ');
855                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
856 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
857                                         if (arabicNameMap.ContainsKey (letterName))
858                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
859                                         else
860                                                 arabicNameMap [letterName] = cp;
861                                         break;
862                                 }
863                                 arabicLetterPrimaryValues [cp] = value;
864                         }
865
866                         // Japanese square letter
867                         if (0x3300 <= cp && cp <= 0x3357)
868                                 if (!ExistsJIS (cp))
869                                         nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
870
871                         // normalizationType
872                         string decomp = values [4];
873                         idx = decomp.IndexOf ('<');
874                         if (idx >= 0) {
875                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
876                                 case "full":
877                                         decompType [cp] = DecompositionFull;
878                                         break;
879                                 case "sub":
880                                         decompType [cp] = DecompositionSub;
881                                         break;
882                                 case "super":
883                                         decompType [cp] = DecompositionSuper;
884                                         break;
885                                 case "small":
886                                         decompType [cp] = DecompositionSmall;
887                                         break;
888                                 case "isolated":
889                                         decompType [cp] = DecompositionIsolated;
890                                         break;
891                                 case "initial":
892                                         decompType [cp] = DecompositionInitial;
893                                         break;
894                                 case "final":
895                                         decompType [cp] = DecompositionFinal;
896                                         break;
897                                 case "medial":
898                                         decompType [cp] = DecompositionMedial;
899                                         break;
900                                 case "noBreak":
901                                         decompType [cp] = DecompositionNoBreak;
902                                         break;
903                                 case "compat":
904                                         decompType [cp] = DecompositionCompat;
905                                         break;
906                                 case "fraction":
907                                         decompType [cp] = DecompositionFraction;
908                                         break;
909                                 case "font":
910                                         decompType [cp] = DecompositionFont;
911                                         break;
912                                 case "circle":
913                                         decompType [cp] = DecompositionCircle;
914                                         break;
915                                 case "square":
916                                         decompType [cp] = DecompositionSquare;
917                                         break;
918                                 case "wide":
919                                         decompType [cp] = DecompositionWide;
920                                         break;
921                                 case "narrow":
922                                         decompType [cp] = DecompositionNarrow;
923                                         break;
924                                 case "vertical":
925                                         decompType [cp] = DecompositionVertical;
926                                         break;
927                                 default:
928                                         throw new Exception ("Support NFKD type : " + decomp);
929                                 }
930                         }
931                         else
932                                 decompType [cp] = DecompositionCanonical;
933                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
934                         if (decomp.Length > 0) {
935
936                                 string [] velems = decomp.Split (' ');
937                                 int didx = decompValues.Count;
938                                 decompIndex [cp] = didx;
939                                 foreach (string v in velems)
940                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
941                                 decompLength [cp] = velems.Length;
942
943                                 // [decmpType] -> this_cp
944                                 int targetCP = (int) decompValues [didx];
945                                 // for "(x)" it specially maps to 'x' .
946                                 // FIXME: check if it is sane
947                                 if (velems.Length == 3 &&
948                                         (int) decompValues [didx] == '(' &&
949                                         (int) decompValues [didx + 2] == ')')
950                                         targetCP = (int) decompValues [didx + 1];
951                                 // special: 0x215F "1/"
952                                 else if (cp == 0x215F)
953                                         targetCP = '1';
954                                 else if (velems.Length > 1 &&
955                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
956                                         // skip them, except for CJK ideograph compat
957                                         targetCP = 0;
958
959                                 if (targetCP != 0) {
960                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
961                                         if (entry == null) {
962                                                 entry = new Hashtable ();
963                                                 nfkdMap [targetCP] = entry;
964                                         }
965                                         entry [(byte) decompType [cp]] = cp;
966                                 }
967                         }
968                         // numeric values
969                         if (values [5].Length > 0)
970                                 decimalValue [cp] = decimal.Parse (values [5]);
971                         else if (values [6].Length > 0)
972                                 decimalValue [cp] = decimal.Parse (values [6]);
973                         else if (values [7].Length > 0) {
974                                 string decstr = values [7];
975                                 idx = decstr.IndexOf ('/');
976                                 if (cp == 0x215F) // special. "1/"
977                                         decimalValue [cp] = 0x1;
978                                 else if (idx > 0)
979                                         // m/n
980                                         decimalValue [cp] = 
981                                                 decimal.Parse (decstr.Substring (0, idx))
982                                                 / decimal.Parse (decstr.Substring (idx + 1));
983                                 else if (decstr [0] == '(' &&
984                                         decstr [decstr.Length - 1] == ')')
985                                         // (n)
986                                         decimalValue [cp] =
987                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
988                                 else if (decstr [decstr.Length - 1] == '.')
989                                         // n.
990                                         decimalValue [cp] =
991                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
992                                 else
993                                         decimalValue [cp] = decimal.Parse (decstr);
994                         }
995                 }
996
997                 void ParseDerivedCoreProperties (string filename)
998                 {
999                         // IsUppercase
1000                         using (StreamReader file =
1001                                 new StreamReader (filename)) {
1002                                 for (int line = 1; file.Peek () >= 0; line++) {
1003                                         try {
1004                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1005                                         } catch (Exception) {
1006                                                 Console.Error.WriteLine ("**** At line " + line);
1007                                                 throw;
1008                                         }
1009                                 }
1010                         }
1011                 }
1012
1013                 void ProcessDerivedCorePropLine (string s)
1014                 {
1015                         int idx = s.IndexOf ('#');
1016                         if (idx >= 0)
1017                                 s = s.Substring (0, idx);
1018                         idx = s.IndexOf (';');
1019                         if (idx < 0)
1020                                 return;
1021                         string cpspec = s.Substring (0, idx);
1022                         idx = cpspec.IndexOf ("..");
1023                         NumberStyles nf = NumberStyles.HexNumber |
1024                                 NumberStyles.AllowTrailingWhite;
1025                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1026                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1027                         string value = s.Substring (cpspec.Length + 1).Trim ();
1028
1029                         // FIXME: use index
1030                         if (cp > char.MaxValue)
1031                                 return;
1032
1033                         switch (value) {
1034                         case "Uppercase":
1035                                 for (int x = cp; x <= cpEnd; x++)
1036                                         isUppercase [x] = true;
1037                                 break;
1038                         }
1039                 }
1040
1041                 void ParseScripts (string filename)
1042                 {
1043                         ArrayList cyrillic = new ArrayList ();
1044                         ArrayList gurmukhi = new ArrayList ();
1045                         ArrayList gujarati = new ArrayList ();
1046                         ArrayList georgian = new ArrayList ();
1047                         ArrayList thaana = new ArrayList ();
1048
1049                         using (StreamReader file =
1050                                 new StreamReader (filename)) {
1051                                 while (file.Peek () >= 0) {
1052                                         string s = file.ReadLine ();
1053                                         int idx = s.IndexOf ('#');
1054                                         if (idx >= 0)
1055                                                 s = s.Substring (0, idx);
1056                                         idx = s.IndexOf (';');
1057                                         if (idx < 0)
1058                                                 continue;
1059
1060                                         string cpspec = s.Substring (0, idx);
1061                                         idx = cpspec.IndexOf ("..");
1062                                         NumberStyles nf = NumberStyles.HexNumber |
1063                                                 NumberStyles.AllowTrailingWhite;
1064                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1065                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1066                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1067
1068                                         // FIXME: use index
1069                                         if (cp > char.MaxValue)
1070                                                 continue;
1071
1072                                         switch (value) {
1073                                         case "Cyrillic":
1074                                                 for (int x = cp; x <= cpEnd; x++)
1075                                                         if (!IsIgnorable (x))
1076                                                                 cyrillic.Add ((char) x);
1077                                                 break;
1078                                         case "Gurmukhi":
1079                                                 for (int x = cp; x <= cpEnd; x++)
1080                                                         if (!IsIgnorable (x))
1081                                                                 gurmukhi.Add ((char) x);
1082                                                 break;
1083                                         case "Gujarati":
1084                                                 for (int x = cp; x <= cpEnd; x++)
1085                                                         if (!IsIgnorable (x))
1086                                                                 gujarati.Add ((char) x);
1087                                                 break;
1088                                         case "Georgian":
1089                                                 for (int x = cp; x <= cpEnd; x++)
1090                                                         if (!IsIgnorable (x))
1091                                                                 georgian.Add ((char) x);
1092                                                 break;
1093                                         case "Thaana":
1094                                                 for (int x = cp; x <= cpEnd; x++)
1095                                                         if (!IsIgnorable (x))
1096                                                                 thaana.Add ((char) x);
1097                                                 break;
1098                                         }
1099                                 }
1100                         }
1101                         cyrillic.Sort (UCAComparer.Instance);
1102                         gurmukhi.Sort (UCAComparer.Instance);
1103                         gujarati.Sort (UCAComparer.Instance);
1104                         georgian.Sort (UCAComparer.Instance);
1105                         thaana.Sort (UCAComparer.Instance);
1106                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1107                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1108                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1109                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1110                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1111                 }
1112
1113                 void ParseJISOrder (string filename)
1114                 {
1115                         using (StreamReader file =
1116                                 new StreamReader (filename)) {
1117                                 while (file.Peek () >= 0) {
1118                                         string s = file.ReadLine ();
1119                                         int idx = s.IndexOf ('#');
1120                                         if (idx >= 0)
1121                                                 s = s.Substring (0, idx).Trim ();
1122                                         if (s.Length == 0)
1123                                                 continue;
1124                                         idx = s.IndexOf (' ');
1125                                         if (idx < 0)
1126                                                 continue;
1127                                         // They start with "0x" so cut them out.
1128                                         int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber);
1129                                         int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
1130                                         jisJapanese.Add (new JISCharacter (cp, jis));
1131                                 }
1132                         }
1133                 }
1134
1135                 void ParseCJK (string zhXML, string jaXML, string koXML)
1136                 {
1137                         XmlDocument doc = new XmlDocument ();
1138                         doc.XmlResolver = null;
1139                         int v;
1140                         string s;
1141                         string category;
1142                         int offset;
1143                         ushort [] arr;
1144
1145                         // Chinese Simplified
1146                         category = "chs";
1147                         arr = cjkCHS;
1148                         offset = 0;//char.MaxValue - arr.Length;
1149                         doc.Load (zhXML);
1150                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1151                         v = 0x8008;
1152                         foreach (char c in s) {
1153                                 if (c < '\u3100')
1154                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1155                                 else {
1156                                         arr [(int) c - offset] = (ushort) v++;
1157                                         if (v % 256 == 0)
1158                                                 v += 2;
1159                                 }
1160                         }
1161
1162                         // Chinese Traditional
1163                         category = "cht";
1164                         arr = cjkCHT;
1165                         offset = 0;//char.MaxValue - arr.Length;
1166                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1167                         v = 0x8002;
1168                         foreach (char c in s) {
1169                                 if (c < '\u4E00')
1170                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1171                                 else {
1172                                         arr [(int) c - offset] = (ushort) v++;
1173                                         if (v % 256 == 0)
1174                                                 v += 2;
1175                                 }
1176                         }
1177
1178                         // Japanese
1179                         category = "ja";
1180                         arr = cjkJA;
1181                         offset = 0;//char.MaxValue - arr.Length;
1182                         doc.Load (jaXML);
1183                         s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1184                         v = 0x8008;
1185                         foreach (char c in s) {
1186                                 if (c < '\u4E00')
1187                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1188                                 else {
1189                                         arr [(int) c - offset] = (ushort) v++;
1190                                         if (v % 256 == 0)
1191                                                 v += 2;
1192                                 }
1193                         }
1194
1195                         // Korean
1196                         // Korean weight is somewhat complex. It first shifts
1197                         // Hangul category from 52-x to 80-x (they are anyways
1198                         // computed). CJK ideographs are placed at secondary
1199                         // weight, like XX YY 01 zz 01, where XX and YY are
1200                         // corresponding "reset" value and zz is 41,43,45...
1201                         //
1202                         // Unlike chs,cht and ja, Korean value is a combined
1203                         // ushort which is computed as category
1204                         //
1205                         category = "ko";
1206                         arr = cjkKO;
1207                         offset = 0;//char.MaxValue - arr.Length;
1208                         doc.Load (koXML);
1209                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1210                                 XmlElement sc = (XmlElement) reset.NextSibling;
1211                                 // compute "category" and "level 1" for the 
1212                                 // target "reset" Hangle syllable
1213                                 char rc = reset.InnerText [0];
1214                                 int ri = ((int) rc - 0xAC00) + 1;
1215                                 ushort p = (ushort)
1216                                         ((ri / 254) * 256 + (ri % 254) + 2);
1217                                 // Place the characters after the target.
1218                                 s = sc.InnerText;
1219                                 v = 0x41;
1220                                 foreach (char c in s) {
1221                                         arr [(int) c - offset] = p;
1222                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1223                                         v += 2;
1224                                 }
1225                         }
1226                 }
1227
1228                 #endregion
1229
1230                 #region Generation
1231
1232                 void FillIgnorables ()
1233                 {
1234                         for (int i = 0; i <= char.MaxValue; i++) {
1235                                 if (Char.GetUnicodeCategory ((char) i) ==
1236                                         UnicodeCategory.OtherNotAssigned)
1237                                         continue;
1238                                 if (IsIgnorable (i))
1239                                         ignorableFlags [i] |= 1;
1240                                 if (IsIgnorableSymbol (i))
1241                                         ignorableFlags [i] |= 2;
1242                                 if (IsIgnorableNonSpacing (i))
1243                                         ignorableFlags [i] |= 4;
1244                         }
1245                 }
1246
1247                 void ModifyParsedValues ()
1248                 {
1249                         // number, secondary weights
1250                         byte weight = 0x38;
1251                         int [] numarr = numberSecondaryWeightBounds;
1252                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1253                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1254                                         if (Char.IsNumber ((char) cp))
1255                                                 diacritical [cp] = weight;
1256
1257                         // Korean parens numbers
1258                         for (int i = 0x3200; i <= 0x321C; i++)
1259                                 diacritical [i] = 0xA;
1260                         for (int i = 0x3260; i <= 0x327B; i++)
1261                                 diacritical [i] = 0xC;
1262
1263                         // Update name part of named characters
1264                         for (int i = 0; i < sortableCharNames.Count; i++) {
1265                                 DictionaryEntry de =
1266                                         (DictionaryEntry) sortableCharNames [i];
1267                                 int cp = (int) de.Key;
1268                                 string renamed = null;
1269                                 switch (cp) {
1270                                 case 0x2101: renamed = "A_1"; break;
1271                                 case 0x33C3: renamed = "A_2"; break;
1272                                 case 0x2105: renamed = "C_1"; break;
1273                                 case 0x2106: renamed = "C_2"; break;
1274                                 case 0x211E: renamed = "R1"; break;
1275                                 case 0x211F: renamed = "R2"; break;
1276                                 // Remove some of them!
1277                                 case 0x2103:
1278                                 case 0x2109:
1279                                 case 0x2116:
1280                                 case 0x2117:
1281                                 case 0x2118:
1282                                 case 0x2125:
1283                                 case 0x2127:
1284                                 case 0x2129:
1285                                 case 0x212E:
1286                                 case 0x2132:
1287                                         sortableCharNames.RemoveAt (i);
1288                                         i--;
1289                                         continue;
1290                                 }
1291                                 if (renamed != null)
1292                                         sortableCharNames [i] =
1293                                                 new DictionaryEntry (cp, renamed);
1294                         }
1295                 }
1296
1297                 void GenerateCore ()
1298                 {
1299                         UnicodeCategory uc;
1300
1301                         #region Specially ignored // 01
1302                         // This will raise "Defined" flag up.
1303                         foreach (char c in specialIgnore)
1304                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1305                         #endregion
1306
1307
1308                         #region Variable weights
1309                         // Controls : 06 03 - 06 3D
1310                         fillIndex [6] = 3;
1311                         for (int i = 0; i < 65536; i++) {
1312                                 if (IsIgnorable (i))
1313                                         continue;
1314                                 char c = (char) i;
1315                                 uc = Char.GetUnicodeCategory (c);
1316                                 // NEL is whitespace but not ignored here.
1317                                 if (uc == UnicodeCategory.Control &&
1318                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1319                                         AddCharMap (c, 6, 1);
1320                         }
1321
1322                         // Apostrophe 06 80
1323                         fillIndex [6] = 0x80;
1324                         AddCharMapGroup ('\'', 6, 1, 0);
1325                         AddCharMap ('\uFE63', 6, 1);
1326
1327                         // Hyphen/Dash : 06 81 - 06 90
1328                         for (int i = 0; i < char.MaxValue; i++) {
1329                                 if (Char.GetUnicodeCategory ((char) i)
1330                                         == UnicodeCategory.DashPunctuation)
1331 //                                      AddCharMapGroupTail ((char) i, 6, 1);
1332                                         AddCharMapGroup ((char) i, 6, 1, 0);
1333                         }
1334
1335                         // Arabic variable weight chars 06 A0 -
1336                         fillIndex [6] = 0xA0;
1337                         // vowels
1338                         for (int i = 0x64B; i <= 0x650; i++)
1339                                 AddCharMapGroupTail ((char) i, 6, 1);
1340                         // sukun
1341                         AddCharMapGroup ('\u0652', 6, 1, 0);
1342                         // shadda
1343                         AddCharMapGroup ('\u0651', 6, 1, 0);
1344                         #endregion
1345
1346
1347                         #region Nonspacing marks // 01
1348                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1349
1350                         // Combining diacritical marks: 01 DC -
1351
1352                         fillIndex [0x1] = 0x41;
1353                         for (int i = 0x030E; i <= 0x0326; i++)
1354                                 if (!IsIgnorable (i))
1355                                         AddCharMap ((char) i, 0x1, 1);
1356                         for (int i = 0x0329; i <= 0x0334; i++)
1357                                 if (!IsIgnorable (i))
1358                                         AddCharMap ((char) i, 0x1, 1);
1359                         for (int i = 0x0339; i <= 0x0341; i++)
1360                                 if (!IsIgnorable (i))
1361                                         AddCharMap ((char) i, 0x1, 1);
1362                         fillIndex [0x1] = 0x72;
1363                         for (int i = 0x0346; i <= 0x0348; i++)
1364                                 if (!IsIgnorable (i))
1365                                         AddCharMap ((char) i, 0x1, 1);
1366                         for (int i = 0x02BE; i <= 0x02BF; i++)
1367                                 if (!IsIgnorable (i))
1368                                         AddCharMap ((char) i, 0x1, 1);
1369                         for (int i = 0x02C1; i <= 0x02C5; i++)
1370                                 if (!IsIgnorable (i))
1371                                         AddCharMap ((char) i, 0x1, 1);
1372                         for (int i = 0x02CE; i <= 0x02CF; i++)
1373                                 if (!IsIgnorable (i))
1374                                         AddCharMap ((char) i, 0x1, 1);
1375                         for (int i = 0x02D1; i <= 0x02D3; i++)
1376                                 if (!IsIgnorable (i))
1377                                         AddCharMap ((char) i, 0x1, 1);
1378                         AddCharMap ('\u02DE', 0x1, 1);
1379                         for (int i = 0x02E4; i <= 0x02E9; i++)
1380                                 if (!IsIgnorable (i))
1381                                         AddCharMap ((char) i, 0x1, 1);
1382
1383                         // LAMESPEC: It should not stop at '\u20E1'. There are
1384                         // a few more characters (that however results in 
1385                         // overflow of level 2 unless we start before 0xDD).
1386                         fillIndex [0x1] = 0xDC;
1387                         for (int i = 0x20d0; i <= 0x20e1; i++)
1388                                 AddCharMap ((char) i, 0x1, 1);
1389                         #endregion
1390
1391
1392                         #region Whitespaces // 07 03 -
1393                         fillIndex [0x7] = 0x2;
1394                         AddCharMap (' ', 0x7, 2);
1395                         AddCharMap ('\u00A0', 0x7, 1);
1396                         for (int i = 9; i <= 0xD; i++)
1397                                 AddCharMap ((char) i, 0x7, 1);
1398                         for (int i = 0x2000; i <= 0x200B; i++)
1399                                 AddCharMap ((char) i, 0x7, 1);
1400
1401                         fillIndex [0x7] = 0x17;
1402                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1403                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1404
1405                         // Characters which used to represent layout control.
1406                         // LAMESPEC: Windows developers seem to have thought 
1407                         // that those characters are kind of whitespaces,
1408                         // while they aren't.
1409                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1410                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1411                         #endregion
1412
1413                         // FIXME: 09 should be more complete.
1414                         fillIndex [0x9] = 2;
1415                         // misc tech mark
1416                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1417                                 AddCharMap ((char) cp, 0x9, 1, 0);
1418
1419                         // arrows
1420                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1421                         foreach (DictionaryEntry de in arrowValues) {
1422                                 int idx = (int) de.Value;
1423                                 int cp = (int) de.Key;
1424                                 if (map [cp].Defined)
1425                                         continue;
1426                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1427                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1428                                 arrowLv2 [idx]++;
1429                         }
1430                         // boxes
1431                         byte [] boxLv2 = new byte [128];
1432                         for (int i = 0; i < boxLv2.Length; i++)
1433                                 boxLv2 [i] = 3;
1434                         foreach (DictionaryEntry de in boxValues) {
1435                                 int cp = (int) de.Key;
1436                                 int idx = (int) de.Value;
1437                                 if (map [cp].Defined)
1438                                         continue;
1439                                 fillIndex [0x9] = (byte) (0xE5 + idx);
1440                                 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1441                                 boxLv2 [idx]++;
1442                         }
1443                         // Some special characters (slanted)
1444                         fillIndex [0x9] = 0xF4;
1445                         AddCharMap ('\u2571', 0x9, 3);
1446                         AddCharMap ('\u2572', 0x9, 3);
1447                         AddCharMap ('\u2573', 0x9, 3);
1448
1449                         // FIXME: implement 0A
1450                         #region Symbols
1451                         fillIndex [0xA] = 2;
1452                         // byte currency symbols
1453                         for (int cp = 0; cp < 0x100; cp++) {
1454                                 uc = Char.GetUnicodeCategory ((char) cp);
1455                                 if (!IsIgnorable (cp) &&
1456                                         uc == UnicodeCategory.CurrencySymbol &&
1457                                         cp != '$')
1458                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1459                         }
1460                         // byte other symbols
1461                         for (int cp = 0; cp < 0x100; cp++) {
1462                                 if (cp == 0xA6)
1463                                         continue; // SPECIAL: skip FIXME: why?
1464                                 uc = Char.GetUnicodeCategory ((char) cp);
1465                                 if (!IsIgnorable (cp) &&
1466                                         uc == UnicodeCategory.OtherSymbol)
1467                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1468                         }
1469
1470                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1471                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1472                                 AddCharMap ((char) cp, 0xA, 1, 0);
1473                         // Dingbats
1474                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1475                                 if (Char.IsSymbol ((char) cp))
1476                                         AddCharMap ((char) cp, 0xA, 1, 0);
1477                         // OCR
1478                         for (int i = 0x2440; i < 0x2460; i++)
1479                                 AddCharMap ((char) i, 0xA, 1, 0);
1480
1481                         #endregion
1482
1483                         #region Numbers // 0C 02 - 0C E1
1484                         fillIndex [0xC] = 2;
1485
1486                         // 9F8 : Bengali "one less than the denominator"
1487                         AddCharMap ('\u09F8', 0xC, 1);
1488
1489                         ArrayList numbers = new ArrayList ();
1490                         for (int i = 0; i < 65536; i++)
1491                                 if (!IsIgnorable (i) &&
1492                                         Char.IsNumber ((char) i) &&
1493                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1494                                         numbers.Add (i);
1495
1496                         ArrayList numberValues = new ArrayList ();
1497                         foreach (int i in numbers)
1498                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1499                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1500
1501 //foreach (DictionaryEntry de in numberValues)
1502 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1503
1504                         decimal prevValue = -1;
1505                         foreach (DictionaryEntry de in numberValues) {
1506                                 int cp = (int) de.Key;
1507                                 decimal currValue = (decimal) de.Value;
1508                                 bool addnew = false;
1509                                 if (prevValue < currValue &&
1510                                         prevValue - (int) prevValue == 0 &&
1511                                         prevValue >= 1) {
1512
1513                                         addnew = true;
1514                                         // Process Hangzhou and Roman numbers
1515
1516                                         // There are some SPECIAL cases.
1517                                         if (currValue != 4) // no increment for 4
1518                                                 fillIndex [0xC]++;
1519
1520                                         int xcp;
1521                                         xcp = (int) prevValue + 0x2170 - 1;
1522                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1523                                         xcp = (int) prevValue + 0x2160 - 1;
1524                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1525                                         fillIndex [0xC] += 2;
1526                                         xcp = (int) prevValue + 0x3021 - 1;
1527                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1528                                         fillIndex [0xC]++;
1529                                 }
1530                                 if (prevValue < currValue)
1531                                         prevValue = currValue;
1532                                 if (map [cp].Defined)
1533                                         continue;
1534                                 // HangZhou and Roman are add later 
1535                                 // (code is above)
1536                                 else if (0x3021 <= cp && cp < 0x302A
1537                                         || 0x2160 <= cp && cp < 0x216A
1538                                         || 0x2170 <= cp && cp < 0x217A)
1539                                         continue;
1540
1541                                 if (cp ==  0x215B) // FIXME: why?
1542                                         fillIndex [0xC] += 2;
1543                                 else if (cp == 0x3021) // FIXME: why?
1544                                         fillIndex [0xC]++;
1545                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1546
1547                                 if (addnew || cp <= '9') {
1548                                         int xcp;
1549                                         if (1 <= currValue && currValue <= 10) {
1550                                                 xcp = cp - 0x31 + 0x2776;
1551                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1552                                                 xcp = cp - 0x31 + 0x2780;
1553                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1554                                                 xcp = cp - 0x31 + 0x278A;
1555                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1556                                         }
1557                                         if (1 <= currValue && currValue <= 20) {
1558                                                 xcp = cp - 0x31 + 0x2460;
1559                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1560                                                 xcp = cp - 0x31 + 0x2474;
1561                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1562                                                 xcp = cp - 0x31 + 0x2488;
1563                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1564                                         }
1565                                 }
1566
1567                                 if (cp != 0x09E7 && cp != 0x09EA)
1568                                         fillIndex [0xC]++;
1569
1570                                 // Add special cases that are not regarded as 
1571                                 // numbers in UnicodeCategory speak.
1572                                 if (cp == '5') {
1573                                         // TONE FIVE
1574                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1575                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1576                                 }
1577                                 else if (cp == '6') // FIXME: why?
1578                                         fillIndex [0xC]++;
1579                         }
1580
1581                         // 221E: infinity
1582                         fillIndex [0xC] = 0xFF;
1583                         AddCharMap ('\u221E', 0xC, 1);
1584                         #endregion
1585
1586                         #region Letters and NonSpacing Marks (general)
1587
1588                         // ASCII Latin alphabets
1589                         for (int i = 0; i < alphabets.Length; i++)
1590                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1591
1592
1593                         // non-ASCII Latin alphabets
1594                         // FIXME: there is no such characters that are placed
1595                         // *after* "alphabets" array items. This is nothing
1596                         // more than a hack that creates dummy weight for
1597                         // primary characters.
1598                         for (int i = 0x0080; i < 0x0300; i++) {
1599                                 if (!Char.IsLetter ((char) i))
1600                                         continue;
1601                                 // For those Latin Letters which has NFKD are
1602                                 // not added as independent primary character.
1603                                 if (decompIndex [i] != 0)
1604                                         continue;
1605                                 // SPECIAL CASES:
1606                                 // 1.some alphabets have primarily
1607                                 //   equivalent ASCII alphabets.
1608                                 // 2.some have independent primary weights,
1609                                 //   but inside a-to-z range.
1610                                 // 3.there are some expanded characters that
1611                                 //   are not part of Unicode Standard NFKD.
1612                                 switch (i) {
1613                                 // 1. skipping them does not make sense
1614 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
1615 //                              case 0x184: case 0x185: case 0x186: case 0x189:
1616 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
1617 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
1618 //                              case 0x19B: case 0x19C:
1619                                 // 2. skipping them does not make sense
1620 //                              case 0x14A: // Ng
1621 //                              case 0x14B: // ng
1622                                 // 3.
1623                                 case 0xC6: // AE
1624                                 case 0xE6: // ae
1625                                 case 0xDE: // Icelandic Thorn
1626                                 case 0xFE: // Icelandic Thorn
1627                                 case 0xDF: // German ss
1628                                 case 0xFF: // German ss
1629                                 // not classified yet
1630 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1631 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1632 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1633 //                              case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1634 //                              case 0x1DD:
1635                                         continue;
1636                                 }
1637                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
1638                         }
1639
1640                         // Greek and Coptic
1641                         fillIndex [0xF] = 02;
1642                         for (int i = 0x0380; i < 0x0390; i++)
1643                                 if (Char.IsLetter ((char) i))
1644                                         AddLetterMap ((char) i, 0xF, 1);
1645                         fillIndex [0xF] = 02;
1646                         for (int i = 0x0391; i < 0x03CF; i++)
1647                                 if (Char.IsLetter ((char) i))
1648                                         AddLetterMap ((char) i, 0xF, 1);
1649                         fillIndex [0xF] = 0x40;
1650                         for (int i = 0x03D0; i < 0x0400; i++)
1651                                 if (Char.IsLetter ((char) i))
1652                                         AddLetterMap ((char) i, 0xF, 1);
1653
1654                         // Cyrillic - UCA order w/ some modification
1655                         fillIndex [0x10] = 0x3;
1656                         // table which is moslty from UCA DUCET.
1657                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1658                                 char c = orderedCyrillic [i];
1659                                 if (Char.IsLetter (c))
1660                                         AddLetterMap (c, 0x10, 3);
1661                         }
1662                         for (int i = 0x0460; i < 0x0481; i++) {
1663                                 if (Char.IsLetter ((char) i))
1664                                         AddLetterMap ((char) i, 0x10, 3);
1665                         }
1666
1667                         // Armenian
1668                         fillIndex [0x11] = 0x3;
1669                         for (int i = 0x0531; i < 0x0586; i++)
1670                                 if (Char.IsLetter ((char) i))
1671                                         AddLetterMap ((char) i, 0x11, 1);
1672
1673                         // Hebrew
1674                         // -Letters
1675                         fillIndex [0x12] = 0x3;
1676                         for (int i = 0x05D0; i < 0x05FF; i++)
1677                                 if (Char.IsLetter ((char) i))
1678                                         AddLetterMap ((char) i, 0x12, 1);
1679                         // -Accents
1680                         fillIndex [0x1] = 0x3;
1681                         for (int i = 0x0591; i <= 0x05C2; i++)
1682                                 if (i != 0x05BE)
1683                                         AddCharMap ((char) i, 0x1, 1);
1684
1685                         // Arabic
1686                         fillIndex [0x1] = 0x8E;
1687                         fillIndex [0x13] = 0x3;
1688                         for (int i = 0x0621; i <= 0x064A; i++) {
1689                                 // Abjad
1690                                 if (Char.GetUnicodeCategory ((char) i)
1691                                         != UnicodeCategory.OtherLetter) {
1692                                         // FIXME: arabic nonspacing marks are
1693                                         // in different order.
1694                                         AddCharMap ((char) i, 0x1, 1);
1695                                         continue;
1696                                 }
1697 //                              map [i] = new CharMapEntry (0x13,
1698 //                                      (byte) arabicLetterPrimaryValues [i], 1);
1699                                 fillIndex [0x13] = 
1700                                         (byte) arabicLetterPrimaryValues [i];
1701                                 AddLetterMap ((char) i, 0x13, 0);
1702                         }
1703                         fillIndex [0x13] = 0x84;
1704                         for (int i = 0x0674; i < 0x06D6; i++)
1705                                 if (Char.IsLetter ((char) i))
1706                                         AddLetterMap ((char) i, 0x13, 1);
1707
1708                         // Devanagari
1709                         // FIXME: it does seem straight codepoint mapping.
1710                         fillIndex [0x14] = 04;
1711                         for (int i = 0x0901; i < 0x0905; i++)
1712                                 if (!IsIgnorable (i))
1713                                         AddLetterMap ((char) i, 0x14, 2);
1714                         fillIndex [0x14] = 0xB;
1715                         for (int i = 0x0905; i < 0x093A; i++)
1716                                 if (Char.IsLetter ((char) i))
1717                                         AddLetterMap ((char) i, 0x14, 4);
1718                         for (int i = 0x093E; i < 0x094F; i++)
1719                                 if (!IsIgnorable (i))
1720                                         AddLetterMap ((char) i, 0x14, 2);
1721
1722                         // Bengali
1723                         // -Letters
1724                         fillIndex [0x15] = 02;
1725                         for (int i = 0x0980; i < 0x9FF; i++) {
1726                                 if (IsIgnorable (i))
1727                                         continue;
1728                                 if (i == 0x09E0)
1729                                         fillIndex [0x15] = 0x3B;
1730                                 switch (Char.GetUnicodeCategory ((char) i)) {
1731                                 case UnicodeCategory.NonSpacingMark:
1732                                 case UnicodeCategory.DecimalDigitNumber:
1733                                 case UnicodeCategory.OtherNumber:
1734                                         continue;
1735                                 }
1736                                 AddLetterMap ((char) i, 0x15, 1);
1737                         }
1738                         // -Signs
1739                         fillIndex [0x1] = 0x3;
1740                         for (int i = 0x0981; i < 0x0A00; i++)
1741                                 if (Char.GetUnicodeCategory ((char) i) ==
1742                                         UnicodeCategory.NonSpacingMark)
1743                                         AddCharMap ((char) i, 0x1, 1);
1744
1745                         // Gurmukhi. orderedGurmukhi is from UCA
1746                         // FIXME: it does not look equivalent to UCA.
1747                         fillIndex [0x1] = 03;
1748                         fillIndex [0x16] = 02;
1749                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
1750                                 char c = orderedGurmukhi [i];
1751                                 if (IsIgnorable ((int) c))
1752                                         continue;
1753                                 if (!Char.IsLetter (c)) {
1754                                         AddLetterMap (c, 0x1, 1);
1755                                         continue;
1756                                 }
1757                                 if (c == '\u0A3C' || c == '\u0A4D' ||
1758                                         '\u0A66' <= c && c <= '\u0A71')
1759                                         continue;
1760                                 AddLetterMap (c, 0x16, 4);
1761                         }
1762
1763                         // Gujarati. orderedGujarati is from UCA
1764                         fillIndex [0x17] = 02;
1765                         for (int i = 0; i < orderedGujarati.Length; i++)
1766                                 AddLetterMap (orderedGujarati [i], 0x17, 4);
1767
1768                         // Oriya
1769                         fillIndex [0x18] = 02;
1770                         for (int i = 0x0B00; i < 0x0B7F; i++) {
1771                                 switch (Char.GetUnicodeCategory ((char) i)) {
1772                                 case UnicodeCategory.NonSpacingMark:
1773                                 case UnicodeCategory.DecimalDigitNumber:
1774                                         continue;
1775                                 }
1776                                 AddLetterMap ((char) i, 0x18, 1);
1777                         }
1778
1779                         // Tamil
1780                         fillIndex [0x19] = 2;
1781                         AddCharMap ('\u0BD7', 0x19, 0);
1782                         fillIndex [0x19] = 0xA;
1783                         // vowels (FIXME: the start 0x0BD7 is above the bound 0x0B94, so this loop never runs)
1784                         for (int i = 0x0BD7; i < 0x0B94; i++)
1785                                 if (Char.IsLetter ((char) i))
1786                                         AddCharMap ((char) i, 0x19, 2);
1787                         // special vowel
1788                         fillIndex [0x19] = 0x24;
1789                         AddCharMap ('\u0B94', 0x19, 0);
1790                         fillIndex [0x19] = 0x26;
1791                         // The array for Tamil consonants is a constant.
1792                         // Windows has an almost identical sequence to TAM from
1793                         // tamilnet, but it differs a bit in Grantha.
1794                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
1795                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1796                         // combining marks
1797                         fillIndex [0x19] = 0x82;
1798                         for (int i = 0x0BBE; i < 0x0BCD; i++)
1799                                 if (Char.GetUnicodeCategory ((char) i) ==
1800                                         UnicodeCategory.SpacingCombiningMark
1801                                         || i == 0x0BC0)
1802                                         AddLetterMap ((char) i, 0x19, 2);
1803
1804                         // Telugu
1805                         fillIndex [0x1A] = 0x4;
1806                         for (int i = 0x0C00; i < 0x0C62; i++) {
1807                                 if (i == 0x0C55 || i == 0x0C56)
1808                                         continue; // skip
1809                                 AddCharMap ((char) i, 0x1A, 3);
1810                                 char supp = (i == 0x0C0B) ? '\u0C60':
1811                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
1812                                 if (supp == char.MinValue)
1813                                         continue;
1814                                 AddCharMap (supp, 0x1A, 3);
1815                         }
1816
1817                         // Kannada
1818                         fillIndex [0x1B] = 4;
1819                         for (int i = 0x0C80; i < 0x0CE5; i++) {
1820                                 if (i == 0x0CD5 || i == 0x0CD6)
1821                                         continue; // ignore
1822                                 AddCharMap ((char) i, 0x1B, 3);
1823                         }
1824                         
1825                         // Malayalam
1826                         fillIndex [0x1C] = 2;
1827                         for (int i = 0x0D02; i < 0x0D61; i++)
1828                                 // FIXME: I avoided MSCompatUnicodeTable usage
1829                                 // here (it results in recursion). So check if
1830                                 // using NonSpacingMark makes sense or not.
1831                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1832 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1833                                         AddCharMap ((char) i, 0x1C, 1);
1834
1835                         // Thai ... note that it breaks 0x1E wall after E2B!
1836                         // Also, all Thai characters have level 2 value 3.
1837                         fillIndex [0x1E] = 2;
1838                         for (int i = 0xE44; i < 0xE48; i++)
1839                                 AddCharMap ((char) i, 0x1E, 1, 3);
1840                         for (int i = 0xE01; i < 0xE2B; i++)
1841                                 AddCharMap ((char) i, 0x1E, 6, 0);
1842                         fillIndex [0x1F] = 5;
1843                         for (int i = 0xE2B; i < 0xE30; i++)
1844                                 AddCharMap ((char) i, 0x1F, 6, 0);
1845                         for (int i = 0xE30; i < 0xE3B; i++)
1846                                 AddCharMap ((char) i, 0x1F, 1, 3);
1847                         // some Thai characters remain.
1848                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
1849                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1850                         foreach (char c in specialThai)
1851                                 AddCharMap (c, 0x1F, 1);
1852
1853                         // Lao
1854                         fillIndex [0x1F] = 2;
1855                         for (int i = 0xE80; i < 0xEDF; i++)
1856                                 if (Char.IsLetter ((char) i))
1857                                         AddCharMap ((char) i, 0x1F, 1);
1858
1859                         // Georgian. orderedGeorgian is from UCA DUCET.
1860                         fillIndex [0x21] = 5;
1861                         for (int i = 0; i < orderedGeorgian.Length; i++)
1862                                 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1863
1864                         // Japanese Kana.
1865                         fillIndex [0x22] = 2;
1866                         int kanaOffset = 0x3041;
1867                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1868
1869                         for (int gyo = 0; gyo < 9; gyo++) {
1870                                 for (int dan = 0; dan < 5; dan++) {
1871                                         if (gyo == 7 && dan % 2 == 1) {
1872                                                 // 'ya'-gyo
1873                                                 fillIndex [0x22]++;
1874                                                 kanaOffset -= 2; // There is no space for yi and ye.
1875                                                 continue;
1876                                         }
1877                                         int cp = kanaOffset + dan * kanaLines [gyo];
1878                                         // small lines (a-gyo, ya-gyo)
1879                                         if (gyo == 0 || gyo == 7) {
1880                                                 AddKanaMap (cp, 1); // small
1881                                                 AddKanaMap (cp + 1, 1);
1882                                         }
1883                                         else
1884                                                 AddKanaMap (cp, kanaLines [gyo]);
1885                                         fillIndex [0x22]++;
1886
1887                                         if (cp == 0x3061) {
1888                                                 // add small 'Tsu' (before normal one)
1889                                                 AddKanaMap (0x3063, 1);
1890                                                 kanaOffset++;
1891                                         }
1892                                 }
1893                                 fillIndex [0x22] += 3;
1894                                 kanaOffset += 5 * kanaLines [gyo];
1895                         }
1896
1897                         // Wa-gyo is mostly irregular, so its entries are added manually.
1898                         AddLetterMap ((char) 0x308E, 0x22, 0);
1899                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1900                         AddLetterMap ((char) 0x308F, 0x22, 0);
1901                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1902                         fillIndex [0x22]++;
1903                         AddLetterMap ((char) 0x3090, 0x22, 0);
1904                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1905                         fillIndex [0x22] += 2;
1906                         // no "Wu" in Japanese.
1907                         AddLetterMap ((char) 0x3091, 0x22, 0);
1908                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1909                         fillIndex [0x22]++;
1910                         AddLetterMap ((char) 0x3092, 0x22, 0);
1911                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1912                         // Nn
1913                         fillIndex [0x22] = 0x80;
1914                         AddLetterMap ((char) 0x3093, 0x22, 0);
1915                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1916
1917                         // JIS Japanese square chars.
1918                         fillIndex [0x22] = 0x97;
1919                         jisJapanese.Sort (JISComparer.Instance);
1920                         foreach (JISCharacter j in jisJapanese)
1921                                 AddCharMap ((char) j.CP, 0x22, 1);
1922                         // non-JIS Japanese square chars.
1923                         nonJisJapanese.Sort (NonJISComparer.Instance);
1924                         foreach (NonJISCharacter j in nonJisJapanese)
1925                                 AddCharMap ((char) j.CP, 0x22, 1);
1926
1927                         // Bopomofo
1928                         fillIndex [0x23] = 0x02;
1929                         for (int i = 0x3105; i <= 0x312C; i++)
1930                                 AddCharMap ((char) i, 0x23, 1);
1931
1932                         // Estrangela: ancient Syriac
1933                         fillIndex [0x24] = 0x0B;
1934                         // FIXME: is 0x71E really alternative form?
1935                         ArrayList syriacAlternatives = new ArrayList (
1936                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1937                         for (int i = 0x0710; i <= 0x072C; i++) {
1938                                 if (i == 0x0711) // NonSpacingMark
1939                                         continue;
1940                                 if (syriacAlternatives.Contains (i))
1941                                         continue;
1942                                 AddCharMap ((char) i, 0x24, 4);
1943                                 // FIXME: why?
1944                                 if (i == 0x721)
1945                                         fillIndex [0x24]++;
1946                         }
1947                         foreach (int cp in syriacAlternatives)
1948                                 map [cp] = new CharMapEntry (0x24,
1949                                         (byte) (map [cp - 1].Level1 + 2),
1950                                         0);
1951
1952                         // Thaana
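1953                         // FIXME: it turned out that it does not look like UCA. NOTE(review): the loop below passes the index i, not orderedThaana [i], to IsIgnorableNonSpacing - confirm.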
1954                         fillIndex [0x24] = 0x6E;
1955                         for (int i = 0; i < orderedThaana.Length; i++) {
1956                                 if (IsIgnorableNonSpacing (i))
1957                                         continue;
1958                                 AddCharMap (orderedThaana [i], 0x24, 2);
1959                         }
1960                         #endregion
1961
1962                         // FIXME: Add more culture-specific letters (that are
1963                         // not supported in Windows collation) here.
1964
1965                         // Surrogate ... they are computed.
1966
1967                         #region Hangul
1968                         // Hangul.
1969                         //
1970                         // Unlike UCA, the Windows Hangul sequence mixes Jongseong
1971                         // with the Choseong sequence as well as Jungseong,
1972                         // adjusted to have the same primary weight for the
1973                         // same base character. So it is impossible to compute
1974                         // those sort keys.
1975                         //
1976                         // Here I introduce an ordered sequence of mixed
1977                         // 'commands' and 'characters' that is similar to
1978                         // LDML text:
1979                         //      - ',' increases primary weight.
1980                         //      - [A B] means a range, increasing index
1981                         //      - {A B} means a range, without increasing index
1982                         //      - '=' is no operation (it means the characters 
1983                         //        of both sides have the same weight).
1984                         //      - '>' inserts a Hangul Syllable block that 
1985                         //        contains 0x24C (0x15 * 0x1C) characters.
1986                         //      - '<' decreases the index
1987                         //      - '0'-'9' means skip count
1988                         //      - whitespaces are ignored
1989                         //
1990
1991                         string hangulSequence =
1992                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
1993                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
1994                         + "<{\u1113 \u1116}, \u3165,"
1995                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
1996                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
1997                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
1998                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
1999                                 + "[\u11D1 \u11D2], \u11B2,"
2000                                 + "[\u11D3 \u11D5], \u11B3,"
2001                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2002                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2003                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2004                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2005                         + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
2006                                 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
2007                         + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
2008                                 + "\u11EA,, \u110A=\u11BB,,, >"
2009                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2010                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2011                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2012                                 + "\u11F1,, \u11F2,,,"
2013                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2014                         + "<\u114D, \u110D,,  >"
2015                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2016                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2017                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2018                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2019                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2020                                 + "[\u11F5 \u11F8]"
2021                         ;
2022
2023                         byte hangulCat = 0x52;
2024                         fillIndex [hangulCat] = 0x2;
2025
2026                         int syllableBlock = 0;
2027                         for (int n = 0; n < hangulSequence.Length; n++) {
2028                                 char c = hangulSequence [n];
2029                                 int start, end;
2030                                 if (Char.IsWhiteSpace (c))
2031                                         continue;
2032                                 switch (c) {
2033                                 case '=':
2034                                         break; // NOP
2035                                 case ',':
2036                                         IncrementSequentialIndex (ref hangulCat);
2037                                         break;
2038                                 case '<':
2039                                         if (fillIndex [hangulCat] == 2)
2040                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2041                                         fillIndex [hangulCat]--;
2042                                         break;
2043                                 case '>':
2044                                         IncrementSequentialIndex (ref hangulCat);
2045                                         for (int l = 0; l < 0x15; l++)
2046                                                 for (int v = 0; v < 0x1C; v++) {
2047                                                         AddCharMap (
2048                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2049                                                         IncrementSequentialIndex (ref hangulCat);
2050                                                 }
2051                                         syllableBlock++;
2052                                         break;
2053                                 case '[':
2054                                         start = hangulSequence [n + 1];
2055                                         end = hangulSequence [n + 3];
2056                                         for (int i = start; i <= end; i++) {
2057                                                 AddCharMap ((char) i, hangulCat, 0);
2058                                                 if (end > i)
2059                                                         IncrementSequentialIndex (ref hangulCat);
2060                                         }
2061                                         n += 4; // consumes 5 characters for this operation
2062                                         break;
2063                                 case '{':
2064                                         start = hangulSequence [n + 1];
2065                                         end = hangulSequence [n + 3];
2066                                         for (int i = start; i <= end; i++)
2067                                                 AddCharMap ((char) i, hangulCat, 0);
2068                                         n += 4; // consumes 5 characters for this operation
2069                                         break;
2070                                 default:
2071                                         AddCharMap (c, hangulCat, 0);
2072                                         break;
2073                                 }
2074                         }
2075
2076                         #endregion
2077
2078                         // Letterlike characters and CJK compatibility square
2079                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2080                         int [] counts = new int ['Z' - 'A' + 1];
2081                         char [] namedChars = new char [sortableCharNames.Count];
2082                         int nCharNames = 0;
2083                         foreach (DictionaryEntry de in sortableCharNames) {
2084                                 counts [((string) de.Value) [0] - 'A']++;
2085                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2086                         }
2087                         nCharNames = 0; // reset
2088                         for (int a = 0; a < counts.Length; a++) {
2089                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2090                                 for (int i = 0; i < counts [a]; i++)
2091 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2092                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2093                         }
2094
2095                         // CJK unified ideograph.
2096                         byte cjkCat = 0x9E;
2097                         fillIndex [cjkCat] = 0x2;
2098                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2099                                 if (!IsIgnorable (cp))
2100                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2101                         // CJK Extensions go here.
2102                         // LAMESPEC: With this Windows style CJK layout, it is
2103                         // impossible to add more CJK ideographs, i.e. 0x9FA6-
2104                         // 0x9FBB can never be added w/o breaking compat.
2105                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2106                                 if (!IsIgnorable (cp))
2107                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2108
2109                         // PrivateUse ... computed.
2110                         // remaining Surrogate ... computed.
2111
2112                         #region Special "biggest" area (FF FF)
2113                         fillIndex [0xFF] = 0xFF;
2114                         char [] specialBiggest = new char [] {
2115                                 '\u3005', '\u3031', '\u3032', '\u309D',
2116                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2117                                 '\uFE7C', '\uFE7D', '\uFF70'};
2118                         foreach (char c in specialBiggest)
2119                                 AddCharMap (c, 0xFF, 0);
2120                         #endregion
2121
2122                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2123                         // non-alphanumeric ASCII except for: + - < = > '
2124                         for (int i = 0x21; i < 0x7F; i++) {
2125                                 if (Char.IsLetterOrDigit ((char) i)
2126                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2127                                         continue; // they are not added here.
2128                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2129                                 // Insert 3001 after ',' and 3002 after '.'
2130                                 if (i == 0x2C)
2131                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2132                                 else if (i == 0x2E) {
2133                                         fillIndex [0x7]--;
2134                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2135                                 }
2136                                 else if (i == 0x3A)
2137                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2138                         }
2139                         #endregion
2140
2141                         #region 07 - Punctuations and something else
2142                         for (int i = 0xA0; i < char.MaxValue; i++) {
2143                                 if (IsIgnorable (i))
2144                                         continue;
2145
2146                                 // SPECIAL CASES:
2147                                 switch (i) {
2148                                 case 0xAB: // 08
2149                                 case 0xB7: // 0A
2150                                 case 0x2329: // 09
2151                                 case 0x232A: // 09
2152                                         continue;
2153                                 }
2154
2155                                 switch (Char.GetUnicodeCategory ((char) i)) {
2156                                 case UnicodeCategory.OtherPunctuation:
2157                                 case UnicodeCategory.ClosePunctuation:
2158                                 case UnicodeCategory.OpenPunctuation:
2159                                 case UnicodeCategory.InitialQuotePunctuation:
2160                                 case UnicodeCategory.FinalQuotePunctuation:
2161                                 case UnicodeCategory.ModifierSymbol:
2162                                         // SPECIAL CASES: // 0xA
2163                                         if (0x2020 <= i && i <= 0x2042)
2164                                                 continue;
2165                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2166                                         break;
2167                                 default:
2168                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2169                                                 goto case UnicodeCategory.OtherPunctuation;
2170                                         break;
2171                                 }
2172                         }
2173                         // Control pictures
2174                         for (int i = 0x2400; i <= 0x2421; i++)
2175                                 AddCharMap ((char) i, 0x7, 1, 0);
2176                         #endregion
2177
2178                         // FIXME: for 07 xx we need more love.
2179
2180                         // FIXME: 08 should be more complete.
2181                         fillIndex [0x8] = 2;
2182                         for (int cp = 0; cp < char.MaxValue; cp++)
2183                                 if (!map [cp].Defined &&
2184                                         Char.GetUnicodeCategory ((char) cp) ==
2185                                         UnicodeCategory.MathSymbol)
2186                                         AddCharMapGroup ((char) cp, 0x8, 1, 0);
2187
2188                         // Characters w/ diacritical marks (NFKD)
2189                         for (int i = 0; i <= char.MaxValue; i++) {
2190                                 if (map [i].Defined || IsIgnorable (i))
2191                                         continue;
2192                                 if (decompIndex [i] == 0)
2193                                         continue;
2194
2195                                 int start = decompIndex [i];
2196                                 int primaryChar = decompValues [start];
2197                                 int secondary = 0;
2198                                 bool skip = false;
2199                                 int length = decompLength [i];
2200                                 // special processing for parenthesized ones.
2201                                 if (length == 3 &&
2202                                         decompValues [start] == '(' &&
2203                                         decompValues [start + 2] == ')') {
2204                                         primaryChar = decompValues [start + 1];
2205                                         length = 1;
2206                                 }
2207
2208                                 if (map [primaryChar].Level1 == 0)
2209                                         continue;
2210
2211                                 for (int l = 1; l < length; l++) {
2212                                         int c = decompValues [start + l];
2213                                         if (map [c].Level1 != 0)
2214                                                 skip = true;
2215                                         secondary += diacritical [c];
2216                                 }
2217                                 if (skip)
2218                                         continue;
2219                                 map [i] = new CharMapEntry (
2220                                         map [primaryChar].Category,
2221                                         map [primaryChar].Level1,
2222                                         (byte) secondary);
2223                                 
2224                         }
2225
2226                         #region Level2 adjustment
2227                         // Arabic Hamzah
2228                         diacritical [0x624] = 0x5;
2229                         diacritical [0x626] = 0x7;
2230                         diacritical [0x622] = 0x9;
2231                         diacritical [0x623] = 0xA;
2232                         diacritical [0x625] = 0xB;
2233                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2234                         diacritical [0x64A] = 0x7; // Yaa'
2235
2236
2237                         for (int i = 0; i < char.MaxValue; i++) {
2238                                 byte mod = 0;
2239                                 byte cat = map [i].Category;
2240                                 switch (cat) {
2241                                 case 0xE: // Latin diacritics
2242                                 case 0x22: // Japanese: circled characters
2243                                         mod = diacritical [i];
2244                                         break;
2245                                 case 0x13: // Arabic
2246                                         if (diacritical [i] == 0)
2247                                                 mod = 0x8; // default for arabic
2248                                         break;
2249                                 }
2250                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2251                                         mod = diacritical [i];
2252                                 if (mod > 0)
2253                                         map [i] = new CharMapEntry (
2254                                                 cat, map [i].Level1, mod);
2255                         }
2256                         #endregion
2257
2258                         // FIXME: this is hack but those which are 
2259                         // NonSpacingMark characters and still undefined
2260                         // are likely to be nonspacing.
2261                         for (int i = 0; i < char.MaxValue; i++)
2262                                 if (!map [i].Defined &&
2263                                         !IsIgnorable (i) &&
2264                                         Char.GetUnicodeCategory ((char) i) ==
2265                                         UnicodeCategory.NonSpacingMark)
2266                                         AddCharMap ((char) i, 1, 1);
2267                 }
2268
// Advances the fill index of the category referenced by hangulCat by one.
// When the incremented index comes back as zero (the wrap-around case),
// hangulCat itself is moved to the next category and that category's
// fill index is started at 2.
private void IncrementSequentialIndex (ref byte hangulCat)
{
        fillIndex [hangulCat]++;
        if (fillIndex [hangulCat] != 0)
                return;

        // The index wrapped around: continue in the next category.
        hangulCat++;
        fillIndex [hangulCat] = 0x2;
}
2277
// Pins the fill index of the category to alphaWeight, registers c through
// AddLetterMap() and then registers every code point that latinMap lists
// for c (if any) in the same way.
private void AddAlphaMap (char c, byte category, byte alphaWeight)
{
        fillIndex [category] = alphaWeight;
        AddLetterMap (c, category, 0);

        ArrayList related = latinMap [c] as ArrayList;
        if (related != null) {
                foreach (int codePoint in related)
                        AddLetterMap ((char) codePoint, category, 0);
        }
}
2291
// Registers `voices` consecutive code points starting at i in category
// 0x22, each one followed by the code point 0x60 above it (the original
// comments label the pair Hiragana / Katakana). The first code point gets
// a level 2 argument of 0; the one at offset b > 0 gets b + 2.
private void AddKanaMap (int i, byte voices)
{
        const byte category = 0x22;

        for (int offset = 0; offset < voices; offset++) {
                byte level2 = offset == 0 ? (byte) 0 : (byte) (offset + 2);
                char hiragana = (char) (i + offset);
                char katakana = (char) (hiragana + 0x60);

                AddLetterMapCore (hiragana, category, 0, level2);
                AddLetterMapCore (katakana, category, 0, level2);
        }
}
2303
// Shorthand for AddLetterMapCore() with a level 2 argument of zero.
private void AddLetterMap (char c, byte category, byte updateCount)
{
        const byte noLevel2 = 0;
        AddLetterMapCore (c, category, updateCount, noLevel2);
}
2308
// Registers a letter and the characters derived from it, in this order:
//   1. the value ToSmallForm() yields for c, when it differs from c
//      (registered as a group; this step may advance the fill index);
//   2. the invariant-culture lowercase of c, when it differs from c and
//      is not mapped yet (recursively, with an update count of 0);
//   3. c itself, unless it is ignorable or already mapped.
// The fill index of the category is advanced by updateCount only when
// step 3 actually registered c.
private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
{
        // <small> updates index
        char fromSmall = ToSmallForm (c);
        if (fromSmall != c)
                AddCharMapGroup (fromSmall, category, updateCount, level2);

        char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
        if (lower != c && !map [(int) lower].Defined)
                AddLetterMapCore (lower, category, 0, level2);

        // Nothing more to do for characters that cannot take a new entry.
        if (IsIgnorable ((int) c) || map [(int) c].Defined)
                return;

        AddCharMapGroup (c, category, 0, level2);
        fillIndex [category] += updateCount;
}
2327
// Shorthand for the four-argument AddCharMap() with `alt` set to zero.
// Returns true when an entry was actually stored for c.
private bool AddCharMap (char c, byte category, byte increment)
{
        const byte noAlt = 0;
        return AddCharMap (c, category, increment, noAlt);
}
2332                 
// Stores a map entry for c unless c is ignorable or already mapped, then
// advances the fill index of the category by `increment`.
//
// For every category except 1 the entry is built from
// (category, current fill index, alt). Category 1 swaps the last two, so
// the fill index goes into the third constructor argument and alt into
// the second.
//
// Returns true when an entry was stored, false when nothing was done.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
        int cp = (int) c;
        if (IsIgnorable (cp) || map [cp].Defined)
                return false; // do nothing

        if (category == 1)
                map [cp] = new CharMapEntry (category, alt, fillIndex [category]);
        else
                map [cp] = new CharMapEntry (category, fillIndex [category], alt);

        fillIndex [category] += increment;
        return true;
}
2343
// Registers c preceded by the value ToSmallFormTail() yields for it (when
// different), then repeats the same steps for the value ToFullWidthTail()
// yields, for as long as that value differs from the character just
// processed. Every registration uses a level 2 argument of zero.
private void AddCharMapGroupTail (char c, byte category, byte updateCount)
{
        char current = c;
        while (true) {
                char small = ToSmallFormTail (current);
                if (small != current)
                        AddCharMap (small, category, updateCount, 0);

                // itself
                AddCharMap (current, category, updateCount, 0);

                // <full>
                char full = ToFullWidthTail (current);
                if (full == current)
                        break;
                current = full;
        }
}
2356
//
// AddCharMapGroup() registers a character and its variants in this
// order ("+" marks the points where the fill index advances):
//      (<small> +)
//      itself
//      <fraction>
//      <full> | <super> | <sub>
//      <circle> | <wide> (| <narrow>)
//      +
//      (vertical +)
//
// level2 is passed through unchanged (it never increases).
//
// sameWeightItems lists the decomposition kinds that are registered
// between "itself" and the index update, i.e. with the same fill index
// as the character itself.
int [] sameWeightItems = new int [] {
        DecompositionFraction,
        DecompositionFull,
        DecompositionSuper,
        DecompositionSub,
        DecompositionCircle,
        DecompositionWide,
        DecompositionNarrow,
        };
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
        int cp = (int) c;
        if (map [cp].Defined)
                return;

        // Look up the <small> and <vertical> variants; char.MinValue
        // stands for "no such variant".
        Hashtable variants = (Hashtable) nfkdMap [cp];
        char smallForm = char.MinValue;
        char verticalForm = char.MinValue;
        if (variants != null) {
                object boxedSmall = variants [(byte) DecompositionSmall];
                if (boxedSmall != null)
                        smallForm = (char) ((int) boxedSmall);
                object boxedVertical = variants [(byte) DecompositionVertical];
                if (boxedVertical != null)
                        verticalForm = (char) ((int) boxedVertical);
        }

        // <small> updates index
        if (smallForm != char.MinValue)
                AddCharMap (smallForm, category, updateCount);

        // itself
        AddCharMap (c, category, 0, level2);

        // variants registered without advancing the index
        if (variants != null) {
                for (int k = 0; k < sameWeightItems.Length; k++) {
                        object boxed = variants [(byte) sameWeightItems [k]];
                        if (boxed == null)
                                continue;
                        AddCharMap ((char) ((int) boxed), category, 0, level2);
                }
        }

        // update index here.
        fillIndex [category] += updateCount;

        if (verticalForm != char.MinValue)
                AddCharMap (verticalForm, category, updateCount, level2);
}
2416
// Registers c at the current fill index of the category and then moves
// on to the next sequential index (which may also move `category` on to
// the next category).
private void AddCharMapCJK (char c, ref byte category)
{
        AddCharMap (c, category, 0, 0);
        IncrementSequentialIndex (ref category);

        // Special. I wonder why but Windows skips 9E F9.
        bool atSkippedSlot = category == 0x9E && fillIndex [category] == 0xF9;
        if (atSkippedSlot)
                IncrementSequentialIndex (ref category);
}
2426
// Registers c through AddCharMapCJK(), then a few hand-placed characters
// that must directly follow specific ideographs, and finally every
// variant recorded for c in nfkdMap under the keys 0..0x12, except the
// ones that are filtered out below.
private void AddCharMapGroupCJK (char c, ref byte category)
{
        AddCharMapCJK (c, ref category);

        // LAMESPEC: on Windows some of the CJK characters in 3200-32B0
        // are incorrectly mapped: Chinese and Japanese Kanji are mixed
        // when those characters are ordered. They are placed by hand
        // here, right after the ideograph they follow.
        switch (c) {
        case '\u52DE':
                AddCharMapCJK ('\u3298', ref category);
                AddCharMapCJK ('\u3238', ref category);
                break;
        case '\u5BEB':
                AddCharMapCJK ('\u32A2', ref category);
                break;
        case '\u91AB':
                // Especially this mapping order totally does
                // not make sense to me.
                AddCharMapCJK ('\u32A9', ref category);
                break;
        }

        Hashtable variants = (Hashtable) nfkdMap [(int) c];
        if (variants == null)
                return;

        // The table is keyed by byte, so the loop variable stays a byte.
        for (byte kind = 0; kind <= 0x12; kind++) {
                object boxed = variants [kind];
                if (boxed == null)
                        continue;
                int variant = (int) boxed;

                // Special: they are ignored in this area.
                // FIXME: check if it is sane
                bool inSkippedRange = 0xF900 <= variant && variant <= 0xFAD9;

                // Already placed by hand above.
                bool placedByHand =
                        variant == 0x32A2 || variant == 0x3298 ||
                        variant == 0x3238 || variant == 0x32A9;

                if (!inSkippedRange && !placedByHand)
                        AddCharMapCJK ((char) variant, ref category);
        }
}
2468
// For now it is only for 0x7 category.
//
// Registers, each with its own index increment of updateCount:
//   1. the <small> variant of c (except U+2024),
//   2. c itself,
//   3. every code point whose decomposition is exactly the single
//      character c and whose decomposition type is DecompositionCompat,
//   4. the <vertical> variant of c (except U+FE33 and U+FE34).
private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
{
        // char.MinValue stands for "no such variant".
        Hashtable variants = (Hashtable) nfkdMap [(int) c];
        char smallForm = char.MinValue;
        char verticalForm = char.MinValue;
        if (variants != null) {
                object boxedSmall = variants [(byte) DecompositionSmall];
                if (boxedSmall != null)
                        smallForm = (char) ((int) boxedSmall);
                object boxedVertical = variants [(byte) DecompositionVertical];
                if (boxedVertical != null)
                        verticalForm = (char) ((int) boxedVertical);
        }

        // <small> updates index
        // SPECIAL CASE excluded (FIXME: why?)
        bool addSmall = smallForm != char.MinValue && smallForm != '\u2024';
        if (addSmall)
                AddCharMap (smallForm, category, updateCount);

        // itself
        AddCharMap (c, category, updateCount, level2);

        // Since nfkdMap is problematic to have two or more
        // NFKD to an identical character, here I iterate all.
        // NOTE(review): the scan stops at U+FFFE (strict "<"), unlike
        // other full-range loops in this file - confirm it is intended.
        int target = (int) c;
        for (int candidate = 0; candidate < char.MaxValue; candidate++) {
                if (decompLength [candidate] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [candidate]]) != target)
                        continue;
                if (decompType [candidate] == DecompositionCompat)
                        AddCharMap ((char) candidate, category, updateCount, level2);
        }

        // SPECIAL CASE excluded (FIXME: why?)
        bool addVertical = verticalForm != char.MinValue &&
                verticalForm != '\uFE33' && verticalForm != '\uFE34';
        if (addVertical)
                AddCharMap (verticalForm, category, updateCount, level2);
}
2511
// Returns the first decomposed value of c when the decomposition type
// of c is DecompositionFull; otherwise returns c unchanged.
char ToFullWidth (char c)
{
        const bool useLastValue = false;
        return ToDecomposed (c, DecompositionFull, useLastValue);
}
2516
// Returns the last decomposed value of c when the decomposition type
// of c is DecompositionFull; otherwise returns c unchanged.
char ToFullWidthTail (char c)
{
        const bool useLastValue = true;
        return ToDecomposed (c, DecompositionFull, useLastValue);
}
2521
// Returns the first decomposed value of c when the decomposition type
// of c is DecompositionSmall; otherwise returns c unchanged.
char ToSmallForm (char c)
{
        const bool useLastValue = false;
        return ToDecomposed (c, DecompositionSmall, useLastValue);
}
2526
// Returns the last decomposed value of c when the decomposition type
// of c is DecompositionSmall; otherwise returns c unchanged.
char ToSmallFormTail (char c)
{
        const bool useLastValue = true;
        return ToDecomposed (c, DecompositionSmall, useLastValue);
}
2531
// Maps c to one value of its decomposition, but only when the
// decomposition type recorded for c equals d; any other character is
// returned as is. With tail == false the first decomposed value is
// returned, with tail == true the last one.
char ToDecomposed (char c, byte d, bool tail)
{
        int cp = (int) c;
        if (decompType [cp] == d) {
                int position = decompIndex [cp];
                if (tail)
                        position += decompLength [cp] - 1;
                return (char) decompValues [position];
        }
        return c;
}
2541
// Tells whether any entry of jisJapanese has the code point cp.
bool ExistsJIS (int cp)
{
        bool found = false;
        foreach (JISCharacter entry in jisJapanese) {
                if (entry.CP == cp) {
                        found = true;
                        break;
                }
        }
        return found;
}
2549
2550                 #endregion
2551
2552                 #region Level 3 properties (Case/Width)
2553
// Returns the level 3 weight of c as it is stored in a sort key: zero
// stays zero, any other raw weight is shifted up by 2.
private byte ComputeLevel3Weight (char c)
{
        byte raw = ComputeLevel3WeightRaw (c);
        if (raw == 0)
                return raw;
        return (byte) (raw + 2);
}
2559
// Computes the level 3 weight of c before the sort key offset is applied
// (ComputeLevel3Weight() adds 2 to every non-zero result).
//
// A number of fixed ranges and single characters return a hard-coded
// weight right away. Everything else gets a weight combined from flags:
// the decomposition type for <wide>/<sub>/<super>, 8 for small capitals
// and 0x10 for uppercase characters.
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
        int cp = (int) c;

        // Korean
        if (cp >= 0x11A8 && cp <= 0x11F9)
                return 2;
        if (cp >= 0xFFA0 && cp <= 0xFFDC)
                return 4;
        if (cp >= 0x3130 && cp <= 0x3164)
                return 5;

        // numbers
        if (cp >= 0x2776 && cp <= 0x277F)
                return 4;
        if (cp >= 0x2780 && cp <= 0x2789)
                return 8;
        // Only 278A..2793 can get here: 2776..2789 returned above.
        if (cp >= 0x278A && cp <= 0x2793)
                return 0xC;
        if ((cp >= 0x2160 && cp <= 0x216F) || cp == 0x2181 || cp == 0x2182)
                return 0x18;

        // U+2135..U+2138 (the Hebrew-derived letterlike symbols)
        if (cp >= 0x2135 && cp <= 0x2138)
                return 4;

        // Arabic: 2(Isolated)/8(Final)/0x18(Medial). Note that the upper
        // bound U+FE8E is exclusive, and that any other decomposition
        // type falls through to the generic handling below.
        if (cp >= 0xFE80 && cp < 0xFE8E) {
                if (decompType [cp] == DecompositionIsolated)
                        return 2;
                if (decompType [cp] == DecompositionFinal)
                        return 8;
                if (decompType [cp] == DecompositionMedial)
                        return 0x18;
        }

        byte ret = 0;
        switch (cp) {
        // actually I dunno the reason why they have weights.
        case 0x01BC:
                return 0x10;
        case 0x06A9:
                return 0x20;
        case 0x06AA:
                return 0x28;
        // These only seed the flags; the generic bits below still apply.
        case 0x03C2:
        case 0x2104:
        case 0x212B:
                ret = 8;
                break;
        case 0xFE42:
                ret = 0xC;
                break;
        }

        // misc: <wide>, <sub> and <super> contribute their type value.
        if (decompType [cp] == DecompositionWide ||
                decompType [cp] == DecompositionSub ||
                decompType [cp] == DecompositionSuper)
                ret |= decompType [cp];

        if (isSmallCapital [cp]) // grep "SMALL CAPITAL"
                ret |= 8;
        if (isUppercase [cp]) // DerivedCoreProperties
                ret |= 0x10;

        return ret;
}
2632
2633                 #endregion
2634
2635                 #region IsIgnorable
2636 /*
2637                 static bool IsIgnorable (int i)
2638                 {
2639                         if (unicodeAge [i] >= 3.1)
2640                                 return true;
2641                         switch (char.GetUnicodeCategory ((char) i)) {
2642                         case UnicodeCategory.OtherNotAssigned:
2643                         case UnicodeCategory.Format:
2644                                 return true;
2645                         }
2646                         return false;
2647                 }
2648 */
2649
// FIXME: In the future use DerivedAge.txt to examine character
// versions and set those ones that have higher version than
// 1.0 as ignorable.

// Inclusive (first, last) code point pairs that IsIgnorable() reports as
// ignorable, unless the code point is one of the explicit exceptions
// handled before the table is consulted.
static readonly int [] ignorableCodePointRanges = new int [] {
        0x0D82, 0x0DF4, // The whole Sinhala characters.
        0x0F00, 0x0FD1, // The whole Tibetan characters.
        0x1000, 0x1059, // The whole Myanmar characters.
        // The whole Ethiopic, Cherokee, Canadian Syllabics, Ogham,
        // Runic, Tagalog, Hanunoo, Philippine, Buhid, Tagbanwa,
        // Khmer and Mongolian characters.
        0x1200, 0x1DFF,
        0x1F00, 0x1FFF, // Greek extension characters.
        0x2800, 0x28FF, // The whole Braille characters.
        0x2E80, 0x2EF3, // CJK radical characters.
        0x2F00, 0x2FD5, // Kangxi radical characters.
        0x2FF0, 0x2FFB, // Ideographic description characters.
        0x31A0, 0x31B7, // Bopomofo letter and final
        0x25F0, 0x25F7, // White square with quadrant characters.
        0x32C0, 0x32CB, // Ideographic telegraph symbols.
        0x3358, 0x3370,
        0x33E0, 0x33FF,
        0xA000, 0xA48C, // The whole YI characters.
        0xA490, 0xA4C6,
        0xFB13, 0xFB17, // Armenian small ligatures
        0xFB1D, 0xFE2F, // hebrew, arabic, variation selector.
        0xFEF5, 0xFEFC, // Arabic ligatures.
        // FIXME: why are they excluded?
        0x01F6, 0x01F9,
        0x0218, 0x0233,
        0x02A9, 0x02AD,
        0x02EA, 0x02EE,
        0x0349, 0x036F,
        0x0488, 0x048F,
        0x04D0, 0x04FF,
        0x0500, 0x050F, // actually it matters only for 2.0
        0x06D6, 0x06ED,
        0x06FA, 0x06FE,
        0x2048, 0x204D,
        0x20E4, 0x20EA,
        0x213C, 0x214B,
        0x21EB, 0x21FF,
        0x22F2, 0x22FF,
        0x237B, 0x239A,
        0x239B, 0x23CF,
        0x24EB, 0x24FF,
        0x2596, 0x259F,
        0x25F8, 0x25FF,
        0x2672, 0x2689,
        0x2768, 0x2775,
        0x27D0, 0x27FF,
        0x2900, 0x2AFF,
        0x3033, 0x303F,
        0x31F0, 0x31FF,
        0x3250, 0x325F,
        0x32B1, 0x32BF,
        0x3371, 0x337B,
        0xFA30, 0xFA6A,
};

// Tells whether the code point i is ignorable. The decision is made in
// three steps, and the first step that matches wins:
//   1. explicit single code points (forced ignorable / forced not
//      ignorable),
//   2. the ranges in ignorableCodePointRanges,
//   3. the Unicode category: Format and OtherNotAssigned are ignorable,
//      everything else (including PrivateUse and Surrogate) is not.
static bool IsIgnorable (int i)
{
        switch (i) {
        case 0:
        // I guess, those characters are added between
        // Unicode 1.0 (LCMapString) and Unicode 3.1
        // (UnicodeCategory), so they used to be
        // something like OtherNotAssigned as of Unicode 1.1.
        case 0x2df:
        case 0x387:
        case 0x3d7: case 0x3d8: case 0x3d9:
        case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
        case 0x400: case 0x40d:
        case 0x450: case 0x45d:
        case 0x587: case 0x58a:
        case 0x5c4:
        case 0x640:
        case 0x653: case 0x654: case 0x655:
        case 0x66d:
        case 0xb56:
        case 0x1e9b:
        case 0x202f:
        case 0x20ad: case 0x20ae: case 0x20af:
        case 0x20e2: case 0x20e3:
        case 0x2139: case 0x213a:
        case 0x2183:
        case 0x2425: case 0x2426:
        case 0x2619:
        case 0x2670: case 0x2671:
        case 0x3007:
        case 0x3190: case 0x3191:
        case 0xfffc: case 0xfffd:
                return true;
        // Exceptions to the ranges checked below: those ranges are
        // not entirely correct (these characters must not be
        // ignored), and unfortunately most of the exceptional
        // characters sit inside them.
        case 0x4d8: case 0x4d9:
        case 0x4e8: case 0x4e9:
        case 0x3036: case 0x303f:
        case 0x337b:
        case 0xfb1e:
                return false;
        }

        // The table holds pairs, hence the step of two.
        for (int k = 0; k < ignorableCodePointRanges.Length; k += 2) {
                if (ignorableCodePointRanges [k] <= i &&
                        i <= ignorableCodePointRanges [k + 1])
                        return true;
        }

        // ignored by nature
        UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
        return uc == UnicodeCategory.Format ||
                uc == UnicodeCategory.OtherNotAssigned;
}
2776
2777                 // To check IsIgnorable sanity, try the driver below under MS.NET.
2778
2779                 /*
2780                 public static void Main ()
2781                 {
2782                         for (int i = 0; i <= char.MaxValue; i++)
2783                                 Dump (i, IsIgnorable (i));
2784                 }
2785
2786                 static void Dump (int i, bool ignore)
2787                 {
2788                         switch (Char.GetUnicodeCategory ((char) i)) {
2789                         case UnicodeCategory.PrivateUse:
2790                         case UnicodeCategory.Surrogate:
2791                                 return; // check nothing
2792                         }
2793
2794                         string s1 = "";
2795                         string s2 = new string ((char) i, 10);
2796                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2797                         if ((ret == 0) == ignore)
2798                                 return;
2799                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2800                 }
2801                 */
2802                 #endregion // IsIgnorable
2803
2804                 #region IsIgnorableSymbol
2805                 static bool IsIgnorableSymbol (int i)
2806                 {
2807                         if (IsIgnorable (i))
2808                                 return true;
2809
2810                         switch (i) {
2811                         // *Letter
2812                         case 0x00b5: case 0x01C0: case 0x01C1:
2813                         case 0x01C2: case 0x01C3: case 0x01F6:
2814                         case 0x01F7: case 0x01F8: case 0x01F9:
2815                         case 0x02D0: case 0x02EE: case 0x037A:
2816                         case 0x03D7: case 0x03F3:
2817                         case 0x0400: case 0x040d:
2818                         case 0x0450: case 0x045d:
2819                         case 0x048C: case 0x048D:
2820                         case 0x048E: case 0x048F:
2821                         case 0x0587: case 0x0640: case 0x06E5:
2822                         case 0x06E6: case 0x06FA: case 0x06FB:
2823                         case 0x06FC: case 0x093D: case 0x0950:
2824                         case 0x1E9B: case 0x2139: case 0x3006:
2825                         case 0x3033: case 0x3034: case 0x3035:
2826                         case 0xFE7E: case 0xFE7F:
2827                         // OtherNumber
2828                         case 0x16EE: case 0x16EF: case 0x16F0:
2829                         // LetterNumber
2830                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2831                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2832                         case 0x3038: // HANGZHOU NUMERAL TEN
2833                         case 0x3039: // HANGZHOU NUMERAL TWENTY
2834                         case 0x303a: // HANGZHOU NUMERAL THIRTY
2835                         // OtherSymbol
2836                         case 0x2117:
2837                         case 0x327F:
2838                                 return true;
2839                         // ModifierSymbol
2840                         case 0x02B9: case 0x02BA: case 0x02C2:
2841                         case 0x02C3: case 0x02C4: case 0x02C5:
2842                         case 0x02C8: case 0x02CC: case 0x02CD:
2843                         case 0x02CE: case 0x02CF: case 0x02D2:
2844                         case 0x02D3: case 0x02D4: case 0x02D5:
2845                         case 0x02D6: case 0x02D7: case 0x02DE:
2846                         case 0x02E5: case 0x02E6: case 0x02E7:
2847                         case 0x02E8: case 0x02E9:
2848                         case 0x309B: case 0x309C:
2849                         // OtherPunctuation
2850                         case 0x055A: // American Apos
2851                         case 0x05C0: // Hebrew Punct
2852                         case 0x0E4F: // Thai FONGMAN
2853                         case 0x0E5A: // Thai ANGKHANKHU
2854                         case 0x0E5B: // Thai KHOMUT
2855                         // CurencySymbol
2856                         case 0x09F2: // Bengali Rupee Mark
2857                         case 0x09F3: // Bengali Rupee Sign
2858                         // MathSymbol
2859                         case 0x221e: // INF.
2860                         // OtherSymbol
2861                         case 0x0482:
2862                         case 0x09FA:
2863                         case 0x0B70:
2864                                 return false;
2865                         }
2866
2867                         // *Letter
2868                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2869 #if NET_2_0
2870                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2871                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2872 #endif
2873                         )
2874                                 return true;
2875
2876                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2877                         switch (uc) {
2878                         case UnicodeCategory.Surrogate:
2879                                 return false; // inconsistent
2880
2881                         case UnicodeCategory.SpacingCombiningMark:
2882                         case UnicodeCategory.EnclosingMark:
2883                         case UnicodeCategory.NonSpacingMark:
2884                         case UnicodeCategory.PrivateUse:
2885                                 // NonSpacingMark
2886                                 if (0x064B <= i && i <= 0x0652) // Arabic
2887                                         return true;
2888                                 return false;
2889
2890                         case UnicodeCategory.Format:
2891                         case UnicodeCategory.OtherNotAssigned:
2892                                 return true;
2893
2894                         default:
2895                                 bool use = false;
2896                                 // OtherSymbols
2897                                 if (
2898                                         // latin in a circle
2899                                         0x249A <= i && i <= 0x24E9
2900                                         || 0x2100 <= i && i <= 0x2132
2901                                         // Japanese
2902                                         || 0x3196 <= i && i <= 0x31A0
2903                                         // Korean
2904                                         || 0x3200 <= i && i <= 0x321C
2905                                         // Chinese/Japanese
2906                                         || 0x322A <= i && i <= 0x3243
2907                                         // CJK
2908                                         || 0x3260 <= i && i <= 0x32B0
2909                                         || 0x32D0 <= i && i <= 0x3357
2910                                         || 0x337B <= i && i <= 0x33DD
2911                                 )
2912                                         use = !Char.IsLetterOrDigit ((char) i);
2913                                 if (use)
2914                                         return false;
2915
2916                                 // This "Digit" rule is mystery.
2917                                 // It filters some symbols out.
2918                                 if (Char.IsLetterOrDigit ((char) i))
2919                                         return false;
2920                                 if (Char.IsNumber ((char) i))
2921                                         return false;
2922                                 if (Char.IsControl ((char) i)
2923                                         || Char.IsSeparator ((char) i)
2924                                         || Char.IsPunctuation ((char) i))
2925                                         return true;
2926                                 if (Char.IsSymbol ((char) i))
2927                                         return true;
2928
2929                                 // FIXME: should check more
2930                                 return false;
2931                         }
2932                 }
2933
2934                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
2935 /*
2936                 public static void Main ()
2937                 {
2938                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2939                         for (int i = 0; i <= char.MaxValue; i++) {
2940                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2941                                 if (uc == UnicodeCategory.Surrogate)
2942                                         continue;
2943
2944                                 bool ret = IsIgnorableSymbol (i);
2945
2946                                 string s1 = "TEST ";
2947                                 string s2 = "TEST " + (char) i;
2948
2949                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2950
2951                                 if (ret != (result == 0))
2952                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2953                                                 ret ? "should not ignore" :
2954                                                         "should ignore",
2955                                                 i,(char) i, uc);
2956                         }
2957                 }
2958 */
2959                 #endregion
2960
2961                 #region NonSpacing
2962                 static bool IsIgnorableNonSpacing (int i)
2963                 {
2964                         if (IsIgnorable (i))
2965                                 return true;
2966
2967                         switch (i) {
2968                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
2969                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
2970                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
2971                                 return true;
2972                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
2973                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
2974                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
2975                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
2976                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
2977                         case 0x0CCD: case 0x0E4E:
2978                                 return false;
2979                         }
2980
2981                         if (0x02b9 <= i && i <= 0x02c5
2982                                 || 0x02cc <= i && i <= 0x02d7
2983                                 || 0x02e4 <= i && i <= 0x02ef
2984                                 || 0x20DD <= i && i <= 0x20E0
2985                         )
2986                                 return true;
2987
2988                         if (0x064B <= i && i <= 0x00652
2989                                 || 0x0941 <= i && i <= 0x0948
2990                                 || 0x0AC1 <= i && i <= 0x0ACD
2991                                 || 0x0C3E <= i && i <= 0x0C4F
2992                                 || 0x0E31 <= i && i <= 0x0E3F
2993                         )
2994                                 return false;
2995
2996                         return Char.GetUnicodeCategory ((char) i) ==
2997                                 UnicodeCategory.NonSpacingMark;
2998                 }
2999
3000                 // We can reuse IsIgnorableSymbol testcode 
3001                 // for IsIgnorableNonSpacing.
3002                 #endregion
3003         }
3004
3005         struct CharMapEntry
3006         {
3007                 public byte Category;
3008                 public byte Level1;
3009                 public byte Level2; // It is always single byte.
3010                 public bool Defined;
3011
3012                 public CharMapEntry (byte category, byte level1, byte level2)
3013                 {
3014                         Category = category;
3015                         Level1 = level1;
3016                         Level2 = level2;
3017                         Defined = true;
3018                 }
3019         }
3020
3021         class JISCharacter
3022         {
3023                 public readonly int CP;
3024                 public readonly int JIS;
3025
3026                 public JISCharacter (int cp, int cpJIS)
3027                 {
3028                         CP = cp;
3029                         JIS = cpJIS;
3030                 }
3031         }
3032
3033         class JISComparer : IComparer
3034         {
3035                 public static readonly JISComparer Instance =
3036                         new JISComparer ();
3037
3038                 public int Compare (object o1, object o2)
3039                 {
3040                         JISCharacter j1 = (JISCharacter) o1;
3041                         JISCharacter j2 = (JISCharacter) o2;
3042                         return j2.JIS - j1.JIS;
3043                 }
3044         }
3045
3046         class NonJISCharacter
3047         {
3048                 public readonly int CP;
3049                 public readonly string Name;
3050
3051                 public NonJISCharacter (int cp, string name)
3052                 {
3053                         CP = cp;
3054                         Name = name;
3055                 }
3056         }
3057
3058         class NonJISComparer : IComparer
3059         {
3060                 public static readonly NonJISComparer Instance =
3061                         new NonJISComparer ();
3062
3063                 public int Compare (object o1, object o2)
3064                 {
3065                         NonJISCharacter j1 = (NonJISCharacter) o1;
3066                         NonJISCharacter j2 = (NonJISCharacter) o2;
3067                         return string.CompareOrdinal (j1.Name, j2.Name);
3068                 }
3069         }
3070
3071         class DecimalDictionaryValueComparer : IComparer
3072         {
3073                 public static readonly DecimalDictionaryValueComparer Instance
3074                         = new DecimalDictionaryValueComparer ();
3075
3076                 private DecimalDictionaryValueComparer ()
3077                 {
3078                 }
3079
3080                 public int Compare (object o1, object o2)
3081                 {
3082                         DictionaryEntry e1 = (DictionaryEntry) o1;
3083                         DictionaryEntry e2 = (DictionaryEntry) o2;
3084                         // FIXME: in case of 0, compare decomposition categories
3085                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3086                         if (ret != 0)
3087                                 return ret;
3088                         int i1 = (int) e1.Key;
3089                         int i2 = (int) e2.Key;
3090                         return i1 - i2;
3091                 }
3092         }
3093
3094         class StringDictionaryValueComparer : IComparer
3095         {
3096                 public static readonly StringDictionaryValueComparer Instance
3097                         = new StringDictionaryValueComparer ();
3098
3099                 private StringDictionaryValueComparer ()
3100                 {
3101                 }
3102
3103                 public int Compare (object o1, object o2)
3104                 {
3105                         DictionaryEntry e1 = (DictionaryEntry) o1;
3106                         DictionaryEntry e2 = (DictionaryEntry) o2;
3107                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3108                         if (ret != 0)
3109                                 return ret;
3110                         int i1 = (int) e1.Key;
3111                         int i2 = (int) e2.Key;
3112                         return i1 - i2;
3113                 }
3114         }
3115
3116         class UCAComparer : IComparer
3117         {
3118                 public static readonly UCAComparer Instance
3119                         = new UCAComparer ();
3120
3121                 private UCAComparer ()
3122                 {
3123                 }
3124
3125                 public int Compare (object o1, object o2)
3126                 {
3127                         char i1 = (char) o1;
3128                         char i2 = (char) o2;
3129
3130                         int l1 = CollationElementTable.GetSortKeyCount (i1);
3131                         int l2 = CollationElementTable.GetSortKeyCount (i2);
3132                         int l = l1 > l2 ? l2 : l1;
3133
3134                         for (int i = 0; i < l; i++) {
3135                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3136                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3137                                 int v = k1.Primary - k2.Primary;
3138                                 if (v != 0)
3139                                         return v;
3140                                 v = k1.Secondary - k2.Secondary;
3141                                 if (v != 0)
3142                                         return v;
3143                                 v = k1.Thirtiary - k2.Thirtiary;
3144                                 if (v != 0)
3145                                         return v;
3146                                 v = k1.Quarternary - k2.Quarternary;
3147                                 if (v != 0)
3148                                         return v;
3149                         }
3150                         return l1 - l2;
3151                 }
3152         }
3153
3154         class Tailoring
3155         {
3156                 int lcid;
3157                 int alias;
3158                 bool frenchSort;
3159                 ArrayList items = new ArrayList ();
3160
3161                 public Tailoring (int lcid)
3162                         : this (lcid, 0)
3163                 {
3164                 }
3165
3166                 public Tailoring (int lcid, int alias)
3167                 {
3168                         this.lcid = lcid;
3169                         this.alias = alias;
3170                 }
3171
3172                 public int LCID {
3173                         get { return lcid; }
3174                 }
3175
3176                 public int Alias {
3177                         get { return alias; }
3178                 }
3179
3180                 public bool FrenchSort {
3181                         get { return frenchSort; }
3182                         set { frenchSort = value; }
3183                 }
3184
3185                 public void AddDiacriticalMap (byte target, byte replace)
3186                 {
3187                         items.Add (new DiacriticalMap (target, replace));
3188                 }
3189
3190                 public void AddSortKeyMap (string source, byte [] sortkey)
3191                 {
3192                         items.Add (new SortKeyMap (source, sortkey));
3193                 }
3194
3195                 public void AddReplacementMap (string source, string replace)
3196                 {
3197                         items.Add (new ReplacementMap (source, replace));
3198                 }
3199
3200                 public char [] ItemToCharArray ()
3201                 {
3202                         ArrayList al = new ArrayList ();
3203                         foreach (ITailoringMap m in items)
3204                                 al.AddRange (m.ToCharArray ());
3205                         return al.ToArray (typeof (char)) as char [];
3206                 }
3207
3208                 interface ITailoringMap
3209                 {
3210                         char [] ToCharArray ();
3211                 }
3212
3213                 class DiacriticalMap : ITailoringMap
3214                 {
3215                         public readonly byte Target;
3216                         public readonly byte Replace;
3217
3218                         public DiacriticalMap (byte target, byte replace)
3219                         {
3220                                 Target = target;
3221                                 Replace = replace;
3222                         }
3223
3224                         public char [] ToCharArray ()
3225                         {
3226                                 char [] ret = new char [3];
3227                                 ret [0] = (char) 02; // kind:DiacriticalMap
3228                                 ret [1] = (char) Target;
3229                                 ret [2] = (char) Replace;
3230                                 return ret;
3231                         }
3232                 }
3233
3234                 class SortKeyMap : ITailoringMap
3235                 {
3236                         public readonly string Source;
3237                         public readonly byte [] SortKey;
3238
3239                         public SortKeyMap (string source, byte [] sortkey)
3240                         {
3241                                 Source = source;
3242                                 SortKey = sortkey;
3243                         }
3244
3245                         public char [] ToCharArray ()
3246                         {
3247                                 char [] ret = new char [Source.Length + 7];
3248                                 ret [0] = (char) 01; // kind:SortKeyMap
3249                                 for (int i = 0; i < Source.Length; i++)
3250                                         ret [i + 1] = Source [i];
3251                                 // null terminate
3252                                 for (int i = 0; i < 5; i++)
3253                                         ret [i + Source.Length + 2] = (char) SortKey [i];
3254                                 return ret;
3255                         }
3256                 }
3257
3258                 class ReplacementMap : ITailoringMap
3259                 {
3260                         public readonly string Source;
3261                         public readonly string Replace;
3262
3263                         public ReplacementMap (string source, string replace)
3264                         {
3265                                 Source = source;
3266                                 Replace = replace;
3267                         }
3268
3269                         public char [] ToCharArray ()
3270                         {
3271                                 char [] ret = new char [Source.Length + Replace.Length + 3];
3272                                 ret [0] = (char) 03; // kind:ReplaceMap
3273                                 int pos = 1;
3274                                 for (int i = 0; i < Source.Length; i++)
3275                                         ret [pos++] = Source [i];
3276                                 // null terminate
3277                                 pos++;
3278                                 for (int i = 0; i < Replace.Length; i++)
3279                                         ret [pos++] = Replace [i];
3280                                 // null terminate
3281                                 return ret;
3282                         }
3283                 }
3284         }
3285 }