mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those that are computed and those that
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, a separate index table should be prepared for composite characters.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27
  28 using System;
  29 using System.IO;
  30 using System.Collections;
  31 using System.Globalization;
  32 using System.Text;
  33 using System.Xml;
  34
  35 namespace Mono.Globalization.Unicode
  36 {
  37         internal class MSCompatSortKeyTableGenerator
  38         {
  39                 public static void Main (string [] args)
  40                 {
  41                         new MSCompatSortKeyTableGenerator ().Run (args);
  42                 }
  43
                     // Decomposition type codes. Serialize () compares decompType []
                     // entries against Narrow, Wide, Super and Sub to decide which
                     // code points receive a widthCompat mapping.
  44                 const int DecompositionWide = 1; // fixed
  45                 const int DecompositionSub = 2; // fixed
  46                 const int DecompositionSmall = 3;
  47                 const int DecompositionIsolated = 4;
  48                 const int DecompositionInitial = 5;
  49                 const int DecompositionFinal = 6;
  50                 const int DecompositionMedial = 7;
  51                 const int DecompositionNoBreak = 8;
  52                 const int DecompositionVertical = 9;
  53                 const int DecompositionFraction = 0xA;
  54                 const int DecompositionFont = 0xB;
  55                 const int DecompositionSuper = 0xC; // fixed
  56                 const int DecompositionFull = 0xE; // note: declared ahead of Narrow (0xD)
  57                 const int DecompositionNarrow = 0xD;
  58                 const int DecompositionCircle = 0xF;
  59                 const int DecompositionSquare = 0x10;
  60                 const int DecompositionCompat = 0x11;
  61                 const int DecompositionCanonical = 0x12;
  62
                     // Destination of the generated source text; defaults to
                     // standard output.
  63                 TextWriter Result = Console.Out;
  64
  65                 byte [] fillIndex = new byte [256]; // by category
                     // One entry per UTF-16 code unit. Serialize () reads Category,
                     // Level1 and Level2 from these entries.
  66                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  67
  68                 char [] specialIgnore = new char [] {
  69                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  70                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  71                         };
  72
  73                 // FIXME: need more love (as always)
                     // alphabets and alphaWeights both hold 29 entries; presumably
                     // alphaWeights [n] is the weight of alphabets [n] -- TODO confirm.
  74                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  75                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  76                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  77                         '\u0292', '\u01BE', '\u0298'};
  78                 byte [] alphaWeights = new byte [] {
  79                         2, 9, 0xA, 0x1A, 0x21,
  80                         0x23, 0x25, 0x2C, 0x32, 0x35,
  81                         0x36, 0x48, 0x51, 0x70, 0x7C,
  82                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  83                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  84                         0xA9, 0xAA, 0xB3, 0xB4};
  85
                     // Per-code-unit flag tables (one slot for each char value).
  86                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  87                 bool [] isUppercase = new bool [char.MaxValue + 1];
  88
                     // Decomposition data per code unit. Serialize () switches on
                     // decompType [i] and reads decompValues [decompIndex [i]].
                     // decompValues has no initializer here.
  89                 byte [] decompType = new byte [char.MaxValue + 1];
  90                 int [] decompIndex = new int [char.MaxValue + 1];
  91                 int [] decompLength = new int [char.MaxValue + 1];
  92                 int [] decompValues;
  93                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  94
  95                 byte [] diacritical = new byte [char.MaxValue + 1];
  96
                     // Character-name fragments and, below, a table of byte weights.
                     // NOTE(review): diacritics holds 74 entries while diacriticWeights
                     // holds 75 (four 0xF3 values against three PARENTHESIZED names);
                     // confirm whether the two tables are meant to be parallel.
  97                 string [] diacritics = new string [] {
  98                         // LATIN
  99                         "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
 100                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 101                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
 102                         " OGONEK;", " CEDILLA;",
 103                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 104                         " STROKE;", " CIRCUMFLEX AND ACUTE;",
 105                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 106                         " DIAERESIS AND GRAVE;",
 107                         " BREVE AND ACUTE;",
 108                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 109                         " MACRON AND ACUTE;",
 110                         " MACRON AND GRAVE;",
 111                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 112                         " RING ABOVE AND ACUTE",
 113                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 114                         " CIRCUMFLEX AND TILDE",
 115                         " TILDE AND DIAERESIS",
 116                         " STROKE AND ACUTE",
 117                         " BREVE AND TILDE",
 118                         " CEDILLA AND BREVE",
 119                         " OGONEK AND MACRON",
 120                         " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 121                         " DOUBLE GRAVE;",
 122                         " INVERTED BREVE",
 123                         " PRECEDED BY APOSTROPHE",
 124                         " HORN;",
 125                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 126                         " PALATAL HOOK",
 127                         " DOT BELOW;",
 128                         " RETROFLEX;", "DIAERESIS BELOW",
 129                         " RING BELOW",
 130                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 131                         " BREVE BELOW;", " HORN AND GRAVE",
 132                         " TILDE BELOW",
 133                         " DOT BELOW AND DOT ABOVE",
 134                         " RIGHT HALF RING", " HORN AND TILDE",
 135                         " CIRCUMFLEX AND DOT BELOW",
 136                         " BREVE AND DOT BELOW",
 137                         " DOT BELOW AND MACRON",
 138                         " HORN AND HOOK ABOVE",
 139                         " HORN AND DOT",
 140                         // CIRCLED, PARENTHESIZED and so on
 141                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
 142                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 143                         };
 144                 byte [] diacriticWeights = new byte [] {
 145                         // LATIN.
 146                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 147                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
 148                         0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 149                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 150                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 151                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 152                         0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
 153                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 154                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
 155                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 156                         0x95, 0xAA,
 157                         // CIRCLED, PARENTHESIZED and so on.
 158                         0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
 159                         };
 160
                     // 28 ascending code point values; presumably consecutive
                     // (start, end) pairs delimiting digit ranges -- TODO confirm
                     // against the code that consumes this table.
 161                 int [] numberSecondaryWeightBounds = new int [] {
 162                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 163                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 164                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 165                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 166                         0xE50, 0xE60, 0xED0, 0xEE0
 167                         };
 168
                     // Per-script character orderings; declared without initializers.
 169                 char [] orderedCyrillic;
 170                 char [] orderedGurmukhi;
 171                 char [] orderedGujarati;
 172                 char [] orderedGeorgian;
 173                 char [] orderedThaana;
 174
 175                 static readonly char [] orderedTamilConsonants = new char [] {
 176                         // Based on the traditional order of Tamil consonants, except
 177                         // for Grantha (where Microsoft departs from tradition).
 178                         // http://www.angelfire.com/empire/thamizh/padanGaL
 179                         '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
 180                         '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
 181                         '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
 182                         '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
 183                         '\u0BB9'};
 184
 185                 // cp -> character name (only for some characters)
 186                 ArrayList sortableCharNames = new ArrayList ();
 187
 188                 // cp -> arrow value (int)
 189                 ArrayList arrowValues = new ArrayList ();
 190
 191                 // cp -> box value (int)
 192                 ArrayList boxValues = new ArrayList ();
 193
 194                 // cp -> level1 value
 195                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 196
 197                 // letterName -> cp
 198                 Hashtable arabicNameMap = new Hashtable ();
 199
 200                 // cp -> Hashtable [decompType] -> cp
 201                 Hashtable nfkdMap = new Hashtable ();
 202
 203                 // Latin letter -> ArrayList [int]
 204                 Hashtable latinMap = new Hashtable ();
 205
 206                 ArrayList jisJapanese = new ArrayList ();
 207                 ArrayList nonJisJapanese = new ArrayList ();
 208
                     // CJK weight tables, one slot per UTF-16 code unit. Serialize ()
                     // compresses each of them and emits it through SerializeCJK ().
 209                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 210                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 211                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 212                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 213                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 214
                     // Compressed and written out as "ignorableFlags" by Serialize ().
 215                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 216
 217                 static double [] unicodeAge = new double [char.MaxValue + 1];
 218
                     // Tailoring objects: ProcessTailoringLine () appends one per '@'
                     // header line and SerializeTailorings () writes them out.
 219                 ArrayList tailorings = new ArrayList ();
 220
 221                 void Run (string [] args)
 222                 {
 223                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 224                         ParseSources (dirname);
 225                         Console.Error.WriteLine ("parse done.");
 226
 227                         ModifyParsedValues ();
 228                         GenerateCore ();
 229                         Console.Error.WriteLine ("generation done.");
 230                         Serialize ();
 231                         Console.Error.WriteLine ("serialization done.");
 232 /*
 233 StreamWriter sw = new StreamWriter ("agelog.txt");
 234 for (int i = 0; i < char.MaxValue; i++) {
 235 bool shouldBe = false;
 236 switch (Char.GetUnicodeCategory ((char) i)) {
 237 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 238         shouldBe = true; break;
 239 }
 240 if (unicodeAge [i] >= 3.1)
 241         shouldBe = true;
 242 //if (IsIgnorable (i) != shouldBe)
 243 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 244 }
 245 sw.Close ();
 246 */
 247                 }
 248
 249                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 250                 {
 251                         return (byte []) CodePointIndexer.CompressArray  (
 252                                 source, typeof (byte), i);
 253                 }
 254
 255                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 256                 {
 257                         return (ushort []) CodePointIndexer.CompressArray  (
 258                                 source, typeof (ushort), i);
 259                 }
 260
 261                 void Serialize ()
 262                 {
 263                         // Tailorings
 264                         SerializeTailorings ();
 265
 266                         byte [] categories = new byte [map.Length];
 267                         byte [] level1 = new byte [map.Length];
 268                         byte [] level2 = new byte [map.Length];
 269                         byte [] level3 = new byte [map.Length];
 270                         int [] widthCompat = new int [map.Length];
 271                         for (int i = 0; i < map.Length; i++) {
 272                                 categories [i] = map [i].Category;
 273                                 level1 [i] = map [i].Level1;
 274                                 level2 [i] = map [i].Level2;
 275                                 level3 [i] = ComputeLevel3Weight ((char) i);
 276                                 switch (decompType [i]) {
 277                                 case DecompositionNarrow:
 278                                 case DecompositionWide:
 279                                 case DecompositionSuper:
 280                                 case DecompositionSub:
 281                                         // they are always 1 char
 282                                         widthCompat [i] = decompValues [decompIndex [i]];
 283                                         break;
 284                                 }
 285                         }
 286
 287                         // compress
 288                         ignorableFlags = CompressArray (ignorableFlags,
 289                                 MSCompatUnicodeTableUtil.Ignorable);
 290                         categories = CompressArray (categories,
 291                                 MSCompatUnicodeTableUtil.Category);
 292                         level1 = CompressArray (level1,
 293                                 MSCompatUnicodeTableUtil.Level1);
 294                         level2 = CompressArray (level2,
 295                                 MSCompatUnicodeTableUtil.Level2);
 296                         level3 = CompressArray (level3,
 297                                 MSCompatUnicodeTableUtil.Level3);
 298                         widthCompat = (int []) CodePointIndexer.CompressArray (
 299                                 widthCompat, typeof (int),
 300                                 MSCompatUnicodeTableUtil.WidthCompat);
 301                         cjkCHS = CompressArray (cjkCHS,
 302                                 MSCompatUnicodeTableUtil.CjkCHS);
 303                         cjkCHT = CompressArray (cjkCHT,
 304                                 MSCompatUnicodeTableUtil.Cjk);
 305                         cjkJA = CompressArray (cjkJA,
 306                                 MSCompatUnicodeTableUtil.Cjk);
 307                         cjkKO = CompressArray (cjkKO,
 308                                 MSCompatUnicodeTableUtil.Cjk);
 309                         cjkKOlv2 = CompressArray (cjkKOlv2,
 310                                 MSCompatUnicodeTableUtil.Cjk);
 311
 312                         // Ignorables
 313                         Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
 314                         for (int i = 0; i < ignorableFlags.Length; i++) {
 315                                 byte value = ignorableFlags [i];
 316                                 if (value < 10)
 317                                         Result.Write ("{0},", value);
 318                                 else
 319                                         Result.Write ("0x{0:X02},", value);
 320                                 if ((i & 0xF) == 0xF)
 321                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 322                         }
 323                         Result.WriteLine ("};");
 324                         Result.WriteLine ();
 325
 326                         // Primary category
 327                         Result.WriteLine ("static byte [] categories = new byte [] {");
 328                         for (int i = 0; i < categories.Length; i++) {
 329                                 byte value = categories [i];
 330                                 if (value < 10)
 331                                         Result.Write ("{0},", value);
 332                                 else
 333                                         Result.Write ("0x{0:X02},", value);
 334                                 if ((i & 0xF) == 0xF)
 335                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 336                         }
 337                         Result.WriteLine ("};");
 338                         Result.WriteLine ();
 339
 340                         // Primary weight value
 341                         Result.WriteLine ("static byte [] level1 = new byte [] {");
 342                         for (int i = 0; i < level1.Length; i++) {
 343                                 byte value = level1 [i];
 344                                 if (value < 10)
 345                                         Result.Write ("{0},", value);
 346                                 else
 347                                         Result.Write ("0x{0:X02},", value);
 348                                 if ((i & 0xF) == 0xF)
 349                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 350                         }
 351                         Result.WriteLine ("};");
 352                         Result.WriteLine ();
 353
 354                         // Secondary weight
 355                         Result.WriteLine ("static byte [] level2 = new byte [] {");
 356                         for (int i = 0; i < level2.Length; i++) {
 357                                 int value = level2 [i];
 358                                 if (value < 10)
 359                                         Result.Write ("{0},", value);
 360                                 else
 361                                         Result.Write ("0x{0:X02},", value);
 362                                 if ((i & 0xF) == 0xF)
 363                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 364                         }
 365                         Result.WriteLine ("};");
 366                         Result.WriteLine ();
 367
 368                         // Thirtiary weight
 369                         Result.WriteLine ("static byte [] level3 = new byte [] {");
 370                         for (int i = 0; i < level3.Length; i++) {
 371                                 byte value = level3 [i];
 372                                 if (value < 10)
 373                                         Result.Write ("{0},", value);
 374                                 else
 375                                         Result.Write ("0x{0:X02},", value);
 376                                 if ((i & 0xF) == 0xF)
 377                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 378                         }
 379                         Result.WriteLine ("};");
 380                         Result.WriteLine ();
 381
 382                         // Width insensitivity mappings
 383                         // (for now it is more lightweight than dumping the
 384                         // entire NFKD table).
 385                         Result.WriteLine ("static int [] widthCompat = new int [] {");
 386                         for (int i = 0; i < widthCompat.Length; i++) {
 387                                 int value = widthCompat [i];
 388                                 if (value < 10)
 389                                         Result.Write ("{0},", value);
 390                                 else
 391                                         Result.Write ("0x{0:X02},", value);
 392                                 if ((i & 0xF) == 0xF)
 393                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 394                         }
 395                         Result.WriteLine ("};");
 396                         Result.WriteLine ();
 397
 398                         // CJK
 399                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 400                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 401                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 402                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 403                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 404                 }
 405
 406                 void SerializeCJK (string name, ushort [] cjk, int max)
 407                 {
 408                         int offset = 0;//char.MaxValue - cjk.Length;
 409                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 410                         for (int i = 0; i < cjk.Length; i++) {
 411                                 if (i + offset == max)
 412                                         break;
 413                                 ushort value = cjk [i];
 414                                 if (value < 10)
 415                                         Result.Write ("{0},", value);
 416                                 else
 417                                         Result.Write ("0x{0:X04},", value);
 418                                 if ((i & 0xF) == 0xF)
 419                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 420                         }
 421                         Result.WriteLine ("};");
 422                         Result.WriteLine ();
 423                 }
 424
 425                 void SerializeCJK (string name, byte [] cjk, int max)
 426                 {
 427                         int offset = 0;//char.MaxValue - cjk.Length;
 428                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 429                         for (int i = 0; i < cjk.Length; i++) {
 430                                 if (i + offset == max)
 431                                         break;
 432                                 byte value = cjk [i];
 433                                 if (value < 10)
 434                                         Result.Write ("{0},", value);
 435                                 else
 436                                         Result.Write ("0x{0:X02},", value);
 437                                 if ((i & 0xF) == 0xF)
 438                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 439                         }
 440                         Result.WriteLine ("};");
 441                         Result.WriteLine ();
 442                 }
 443
 444                 void SerializeTailorings ()
 445                 {
 446                         Hashtable indexes = new Hashtable ();
 447                         Hashtable counts = new Hashtable ();
 448                         Result.WriteLine ("static char [] tailorings = new char [] {");
 449                         int count = 0;
 450                         foreach (Tailoring t in tailorings) {
 451                                 if (t.Alias != 0)
 452                                         continue;
 453                                 Result.Write ("/*{0}*/", t.LCID);
 454                                 indexes.Add (t.LCID, count);
 455                                 char [] values = t.ItemToCharArray ();
 456                                 counts.Add (t.LCID, values.Length);
 457                                 foreach (char c in values) {
 458                                         Result.Write ("'\\x{0:X}', ", (int) c);
 459                                         if (++count % 16 == 0)
 460                                                 Result.WriteLine (" // {0:X04}", count - 16);
 461                                 }
 462                         }
 463                         Result.WriteLine ("};");
 464
 465                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 466                         foreach (Tailoring t in tailorings) {
 467                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 468                                 if (!indexes.ContainsKey (target)) {
 469                                         Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
 470                                         continue;
 471                                 }
 472                                 int idx = (int) indexes [target];
 473                                 int cnt = (int) counts [target];
 474                                 bool french = t.FrenchSort;
 475                                 if (t.Alias != 0)
 476                                         foreach (Tailoring t2 in tailorings)
 477                                                 if (t2.LCID == t.LCID)
 478                                                         french = t2.FrenchSort;
 479                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 480                         }
 481                         Result.WriteLine ("};");
 482                 }
 483
 484                 #region Parse
 485
 486                 void ParseSources (string dirname)
 487                 {
 488                         string unidata =
 489                                 dirname + "/UnicodeData.txt";
 490                         string derivedCoreProps =
 491                                 dirname + "/DerivedCoreProperties.txt";
 492                         string scripts =
 493                                 dirname + "/Scripts.txt";
 494                         string cp932 =
 495                                 dirname + "/CP932.TXT";
 496                         string derivedAge =
 497                                 dirname + "/DerivedAge.txt";
 498                         string chXML = dirname + "/common/collation/zh.xml";
 499                         string jaXML = dirname + "/common/collation/ja.xml";
 500                         string koXML = dirname + "/common/collation/ko.xml";
 501
 502                         ParseDerivedAge (derivedAge);
 503
 504                         FillIgnorables ();
 505
 506                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 507                         ParseUnidata (unidata);
 508                         ParseDerivedCoreProperties (derivedCoreProps);
 509                         ParseScripts (scripts);
 510                         ParseCJK (chXML, jaXML, koXML);
 511
 512                         ParseTailorings ("mono-tailoring-source.txt");
 513                 }
 514
 515                 void ParseTailorings (string filename)
 516                 {
 517                         Tailoring t = null;
 518                         int line = 0;
 519                         using (StreamReader sr = new StreamReader (filename)) {
 520                                 try {
 521                                         while (sr.Peek () >= 0) {
 522                                                 line++;
 523                                                 ProcessTailoringLine (ref t,
 524                                                         sr.ReadLine ().Trim ());
 525                                         }
 526                                 } catch (Exception) {
 527                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 528                                         throw;
 529                                 }
 530                         }
 531                 }
 532
 533                 // For now this is enough.
 534                 string ParseTailoringSourceValue (string s)
 535                 {
 536                         StringBuilder sb = new StringBuilder ();
 537                         for (int i = 0; i < s.Length; i++) {
 538                                 if (s.StartsWith ("\\u")) {
 539                                         sb.Append ((char) int.Parse (
 540                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 541                                                 1);
 542                                         i += 5;
 543                                 }
 544                         else
 545                                 sb.Append (s [i]);
 546                         }
 547                         return sb.ToString ();
 548                 }
 549
 550                 void ProcessTailoringLine (ref Tailoring t, string s)
 551                 {
 552                         int idx = s.IndexOf ('#');
 553                         if (idx > 0)
 554                                 s = s.Substring (0, idx).Trim ();
 555                         if (s.Length == 0 || s [0] == '#')
 556                                 return;
 557                         if (s [0] == '@') {
 558                                 idx = s.IndexOf ('=');
 559                                 if (idx > 0)
 560                                         t = new Tailoring (
 561                                                 int.Parse (s.Substring (1, idx - 1)),
 562                                                 int.Parse (s.Substring (idx + 1)));
 563                                 else
 564                                         t = new Tailoring (int.Parse (s.Substring (1)));
 565                                 tailorings.Add (t);
 566                                 return;
 567                         }
 568                         if (s.StartsWith ("*FrenchSort")) {
 569                                 t.FrenchSort = true;
 570                                 return;
 571                         }
 572                         string d = "*Diacritical";
 573                         if (s.StartsWith (d)) {
 574                                 idx = s.IndexOf ("->");
 575                                 t.AddDiacriticalMap (
 576                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 577                                                 NumberStyles.HexNumber),
 578                                         byte.Parse (s.Substring (idx + 2).Trim (),
 579                                                 NumberStyles.HexNumber));
 580                                 return;
 581                         }
 582                         idx = s.IndexOf (':');
 583                         if (idx > 0) {
 584                                 string source = s.Substring (0, idx).Trim ();
 585                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 586                                 byte [] b = new byte [5];
 587                                 for (int i = 0; i < 5; i++) {
 588                                         if (l [i] == "*")
 589                                                 b [i] = 0;
 590                                         else
 591                                                 b [i] = byte.Parse (l [i],
 592                                                         NumberStyles.HexNumber);
 593                                 }
 594                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 595                                         b);
 596                         }
 597                         idx = s.IndexOf ('=');
 598                         if (idx > 0)
 599                                 t.AddReplacementMap (
 600                                         ParseTailoringSourceValue (
 601                                                 s.Substring (0, idx).Trim ()),
 602                                         ParseTailoringSourceValue (
 603                                                 s.Substring (idx + 1).Trim ()));
 604                 }
 605
 606                 void ParseDerivedAge (string filename)
 607                 {
 608                         using (StreamReader file =
 609                                 new StreamReader (filename)) {
 610                                 while (file.Peek () >= 0) {
 611                                         string s = file.ReadLine ();
 612                                         int idx = s.IndexOf ('#');
 613                                         if (idx >= 0)
 614                                                 s = s.Substring (0, idx);
 615                                         idx = s.IndexOf (';');
 616                                         if (idx < 0)
 617                                                 continue;
 618
 619                                         string cpspec = s.Substring (0, idx);
 620                                         idx = cpspec.IndexOf ("..");
 621                                         NumberStyles nf = NumberStyles.HexNumber |
 622                                                 NumberStyles.AllowTrailingWhite;
 623                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 624                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 625                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 626
 627                                         // FIXME: use index
 628                                         if (cp > char.MaxValue)
 629                                                 continue;
 630
 631                                         for (int i = cp; i <= cpEnd; i++)
 632                                                 unicodeAge [i] = double.Parse (value);
 633                                 }
 634                         }
 635                         unicodeAge [0] = double.MaxValue; // never be supported
 636                 }
 637
 638                 void ParseUnidata (string filename)
 639                 {
 640                         ArrayList decompValues = new ArrayList ();
 641                         using (StreamReader unidata =
 642                                 new StreamReader (filename)) {
 643                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 644                                         try {
 645                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 646                                         } catch (Exception) {
 647                                                 Console.Error.WriteLine ("**** At line " + line);
 648                                                 throw;
 649                                         }
 650                                 }
 651                         }
 652                         this.decompValues = (int [])
 653                                 decompValues.ToArray (typeof (int));
 654                 }
 655
 656                 void ProcessUnidataLine (string s, ArrayList decompValues)
 657                 {
 658                         int idx = s.IndexOf ('#');
 659                         if (idx >= 0)
 660                                 s = s.Substring (0, idx);
 661                         idx = s.IndexOf (';');
 662                         if (idx < 0)
 663                                 return;
 664                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 665                         string [] values = s.Substring (idx + 1).Split (';');
 666
 667                         // FIXME: use index
 668                         if (cp > char.MaxValue)
 669                                 return;
 670                         if (IsIgnorable (cp))
 671                                 return;
 672
 673                         string name = values [0];
 674
 675                         // isSmallCapital
 676                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 677                                 isSmallCapital [cp] = true;
 678
 679                         // latin mapping by character name
 680                         if (s.IndexOf ("LATIN") > 0) {
 681                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 682                                 int offset = lidx + 15;
 683                                 if (lidx < 0) {
 684                                         lidx = s.IndexOf ("LETTER TURNED ");
 685                                         offset = lidx + 14;
 686                                 }
 687                                 if (lidx < 0) {
 688                                         lidx = s.IndexOf ("LETTER ");
 689                                         offset = lidx + 7;
 690                                 }
 691                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 692                                 if ('A' <= c && c <= 'Z' &&
 693                                         (s.Length == offset + 1 || s [offset + 1] == ' ')) {
 694                                         ArrayList entry = (ArrayList) latinMap [c];
 695                                         if (entry == null) {
 696                                                 entry = new ArrayList ();
 697                                                 latinMap [c] = entry;
 698                                         }
 699                                         entry.Add (cp);
 700                                 }
 701                         }
 702
 703                         // Arrow names
 704                         if (0x2000 <= cp && cp < 0x3000) {
 705                                 int value = 0;
 706                                 // SPECIAL CASES. FIXME: why?
 707                                 switch (cp) {
 708                                 case 0x21C5: value = -1; break; // E2
 709                                 case 0x261D: value = 1; break;
 710                                 case 0x27A6: value = 3; break;
 711                                 case 0x21B0: value = 7; break;
 712                                 case 0x21B1: value = 3; break;
 713                                 case 0x21B2: value = 7; break;
 714                                 case 0x21B4: value = 5; break;
 715                                 case 0x21B5: value = 7; break;
 716                                 case 0x21B9: value = -1; break; // E1
 717                                 case 0x21CF: value = 7; break;
 718                                 case 0x21D0: value = 3; break;
 719                                 }
 720                                 string [] arrowTargets = new string [] {
 721                                         "",
 722                                         "UPWARDS",
 723                                         "NORTH EAST",
 724                                         "RIGHTWARDS",
 725                                         "SOUTH EAST",
 726                                         "DOWNWARDS",
 727                                         "SOUTH WEST",
 728                                         "LEFTWARDS",
 729                                         "NORTH WEST",
 730                                         };
 731                                 if (value == 0)
 732                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 733                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 734                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 735                                                         s.IndexOf (" OVER") < 0
 736                                                 )
 737                                                         value = i;
 738                                 if (value > 0)
 739                                         arrowValues.Add (new DictionaryEntry (
 740                                                 cp, value));
 741                         }
 742
 743                         // Box names
 744                         if (0x2500 <= cp && cp < 0x25B0) {
 745                                 int value = 0;
 746                                 // flags:
 747                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 748                                 // [h,rl] [r] [l]
 749                                 // [v,ud] [u] [d]
 750                                 // [dr] [dl] [ur] [ul]
 751                                 // [vr,udr] [vl,vdl]
 752                                 // [hd,rld] [hu,rlu]
 753                                 // [hv,udrl,rlv,udh]
 754                                 ArrayList flags = new ArrayList (new int [] {
 755                                         32, 8 + 4, 8, 4,
 756                                         16, 1 + 2, 1, 2,
 757                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 758                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 759                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 760                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 761                                         });
 762                                 byte [] offsets = new byte [] {
 763                                         0, 0, 1, 2,
 764                                         3, 3, 4, 5,
 765                                         6, 7, 8, 9,
 766                                         10, 10, 11, 11,
 767                                         12, 12, 13, 13,
 768                                         14, 14, 14, 14};
 769                                 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
 770                                         int flag = 0;
 771                                         if (s.IndexOf (" UP") > 0)
 772                                                 flag |= 1;
 773                                         if (s.IndexOf (" DOWN") > 0)
 774                                                 flag |= 2;
 775                                         if (s.IndexOf (" RIGHT") > 0)
 776                                                 flag |= 4;
 777                                         if (s.IndexOf (" LEFT") > 0)
 778                                                 flag |= 8;
 779                                         if (s.IndexOf (" VERTICAL") > 0)
 780                                                 flag |= 16;
 781                                         if (s.IndexOf (" HORIZONTAL") > 0)
 782                                                 flag |= 32;
 783
 784                                         int fidx = flags.IndexOf (flag);
 785                                         value = fidx < 0 ? fidx : offsets [fidx];
 786                                 } else if (s.IndexOf ("BLOCK") > 0) {
 787                                         if (s.IndexOf ("ONE EIGHTH") > 0)
 788                                                 value = 0x12;
 789                                         else if (s.IndexOf ("ONE QUARTER") > 0)
 790                                                 value = 0x13;
 791                                         else if (s.IndexOf ("THREE EIGHTHS") > 0)
 792                                                 value = 0x14;
 793                                         else if (s.IndexOf ("HALF") > 0)
 794                                                 value = 0x15;
 795                                         else if (s.IndexOf ("FIVE EIGHTHS") > 0)
 796                                                 value = 0x16;
 797                                         else if (s.IndexOf ("THREE QUARTERS") > 0)
 798                                                 value = 0x17;
 799                                         else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
 800                                                 value = 0x18;
 801                                         else
 802                                                 value = 0x19;
 803                                 }
 804                                 if (value >= 0)
 805                                         boxValues.Add (new DictionaryEntry (
 806                                                 cp, value));
 807                         }
 808
 809                         // For some characters store the name and sort later
 810                         // to determine sorting.
 811                         if (0x2100 <= cp && cp <= 0x213F &&
 812                                 Char.IsSymbol ((char) cp))
 813                                 sortableCharNames.Add (
 814                                         new DictionaryEntry (cp, values [0]));
 815                         else if (0x3380 <= cp && cp <= 0x33DD)
 816                                 sortableCharNames.Add (new DictionaryEntry (
 817                                         cp, values [0].Substring (7)));
 818
 819                         // diacritical weights by character name
 820                         for (int d = 0; d < diacritics.Length; d++)
 821                                 if (s.IndexOf (diacritics [d]) > 0)
 822                                         diacritical [cp] |= diacriticWeights [d];
 823                         // Two-step grep required for it.
 824                         if (s.IndexOf ("FULL STOP") > 0 &&
 825                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
 826                                 diacritical [cp] |= 0xF4;
 827
 828                         // Arabic letter name
 829                         if (0x0621 <= cp && cp <= 0x064A &&
 830                                 Char.GetUnicodeCategory ((char) cp)
 831                                 == UnicodeCategory.OtherLetter) {
 832                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
 833                                 switch (cp) {
 834                                 case 0x0621:
 835                                 case 0x0624:
 836                                 case 0x0626:
 837                                         // hamza, waw, yeh ... special cases.
 838                                         value = 0x07;
 839                                         break;
 840                                 case 0x0649:
 841                                 case 0x064A:
 842                                         value = 0x77; // special cases.
 843                                         break;
 844                                 default:
 845                                         // Get primary letter name i.e.
 846                                         // XXX part of ARABIC LETTER XXX yyy
 847                                         // e.g. that of "TEH MARBUTA" is "TEH".
 848                                         string letterName =
 849                                                 (cp == 0x0640) ?
 850                                                 // 0x0640 is special: it does
 851                                                 // not start with ARABIC LETTER
 852                                                 values [0] :
 853                                                 values [0].Substring (14);
 854                                         int tmpIdx = letterName.IndexOf (' ');
 855                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
 856 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
 857                                         if (arabicNameMap.ContainsKey (letterName))
 858                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
 859                                         else
 860                                                 arabicNameMap [letterName] = cp;
 861                                         break;
 862                                 }
 863                                 arabicLetterPrimaryValues [cp] = value;
 864                         }
 865
 866                         // Japanese square letter
 867                         if (0x3300 <= cp && cp <= 0x3357)
 868                                 if (!ExistsJIS (cp))
 869                                         nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
 870
 871                         // normalizationType
 872                         string decomp = values [4];
 873                         idx = decomp.IndexOf ('<');
 874                         if (idx >= 0) {
 875                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
 876                                 case "full":
 877                                         decompType [cp] = DecompositionFull;
 878                                         break;
 879                                 case "sub":
 880                                         decompType [cp] = DecompositionSub;
 881                                         break;
 882                                 case "super":
 883                                         decompType [cp] = DecompositionSuper;
 884                                         break;
 885                                 case "small":
 886                                         decompType [cp] = DecompositionSmall;
 887                                         break;
 888                                 case "isolated":
 889                                         decompType [cp] = DecompositionIsolated;
 890                                         break;
 891                                 case "initial":
 892                                         decompType [cp] = DecompositionInitial;
 893                                         break;
 894                                 case "final":
 895                                         decompType [cp] = DecompositionFinal;
 896                                         break;
 897                                 case "medial":
 898                                         decompType [cp] = DecompositionMedial;
 899                                         break;
 900                                 case "noBreak":
 901                                         decompType [cp] = DecompositionNoBreak;
 902                                         break;
 903                                 case "compat":
 904                                         decompType [cp] = DecompositionCompat;
 905                                         break;
 906                                 case "fraction":
 907                                         decompType [cp] = DecompositionFraction;
 908                                         break;
 909                                 case "font":
 910                                         decompType [cp] = DecompositionFont;
 911                                         break;
 912                                 case "circle":
 913                                         decompType [cp] = DecompositionCircle;
 914                                         break;
 915                                 case "square":
 916                                         decompType [cp] = DecompositionSquare;
 917                                         break;
 918                                 case "wide":
 919                                         decompType [cp] = DecompositionWide;
 920                                         break;
 921                                 case "narrow":
 922                                         decompType [cp] = DecompositionNarrow;
 923                                         break;
 924                                 case "vertical":
 925                                         decompType [cp] = DecompositionVertical;
 926                                         break;
 927                                 default:
 928                                         throw new Exception ("Support NFKD type : " + decomp);
 929                                 }
 930                         }
 931                         else
 932                                 decompType [cp] = DecompositionCanonical;
 933                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
 934                         if (decomp.Length > 0) {
 935
 936                                 string [] velems = decomp.Split (' ');
 937                                 int didx = decompValues.Count;
 938                                 decompIndex [cp] = didx;
 939                                 foreach (string v in velems)
 940                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
 941                                 decompLength [cp] = velems.Length;
 942
 943                                 // [decmpType] -> this_cp
 944                                 int targetCP = (int) decompValues [didx];
 945                                 // for "(x)" it specially maps to 'x' .
 946                                 // FIXME: check if it is sane
 947                                 if (velems.Length == 3 &&
 948                                         (int) decompValues [didx] == '(' &&
 949                                         (int) decompValues [didx + 2] == ')')
 950                                         targetCP = (int) decompValues [didx + 1];
 951                                 // special: 0x215F "1/"
 952                                 else if (cp == 0x215F)
 953                                         targetCP = '1';
 954                                 else if (velems.Length > 1 &&
 955                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
 956                                         // skip them, except for CJK ideograph compat
 957                                         targetCP = 0;
 958
 959                                 if (targetCP != 0) {
 960                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
 961                                         if (entry == null) {
 962                                                 entry = new Hashtable ();
 963                                                 nfkdMap [targetCP] = entry;
 964                                         }
 965                                         entry [(byte) decompType [cp]] = cp;
 966                                 }
 967                         }
 968                         // numeric values
 969                         if (values [5].Length > 0)
 970                                 decimalValue [cp] = decimal.Parse (values [5]);
 971                         else if (values [6].Length > 0)
 972                                 decimalValue [cp] = decimal.Parse (values [6]);
 973                         else if (values [7].Length > 0) {
 974                                 string decstr = values [7];
 975                                 idx = decstr.IndexOf ('/');
 976                                 if (cp == 0x215F) // special. "1/"
 977                                         decimalValue [cp] = 0x1;
 978                                 else if (idx > 0)
 979                                         // m/n
 980                                         decimalValue [cp] =
 981                                                 decimal.Parse (decstr.Substring (0, idx))
 982                                                 / decimal.Parse (decstr.Substring (idx + 1));
 983                                 else if (decstr [0] == '(' &&
 984                                         decstr [decstr.Length - 1] == ')')
 985                                         // (n)
 986                                         decimalValue [cp] =
 987                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
 988                                 else if (decstr [decstr.Length - 1] == '.')
 989                                         // n.
 990                                         decimalValue [cp] =
 991                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
 992                                 else
 993                                         decimalValue [cp] = decimal.Parse (decstr);
 994                         }
 995                 }
 996
 997                 void ParseDerivedCoreProperties (string filename)
 998                 {
 999                         // IsUppercase
1000                         using (StreamReader file =
1001                                 new StreamReader (filename)) {
1002                                 for (int line = 1; file.Peek () >= 0; line++) {
1003                                         try {
1004                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1005                                         } catch (Exception) {
1006                                                 Console.Error.WriteLine ("**** At line " + line);
1007                                                 throw;
1008                                         }
1009                                 }
1010                         }
1011                 }
1012
1013                 void ProcessDerivedCorePropLine (string s)
1014                 {
1015                         int idx = s.IndexOf ('#');
1016                         if (idx >= 0)
1017                                 s = s.Substring (0, idx);
1018                         idx = s.IndexOf (';');
1019                         if (idx < 0)
1020                                 return;
1021                         string cpspec = s.Substring (0, idx);
1022                         idx = cpspec.IndexOf ("..");
1023                         NumberStyles nf = NumberStyles.HexNumber |
1024                                 NumberStyles.AllowTrailingWhite;
1025                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1026                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1027                         string value = s.Substring (cpspec.Length + 1).Trim ();
1028
1029                         // FIXME: use index
1030                         if (cp > char.MaxValue)
1031                                 return;
1032
1033                         switch (value) {
1034                         case "Uppercase":
1035                                 for (int x = cp; x <= cpEnd; x++)
1036                                         isUppercase [x] = true;
1037                                 break;
1038                         }
1039                 }
1040
1041                 void ParseScripts (string filename)
1042                 {
1043                         ArrayList cyrillic = new ArrayList ();
1044                         ArrayList gurmukhi = new ArrayList ();
1045                         ArrayList gujarati = new ArrayList ();
1046                         ArrayList georgian = new ArrayList ();
1047                         ArrayList thaana = new ArrayList ();
1048
1049                         using (StreamReader file =
1050                                 new StreamReader (filename)) {
1051                                 while (file.Peek () >= 0) {
1052                                         string s = file.ReadLine ();
1053                                         int idx = s.IndexOf ('#');
1054                                         if (idx >= 0)
1055                                                 s = s.Substring (0, idx);
1056                                         idx = s.IndexOf (';');
1057                                         if (idx < 0)
1058                                                 continue;
1059
1060                                         string cpspec = s.Substring (0, idx);
1061                                         idx = cpspec.IndexOf ("..");
1062                                         NumberStyles nf = NumberStyles.HexNumber |
1063                                                 NumberStyles.AllowTrailingWhite;
1064                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1065                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1066                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1067
1068                                         // FIXME: use index
1069                                         if (cp > char.MaxValue)
1070                                                 continue;
1071
1072                                         switch (value) {
1073                                         case "Cyrillic":
1074                                                 for (int x = cp; x <= cpEnd; x++)
1075                                                         if (!IsIgnorable (x))
1076                                                                 cyrillic.Add ((char) x);
1077                                                 break;
1078                                         case "Gurmukhi":
1079                                                 for (int x = cp; x <= cpEnd; x++)
1080                                                         if (!IsIgnorable (x))
1081                                                                 gurmukhi.Add ((char) x);
1082                                                 break;
1083                                         case "Gujarati":
1084                                                 for (int x = cp; x <= cpEnd; x++)
1085                                                         if (!IsIgnorable (x))
1086                                                                 gujarati.Add ((char) x);
1087                                                 break;
1088                                         case "Georgian":
1089                                                 for (int x = cp; x <= cpEnd; x++)
1090                                                         if (!IsIgnorable (x))
1091                                                                 georgian.Add ((char) x);
1092                                                 break;
1093                                         case "Thaana":
1094                                                 for (int x = cp; x <= cpEnd; x++)
1095                                                         if (!IsIgnorable (x))
1096                                                                 thaana.Add ((char) x);
1097                                                 break;
1098                                         }
1099                                 }
1100                         }
1101                         cyrillic.Sort (UCAComparer.Instance);
1102                         gurmukhi.Sort (UCAComparer.Instance);
1103                         gujarati.Sort (UCAComparer.Instance);
1104                         georgian.Sort (UCAComparer.Instance);
1105                         thaana.Sort (UCAComparer.Instance);
1106                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1107                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1108                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1109                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1110                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1111                 }
1112
// Reads a JIS order table and appends one JISCharacter (code point, JIS
// code) to jisJapanese for every data line found in the file.
//
// A data line is expected to look like "0xJJJJ 0xUUUU": the JIS code, a
// space, then the Unicode code point, both written in hexadecimal with a
// leading "0x". Everything from '#' to the end of a line is a comment.
// Lines that are empty once the comment is removed, and lines without a
// space separator, are skipped.
//
// filename: path of the table to read.
// Throws whatever StreamReader throws when the file cannot be opened, and
// FormatException / ArgumentOutOfRangeException when a data line does not
// have the shape described above.
void ParseJISOrder (string filename)
{
        using (StreamReader file = new StreamReader (filename)) {
                string line;
                while ((line = file.ReadLine ()) != null) {
                        // Drop the trailing comment, then surrounding blanks.
                        int idx = line.IndexOf ('#');
                        if (idx >= 0)
                                line = line.Substring (0, idx);
                        line = line.Trim ();
                        if (line.Length == 0)
                                continue;

                        idx = line.IndexOf (' ');
                        if (idx < 0)
                                continue;

                        // Both fields start with "0x", which NumberStyles.HexNumber
                        // does not accept, so it is cut off before parsing.
                        // Substring takes (start, length): the JIS digits occupy
                        // [2, idx), i.e. idx - 2 characters.
                        string jisDigits = line.Substring (2, idx - 2);
                        // Locate the second field by trimming instead of assuming
                        // that exactly one separator character precedes it.
                        string cpDigits = line.Substring (idx).Trim ().Substring (2);

                        int jis = int.Parse (jisDigits, NumberStyles.HexNumber);
                        int cp = int.Parse (cpDigits, NumberStyles.HexNumber);
                        jisJapanese.Add (new JISCharacter (cp, jis));
                }
        }
}
1134
// Loads the CJK tailoring rules from three LDML collation documents and
// fills the per-culture weight tables cjkCHS, cjkCHT, cjkJA, cjkKO and
// cjkKOlv2, all of which are indexed directly by UTF-16 code unit.
//
// zhXML: LDML file holding both the 'pinyin' (Simplified) and the
//        'stroke' (Traditional) collations.
// jaXML: LDML file holding the Japanese collation.
// koXML: LDML file holding the Korean collation.
//
// A NullReferenceException results when an expected XPath does not match,
// because the node lookups are dereferenced without a check.
void ParseCJK (string zhXML, string jaXML, string koXML)
{
        XmlDocument doc = new XmlDocument ();
        doc.XmlResolver = null;

        // Chinese Simplified and Chinese Traditional are two collations
        // of the same document.
        doc.Load (zhXML);
        FillCJKWeightsInOrder ("chs", cjkCHS,
                doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText,
                0x8008, '\u3100');
        FillCJKWeightsInOrder ("cht", cjkCHT,
                doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText,
                0x8002, '\u4E00');

        // Japanese
        doc.Load (jaXML);
        FillCJKWeightsInOrder ("ja", cjkJA,
                doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText,
                0x8008, '\u4E00');

        // Korean
        // Korean weight is somewhat complex. It first shifts the
        // Hangul category from 52-x to 80-x (they are computed
        // anyway). CJK ideographs are placed at secondary
        // weight, like XX YY 01 zz 01, where XX and YY are the
        // corresponding "reset" value and zz is 41,43,45...
        //
        // Unlike chs, cht and ja, the Korean value is a combined
        // ushort (category and level 1) derived from the "reset"
        // syllable; the per-character part goes to cjkKOlv2.
        doc.Load (koXML);
        foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                // The element following each <reset> lists the
                // characters to be placed after it.
                XmlElement sc = (XmlElement) reset.NextSibling;
                // Compute "category" and "level 1" for the target
                // "reset" Hangul syllable: its 1-based offset from
                // U+AC00 is split into ri / 254 (high byte) and
                // ri % 254 + 2 (low byte).
                char rc = reset.InnerText [0];
                int ri = ((int) rc - 0xAC00) + 1;
                ushort p = (ushort)
                        ((ri / 254) * 256 + (ri % 254) + 2);
                // Place the characters after the target, giving them
                // the odd secondary weights 0x41, 0x43, 0x45, ...
                int lv2 = 0x41;
                foreach (char c in sc.InnerText) {
                        cjkKO [(int) c] = p;
                        cjkKOlv2 [(int) c] = (byte) lv2;
                        lv2 += 2;
                }
        }
}

// Assigns consecutive weights, starting at firstWeight, to the characters
// of orderedChars in the order given and stores them in
// table [character].
//
// Whenever the low byte of the running weight wraps around to 00 it is
// advanced to 02, so low bytes 00 and 01 are never handed out
// (presumably because they are reserved in sort keys - confirm).
// Characters below lowerBound are not stored: a warning naming the weight
// they would have received is written to stderr and the running weight is
// not advanced.
//
// category is only used to label those warnings.
void FillCJKWeightsInOrder (string category, ushort [] table, string orderedChars, int firstWeight, char lowerBound)
{
        int v = firstWeight;
        foreach (char c in orderedChars) {
                if (c < lowerBound) {
                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
                        continue;
                }
                table [(int) c] = (ushort) v++;
                if (v % 256 == 0)
                        v += 2;
        }
}
1227
1228                 #endregion
1229
1230                 #region Generation
1231
// Populates ignorableFlags for every UTF-16 code unit. The bits are:
//   1 - IsIgnorable reports true
//   2 - IsIgnorableSymbol reports true
//   4 - IsIgnorableNonSpacing reports true
// Code units whose Unicode category is OtherNotAssigned are left untouched.
void FillIgnorables ()
{
        int cp = 0;
        while (cp <= char.MaxValue) {
                UnicodeCategory category =
                        Char.GetUnicodeCategory ((char) cp);
                if (category != UnicodeCategory.OtherNotAssigned) {
                        if (IsIgnorable (cp))
                                ignorableFlags [cp] |= 1;
                        if (IsIgnorableSymbol (cp))
                                ignorableFlags [cp] |= 2;
                        if (IsIgnorableNonSpacing (cp))
                                ignorableFlags [cp] |= 4;
                }
                cp++;
        }
}
1246
// Applies hand-written corrections on top of the parsed tables:
//   - secondary weights for number ranges,
//   - removal of the decomposition of U+FE31 and U+FE32,
//   - secondary weights for the Korean parenthesized/circled ranges,
//   - renaming or removal of selected entries in sortableCharNames.
void ModifyParsedValues ()
{
        // Number secondary weights. numberSecondaryWeightBounds is read
        // as consecutive (start, end) pairs with the end exclusive; the
        // first pair gets 0x38 and each following pair the next value.
        int [] bounds = numberSecondaryWeightBounds;
        byte lv2 = 0x38;
        for (int pair = 0; pair < bounds.Length; pair += 2) {
                int limit = bounds [pair + 1];
                for (int n = bounds [pair]; n < limit; n++) {
                        if (Char.IsNumber ((char) n))
                                diacritical [n] = lv2;
                }
                lv2++;
        }

        // Modify some decomposition equivalence: these two characters
        // are treated as having no decomposition at all.
        int [] undecomposed = new int [] {0xFE31, 0xFE32};
        foreach (int u in undecomposed) {
                decompType [u] = 0;
                decompIndex [u] = 0;
                decompLength [u] = 0;
        }

        // Korean parens numbers
        for (int k = 0x3200; k <= 0x321C; k++)
                diacritical [k] = 0xA;
        for (int k = 0x3260; k <= 0x327B; k++)
                diacritical [k] = 0xC;

        // Update name part of named characters. Entries are visited
        // front to back; after a removal the next entry has moved into
        // the current slot, so the position is not advanced.
        int pos = 0;
        while (pos < sortableCharNames.Count) {
                DictionaryEntry entry =
                        (DictionaryEntry) sortableCharNames [pos];
                int code = (int) entry.Key;
                string newName = null;
                bool drop = false;
                switch (code) {
                case 0x2101:
                        newName = "A_1";
                        break;
                case 0x33C3:
                        newName = "A_2";
                        break;
                case 0x2105:
                        newName = "C_1";
                        break;
                case 0x2106:
                        newName = "C_2";
                        break;
                case 0x211E:
                        newName = "R1";
                        break;
                case 0x211F:
                        newName = "R2";
                        break;
                // Remove some of them!
                case 0x2103: case 0x2109: case 0x2116: case 0x2117:
                case 0x2118: case 0x2125: case 0x2127: case 0x2129:
                case 0x212E: case 0x2132:
                        drop = true;
                        break;
                }
                if (drop) {
                        sortableCharNames.RemoveAt (pos);
                        continue;
                }
                if (newName != null)
                        sortableCharNames [pos] =
                                new DictionaryEntry (code, newName);
                pos++;
        }
}
1304
1305                 void GenerateCore ()
1306                 {
1307                         UnicodeCategory uc;
1308
1309                         #region Specially ignored // 01
1310                         // This will raise "Defined" flag up.
1311                         foreach (char c in specialIgnore)
1312                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1313                         #endregion
1314
1315
1316                         #region Variable weights
1317                         // Controls : 06 03 - 06 3D
1318                         fillIndex [6] = 3;
1319                         for (int i = 0; i < 65536; i++) {
1320                                 if (IsIgnorable (i))
1321                                         continue;
1322                                 char c = (char) i;
1323                                 uc = Char.GetUnicodeCategory (c);
1324                                 // NEL is whitespace but not ignored here.
1325                                 if (uc == UnicodeCategory.Control &&
1326                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1327                                         AddCharMap (c, 6, 1);
1328                         }
1329
1330                         // Apostrophe 06 80
1331                         fillIndex [6] = 0x80;
1332                         AddCharMapGroup ('\'', 6, 1, 0);
1333                         AddCharMap ('\uFE63', 6, 1);
1334
1335                         // Hyphen/Dash : 06 81 - 06 90
1336                         for (int i = 0; i < char.MaxValue; i++) {
1337                                 if (!IsIgnorable (i) &&
1338                                         Char.GetUnicodeCategory ((char) i) ==
1339                                         UnicodeCategory.DashPunctuation) {
1340                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1341                                         if (i == 0x2011) {
1342                                                 // SPECIAL: add 2027 and 2043
1343                                                 // Maybe they are regarded the
1344                                                 // same hyphens in "central"
1345                                                 // position.
1346                                                 AddCharMap ('\u2027', 6, 1);
1347                                                 AddCharMap ('\u2043', 6, 1);
1348                                         }
1349                                 }
1350                         }
1351
1352                         // Arabic variable weight chars 06 A0 -
1353                         fillIndex [6] = 0xA0;
1354                         // vowels
1355                         for (int i = 0x64B; i <= 0x650; i++)
1356                                 AddArabicCharMap ((char) i);
1357                         // sukun
1358                         AddCharMapGroup ('\u0652', 6, 1, 0);
1359                         // shadda
1360                         AddCharMapGroup ('\u0651', 6, 1, 0);
1361                         #endregion
1362
1363
1364                         #region Nonspacing marks // 01
1365                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1366
1367                         // Combining diacritical marks: 01 DC -
1368
1369                         fillIndex [0x1] = 0x41;
1370                         for (int i = 0x030E; i <= 0x0326; i++)
1371                                 if (!IsIgnorable (i))
1372                                         AddCharMap ((char) i, 0x1, 1);
1373                         for (int i = 0x0329; i <= 0x0334; i++)
1374                                 if (!IsIgnorable (i))
1375                                         AddCharMap ((char) i, 0x1, 1);
1376                         for (int i = 0x0339; i <= 0x0341; i++)
1377                                 if (!IsIgnorable (i))
1378                                         AddCharMap ((char) i, 0x1, 1);
1379                         fillIndex [0x1] = 0x72;
1380                         for (int i = 0x0346; i <= 0x0348; i++)
1381                                 if (!IsIgnorable (i))
1382                                         AddCharMap ((char) i, 0x1, 1);
1383                         for (int i = 0x02BE; i <= 0x02BF; i++)
1384                                 if (!IsIgnorable (i))
1385                                         AddCharMap ((char) i, 0x1, 1);
1386                         for (int i = 0x02C1; i <= 0x02C5; i++)
1387                                 if (!IsIgnorable (i))
1388                                         AddCharMap ((char) i, 0x1, 1);
1389                         for (int i = 0x02CE; i <= 0x02CF; i++)
1390                                 if (!IsIgnorable (i))
1391                                         AddCharMap ((char) i, 0x1, 1);
1392                         for (int i = 0x02D1; i <= 0x02D3; i++)
1393                                 if (!IsIgnorable (i))
1394                                         AddCharMap ((char) i, 0x1, 1);
1395                         AddCharMap ('\u02DE', 0x1, 1);
1396                         for (int i = 0x02E4; i <= 0x02E9; i++)
1397                                 if (!IsIgnorable (i))
1398                                         AddCharMap ((char) i, 0x1, 1);
1399
1400                         // LAMESPEC: It should not stop at '\u20E1'. There are
1401                         // a few more characters (that however results in
1402                         // overflow of level 2 unless we start before 0xDD).
1403                         fillIndex [0x1] = 0xDC;
1404                         for (int i = 0x20d0; i <= 0x20e1; i++)
1405                                 AddCharMap ((char) i, 0x1, 1);
1406                         #endregion
1407
1408
1409                         #region Whitespaces // 07 03 -
1410                         fillIndex [0x7] = 0x2;
1411                         AddCharMap (' ', 0x7, 2);
1412                         AddCharMap ('\u00A0', 0x7, 1);
1413                         for (int i = 9; i <= 0xD; i++)
1414                                 AddCharMap ((char) i, 0x7, 1);
1415                         for (int i = 0x2000; i <= 0x200B; i++)
1416                                 AddCharMap ((char) i, 0x7, 1);
1417
1418                         fillIndex [0x7] = 0x17;
1419                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1420                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1421
1422                         // Characters which used to represent layout control.
1423                         // LAMESPEC: Windows developers seem to have thought
1424                         // that those characters are kind of whitespaces,
1425                         // while they aren't.
1426                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1427                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1428                         #endregion
1429
1430                         // FIXME: 09 should be more complete.
1431                         fillIndex [0x9] = 2;
1432                         // misc tech mark
1433                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1434                                 AddCharMap ((char) cp, 0x9, 1, 0);
1435
1436                         // arrows
1437                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1438                         foreach (DictionaryEntry de in arrowValues) {
1439                                 int idx = (int) de.Value;
1440                                 int cp = (int) de.Key;
1441                                 if (map [cp].Defined)
1442                                         continue;
1443                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1444                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1445                                 arrowLv2 [idx]++;
1446                         }
1447                         // boxes
1448                         byte [] boxLv2 = new byte [128];
1449                         for (int i = 0; i < boxLv2.Length; i++)
1450                                 boxLv2 [i] = 3;
1451                         foreach (DictionaryEntry de in boxValues) {
1452                                 int cp = (int) de.Key;
1453                                 int idx = (int) de.Value;
1454                                 if (map [cp].Defined)
1455                                         continue;
1456                                 fillIndex [0x9] = (byte) (0xE5 + idx);
1457                                 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1458                                 boxLv2 [idx]++;
1459                         }
1460                         // Some special characters (slanted)
1461                         fillIndex [0x9] = 0xF4;
1462                         AddCharMap ('\u2571', 0x9, 3);
1463                         AddCharMap ('\u2572', 0x9, 3);
1464                         AddCharMap ('\u2573', 0x9, 3);
1465
1466                         // FIXME: implement 0A
1467                         #region Symbols
1468                         fillIndex [0xA] = 2;
1469                         // byte currency symbols
1470                         for (int cp = 0; cp < 0x100; cp++) {
1471                                 uc = Char.GetUnicodeCategory ((char) cp);
1472                                 if (!IsIgnorable (cp) &&
1473                                         uc == UnicodeCategory.CurrencySymbol &&
1474                                         cp != '$')
1475                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1476                         }
1477                         // byte other symbols
1478                         for (int cp = 0; cp < 0x100; cp++) {
1479                                 if (cp == 0xA6)
1480                                         continue; // SPECIAL: skip FIXME: why?
1481                                 uc = Char.GetUnicodeCategory ((char) cp);
1482                                 if (!IsIgnorable (cp) &&
1483                                         uc == UnicodeCategory.OtherSymbol)
1484                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1485                         }
1486
1487                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1488                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1489                                 AddCharMap ((char) cp, 0xA, 1, 0);
1490                         // Dingbats
1491                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1492                                 if (Char.IsSymbol ((char) cp))
1493                                         AddCharMap ((char) cp, 0xA, 1, 0);
1494                         // OCR
1495                         for (int i = 0x2440; i < 0x2460; i++)
1496                                 AddCharMap ((char) i, 0xA, 1, 0);
1497
1498                         #endregion
1499
1500                         #region Numbers // 0C 02 - 0C E1
1501                         fillIndex [0xC] = 2;
1502
1503                         // 9F8 : Bengali "one less than the denominator"
1504                         AddCharMap ('\u09F8', 0xC, 1);
1505
1506                         ArrayList numbers = new ArrayList ();
1507                         for (int i = 0; i < 65536; i++)
1508                                 if (!IsIgnorable (i) &&
1509                                         Char.IsNumber ((char) i) &&
1510                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1511                                         numbers.Add (i);
1512
1513                         ArrayList numberValues = new ArrayList ();
1514                         foreach (int i in numbers)
1515                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1516                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1517
1518 //foreach (DictionaryEntry de in numberValues)
1519 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1520
1521                         decimal prevValue = -1;
1522                         foreach (DictionaryEntry de in numberValues) {
1523                                 int cp = (int) de.Key;
1524                                 decimal currValue = (decimal) de.Value;
1525                                 bool addnew = false;
1526                                 if (prevValue < currValue &&
1527                                         prevValue - (int) prevValue == 0 &&
1528                                         prevValue >= 1) {
1529
1530                                         addnew = true;
1531                                         // Process Hangzhou and Roman numbers
1532
1533                                         // There are some SPECIAL cases.
1534                                         if (currValue != 4) // no increment for 4
1535                                                 fillIndex [0xC]++;
1536
1537                                         int xcp;
1538                                         xcp = (int) prevValue + 0x2170 - 1;
1539                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1540                                         xcp = (int) prevValue + 0x2160 - 1;
1541                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1542                                         fillIndex [0xC] += 2;
1543                                         xcp = (int) prevValue + 0x3021 - 1;
1544                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1545                                         fillIndex [0xC]++;
1546                                 }
1547                                 if (prevValue < currValue)
1548                                         prevValue = currValue;
1549                                 if (map [cp].Defined)
1550                                         continue;
1551                                 // Hangzhou and Roman numbers are added later
1552                                 // (code is above)
1553                                 else if (0x3021 <= cp && cp < 0x302A
1554                                         || 0x2160 <= cp && cp < 0x216A
1555                                         || 0x2170 <= cp && cp < 0x217A)
1556                                         continue;
1557
1558                                 if (cp ==  0x215B) // FIXME: why?
1559                                         fillIndex [0xC] += 2;
1560                                 else if (cp == 0x3021) // FIXME: why?
1561                                         fillIndex [0xC]++;
1562                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1563
1564                                 if (addnew || cp <= '9') {
1565                                         int xcp;
1566                                         if (1 <= currValue && currValue <= 10) {
1567                                                 xcp = cp - 0x31 + 0x2776;
1568                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1569                                                 xcp = cp - 0x31 + 0x2780;
1570                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1571                                                 xcp = cp - 0x31 + 0x278A;
1572                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1573                                         }
1574                                         if (1 <= currValue && currValue <= 20) {
1575                                                 xcp = cp - 0x31 + 0x2460;
1576                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1577                                                 xcp = cp - 0x31 + 0x2474;
1578                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1579                                                 xcp = cp - 0x31 + 0x2488;
1580                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1581                                         }
1582                                 }
1583
1584                                 if (cp != 0x09E7 && cp != 0x09EA)
1585                                         fillIndex [0xC]++;
1586
1587                                 // Add special cases that are not regarded as
1588                                 // numbers in UnicodeCategory speak.
1589                                 if (cp == '5') {
1590                                         // TONE FIVE
1591                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1592                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1593                                 }
1594                                 else if (cp == '6') // FIXME: why?
1595                                         fillIndex [0xC]++;
1596                         }
1597
1598                         // 221E: infinity
1599                         fillIndex [0xC] = 0xFF;
1600                         AddCharMap ('\u221E', 0xC, 1);
1601                         #endregion
1602
1603                         #region Letters and NonSpacing Marks (general)
1604
1605                         // ASCII Latin alphabets
1606                         for (int i = 0; i < alphabets.Length; i++)
1607                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1608
1609
1610                         // non-ASCII Latin alphabets
1611                         // FIXME: there is no such characters that are placed
1612                         // *after* "alphabets" array items. This is nothing
1613                         // more than a hack that creates dummy weight for
1614                         // primary characters.
1615                         for (int i = 0x0080; i < 0x0300; i++) {
1616                                 if (!Char.IsLetter ((char) i))
1617                                         continue;
1618                                 // For those Latin Letters which has NFKD are
1619                                 // not added as independent primary character.
1620                                 if (decompIndex [i] != 0)
1621                                         continue;
1622                                 // SPECIAL CASES:
1623                                 // 1.some alphabets have primarily
1624                                 //   equivalent ASCII alphabets.
1625                                 // 2.some have independent primary weights,
1626                                 //   but inside a-to-z range.
1627                                 // 3.there are some expanded characters that
1628                                 //   are not part of Unicode Standard NFKD.
1629                                 switch (i) {
1630                                 // 1. skipping them does not make sense
1631 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
1632 //                              case 0x184: case 0x185: case 0x186: case 0x189:
1633 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
1634 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
1635 //                              case 0x19B: case 0x19C:
1636                                 // 2. skipping them does not make sense
1637 //                              case 0x14A: // Ng
1638 //                              case 0x14B: // ng
1639                                 // 3.
1640                                 case 0xC6: // AE
1641                                 case 0xE6: // ae
1642                                 case 0xDE: // Icelandic Thorn
1643                                 case 0xFE: // Icelandic Thorn
1644                                 case 0xDF: // German ss
1645                                 case 0xFF: // German ss
1646                                 // not classified yet
1647 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1648 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1649 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1650 //                              case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1651 //                              case 0x1DD:
1652                                         continue;
1653                                 }
1654                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
1655                         }
1656
1657                         // Greek and Coptic
1658                         fillIndex [0xF] = 02;
1659                         for (int i = 0x0380; i < 0x0390; i++)
1660                                 if (Char.IsLetter ((char) i))
1661                                         AddLetterMap ((char) i, 0xF, 1);
1662                         fillIndex [0xF] = 02;
1663                         for (int i = 0x0391; i < 0x03CF; i++)
1664                                 if (Char.IsLetter ((char) i))
1665                                         AddLetterMap ((char) i, 0xF, 1);
1666                         fillIndex [0xF] = 0x40;
1667                         for (int i = 0x03D0; i < 0x0400; i++)
1668                                 if (Char.IsLetter ((char) i))
1669                                         AddLetterMap ((char) i, 0xF, 1);
1670
1671                         // Cyrillic - UCA order w/ some modification
1672                         fillIndex [0x10] = 0x3;
1673                         // table which is mostly from UCA DUCET.
1674                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1675                                 char c = orderedCyrillic [i];
1676                                 if (Char.IsLetter (c))
1677                                         AddLetterMap (c, 0x10, 3);
1678                         }
1679                         for (int i = 0x0460; i < 0x0481; i++) {
1680                                 if (Char.IsLetter ((char) i))
1681                                         AddLetterMap ((char) i, 0x10, 3);
1682                         }
1683
1684                         // Armenian
1685                         fillIndex [0x11] = 0x3;
1686                         for (int i = 0x0531; i < 0x0586; i++)
1687                                 if (Char.IsLetter ((char) i))
1688                                         AddLetterMap ((char) i, 0x11, 1);
1689
1690                         // Hebrew
1691                         // -Letters
1692                         fillIndex [0x12] = 0x3;
1693                         for (int i = 0x05D0; i < 0x05FF; i++)
1694                                 if (Char.IsLetter ((char) i))
1695                                         AddLetterMap ((char) i, 0x12, 1);
1696                         // -Accents
1697                         fillIndex [0x1] = 0x3;
1698                         for (int i = 0x0591; i <= 0x05C2; i++)
1699                                 if (i != 0x05BE)
1700                                         AddCharMap ((char) i, 0x1, 1);
1701
1702                         // Arabic
1703                         fillIndex [0x1] = 0x8E;
1704                         fillIndex [0x13] = 0x3;
1705                         for (int i = 0x0621; i <= 0x064A; i++) {
1706                                 // Abjad
1707                                 if (Char.GetUnicodeCategory ((char) i)
1708                                         != UnicodeCategory.OtherLetter) {
1709                                         // FIXME: arabic nonspacing marks are
1710                                         // in different order.
1711                                         AddCharMap ((char) i, 0x1, 1);
1712                                         continue;
1713                                 }
1714 //                              map [i] = new CharMapEntry (0x13,
1715 //                                      (byte) arabicLetterPrimaryValues [i], 1);
1716                                 fillIndex [0x13] =
1717                                         (byte) arabicLetterPrimaryValues [i];
1718                                 AddLetterMap ((char) i, 0x13, 0);
1719                         }
1720                         fillIndex [0x13] = 0x84;
1721                         for (int i = 0x0674; i < 0x06D6; i++)
1722                                 if (Char.IsLetter ((char) i))
1723                                         AddLetterMap ((char) i, 0x13, 1);
1724
1725                         // Devanagari
1726                         // FIXME: it does seem straight codepoint mapping.
1727                         fillIndex [0x14] = 04;
1728                         for (int i = 0x0901; i < 0x0905; i++)
1729                                 if (!IsIgnorable (i))
1730                                         AddLetterMap ((char) i, 0x14, 2);
1731                         fillIndex [0x14] = 0xB;
1732                         for (int i = 0x0905; i < 0x093A; i++)
1733                                 if (Char.IsLetter ((char) i))
1734                                         AddLetterMap ((char) i, 0x14, 4);
1735                         for (int i = 0x093E; i < 0x094F; i++)
1736                                 if (!IsIgnorable (i))
1737                                         AddLetterMap ((char) i, 0x14, 2);
1738
1739                         // Bengali
1740                         // -Letters
1741                         fillIndex [0x15] = 02;
1742                         for (int i = 0x0980; i < 0x9FF; i++) {
1743                                 if (IsIgnorable (i))
1744                                         continue;
1745                                 if (i == 0x09E0)
1746                                         fillIndex [0x15] = 0x3B;
1747                                 switch (Char.GetUnicodeCategory ((char) i)) {
1748                                 case UnicodeCategory.NonSpacingMark:
1749                                 case UnicodeCategory.DecimalDigitNumber:
1750                                 case UnicodeCategory.OtherNumber:
1751                                         continue;
1752                                 }
1753                                 AddLetterMap ((char) i, 0x15, 1);
1754                         }
1755                         // -Signs
1756                         fillIndex [0x1] = 0x3;
1757                         for (int i = 0x0981; i < 0x0A00; i++)
1758                                 if (Char.GetUnicodeCategory ((char) i) ==
1759                                         UnicodeCategory.NonSpacingMark)
1760                                         AddCharMap ((char) i, 0x1, 1);
1761
1762                         // Gurmukhi. orderedGurmukhi is from UCA
1763                         // FIXME: it does not look equivalent to UCA.
1764                         fillIndex [0x1] = 03;
1765                         fillIndex [0x16] = 02;
1766                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
1767                                 char c = orderedGurmukhi [i];
1768                                 if (IsIgnorable ((int) c))
1769                                         continue;
1770                                 if (!Char.IsLetter (c)) {
1771                                         AddLetterMap (c, 0x1, 1);
1772                                         continue;
1773                                 }
1774                                 if (c == '\u0A3C' || c == '\u0A4D' ||
1775                                         '\u0A66' <= c && c <= '\u0A71')
1776                                         continue;
1777                                 AddLetterMap (c, 0x16, 4);
1778                         }
1779
1780                         // Gujarati. orderedGujarati is from UCA
1781                         fillIndex [0x17] = 02;
1782                         for (int i = 0; i < orderedGujarati.Length; i++)
1783                                 AddLetterMap (orderedGujarati [i], 0x17, 4);
1784
1785                         // Oriya
1786                         fillIndex [0x18] = 02;
1787                         for (int i = 0x0B00; i < 0x0B7F; i++) {
1788                                 switch (Char.GetUnicodeCategory ((char) i)) {
1789                                 case UnicodeCategory.NonSpacingMark:
1790                                 case UnicodeCategory.DecimalDigitNumber:
1791                                         continue;
1792                                 }
1793                                 AddLetterMap ((char) i, 0x18, 1);
1794                         }
1795
1796                         // Tamil
1797                         fillIndex [0x19] = 2;
1798                         AddCharMap ('\u0BD7', 0x19, 0);
1799                         fillIndex [0x19] = 0xA;
1800                         // vowels
1801                         for (int i = 0x0BD7; i < 0x0B94; i++)
1802                                 if (Char.IsLetter ((char) i))
1803                                         AddCharMap ((char) i, 0x19, 2);
1804                         // special vowel
1805                         fillIndex [0x19] = 0x24;
1806                         AddCharMap ('\u0B94', 0x19, 0);
1807                         fillIndex [0x19] = 0x26;
1808                         // The array for Tamil consonants is a constant.
1809                         // Windows uses a sequence similar to TAM from
1810                         // tamilnet, but slightly different in Grantha.
1811                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
1812                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1813                         // combining marks
1814                         fillIndex [0x19] = 0x82;
1815                         for (int i = 0x0BBE; i < 0x0BCD; i++)
1816                                 if (Char.GetUnicodeCategory ((char) i) ==
1817                                         UnicodeCategory.SpacingCombiningMark
1818                                         || i == 0x0BC0)
1819                                         AddLetterMap ((char) i, 0x19, 2);
1820
1821                         // Telugu
1822                         fillIndex [0x1A] = 0x4;
1823                         for (int i = 0x0C00; i < 0x0C62; i++) {
1824                                 if (i == 0x0C55 || i == 0x0C56)
1825                                         continue; // skip
1826                                 AddCharMap ((char) i, 0x1A, 3);
1827                                 char supp = (i == 0x0C0B) ? '\u0C60':
1828                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
1829                                 if (supp == char.MinValue)
1830                                         continue;
1831                                 AddCharMap (supp, 0x1A, 3);
1832                         }
1833
1834                         // Kannada
1835                         fillIndex [0x1B] = 4;
1836                         for (int i = 0x0C80; i < 0x0CE5; i++) {
1837                                 if (i == 0x0CD5 || i == 0x0CD6)
1838                                         continue; // ignore
1839                                 AddCharMap ((char) i, 0x1B, 3);
1840                         }
1841
1842                         // Malayalam
1843                         fillIndex [0x1C] = 2;
1844                         for (int i = 0x0D02; i < 0x0D61; i++)
1845                                 // FIXME: I avoided MSCompatUnicodeTable usage
1846                                 // here (it results in recursion). So check if
1847                                 // using NonSpacingMark makes sense or not.
1848                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1849 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1850                                         AddCharMap ((char) i, 0x1C, 1);
1851
1852                         // Thai ... note that it breaks 0x1E wall after E2B!
1853                         // Also, all Thai characters have level 2 value 3.
1854                         fillIndex [0x1E] = 2;
1855                         for (int i = 0xE44; i < 0xE48; i++)
1856                                 AddCharMap ((char) i, 0x1E, 1, 3);
1857                         for (int i = 0xE01; i < 0xE2B; i++)
1858                                 AddCharMap ((char) i, 0x1E, 6, 0);
1859                         fillIndex [0x1F] = 5;
1860                         for (int i = 0xE2B; i < 0xE30; i++)
1861                                 AddCharMap ((char) i, 0x1F, 6, 0);
1862                         for (int i = 0xE30; i < 0xE3B; i++)
1863                                 AddCharMap ((char) i, 0x1F, 1, 3);
1864                         // some Thai characters remain.
1865                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
1866                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1867                         foreach (char c in specialThai)
1868                                 AddCharMap (c, 0x1F, 1);
1869
1870                         // Lao
1871                         fillIndex [0x1F] = 2;
1872                         for (int i = 0xE80; i < 0xEDF; i++)
1873                                 if (Char.IsLetter ((char) i))
1874                                         AddCharMap ((char) i, 0x1F, 1);
1875
1876                         // Georgian. orderedGeorgian is from UCA DUCET.
1877                         fillIndex [0x21] = 5;
1878                         for (int i = 0; i < orderedGeorgian.Length; i++)
1879                                 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1880
1881                         // Japanese Kana.
1882                         fillIndex [0x22] = 2;
1883                         int kanaOffset = 0x3041;
1884                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1885
1886                         for (int gyo = 0; gyo < 9; gyo++) {
1887                                 for (int dan = 0; dan < 5; dan++) {
1888                                         if (gyo == 7 && dan % 2 == 1) {
1889                                                 // 'ya'-gyo
1890                                                 fillIndex [0x22]++;
1891                                                 kanaOffset -= 2; // There is no space for yi and ye.
1892                                                 continue;
1893                                         }
1894                                         int cp = kanaOffset + dan * kanaLines [gyo];
1895                                         // small lines (a-gyo, ya-gyo)
1896                                         if (gyo == 0 || gyo == 7) {
1897                                                 AddKanaMap (cp, 1); // small
1898                                                 AddKanaMap (cp + 1, 1);
1899                                         }
1900                                         else
1901                                                 AddKanaMap (cp, kanaLines [gyo]);
1902                                         fillIndex [0x22]++;
1903
1904                                         if (cp == 0x3061) {
1905                                                 // add small 'Tsu' (before normal one)
1906                                                 AddKanaMap (0x3063, 1);
1907                                                 kanaOffset++;
1908                                         }
1909                                 }
1910                                 fillIndex [0x22] += 3;
1911                                 kanaOffset += 5 * kanaLines [gyo];
1912                         }
1913
1914                         // Wa-gyo is almost special, so I just manually add.
1915                         AddLetterMap ((char) 0x308E, 0x22, 0);
1916                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1917                         AddLetterMap ((char) 0x308F, 0x22, 0);
1918                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1919                         fillIndex [0x22]++;
1920                         AddLetterMap ((char) 0x3090, 0x22, 0);
1921                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1922                         fillIndex [0x22] += 2;
1923                         // no "Wu" in Japanese.
1924                         AddLetterMap ((char) 0x3091, 0x22, 0);
1925                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1926                         fillIndex [0x22]++;
1927                         AddLetterMap ((char) 0x3092, 0x22, 0);
1928                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1929                         // Nn
1930                         fillIndex [0x22] = 0x80;
1931                         AddLetterMap ((char) 0x3093, 0x22, 0);
1932                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1933
1934                         // JIS Japanese square chars.
1935                         fillIndex [0x22] = 0x97;
1936                         jisJapanese.Sort (JISComparer.Instance);
1937                         foreach (JISCharacter j in jisJapanese)
1938                                 AddCharMap ((char) j.CP, 0x22, 1);
1939                         // non-JIS Japanese square chars.
1940                         nonJisJapanese.Sort (NonJISComparer.Instance);
1941                         foreach (NonJISCharacter j in nonJisJapanese)
1942                                 AddCharMap ((char) j.CP, 0x22, 1);
1943
1944                         // Bopomofo
1945                         fillIndex [0x23] = 0x02;
1946                         for (int i = 0x3105; i <= 0x312C; i++)
1947                                 AddCharMap ((char) i, 0x23, 1);
1948
1949                         // Estrangela: ancient Syriac
1950                         fillIndex [0x24] = 0x0B;
1951                         // FIXME: is 0x71E really alternative form?
1952                         ArrayList syriacAlternatives = new ArrayList (
1953                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1954                         for (int i = 0x0710; i <= 0x072C; i++) {
1955                                 if (i == 0x0711) // NonSpacingMark
1956                                         continue;
1957                                 if (syriacAlternatives.Contains (i))
1958                                         continue;
1959                                 AddCharMap ((char) i, 0x24, 4);
1960                                 // FIXME: why?
1961                                 if (i == 0x721)
1962                                         fillIndex [0x24]++;
1963                         }
1964                         foreach (int cp in syriacAlternatives)
1965                                 map [cp] = new CharMapEntry (0x24,
1966                                         (byte) (map [cp - 1].Level1 + 2),
1967                                         0);
1968
1969                         // Thaana
1970                         // FIXME: it turned out that it does not look like UCA
1971                         fillIndex [0x24] = 0x6E;
1972                         for (int i = 0; i < orderedThaana.Length; i++) {
1973                                 if (IsIgnorableNonSpacing (i))
1974                                         continue;
1975                                 AddCharMap (orderedThaana [i], 0x24, 2);
1976                         }
1977                         #endregion
1978
1979                         // FIXME: Add more culture-specific letters (that are
1980                         // not supported in Windows collation) here.
1981
1982                         // Surrogate ... they are computed.
1983
1984                         #region Hangul
1985                         // Hangul.
1986                         //
1987                         // Unlike UCA Windows Hangul sequence mixes Jongseong
1988                         // with Choseong sequence as well as Jungseong,
1989                         // adjusted to have the same primary weight for the
1990                         // same base character. So it is impossible to compute
1991                         // those sort keys.
1992                         //
1993                         // Here I introduce an ordered sequence of mixed
1994                         // 'commands' and 'characters' that is similar to
1995                         // LDML text:
1996                         //      - ',' increases primary weight.
1997                         //      - [A B] means a range, increasing index
1998                         //      - {A B} means a range, without increasing index
1999                         //      - '=' is no operation (it means the characters
2000                         //        of both sides have the same weight).
2001                         //      - '>' inserts a Hangul Syllable block that
2002                         //        contains 0x251 characters.
2003                         //      - '<' decreases the index
2004                         //      - '0'-'9' means skip count
2005                         //      - whitespaces are ignored
2006                         //
2007
2008                         string hangulSequence =
2009                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2010                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2011                         + "<{\u1113 \u1116}, \u3165,"
2012                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2013                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2014                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
2015                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2016                                 + "[\u11D1 \u11D2], \u11B2,"
2017                                 + "[\u11D3 \u11D5], \u11B3,"
2018                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2019                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2020                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2021                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2022                         + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
2023                                 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
2024                         + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
2025                                 + "\u11EA,, \u110A=\u11BB,,, >"
2026                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2027                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2028                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2029                                 + "\u11F1,, \u11F2,,,"
2030                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2031                         + "<\u114D, \u110D,,  >"
2032                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2033                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2034                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2035                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2036                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2037                                 + "[\u11F5 \u11F8]"
2038                         ;
2039
2040                         byte hangulCat = 0x52;
2041                         fillIndex [hangulCat] = 0x2;
2042
2043                         int syllableBlock = 0;
2044                         for (int n = 0; n < hangulSequence.Length; n++) {
2045                                 char c = hangulSequence [n];
2046                                 int start, end;
2047                                 if (Char.IsWhiteSpace (c))
2048                                         continue;
2049                                 switch (c) {
2050                                 case '=':
2051                                         break; // NOP
2052                                 case ',':
2053                                         IncrementSequentialIndex (ref hangulCat);
2054                                         break;
2055                                 case '<':
2056                                         if (fillIndex [hangulCat] == 2)
2057                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2058                                         fillIndex [hangulCat]--;
2059                                         break;
2060                                 case '>':
2061                                         IncrementSequentialIndex (ref hangulCat);
2062                                         for (int l = 0; l < 0x15; l++)
2063                                                 for (int v = 0; v < 0x1C; v++) {
2064                                                         AddCharMap (
2065                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2066                                                         IncrementSequentialIndex (ref hangulCat);
2067                                                 }
2068                                         syllableBlock++;
2069                                         break;
2070                                 case '[':
2071                                         start = hangulSequence [n + 1];
2072                                         end = hangulSequence [n + 3];
2073                                         for (int i = start; i <= end; i++) {
2074                                                 AddCharMap ((char) i, hangulCat, 0);
2075                                                 if (end > i)
2076                                                         IncrementSequentialIndex (ref hangulCat);
2077                                         }
2078                                         n += 4; // consumes 5 characters for this operation
2079                                         break;
2080                                 case '{':
2081                                         start = hangulSequence [n + 1];
2082                                         end = hangulSequence [n + 3];
2083                                         for (int i = start; i <= end; i++)
2084                                                 AddCharMap ((char) i, hangulCat, 0);
2085                                         n += 4; // consumes 5 characters for this operation
2086                                         break;
2087                                 default:
2088                                         AddCharMap (c, hangulCat, 0);
2089                                         break;
2090                                 }
2091                         }
2092
2093                         // Some Jamo NFKD.
2094                         for (int i = 0x3200; i < 0x3300; i++) {
2095                                 if (IsIgnorable (i) || map [i].Defined)
2096                                         continue;
2097                                 int ch = 0;
2098                                 // w/ bracket
2099                                 if (decompLength [i] == 4 &&
2100                                         decompValues [decompIndex [i]] == '(')
2101                                         ch = decompIndex [i] + 1;
2102                                 // circled
2103                                 else if (decompLength [i] == 2 &&
2104                                         decompValues [decompIndex [i] + 1] == '\u1161')
2105                                         ch = decompIndex [i];
2106                                 else if (decompLength [i] == 1)
2107                                         ch = decompIndex [i];
2108                                 else
2109                                         continue;
2110                                 ch = decompValues [ch];
2111                                 if (ch < 0x1100 || 0x1200 < ch &&
2112                                         ch < 0xAC00 || 0xD800 < ch)
2113                                         continue;
2114                                 map [i] = new CharMapEntry (map [ch].Category,
2115                                         (byte) (map [ch].Level1 + 1),
2116                                         map [ch].Level2);
2117 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2118                         }
2119
2120
2121                         #endregion
2122
2123                         // Letterlike characters and CJK compatibility square
2124                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2125                         int [] counts = new int ['Z' - 'A' + 1];
2126                         char [] namedChars = new char [sortableCharNames.Count];
2127                         int nCharNames = 0;
2128                         foreach (DictionaryEntry de in sortableCharNames) {
2129                                 counts [((string) de.Value) [0] - 'A']++;
2130                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2131                         }
2132                         nCharNames = 0; // reset
2133                         for (int a = 0; a < counts.Length; a++) {
2134                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2135                                 for (int i = 0; i < counts [a]; i++)
2136 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2137                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2138                         }
2139
2140                         // CJK unified ideograph.
2141                         byte cjkCat = 0x9E;
2142                         fillIndex [cjkCat] = 0x2;
2143                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2144                                 if (!IsIgnorable (cp))
2145                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2146                         // CJK Extensions go here.
2147                         // LAMESPEC: With this Windows style CJK layout, it is
2148                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2149                         // 0x9FBB can never be added w/o breaking compat.
2150                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2151                                 if (!IsIgnorable (cp))
2152                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2153
2154                         // PrivateUse ... computed.
2155                         // remaining Surrogate ... computed.
2156
2157                         #region Special "biggest" area (FF FF)
2158                         fillIndex [0xFF] = 0xFF;
2159                         char [] specialBiggest = new char [] {
2160                                 '\u3005', '\u3031', '\u3032', '\u309D',
2161                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2162                                 '\uFE7C', '\uFE7D', '\uFF70'};
2163                         foreach (char c in specialBiggest)
2164                                 AddCharMap (c, 0xFF, 0);
2165                         #endregion
2166
2167                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2168                         // non-alphanumeric ASCII except for: + - < = > '
2169                         for (int i = 0x21; i < 0x7F; i++) {
2170                                 if (Char.IsLetterOrDigit ((char) i)
2171                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2172                                         continue; // they are not added here.
2173                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2174                                 // Insert 3001 after ',' and 3002 after '.'
2175                                 if (i == 0x2C)
2176                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2177                                 else if (i == 0x2E) {
2178                                         fillIndex [0x7]--;
2179                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2180                                 }
2181                                 else if (i == 0x3A)
2182                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2183                         }
2184                         #endregion
2185
2186                         #region 07 - Punctuations and something else
2187                         for (int i = 0xA0; i < char.MaxValue; i++) {
2188                                 if (IsIgnorable (i))
2189                                         continue;
2190
2191                                 // SPECIAL CASES:
2192                                 switch (i) {
2193                                 case 0xAB: // 08
2194                                 case 0xB7: // 0A
2195                                 case 0x2329: // 09
2196                                 case 0x232A: // 09
2197                                         continue;
2198                                 }
2199
2200                                 switch (Char.GetUnicodeCategory ((char) i)) {
2201                                 case UnicodeCategory.OtherPunctuation:
2202                                 case UnicodeCategory.ClosePunctuation:
2203                                 case UnicodeCategory.OpenPunctuation:
2204                                 case UnicodeCategory.InitialQuotePunctuation:
2205                                 case UnicodeCategory.FinalQuotePunctuation:
2206                                 case UnicodeCategory.ModifierSymbol:
2207                                         // SPECIAL CASES: // 0xA
2208                                         if (0x2020 <= i && i <= 0x2042)
2209                                                 continue;
2210                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2211                                         break;
2212                                 default:
2213                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2214                                                 goto case UnicodeCategory.OtherPunctuation;
2215                                         break;
2216                                 }
2217                         }
2218                         // Control pictures
2219                         for (int i = 0x2400; i <= 0x2421; i++)
2220                                 AddCharMap ((char) i, 0x7, 1, 0);
2221                         #endregion
2222
2223                         // FIXME: for 07 xx we need more love.
2224
2225                         // FIXME: 08 should be more complete.
2226                         fillIndex [0x8] = 2;
2227                         for (int cp = 0; cp < char.MaxValue; cp++)
2228                                 if (!map [cp].Defined &&
2229                                         Char.GetUnicodeCategory ((char) cp) ==
2230                                         UnicodeCategory.MathSymbol)
2231                                         AddCharMapGroup ((char) cp, 0x8, 1, 0);
2232
2233                         // Characters w/ diacritical marks (NFKD)
2234                         for (int i = 0; i <= char.MaxValue; i++) {
2235                                 if (map [i].Defined || IsIgnorable (i))
2236                                         continue;
2237                                 if (decompIndex [i] == 0)
2238                                         continue;
2239
2240                                 int start = decompIndex [i];
2241                                 int primaryChar = decompValues [start];
2242                                 int secondary = 0;
2243                                 bool skip = false;
2244                                 int length = decompLength [i];
2245                                 // special processing for parenthesized ones.
2246                                 if (length == 3 &&
2247                                         decompValues [start] == '(' &&
2248                                         decompValues [start + 2] == ')') {
2249                                         primaryChar = decompValues [start + 1];
2250                                         length = 1;
2251                                 }
2252
2253                                 if (map [primaryChar].Level1 == 0)
2254                                         continue;
2255
2256                                 for (int l = 1; l < length; l++) {
2257                                         int c = decompValues [start + l];
2258                                         if (map [c].Level1 != 0)
2259                                                 skip = true;
2260                                         secondary += diacritical [c];
2261                                 }
2262                                 if (skip)
2263                                         continue;
2264                                 map [i] = new CharMapEntry (
2265                                         map [primaryChar].Category,
2266                                         map [primaryChar].Level1,
2267                                         (byte) secondary);
2268
2269                         }
2270
2271                         #region Level2 adjustment
2272                         // Arabic Hamzah
2273                         diacritical [0x624] = 0x5;
2274                         diacritical [0x626] = 0x7;
2275                         diacritical [0x622] = 0x9;
2276                         diacritical [0x623] = 0xA;
2277                         diacritical [0x625] = 0xB;
2278                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2279                         diacritical [0x64A] = 0x7; // Yaa'
2280
2281
2282                         for (int i = 0; i < char.MaxValue; i++) {
2283                                 byte mod = 0;
2284                                 byte cat = map [i].Category;
2285                                 switch (cat) {
2286                                 case 0xE: // Latin diacritics
2287                                 case 0x22: // Japanese: circled characters
2288                                         mod = diacritical [i];
2289                                         break;
2290                                 case 0x13: // Arabic
2291                                         if (diacritical [i] == 0)
2292                                                 mod = 0x8; // default for arabic
2293                                         break;
2294                                 }
2295                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2296                                         mod = diacritical [i];
2297                                 if (mod > 0)
2298                                         map [i] = new CharMapEntry (
2299                                                 cat, map [i].Level1, mod);
2300                         }
2301                         #endregion
2302
2303                         // FIXME: this is hack but those which are
2304                         // NonSpacingMark characters and still undefined
2305                         // are likely to be nonspacing.
2306                         for (int i = 0; i < char.MaxValue; i++)
2307                                 if (!map [i].Defined &&
2308                                         !IsIgnorable (i) &&
2309                                         Char.GetUnicodeCategory ((char) i) ==
2310                                         UnicodeCategory.NonSpacingMark)
2311                                         AddCharMap ((char) i, 1, 1);
2312                 }
2313
2314                 private void IncrementSequentialIndex (ref byte hangulCat)
2315                 {
2316                         fillIndex [hangulCat]++;
2317                         if (fillIndex [hangulCat] == 0) { // overflown
2318                                 hangulCat++;
2319                                 fillIndex [hangulCat] = 0x2;
2320                         }
2321                 }
2322
2323                 // Reset fillIndex to fixed value and call AddLetterMap().
2324                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2325                 {
2326                         fillIndex [category] = alphaWeight;
2327                         AddLetterMap (c, category, 0);
2328
2329                         ArrayList al = latinMap [c] as ArrayList;
2330                         if (al == null)
2331                                 return;
2332
2333                         foreach (int cp in al)
2334                                 AddLetterMap ((char) cp, category, 0);
2335                 }
2336
2337                 private void AddKanaMap (int i, byte voices)
2338                 {
2339                         for (byte b = 0; b < voices; b++) {
2340                                 char c = (char) (i + b);
2341                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
2342                                 // Hiragana
2343                                 AddLetterMapCore (c, 0x22, 0, arg);
2344                                 // Katakana
2345                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
2346                         }
2347                 }
2348
2349                 private void AddLetterMap (char c, byte category, byte updateCount)
2350                 {
2351                         AddLetterMapCore (c, category, updateCount, 0);
2352                 }
2353
2354                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2355                 {
2356                         char c2;
2357                         // <small> updates index
2358                         c2 = ToSmallForm (c);
2359                         if (c2 != c)
2360                                 AddCharMapGroup (c2, category, updateCount, level2);
2361                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2362                         if (c2 != c && !map [(int) c2].Defined)
2363                                 AddLetterMapCore (c2, category, 0, level2);
2364                         bool doUpdate = true;
2365                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2366                                 doUpdate = false;
2367                         else
2368                                 AddCharMapGroup (c, category, 0, level2);
2369                         if (doUpdate)
2370                                 fillIndex [category] += updateCount;
2371                 }
2372
2373                 private bool AddCharMap (char c, byte category, byte increment)
2374                 {
2375                         return AddCharMap (c, category, increment, 0);
2376                 }
2377
2378                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2379                 {
2380                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2381                                 return false; // do nothing
2382                         map [(int) c] = new CharMapEntry (category,
2383                                 category == 1 ? alt : fillIndex [category],
2384                                 category == 1 ? fillIndex [category] : alt);
2385                         fillIndex [category] += increment;
2386                         return true;
2387                 }
2388
2389                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2390                 {
2391                         char c2 = ToSmallFormTail (c);
2392                         if (c2 != c)
2393                                 AddCharMap (c2, category, updateCount, 0);
2394                         // itself
2395                         AddCharMap (c, category, updateCount, 0);
2396                         // <full>
2397                         c2 = ToFullWidthTail (c);
2398                         if (c2 != c)
2399                                 AddCharMapGroupTail (c2, category, updateCount);
2400                 }
2401
2402                 //
2403                 // Adds characters to table in the order below
2404                 // (+ increases weight):
2405                 //      (<small> +)
2406                 //      itself
2407                 //      <fraction>
2408                 //      <full> | <super> | <sub>
2409                 //      <circle> | <wide> (| <narrow>)
2410                 //      +
2411                 //      (vertical +)
2412                 //
2413                 // level2 is fixed (does not increase).
2414                 int [] sameWeightItems = new int [] {
2415                         DecompositionFraction,
2416                         DecompositionFull,
2417                         DecompositionSuper,
2418                         DecompositionSub,
2419                         DecompositionCircle,
2420                         DecompositionWide,
2421                         DecompositionNarrow,
2422                         };
2423                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2424                 {
2425                         if (map [(int) c].Defined)
2426                                 return;
2427
2428                         char small = char.MinValue;
2429                         char vertical = char.MinValue;
2430                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2431                         if (nfkd != null) {
2432                                 object smv = nfkd [(byte) DecompositionSmall];
2433                                 if (smv != null)
2434                                         small = (char) ((int) smv);
2435                                 object vv = nfkd [(byte) DecompositionVertical];
2436                                 if (vv != null)
2437                                         vertical = (char) ((int) vv);
2438                         }
2439
2440                         // <small> updates index
2441                         if (small != char.MinValue)
2442                                 AddCharMap (small, category, updateCount);
2443
2444                         // itself
2445                         AddCharMap (c, category, 0, level2);
2446
2447                         if (nfkd != null) {
2448                                 foreach (int weight in sameWeightItems) {
2449                                         object wv = nfkd [(byte) weight];
2450                                         if (wv != null)
2451                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
2452                                 }
2453                         }
2454
2455                         // update index here.
2456                         fillIndex [category] += updateCount;
2457
2458                         if (vertical != char.MinValue)
2459                                 AddCharMap (vertical, category, updateCount, level2);
2460                 }
2461
2462                 private void AddCharMapCJK (char c, ref byte category)
2463                 {
2464                         AddCharMap (c, category, 0, 0);
2465                         IncrementSequentialIndex (ref category);
2466
2467                         // Special. I wonder why but Windows skips 9E F9.
2468                         if (category == 0x9E && fillIndex [category] == 0xF9)
2469                                 IncrementSequentialIndex (ref category);
2470                 }
2471
2472                 private void AddCharMapGroupCJK (char c, ref byte category)
2473                 {
2474                         AddCharMapCJK (c, ref category);
2475
2476                         // LAMESPEC: see below.
2477                         if (c == '\u52DE') {
2478                                 AddCharMapCJK ('\u3298', ref category);
2479                                 AddCharMapCJK ('\u3238', ref category);
2480                         }
2481                         if (c == '\u5BEB')
2482                                 AddCharMapCJK ('\u32A2', ref category);
2483                         if (c == '\u91AB')
2484                                 // Especially this mapping order totally does
2485                                 // not make sense to me.
2486                                 AddCharMapCJK ('\u32A9', ref category);
2487
2488                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2489                         if (nfkd == null)
2490                                 return;
2491                         for (byte weight = 0; weight <= 0x12; weight++) {
2492                                 object wv = nfkd [weight];
2493                                 if (wv == null)
2494                                         continue;
2495                                 int w = (int) wv;
2496
2497                                 // Special: they are ignored in this area.
2498                                 // FIXME: check if it is sane
2499                                 if (0xF900 <= w && w <= 0xFAD9)
2500                                         continue;
2501                                 // LAMESPEC: on Windows some of CJK characters
2502                                 // in 3200-32B0 are incorrectly mapped. They
2503                                 // mix Chinise and Japanese Kanji when
2504                                 // ordering those characters.
2505                                 switch (w) {
2506                                 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2507                                         continue;
2508                                 }
2509
2510                                 AddCharMapCJK ((char) w, ref category);
2511                         }
2512                 }
2513
2514                 // For now it is only for 0x7 category.
2515                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2516                 {
2517                         char small = char.MinValue;
2518                         char vertical = char.MinValue;
2519                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2520                         if (nfkd != null) {
2521                                 object smv = nfkd [(byte) DecompositionSmall];
2522                                 if (smv != null)
2523                                         small = (char) ((int) smv);
2524                                 object vv = nfkd [(byte) DecompositionVertical];
2525                                 if (vv != null)
2526                                         vertical = (char) ((int) vv);
2527                         }
2528
2529                         // <small> updates index
2530                         if (small != char.MinValue)
2531                                 // SPECIAL CASE excluded (FIXME: why?)
2532                                 if (small != '\u2024')
2533                                         AddCharMap (small, category, updateCount);
2534
2535                         // itself
2536                         AddCharMap (c, category, updateCount, level2);
2537
2538                         // Since nfkdMap is problematic to have two or more
2539                         // NFKD to an identical character, here I iterate all.
2540                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2541                                 if (decompLength [c2] == 1 &&
2542                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
2543                                         switch (decompType [c2]) {
2544                                         case DecompositionCompat:
2545                                                 AddCharMap ((char) c2, category, updateCount, level2);
2546                                                 break;
2547                                         }
2548                                 }
2549                         }
2550
2551                         if (vertical != char.MinValue)
2552                                 // SPECIAL CASE excluded (FIXME: why?)
2553                                 if (vertical != '\uFE33' && vertical != '\uFE34')
2554                                         AddCharMap (vertical, category, updateCount, level2);
2555                 }
2556
2557                 private void AddArabicCharMap (char c)
2558                 {
2559                         byte category = 6;
2560                         byte updateCount = 1;
2561                         byte level2 = 0;
2562
2563                         // itself
2564                         AddCharMap (c, category, 0, level2);
2565
2566                         // Since nfkdMap is problematic to have two or more
2567                         // NFKD to an identical character, here I iterate all.
2568                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2569                                 if (decompLength [c2] == 0)
2570                                         continue;
2571                                 int idx = decompIndex [c2] + decompLength [c2] - 1;
2572                                 if ((int) (decompValues [idx]) == (int) c)
2573                                         AddCharMap ((char) c2, category,
2574                                                 0, level2);
2575                         }
2576                         fillIndex [category] += updateCount;
2577                 }
2578
2579                 char ToFullWidth (char c)
2580                 {
2581                         return ToDecomposed (c, DecompositionFull, false);
2582                 }
2583
2584                 char ToFullWidthTail (char c)
2585                 {
2586                         return ToDecomposed (c, DecompositionFull, true);
2587                 }
2588
2589                 char ToSmallForm (char c)
2590                 {
2591                         return ToDecomposed (c, DecompositionSmall, false);
2592                 }
2593
2594                 char ToSmallFormTail (char c)
2595                 {
2596                         return ToDecomposed (c, DecompositionSmall, true);
2597                 }
2598
2599                 char ToDecomposed (char c, byte d, bool tail)
2600                 {
2601                         if (decompType [(int) c] != d)
2602                                 return c;
2603                         int idx = decompIndex [(int) c];
2604                         if (tail)
2605                                 idx += decompLength [(int) c] - 1;
2606                         return (char) decompValues [idx];
2607                 }
2608
2609                 bool ExistsJIS (int cp)
2610                 {
2611                         foreach (JISCharacter j in jisJapanese)
2612                                 if (j.CP == cp)
2613                                         return true;
2614                         return false;
2615                 }
2616
2617                 #endregion
2618
2619                 #region Level 3 properties (Case/Width)
2620
2621                 private byte ComputeLevel3Weight (char c)
2622                 {
2623                         byte b = ComputeLevel3WeightRaw (c);
2624                         return b > 0 ? (byte) (b + 2) : b;
2625                 }
2626
2627                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2628                 {
2629                         // Korean
2630                         if ('\u11A8' <= c && c <= '\u11F9')
2631                                 return 2;
2632                         if ('\uFFA0' <= c && c <= '\uFFDC')
2633                                 return 4;
2634                         if ('\u3130' <= c && c <= '\u3164')
2635                                 return 5;
2636                         // numbers
2637                         if ('\u2776' <= c && c <= '\u277F')
2638                                 return 4;
2639                         if ('\u2780' <= c && c <= '\u2789')
2640                                 return 8;
2641                         if ('\u2776' <= c && c <= '\u2793')
2642                                 return 0xC;
2643                         if ('\u2160' <= c && c <= '\u216F')
2644                                 return 0x18;
2645                         if ('\u2181' <= c && c <= '\u2182')
2646                                 return 0x18;
2647                         // Arabic
2648                         if ('\u2135' <= c && c <= '\u2138')
2649                                 return 4;
2650                         if ('\uFE80' <= c && c < '\uFE8E') {
2651                                 // 2(Isolated)/8(Final)/0x18(Medial)
2652                                 switch (decompType [(int) c]) {
2653                                 case DecompositionIsolated:
2654                                         return 2;
2655                                 case DecompositionFinal:
2656                                         return 8;
2657                                 case DecompositionMedial:
2658                                         return 0x18;
2659                                 }
2660                         }
2661
2662                         // actually I dunno the reason why they have weights.
2663                         switch (c) {
2664                         case '\u01BC':
2665                                 return 0x10;
2666                         case '\u06A9':
2667                                 return 0x20;
2668                         case '\u06AA':
2669                                 return 0x28;
2670                         }
2671
2672                         byte ret = 0;
2673                         switch (c) {
2674                         case '\u03C2':
2675                         case '\u2104':
2676                         case '\u212B':
2677                                 ret |= 8;
2678                                 break;
2679                         case '\uFE42':
2680                                 ret |= 0xC;
2681                                 break;
2682                         }
2683
2684                         // misc
2685                         switch (decompType [(int) c]) {
2686                         case DecompositionWide: // <wide>
2687                         case DecompositionSub: // <sub>
2688                         case DecompositionSuper: // <super>
2689                                 ret |= decompType [(int) c];
2690                                 break;
2691                         }
2692                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2693                                 ret |= 8;
2694                         if (isUppercase [(int) c]) // DerivedCoreProperties
2695                                 ret |= 0x10;
2696
2697                         return ret;
2698                 }
2699
2700                 #endregion
2701
2702                 #region IsIgnorable
2703 /*
2704                 static bool IsIgnorable (int i)
2705                 {
2706                         if (unicodeAge [i] >= 3.1)
2707                                 return true;
2708                         switch (char.GetUnicodeCategory ((char) i)) {
2709                         case UnicodeCategory.OtherNotAssigned:
2710                         case UnicodeCategory.Format:
2711                                 return true;
2712                         }
2713                         return false;
2714                 }
2715 */
2716
2717                 // FIXME: In the future use DerivedAge.txt to examine character
2718                 // versions and set those ones that have higher version than
2719                 // 1.0 as ignorable.
2720                 static bool IsIgnorable (int i)
2721                 {
2722                         switch (i) {
2723                         case 0:
2724                         // I guess, those characters are added between
2725                         // Unicode 1.0 (LCMapString) and Unicode 3.1
2726                         // (UnicodeCategory), so they used to be
2727                         // something like OtherNotAssigned as of Unicode 1.1.
2728                         case 0x2df: case 0x387:
2729                         case 0x3d7: case 0x3d8: case 0x3d9:
2730                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2731                         case 0x400: case 0x40d: case 0x450: case 0x45d:
2732                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
2733                         case 0x653: case 0x654: case 0x655: case 0x66d:
2734                         case 0xb56:
2735                         case 0x1e9b: case 0x202f: case 0x20ad:
2736                         case 0x20ae: case 0x20af:
2737                         case 0x20e2: case 0x20e3:
2738                         case 0x2139: case 0x213a: case 0x2183:
2739                         case 0x2425: case 0x2426: case 0x2619:
2740                         case 0x2670: case 0x2671: case 0x3007:
2741                         case 0x3190: case 0x3191:
2742                         case 0xfffc: case 0xfffd:
2743                                 return true;
2744                         // exceptional characters filtered by the
2745                         // following conditions. Originally those exceptional
2746                         // ranges are incorrect (they should not be ignored)
2747                         // and most of those characters are unfortunately in
2748                         // those ranges.
2749                         case 0x4d8: case 0x4d9:
2750                         case 0x4e8: case 0x4e9:
2751                         case 0x3036: case 0x303f:
2752                         case 0x337b: case 0xfb1e:
2753                                 return false;
2754                         }
2755
2756                         if (
2757                                 // The whole Sinhala characters.
2758                                 0x0D82 <= i && i <= 0x0DF4
2759                                 // The whole Tibetan characters.
2760                                 || 0x0F00 <= i && i <= 0x0FD1
2761                                 // The whole Myanmar characters.
2762                                 || 0x1000 <= i && i <= 0x1059
2763                                 // The whole Etiopic, Cherokee,
2764                                 // Canadian Syllablic, Ogham, Runic,
2765                                 // Tagalog, Hanunoo, Philippine,
2766                                 // Buhid, Tagbanwa, Khmer and Mongorian
2767                                 // characters.
2768                                 || 0x1200 <= i && i <= 0x1DFF
2769                                 // Greek extension characters.
2770                                 || 0x1F00 <= i && i <= 0x1FFF
2771                                 // The whole Braille characters.
2772                                 || 0x2800 <= i && i <= 0x28FF
2773                                 // CJK radical characters.
2774                                 || 0x2E80 <= i && i <= 0x2EF3
2775                                 // Kangxi radical characters.
2776                                 || 0x2F00 <= i && i <= 0x2FD5
2777                                 // Ideographic description characters.
2778                                 || 0x2FF0 <= i && i <= 0x2FFB
2779                                 // Bopomofo letter and final
2780                                 || 0x31A0 <= i && i <= 0x31B7
2781                                 // White square with quadrant characters.
2782                                 || 0x25F0 <= i && i <= 0x25F7
2783                                 // Ideographic telegraph symbols.
2784                                 || 0x32C0 <= i && i <= 0x32CB
2785                                 || 0x3358 <= i && i <= 0x3370
2786                                 || 0x33E0 <= i && i <= 0x33FF
2787                                 // The whole YI characters.
2788                                 || 0xA000 <= i && i <= 0xA48C
2789                                 || 0xA490 <= i && i <= 0xA4C6
2790                                 // Armenian small ligatures
2791                                 || 0xFB13 <= i && i <= 0xFB17
2792                                 // hebrew, arabic, variation selector.
2793                                 || 0xFB1D <= i && i <= 0xFE2F
2794                                 // Arabic ligatures.
2795                                 || 0xFEF5 <= i && i <= 0xFEFC
2796                                 // FIXME: why are they excluded?
2797                                 || 0x01F6 <= i && i <= 0x01F9
2798                                 || 0x0218 <= i && i <= 0x0233
2799                                 || 0x02A9 <= i && i <= 0x02AD
2800                                 || 0x02EA <= i && i <= 0x02EE
2801                                 || 0x0349 <= i && i <= 0x036F
2802                                 || 0x0488 <= i && i <= 0x048F
2803                                 || 0x04D0 <= i && i <= 0x04FF
2804                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2805                                 || 0x06D6 <= i && i <= 0x06ED
2806                                 || 0x06FA <= i && i <= 0x06FE
2807                                 || 0x2048 <= i && i <= 0x204D
2808                                 || 0x20e4 <= i && i <= 0x20ea
2809                                 || 0x213C <= i && i <= 0x214B
2810                                 || 0x21EB <= i && i <= 0x21FF
2811                                 || 0x22F2 <= i && i <= 0x22FF
2812                                 || 0x237B <= i && i <= 0x239A
2813                                 || 0x239B <= i && i <= 0x23CF
2814                                 || 0x24EB <= i && i <= 0x24FF
2815                                 || 0x2596 <= i && i <= 0x259F
2816                                 || 0x25F8 <= i && i <= 0x25FF
2817                                 || 0x2672 <= i && i <= 0x2689
2818                                 || 0x2768 <= i && i <= 0x2775
2819                                 || 0x27d0 <= i && i <= 0x27ff
2820                                 || 0x2900 <= i && i <= 0x2aff
2821                                 || 0x3033 <= i && i <= 0x303F
2822                                 || 0x31F0 <= i && i <= 0x31FF
2823                                 || 0x3250 <= i && i <= 0x325F
2824                                 || 0x32B1 <= i && i <= 0x32BF
2825                                 || 0x3371 <= i && i <= 0x337B
2826                                 || 0xFA30 <= i && i <= 0xFA6A
2827                         )
2828                                 return true;
2829
2830                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2831                         switch (uc) {
2832                         case UnicodeCategory.PrivateUse:
2833                         case UnicodeCategory.Surrogate:
2834                                 return false;
2835                         // ignored by nature
2836                         case UnicodeCategory.Format:
2837                         case UnicodeCategory.OtherNotAssigned:
2838                                 return true;
2839                         default:
2840                                 return false;
2841                         }
2842                 }
2843
2844                 // To check IsIgnorable sanity, try the driver below under MS.NET.
2845
2846                 /*
2847                 public static void Main ()
2848                 {
2849                         for (int i = 0; i <= char.MaxValue; i++)
2850                                 Dump (i, IsIgnorable (i));
2851                 }
2852
2853                 static void Dump (int i, bool ignore)
2854                 {
2855                         switch (Char.GetUnicodeCategory ((char) i)) {
2856                         case UnicodeCategory.PrivateUse:
2857                         case UnicodeCategory.Surrogate:
2858                                 return; // check nothing
2859                         }
2860
2861                         string s1 = "";
2862                         string s2 = new string ((char) i, 10);
2863                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2864                         if ((ret == 0) == ignore)
2865                                 return;
2866                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2867                 }
2868                 */
2869                 #endregion // IsIgnorable
2870
2871                 #region IsIgnorableSymbol
// Decides whether code point i counts as an ignorable symbol.
// The answer is looked up in this order:
//   1. anything IsIgnorable () accepts,
//   2. a hand-maintained list of individual code points,
//   3. a few code point ranges,
//   4. rules based on the Unicode category of (char) i.
// NOTE(review): presumably this mirrors what MS.NET does under
// CompareOptions.IgnoreSymbols, as the sanity driver that follows
// this method suggests -- confirm against that driver.
static bool IsIgnorableSymbol (int i)
{
        // Completely ignorable characters are ignorable symbols too.
        if (IsIgnorable (i))
                return true;

        // Individually pinned code points. They win over every
        // range and category rule further down.
        switch (i) {
        // ---- ignored ----
        // *Letter
        case 0x00b5:
        case 0x01C0: case 0x01C1: case 0x01C2: case 0x01C3:
        case 0x01F6: case 0x01F7: case 0x01F8: case 0x01F9:
        case 0x02D0: case 0x02EE:
        case 0x037A: case 0x03D7: case 0x03F3:
        case 0x0400: case 0x040d:
        case 0x0450: case 0x045d:
        case 0x048C: case 0x048D: case 0x048E: case 0x048F:
        case 0x0587: case 0x0640:
        case 0x06E5: case 0x06E6:
        case 0x06FA: case 0x06FB: case 0x06FC:
        case 0x093D: case 0x0950:
        case 0x1E9B: case 0x2139: case 0x3006:
        case 0x3033: case 0x3034: case 0x3035:
        case 0xFE7E: case 0xFE7F:
        // OtherNumber
        case 0x16EE: case 0x16EF: case 0x16F0:
        // LetterNumber
        case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
        case 0x3007: // IDEOGRAPHIC NUMBER ZERO
        case 0x3038: // HANGZHOU NUMERAL TEN
        case 0x3039: // HANGZHOU NUMERAL TWENTY
        case 0x303a: // HANGZHOU NUMERAL THIRTY
        // OtherSymbol
        case 0x2117:
        case 0x327F:
                return true;

        // ---- not ignored ----
        // ModifierSymbol
        case 0x02B9: case 0x02BA:
        case 0x02C2: case 0x02C3: case 0x02C4: case 0x02C5:
        case 0x02C8:
        case 0x02CC: case 0x02CD: case 0x02CE: case 0x02CF:
        case 0x02D2: case 0x02D3: case 0x02D4: case 0x02D5:
        case 0x02D6: case 0x02D7:
        case 0x02DE:
        case 0x02E5: case 0x02E6: case 0x02E7: case 0x02E8:
        case 0x02E9:
        case 0x309B: case 0x309C:
        // OtherPunctuation
        case 0x055A: // Armenian apostrophe
        case 0x05C0: // Hebrew Punct
        case 0x0E4F: // Thai FONGMAN
        case 0x0E5A: // Thai ANGKHANKHU
        case 0x0E5B: // Thai KHOMUT
        // CurrencySymbol
        case 0x09F2: // Bengali Rupee Mark
        case 0x09F3: // Bengali Rupee Sign
        // MathSymbol
        case 0x221e: // INF.
        // OtherSymbol
        case 0x0482:
        case 0x09FA:
        case 0x0B70:
                return false;
        }

        // *Letter ranges that are ignored as a whole.
        if (i >= 0xFE70 && i <= 0xFE7B) // ARABIC LIGATURES B
                return true;
#if NET_2_0
        if (i >= 0x0501 && i <= 0x0510) // CYRILLIC KOMI
                return true;
        if (i >= 0xFA30 && i <= 0xFA6F) // CJK COMPAT
                return true;
#endif

        // Everything else is decided from the character category.
        char c = (char) i;
        UnicodeCategory category = Char.GetUnicodeCategory (c);

        if (category == UnicodeCategory.Surrogate)
                return false; // inconsistent

        if (category == UnicodeCategory.SpacingCombiningMark
                || category == UnicodeCategory.EnclosingMark
                || category == UnicodeCategory.NonSpacingMark
                || category == UnicodeCategory.PrivateUse)
                // Of these, only the Arabic marks are ignored.
                return i >= 0x064B && i <= 0x0652;

        if (category == UnicodeCategory.Format
                || category == UnicodeCategory.OtherNotAssigned)
                return true;

        // OtherSymbols ranges. A code point inside them is never
        // ignored, whether or not Char.IsLetterOrDigit () holds
        // for it: both outcomes end up returning false.
        bool keptRange =
                // latin in a circle
                i >= 0x249A && i <= 0x24E9
                || i >= 0x2100 && i <= 0x2132
                // Japanese
                || i >= 0x3196 && i <= 0x31A0
                // Korean
                || i >= 0x3200 && i <= 0x321C
                // Chinese/Japanese
                || i >= 0x322A && i <= 0x3243
                // CJK
                || i >= 0x3260 && i <= 0x32B0
                || i >= 0x32D0 && i <= 0x3357
                || i >= 0x337B && i <= 0x33DD;
        if (keptRange)
                return false;

        // This "Digit" rule is mystery.
        // It filters some symbols out.
        if (Char.IsLetterOrDigit (c) || Char.IsNumber (c))
                return false;

        // Controls, separators, punctuation and symbols are
        // ignored; whatever remains is not.
        // FIXME: should check more
        return Char.IsControl (c)
                || Char.IsSeparator (c)
                || Char.IsPunctuation (c)
                || Char.IsSymbol (c);
}
3000
3001                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3002 /*
3003                 public static void Main ()
3004                 {
3005                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3006                         for (int i = 0; i <= char.MaxValue; i++) {
3007                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3008                                 if (uc == UnicodeCategory.Surrogate)
3009                                         continue;
3010
3011                                 bool ret = IsIgnorableSymbol (i);
3012
3013                                 string s1 = "TEST ";
3014                                 string s2 = "TEST " + (char) i;
3015
3016                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3017
3018                                 if (ret != (result == 0))
3019                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3020                                                 ret ? "should not ignore" :
3021                                                         "should ignore",
3022                                                 i,(char) i, uc);
3023                         }
3024                 }
3025 */
3026                 #endregion
3027
3028                 #region NonSpacing
// Decides whether code point i counts as an ignorable nonspacing
// character. Checked in order: IsIgnorable (), individually listed
// code points, ignored ranges, kept ranges, and finally whether the
// Unicode category of (char) i is NonSpacingMark.
static bool IsIgnorableNonSpacing (int i)
{
        if (IsIgnorable (i))
                return true;

        // Individually pinned code points take precedence over
        // the range rules below.
        switch (i) {
        // ignored
        case 0x02C8: case 0x02DE:
        case 0x0559: case 0x055A: case 0x05C0:
        case 0x0ABD:
        case 0x0CD5: case 0x0CD6:
        case 0x309B: case 0x309C:
        case 0xFF9E: case 0xFF9F:
                return true;
        // not ignored
        case 0x02D0: case 0x0670:
        case 0x0901: case 0x0902: case 0x094D:
        case 0x0962: case 0x0963:
        case 0x0A41: case 0x0A42: case 0x0A47: case 0x0A48:
        case 0x0A4B: case 0x0A4C:
        case 0x0A81: case 0x0A82:
        case 0x0B82: case 0x0BC0:
        case 0x0CBF: case 0x0CC6: case 0x0CCC: case 0x0CCD:
        case 0x0E4E:
                return false;
        }

        // Ranges that are ignored regardless of category.
        bool ignoredRange =
                i >= 0x02b9 && i <= 0x02c5
                || i >= 0x02cc && i <= 0x02d7
                || i >= 0x02e4 && i <= 0x02ef
                || i >= 0x20DD && i <= 0x20E0;
        if (ignoredRange)
                return true;

        // Ranges that are kept regardless of category.
        bool keptRange =
                i >= 0x064B && i <= 0x0652
                || i >= 0x0941 && i <= 0x0948
                || i >= 0x0AC1 && i <= 0x0ACD
                || i >= 0x0C3E && i <= 0x0C4F
                || i >= 0x0E31 && i <= 0x0E3F;
        if (keptRange)
                return false;

        // Otherwise every NonSpacingMark is ignored and nothing else.
        return UnicodeCategory.NonSpacingMark ==
                Char.GetUnicodeCategory ((char) i);
}
3066
3067                 // We can reuse IsIgnorableSymbol testcode
3068                 // for IsIgnorableNonSpacing.
3069                 #endregion
3070         }
3071
// One character map entry: a category byte plus level 1 and level 2
// bytes. Defined is set only by the three-argument constructor, so a
// default-initialized entry reports Defined == false.
struct CharMapEntry
{
        public byte Category;
        public byte Level1;
        public byte Level2; // It is always single byte.
        public bool Defined;

        // Builds a defined entry from its three bytes.
        public CharMapEntry (byte category, byte level1, byte level2)
        {
                this.Defined = true;
                this.Category = category;
                this.Level1 = level1;
                this.Level2 = level2;
        }
}
3087
// Immutable pair of a code point (CP) and the value stored in JIS.
// NOTE(review): JIS is presumably the character's JIS code -- confirm
// against the code that creates these instances.
class JISCharacter
{
        public readonly int CP;
        public readonly int JIS;

        // cp is stored in CP, cpJIS in JIS.
        public JISCharacter (int cp, int cpJIS)
        {
                this.JIS = cpJIS;
                this.CP = cp;
        }
}
3099
// Orders JISCharacter instances by their JIS value in descending
// order: the result is positive when o2 has the larger JIS value,
// so the element with the larger JIS value sorts first.
class JISComparer : IComparer
{
        public static readonly JISComparer Instance =
                new JISComparer ();

        // Both arguments must be JISCharacter instances; anything
        // else fails the cast below.
        public int Compare (object o1, object o2)
        {
                JISCharacter j1 = (JISCharacter) o1;
                JISCharacter j2 = (JISCharacter) o2;
                // CompareTo instead of "j2.JIS - j1.JIS": the
                // subtraction can overflow and flip the sign when
                // the two values are far apart. Only the sign of
                // the result is meaningful to IComparer callers.
                return j2.JIS.CompareTo (j1.JIS);
        }
}
3112
// Immutable pair of a code point (CP) and a name string (Name).
class NonJISCharacter
{
        public readonly int CP;
        public readonly string Name;

        // cp is stored in CP, name in Name (no validation is done).
        public NonJISCharacter (int cp, string name)
        {
                this.Name = name;
                this.CP = cp;
        }
}
3124
// Orders NonJISCharacter instances by an ordinal comparison of
// their Name strings.
class NonJISComparer : IComparer
{
        public static readonly NonJISComparer Instance =
                new NonJISComparer ();

        // Both arguments must be NonJISCharacter instances;
        // anything else fails the casts below.
        public int Compare (object o1, object o2)
        {
                NonJISCharacter left = (NonJISCharacter) o1;
                NonJISCharacter right = (NonJISCharacter) o2;
                string leftName = left.Name;
                string rightName = right.Name;
                return String.CompareOrdinal (leftName, rightName);
        }
}
3137
// Orders DictionaryEntry items whose Value is a boxed decimal and
// whose Key is a boxed int: first by Value, then by Key so that
// entries with equal values still get a stable, total order.
// Use the shared Instance; the constructor is private.
class DecimalDictionaryValueComparer : IComparer
{
        public static readonly DecimalDictionaryValueComparer Instance
                = new DecimalDictionaryValueComparer ();

        private DecimalDictionaryValueComparer ()
        {
        }

        // Both arguments must be DictionaryEntry values holding a
        // decimal Value and an int Key; anything else fails the
        // casts below. Returns a negative, zero or positive number.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // FIXME: in case of 0, compare decomposition categories
                int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
                if (ret != 0)
                        return ret;
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                // CompareTo instead of "i1 - i2": the subtraction
                // can overflow and report the wrong order when the
                // keys are far apart (e.g. int.MinValue and 1).
                return i1.CompareTo (i2);
        }
}
3160
// Orders DictionaryEntry items whose Value is a string and whose
// Key is a boxed int: first by Value, then by Key so that entries
// with equal values still get a stable, total order.
// Use the shared Instance; the constructor is private.
class StringDictionaryValueComparer : IComparer
{
        public static readonly StringDictionaryValueComparer Instance
                = new StringDictionaryValueComparer ();

        private StringDictionaryValueComparer ()
        {
        }

        // Both arguments must be DictionaryEntry values holding a
        // string Value and an int Key; anything else fails the
        // casts below. Returns a negative, zero or positive number.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // NOTE(review): String.Compare (string, string) is
                // culture-sensitive, so the order follows the
                // current culture -- confirm that is intended.
                int ret = String.Compare ((string) e1.Value, (string) e2.Value);
                if (ret != 0)
                        return ret;
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                // CompareTo instead of "i1 - i2": the subtraction
                // can overflow and report the wrong order when the
                // keys are far apart (e.g. int.MinValue and 1).
                return i1.CompareTo (i2);
        }
}
3182
// Orders boxed chars by the sort keys CollationElementTable reports
// for them. Keys are compared position by position on Primary,
// Secondary, Thirtiary and Quarternary (in that order); when all
// shared positions are equal, the char with fewer sort keys sorts
// first. Use the shared Instance; the constructor is private.
class UCAComparer : IComparer
{
        public static readonly UCAComparer Instance
                = new UCAComparer ();

        private UCAComparer ()
        {
        }

        // Both arguments must be boxed chars; anything else fails
        // the casts below.
        public int Compare (object o1, object o2)
        {
                char c1 = (char) o1;
                char c2 = (char) o2;

                int count1 = CollationElementTable.GetSortKeyCount (c1);
                int count2 = CollationElementTable.GetSortKeyCount (c2);
                // Only positions that both characters have can be
                // compared key against key.
                int shared = count1 < count2 ? count1 : count2;

                for (int pos = 0; pos < shared; pos++) {
                        SortKeyValue a = CollationElementTable.GetSortKey (c1, pos);
                        SortKeyValue b = CollationElementTable.GetSortKey (c2, pos);

                        // The first level that differs decides.
                        int diff = a.Primary - b.Primary;
                        if (diff == 0)
                                diff = a.Secondary - b.Secondary;
                        if (diff == 0)
                                diff = a.Thirtiary - b.Thirtiary;
                        if (diff == 0)
                                diff = a.Quarternary - b.Quarternary;
                        if (diff != 0)
                                return diff;
                }

                // All shared keys are equal: fewer keys sorts first.
                return count1 - count2;
        }
}
3220
// Collects the tailoring items registered for one LCID and turns
// them into a single flat char array (see ItemToCharArray).
//
// Every serialized item starts with a kind tag:
//      01 : sort key map     -- tag, source chars, 0, five key bytes
//      02 : diacritical map  -- tag, target, replace
//      03 : replacement map  -- tag, source chars, 0, replace chars, 0
class Tailoring
{
        readonly int lcid;
        readonly int alias;
        bool frenchSort;
        // ITailoringMap instances in the order they were added.
        ArrayList items = new ArrayList ();

        // Same as Tailoring (lcid, 0).
        public Tailoring (int lcid)
                : this (lcid, 0)
        {
        }

        public Tailoring (int lcid, int alias)
        {
                this.alias = alias;
                this.lcid = lcid;
        }

        public int LCID {
                get { return lcid; }
        }

        // 0 when the single-argument constructor was used.
        // NOTE(review): presumably the LCID this tailoring is an
        // alias of -- confirm against the callers.
        public int Alias {
                get { return alias; }
        }

        public bool FrenchSort {
                get { return frenchSort; }
                set { frenchSort = value; }
        }

        public void AddDiacriticalMap (byte target, byte replace)
        {
                ITailoringMap map = new DiacriticalMap (target, replace);
                items.Add (map);
        }

        // sortkey must hold at least five bytes; only the first
        // five are serialized.
        public void AddSortKeyMap (string source, byte [] sortkey)
        {
                ITailoringMap map = new SortKeyMap (source, sortkey);
                items.Add (map);
        }

        public void AddReplacementMap (string source, string replace)
        {
                ITailoringMap map = new ReplacementMap (source, replace);
                items.Add (map);
        }

        // Serializes all items, in insertion order, into one array.
        // Returns an empty array when nothing has been added.
        public char [] ItemToCharArray ()
        {
                // First pass: serialize each item and learn the
                // total length.
                char [][] chunks = new char [items.Count][];
                int total = 0;
                for (int n = 0; n < chunks.Length; n++) {
                        chunks [n] = ((ITailoringMap) items [n]).ToCharArray ();
                        total += chunks [n].Length;
                }

                // Second pass: lay the chunks out back to back.
                char [] flat = new char [total];
                int offset = 0;
                foreach (char [] chunk in chunks) {
                        Array.Copy (chunk, 0, flat, offset, chunk.Length);
                        offset += chunk.Length;
                }
                return flat;
        }

        // An item that knows how to serialize itself.
        interface ITailoringMap
        {
                char [] ToCharArray ();
        }

        class DiacriticalMap : ITailoringMap
        {
                public readonly byte Target;
                public readonly byte Replace;

                public DiacriticalMap (byte target, byte replace)
                {
                        this.Replace = replace;
                        this.Target = target;
                }

                // Layout: kind tag, target, replace.
                public char [] ToCharArray ()
                {
                        return new char [] {
                                (char) 02, // kind:DiacriticalMap
                                (char) Target,
                                (char) Replace
                        };
                }
        }

        class SortKeyMap : ITailoringMap
        {
                public readonly string Source;
                public readonly byte [] SortKey;

                public SortKeyMap (string source, byte [] sortkey)
                {
                        this.SortKey = sortkey;
                        this.Source = source;
                }

                // Layout: kind tag, source chars, 0, five key bytes.
                public char [] ToCharArray ()
                {
                        int sourceLength = Source.Length;
                        char [] buffer = new char [sourceLength + 7];
                        buffer [0] = (char) 01; // kind:SortKeyMap
                        Source.CopyTo (0, buffer, 1, sourceLength);
                        // buffer [sourceLength + 1] is left at 0:
                        // it is the null terminator of the source.
                        int keyStart = sourceLength + 2;
                        for (int k = 0; k < 5; k++)
                                buffer [keyStart + k] = (char) SortKey [k];
                        return buffer;
                }
        }

        class ReplacementMap : ITailoringMap
        {
                public readonly string Source;
                public readonly string Replace;

                public ReplacementMap (string source, string replace)
                {
                        this.Replace = replace;
                        this.Source = source;
                }

                // Layout: kind tag, source chars, 0, replace chars, 0.
                public char [] ToCharArray ()
                {
                        char [] buffer = new char [Source.Length + Replace.Length + 3];
                        buffer [0] = (char) 03; // kind:ReplaceMap
                        Source.CopyTo (0, buffer, 1, Source.Length);
                        // The slot right after the source is left
                        // at 0 (null terminator), so the replacement
                        // starts two past the source length.
                        Replace.CopyTo (0, buffer, Source.Length + 2, Replace.Length);
                        // The last slot is left at 0 as well (null
                        // terminator of the replacement).
                        return buffer;
                }
        }
}
3352 }