mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those that are computed and those that
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, for composite characters it should prepare a different index table.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27 #define Binary
  28
  29 using System;
  30 using System.IO;
  31 using System.Collections;
  32 using System.Globalization;
  33 using System.Text;
  34 using System.Xml;
  35
  36 namespace Mono.Globalization.Unicode
  37 {
  38         internal class MSCompatSortKeyTableGenerator
  39         {
  // Program entry point: creates a generator instance and runs it with
  // the command-line arguments.
  public static void Main (string [] args)
  {
          MSCompatSortKeyTableGenerator generator =
                  new MSCompatSortKeyTableGenerator ();
          generator.Run (args);
  }
  44
  // Numeric codes for the decomposition kinds. Serialize () switches on
  // decompType [] entries using these values. Listed in ascending order.
  const int DecompositionWide      = 1;    // fixed
  const int DecompositionSub       = 2;    // fixed
  const int DecompositionSmall     = 3;
  const int DecompositionIsolated  = 4;
  const int DecompositionInitial   = 5;
  const int DecompositionFinal     = 6;
  const int DecompositionMedial    = 7;
  const int DecompositionNoBreak   = 8;
  const int DecompositionVertical  = 9;
  const int DecompositionFraction  = 0xA;
  const int DecompositionFont      = 0xB;
  const int DecompositionSuper     = 0xC;  // fixed
  const int DecompositionNarrow    = 0xD;
  const int DecompositionFull      = 0xE;
  const int DecompositionCircle    = 0xF;
  const int DecompositionSquare    = 0x10;
  const int DecompositionCompat    = 0x11;
  const int DecompositionCanonical = 0x12;
  63
  // Destination of the generated C# source text (standard output).
  TextWriter Result = Console.Out;

  byte [] fillIndex = new byte [256]; // by category

  // One entry per UTF-16 code unit; Serialize () reads Category, Level1
  // and Level2 from each entry.
  CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];

  char [] specialIgnore = new char [] {
          '\u3099', '\u309A', '\u309B',
          '\u309C', '\u0BCD', '\u0E47',
          '\u0E4C', '\uFF9E', '\uFF9F'
          };

  // FIXME: need more love (as always)
  char [] alphabets = new char [] {
          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
          'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
          'U', 'V', 'W', 'X', 'Y', 'Z',
          '\u0292', '\u01BE', '\u0298'
          };

  // Same element count as alphabets (29) -- presumably a parallel
  // array of weights; confirm against the code that consumes it.
  byte [] alphaWeights = new byte [] {
          2, 9, 0xA, 0x1A, 0x21, 0x23, 0x25, 0x2C, 0x32, 0x35,
          0x36, 0x48, 0x51, 0x70, 0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99,
          0x9F, 0xA2, 0xA4, 0xA6, 0xA7, 0xA9,
          0xAA, 0xB3, 0xB4
          };

  // Per-code-unit flags.
  bool [] isSmallCapital = new bool [char.MaxValue + 1];
  bool [] isUppercase = new bool [char.MaxValue + 1];

  // Per-code-unit decomposition data. Serialize () reads
  // decompValues [decompIndex [i]] for the width/super/sub kinds.
  byte [] decompType = new byte [char.MaxValue + 1];
  int [] decompIndex = new int [char.MaxValue + 1];
  int [] decompLength = new int [char.MaxValue + 1];
  int [] decompValues; // not initialized here
  decimal [] decimalValue = new decimal [char.MaxValue + 1];

  byte [] diacritical = new byte [char.MaxValue + 1];
  97
  // Character-name fragments, one per line. The table has 78 entries,
  // the same count as diacriticWeights below -- presumably the two are
  // matched by index; confirm against the code that consumes them.
  // The inconsistent leading "WITH"/space and trailing ';' are part of
  // the data and must be kept exactly.
  string [] diacritics = new string [] {
          // LATIN
          "WITH VERTICAL LINE ABOVE;",
          "WITH ACUTE;",
          "WITH GRAVE;",
          "WITH DOT ABOVE;",
          " MIDDLE DOT;",
          "WITH CIRCUMFLEX;",
          "WITH DIAERESIS;",
          "WITH CARON;",
          "WITH BREVE;",
          " DIALYTIKA AND TONOS;",
          "WITH MACRON;",
          "WITH TILDE;",
          "WITH RING ABOVE;",
          "WITH OGONEK;",
          "WITH CEDILLA;",
          //
          " DOUBLE ACUTE;",
          " ACUTE AND DOT ABOVE;",
          " STROKE;",
          " CIRCUMFLEX AND ACUTE;",
          " DIAERESIS AND ACUTE;",
          "WITH CIRCUMFLEX AND GRAVE;",
          " L SLASH;",
          " DIAERESIS AND GRAVE;",
          " BREVE AND ACUTE;",
          " CARON AND DOT ABOVE;",
          " BREVE AND GRAVE;",
          " MACRON AND ACUTE;",
          " MACRON AND GRAVE;",
          //
          " DIAERESIS AND CARON",
          " DOT ABOVE AND MACRON",
          " TILDE AND ACUTE",
          " RING ABOVE AND ACUTE",
          " DIAERESIS AND MACRON",
          " CEDILLA AND ACUTE",
          " MACRON AND DIAERESIS",
          " CIRCUMFLEX AND TILDE",
          " TILDE AND DIAERESIS",
          " STROKE AND ACUTE",
          " BREVE AND TILDE",
          " CEDILLA AND BREVE",
          " OGONEK AND MACRON",
          //
          "WITH OVERLINE",
          "WITH HOOK;",
          "LEFT HOOK;",
          " WITH HOOK ABOVE;",
          " DOUBLE GRAVE;",
          " INVERTED BREVE",
          " PRECEDED BY APOSTROPHE",
          " HORN;",
          " LINE BELOW;",
          " CIRCUMFLEX AND HOOK ABOVE",
          " PALATAL HOOK",
          " DOT BELOW;",
          " RETROFLEX;",
          "DIAERESIS BELOW",
          " RING BELOW",
          //
          " CIRCUMFLEX BELOW",
          "HORN AND ACUTE",
          " BREVE BELOW;",
          " HORN AND GRAVE",
          " TILDE BELOW",
          " TOPBAR",
          " DOT BELOW AND DOT ABOVE",
          " RIGHT HALF RING",
          " HORN AND TILDE",
          " CIRCUMFLEX AND DOT BELOW",
          " BREVE AND DOT BELOW",
          " DOT BELOW AND MACRON",
          " HORN AND HOOK ABOVE",
          " HORN AND DOT",
          // CIRCLED, PARENTHESIZED and so on
          "CIRCLED DIGIT",
          "CIRCLED NUMBER",
          "CIRCLED LATIN",
          "CIRCLED KATAKANA",
          "CIRCLED SANS-SERIF",
          "PARENTHESIZED DIGIT",
          "PARENTHESIZED NUMBER",
          "PARENTHESIZED LATIN",
          };

  // Weight bytes, grouped the same way as the name fragments above
  // (15 + 13 + 13 + 15 + 14 + 8 = 78 entries).
  byte [] diacriticWeights = new byte [] {
          // LATIN.
          5, 0xE, 0xF, 0x10, 0x11,
          0x12, 0x13, 0x14, 0x15, 0x16,
          0x17, 0x19, 0x1A, 0x1B, 0x1C,
          //
          0x1D, 0x1D, 0x1E, 0x1E, 0x1F,
          0x1F, 0x1F, 0x20, 0x21, 0x22,
          0x22, 0x23, 0x24,
          //
          0x25, 0x25, 0x25, 0x26, 0x28,
          0x28, 0x28, 0x29, 0x2A, 0x2B,
          0x2C, 0x2F, 0x30,
          //
          0x40, 0x43, 0x43, 0x43, 0x44,
          0x46, 0x48, 0x52, 0x55, 0x55,
          0x57, 0x58, 0x59, 0x59, 0x5A,
          //
          0x60, 0x60, 0x61, 0x61, 0x63,
          0x68, 0x68, 0x69, 0x69, 0x6A,
          0x6D, 0x6E, 0x95, 0xAA,
          // CIRCLED, PARENTHESIZED and so on.
          0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
          0xF3, 0xF3, 0xF3
          };
 175
 // 28 code point values in ascending order. NOTE(review): they look
 // like consecutive (start, end) pairs of ranges -- confirm against
 // the code that consumes this table.
 int [] numberSecondaryWeightBounds = new int [] {
         0x660, 0x680, 0x6F0, 0x700,
         0x960, 0x970, 0x9E0, 0x9F0,
         0x9F4, 0xA00, 0xA60, 0xA70,
         0xAE0, 0xAF0, 0xB60, 0xB70,
         0xBE0, 0xC00, 0xC60, 0xC70,
         0xCE0, 0xCF0, 0xD60, 0xD70,
         0xE50, 0xE60, 0xED0, 0xEE0
         };

 // Per-script character orderings; not initialized here.
 char [] orderedCyrillic;
 char [] orderedGurmukhi;
 char [] orderedGujarati;
 char [] orderedGeorgian;
 char [] orderedThaana;

 // Tamil consonants in sort order.
 // Based on traditional Tamil consonants, except for
 // Grantha (where Microsoft breaks traditionalism).
 // http://www.angelfire.com/empire/thamizh/padanGaL
 static readonly char [] orderedTamilConsonants = new char [] {
         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E',
         '\u0B9F', '\u0BA3', '\u0BA4', '\u0BA8',
         '\u0BAA', '\u0BAE', '\u0BAF', '\u0BB0',
         '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
         '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
         '\u0BB7', '\u0BB9'
         };
 199
 // --- lists keyed by code point -------------------------------------

 // cp -> character name (only for some characters)
 ArrayList sortableCharNames = new ArrayList ();
 // cp -> arrow value (int)
 ArrayList arrowValues = new ArrayList ();
 // cp -> box value (int)
 ArrayList boxValues = new ArrayList ();

 // --- per-script letter tables --------------------------------------

 // cp -> level1 value
 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
 // letterName -> cp
 Hashtable arabicNameMap = new Hashtable ();
 Hashtable cyrillicNameMap = new Hashtable ();

 // --- decomposition / Latin maps ------------------------------------

 // cp -> Hashtable [decompType] -> cp
 Hashtable nfkdMap = new Hashtable ();
 // Latin letter -> ArrayList [int]
 Hashtable latinMap = new Hashtable ();

 // --- Japanese / CJK ------------------------------------------------

 ArrayList jisJapanese = new ArrayList ();
 ArrayList nonJisJapanese = new ArrayList ();

 // Each CJK table is allocated for the full UTF-16 code unit range.
 // The trailing comments are the offsets left in the original source.
 ushort [] cjkJA = new ushort [char.MaxValue +1];    // - 0x4E00];
 ushort [] cjkCHS = new ushort [char.MaxValue +1];   // - 0x3100];
 ushort [] cjkCHT = new ushort [char.MaxValue +1];   // - 0x4E00];
 ushort [] cjkKO = new ushort [char.MaxValue +1];    // - 0x4E00];
 byte [] cjkKOlv2 = new byte [char.MaxValue +1];     // - 0x4E00];

 // --- miscellaneous -------------------------------------------------

 // Per-code-unit flags; replaced by its compressed form in Serialize ().
 byte [] ignorableFlags = new byte [char.MaxValue + 1];

 static double [] unicodeAge = new double [char.MaxValue + 1];

 // Tailoring objects, enumerated by SerializeTailorings ().
 ArrayList tailorings = new ArrayList ();
 237
 // Drives the whole generation: parse the source data, adjust the
 // parsed values, generate the tables and serialize them. Progress
 // messages go to standard error.
 //
 // args [0], when present, is the directory holding the source data
 // files; otherwise "downloaded" is used.
 void Run (string [] args)
 {
         string dirname;
         if (args.Length == 0)
                 dirname = "downloaded";
         else
                 dirname = args [0];

         ParseSources (dirname);
         Console.Error.WriteLine ("parse done.");

         ModifyParsedValues ();
         GenerateCore ();
         Console.Error.WriteLine ("generation done.");

         Serialize ();
         Console.Error.WriteLine ("serialization done.");

         /*
          * Disabled debugging aid: dumps per-character age/ignorable
          * information to agelog.txt.
          *
          * StreamWriter sw = new StreamWriter ("agelog.txt");
          * for (int i = 0; i < char.MaxValue; i++) {
          *         bool shouldBe = false;
          *         switch (Char.GetUnicodeCategory ((char) i)) {
          *         case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
          *                 shouldBe = true; break;
          *         }
          *         if (unicodeAge [i] >= 3.1)
          *                 shouldBe = true;
          *         //if (IsIgnorable (i) != shouldBe)
          *         sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
          * }
          * sw.Close ();
          */
 }
 265
 // Typed wrapper over CodePointIndexer.CompressArray for byte tables:
 // passes typeof (byte) as the element type and casts the result back
 // to byte [].
 byte [] CompressArray (byte [] source, CodePointIndexer i)
 {
         object compressed = CodePointIndexer.CompressArray (
                 source, typeof (byte), i);
         return (byte []) compressed;
 }
 271
 // Typed wrapper over CodePointIndexer.CompressArray for ushort
 // tables: passes typeof (ushort) as the element type and casts the
 // result back to ushort [].
 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 {
         object compressed = CodePointIndexer.CompressArray (
                 source, typeof (ushort), i);
         return (ushort []) compressed;
 }
 277
 // Writes all generated tables.
 //
 // Order of operations:
 //   1. tailorings (SerializeTailorings),
 //   2. the core tables -- ignorableFlags, categories, level1, level2,
 //      level3 and widthCompat -- each compressed with its own
 //      MSCompatUnicodeTableUtil indexer and emitted as a C# array
 //      declaration on Result,
 //   3. the CJK tables (SerializeCJK).
 //
 // When the Binary symbol is defined, the core tables are additionally
 // written, in the same order and each preceded by its element count,
 // to "../collation.core.bin".
 void Serialize ()
 {
         // Tailorings
         SerializeTailorings ();

         byte [] categories = new byte [map.Length];
         byte [] level1 = new byte [map.Length];
         byte [] level2 = new byte [map.Length];
         byte [] level3 = new byte [map.Length];
         ushort [] widthCompat = new ushort [map.Length];
         for (int i = 0; i < map.Length; i++) {
                 categories [i] = map [i].Category;
                 level1 [i] = map [i].Level1;
                 level2 [i] = map [i].Level2;
                 level3 [i] = ComputeLevel3Weight ((char) i);
                 switch (decompType [i]) {
                 case DecompositionNarrow:
                 case DecompositionWide:
                 case DecompositionSuper:
                 case DecompositionSub:
                         // they are always 1 char
                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
                         break;
                 }
         }

         // compress
         ignorableFlags = CompressArray (ignorableFlags,
                 MSCompatUnicodeTableUtil.Ignorable);
         categories = CompressArray (categories,
                 MSCompatUnicodeTableUtil.Category);
         level1 = CompressArray (level1,
                 MSCompatUnicodeTableUtil.Level1);
         level2 = CompressArray (level2,
                 MSCompatUnicodeTableUtil.Level2);
         level3 = CompressArray (level3,
                 MSCompatUnicodeTableUtil.Level3);
         // Goes through the ushort overload like the CJK tables below.
         widthCompat = CompressArray (widthCompat,
                 MSCompatUnicodeTableUtil.WidthCompat);
         cjkCHS = CompressArray (cjkCHS,
                 MSCompatUnicodeTableUtil.CjkCHS);
         cjkCHT = CompressArray (cjkCHT,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkJA = CompressArray (cjkJA,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkKO = CompressArray (cjkKO,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkKOlv2 = CompressArray (cjkKOlv2,
                 MSCompatUnicodeTableUtil.Cjk);

         // Stays null when Binary is not defined; the helpers then
         // emit the text form only.
         BinaryWriter binary = null;
 #if Binary
         MemoryStream ms = new MemoryStream ();
         binary = new BinaryWriter (ms);
 #endif

         // Ignorables
         SerializeCoreTable (
                 "internal static readonly byte [] ignorableFlags = new byte [] {",
                 ignorableFlags, binary);
         // Primary category
         SerializeCoreTable (
                 "internal static readonly byte [] categories = new byte [] {",
                 categories, binary);
         // Primary weight value
         SerializeCoreTable (
                 "internal static readonly byte [] level1 = new byte [] {",
                 level1, binary);
         // Secondary weight
         SerializeCoreTable (
                 "internal static readonly byte [] level2 = new byte [] {",
                 level2, binary);
         // Tertiary weight
         SerializeCoreTable (
                 "internal static readonly byte [] level3 = new byte [] {",
                 level3, binary);
         // Width insensitivity mappings
         // (for now it is more lightweight than dumping the
         // entire NFKD table).
         SerializeCoreTable (
                 "internal static readonly ushort [] widthCompat = new ushort [] {",
                 widthCompat, binary);

 #if Binary
         // Make sure everything has reached the memory stream before
         // it is copied out.
         binary.Flush ();
         using (FileStream fs = File.Create ("../collation.core.bin")) {
                 byte [] array = ms.ToArray ();
                 fs.Write (array, 0, array.Length);
         }
 #endif

         // CJK
         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 }

 // Emits one byte table: the declaration line, every value (decimal
 // below 10, otherwise two-digit hex), a "// XXXX" index comment after
 // each 16th value, then "};" and a blank line. When binary is not
 // null, the element count and the raw bytes are written to it too.
 void SerializeCoreTable (string declaration, byte [] values, BinaryWriter binary)
 {
         // Single-argument overload on purpose: the declaration
         // contains '{' and must not be treated as a format string.
         Result.WriteLine (declaration);
         if (binary != null)
                 binary.Write (values.Length);
         for (int i = 0; i < values.Length; i++) {
                 byte value = values [i];
                 if (value < 10)
                         Result.Write ("{0},", value);
                 else
                         Result.Write ("0x{0:X02},", value);
                 if (binary != null)
                         binary.Write (value);
                 if ((i & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", i - 0xF);
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 }

 // ushort counterpart of the byte helper above; same text layout,
 // values are written to binary as ushort.
 void SerializeCoreTable (string declaration, ushort [] values, BinaryWriter binary)
 {
         Result.WriteLine (declaration);
         if (binary != null)
                 binary.Write (values.Length);
         for (int i = 0; i < values.Length; i++) {
                 ushort value = values [i];
                 if (value < 10)
                         Result.Write ("{0},", value);
                 else
                         Result.Write ("0x{0:X02},", value);
                 if (binary != null)
                         binary.Write (value);
                 if ((i & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", i - 0xF);
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 }
 466
 // Emits a ushort CJK table named `name` as a C# array declaration on
 // Result and, when Binary is defined, as raw values in
 // "../collation.<name>.bin". Output stops at the end of the array or
 // at index `max`, whichever comes first.
 void SerializeCJK (string name, ushort [] cjk, int max)
 {
         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 #if Binary
         MemoryStream buffer = new MemoryStream ();
         BinaryWriter writer = new BinaryWriter (buffer);
 #endif
         int index = 0;
         while (index < cjk.Length && index != max) {
                 ushort weight = cjk [index];
                 // Small values are printed in decimal, the rest as
                 // four-digit hex.
                 string format = weight < 10 ? "{0}," : "0x{0:X04},";
                 Result.Write (format, weight);
 #if Binary
                 writer.Write (weight);
 #endif
                 // Close each row of 16 values with its start index.
                 if ((index & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", index - 0xF);
                 index++;
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 #if Binary
         string path = String.Format ("../collation.{0}.bin", name);
         using (FileStream output = File.Create (path)) {
                 byte [] bytes = buffer.ToArray ();
                 output.Write (bytes, 0, bytes.Length);
         }
 #endif
 }
 498
 // Emits a byte CJK table named `name` as a C# array declaration on
 // Result and, when Binary is defined, as raw bytes in
 // "../collation.<name>.bin". Output stops at the end of the array or
 // at index `max`, whichever comes first.
 void SerializeCJK (string name, byte [] cjk, int max)
 {
         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 #if Binary
         MemoryStream buffer = new MemoryStream ();
         BinaryWriter writer = new BinaryWriter (buffer);
 #endif
         int index = 0;
         while (index < cjk.Length && index != max) {
                 byte weight = cjk [index];
                 // Small values are printed in decimal, the rest as
                 // two-digit hex.
                 string format = weight < 10 ? "{0}," : "0x{0:X02},";
                 Result.Write (format, weight);
 #if Binary
                 writer.Write (weight);
 #endif
                 // Close each row of 16 values with its start index.
                 if ((index & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", index - 0xF);
                 index++;
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 #if Binary
         string path = String.Format ("../collation.{0}.bin", name);
         using (FileStream output = File.Create (path)) {
                 byte [] bytes = buffer.ToArray ();
                 output.Write (bytes, 0, bytes.Length);
         }
 #endif
 }
 530
 // Emits the tailoring data in two parts:
 //
 //   - "tailorings": the concatenated character data of every
 //     tailoring that is not an alias (Alias == 0);
 //   - "tailoringInfos": one entry per tailoring (aliases included)
 //     holding its LCID, the start index and length of its data in
 //     "tailorings", and its French-sort flag. An alias points at the
 //     data of its target LCID.
 //
 // When Binary is defined the same data goes to
 // "../collation.tailoring.bin": the tailoring count, the info
 // entries, two 0xFF marker bytes, the number of characters, then the
 // character data as 16-bit values.
 //
 // Throws Exception when an alias refers to an LCID that has no
 // tailoring data of its own.
 void SerializeTailorings ()
 {
         // LCID -> start index / length of its data in "tailorings".
         Hashtable indexes = new Hashtable ();
         Hashtable counts = new Hashtable ();
         Result.WriteLine ("static char [] tailorings = new char [] {");
         int count = 0;
 #if Binary
         MemoryStream ms = new MemoryStream ();
         BinaryWriter binary = new BinaryWriter (ms);
 #endif
         foreach (Tailoring t in tailorings) {
                 // Aliases own no data; they are resolved below.
                 if (t.Alias != 0)
                         continue;
                 Result.Write ("/*{0}*/", t.LCID);
                 indexes.Add (t.LCID, count);
                 char [] values = t.ItemToCharArray ();
                 counts.Add (t.LCID, values.Length);
                 foreach (char c in values) {
                         Result.Write ("'\\x{0:X}', ", (int) c);
                         if (++count % 16 == 0)
                                 Result.WriteLine (" // {0:X04}", count - 16);
 #if Binary
                         binary.Write ((ushort) c);
 #endif
                 }
         }
         Result.WriteLine ("};");

         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 #if Binary
         // Keep the character data aside and start a new stream, so
         // that the info entries come first in the output file.
         byte [] rawdata = ms.ToArray ();
         ms = new MemoryStream ();
         binary = new BinaryWriter (ms);
         binary.Write (tailorings.Count);
 #endif
         foreach (Tailoring t in tailorings) {
                 int target = t.Alias != 0 ? t.Alias : t.LCID;
                 // A dangling alias is fatal. (The statement that used
                 // to follow this throw was unreachable.)
                 if (!indexes.ContainsKey (target))
                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
                 int idx = (int) indexes [target];
                 int cnt = (int) counts [target];
                 bool french = t.FrenchSort;
                 // NOTE(review): this search matches on
                 // t2.LCID == t.LCID, i.e. entries with the alias's own
                 // LCID rather than the alias target -- confirm that
                 // this is the intent.
                 if (t.Alias != 0)
                         foreach (Tailoring t2 in tailorings)
                                 if (t2.LCID == t.LCID)
                                         french = t2.FrenchSort;
                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 #if Binary
                 binary.Write (t.LCID);
                 binary.Write (idx);
                 binary.Write (cnt);
                 binary.Write (french);
 #endif
         }
         Result.WriteLine ("};");
 #if Binary
         // Marker bytes, then the character count (two bytes per
         // character) and the character data itself.
         binary.Write ((byte) 0xFF);
         binary.Write ((byte) 0xFF);
         binary.Write (rawdata.Length / 2);
         binary.Write (rawdata, 0, rawdata.Length);

         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
                 byte [] array = ms.ToArray ();
                 fs.Write (array, 0, array.Length);
         }
 #endif
 }
 601
 602                 #region Parse
 603
 // Parses every input file. The data files are looked up under
 // `dirname`; the tailoring source is read from
 // "mono-tailoring-source.txt", a path that does not depend on
 // `dirname`. The call order below is significant and is kept exactly.
 void ParseSources (string dirname)
 {
         ParseDerivedAge (dirname + "/DerivedAge.txt");

         FillIgnorables ();

         // in prior to ParseUnidata()
         ParseJISOrder (dirname + "/CP932.TXT");
         ParseUnidata (dirname + "/UnicodeData.txt");
         ParseDerivedCoreProperties (dirname + "/DerivedCoreProperties.txt");
         ParseScripts (dirname + "/Scripts.txt");

         string collationDir = dirname + "/common/collation";
         ParseCJK (
                 collationDir + "/zh.xml",
                 collationDir + "/ja.xml",
                 collationDir + "/ko.xml");

         ParseTailorings ("mono-tailoring-source.txt");
 }
 632
 633                 void ParseTailorings (string filename)
 634                 {
 635                         Tailoring t = null;
 636                         int line = 0;
 637                         using (StreamReader sr = new StreamReader (filename)) {
 638                                 try {
 639                                         while (sr.Peek () >= 0) {
 640                                                 line++;
 641                                                 ProcessTailoringLine (ref t,
 642                                                         sr.ReadLine ().Trim ());
 643                                         }
 644                                 } catch (Exception) {
 645                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 646                                         throw;
 647                                 }
 648                         }
 649                 }
 650
 651                 // For now this is enough.
 652                 string ParseTailoringSourceValue (string s)
 653                 {
 654                         StringBuilder sb = new StringBuilder ();
 655                         for (int i = 0; i < s.Length; i++) {
 656                                 if (s.StartsWith ("\\u")) {
 657                                         sb.Append ((char) int.Parse (
 658                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 659                                                 1);
 660                                         i += 5;
 661                                 }
 662                         else
 663                                 sb.Append (s [i]);
 664                         }
 665                         return sb.ToString ();
 666                 }
 667
 668                 void ProcessTailoringLine (ref Tailoring t, string s)
 669                 {
 670                         int idx = s.IndexOf ('#');
 671                         if (idx > 0)
 672                                 s = s.Substring (0, idx).Trim ();
 673                         if (s.Length == 0 || s [0] == '#')
 674                                 return;
 675                         if (s [0] == '@') {
 676                                 idx = s.IndexOf ('=');
 677                                 if (idx > 0)
 678                                         t = new Tailoring (
 679                                                 int.Parse (s.Substring (1, idx - 1)),
 680                                                 int.Parse (s.Substring (idx + 1)));
 681                                 else
 682                                         t = new Tailoring (int.Parse (s.Substring (1)));
 683                                 tailorings.Add (t);
 684                                 return;
 685                         }
 686                         if (s.StartsWith ("*FrenchSort")) {
 687                                 t.FrenchSort = true;
 688                                 return;
 689                         }
 690                         string d = "*Diacritical";
 691                         if (s.StartsWith (d)) {
 692                                 idx = s.IndexOf ("->");
 693                                 t.AddDiacriticalMap (
 694                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 695                                                 NumberStyles.HexNumber),
 696                                         byte.Parse (s.Substring (idx + 2).Trim (),
 697                                                 NumberStyles.HexNumber));
 698                                 return;
 699                         }
 700                         idx = s.IndexOf (':');
 701                         if (idx > 0) {
 702                                 string source = s.Substring (0, idx).Trim ();
 703                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 704                                 byte [] b = new byte [4];
 705                                 for (int i = 0; i < 4; i++) {
 706                                         if (l [i] == "*")
 707                                                 b [i] = 0;
 708                                         else
 709                                                 b [i] = byte.Parse (l [i],
 710                                                         NumberStyles.HexNumber);
 711                                 }
 712                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 713                                         b);
 714                         }
 715                         idx = s.IndexOf ('=');
 716                         if (idx > 0)
 717                                 t.AddReplacementMap (
 718                                         ParseTailoringSourceValue (
 719                                                 s.Substring (0, idx).Trim ()),
 720                                         ParseTailoringSourceValue (
 721                                                 s.Substring (idx + 1).Trim ()));
 722                 }
 723
 724                 void ParseDerivedAge (string filename)
 725                 {
 726                         using (StreamReader file =
 727                                 new StreamReader (filename)) {
 728                                 while (file.Peek () >= 0) {
 729                                         string s = file.ReadLine ();
 730                                         int idx = s.IndexOf ('#');
 731                                         if (idx >= 0)
 732                                                 s = s.Substring (0, idx);
 733                                         idx = s.IndexOf (';');
 734                                         if (idx < 0)
 735                                                 continue;
 736
 737                                         string cpspec = s.Substring (0, idx);
 738                                         idx = cpspec.IndexOf ("..");
 739                                         NumberStyles nf = NumberStyles.HexNumber |
 740                                                 NumberStyles.AllowTrailingWhite;
 741                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 742                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 743                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 744
 745                                         // FIXME: use index
 746                                         if (cp > char.MaxValue)
 747                                                 continue;
 748
 749                                         double v = double.Parse (value);
 750                                         for (int i = cp; i <= cpEnd; i++)
 751                                                 unicodeAge [i] = v;
 752                                 }
 753                         }
 754                         unicodeAge [0] = double.MaxValue; // never be supported
 755                 }
 756
 757                 void ParseUnidata (string filename)
 758                 {
 759                         ArrayList decompValues = new ArrayList ();
 760                         using (StreamReader unidata =
 761                                 new StreamReader (filename)) {
 762                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 763                                         try {
 764                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 765                                         } catch (Exception) {
 766                                                 Console.Error.WriteLine ("**** At line " + line);
 767                                                 throw;
 768                                         }
 769                                 }
 770                         }
 771                         this.decompValues = (int [])
 772                                 decompValues.ToArray (typeof (int));
 773                 }
 774
 775                 void ProcessUnidataLine (string s, ArrayList decompValues)
 776                 {
 777                         int idx = s.IndexOf ('#');
 778                         if (idx >= 0)
 779                                 s = s.Substring (0, idx);
 780                         idx = s.IndexOf (';');
 781                         if (idx < 0)
 782                                 return;
 783                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 784                         string [] values = s.Substring (idx + 1).Split (';');
 785
 786                         // FIXME: use index
 787                         if (cp > char.MaxValue)
 788                                 return;
 789                         if (IsIgnorable (cp))
 790                                 return;
 791
 792                         string name = values [0];
 793
 794                         // SPECIAL CASE: rename some characters for diacritical
 795                         // remapping. FIXME: why are they different?
 796                         // FIXME: it's still not working.
 797                         if (cp == 0x018B || cp == 0x018C)
 798                                 name = name.Replace ("TOPBAR", "STROKE");
 799
 800                         // isSmallCapital
 801                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 802                                 isSmallCapital [cp] = true;
 803
 804                         // latin mapping by character name
 805                         if (s.IndexOf ("LATIN") >= 0) {
 806                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 807                                 int offset = lidx + 15;
 808                                 if (lidx < 0) {
 809                                         lidx = s.IndexOf ("LETTER TURNED ");
 810                                         offset = lidx + 14;
 811                                 }
 812                                 if (lidx < 0) {
 813                                         lidx = s.IndexOf ("LETTER ");
 814                                         offset = lidx + 7;
 815                                 }
 816                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 817                                 char n = s [offset + 1];
 818                                 char target = char.MinValue;
 819                                 if ('A' <= c && c <= 'Z' &&
 820                                         (n == ' ') || n == ';')
 821                                         target = c;
 822                                 // FIXME: they are still not working fine.
 823                                 if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 824                                         target = 'O';
 825                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 826                                         target = 'A';
 827                                 if (s.Substring (offset).StartsWith ("SCHWA"))
 828                                         target = 'E';
 829                                 if (target != char.MinValue) {
 830                                         ArrayList entry = (ArrayList) latinMap [target];
 831                                         if (entry == null) {
 832                                                 entry = new ArrayList ();
 833                                                 latinMap [target] = entry;
 834                                         }
 835                                         entry.Add (cp);
 836                                 }
 837                         }
 838
 839                         // Arrow names
 840                         if (0x2000 <= cp && cp < 0x3000) {
 841                                 int value = 0;
 842                                 // SPECIAL CASES. FIXME: why?
 843                                 switch (cp) {
 844                                 case 0x21C5: value = -1; break; // E2
 845                                 case 0x261D: value = 1; break;
 846                                 case 0x27A6: value = 3; break;
 847                                 case 0x21B0: value = 7; break;
 848                                 case 0x21B1: value = 3; break;
 849                                 case 0x21B2: value = 7; break;
 850                                 case 0x21B4: value = 5; break;
 851                                 case 0x21B5: value = 7; break;
 852                                 case 0x21B9: value = -1; break; // E1
 853                                 case 0x21CF: value = 7; break;
 854                                 case 0x21D0: value = 3; break;
 855                                 }
 856                                 string [] arrowTargets = new string [] {
 857                                         "",
 858                                         "UPWARDS",
 859                                         "NORTH EAST",
 860                                         "RIGHTWARDS",
 861                                         "SOUTH EAST",
 862                                         "DOWNWARDS",
 863                                         "SOUTH WEST",
 864                                         "LEFTWARDS",
 865                                         "NORTH WEST",
 866                                         };
 867                                 if (value == 0)
 868                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 869                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 870                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 871                                                         s.IndexOf (" OVER") < 0
 872                                                 )
 873                                                         value = i;
 874                                 if (value > 0)
 875                                         arrowValues.Add (new DictionaryEntry (
 876                                                 cp, value));
 877                         }
 878
 879                         // Box names
 880                         if (0x2500 <= cp && cp < 0x25B0) {
 881                                 int value = 0;
 882                                 // flags:
 883                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 884                                 // [h,rl] [r] [l]
 885                                 // [v,ud] [u] [d]
 886                                 // [dr] [dl] [ur] [ul]
 887                                 // [vr,udr] [vl,vdl]
 888                                 // [hd,rld] [hu,rlu]
 889                                 // [hv,udrl,rlv,udh]
 890                                 ArrayList flags = new ArrayList (new int [] {
 891                                         32, 8 + 4, 8, 4,
 892                                         16, 1 + 2, 1, 2,
 893                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 894                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 895                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 896                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 897                                         });
 898                                 byte [] offsets = new byte [] {
 899                                         0, 0, 1, 2,
 900                                         3, 3, 4, 5,
 901                                         6, 7, 8, 9,
 902                                         10, 10, 11, 11,
 903                                         12, 12, 13, 13,
 904                                         14, 14, 14, 14};
 905                                 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
 906                                         int flag = 0;
 907                                         if (s.IndexOf (" UP") > 0)
 908                                                 flag |= 1;
 909                                         if (s.IndexOf (" DOWN") > 0)
 910                                                 flag |= 2;
 911                                         if (s.IndexOf (" RIGHT") > 0)
 912                                                 flag |= 4;
 913                                         if (s.IndexOf (" LEFT") > 0)
 914                                                 flag |= 8;
 915                                         if (s.IndexOf (" VERTICAL") > 0)
 916                                                 flag |= 16;
 917                                         if (s.IndexOf (" HORIZONTAL") > 0)
 918                                                 flag |= 32;
 919
 920                                         int fidx = flags.IndexOf (flag);
 921                                         value = fidx < 0 ? fidx : offsets [fidx];
 922                                 } else if (s.IndexOf ("BLOCK") > 0) {
 923                                         if (s.IndexOf ("ONE EIGHTH") > 0)
 924                                                 value = 0x12;
 925                                         else if (s.IndexOf ("ONE QUARTER") > 0)
 926                                                 value = 0x13;
 927                                         else if (s.IndexOf ("THREE EIGHTHS") > 0)
 928                                                 value = 0x14;
 929                                         else if (s.IndexOf ("HALF") > 0)
 930                                                 value = 0x15;
 931                                         else if (s.IndexOf ("FIVE EIGHTHS") > 0)
 932                                                 value = 0x16;
 933                                         else if (s.IndexOf ("THREE QUARTERS") > 0)
 934                                                 value = 0x17;
 935                                         else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
 936                                                 value = 0x18;
 937                                         else
 938                                                 value = 0x19;
 939                                 }
 940                                 if (value >= 0)
 941                                         boxValues.Add (new DictionaryEntry (
 942                                                 cp, value));
 943                         }
 944
 945                         // For some characters store the name and sort later
 946                         // to determine sorting.
 947                         if (0x2100 <= cp && cp <= 0x213F &&
 948                                 Char.IsSymbol ((char) cp))
 949                                 sortableCharNames.Add (
 950                                         new DictionaryEntry (cp, name));
 951                         else if (0x3380 <= cp && cp <= 0x33DD)
 952                                 sortableCharNames.Add (new DictionaryEntry (
 953                                         cp, name.Substring (7)));
 954
 955                         // diacritical weights by character name
 956 if (diacritics.Length != diacriticWeights.Length)
 957 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
 958                         for (int d = 0; d < diacritics.Length; d++) {
 959                                 if (s.IndexOf (diacritics [d]) > 0) {
 960                                         diacritical [cp] |= diacriticWeights [d];
 961                                         continue;
 962                                 }
 963                                 // also process "COMBINING blah" here
 964                                 // For now it is limited to cp < 0x0370
 965 //                              if (cp < 0x0300 || cp >= 0x0370)
 966 //                                      continue;
 967                                 string tmp = diacritics [d].TrimEnd (';');
 968                                 if (tmp.IndexOf ("WITH ") == 0)
 969                                         tmp = tmp.Substring (4);
 970                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
 971                                 if (name == tmp)
 972                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
 973 //if (name == tmp)
 974 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
 975                         }
 976                         // Two-step grep required for it.
 977                         if (s.IndexOf ("FULL STOP") > 0 &&
 978                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
 979                                 diacritical [cp] |= 0xF4;
 980
 981                         // Cyrillic letter name
 982                         if (0x0430 <= cp && cp <= 0x0486 &&
 983                                 Char.IsLetter ((char) cp)) {
 984                                 byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
 985                                 // Get primary letter name i.e.
 986                                 // XXX part of CYRILLIC LETTER XXX yyy
 987                                 // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
 988                                 string letterName =
 989                                         name.Substring (name.IndexOf ("LETTER ") + 7);
 990                                 int tmpIdx = letterName.IndexOf (' ');
 991                                 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
 992 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
 993                                 if (cyrillicNameMap.ContainsKey (letterName))
 994                                         value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
 995                                 else
 996                                         cyrillicNameMap [letterName] = cp;
 997
 998                                 cyrillicLetterPrimaryValues [cp] = value;
 999                         }
1000
1001                         // Arabic letter name
1002                         if (0x0621 <= cp && cp <= 0x064A &&
1003                                 Char.GetUnicodeCategory ((char) cp)
1004                                 == UnicodeCategory.OtherLetter) {
1005                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1006                                 switch (cp) {
1007                                 case 0x0621:
1008                                 case 0x0624:
1009                                 case 0x0626:
1010                                         // hamza, waw, yeh ... special cases.
1011                                         value = 0x07;
1012                                         break;
1013                                 case 0x0649:
1014                                 case 0x064A:
1015                                         value = 0x77; // special cases.
1016                                         break;
1017                                 default:
1018                                         // Get primary letter name i.e.
1019                                         // XXX part of ARABIC LETTER XXX yyy
1020                                         // e.g. that of "TEH MARBUTA" is "TEH".
1021                                         string letterName =
1022                                                 (cp == 0x0640) ?
1023                                                 // 0x0640 is special: it does
1024                                                 // not start with ARABIC LETTER
1025                                                 name :
1026                                                 name.Substring (14);
1027                                         int tmpIdx = letterName.IndexOf (' ');
1028                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1029 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1030                                         if (arabicNameMap.ContainsKey (letterName))
1031                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1032                                         else
1033                                                 arabicNameMap [letterName] = cp;
1034                                         break;
1035                                 }
1036                                 arabicLetterPrimaryValues [cp] = value;
1037                         }
1038
1039                         // Japanese square letter
1040                         if (0x3300 <= cp && cp <= 0x3357)
1041                                 if (!ExistsJIS (cp))
1042                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1043
1044                         // normalizationType
1045                         string decomp = values [4];
1046                         idx = decomp.IndexOf ('<');
1047                         if (idx >= 0) {
1048                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1049                                 case "full":
1050                                         decompType [cp] = DecompositionFull;
1051                                         break;
1052                                 case "sub":
1053                                         decompType [cp] = DecompositionSub;
1054                                         break;
1055                                 case "super":
1056                                         decompType [cp] = DecompositionSuper;
1057                                         break;
1058                                 case "small":
1059                                         decompType [cp] = DecompositionSmall;
1060                                         break;
1061                                 case "isolated":
1062                                         decompType [cp] = DecompositionIsolated;
1063                                         break;
1064                                 case "initial":
1065                                         decompType [cp] = DecompositionInitial;
1066                                         break;
1067                                 case "final":
1068                                         decompType [cp] = DecompositionFinal;
1069                                         break;
1070                                 case "medial":
1071                                         decompType [cp] = DecompositionMedial;
1072                                         break;
1073                                 case "noBreak":
1074                                         decompType [cp] = DecompositionNoBreak;
1075                                         break;
1076                                 case "compat":
1077                                         decompType [cp] = DecompositionCompat;
1078                                         break;
1079                                 case "fraction":
1080                                         decompType [cp] = DecompositionFraction;
1081                                         break;
1082                                 case "font":
1083                                         decompType [cp] = DecompositionFont;
1084                                         break;
1085                                 case "circle":
1086                                         decompType [cp] = DecompositionCircle;
1087                                         break;
1088                                 case "square":
1089                                         decompType [cp] = DecompositionSquare;
1090                                         break;
1091                                 case "wide":
1092                                         decompType [cp] = DecompositionWide;
1093                                         break;
1094                                 case "narrow":
1095                                         decompType [cp] = DecompositionNarrow;
1096                                         break;
1097                                 case "vertical":
1098                                         decompType [cp] = DecompositionVertical;
1099                                         break;
1100                                 default:
1101                                         throw new Exception ("Support NFKD type : " + decomp);
1102                                 }
1103                         }
1104                         else
1105                                 decompType [cp] = DecompositionCanonical;
1106                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1107                         if (decomp.Length > 0) {
1108
1109                                 string [] velems = decomp.Split (' ');
1110                                 int didx = decompValues.Count;
1111                                 decompIndex [cp] = didx;
1112                                 foreach (string v in velems)
1113                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1114                                 decompLength [cp] = velems.Length;
1115
1116                                 // [decmpType] -> this_cp
1117                                 int targetCP = (int) decompValues [didx];
1118                                 // for "(x)" it specially maps to 'x' .
1119                                 // FIXME: check if it is sane
1120                                 if (velems.Length == 3 &&
1121                                         (int) decompValues [didx] == '(' &&
1122                                         (int) decompValues [didx + 2] == ')')
1123                                         targetCP = (int) decompValues [didx + 1];
1124                                 // special: 0x215F "1/"
1125                                 else if (cp == 0x215F)
1126                                         targetCP = '1';
1127                                 else if (velems.Length > 1 &&
1128                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1129                                         // skip them, except for CJK ideograph compat
1130                                         targetCP = 0;
1131
1132                                 if (targetCP != 0) {
1133                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1134                                         if (entry == null) {
1135                                                 entry = new Hashtable ();
1136                                                 nfkdMap [targetCP] = entry;
1137                                         }
1138                                         entry [(byte) decompType [cp]] = cp;
1139                                 }
1140                         }
1141                         // numeric values
1142                         if (values [5].Length > 0)
1143                                 decimalValue [cp] = decimal.Parse (values [5]);
1144                         else if (values [6].Length > 0)
1145                                 decimalValue [cp] = decimal.Parse (values [6]);
1146                         else if (values [7].Length > 0) {
1147                                 string decstr = values [7];
1148                                 idx = decstr.IndexOf ('/');
1149                                 if (cp == 0x215F) // special. "1/"
1150                                         decimalValue [cp] = 0x1;
1151                                 else if (idx > 0)
1152                                         // m/n
1153                                         decimalValue [cp] =
1154                                                 decimal.Parse (decstr.Substring (0, idx))
1155                                                 / decimal.Parse (decstr.Substring (idx + 1));
1156                                 else if (decstr [0] == '(' &&
1157                                         decstr [decstr.Length - 1] == ')')
1158                                         // (n)
1159                                         decimalValue [cp] =
1160                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1161                                 else if (decstr [decstr.Length - 1] == '.')
1162                                         // n.
1163                                         decimalValue [cp] =
1164                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1165                                 else
1166                                         decimalValue [cp] = decimal.Parse (decstr);
1167                         }
1168                 }
1169
1170                 void ParseDerivedCoreProperties (string filename)
1171                 {
1172                         // IsUppercase
1173                         using (StreamReader file =
1174                                 new StreamReader (filename)) {
1175                                 for (int line = 1; file.Peek () >= 0; line++) {
1176                                         try {
1177                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1178                                         } catch (Exception) {
1179                                                 Console.Error.WriteLine ("**** At line " + line);
1180                                                 throw;
1181                                         }
1182                                 }
1183                         }
1184                 }
1185
1186                 void ProcessDerivedCorePropLine (string s)
1187                 {
1188                         int idx = s.IndexOf ('#');
1189                         if (idx >= 0)
1190                                 s = s.Substring (0, idx);
1191                         idx = s.IndexOf (';');
1192                         if (idx < 0)
1193                                 return;
1194                         string cpspec = s.Substring (0, idx);
1195                         idx = cpspec.IndexOf ("..");
1196                         NumberStyles nf = NumberStyles.HexNumber |
1197                                 NumberStyles.AllowTrailingWhite;
1198                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1199                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1200                         string value = s.Substring (cpspec.Length + 1).Trim ();
1201
1202                         // FIXME: use index
1203                         if (cp > char.MaxValue)
1204                                 return;
1205
1206                         switch (value) {
1207                         case "Uppercase":
1208                                 for (int x = cp; x <= cpEnd; x++)
1209                                         isUppercase [x] = true;
1210                                 break;
1211                         }
1212                 }
1213
1214                 void ParseScripts (string filename)
1215                 {
1216                         ArrayList cyrillic = new ArrayList ();
1217                         ArrayList gurmukhi = new ArrayList ();
1218                         ArrayList gujarati = new ArrayList ();
1219                         ArrayList georgian = new ArrayList ();
1220                         ArrayList thaana = new ArrayList ();
1221
1222                         using (StreamReader file =
1223                                 new StreamReader (filename)) {
1224                                 while (file.Peek () >= 0) {
1225                                         string s = file.ReadLine ();
1226                                         int idx = s.IndexOf ('#');
1227                                         if (idx >= 0)
1228                                                 s = s.Substring (0, idx);
1229                                         idx = s.IndexOf (';');
1230                                         if (idx < 0)
1231                                                 continue;
1232
1233                                         string cpspec = s.Substring (0, idx);
1234                                         idx = cpspec.IndexOf ("..");
1235                                         NumberStyles nf = NumberStyles.HexNumber |
1236                                                 NumberStyles.AllowTrailingWhite;
1237                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1238                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1239                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1240
1241                                         // FIXME: use index
1242                                         if (cp > char.MaxValue)
1243                                                 continue;
1244
1245                                         switch (value) {
1246                                         case "Cyrillic":
1247                                                 for (int x = cp; x <= cpEnd; x++)
1248                                                         if (!IsIgnorable (x))
1249                                                                 cyrillic.Add ((char) x);
1250                                                 break;
1251                                         case "Gurmukhi":
1252                                                 for (int x = cp; x <= cpEnd; x++)
1253                                                         if (!IsIgnorable (x))
1254                                                                 gurmukhi.Add ((char) x);
1255                                                 break;
1256                                         case "Gujarati":
1257                                                 for (int x = cp; x <= cpEnd; x++)
1258                                                         if (!IsIgnorable (x))
1259                                                                 gujarati.Add ((char) x);
1260                                                 break;
1261                                         case "Georgian":
1262                                                 for (int x = cp; x <= cpEnd; x++)
1263                                                         if (!IsIgnorable (x))
1264                                                                 georgian.Add ((char) x);
1265                                                 break;
1266                                         case "Thaana":
1267                                                 for (int x = cp; x <= cpEnd; x++)
1268                                                         if (!IsIgnorable (x))
1269                                                                 thaana.Add ((char) x);
1270                                                 break;
1271                                         }
1272                                 }
1273                         }
1274                         cyrillic.Sort (UCAComparer.Instance);
1275                         gurmukhi.Sort (UCAComparer.Instance);
1276                         gujarati.Sort (UCAComparer.Instance);
1277                         georgian.Sort (UCAComparer.Instance);
1278                         thaana.Sort (UCAComparer.Instance);
1279                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1280                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1281                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1282                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1283                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1284                 }
1285
1286                 void ParseJISOrder (string filename)
1287                 {
1288                         int line = 1;
1289                         try {
1290                                 using (StreamReader file =
1291                                         new StreamReader (filename)) {
1292                                         for (;file.Peek () >= 0; line++)
1293                                                 ProcessJISOrderLine (file.ReadLine ());
1294                                 }
1295                         } catch (Exception) {
1296                                 Console.Error.WriteLine ("---- line {0}", line);
1297                                 throw;
1298                         }
1299                 }
1300
1301                 char [] ws = new char [] {'\t', ' '}; // tab and space: the separators ProcessJISOrderLine looks for between the two columns
1302
1303                 void ProcessJISOrderLine (string s)
1304                 {
1305                         int idx = s.IndexOf ('#');
1306                         if (idx >= 0)
1307                                 s = s.Substring (0, idx).Trim ();
1308                         if (s.Length == 0)
1309                                 return;
1310                         idx = s.IndexOfAny (ws);
1311                         if (idx < 0)
1312                                 return;
1313                         // They start with "0x" so cut them out.
1314                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1315                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1316                         jisJapanese.Add (new JISCharacter (cp, jis));
1317                 }
1318
1319                 void ParseCJK (string zhXML, string jaXML, string koXML)
1320                 {
1321                         XmlDocument doc = new XmlDocument ();
1322                         doc.XmlResolver = null;
1323                         int v;
1324                         string s;
1325                         string category;
1326                         int offset;
1327                         ushort [] arr;
1328
1329                         // Chinese Simplified
1330                         category = "chs";
1331                         arr = cjkCHS;
1332                         offset = 0;//char.MaxValue - arr.Length;
1333                         doc.Load (zhXML);
1334                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1335                         v = 0x8008;
1336                         foreach (char c in s) {
1337                                 if (c < '\u3100')
1338                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1339                                 else {
1340                                         arr [(int) c - offset] = (ushort) v++;
1341                                         if (v % 256 == 0)
1342                                                 v += 2;
1343                                 }
1344                         }
1345
1346                         // Chinese Traditional
1347                         category = "cht";
1348                         arr = cjkCHT;
1349                         offset = 0;//char.MaxValue - arr.Length;
1350                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1351                         v = 0x8002;
1352                         foreach (char c in s) {
1353                                 if (c < '\u4E00')
1354                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1355                                 else {
1356                                         arr [(int) c - offset] = (ushort) v++;
1357                                         if (v % 256 == 0)
1358                                                 v += 2;
1359                                 }
1360                         }
1361
1362                         // Japanese
1363                         category = "ja";
1364                         arr = cjkJA;
1365                         offset = 0;//char.MaxValue - arr.Length;
1366                         doc.Load (jaXML);
1367                         s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1368                         v = 0x8008;
1369                         foreach (char c in s) {
1370                                 if (c < '\u4E00')
1371                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1372                                 else {
1373                                         arr [(int) c - offset] = (ushort) v++;
1374                                         if (v % 256 == 0)
1375                                                 v += 2;
1376                                 }
1377                         }
1378
1379                         // Korean
1380                         // Korean weight is somewhat complex. It first shifts
1381                         // Hangul category from 52-x to 80-x (they are anyways
1382                         // computed). CJK ideographs are placed at secondary
1383                         // weight, like XX YY 01 zz 01, where XX and YY are
1384                         // corresponding "reset" value and zz is 41,43,45...
1385                         //
1386                         // Unlike chs,cht and ja, Korean value is a combined
1387                         // ushort which is computed as category
1388                         //
1389                         category = "ko";
1390                         arr = cjkKO;
1391                         offset = 0;//char.MaxValue - arr.Length;
1392                         doc.Load (koXML);
1393                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1394                                 XmlElement sc = (XmlElement) reset.NextSibling;
1395                                 // compute "category" and "level 1" for the
1396                                 // target "reset" Hangle syllable
1397                                 char rc = reset.InnerText [0];
1398                                 int ri = ((int) rc - 0xAC00) + 1;
1399                                 ushort p = (ushort)
1400                                         ((ri / 254) * 256 + (ri % 254) + 2);
1401                                 // Place the characters after the target.
1402                                 s = sc.InnerText;
1403                                 v = 0x41;
1404                                 foreach (char c in s) {
1405                                         arr [(int) c - offset] = p;
1406                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1407                                         v += 2;
1408                                 }
1409                         }
1410                 }
1411
1412                 #endregion
1413
1414                 #region Generation
1415
1416                 void FillIgnorables ()
1417                 {
1418                         for (int i = 0; i <= char.MaxValue; i++) {
1419                                 if (Char.GetUnicodeCategory ((char) i) ==
1420                                         UnicodeCategory.OtherNotAssigned)
1421                                         continue;
1422                                 if (IsIgnorable (i))
1423                                         ignorableFlags [i] |= 1;
1424                                 if (IsIgnorableSymbol (i))
1425                                         ignorableFlags [i] |= 2;
1426                                 if (IsIgnorableNonSpacing (i))
1427                                         ignorableFlags [i] |= 4;
1428                         }
1429                 }
1430
1431                 void ModifyParsedValues ()
1432                 {
1433                         // number, secondary weights
1434                         byte weight = 0x38;
1435                         int [] numarr = numberSecondaryWeightBounds;
1436                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1437                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1438                                         if (Char.IsNumber ((char) cp))
1439                                                 diacritical [cp] = weight;
1440
1441                         // Modify some decomposition equivalence
1442                         decompType [0xFE31] = 0;
1443                         decompIndex [0xFE31] = 0;
1444                         decompLength [0xFE31] = 0;
1445                         decompType [0xFE32] = 0;
1446                         decompIndex [0xFE32] = 0;
1447                         decompLength [0xFE32] = 0;
1448
1449                         // Korean parens numbers
1450                         for (int i = 0x3200; i <= 0x321C; i++)
1451                                 diacritical [i] = 0xA;
1452                         for (int i = 0x3260; i <= 0x327B; i++)
1453                                 diacritical [i] = 0xC;
1454
1455                         // Update name part of named characters
1456                         for (int i = 0; i < sortableCharNames.Count; i++) {
1457                                 DictionaryEntry de =
1458                                         (DictionaryEntry) sortableCharNames [i];
1459                                 int cp = (int) de.Key;
1460                                 string renamed = null;
1461                                 switch (cp) {
1462                                 case 0x2101: renamed = "A_1"; break;
1463                                 case 0x33C3: renamed = "A_2"; break;
1464                                 case 0x2105: renamed = "C_1"; break;
1465                                 case 0x2106: renamed = "C_2"; break;
1466                                 case 0x211E: renamed = "R1"; break;
1467                                 case 0x211F: renamed = "R2"; break;
1468                                 // Remove some of them!
1469                                 case 0x2103:
1470                                 case 0x2109:
1471                                 case 0x2116:
1472                                 case 0x2117:
1473                                 case 0x2118:
1474                                 case 0x2125:
1475                                 case 0x2127:
1476                                 case 0x2129:
1477                                 case 0x212E:
1478                                 case 0x2132:
1479                                         sortableCharNames.RemoveAt (i);
1480                                         i--;
1481                                         continue;
1482                                 }
1483                                 if (renamed != null)
1484                                         sortableCharNames [i] =
1485                                                 new DictionaryEntry (cp, renamed);
1486                         }
1487                 }
1488
1489                 void GenerateCore ()
1490                 {
1491                         UnicodeCategory uc;
1492
1493                         #region Specially ignored // 01
1494                         // This will raise "Defined" flag up.
1495                         foreach (char c in specialIgnore)
1496                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1497                         #endregion
1498
1499
1500                         #region Variable weights
1501                         // Controls : 06 03 - 06 3D
1502                         fillIndex [6] = 3;
1503                         for (int i = 0; i < 65536; i++) {
1504                                 if (IsIgnorable (i))
1505                                         continue;
1506                                 char c = (char) i;
1507                                 uc = Char.GetUnicodeCategory (c);
1508                                 // NEL is whitespace but not ignored here.
1509                                 if (uc == UnicodeCategory.Control &&
1510                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1511                                         AddCharMap (c, 6, 1);
1512                         }
1513
1514                         // Apostrophe 06 80
1515                         fillIndex [6] = 0x80;
1516                         AddCharMapGroup ('\'', 6, 1, 0);
1517                         AddCharMap ('\uFE63', 6, 1);
1518
1519                         // Hyphen/Dash : 06 81 - 06 90
1520                         for (int i = 0; i < char.MaxValue; i++) {
1521                                 if (!IsIgnorable (i) &&
1522                                         Char.GetUnicodeCategory ((char) i) ==
1523                                         UnicodeCategory.DashPunctuation) {
1524                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1525                                         if (i == 0x2011) {
1526                                                 // SPECIAL: add 2027 and 2043
1527                                                 // Maybe they are regarded the
1528                                                 // same hyphens in "central"
1529                                                 // position.
1530                                                 AddCharMap ('\u2027', 6, 1);
1531                                                 AddCharMap ('\u2043', 6, 1);
1532                                         }
1533                                 }
1534                         }
1535
1536                         // Arabic variable weight chars 06 A0 -
1537                         fillIndex [6] = 0xA0;
1538                         // vowels
1539                         for (int i = 0x64B; i <= 0x650; i++)
1540                                 AddArabicCharMap ((char) i);
1541                         // sukun
1542                         AddCharMapGroup ('\u0652', 6, 1, 0);
1543                         // shadda
1544                         AddCharMapGroup ('\u0651', 6, 1, 0);
1545                         #endregion
1546
1547
1548                         #region Nonspacing marks // 01
1549                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1550
1551                         // Combining diacritical marks: 01 DC -
1552
1553                         fillIndex [0x1] = 0x41;
1554                         for (int i = 0x030E; i <= 0x0326; i++)
1555                                 if (!IsIgnorable (i))
1556                                         AddCharMap ((char) i, 0x1, 1);
1557                         for (int i = 0x0329; i <= 0x0334; i++)
1558                                 if (!IsIgnorable (i))
1559                                         AddCharMap ((char) i, 0x1, 1);
1560                         for (int i = 0x0339; i <= 0x0341; i++)
1561                                 if (!IsIgnorable (i))
1562                                         AddCharMap ((char) i, 0x1, 1);
1563                         fillIndex [0x1] = 0x72;
1564                         for (int i = 0x0346; i <= 0x0348; i++)
1565                                 if (!IsIgnorable (i))
1566                                         AddCharMap ((char) i, 0x1, 1);
1567                         for (int i = 0x02BE; i <= 0x02BF; i++)
1568                                 if (!IsIgnorable (i))
1569                                         AddCharMap ((char) i, 0x1, 1);
1570                         for (int i = 0x02C1; i <= 0x02C5; i++)
1571                                 if (!IsIgnorable (i))
1572                                         AddCharMap ((char) i, 0x1, 1);
1573                         for (int i = 0x02CE; i <= 0x02CF; i++)
1574                                 if (!IsIgnorable (i))
1575                                         AddCharMap ((char) i, 0x1, 1);
1576                         for (int i = 0x02D1; i <= 0x02D3; i++)
1577                                 if (!IsIgnorable (i))
1578                                         AddCharMap ((char) i, 0x1, 1);
1579                         AddCharMap ('\u02DE', 0x1, 1);
1580                         for (int i = 0x02E4; i <= 0x02E9; i++)
1581                                 if (!IsIgnorable (i))
1582                                         AddCharMap ((char) i, 0x1, 1);
1583
1584                         // FIXME: needs more love here (it should eliminate
1585                         // all the hacky code above).
1586                         for (int i = 0x0300; i < 0x0370; i++)
1587                                 if (!IsIgnorable (i) && diacritical [i] != 0
1588                                         /* especiall here*/ && !map [i].Defined)
1589                                         map [i] = new CharMapEntry (
1590                                                 0x1, 0x1, diacritical [i]);
1591
1592                         // LAMESPEC: It should not stop at '\u20E1'. There are
1593                         // a few more characters (that however results in
1594                         // overflow of level 2 unless we start before 0xDD).
1595                         fillIndex [0x1] = 0xDC;
1596                         for (int i = 0x20d0; i <= 0x20e1; i++)
1597                                 AddCharMap ((char) i, 0x1, 1);
1598                         #endregion
1599
1600
1601                         #region Whitespaces // 07 03 -
1602                         fillIndex [0x7] = 0x2;
1603                         AddCharMap (' ', 0x7, 2);
1604                         AddCharMap ('\u00A0', 0x7, 1);
1605                         for (int i = 9; i <= 0xD; i++)
1606                                 AddCharMap ((char) i, 0x7, 1);
1607                         for (int i = 0x2000; i <= 0x200B; i++)
1608                                 AddCharMap ((char) i, 0x7, 1);
1609
1610                         fillIndex [0x7] = 0x17;
1611                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1612                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1613
1614                         // Characters which used to represent layout control.
1615                         // LAMESPEC: Windows developers seem to have thought
1616                         // that those characters are kind of whitespaces,
1617                         // while they aren't.
1618                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1619                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1620                         #endregion
1621
1622                         // category 09 - continued symbols from 08
1623                         fillIndex [0x9] = 2;
1624                         // misc tech mark
1625                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1626                                 AddCharMap ((char) cp, 0x9, 1, 0);
1627
1628                         // arrows
1629                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1630                         foreach (DictionaryEntry de in arrowValues) {
1631                                 int idx = (int) de.Value;
1632                                 int cp = (int) de.Key;
1633                                 if (map [cp].Defined)
1634                                         continue;
1635                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1636                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1637                                 arrowLv2 [idx]++;
1638                         }
1639                         // boxes
1640                         byte [] boxLv2 = new byte [128];
1641                         for (int i = 0; i < boxLv2.Length; i++)
1642                                 boxLv2 [i] = 3;
1643                         foreach (DictionaryEntry de in boxValues) {
1644                                 int cp = (int) de.Key;
1645                                 int idx = (int) de.Value;
1646                                 if (map [cp].Defined)
1647                                         continue;
1648                                 fillIndex [0x9] = (byte) (0xE5 + idx);
1649                                 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1650                                 boxLv2 [idx]++;
1651                         }
1652                         // Some special characters (slanted)
1653                         fillIndex [0x9] = 0xF4;
1654                         AddCharMap ('\u2571', 0x9, 3);
1655                         AddCharMap ('\u2572', 0x9, 3);
1656                         AddCharMap ('\u2573', 0x9, 3);
1657
1658                         // FIXME: implement 0A
1659                         #region Symbols
1660                         fillIndex [0xA] = 2;
1661                         // byte currency symbols
1662                         for (int cp = 0; cp < 0x100; cp++) {
1663                                 uc = Char.GetUnicodeCategory ((char) cp);
1664                                 if (!IsIgnorable (cp) &&
1665                                         uc == UnicodeCategory.CurrencySymbol &&
1666                                         cp != '$')
1667                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1668                         }
1669                         // byte other symbols
1670                         for (int cp = 0; cp < 0x100; cp++) {
1671                                 if (cp == 0xA6)
1672                                         continue; // SPECIAL: skip FIXME: why?
1673                                 uc = Char.GetUnicodeCategory ((char) cp);
1674                                 if (!IsIgnorable (cp) &&
1675                                         uc == UnicodeCategory.OtherSymbol)
1676                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1677                         }
1678
1679                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1680                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1681                                 AddCharMap ((char) cp, 0xA, 1, 0);
1682                         // Dingbats
1683                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1684                                 if (Char.IsSymbol ((char) cp))
1685                                         AddCharMap ((char) cp, 0xA, 1, 0);
1686                         // OCR
1687                         for (int i = 0x2440; i < 0x2460; i++)
1688                                 AddCharMap ((char) i, 0xA, 1, 0);
1689
1690                         #endregion
1691
1692                         #region Numbers // 0C 02 - 0C E1
1693                         fillIndex [0xC] = 2;
1694
1695                         // 9F8 : Bengali "one less than the denominator"
1696                         AddCharMap ('\u09F8', 0xC, 1);
1697
1698                         ArrayList numbers = new ArrayList ();
1699                         for (int i = 0; i < 65536; i++)
1700                                 if (!IsIgnorable (i) &&
1701                                         Char.IsNumber ((char) i) &&
1702                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1703                                         numbers.Add (i);
1704
1705                         ArrayList numberValues = new ArrayList ();
1706                         foreach (int i in numbers)
1707                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1708                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1709
1710 //foreach (DictionaryEntry de in numberValues)
1711 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1712
1713                         decimal prevValue = -1;
1714                         foreach (DictionaryEntry de in numberValues) {
1715                                 int cp = (int) de.Key;
1716                                 decimal currValue = (decimal) de.Value;
1717                                 bool addnew = false;
1718                                 if (prevValue < currValue &&
1719                                         prevValue - (int) prevValue == 0 &&
1720                                         prevValue >= 1) {
1721
1722                                         addnew = true;
1723                                         // Process Hangzhou and Roman numbers
1724
1725                                         // There are some SPECIAL cases.
1726                                         if (currValue != 4) // no increment for 4
1727                                                 fillIndex [0xC]++;
1728
1729                                         int xcp;
1730                                         if (currValue <= 10) {
1731                                                 xcp = (int) prevValue + 0x2170 - 1;
1732                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1733                                                 xcp = (int) prevValue + 0x2160 - 1;
1734                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1735                                                 fillIndex [0xC] += 2;
1736                                                 xcp = (int) prevValue + 0x3021 - 1;
1737                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1738                                                 fillIndex [0xC]++;
1739                                         }
1740                                         else if (currValue == 11)
1741                                                 fillIndex [0xC]++;
1742                                 }
1743                                 if (prevValue < currValue)
1744                                         prevValue = currValue;
1745                                 if (map [cp].Defined)
1746                                         continue;
1747                                 // Hangzhou and Roman numerals are added separately
1748                                 // (by the code above)
1749                                 else if (0x3021 <= cp && cp < 0x302A
1750                                         || 0x2160 <= cp && cp < 0x216A
1751                                         || 0x2170 <= cp && cp < 0x217A)
1752                                         continue;
1753
1754                                 if (cp ==  0x215B) // FIXME: why?
1755                                         fillIndex [0xC] += 2;
1756                                 else if (cp == 0x3021) // FIXME: why?
1757                                         fillIndex [0xC]++;
1758                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1759                                 if (addnew || cp <= '9') {
1760                                         int mod = (int) currValue - 1;
1761                                         int xcp;
1762                                         if (1 <= currValue && currValue <= 10) {
1763                                                 xcp = mod + 0x2776;
1764                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1765                                                 xcp = mod + 0x2780;
1766                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1767                                                 xcp = mod + 0x278A;
1768                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1769                                         }
1770                                         if (1 <= currValue && currValue <= 20) {
1771                                                 xcp = mod + 0x2460;
1772                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1773                                                 xcp = mod + 0x2474;
1774                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1775                                                 xcp = mod + 0x2488;
1776                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1777                                         }
1778                                 }
1779
1780                                 if (cp != 0x09E7 && cp != 0x09EA)
1781                                         fillIndex [0xC]++;
1782
1783                                 // Add special cases that are not regarded as
1784                                 // numbers in UnicodeCategory speak.
1785                                 if (cp == '5') {
1786                                         // TONE FIVE
1787                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1788                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1789                                 }
1790                                 else if (cp == '6') // FIXME: why?
1791                                         fillIndex [0xC]++;
1792                         }
1793
1794                         // 221E: infinity
1795                         fillIndex [0xC] = 0xFF;
1796                         AddCharMap ('\u221E', 0xC, 1);
1797                         #endregion
1798
1799                         #region Letters and NonSpacing Marks (general)
1800
1801                         // ASCII Latin alphabets
1802                         for (int i = 0; i < alphabets.Length; i++)
1803                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1804
1805
1806                         // non-ASCII Latin alphabets
1807                         // FIXME: there are no such characters that are placed
1808                         // *after* "alphabets" array items. This is nothing
1809                         // more than a hack that creates dummy weights for
1810                         // primary characters.
1811                         for (int i = 0x0080; i < 0x0300; i++) {
1812                                 if (!Char.IsLetter ((char) i))
1813                                         continue;
1814                                 // Latin letters that have NFKD are
1815                                 // not added as independent primary characters.
1816                                 if (decompIndex [i] != 0)
1817                                         continue;
1818                                 // SPECIAL CASES:
1819                                 // 1.some alphabets have primarily
1820                                 //   equivalent ASCII alphabets.
1821                                 // 2.some have independent primary weights,
1822                                 //   but inside a-to-z range.
1823                                 // 3.there are some expanded characters that
1824                                 //   are not part of Unicode Standard NFKD.
1825                                 // 4. some characters are letter in IsLetter
1826                                 //   but not in sortkeys (maybe unicode version
1827                                 //   difference caused it).
1828                                 switch (i) {
1829                                 // 1. skipping them does not make sense
1830 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
1831 //                              case 0x184: case 0x185: case 0x186: case 0x189:
1832 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
1833 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
1834 //                              case 0x19B: case 0x19C:
1835                                 // 2. skipping them does not make sense
1836 //                              case 0x14A: // Ng
1837 //                              case 0x14B: // ng
1838                                 // 3.
1839                                 case 0xC6: // AE
1840                                 case 0xE6: // ae
1841                                 case 0xDE: // Icelandic Thorn
1842                                 case 0xFE: // Icelandic Thorn
1843                                 case 0xDF: // German ss
1844                                 case 0xFF: // German ss
1845                                 // 4.
1846                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1847                                 // not classified yet
1848 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1849 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1850 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1851 //                              case 0x1DD:
1852                                         continue;
1853                                 }
1854                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
1855                         }
1856
1857                         // Greek and Coptic
1858                         fillIndex [0xF] = 02;
1859                         for (int i = 0x0380; i < 0x0390; i++)
1860                                 if (Char.IsLetter ((char) i))
1861                                         AddLetterMap ((char) i, 0xF, 1);
1862                         fillIndex [0xF] = 02;
1863                         for (int i = 0x0391; i < 0x03CF; i++)
1864                                 if (Char.IsLetter ((char) i))
1865                                         AddLetterMap ((char) i, 0xF, 1);
1866                         fillIndex [0xF] = 0x40;
1867                         for (int i = 0x03D0; i < 0x0400; i++)
1868                                 if (Char.IsLetter ((char) i))
1869                                         AddLetterMap ((char) i, 0xF, 1);
1870
1871                         // Cyrillic - character name order
1872                         fillIndex [0x10] = 0x6;
1873 //*
1874 for (int i = 0; i < orderedCyrillic.Length; i++)
1875 Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
1876
1877                         // table which is mostly from UCA DUCET.
1878                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1879                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
1880                                 if (!IsIgnorable ((int) c) &&
1881                                         c <= '\u045C' &&
1882                                         Char.IsLetter (c)) {
1883                                         AddLetterMap (c, 0x10, 0);
1884                                         fillIndex [0x10] += 3;
1885                                 }
1886                         }
1887                         /*
1888                         for (int i = 0x0460; i < 0x0481; i++) {
1889                                 if (Char.IsLetter ((char) i)) {
1890                                         AddLetterMap ((char) i, 0x10, 0);
1891                                         fillIndex [0x10] += 3;
1892                                 }
1893                         }
1894                         */
1895 /*
1896                         for (int i = 0x0400; i <= 0x0486; i++) {
1897                                 if (!Char.IsLetter ((char) i)) {
1898 //                                      AddCharMap ((char) i, 0x1, 1);
1899                                         continue;
1900                                 }
1901                                 if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
1902                                         Console.Error.WriteLine ("no value for {0:x04}", i);
1903                                         continue;
1904                                 }
1905                                 fillIndex [0x10] =
1906                                         (byte) cyrillicLetterPrimaryValues [i];
1907                                 AddLetterMap ((char) i, 0x10, 0);
1908                         }
1909 */
1910
1911                         // Armenian
1912                         fillIndex [0x11] = 0x3;
1913                         for (int i = 0x0531; i < 0x0586; i++)
1914                                 if (Char.IsLetter ((char) i))
1915                                         AddLetterMap ((char) i, 0x11, 1);
1916
1917                         // Hebrew
1918                         // -Letters
1919                         fillIndex [0x12] = 0x3;
1920                         for (int i = 0x05D0; i < 0x05FF; i++)
1921                                 if (Char.IsLetter ((char) i))
1922                                         AddLetterMap ((char) i, 0x12, 1);
1923                         // -Accents
1924                         fillIndex [0x1] = 0x3;
1925                         for (int i = 0x0591; i <= 0x05C2; i++)
1926                                 if (i != 0x05BE)
1927                                         AddCharMap ((char) i, 0x1, 1);
1928
1929                         // Arabic
1930                         fillIndex [0x1] = 0x8E;
1931                         fillIndex [0x13] = 0x3;
1932                         for (int i = 0x0621; i <= 0x064A; i++) {
1933                                 // Abjad
1934                                 if (Char.GetUnicodeCategory ((char) i)
1935                                         != UnicodeCategory.OtherLetter) {
1936                                         // FIXME: arabic nonspacing marks are
1937                                         // in different order.
1938                                         AddCharMap ((char) i, 0x1, 1);
1939                                         continue;
1940                                 }
1941 //                              map [i] = new CharMapEntry (0x13,
1942 //                                      (byte) arabicLetterPrimaryValues [i], 1);
1943                                 fillIndex [0x13] =
1944                                         (byte) arabicLetterPrimaryValues [i];
1945                                 AddLetterMap ((char) i, 0x13, 0);
1946                         }
1947                         fillIndex [0x13] = 0x84;
1948                         for (int i = 0x0674; i < 0x06D6; i++)
1949                                 if (Char.IsLetter ((char) i))
1950                                         AddLetterMap ((char) i, 0x13, 1);
1951
1952                         // Devanagari
1953                         // FIXME: it does seem straight codepoint mapping.
1954                         fillIndex [0x14] = 04;
1955                         for (int i = 0x0901; i < 0x0905; i++)
1956                                 if (!IsIgnorable (i))
1957                                         AddLetterMap ((char) i, 0x14, 2);
1958                         fillIndex [0x14] = 0xB;
1959                         for (int i = 0x0905; i < 0x093A; i++) {
1960                                 if (i == 0x0928)
1961                                         AddCharMap ('\u0929', 0x14, 0, 8);
1962                                 if (i == 0x0930)
1963                                         AddCharMap ('\u0931', 0x14, 0, 8);
1964                                 if (i == 0x0933)
1965                                         AddCharMap ('\u0934', 0x14, 0, 8);
1966                                 if (Char.IsLetter ((char) i))
1967                                         AddLetterMap ((char) i, 0x14, 4);
1968                                 if (i == 0x090B)
1969                                         AddCharMap ('\u0960', 0x14, 4);
1970                                 if (i == 0x090C)
1971                                         AddCharMap ('\u0961', 0x14, 4);
1972                         }
1973                         fillIndex [0x14] = 0xDA;
1974                         for (int i = 0x093E; i < 0x0945; i++)
1975                                 if (!IsIgnorable (i))
1976                                         AddLetterMap ((char) i, 0x14, 2);
1977                         fillIndex [0x14] = 0xEC;
1978                         for (int i = 0x0945; i < 0x094F; i++)
1979                                 if (!IsIgnorable (i))
1980                                         AddLetterMap ((char) i, 0x14, 2);
1981
1982                         // Bengali
1983                         // -Letters
1984                         fillIndex [0x15] = 02;
1985                         for (int i = 0x0980; i < 0x9FF; i++) {
1986                                 if (IsIgnorable (i))
1987                                         continue;
1988                                 if (i == 0x09E0)
1989                                         fillIndex [0x15] = 0x3B;
1990                                 switch (Char.GetUnicodeCategory ((char) i)) {
1991                                 case UnicodeCategory.NonSpacingMark:
1992                                 case UnicodeCategory.DecimalDigitNumber:
1993                                 case UnicodeCategory.OtherNumber:
1994                                         continue;
1995                                 }
1996                                 AddLetterMap ((char) i, 0x15, 1);
1997                         }
1998                         // -Signs
1999                         fillIndex [0x1] = 0x3;
2000                         for (int i = 0x0981; i < 0x0A00; i++)
2001                                 if (Char.GetUnicodeCategory ((char) i) ==
2002                                         UnicodeCategory.NonSpacingMark)
2003                                         AddCharMap ((char) i, 0x1, 1);
2004
2005                         // Gurmukhi. orderedGurmukhi is from UCA
2006                         // FIXME: it does not look equivalent to UCA.
2007                         fillIndex [0x16] = 04;
2008                         fillIndex [0x1] = 3;
2009                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2010                                 char c = orderedGurmukhi [i];
2011                                 if (IsIgnorable ((int) c))
2012                                         continue;
2013                                 if (IsIgnorableNonSpacing (c)) {
2014                                         AddLetterMap (c, 0x1, 1);
2015                                         continue;
2016                                 }
2017                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2018                                         '\u0A66' <= c && c <= '\u0A71')
2019                                         continue;
2020                                 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2021                                 byte shift = 4;
2022                                 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2023                                         shift = 0;
2024                                 AddLetterMap (c, 0x16, shift);
2025                         }
2026
2027                         // Gujarati. orderedGujarati is from UCA
2028                         fillIndex [0x17] = 0x4;
2029                         // nonspacing marks
2030                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2031                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2032                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2033                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2034                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2035                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2036                         // letters go first.
2037                         for (int i = 0; i < orderedGujarati.Length; i++) {
2038                                 // SPECIAL CASE
2039                                 char c = orderedGujarati [i];
2040                                 if (Char.IsLetter (c)) {
2041                                         // SPECIAL CASES
2042                                         if (c == '\u0AB3' || c == '\u0A32')
2043                                                 continue;
2044                                         if (c == '\u0A33') {
2045                                                 AddCharMap ('\u0A32', 0x17, 0);
2046                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2047                                                 continue;
2048                                         }
2049                                         if (c == '\u0A8B')
2050                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2051                                         AddCharMap (c, 0x17, 4);
2052
2053                                         if (c == '\u0AB9')
2054                                                 AddCharMap ('\u0AB3', 0x17, 6);
2055                                 }
2056                         }
2057                         // non-letters
2058                         byte gujaratiShift = 4;
2059                         fillIndex [0x17] = 0xC0;
2060                         for (int i = 0; i < orderedGujarati.Length; i++) {
2061                                 char c = orderedGujarati [i];
2062                                 if (fillIndex [0x17] == 0xCC)
2063                                         gujaratiShift = 3;
2064                                 if (!Char.IsLetter (c)) {
2065                                         // SPECIAL CASES
2066                                         if (c == '\u0A82')
2067                                                 AddCharMap ('\u0A81', 0x17, 2);
2068                                         if (c == '\u0AC2')
2069                                                 fillIndex [0x17]++;
2070                                         AddLetterMap (c, 0x17, gujaratiShift);
2071                                 }
2072                         }
2073
2074                         // Oriya
2075                         fillIndex [0x1] = 03;
2076                         fillIndex [0x18] = 02;
2077                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2078                                 switch (Char.GetUnicodeCategory ((char) i)) {
2079                                 case UnicodeCategory.NonSpacingMark:
2080                                 case UnicodeCategory.DecimalDigitNumber:
2081                                         AddLetterMap ((char) i, 0x1, 1);
2082                                         continue;
2083                                 }
2084                                 AddLetterMap ((char) i, 0x18, 1);
2085                         }
2086
2087                         // Tamil
2088                         fillIndex [0x19] = 2;
2089                         AddCharMap ('\u0BD7', 0x19, 0);
2090                         fillIndex [0x19] = 0xA;
2091                         // vowels
2092                         for (int i = 0x0B82; i <= 0x0B94; i++)
2093                                 if (!IsIgnorable ((char) i))
2094                                         AddCharMap ((char) i, 0x19, 2);
2095                         // special vowel
2096                         fillIndex [0x19] = 0x28;
2097                         // The array for Tamil consonants is a constant.
2098                         // Windows has almost the same sequence as TAM from
2099                         // tamilnet, but it differs a bit in Grantha.
2100                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2101                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2102                         // combining marks
2103                         fillIndex [0x19] = 0x82;
2104                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2105                                 if (Char.GetUnicodeCategory ((char) i) ==
2106                                         UnicodeCategory.SpacingCombiningMark
2107                                         || i == 0x0BC0)
2108                                         AddLetterMap ((char) i, 0x19, 2);
2109
2110                         // Telugu
2111                         fillIndex [0x1A] = 0x4;
2112                         for (int i = 0x0C00; i < 0x0C62; i++) {
2113                                 if (i == 0x0C55 || i == 0x0C56)
2114                                         continue; // skip
2115                                 AddCharMap ((char) i, 0x1A, 3);
2116                                 char supp = (i == 0x0C0B) ? '\u0C60':
2117                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2118                                 if (supp == char.MinValue)
2119                                         continue;
2120                                 AddCharMap (supp, 0x1A, 3);
2121                         }
2122
2123                         // Kannada
2124                         fillIndex [0x1B] = 4;
2125                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2126                                 if (i == 0x0CD5 || i == 0x0CD6)
2127                                         continue; // ignore
2128                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2129                                         continue; // shift after 0xCB9
2130                                 AddCharMap ((char) i, 0x1B, 3);
2131                                 if (i == 0x0CB9) {
2132                                         // SPECIAL CASES: but why?
2133                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2134                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2135                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2136                                 }
2137                                 if (i == 0x0CB2)
2138                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2139                         }
2140
2141                         // Malayalam
2142                         fillIndex [0x1C] = 2;
2143                         for (int i = 0x0D02; i < 0x0D61; i++)
2144                                 // FIXME: I avoided MSCompatUnicodeTable usage
2145                                 // here (it results in recursion). So check if
2146                                 // using NonSpacingMark makes sense or not.
2147                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2148 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2149                                         AddCharMap ((char) i, 0x1C, 1);
2150
2151                         // Thai ... note that it breaks 0x1E wall after E2B!
2152                         // Also, all Thai characters have level 2 value 3.
2153                         fillIndex [0x1E] = 2;
2154                         for (int i = 0xE40; i <= 0xE44; i++)
2155                                 AddCharMap ((char) i, 0x1E, 1, 3);
2156                         for (int i = 0xE01; i < 0xE2B; i++)
2157                                 AddCharMap ((char) i, 0x1E, 6, 3);
2158                         fillIndex [0x1F] = 5;
2159                         for (int i = 0xE2B; i < 0xE30; i++)
2160                                 AddCharMap ((char) i, 0x1F, 6, 3);
2161                         fillIndex [0x1F] = 0x1E;
2162                         for (int i = 0xE30; i < 0xE3B; i++)
2163                                 AddCharMap ((char) i, 0x1F, 1, 3);
2164                         // some Thai characters remain.
2165                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2166                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2167                         foreach (char c in specialThai)
2168                                 AddCharMap (c, 0x1F, 1);
2169
2170                         // Lao
2171                         fillIndex [0x1F] = 2;
2172                         for (int i = 0xE80; i < 0xEDF; i++)
2173                                 if (Char.IsLetter ((char) i))
2174                                         AddCharMap ((char) i, 0x1F, 1);
2175
2176                         // Georgian. orderedGeorgian is from UCA DUCET.
2177                         fillIndex [0x21] = 5;
2178                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2179                                 char c = orderedGeorgian [i];
2180                                 if (map [(int) c].Defined)
2181                                         continue;
2182                                 AddCharMap (c, 0x21, 0);
2183                                 if (c < '\u10F6')
2184                                         AddCharMap ((char) (c - 0x30), 0x21, 0, 0x12);
2185                                 fillIndex [0x21] += 5;
2186                         }
2187
2188                         // Japanese Kana.
2189                         fillIndex [0x22] = 2;
2190                         int kanaOffset = 0x3041;
2191                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2192
2193                         for (int gyo = 0; gyo < 9; gyo++) {
2194                                 for (int dan = 0; dan < 5; dan++) {
2195                                         if (gyo == 7 && dan % 2 == 1) {
2196                                                 // 'ya'-gyo
2197                                                 fillIndex [0x22]++;
2198                                                 kanaOffset -= 2; // There is no space for yi and ye.
2199                                                 continue;
2200                                         }
2201                                         int cp = kanaOffset + dan * kanaLines [gyo];
2202                                         // small lines (a-gyo, ya-gyo)
2203                                         if (gyo == 0 || gyo == 7) {
2204                                                 AddKanaMap (cp, 1); // small
2205                                                 AddKanaMap (cp + 1, 1);
2206                                         }
2207                                         else
2208                                                 AddKanaMap (cp, kanaLines [gyo]);
2209                                         fillIndex [0x22]++;
2210
2211                                         if (cp == 0x3061) {
2212                                                 // add small 'Tsu' (before normal one)
2213                                                 AddKanaMap (0x3063, 1);
2214                                                 kanaOffset++;
2215                                         }
2216                                 }
2217                                 fillIndex [0x22] += 3;
2218                                 kanaOffset += 5 * kanaLines [gyo];
2219                         }
2220
2221                         // Wa-gyo is almost special, so I just manually add.
2222                         AddLetterMap ((char) 0x308E, 0x22, 0);
2223                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2224                         AddLetterMap ((char) 0x308F, 0x22, 0);
2225                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2226                         fillIndex [0x22]++;
2227                         AddLetterMap ((char) 0x3090, 0x22, 0);
2228                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2229                         fillIndex [0x22] += 2;
2230                         // no "Wu" in Japanese.
2231                         AddLetterMap ((char) 0x3091, 0x22, 0);
2232                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2233                         fillIndex [0x22]++;
2234                         AddLetterMap ((char) 0x3092, 0x22, 0);
2235                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2236                         // Nn
2237                         fillIndex [0x22] = 0x80;
2238                         AddLetterMap ((char) 0x3093, 0x22, 0);
2239                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2240
2241                         // JIS Japanese square chars.
2242                         fillIndex [0x22] = 0x97;
2243                         jisJapanese.Sort (JISComparer.Instance);
2244                         foreach (JISCharacter j in jisJapanese)
2245                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2246                                         AddCharMap ((char) j.CP, 0x22, 1);
2247                         // non-JIS Japanese square chars.
2248                         nonJisJapanese.Sort (NonJISComparer.Instance);
2249                         foreach (NonJISCharacter j in nonJisJapanese)
2250                                 AddCharMap ((char) j.CP, 0x22, 1);
2251
2252                         // Bopomofo
2253                         fillIndex [0x23] = 0x02;
2254                         for (int i = 0x3105; i <= 0x312C; i++)
2255                                 AddCharMap ((char) i, 0x23, 1);
2256
2257                         // Estrangela: ancient Syriac
2258                         fillIndex [0x24] = 0x0B;
2259                         // FIXME: is 0x71E really alternative form?
2260                         ArrayList syriacAlternatives = new ArrayList (
2261                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2262                         for (int i = 0x0710; i <= 0x072C; i++) {
2263                                 if (i == 0x0711) // NonSpacingMark
2264                                         continue;
2265                                 if (syriacAlternatives.Contains (i))
2266                                         continue;
2267                                 AddCharMap ((char) i, 0x24, 4);
2268                                 // FIXME: why?
2269                                 if (i == 0x721)
2270                                         fillIndex [0x24]++;
2271                         }
2272                         foreach (int cp in syriacAlternatives)
2273                                 map [cp] = new CharMapEntry (0x24,
2274                                         (byte) (map [cp - 1].Level1 + 2),
2275                                         0);
2276                         // FIXME: Syriac NonSpacingMark should go here.
2277
2278                         // Thaana
2279                         // FIXME: it turned out that it does not look like UCA
2280                         fillIndex [0x24] = 0x6E;
2281                         for (int i = 0; i < orderedThaana.Length; i++) {
2282                                 char c = orderedThaana [i];
2283                                 if (IsIgnorableNonSpacing ((int) c))
2284                                         continue;
2285                                 AddCharMap (c, 0x24, 2);
2286                                 if (c == '\u0782') // SPECIAL CASE: why?
2287                                         fillIndex [0x24] += 2;
2288                         }
2289                         #endregion
2290
2291                         // FIXME: Add more culture-specific letters (that are
2292                         // not supported in Windows collation) here.
2293
2294                         // Surrogate ... they are computed.
2295
2296                         #region Hangul
2297                         // Hangul.
2298                         //
2299                         // Unlike UCA Windows Hangul sequence mixes Jongseong
2300                         // with Choseong sequence as well as Jungseong,
2301                         // adjusted to have the same primary weight for the
2302                         // same base character. So it is impossible to compute
2303                         // those sort keys.
2304                         //
2305                         // Here I introduce an ordered sequence of mixed
2306                         // 'commands' and 'characters' that is similar to
2307                         // LDML text:
2308                         //      - ',' increases primary weight.
2309                         //      - [A B] means a range, increasing index
2310                         //      - {A B} means a range, without increasing index
2311                         //      - '=' is no operation (it means the characters
2312                         //        of both sides have the same weight).
2313                         //      - '>' inserts a Hangul Syllable block that
2314                         //        contains 0x251 characters.
2315                         //      - '<' decreases the index
2316                         //      - '0'-'9' means skip count
2317                         //      - whitespaces are ignored
2318                         //
2319
2320                         string hangulSequence =
2321                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2322                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2323                         + "<{\u1113 \u1116}, \u3165,"
2324                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2325                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2326                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
2327                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2328                                 + "[\u11D1 \u11D2], \u11B2,"
2329                                 + "[\u11D3 \u11D5], \u11B3,"
2330                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2331                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2332                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2333                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2334                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2335                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2336                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2337                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2338                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2339                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2340                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2341                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2342                                 + "\u11F1,, \u11F2,,,"
2343                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2344                         + "<\u114D, \u110D,,  >"
2345                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2346                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2347                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2348                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2349                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2350                                 + "[\u11F5 \u11F8]"
2351                         ;
2352
2353                         byte hangulCat = 0x52;
2354                         fillIndex [hangulCat] = 0x2;
2355
2356                         int syllableBlock = 0;
2357                         for (int n = 0; n < hangulSequence.Length; n++) {
2358                                 char c = hangulSequence [n];
2359                                 int start, end;
2360                                 if (Char.IsWhiteSpace (c))
2361                                         continue;
2362                                 switch (c) {
2363                                 case '=':
2364                                         break; // NOP
2365                                 case ',':
2366                                         IncrementSequentialIndex (ref hangulCat);
2367                                         break;
2368                                 case '<':
2369                                         if (fillIndex [hangulCat] == 2)
2370                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2371                                         fillIndex [hangulCat]--;
2372                                         break;
2373                                 case '>':
2374                                         IncrementSequentialIndex (ref hangulCat);
2375                                         for (int l = 0; l < 0x15; l++)
2376                                                 for (int v = 0; v < 0x1C; v++) {
2377                                                         AddCharMap (
2378                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2379                                                         IncrementSequentialIndex (ref hangulCat);
2380                                                 }
2381                                         syllableBlock++;
2382                                         break;
2383                                 case '[':
2384                                         start = hangulSequence [n + 1];
2385                                         end = hangulSequence [n + 3];
2386                                         for (int i = start; i <= end; i++) {
2387                                                 AddCharMap ((char) i, hangulCat, 0);
2388                                                 if (end > i)
2389                                                         IncrementSequentialIndex (ref hangulCat);
2390                                         }
2391                                         n += 4; // consumes 5 characters for this operation
2392                                         break;
2393                                 case '{':
2394                                         start = hangulSequence [n + 1];
2395                                         end = hangulSequence [n + 3];
2396                                         for (int i = start; i <= end; i++)
2397                                                 AddCharMap ((char) i, hangulCat, 0);
2398                                         n += 4; // consumes 5 characters for this operation
2399                                         break;
2400                                 default:
2401                                         AddCharMap (c, hangulCat, 0);
2402                                         break;
2403                                 }
2404                         }
2405
2406                         // Some Jamo NFKD.
2407                         for (int i = 0x3200; i < 0x3300; i++) {
2408                                 if (IsIgnorable (i) || map [i].Defined)
2409                                         continue;
2410                                 int ch = 0;
2411                                 // w/ bracket
2412                                 if (decompLength [i] == 4 &&
2413                                         decompValues [decompIndex [i]] == '(')
2414                                         ch = decompIndex [i] + 1;
2415                                 // circled
2416                                 else if (decompLength [i] == 2 &&
2417                                         decompValues [decompIndex [i] + 1] == '\u1161')
2418                                         ch = decompIndex [i];
2419                                 else if (decompLength [i] == 1)
2420                                         ch = decompIndex [i];
2421                                 else
2422                                         continue;
2423                                 ch = decompValues [ch];
2424                                 if (ch < 0x1100 || 0x1200 < ch &&
2425                                         ch < 0xAC00 || 0xD800 < ch)
2426                                         continue;
2427
2428                                 // SPECIAL CASE ?
2429                                 int offset = i < 0x3260 ? 1 : 0;
2430                                 if (0x326E <= i && i <= 0x3273)
2431                                         offset = 1;
2432
2433                                 map [i] = new CharMapEntry (map [ch].Category,
2434                                         (byte) (map [ch].Level1 + offset),
2435                                         map [ch].Level2);
2436 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2437                         }
2438
2439
2440                         #endregion
2441
2442                         // Letterlike characters and CJK compatibility square
2443                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2444                         int [] counts = new int ['Z' - 'A' + 1];
2445                         char [] namedChars = new char [sortableCharNames.Count];
2446                         int nCharNames = 0;
2447                         foreach (DictionaryEntry de in sortableCharNames) {
2448                                 counts [((string) de.Value) [0] - 'A']++;
2449                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2450                         }
2451                         nCharNames = 0; // reset
2452                         for (int a = 0; a < counts.Length; a++) {
2453                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2454                                 for (int i = 0; i < counts [a]; i++)
2455 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2456                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2457                         }
2458
2459                         // CJK unified ideograph.
2460                         byte cjkCat = 0x9E;
2461                         fillIndex [cjkCat] = 0x2;
2462                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2463                                 if (!IsIgnorable (cp))
2464                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2465                         // CJK Extensions goes here.
2466                         // LAMESPEC: With this Windows style CJK layout, it is
2467                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2468                         // 0x9FBB can never be added w/o breaking compat.
2469                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2470                                 if (!IsIgnorable (cp))
2471                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2472
2473                         // PrivateUse ... computed.
2474                         // remaining Surrogate ... computed.
2475
2476                         #region Special "biggest" area (FF FF)
2477                         fillIndex [0xFF] = 0xFF;
2478                         char [] specialBiggest = new char [] {
2479                                 '\u3005', '\u3031', '\u3032', '\u309D',
2480                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2481                                 '\uFE7C', '\uFE7D', '\uFF70'};
2482                         foreach (char c in specialBiggest)
2483                                 AddCharMap (c, 0xFF, 0);
2484                         #endregion
2485
2486                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2487                         // non-alphanumeric ASCII except for: + - < = > '
2488                         for (int i = 0x21; i < 0x7F; i++) {
2489                                 if (Char.IsLetterOrDigit ((char) i)
2490                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2491                                         continue; // they are not added here.
2492                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2493                                 // Insert 3001 after ',' and 3002 after '.'
2494                                 if (i == 0x2C)
2495                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2496                                 else if (i == 0x2E) {
2497                                         fillIndex [0x7]--;
2498                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2499                                 }
2500                                 else if (i == 0x3A)
2501                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2502                         }
2503                         #endregion
2504
2505                         #region 07 - Punctuations and something else
2506                         for (int i = 0xA0; i < char.MaxValue; i++) {
2507                                 if (IsIgnorable (i))
2508                                         continue;
2509
2510                                 // FIXME: actually those reset should not be
2511                                 // done but here I put for easy goal.
2512                                 if (i == 0x0700)
2513                                         fillIndex [0x7] = 0xE2;
2514                                 if (i == 0x2016)
2515                                         fillIndex [0x7] = 0x77;
2516
2517                                 // SPECIAL CASES:
2518                                 switch (i) {
2519                                 case 0xAB: // 08
2520                                 case 0xB7: // 0A
2521                                 case 0xBB: // 08
2522                                 case 0x2329: // 09
2523                                 case 0x232A: // 09
2524                                         continue;
2525                                 }
2526
2527                                 switch (Char.GetUnicodeCategory ((char) i)) {
2528                                 case UnicodeCategory.OtherPunctuation:
2529                                 case UnicodeCategory.ClosePunctuation:
2530                                 case UnicodeCategory.OpenPunctuation:
2531                                 case UnicodeCategory.InitialQuotePunctuation:
2532                                 case UnicodeCategory.FinalQuotePunctuation:
2533                                 case UnicodeCategory.ModifierSymbol:
2534                                         // SPECIAL CASES: // 0xA
2535                                         if (0x2020 <= i && i <= 0x2042)
2536                                                 continue;
2537                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2538                                         break;
2539                                 default:
2540                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2541                                                 goto case UnicodeCategory.OtherPunctuation;
2542                                         break;
2543                                 }
2544                         }
2545                         // Control pictures
2546                         for (int i = 0x2400; i <= 0x2421; i++)
2547                                 AddCharMap ((char) i, 0x7, 1, 0);
2548                         #endregion
2549
2550                         // FIXME: for 07 xx we need more love.
2551
2552                         // Characters w/ diacritical marks (NFKD)
2553                         for (int i = 0; i <= char.MaxValue; i++) {
2554                                 if (map [i].Defined || IsIgnorable (i))
2555                                         continue;
2556                                 if (decompIndex [i] == 0)
2557                                         continue;
2558
2559                                 int start = decompIndex [i];
2560                                 int primaryChar = decompValues [start];
2561                                 int secondary = 0;
2562                                 bool skip = false;
2563                                 int length = decompLength [i];
2564                                 // special processing for parenthesized ones.
2565                                 if (length == 3 &&
2566                                         decompValues [start] == '(' &&
2567                                         decompValues [start + 2] == ')') {
2568                                         primaryChar = decompValues [start + 1];
2569                                         length = 1;
2570                                 }
2571
2572                                 if (map [primaryChar].Level1 == 0)
2573                                         continue;
2574
2575                                 for (int l = 1; l < length; l++) {
2576                                         int c = decompValues [start + l];
2577                                         if (map [c].Level1 != 0)
2578                                                 skip = true;
2579                                         secondary += diacritical [c];
2580                                 }
2581                                 if (skip)
2582                                         continue;
2583                                 map [i] = new CharMapEntry (
2584                                         map [primaryChar].Category,
2585                                         map [primaryChar].Level1,
2586                                         (byte) secondary);
2587
2588                         }
2589
2590                         // category 08 - symbols
2591                         fillIndex [0x8] = 2;
2592                         // Here Windows mapping is not straightforward. It is
2593                         // not based on computation but seems manual sorting.
2594                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
2595                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2596                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2597                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2598                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2599                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2600                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2601                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2602                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2603                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2604                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2605                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2606                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2607
2608                         for (int cp = 0; cp < 0x2300; cp++) {
2609                                 if (cp == 0x200)
2610                                         cp = 0x2200; // skip to 2200
2611                                 if (cp == 0xAC) // SPECIAL CASE: skip
2612                                         continue;
2613                                 if (!map [cp].Defined &&
2614 //                                      Char.GetUnicodeCategory ((char) cp) ==
2615 //                                      UnicodeCategory.MathSymbol)
2616                                         Char.IsSymbol ((char) cp))
2617                                         AddCharMapGroup ((char) cp, 0x8, 1, 0);
2618                                 // SPECIAL CASES: no idea why Windows sorts as such
2619                                 switch (cp) {
2620                                 case 0x3E:
2621                                         AddCharMap ('\u227B', 0x8, 1, 0);
2622                                         AddCharMap ('\u22B1', 0x8, 1, 0);
2623                                         break;
2624                                 case 0xB1:
2625                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2626                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
2627                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2628                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
2629                                         break;
2630                                 case 0xF7:
2631                                         AddCharMap ('\u01C0', 0x8, 1, 0);
2632                                         AddCharMap ('\u01C1', 0x8, 1, 0);
2633                                         AddCharMap ('\u01C2', 0x8, 1, 0);
2634                                         break;
2635                                 }
2636                         }
2637
2638                         #region Level2 adjustment
2639                         // Arabic Hamzah
2640                         diacritical [0x624] = 0x5;
2641                         diacritical [0x626] = 0x7;
2642                         diacritical [0x622] = 0x9;
2643                         diacritical [0x623] = 0xA;
2644                         diacritical [0x625] = 0xB;
2645                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2646                         diacritical [0x64A] = 0x7; // Yaa'
2647
2648                         for (int i = 0; i < char.MaxValue; i++) {
2649                                 byte mod = 0;
2650                                 byte cat = map [i].Category;
2651                                 switch (cat) {
2652                                 case 0xE: // Latin diacritics
2653                                 case 0x22: // Japanese: circled characters
2654                                         mod = diacritical [i];
2655                                         break;
2656                                 case 0x13: // Arabic
2657                                         if (diacritical [i] == 0)
2658                                                 mod = 0x8; // default for arabic
2659                                         break;
2660                                 }
2661                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2662                                         mod = diacritical [i];
2663                                 if (mod > 0)
2664                                         map [i] = new CharMapEntry (
2665                                                 cat, map [i].Level1, mod);
2666                         }
2667                         #endregion
2668
2669                         // FIXME: this is hack but those NonSpacingMark
2670                         // characters and still undefined are likely to
2671                         // be nonspacing.
2672                         for (int i = 0; i < char.MaxValue; i++)
2673                                 if (!map [i].Defined &&
2674                                         !IsIgnorable (i) &&
2675                                         Char.GetUnicodeCategory ((char) i) ==
2676                                         UnicodeCategory.NonSpacingMark)
2677                                         AddCharMap ((char) i, 1, 1);
2678
2679                         // FIXME: this is hack but those Symbol characters
2680                         // are likely to fall into 0xA category.
2681                         for (int i = 0; i < char.MaxValue; i++)
2682                                 if (!map [i].Defined &&
2683                                         !IsIgnorable (i) &&
2684                                         Char.IsSymbol ((char) i))
2685                                         AddCharMap ((char) i, 0xA, 1);
2686                 }
2687
2688                 private void IncrementSequentialIndex (ref byte hangulCat)
2689                 {
2690                         fillIndex [hangulCat]++;
2691                         if (fillIndex [hangulCat] == 0) { // overflown
2692                                 hangulCat++;
2693                                 fillIndex [hangulCat] = 0x2;
2694                         }
2695                 }
2696
2697                 // Reset fillIndex to fixed value and call AddLetterMap().
2698                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2699                 {
2700                         fillIndex [category] = alphaWeight;
2701                         AddLetterMap (c, category, 0);
2702
2703                         ArrayList al = latinMap [c] as ArrayList;
2704                         if (al == null)
2705                                 return;
2706
2707                         foreach (int cp in al)
2708                                 AddLetterMap ((char) cp, category, 0);
2709                 }
2710
2711                 private void AddKanaMap (int i, byte voices)
2712                 {
2713                         for (byte b = 0; b < voices; b++) {
2714                                 char c = (char) (i + b);
2715                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
2716                                 // Hiragana
2717                                 AddLetterMapCore (c, 0x22, 0, arg);
2718                                 // Katakana
2719                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
2720                         }
2721                 }
2722
2723                 private void AddLetterMap (char c, byte category, byte updateCount)
2724                 {
2725                         AddLetterMapCore (c, category, updateCount, 0);
2726                 }
2727
2728                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2729                 {
2730                         char c2;
2731                         // <small> updates index
2732                         c2 = ToSmallForm (c);
2733                         if (c2 != c)
2734                                 AddCharMapGroup (c2, category, updateCount, level2);
2735                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2736                         if (c2 != c && !map [(int) c2].Defined)
2737                                 AddLetterMapCore (c2, category, 0, level2);
2738                         bool doUpdate = true;
2739                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2740                                 doUpdate = false;
2741                         else
2742                                 AddCharMapGroup (c, category, 0, level2);
2743                         if (doUpdate)
2744                                 fillIndex [category] += updateCount;
2745                 }
2746
2747                 private bool AddCharMap (char c, byte category, byte increment)
2748                 {
2749                         return AddCharMap (c, category, increment, 0);
2750                 }
2751
2752                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2753                 {
2754                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2755                                 return false; // do nothing
2756                         map [(int) c] = new CharMapEntry (category,
2757                                 category == 1 ? alt : fillIndex [category],
2758                                 category == 1 ? fillIndex [category] : alt);
2759                         fillIndex [category] += increment;
2760                         return true;
2761                 }
2762
2763                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2764                 {
2765                         char c2 = ToSmallFormTail (c);
2766                         if (c2 != c)
2767                                 AddCharMap (c2, category, updateCount, 0);
2768                         // itself
2769                         AddCharMap (c, category, updateCount, 0);
2770                         // <full>
2771                         c2 = ToFullWidthTail (c);
2772                         if (c2 != c)
2773                                 AddCharMapGroupTail (c2, category, updateCount);
2774                 }
2775
2776                 //
2777                 // Adds characters to table in the order below
2778                 // (+ increases weight):
2779                 //      (<small> +)
2780                 //      itself
2781                 //      <fraction>
2782                 //      <full> | <super> | <sub>
2783                 //      <circle> | <wide> (| <narrow>)
2784                 //      +
2785                 //      (vertical +)
2786                 //
2787                 // level2 is fixed (does not increase).
2788                 int [] sameWeightItems = new int [] {
2789                         DecompositionFraction,
2790                         DecompositionFull,
2791                         DecompositionSuper,
2792                         DecompositionSub,
2793                         DecompositionCircle,
2794                         DecompositionWide,
2795                         DecompositionNarrow,
2796                         };
2797                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2798                 {
2799                         if (map [(int) c].Defined)
2800                                 return;
2801
2802                         char small = char.MinValue;
2803                         char vertical = char.MinValue;
2804                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2805                         if (nfkd != null) {
2806                                 object smv = nfkd [(byte) DecompositionSmall];
2807                                 if (smv != null)
2808                                         small = (char) ((int) smv);
2809                                 object vv = nfkd [(byte) DecompositionVertical];
2810                                 if (vv != null)
2811                                         vertical = (char) ((int) vv);
2812                         }
2813
2814                         // <small> updates index
2815                         if (small != char.MinValue)
2816                                 AddCharMap (small, category, updateCount);
2817
2818                         // itself
2819                         AddCharMap (c, category, 0, level2);
2820
2821                         if (nfkd != null) {
2822                                 foreach (int weight in sameWeightItems) {
2823                                         object wv = nfkd [(byte) weight];
2824                                         if (wv != null)
2825                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
2826                                 }
2827                         }
2828
2829                         // update index here.
2830                         fillIndex [category] += updateCount;
2831
2832                         if (vertical != char.MinValue)
2833                                 AddCharMap (vertical, category, updateCount, level2);
2834                 }
2835
2836                 private void AddCharMapCJK (char c, ref byte category)
2837                 {
2838                         AddCharMap (c, category, 0, 0);
2839                         IncrementSequentialIndex (ref category);
2840
2841                         // Special. I wonder why but Windows skips 9E F9.
2842                         if (category == 0x9E && fillIndex [category] == 0xF9)
2843                                 IncrementSequentialIndex (ref category);
2844                 }
2845
2846                 private void AddCharMapGroupCJK (char c, ref byte category)
2847                 {
2848                         AddCharMapCJK (c, ref category);
2849
2850                         // LAMESPEC: see below.
2851                         if (c == '\u5B78') {
2852                                 AddCharMapCJK ('\u32AB', ref category);
2853                                 AddCharMapCJK ('\u323B', ref category);
2854                         }
2855                         if (c == '\u52DE') {
2856                                 AddCharMapCJK ('\u3298', ref category);
2857                                 AddCharMapCJK ('\u3238', ref category);
2858                         }
2859                         if (c == '\u5BEB')
2860                                 AddCharMapCJK ('\u32A2', ref category);
2861                         if (c == '\u91AB')
2862                                 // Especially this mapping order totally does
2863                                 // not make sense to me.
2864                                 AddCharMapCJK ('\u32A9', ref category);
2865
2866                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2867                         if (nfkd == null)
2868                                 return;
2869                         for (byte weight = 0; weight <= 0x12; weight++) {
2870                                 object wv = nfkd [weight];
2871                                 if (wv == null)
2872                                         continue;
2873                                 int w = (int) wv;
2874
2875                                 // Special: they are ignored in this area.
2876                                 // FIXME: check if it is sane
2877                                 if (0xF900 <= w && w <= 0xFAD9)
2878                                         continue;
2879                                 // LAMESPEC: on Windows some of CJK characters
2880                                 // in 3200-32B0 are incorrectly mapped. They
2881                                 // mix Chinise and Japanese Kanji when
2882                                 // ordering those characters.
2883                                 switch (w) {
2884                                 case 0x32A2: case 0x3298: case 0x3238:
2885                                 case 0x32A9: case 0x323B: case 0x32AB:
2886                                         continue;
2887                                 }
2888
2889                                 AddCharMapCJK ((char) w, ref category);
2890                         }
2891                 }
2892
2893                 // For now it is only for 0x7 category.
2894                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2895                 {
2896                         char small = char.MinValue;
2897                         char vertical = char.MinValue;
2898                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2899                         if (nfkd != null) {
2900                                 object smv = nfkd [(byte) DecompositionSmall];
2901                                 if (smv != null)
2902                                         small = (char) ((int) smv);
2903                                 object vv = nfkd [(byte) DecompositionVertical];
2904                                 if (vv != null)
2905                                         vertical = (char) ((int) vv);
2906                         }
2907
2908                         // <small> updates index
2909                         if (small != char.MinValue)
2910                                 // SPECIAL CASE excluded (FIXME: why?)
2911                                 if (small != '\u2024')
2912                                         AddCharMap (small, category, updateCount);
2913
2914                         // itself
2915                         AddCharMap (c, category, updateCount, level2);
2916
2917                         // Since nfkdMap is problematic to have two or more
2918                         // NFKD to an identical character, here I iterate all.
2919                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2920                                 if (decompLength [c2] == 1 &&
2921                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
2922                                         switch (decompType [c2]) {
2923                                         case DecompositionCompat:
2924                                                 AddCharMap ((char) c2, category, updateCount, level2);
2925                                                 break;
2926                                         }
2927                                 }
2928                         }
2929
2930                         if (vertical != char.MinValue)
2931                                 // SPECIAL CASE excluded (FIXME: why?)
2932                                 if (vertical != '\uFE33' && vertical != '\uFE34')
2933                                         AddCharMap (vertical, category, updateCount, level2);
2934                 }
2935
2936                 private void AddArabicCharMap (char c)
2937                 {
2938                         byte category = 6;
2939                         byte updateCount = 1;
2940                         byte level2 = 0;
2941
2942                         // itself
2943                         AddCharMap (c, category, 0, level2);
2944
2945                         // Since nfkdMap is problematic to have two or more
2946                         // NFKD to an identical character, here I iterate all.
2947                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2948                                 if (decompLength [c2] == 0)
2949                                         continue;
2950                                 int idx = decompIndex [c2] + decompLength [c2] - 1;
2951                                 if ((int) (decompValues [idx]) == (int) c)
2952                                         AddCharMap ((char) c2, category,
2953                                                 0, level2);
2954                         }
2955                         fillIndex [category] += updateCount;
2956                 }
2957
2958                 char ToFullWidth (char c)
2959                 {
2960                         return ToDecomposed (c, DecompositionFull, false);
2961                 }
2962
2963                 char ToFullWidthTail (char c)
2964                 {
2965                         return ToDecomposed (c, DecompositionFull, true);
2966                 }
2967
2968                 char ToSmallForm (char c)
2969                 {
2970                         return ToDecomposed (c, DecompositionSmall, false);
2971                 }
2972
2973                 char ToSmallFormTail (char c)
2974                 {
2975                         return ToDecomposed (c, DecompositionSmall, true);
2976                 }
2977
2978                 char ToDecomposed (char c, byte d, bool tail)
2979                 {
2980                         if (decompType [(int) c] != d)
2981                                 return c;
2982                         int idx = decompIndex [(int) c];
2983                         if (tail)
2984                                 idx += decompLength [(int) c] - 1;
2985                         return (char) decompValues [idx];
2986                 }
2987
2988                 bool ExistsJIS (int cp)
2989                 {
2990                         foreach (JISCharacter j in jisJapanese)
2991                                 if (j.CP == cp)
2992                                         return true;
2993                         return false;
2994                 }
2995
2996                 #endregion
2997
2998                 #region Level 3 properties (Case/Width)
2999
3000                 private byte ComputeLevel3Weight (char c)
3001                 {
3002                         byte b = ComputeLevel3WeightRaw (c);
3003                         return b > 0 ? (byte) (b + 2) : b;
3004                 }
3005
3006                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3007                 {
3008                         // CJK compat
3009                         if ('\u3192' <= c && c <= '\u319F')
3010                                 return 0;
3011                         // Korean
3012                         if ('\u11A8' <= c && c <= '\u11F9')
3013                                 return 2;
3014                         if ('\uFFA0' <= c && c <= '\uFFDC')
3015                                 return 4;
3016                         if ('\u3130' <= c && c <= '\u3164')
3017                                 return 5;
3018                         if ('\u3165' <= c && c <= '\u318E')
3019                                 return 4;
3020                         // numbers
3021                         if ('\u2776' <= c && c <= '\u277F')
3022                                 return 4;
3023                         if ('\u2780' <= c && c <= '\u2789')
3024                                 return 8;
3025                         if ('\u2776' <= c && c <= '\u2793')
3026                                 return 0xC;
3027                         if ('\u2160' <= c && c <= '\u216F')
3028                                 return 0x18;
3029                         if ('\u2181' <= c && c <= '\u2182')
3030                                 return 0x18;
3031                         // Arabic
3032                         if ('\u2135' <= c && c <= '\u2138')
3033                                 return 4;
3034                         if ('\uFE80' <= c && c < '\uFF00') {
3035                                 // 2(Isolated)/8(Final)/0x18(Medial)
3036                                 switch (decompType [(int) c]) {
3037                                 case DecompositionIsolated:
3038                                         return 2;
3039                                 case DecompositionFinal:
3040                                         return 8;
3041                                 case DecompositionMedial:
3042                                         return 0x18;
3043                                 }
3044                         }
3045
3046                         // actually I dunno the reason why they have weights.
3047                         switch (c) {
3048                         case '\u01BC':
3049                                 return 0x10;
3050                         case '\u06A9':
3051                                 return 0x20;
3052                         case '\u06AA':
3053                                 return 0x28;
3054                         }
3055
3056                         byte ret = 0;
3057                         switch (c) {
3058                         case '\u03C2':
3059                         case '\u2104':
3060                         case '\u212B':
3061                                 ret |= 8;
3062                                 break;
3063                         case '\uFE42':
3064                                 ret |= 0xC;
3065                                 break;
3066                         }
3067
3068                         // misc
3069                         switch (decompType [(int) c]) {
3070                         case DecompositionWide: // <wide>
3071                         case DecompositionSub: // <sub>
3072                         case DecompositionSuper: // <super>
3073                                 ret |= decompType [(int) c];
3074                                 break;
3075                         }
3076                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3077                                 ret |= 8;
3078                         if (isUppercase [(int) c]) // DerivedCoreProperties
3079                                 ret |= 0x10;
3080
3081                         return ret;
3082                 }
3083
3084                 #endregion
3085
3086                 #region IsIgnorable
3087 /*
3088                 static bool IsIgnorable (int i)
3089                 {
3090                         if (unicodeAge [i] >= 3.1)
3091                                 return true;
3092                         switch (char.GetUnicodeCategory ((char) i)) {
3093                         case UnicodeCategory.OtherNotAssigned:
3094                         case UnicodeCategory.Format:
3095                                 return true;
3096                         }
3097                         return false;
3098                 }
3099 */
3100
3101                 // FIXME: In the future use DerivedAge.txt to examine character
3102                 // versions and set those ones that have higher version than
3103                 // 1.0 as ignorable.
3104                 static bool IsIgnorable (int i)
3105                 {
3106                         switch (i) {
3107                         case 0:
3108                         // I guess, those characters are added between
3109                         // Unicode 1.0 (LCMapString) and Unicode 3.1
3110                         // (UnicodeCategory), so they used to be
3111                         // something like OtherNotAssigned as of Unicode 1.1.
3112                         case 0x2df: case 0x387:
3113                         case 0x3d7: case 0x3d8: case 0x3d9:
3114                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3115                         case 0x400: case 0x40d: case 0x450: case 0x45d:
3116                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
3117                         case 0x653: case 0x654: case 0x655: case 0x66d:
3118                         case 0xb56:
3119                         case 0x1e9b: case 0x202f: case 0x20ad:
3120                         case 0x20ae: case 0x20af:
3121                         case 0x20e2: case 0x20e3:
3122                         case 0x2139: case 0x213a: case 0x2183:
3123                         case 0x2425: case 0x2426: case 0x2619:
3124                         case 0x2670: case 0x2671: case 0x3007:
3125                         case 0x3190: case 0x3191:
3126                         case 0xfffc: case 0xfffd:
3127                                 return true;
3128                         // exceptional characters filtered by the
3129                         // following conditions. Originally those exceptional
3130                         // ranges are incorrect (they should not be ignored)
3131                         // and most of those characters are unfortunately in
3132                         // those ranges.
3133                         case 0x4d8: case 0x4d9:
3134                         case 0x4e8: case 0x4e9:
3135                         case 0x70F:
3136                         case 0x3036: case 0x303f:
3137                         case 0x337b: case 0xfb1e:
3138                                 return false;
3139                         }
3140
3141                         if (
3142                                 // The whole Sinhala characters.
3143                                 0x0D82 <= i && i <= 0x0DF4
3144                                 // The whole Tibetan characters.
3145                                 || 0x0F00 <= i && i <= 0x0FD1
3146                                 // The whole Myanmar characters.
3147                                 || 0x1000 <= i && i <= 0x1059
3148                                 // The whole Etiopic, Cherokee,
3149                                 // Canadian Syllablic, Ogham, Runic,
3150                                 // Tagalog, Hanunoo, Philippine,
3151                                 // Buhid, Tagbanwa, Khmer and Mongorian
3152                                 // characters.
3153                                 || 0x1200 <= i && i <= 0x1DFF
3154                                 // Greek extension characters.
3155                                 || 0x1F00 <= i && i <= 0x1FFF
3156                                 // The whole Braille characters.
3157                                 || 0x2800 <= i && i <= 0x28FF
3158                                 // CJK radical characters.
3159                                 || 0x2E80 <= i && i <= 0x2EF3
3160                                 // Kangxi radical characters.
3161                                 || 0x2F00 <= i && i <= 0x2FD5
3162                                 // Ideographic description characters.
3163                                 || 0x2FF0 <= i && i <= 0x2FFB
3164                                 // Bopomofo letter and final
3165                                 || 0x31A0 <= i && i <= 0x31B7
3166                                 // White square with quadrant characters.
3167                                 || 0x25F0 <= i && i <= 0x25F7
3168                                 // Ideographic telegraph symbols.
3169                                 || 0x32C0 <= i && i <= 0x32CB
3170                                 || 0x3358 <= i && i <= 0x3370
3171                                 || 0x33E0 <= i && i <= 0x33FF
3172                                 // The whole YI characters.
3173                                 || 0xA000 <= i && i <= 0xA48C
3174                                 || 0xA490 <= i && i <= 0xA4C6
3175                                 // American small ligatures
3176                                 || 0xFB13 <= i && i <= 0xFB17
3177                                 // hebrew, arabic, variation selector.
3178                                 || 0xFB1D <= i && i <= 0xFE2F
3179                                 // Arabic ligatures.
3180                                 || 0xFEF5 <= i && i <= 0xFEFC
3181                                 // FIXME: why are they excluded?
3182                                 || 0x01F6 <= i && i <= 0x01F9
3183                                 || 0x0218 <= i && i <= 0x0233
3184                                 || 0x02A9 <= i && i <= 0x02AD
3185                                 || 0x02EA <= i && i <= 0x02EE
3186                                 || 0x0349 <= i && i <= 0x036F
3187                                 || 0x0488 <= i && i <= 0x048F
3188                                 || 0x04D0 <= i && i <= 0x04FF
3189                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3190                                 || 0x06D6 <= i && i <= 0x06ED
3191                                 || 0x06FA <= i && i <= 0x06FE
3192                                 || 0x2048 <= i && i <= 0x204D
3193                                 || 0x20e4 <= i && i <= 0x20ea
3194                                 || 0x213C <= i && i <= 0x214B
3195                                 || 0x21EB <= i && i <= 0x21FF
3196                                 || 0x22F2 <= i && i <= 0x22FF
3197                                 || 0x237B <= i && i <= 0x239A
3198                                 || 0x239B <= i && i <= 0x23CF
3199                                 || 0x24EB <= i && i <= 0x24FF
3200                                 || 0x2596 <= i && i <= 0x259F
3201                                 || 0x25F8 <= i && i <= 0x25FF
3202                                 || 0x2672 <= i && i <= 0x2689
3203                                 || 0x2768 <= i && i <= 0x2775
3204                                 || 0x27d0 <= i && i <= 0x27ff
3205                                 || 0x2900 <= i && i <= 0x2aff
3206                                 || 0x3033 <= i && i <= 0x303F
3207                                 || 0x31F0 <= i && i <= 0x31FF
3208                                 || 0x3250 <= i && i <= 0x325F
3209                                 || 0x32B1 <= i && i <= 0x32BF
3210                                 || 0x3371 <= i && i <= 0x337B
3211                                 || 0xFA30 <= i && i <= 0xFA6A
3212                         )
3213                                 return true;
3214
3215                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3216                         switch (uc) {
3217                         case UnicodeCategory.PrivateUse:
3218                         case UnicodeCategory.Surrogate:
3219                                 return false;
3220                         // ignored by nature
3221                         case UnicodeCategory.Format:
3222                         case UnicodeCategory.OtherNotAssigned:
3223                                 return true;
3224                         default:
3225                                 return false;
3226                         }
3227                 }
3228
3229                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3230
3231                 /*
3232                 public static void Main ()
3233                 {
3234                         for (int i = 0; i <= char.MaxValue; i++)
3235                                 Dump (i, IsIgnorable (i));
3236                 }
3237
3238                 static void Dump (int i, bool ignore)
3239                 {
3240                         switch (Char.GetUnicodeCategory ((char) i)) {
3241                         case UnicodeCategory.PrivateUse:
3242                         case UnicodeCategory.Surrogate:
3243                                 return; // check nothing
3244                         }
3245
3246                         string s1 = "";
3247                         string s2 = new string ((char) i, 10);
3248                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3249                         if ((ret == 0) == ignore)
3250                                 return;
3251                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3252                 }
3253                 */
3254                 #endregion // IsIgnorable
3255
3256                 #region IsIgnorableSymbol
3257                 static bool IsIgnorableSymbol (int i)
3258                 {
3259                         if (IsIgnorable (i))
3260                                 return true;
3261
3262                         switch (i) {
3263                         // *Letter
3264                         case 0x00b5: case 0x01C0: case 0x01C1:
3265                         case 0x01C2: case 0x01C3: case 0x01F6:
3266                         case 0x01F7: case 0x01F8: case 0x01F9:
3267                         case 0x02D0: case 0x02EE: case 0x037A:
3268                         case 0x03D7: case 0x03F3:
3269                         case 0x0400: case 0x040d:
3270                         case 0x0450: case 0x045d:
3271                         case 0x048C: case 0x048D:
3272                         case 0x048E: case 0x048F:
3273                         case 0x0587: case 0x0640: case 0x06E5:
3274                         case 0x06E6: case 0x06FA: case 0x06FB:
3275                         case 0x06FC: case 0x093D: case 0x0950:
3276                         case 0x1E9B: case 0x2139: case 0x3006:
3277                         case 0x3033: case 0x3034: case 0x3035:
3278                         case 0xFE7E: case 0xFE7F:
3279                         // OtherNumber
3280                         case 0x16EE: case 0x16EF: case 0x16F0:
3281                         // LetterNumber
3282                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3283                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3284                         case 0x3038: // HANGZHOU NUMERAL TEN
3285                         case 0x3039: // HANGZHOU NUMERAL TWENTY
3286                         case 0x303a: // HANGZHOU NUMERAL THIRTY
3287                         // OtherSymbol
3288                         case 0x2117:
3289                         case 0x327F:
3290                                 return true;
3291                         // ModifierSymbol
3292                         case 0x02B9: case 0x02BA: case 0x02C2:
3293                         case 0x02C3: case 0x02C4: case 0x02C5:
3294                         case 0x02C8: case 0x02CC: case 0x02CD:
3295                         case 0x02CE: case 0x02CF: case 0x02D2:
3296                         case 0x02D3: case 0x02D4: case 0x02D5:
3297                         case 0x02D6: case 0x02D7: case 0x02DE:
3298                         case 0x02E5: case 0x02E6: case 0x02E7:
3299                         case 0x02E8: case 0x02E9:
3300                         case 0x309B: case 0x309C:
3301                         // OtherPunctuation
3302                         case 0x055A: // American Apos
3303                         case 0x05C0: // Hebrew Punct
3304                         case 0x0E4F: // Thai FONGMAN
3305                         case 0x0E5A: // Thai ANGKHANKHU
3306                         case 0x0E5B: // Thai KHOMUT
3307                         // CurencySymbol
3308                         case 0x09F2: // Bengali Rupee Mark
3309                         case 0x09F3: // Bengali Rupee Sign
3310                         // MathSymbol
3311                         case 0x221e: // INF.
3312                         // OtherSymbol
3313                         case 0x0482:
3314                         case 0x09FA:
3315                         case 0x0B70:
3316                                 return false;
3317                         }
3318
3319                         // *Letter
3320                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3321 #if NET_2_0
3322                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3323                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3324 #endif
3325                         )
3326                                 return true;
3327
3328                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3329                         switch (uc) {
3330                         case UnicodeCategory.Surrogate:
3331                                 return false; // inconsistent
3332
3333                         case UnicodeCategory.SpacingCombiningMark:
3334                         case UnicodeCategory.EnclosingMark:
3335                         case UnicodeCategory.NonSpacingMark:
3336                         case UnicodeCategory.PrivateUse:
3337                                 // NonSpacingMark
3338                                 if (0x064B <= i && i <= 0x0652) // Arabic
3339                                         return true;
3340                                 return false;
3341
3342                         case UnicodeCategory.Format:
3343                         case UnicodeCategory.OtherNotAssigned:
3344                                 return true;
3345
3346                         default:
3347                                 bool use = false;
3348                                 // OtherSymbols
3349                                 if (
3350                                         // latin in a circle
3351                                         0x249A <= i && i <= 0x24E9
3352                                         || 0x2100 <= i && i <= 0x2132
3353                                         // Japanese
3354                                         || 0x3196 <= i && i <= 0x31A0
3355                                         // Korean
3356                                         || 0x3200 <= i && i <= 0x321C
3357                                         // Chinese/Japanese
3358                                         || 0x322A <= i && i <= 0x3243
3359                                         // CJK
3360                                         || 0x3260 <= i && i <= 0x32B0
3361                                         || 0x32D0 <= i && i <= 0x3357
3362                                         || 0x337B <= i && i <= 0x33DD
3363                                 )
3364                                         use = !Char.IsLetterOrDigit ((char) i);
3365                                 if (use)
3366                                         return false;
3367
                                // This "Digit" rule is a mystery.
3369                                 // It filters some symbols out.
3370                                 if (Char.IsLetterOrDigit ((char) i))
3371                                         return false;
3372                                 if (Char.IsNumber ((char) i))
3373                                         return false;
3374                                 if (Char.IsControl ((char) i)
3375                                         || Char.IsSeparator ((char) i)
3376                                         || Char.IsPunctuation ((char) i))
3377                                         return true;
3378                                 if (Char.IsSymbol ((char) i))
3379                                         return true;
3380
3381                                 // FIXME: should check more
3382                                 return false;
3383                         }
3384                 }
3385
3386                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3387 /*
3388                 public static void Main ()
3389                 {
3390                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3391                         for (int i = 0; i <= char.MaxValue; i++) {
3392                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3393                                 if (uc == UnicodeCategory.Surrogate)
3394                                         continue;
3395
3396                                 bool ret = IsIgnorableSymbol (i);
3397
3398                                 string s1 = "TEST ";
3399                                 string s2 = "TEST " + (char) i;
3400
3401                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3402
3403                                 if (ret != (result == 0))
3404                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3405                                                 ret ? "should not ignore" :
3406                                                         "should ignore",
3407                                                 i,(char) i, uc);
3408                         }
3409                 }
3410 */
3411                 #endregion
3412
3413                 #region NonSpacing
3414                 static bool IsIgnorableNonSpacing (int i)
3415                 {
3416                         if (IsIgnorable (i))
3417                                 return true;
3418
3419                         switch (i) {
3420                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3421                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3422                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3423                                 return true;
3424                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3425                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3426                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3427                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3428                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3429                         case 0x0CCD: case 0x0E4E:
3430                                 return false;
3431                         }
3432
3433                         if (0x02b9 <= i && i <= 0x02c5
3434                                 || 0x02cc <= i && i <= 0x02d7
3435                                 || 0x02e4 <= i && i <= 0x02ef
3436                                 || 0x20DD <= i && i <= 0x20E0
3437                         )
3438                                 return true;
3439
3440                         if (0x064B <= i && i <= 0x00652
3441                                 || 0x0941 <= i && i <= 0x0948
3442                                 || 0x0AC1 <= i && i <= 0x0ACD
3443                                 || 0x0C3E <= i && i <= 0x0C4F
3444                                 || 0x0E31 <= i && i <= 0x0E3F
3445                         )
3446                                 return false;
3447
3448                         return Char.GetUnicodeCategory ((char) i) ==
3449                                 UnicodeCategory.NonSpacingMark;
3450                 }
3451
3452                 // We can reuse IsIgnorableSymbol testcode
3453                 // for IsIgnorableNonSpacing.
3454                 #endregion
3455         }
3456
3457         struct CharMapEntry
3458         {
3459                 public byte Category;
3460                 public byte Level1;
3461                 public byte Level2; // It is always single byte.
3462                 public bool Defined;
3463
3464                 public CharMapEntry (byte category, byte level1, byte level2)
3465                 {
3466                         Category = category;
3467                         Level1 = level1;
3468                         Level2 = level2;
3469                         Defined = true;
3470                 }
3471         }
3472
3473         class JISCharacter
3474         {
3475                 public readonly int CP;
3476                 public readonly int JIS;
3477
3478                 public JISCharacter (int cp, int cpJIS)
3479                 {
3480                         CP = cp;
3481                         JIS = cpJIS;
3482                 }
3483         }
3484
3485         class JISComparer : IComparer
3486         {
3487                 public static readonly JISComparer Instance =
3488                         new JISComparer ();
3489
3490                 public int Compare (object o1, object o2)
3491                 {
3492                         JISCharacter j1 = (JISCharacter) o1;
3493                         JISCharacter j2 = (JISCharacter) o2;
3494                         return j1.JIS - j2.JIS;
3495                 }
3496         }
3497
3498         class NonJISCharacter
3499         {
3500                 public readonly int CP;
3501                 public readonly string Name;
3502
3503                 public NonJISCharacter (int cp, string name)
3504                 {
3505                         CP = cp;
3506                         Name = name;
3507                 }
3508         }
3509
3510         class NonJISComparer : IComparer
3511         {
3512                 public static readonly NonJISComparer Instance =
3513                         new NonJISComparer ();
3514
3515                 public int Compare (object o1, object o2)
3516                 {
3517                         NonJISCharacter j1 = (NonJISCharacter) o1;
3518                         NonJISCharacter j2 = (NonJISCharacter) o2;
3519                         return string.CompareOrdinal (j1.Name, j2.Name);
3520                 }
3521         }
3522
3523         class DecimalDictionaryValueComparer : IComparer
3524         {
3525                 public static readonly DecimalDictionaryValueComparer Instance
3526                         = new DecimalDictionaryValueComparer ();
3527
3528                 private DecimalDictionaryValueComparer ()
3529                 {
3530                 }
3531
3532                 public int Compare (object o1, object o2)
3533                 {
3534                         DictionaryEntry e1 = (DictionaryEntry) o1;
3535                         DictionaryEntry e2 = (DictionaryEntry) o2;
3536                         // FIXME: in case of 0, compare decomposition categories
3537                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3538                         if (ret != 0)
3539                                 return ret;
3540                         int i1 = (int) e1.Key;
3541                         int i2 = (int) e2.Key;
3542                         return i1 - i2;
3543                 }
3544         }
3545
3546         class StringDictionaryValueComparer : IComparer
3547         {
3548                 public static readonly StringDictionaryValueComparer Instance
3549                         = new StringDictionaryValueComparer ();
3550
3551                 private StringDictionaryValueComparer ()
3552                 {
3553                 }
3554
3555                 public int Compare (object o1, object o2)
3556                 {
3557                         DictionaryEntry e1 = (DictionaryEntry) o1;
3558                         DictionaryEntry e2 = (DictionaryEntry) o2;
3559                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3560                         if (ret != 0)
3561                                 return ret;
3562                         int i1 = (int) e1.Key;
3563                         int i2 = (int) e2.Key;
3564                         return i1 - i2;
3565                 }
3566         }
3567
3568         class UCAComparer : IComparer
3569         {
3570                 public static readonly UCAComparer Instance
3571                         = new UCAComparer ();
3572
3573                 private UCAComparer ()
3574                 {
3575                 }
3576
3577                 public int Compare (object o1, object o2)
3578                 {
3579                         char i1 = (char) o1;
3580                         char i2 = (char) o2;
3581
3582                         int l1 = CollationElementTable.GetSortKeyCount (i1);
3583                         int l2 = CollationElementTable.GetSortKeyCount (i2);
3584                         int l = l1 > l2 ? l2 : l1;
3585
3586                         for (int i = 0; i < l; i++) {
3587                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3588                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3589                                 int v = k1.Primary - k2.Primary;
3590                                 if (v != 0)
3591                                         return v;
3592                                 v = k1.Secondary - k2.Secondary;
3593                                 if (v != 0)
3594                                         return v;
3595                                 v = k1.Thirtiary - k2.Thirtiary;
3596                                 if (v != 0)
3597                                         return v;
3598                                 v = k1.Quarternary - k2.Quarternary;
3599                                 if (v != 0)
3600                                         return v;
3601                         }
3602                         return l1 - l2;
3603                 }
3604         }
3605
3606         class Tailoring
3607         {
3608                 int lcid;
3609                 int alias;
3610                 bool frenchSort;
3611                 ArrayList items = new ArrayList ();
3612
3613                 public Tailoring (int lcid)
3614                         : this (lcid, 0)
3615                 {
3616                 }
3617
3618                 public Tailoring (int lcid, int alias)
3619                 {
3620                         this.lcid = lcid;
3621                         this.alias = alias;
3622                 }
3623
3624                 public int LCID {
3625                         get { return lcid; }
3626                 }
3627
3628                 public int Alias {
3629                         get { return alias; }
3630                 }
3631
3632                 public bool FrenchSort {
3633                         get { return frenchSort; }
3634                         set { frenchSort = value; }
3635                 }
3636
3637                 public void AddDiacriticalMap (byte target, byte replace)
3638                 {
3639                         items.Add (new DiacriticalMap (target, replace));
3640                 }
3641
3642                 public void AddSortKeyMap (string source, byte [] sortkey)
3643                 {
3644                         items.Add (new SortKeyMap (source, sortkey));
3645                 }
3646
3647                 public void AddReplacementMap (string source, string replace)
3648                 {
3649                         items.Add (new ReplacementMap (source, replace));
3650                 }
3651
3652                 public char [] ItemToCharArray ()
3653                 {
3654                         ArrayList al = new ArrayList ();
3655                         foreach (ITailoringMap m in items)
3656                                 al.AddRange (m.ToCharArray ());
3657                         return al.ToArray (typeof (char)) as char [];
3658                 }
3659
3660                 interface ITailoringMap
3661                 {
3662                         char [] ToCharArray ();
3663                 }
3664
3665                 class DiacriticalMap : ITailoringMap
3666                 {
3667                         public readonly byte Target;
3668                         public readonly byte Replace;
3669
3670                         public DiacriticalMap (byte target, byte replace)
3671                         {
3672                                 Target = target;
3673                                 Replace = replace;
3674                         }
3675
3676                         public char [] ToCharArray ()
3677                         {
3678                                 char [] ret = new char [3];
3679                                 ret [0] = (char) 02; // kind:DiacriticalMap
3680                                 ret [1] = (char) Target;
3681                                 ret [2] = (char) Replace;
3682                                 return ret;
3683                         }
3684                 }
3685
3686                 class SortKeyMap : ITailoringMap
3687                 {
3688                         public readonly string Source;
3689                         public readonly byte [] SortKey;
3690
3691                         public SortKeyMap (string source, byte [] sortkey)
3692                         {
3693                                 Source = source;
3694                                 SortKey = sortkey;
3695                         }
3696
3697                         public char [] ToCharArray ()
3698                         {
3699                                 char [] ret = new char [Source.Length + 7];
3700                                 ret [0] = (char) 01; // kind:SortKeyMap
3701                                 for (int i = 0; i < Source.Length; i++)
3702                                         ret [i + 1] = Source [i];
3703                                 // null terminate
3704                                 for (int i = 0; i < 4; i++)
3705                                         ret [i + Source.Length + 2] = (char) SortKey [i];
3706                                 return ret;
3707                         }
3708                 }
3709
3710                 class ReplacementMap : ITailoringMap
3711                 {
3712                         public readonly string Source;
3713                         public readonly string Replace;
3714
3715                         public ReplacementMap (string source, string replace)
3716                         {
3717                                 Source = source;
3718                                 Replace = replace;
3719                         }
3720
3721                         public char [] ToCharArray ()
3722                         {
3723                                 char [] ret = new char [Source.Length + Replace.Length + 3];
3724                                 ret [0] = (char) 03; // kind:ReplaceMap
3725                                 int pos = 1;
3726                                 for (int i = 0; i < Source.Length; i++)
3727                                         ret [pos++] = Source [i];
3728                                 // null terminate
3729                                 pos++;
3730                                 for (int i = 0; i < Replace.Length; i++)
3731                                         ret [pos++] = Replace [i];
3732                                 // null terminate
3733                                 return ret;
3734                         }
3735                 }
3736         }
3737 }