mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those that are computed and those that
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, for composite characters, a different index table should be prepared.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27 #define Binary
  28
  29 using System;
  30 using System.IO;
  31 using System.Collections;
  32 using System.Globalization;
  33 using System.Text;
  34 using System.Xml;
  35
  36 namespace Mono.Globalization.Unicode
  37 {
  38         internal class MSCompatSortKeyTableGenerator
  39         {
  // Program entry point: creates a generator instance and passes the
  // command-line arguments on to Run ().
  public static void Main (string [] args)
  {
          MSCompatSortKeyTableGenerator generator =
                  new MSCompatSortKeyTableGenerator ();
          generator.Run (args);
  }
  44
                     // Numeric codes for the kind of decomposition recorded for a
                     // character. The per-code-point decompType [] array is compared
                     // against these values; Serialize () uses Narrow, Wide, Super
                     // and Sub to select the entries copied into widthCompat.
                     // NOTE(review): the names presumably mirror the Unicode
                     // compatibility formatting tags (<wide>, <sub>, ...) - confirm
                     // against the UnicodeData.txt parser, which is outside this view.
                     // Beware that Full (0xE) and Narrow (0xD) are declared out of
                     // numeric order.
  45                 const int DecompositionWide = 1; // fixed
  46                 const int DecompositionSub = 2; // fixed
  47                 const int DecompositionSmall = 3;
  48                 const int DecompositionIsolated = 4;
  49                 const int DecompositionInitial = 5;
  50                 const int DecompositionFinal = 6;
  51                 const int DecompositionMedial = 7;
  52                 const int DecompositionNoBreak = 8;
  53                 const int DecompositionVertical = 9;
  54                 const int DecompositionFraction = 0xA;
  55                 const int DecompositionFont = 0xB;
  56                 const int DecompositionSuper = 0xC; // fixed
  57                 const int DecompositionFull = 0xE;
  58                 const int DecompositionNarrow = 0xD;
  59                 const int DecompositionCircle = 0xF;
  60                 const int DecompositionSquare = 0x10;
  61                 const int DecompositionCompat = 0x11;
  62                 const int DecompositionCanonical = 0x12;
  63
                     // Destination of the generated C# source text (array
                     // declarations); defaults to standard output.
  64                 TextWriter Result = Console.Out;
  65
                     // NOTE(review): presumably the next free primary weight for each
                     // category - its users are outside this view, confirm there.
  66                 byte [] fillIndex = new byte [256]; // by category
                     // One entry per UTF-16 code unit. Serialize () reads Category,
                     // Level1 and Level2 from each entry.
  67                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  68
                     // NOTE(review): characters given special "ignore" treatment;
                     // the consuming code is outside this view.
  69                 char [] specialIgnore = new char [] {
  70                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  71                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  72                         };
  73
                     // alphabets and alphaWeights are parallel arrays (29 entries
                     // each); keep them the same length when editing either one.
                     // NOTE(review): presumably alphaWeights [i] is the weight given
                     // to alphabets [i] - the consuming code is outside this view.
  74                 // FIXME: need more love (as always)
  75                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  76                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  77                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  78                         '\u0292', '\u01BE', '\u0298'};
  79                 byte [] alphaWeights = new byte [] {
  80                         2, 9, 0xA, 0x1A, 0x21,
  81                         0x23, 0x25, 0x2C, 0x32, 0x35,
  82                         0x36, 0x48, 0x51, 0x70, 0x7C,
  83                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  84                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  85                         0xA9, 0xAA, 0xB3, 0xB4};
  86
                     // Per-code-point flags, one slot per UTF-16 code unit.
  87                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  88                 bool [] isUppercase = new bool [char.MaxValue + 1];
  89
                     // Decomposition data, one slot per UTF-16 code unit.
                     // decompType [cp] holds one of the Decomposition* codes and
                     // decompIndex [cp] is an index into decompValues (see
                     // Serialize ()). decompValues is not allocated here.
                     // NOTE(review): decompLength presumably is the number of
                     // decompValues entries belonging to cp - confirm in the parser.
  90                 byte [] decompType = new byte [char.MaxValue + 1];
  91                 int [] decompIndex = new int [char.MaxValue + 1];
  92                 int [] decompLength = new int [char.MaxValue + 1];
  93                 int [] decompValues;
                     // NOTE(review): presumably the numeric value of each character;
                     // it is filled outside this view.
  94                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  95
                     // One byte per UTF-16 code unit.
                     // NOTE(review): presumably the diacritical weight assigned to
                     // each character - it is filled outside this view.
  96                 byte [] diacritical = new byte [char.MaxValue + 1];
  97
                     // diacritics and diacriticWeights are parallel arrays (90
                     // entries each): the "//" separators split both tables into
                     // matching groups, so any insertion must be mirrored in the
                     // other array at the same position.
                     // NOTE(review): the strings look like fragments of Unicode
                     // character names (several end with ';', the UnicodeData.txt
                     // field separator) - confirm how they are matched in the code
                     // that consumes them, which is outside this view.
  98                 string [] diacritics = new string [] {
  99                         // LATIN, CYRILLIC etc.
 100                         "UPTURN", "DOUBLE-STRUCK",
 101                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
 102                         "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
 103                         "WITH ACUTE;", "WITH GRAVE;",
 104                         //
 105                         "WITH DOT ABOVE;", " MIDDLE DOT;",
 106                         "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
 107                         "WITH DIALYTIKA;",
 108                         "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 109                         "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
 110                         "WITH OGONEK;", "WITH CEDILLA;",
 111                         //
 112                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 113                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
 114                         "STROKE OVERLAY",
 115                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 116                         " DIAERESIS AND GRAVE;",
 117                         " BREVE AND ACUTE;",
 118                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 119                         " MACRON AND ACUTE;",
 120                         " MACRON AND GRAVE;",
 121                         //
 122                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 123                         " RING ABOVE AND ACUTE",
 124                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 125                         " CIRCUMFLEX AND TILDE",
 126                         " TILDE AND DIAERESIS",
 127                         " STROKE AND ACUTE",
 128                         " BREVE AND TILDE",
 129                         " CEDILLA AND BREVE",
 130                         " OGONEK AND MACRON",
 131                         //
 132                         "WITH OVERLINE",
 133                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 134                         " DOUBLE GRAVE",
 135                         " INVERTED BREVE",
 136                         "ROMAN NUMERAL",
 137                         " PRECEDED BY APOSTROPHE",
 138                         "WITH HORN;",
 139                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 140                         " PALATAL HOOK",
 141                         " DOT BELOW;",
 142                         " RETROFLEX;", "DIAERESIS BELOW",
 143                         " RING BELOW",
 144                         //
 145                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 146                         " BREVE BELOW;", " HORN AND GRAVE",
 147                         " TILDE BELOW",
 148                         " TOPBAR",
 149                         " DOT BELOW AND DOT ABOVE",
 150                         " RIGHT HALF RING", " HORN AND TILDE",
 151                         " CIRCUMFLEX AND DOT BELOW",
 152                         " BREVE AND DOT BELOW",
 153                         " DOT BELOW AND MACRON",
 154                         " TONE TWO",
 155                         " HORN AND HOOK ABOVE",
 156                         " HORN AND DOT",
 157                         // CIRCLED, PARENTHESIZED and so on
 158                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
 159                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
 160                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 161                         };
 162                 byte [] diacriticWeights = new byte [] {
 163                         // LATIN.
 164                         3, 3, 5, 5, 5,
 165                         0xE, 0xF,
 166                         0xE, 0xF,
 167                         //
 168                         0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
 169                         0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
 170                         //
 171                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 172                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 173                         //
 174                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 175                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 176                         //
 177                         0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
 178                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 179                         //
 180                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
 181                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 182                         0x87, 0x95, 0xAA,
 183                         // CIRCLED, PARENTHESIZED and so on.
 184                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
 185                         0xF3, 0xF3, 0xF3
 186                         };
 187
                     // 28 ascending code point values.
                     // NOTE(review): they look like consecutive (start, end) pairs
                     // delimiting digit ranges of individual scripts - confirm how
                     // the array is indexed in the code that consumes it.
 188                 int [] numberSecondaryWeightBounds = new int [] {
 189                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 190                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 191                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 192                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 193                         0xE50, 0xE60, 0xED0, 0xEE0
 194                         };
 195
                     // Script-specific character orderings. They are only declared
                     // here; whatever fills them is outside this view.
 196                 char [] orderedGurmukhi;
 197                 char [] orderedGujarati;
 198                 char [] orderedGeorgian;
 199                 char [] orderedThaana;
 200
                     // Fixed ordering of 22 Tamil consonants.
 201                 static readonly char [] orderedTamilConsonants = new char [] {
 202                         // based on traditional Tamil consonants, except for
 203                         // Grantha (where Microsoft breaks traditionalism).
 204                         // http://www.angelfire.com/empire/thamizh/padanGaL
 205                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
 206                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
 207                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
 208                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
 209                         '\u0BB7', '\u0BB9'};
 210
                     // Lookup collections. The "a -> b" notes below are the original
                     // author's description of each mapping; the code that fills
                     // and reads them is outside this view.
 211                 // cp -> character name (only for some characters)
 212                 ArrayList sortableCharNames = new ArrayList ();
 213
 214                 // cp -> arrow value (int)
 215                 ArrayList arrowValues = new ArrayList ();
 216
 217                 // cp -> box value (int)
 218                 ArrayList boxValues = new ArrayList ();
 219
 220                 // cp -> level1 value
 221                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 222
 223                 // letterName -> cp
 224                 Hashtable arabicNameMap = new Hashtable ();
 225
 226                 // cp -> Hashtable [decompType] -> cp
 227                 Hashtable nfkdMap = new Hashtable ();
 228
 229                 // Latin letter -> ArrayList [int]
 230                 Hashtable latinMap = new Hashtable ();
 231
 232                 ArrayList jisJapanese = new ArrayList ();
 233                 ArrayList nonJisJapanese = new ArrayList ();
 234
                     // CJK tables, allocated with one slot per UTF-16 code unit.
                     // Serialize () compresses each of them with a CodePointIndexer
                     // and writes them out through SerializeCJK (). The trailing
                     // "// - 0x....]" fragments are remnants of an earlier, smaller
                     // allocation size.
 235                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 236                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 237                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 238                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 239                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 240
                     // One flag byte per UTF-16 code unit; compressed and dumped
                     // first by Serialize ().
 241                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 242
                     // One double per UTF-16 code unit; ParseSources () calls
                     // ParseDerivedAge () on DerivedAge.txt before anything else.
                     // NOTE(review): presumably the Unicode version in which each
                     // code point was introduced - confirm in ParseDerivedAge ().
 243                 static double [] unicodeAge = new double [char.MaxValue + 1];
 244
                     // Tailoring objects, iterated by SerializeTailorings ().
 245                 ArrayList tailorings = new ArrayList ();
 246
 // Runs the complete generation pipeline: ParseSources (),
 // ModifyParsedValues (), GenerateCore () and Serialize (), reporting
 // progress on standard error after each stage.
 //
 // args [0], when present, is the directory that holds the input data
 // files; without arguments the directory "downloaded" is used.
 void Run (string [] args)
 {
         string dirname;
         if (args.Length > 0)
                 dirname = args [0];
         else
                 dirname = "downloaded";

         ParseSources (dirname);
         Console.Error.WriteLine ("parse done.");

         ModifyParsedValues ();
         GenerateCore ();
         Console.Error.WriteLine ("generation done.");

         Serialize ();
         Console.Error.WriteLine ("serialization done.");

         // Disabled debugging aid: dumps, for every code point, its Unicode
         // age and ignorable state to agelog.txt.
 /*
 StreamWriter sw = new StreamWriter ("agelog.txt");
 for (int i = 0; i < char.MaxValue; i++) {
 bool shouldBe = false;
 switch (Char.GetUnicodeCategory ((char) i)) {
 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
         shouldBe = true; break;
 }
 if (unicodeAge [i] >= 3.1)
         shouldBe = true;
 //if (IsIgnorable (i) != shouldBe)
 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 }
 sw.Close ();
 */
 }
 274
 // Typed convenience wrapper: passes a byte table to
 // CodePointIndexer.CompressArray () with the given indexer and casts
 // the result back to byte [].
 byte [] CompressArray (byte [] source, CodePointIndexer i)
 {
         Type elementType = typeof (byte);
         return (byte []) CodePointIndexer.CompressArray (source, elementType, i);
 }
 280
 // Typed convenience wrapper: passes a ushort table to
 // CodePointIndexer.CompressArray () with the given indexer and casts
 // the result back to ushort [].
 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 {
         Type elementType = typeof (ushort);
         return (ushort []) CodePointIndexer.CompressArray (source, elementType, i);
 }
 286
 // Writes every generated table, as C# array source text on Result and,
 // when the Binary symbol is defined, as binary files as well.
 //
 // Steps, in order:
 //   1. SerializeTailorings ()
 //   2. flatten map [] into per-code-point category / level1 / level2 /
 //      level3 arrays and build the widthCompat table
 //   3. compress every table with its CodePointIndexer
 //   4. dump ignorableFlags, categories, level1, level2, level3 and
 //      widthCompat; their binary forms are concatenated in that order
 //      into ../collation.core.bin
 //   5. dump the CJK tables through SerializeCJK ()
 void Serialize ()
 {
         // Tailorings
         SerializeTailorings ();

         byte [] categories = new byte [map.Length];
         byte [] level1 = new byte [map.Length];
         byte [] level2 = new byte [map.Length];
         byte [] level3 = new byte [map.Length];
         ushort [] widthCompat = new ushort [map.Length];
         for (int i = 0; i < map.Length; i++) {
                 categories [i] = map [i].Category;
                 level1 [i] = map [i].Level1;
                 level2 [i] = map [i].Level2;
                 level3 [i] = ComputeLevel3Weight ((char) i);
                 // For Japanese Half-width characters, don't
                 // map widthCompat. It is IgnoreKanaType that
                 // handles those width differences.
                 if (0xFF6D <= i && i <= 0xFF9D)
                         continue;
                 switch (decompType [i]) {
                 case DecompositionNarrow:
                 case DecompositionWide:
                 case DecompositionSuper:
                 case DecompositionSub:
                         // they are always 1 char
                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
                         break;
                 }
         }

         // compress
         ignorableFlags = CompressArray (ignorableFlags,
                 MSCompatUnicodeTableUtil.Ignorable);
         categories = CompressArray (categories,
                 MSCompatUnicodeTableUtil.Category);
         level1 = CompressArray (level1,
                 MSCompatUnicodeTableUtil.Level1);
         level2 = CompressArray (level2,
                 MSCompatUnicodeTableUtil.Level2);
         level3 = CompressArray (level3,
                 MSCompatUnicodeTableUtil.Level3);
         widthCompat = CompressArray (widthCompat,
                 MSCompatUnicodeTableUtil.WidthCompat);
         cjkCHS = CompressArray (cjkCHS,
                 MSCompatUnicodeTableUtil.CjkCHS);
         cjkCHT = CompressArray (cjkCHT,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkJA = CompressArray (cjkJA,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkKO = CompressArray (cjkKO,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkKOlv2 = CompressArray (cjkKOlv2,
                 MSCompatUnicodeTableUtil.Cjk);

         // A null writer makes the helpers emit the source text only.
 #if Binary
         MemoryStream ms = new MemoryStream ();
         BinaryWriter binary = new BinaryWriter (ms);
 #else
         BinaryWriter binary = null;
 #endif

         // The call order below is the table order inside the binary file.
         // Ignorables
         SerializeCoreTable ("ignorableFlags", ignorableFlags, binary);
         // Primary category
         SerializeCoreTable ("categories", categories, binary);
         // Primary weight value
         SerializeCoreTable ("level1", level1, binary);
         // Secondary weight
         SerializeCoreTable ("level2", level2, binary);
         // Tertiary weight
         SerializeCoreTable ("level3", level3, binary);
         // Width insensitivity mappings
         // (for now it is more lightweight than dumping the
         // entire NFKD table).
         SerializeCoreTable ("widthCompat", widthCompat, binary);

 #if Binary
         binary.Flush ();
         using (FileStream fs = File.Create ("../collation.core.bin")) {
                 byte [] array = ms.ToArray ();
                 fs.Write (array, 0, array.Length);
         }
 #endif

         // CJK
         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 }

 // Dumps one byte table: a C# array declaration called "name" on Result
 // (16 values per row, each row closed by a comment holding the index of
 // its first element) and, when binary is not null, the element count as
 // an Int32 followed by the raw bytes.
 void SerializeCoreTable (string name, byte [] table, BinaryWriter binary)
 {
         Result.WriteLine ("internal static readonly byte [] {0} = new byte [] {{", name);
         if (binary != null)
                 binary.Write (table.Length);
         for (int i = 0; i < table.Length; i++) {
                 byte value = table [i];
                 if (value < 10)
                         Result.Write ("{0},", value);
                 else
                         Result.Write ("0x{0:X02},", value);
                 if (binary != null)
                         binary.Write (value);
                 if ((i & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", i - 0xF);
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 }

 // Same as the byte overload, for a ushort table: the binary form is the
 // element count as an Int32 followed by each value as a UInt16.
 void SerializeCoreTable (string name, ushort [] table, BinaryWriter binary)
 {
         Result.WriteLine ("internal static readonly ushort [] {0} = new ushort [] {{", name);
         if (binary != null)
                 binary.Write (table.Length);
         for (int i = 0; i < table.Length; i++) {
                 ushort value = table [i];
                 if (value < 10)
                         Result.Write ("{0},", value);
                 else
                         Result.Write ("0x{0:X02},", value);
                 if (binary != null)
                         binary.Write (value);
                 if ((i & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", i - 0xF);
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 }
 480
 // Dumps one ushort CJK table.
 //
 // name - identifier used for the generated C# array and for the binary
 //        file name (../collation.<name>.bin)
 // cjk  - table to write
 // max  - index at which the dump stops early, if the table is that long
 //
 // The binary file starts with cjk.Length as an Int32 (the full length,
 // even when the dump stops at max) followed by each written value as a
 // UInt16. The binary part is compiled only when Binary is defined.
 void SerializeCJK (string name, ushort [] cjk, int max)
 {
         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 #if Binary
         MemoryStream buffer = new MemoryStream ();
         BinaryWriter writer = new BinaryWriter (buffer);
         writer.Write (cjk.Length);
 #endif
         // Stop at the end of the table or at index max, whichever is first.
         for (int pos = 0; pos < cjk.Length && pos != max; pos++) {
                 ushort weight = cjk [pos];
                 Result.Write (weight < 10 ? "{0}," : "0x{0:X04},", weight);
 #if Binary
                 writer.Write (weight);
 #endif
                 // Close each row of 16 values with the index of its first one.
                 if (pos % 16 == 15)
                         Result.WriteLine ("// {0:X04}", pos - 15);
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 #if Binary
         string path = String.Format ("../collation.{0}.bin", name);
         using (FileStream fs = File.Create (path))
                 buffer.WriteTo (fs);
 #endif
 }
 513
 // Dumps one byte CJK table.
 //
 // name - identifier used for the generated C# array and for the binary
 //        file name (../collation.<name>.bin)
 // cjk  - table to write
 // max  - index at which the dump stops early, if the table is that long
 //
 // Unlike the ushort overload, the binary file carries no length prefix:
 // it consists of the written bytes only. The binary part is compiled
 // only when Binary is defined.
 void SerializeCJK (string name, byte [] cjk, int max)
 {
         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 #if Binary
         MemoryStream buffer = new MemoryStream ();
         BinaryWriter writer = new BinaryWriter (buffer);
 #endif
         // Stop at the end of the table or at index max, whichever is first.
         for (int pos = 0; pos < cjk.Length && pos != max; pos++) {
                 byte weight = cjk [pos];
                 Result.Write (weight < 10 ? "{0}," : "0x{0:X02},", weight);
 #if Binary
                 writer.Write (weight);
 #endif
                 // Close each row of 16 values with the index of its first one.
                 if (pos % 16 == 15)
                         Result.WriteLine ("// {0:X04}", pos - 15);
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 #if Binary
         string path = String.Format ("../collation.{0}.bin", name);
         using (FileStream fs = File.Create (path))
                 buffer.WriteTo (fs);
 #endif
 }
 545
 // Writes the tailoring data, as C# source text on Result and (when the
 // Binary symbol is defined) as ../collation.tailoring.bin.
 //
 // Pass 1 concatenates the character data of every non-alias tailoring
 // into one char table and remembers, per LCID, its start offset and
 // length. Pass 2 writes one TailoringInfo record per tailoring; an
 // alias reuses the offset and length of the tailoring it points to.
 //
 // Binary layout: tailoring count (Int32), then per tailoring LCID,
 // offset, length and the FrenchSort flag, then two 0xFF marker bytes,
 // the number of chars (Int32) and the char data as UInt16 values.
 //
 // Throws Exception when an alias refers to an LCID that has no
 // definition of its own.
 void SerializeTailorings ()
 {
         Hashtable indexes = new Hashtable ();
         Hashtable counts = new Hashtable ();
         Result.WriteLine ("static char [] tailorings = new char [] {");
         int count = 0;
 #if Binary
         MemoryStream ms = new MemoryStream ();
         BinaryWriter binary = new BinaryWriter (ms);
 #endif
         // Pass 1: character data of the real (non-alias) tailorings.
         foreach (Tailoring t in tailorings) {
                 if (t.Alias != 0)
                         continue;
                 Result.Write ("/*{0}*/", t.LCID);
                 indexes.Add (t.LCID, count);
                 char [] values = t.ItemToCharArray ();
                 counts.Add (t.LCID, values.Length);
                 foreach (char c in values) {
                         Result.Write ("'\\x{0:X}', ", (int) c);
                         if (++count % 16 == 0)
                                 Result.WriteLine (" // {0:X04}", count - 16);
 #if Binary
                         binary.Write ((ushort) c);
 #endif
                 }
         }
         Result.WriteLine ("};");

         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 #if Binary
         // Keep the char data aside and start a fresh stream, because the
         // info records come first in the binary file.
         byte [] rawdata = ms.ToArray ();
         ms = new MemoryStream ();
         binary = new BinaryWriter (ms);
         binary.Write (tailorings.Count);
 #endif
         // Pass 2: one info record per tailoring, aliases included.
         foreach (Tailoring t in tailorings) {
                 int target = t.Alias != 0 ? t.Alias : t.LCID;
                 // A dangling alias is fatal: the record count written above
                 // covers every tailoring, so none may be left out.
                 if (!indexes.ContainsKey (target))
                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
                 int idx = (int) indexes [target];
                 int cnt = (int) counts [target];
                 bool french = t.FrenchSort;
                 // NOTE(review): this searches for the alias' own LCID, so it
                 // only changes the flag when another entry shares that LCID
                 // (the last match wins). Confirm whether t.Alias was meant.
                 if (t.Alias != 0)
                         foreach (Tailoring t2 in tailorings)
                                 if (t2.LCID == t.LCID)
                                         french = t2.FrenchSort;
                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 #if Binary
                 binary.Write (t.LCID);
                 binary.Write (idx);
                 binary.Write (cnt);
                 binary.Write (french);
 #endif
         }
         Result.WriteLine ("};");
 #if Binary
         // Marker bytes, then the char data collected in pass 1
         // (rawdata holds two bytes per char).
         binary.Write ((byte) 0xFF);
         binary.Write ((byte) 0xFF);
         binary.Write (rawdata.Length / 2);
         binary.Write (rawdata, 0, rawdata.Length);

         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
                 byte [] array = ms.ToArray ();
                 fs.Write (array, 0, array.Length);
         }
 #endif
 }
 616
 617                 #region Parse
 618
 619                 // Loads all input tables from the files under dirname and then the
 620                 // tailoring definitions. The individual parsers are invoked in a fixed
 621                 // order.
 622                 void ParseSources (string dirname)
 623                 {
 624                         // The collation XML files live in a subdirectory of dirname.
 625                         string collationDir = dirname + "/common/collation/";
 626
 627                         ParseDerivedAge (dirname + "/DerivedAge.txt");
 628
 629                         FillIgnorables ();
 630
 631                         // The JIS order must be parsed before ParseUnidata () is called.
 632                         ParseJISOrder (dirname + "/CP932.TXT");
 633                         ParseUnidata (dirname + "/UnicodeData.txt");
 634                         ModifyUnidata ();
 635                         ParseDerivedCoreProperties (
 636                                 dirname + "/DerivedCoreProperties.txt");
 637                         ParseScripts (dirname + "/Scripts.txt");
 638
 639                         ParseCJK (
 640                                 collationDir + "zh.xml",
 641                                 collationDir + "ja.xml",
 642                                 collationDir + "ko.xml");
 643
 644                         // This name is not prefixed with dirname, so it is resolved
 645                         // against the current working directory.
 646                         ParseTailorings ("mono-tailoring-source.txt");
 647                 }
 648
 649                 // Feeds each trimmed line of filename to ProcessTailoringLine (); failures are logged with the line number.
 650                 void ParseTailorings (string filename)
 651                 {
 652                         int lineNo = 0;
 653                         Tailoring current = null;
 654                         using (StreamReader reader = new StreamReader (filename)) {
 655                                 try {
 656                                         while (reader.Peek () >= 0) {
 657                                                 lineNo += 1;
 658                                                 ProcessTailoringLine (ref current, reader.ReadLine ().Trim ());
 659                                         }
 660                                 } catch (Exception) {
 661                                         Console.Error.WriteLine ("ERROR at line {0}", lineNo);
 662                                         throw;
 663                                 }
 664                         }
 665                 }
 666
 667                 // Expands "\uXXXX" escapes (4 hex digits) into the characters they denote;
 668                 // all other characters are copied as is. For now this is enough.
 669                 string ParseTailoringSourceValue (string s)
 670                 {
 671                         StringBuilder sb = new StringBuilder ();
 672                         for (int i = 0; i < s.Length; i++) {
 673                                 // Look for the escape at position i, not only at the start of s.
 674                                 if (s [i] == '\\' && i + 1 < s.Length && s [i + 1] == 'u') {
 675                                         sb.Append ((char) int.Parse (
 676                                                 s.Substring (i + 2, 4), NumberStyles.HexNumber));
 677                                         i += 5; // with the loop's i++ this steps over all 6 characters
 678                                 } else
 679                                         sb.Append (s [i]);
 680                         }
 681                         return sb.ToString ();
 682                 }
 683
 684                 // Interprets one tailoring source line: "@number[=number]" starts a new
 685                 // Tailoring; "*FrenchSort", "*Diacritical", "x : w w w w" and "x = y" add to it.
 686                 void ProcessTailoringLine (ref Tailoring t, string s)
 687                 {
 688                         int pos = s.IndexOf ('#');
 689                         if (pos > 0)
 690                                 s = s.Substring (0, pos).Trim ();
 691                         if (s.Length == 0 || s [0] == '#')
 692                                 return;
 693
 694                         if (s [0] == '@') {
 695                                 // One number, or two separated by '='.
 696                                 pos = s.IndexOf ('=');
 697                                 if (pos <= 0)
 698                                         t = new Tailoring (int.Parse (s.Substring (1)));
 699                                 else
 700                                         t = new Tailoring (
 701                                                 int.Parse (s.Substring (1, pos - 1)),
 702                                                 int.Parse (s.Substring (pos + 1)));
 703                                 tailorings.Add (t);
 704                                 return;
 705                         }
 706                         if (s.StartsWith ("*FrenchSort")) {
 707                                 t.FrenchSort = true;
 708                                 return;
 709                         }
 710
 711                         string diaPrefix = "*Diacritical";
 712                         if (s.StartsWith (diaPrefix)) {
 713                                 pos = s.IndexOf ("->");
 714                                 string lhs = s.Substring (diaPrefix.Length, pos - diaPrefix.Length);
 715                                 string rhs = s.Substring (pos + 2);
 716                                 t.AddDiacriticalMap (
 717                                         byte.Parse (lhs.Trim (), NumberStyles.HexNumber),
 718                                         byte.Parse (rhs.Trim (), NumberStyles.HexNumber));
 719                                 return;
 720                         }
 721
 722                         pos = s.IndexOf (':');
 723                         if (pos > 0) {
 724                                 // Four hex weights follow the ':'; "*" stands for 0.
 725                                 string [] fields = s.Substring (pos + 1).Trim ().Split (' ');
 726                                 byte [] weights = new byte [4];
 727                                 for (int k = 0; k < weights.Length; k++)
 728                                         weights [k] = fields [k] == "*" ? (byte) 0 :
 729                                                 byte.Parse (fields [k], NumberStyles.HexNumber);
 730                                 t.AddSortKeyMap (ParseTailoringSourceValue (s.Substring (0, pos).Trim ()), weights);
 731                         }
 732                         // No return above: the line is also checked for an "x = y" replacement.
 733                         pos = s.IndexOf ('=');
 734                         if (pos > 0)
 735                                 t.AddReplacementMap (
 736                                         ParseTailoringSourceValue (s.Substring (0, pos).Trim ()),
 737                                         ParseTailoringSourceValue (s.Substring (pos + 1).Trim ()));
 738                 }
 739
 740                 // Fills unicodeAge from a file whose data lines have the form
 741                 // "XXXX[..YYYY] ; number" (hex code points); '#' starts a comment.
 742                 void ParseDerivedAge (string filename)
 743                 {
 744                         using (StreamReader file = new StreamReader (filename)) {
 745                                 while (file.Peek () >= 0) {
 746                                         string s = file.ReadLine ();
 747                                         int idx = s.IndexOf ('#');
 748                                         if (idx >= 0)
 749                                                 s = s.Substring (0, idx);
 750                                         idx = s.IndexOf (';');
 751                                         if (idx < 0)
 752                                                 continue;
 753
 754                                         string cpspec = s.Substring (0, idx);
 755                                         idx = cpspec.IndexOf ("..");
 756                                         NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite;
 757                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 758                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 759                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 760                                         // FIXME: use index. Entries starting above U+FFFF are skipped.
 761                                         if (cp > char.MaxValue)
 762                                                 continue;
 763
 764                                         // Parse with the invariant culture so '.' is always the decimal point.
 765                                         double v = double.Parse (value, CultureInfo.InvariantCulture);
 766                                         for (int i = cp; i <= cpEnd; i++)
 767                                                 unicodeAge [i] = v;
 768                                 }
 769                         }
 770                         unicodeAge [0] = double.MaxValue; // never be supported
 771                 }
 772
 773                 // Runs ProcessUnidataLine () over every line of filename, then stores the
 774                 // int values accumulated in the list passed to it in this.decompValues.
 775                 void ParseUnidata (string filename)
 776                 {
 777                         ArrayList collected = new ArrayList ();
 778                         using (StreamReader reader = new StreamReader (filename)) {
 779                                 for (int lineNo = 1; reader.Peek () >= 0; lineNo++) {
 780                                         try {
 781                                                 ProcessUnidataLine (reader.ReadLine (), collected);
 782                                         } catch (Exception) {
 783                                                 Console.Error.WriteLine ("**** At line " + lineNo);
 784                                                 throw;
 785                                         }
 786                                 }
 787                         }
 788                         this.decompValues = (int []) collected.ToArray (typeof (int));
 789                 }
 790
 791                 char previousLatinTarget = char.MinValue; // letter remembered by ProcessUnidataLine (); used as the target when a LATIN name yields none (reset after 'Z')
 792                 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; // one counter per letter A-Z, incremented in ProcessUnidataLine () to derive diacritical weights
 793
 794                 void ProcessUnidataLine (string s, ArrayList decompValues)
 795                 {
 796                         int idx = s.IndexOf ('#');
 797                         if (idx >= 0)
 798                                 s = s.Substring (0, idx);
 799                         idx = s.IndexOf (';');
 800                         if (idx < 0)
 801                                 return;
 802                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 803                         string [] values = s.Substring (idx + 1).Split (';');
 804
 805                         // FIXME: use index
 806                         if (cp > char.MaxValue)
 807                                 return;
 808                         if (IsIgnorable (cp))
 809                                 return;
 810
 811                         string name = values [0];
 812
 813                         // SPECIAL CASE: rename some characters for diacritical
 814                         // remapping. FIXME: why are they different?
 815                         // FIXME: it's still not working.
 816                         if (cp == 0x018B || cp == 0x018C)
 817                                 name = name.Replace ("TOPBAR", "STROKE");
 818
 819                         // isSmallCapital
 820                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 821                                 isSmallCapital [cp] = true;
 822
 823                         // latin mapping by character name
 824                         if (s.IndexOf ("LATIN") >= 0) {
 825                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 826                                 int offset = lidx + 15;
 827                                 if (lidx < 0) {
 828                                         lidx = s.IndexOf ("LETTER TURNED ");
 829                                         offset = lidx + 14;
 830                                 }
 831                                 if (lidx < 0) {
 832                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 833                                         offset = lidx + 15;
 834                                 }
 835                                 if (lidx < 0) {
 836                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 837                                         offset = lidx + 14;
 838                                 }
 839                                 if (lidx < 0) {
 840                                         lidx = s.IndexOf ("LETTER ");
 841                                         offset = lidx + 7;
 842                                 }
 843                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 844                                 char n = s [offset + 1];
 845                                 char target = char.MinValue;
 846                                 if ('A' <= c && c <= 'Z' &&
 847                                         (n == ' ') || n == ';') {
 848                                         target = c;
 849                                         // FIXME: After 'Z', I cannot reset this state.
 850                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 851                                 }
 852
 853                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 854                                         target = 'A';
 855                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 856                                         target = 'B';
 857                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 858                                         target = 'C';
 859                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 860                                         target = 'E';
 861                                 else if (s.Substring (offset).StartsWith ("ENG"))
 862                                         target = 'N';
 863                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 864                                         target = 'O';
 865                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 866                                         target = 'R';
 867                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 868                                         target = 'S';
 869                                 else if (s.Substring (offset).StartsWith ("ESH"))
 870                                         target = 'S';
 871
 872                                 // For remaining IPA chars, direct mapping is
 873                                 // much faster.
 874                                 switch (cp) {
 875                                 case 0x0299: target = 'B'; break;
 876                                 case 0x029A: target = 'E'; break;
 877                                 case 0x029B: target = 'G'; break;
 878                                 case 0x029C: target = 'H'; break;
 879                                 case 0x029D: target = 'J'; break;
 880                                 case 0x029E: target = 'K'; break;
 881                                 case 0x029F: target = 'L'; break;
 882                                 case 0x02A0: target = 'Q'; break;
 883                                 case 0x02A7: target = 'T'; break;
 884                                 case 0x02A8: target = 'T'; break;
 885                                 }
 886
 887                                 if (target == char.MinValue)
 888                                         target = previousLatinTarget;
 889
 890                                 if (target != char.MinValue) {
 891                                         ArrayList entry = (ArrayList) latinMap [target];
 892                                         if (entry == null) {
 893                                                 entry = new ArrayList ();
 894                                                 latinMap [target] = entry;
 895                                         }
 896                                         entry.Add (cp);
 897                                         // FIXME: This secondary weight is hack.
 898                                         // They are here because they must not
 899                                         // be identical to the corresponding
 900                                         // ASCII latins.
 901                                         if (c != target && diacritical [cp] == 0) {
 902                                                 diacriticalOffset [c - 'A']++;
 903                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
 904                                         }
 905                                 }
 906                         }
 907
 908                         // Arrow names
 909                         if (0x2000 <= cp && cp < 0x3000) {
 910                                 int value = 0;
 911                                 // SPECIAL CASES. FIXME: why?
 912                                 switch (cp) {
 913                                 case 0x21C5: value = -1; break; // E2
 914                                 case 0x261D: value = 1; break;
 915                                 case 0x27A6: value = 3; break;
 916                                 case 0x21B0: value = 7; break;
 917                                 case 0x21B1: value = 3; break;
 918                                 case 0x21B2: value = 7; break;
 919                                 case 0x21B4: value = 5; break;
 920                                 case 0x21B5: value = 7; break;
 921                                 case 0x21B9: value = -1; break; // E1
 922                                 case 0x21CF: value = 7; break;
 923                                 case 0x21D0: value = 3; break;
 924                                 }
 925                                 string [] arrowTargets = new string [] {
 926                                         "",
 927                                         "UPWARDS",
 928                                         "NORTH EAST",
 929                                         "RIGHTWARDS",
 930                                         "SOUTH EAST",
 931                                         "DOWNWARDS",
 932                                         "SOUTH WEST",
 933                                         "LEFTWARDS",
 934                                         "NORTH WEST",
 935                                         "LEFT RIGHT",
 936                                         "UP DOWN",
 937                                         };
 938                                 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
 939                                         s.IndexOf ("LEFTWARDS") >= 0)
 940                                         value = 0xE1 - 0xD8;
 941                                 else if (s.IndexOf ("UPWARDS") >= 0 &&
 942                                         s.IndexOf ("DOWNWARDS") >= 0)
 943                                         value = 0xE2 - 0xD8;
 944                                 else if (s.IndexOf ("ARROW") >= 0 &&
 945                                         s.IndexOf ("COMBINING") < 0 &&
 946                                         s.IndexOf ("CLOCKWISE") >= 0)
 947                                         value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
 948                                 if (value == 0)
 949                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 950                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 951                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 952                                                         s.IndexOf (" OVER") < 0
 953                                                 )
 954                                                         value = i;
 955                                 if (value > 0)
 956                                         arrowValues.Add (new DictionaryEntry (
 957                                                 cp, value));
 958                         }
 959
 960                         // Box names
 961                         if (0x2500 <= cp && cp < 0x2600) {
 962                                 int value = int.MinValue;
 963                                 // flags:
 964                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 965                                 // [h,rl] [r] [l]
 966                                 // [v,ud] [u] [d]
 967                                 // [dr] [dl] [ur] [ul]
 968                                 // [vr,udr] [vl,vdl]
 969                                 // [hd,rld] [hu,rlu]
 970                                 // [hv,udrl,rlv,udh]
 971                                 ArrayList flags = new ArrayList (new int [] {
 972                                         32, 8 + 4, 8, 4,
 973                                         16, 1 + 2, 1, 2,
 974                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 975                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 976                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 977                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 978                                         });
 979                                 byte [] offsets = new byte [] {
 980                                         0, 0, 1, 2,
 981                                         3, 3, 4, 5,
 982                                         6, 7, 8, 9,
 983                                         10, 10, 11, 11,
 984                                         12, 12, 13, 13,
 985                                         14, 14, 14, 14};
 986                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
 987                                         int flag = 0;
 988                                         if (s.IndexOf (" UP") >= 0)
 989                                                 flag |= 1;
 990                                         if (s.IndexOf (" DOWN") >= 0)
 991                                                 flag |= 2;
 992                                         if (s.IndexOf (" RIGHT") >= 0)
 993                                                 flag |= 4;
 994                                         if (s.IndexOf (" LEFT") >= 0)
 995                                                 flag |= 8;
 996                                         if (s.IndexOf (" VERTICAL") >= 0)
 997                                                 flag |= 16;
 998                                         if (s.IndexOf (" HORIZONTAL") >= 0)
 999                                                 flag |= 32;
1000
1001                                         int fidx = flags.IndexOf (flag);
1002                                         if (fidx >= 0)
1003                                                 value = offsets [fidx];
1004                                 } else if (s.IndexOf ("BLOCK") >= 0) {
1005                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
1006                                                 value = 0x12;
1007                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
1008                                                 value = 0x13;
1009                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1010                                                 value = 0x14;
1011                                         else if (s.IndexOf ("HALF") >= 0)
1012                                                 value = 0x15;
1013                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1014                                                 value = 0x16;
1015                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
1016                                                 value = 0x17;
1017                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1018                                                 value = 0x18;
1019                                         else
1020                                                 value = 0x19;
1021                                 }
1022                                 else if (s.IndexOf ("SHADE") >= 0)
1023                                         value = 0x19;
1024                                 else if (s.IndexOf ("SQUARE") >= 0)
1025                                         value = 0xBC - 0xE5;
1026                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1027                                         value = 0xBE - 0xE5;
1028                                 else if (s.IndexOf ("RECTANGLE") >= 0)
1029                                         value = 0xBD - 0xE5;
1030                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1031                                         value = 0xBF - 0xE5;
1032                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
1033                                         if (s.IndexOf ("UP-POINTING") >= 0)
1034                                                 value = 0xC0 - 0xE5;
1035                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1036                                                 value = 0xC1 - 0xE5;
1037                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1038                                                 value = 0xC2 - 0xE5;
1039                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1040                                                 value = 0xC3 - 0xE5;
1041                                 }
1042                                 else if (s.IndexOf ("POINTER") >= 0) {
1043                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1044                                                 value = 0xC4 - 0xE5;
1045                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1046                                                 value = 0xC5 - 0xE5;
1047                                 }
1048                                 else if (s.IndexOf ("DIAMOND") >= 0)
1049                                         value = 0xC6 - 0xE5;
1050                                 else if (s.IndexOf ("FISHEYE") >= 0)
1051                                         value = 0xC7 - 0xE5;
1052                                 else if (s.IndexOf ("LOZENGE") >= 0)
1053                                         value = 0xC8 - 0xE5;
1054                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1055                                         value = 0xC9 - 0xE5;
1056                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1057                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1058                                                 value = 0xCA - 0xE5;
1059                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1060                                                 value = 0xCB - 0xE5;
1061                                         else
1062                                                 value = 0xC9 - 0xE5;
1063                                 }
1064                                 else if (s.IndexOf ("BULLET") >= 0)
1065                                         value = 0xCC - 0xE5;
1066                                 if (0x25DA <= cp && cp <= 0x25E5)
1067                                         value = 0xCD + cp - 0x25DA - 0xE5;
1068
1069                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1070                                 switch (cp) {
1071                                 case 0x2571: value = 0xF; break;
1072                                 case 0x2572: value = 0x10; break;
1073                                 case 0x2573: value = 0x11; break;
1074                                 }
1075                                 if (value != int.MinValue)
1076                                         boxValues.Add (new DictionaryEntry (
1077                                                 cp, value));
1078                         }
1079
1080                         // For some characters store the name and sort later
1081                         // to determine sorting.
1082                         if (0x2100 <= cp && cp <= 0x213F &&
1083                                 Char.IsSymbol ((char) cp))
1084                                 sortableCharNames.Add (
1085                                         new DictionaryEntry (cp, name));
1086                         else if (0x3380 <= cp && cp <= 0x33DD)
1087                                 sortableCharNames.Add (new DictionaryEntry (
1088                                         cp, name.Substring (7)));
1089
1090                         if (Char.GetUnicodeCategory ((char) cp) ==
1091                                 UnicodeCategory.MathSymbol) {
1092                                 if (name.StartsWith ("CIRCLED "))
1093                                         diacritical [cp] = 0xEE;
1094                                 if (name.StartsWith ("SQUARED "))
1095                                         diacritical [cp] = 0xEF;
1096                         }
1097
1098                         // diacritical weights by character name
1099 if (diacritics.Length != diacriticWeights.Length)
1100 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1101                         for (int d = 0; d < diacritics.Length; d++) {
1102                                 if (s.IndexOf (diacritics [d]) > 0) {
1103                                         diacritical [cp] += diacriticWeights [d];
1104                                         if (s.IndexOf ("COMBINING") >= 0)
1105                                                 diacritical [cp] -= (byte) 2;
1106                                         continue;
1107                                 }
1108                                 // also process "COMBINING blah" here
1109                                 // For now it is limited to cp < 0x0370
1110 //                              if (cp < 0x0300 || cp >= 0x0370)
1111 //                                      continue;
1112                                 string tmp = diacritics [d].TrimEnd (';');
1113                                 if (tmp.IndexOf ("WITH ") == 0)
1114                                         tmp = tmp.Substring (4);
1115                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1116                                 if (name == tmp) {
1117                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1118                                         break;
1119                                 }
1120 //if (name == tmp)
1121 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1122                         }
1123                         // Two-step grep required for it.
1124                         if (s.IndexOf ("FULL STOP") > 0 &&
1125                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1126                                 diacritical [cp] |= 0xF4;
1127                         if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1128                                 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1129                                         s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1130
1131                         // Arabic letter name
1132                         if (0x0621 <= cp && cp <= 0x064A &&
1133                                 Char.GetUnicodeCategory ((char) cp)
1134                                 == UnicodeCategory.OtherLetter) {
1135                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1136                                 switch (cp) {
1137                                 case 0x0621:
1138                                 case 0x0624:
1139                                 case 0x0626:
1140                                         // hamza, waw, yeh ... special cases.
1141                                         value = 0x07;
1142                                         break;
1143                                 case 0x0649:
1144                                 case 0x064A:
1145                                         value = 0x77; // special cases.
1146                                         break;
1147                                 default:
1148                                         // Get primary letter name i.e.
1149                                         // XXX part of ARABIC LETTER XXX yyy
1150                                         // e.g. that of "TEH MARBUTA" is "TEH".
1151                                         string letterName =
1152                                                 (cp == 0x0640) ?
1153                                                 // 0x0640 is special: it does
1154                                                 // not start with ARABIC LETTER
1155                                                 name :
1156                                                 name.Substring (14);
1157                                         int tmpIdx = letterName.IndexOf (' ');
1158                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1159 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1160                                         if (arabicNameMap.ContainsKey (letterName))
1161                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1162                                         else
1163                                                 arabicNameMap [letterName] = cp;
1164                                         break;
1165                                 }
1166                                 arabicLetterPrimaryValues [cp] = value;
1167                         }
1168
1169                         // Japanese square letter
1170                         if (0x3300 <= cp && cp <= 0x3357)
1171                                 if (!ExistsJIS (cp))
1172                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1173
1174                         // normalizationType
1175                         string decomp = values [4];
1176                         idx = decomp.IndexOf ('<');
1177                         if (idx >= 0) {
1178                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1179                                 case "full":
1180                                         decompType [cp] = DecompositionFull;
1181                                         break;
1182                                 case "sub":
1183                                         decompType [cp] = DecompositionSub;
1184                                         break;
1185                                 case "super":
1186                                         decompType [cp] = DecompositionSuper;
1187                                         break;
1188                                 case "small":
1189                                         decompType [cp] = DecompositionSmall;
1190                                         break;
1191                                 case "isolated":
1192                                         decompType [cp] = DecompositionIsolated;
1193                                         break;
1194                                 case "initial":
1195                                         decompType [cp] = DecompositionInitial;
1196                                         break;
1197                                 case "final":
1198                                         decompType [cp] = DecompositionFinal;
1199                                         break;
1200                                 case "medial":
1201                                         decompType [cp] = DecompositionMedial;
1202                                         break;
1203                                 case "noBreak":
1204                                         decompType [cp] = DecompositionNoBreak;
1205                                         break;
1206                                 case "compat":
1207                                         decompType [cp] = DecompositionCompat;
1208                                         break;
1209                                 case "fraction":
1210                                         decompType [cp] = DecompositionFraction;
1211                                         break;
1212                                 case "font":
1213                                         decompType [cp] = DecompositionFont;
1214                                         break;
1215                                 case "circle":
1216                                         decompType [cp] = DecompositionCircle;
1217                                         break;
1218                                 case "square":
1219                                         decompType [cp] = DecompositionSquare;
1220                                         break;
1221                                 case "wide":
1222                                         decompType [cp] = DecompositionWide;
1223                                         break;
1224                                 case "narrow":
1225                                         decompType [cp] = DecompositionNarrow;
1226                                         break;
1227                                 case "vertical":
1228                                         decompType [cp] = DecompositionVertical;
1229                                         break;
1230                                 default:
1231                                         throw new Exception ("Support NFKD type : " + decomp);
1232                                 }
1233                         }
1234                         else
1235                                 decompType [cp] = DecompositionCanonical;
1236                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1237                         if (decomp.Length > 0) {
1238
1239                                 string [] velems = decomp.Split (' ');
1240                                 int didx = decompValues.Count;
1241                                 decompIndex [cp] = didx;
1242                                 foreach (string v in velems)
1243                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1244                                 decompLength [cp] = velems.Length;
1245
1246                                 // [decmpType] -> this_cp
1247                                 int targetCP = (int) decompValues [didx];
1248                                 // for "(x)" it specially maps to 'x' .
1249                                 // FIXME: check if it is sane
1250                                 if (velems.Length == 3 &&
1251                                         (int) decompValues [didx] == '(' &&
1252                                         (int) decompValues [didx + 2] == ')')
1253                                         targetCP = (int) decompValues [didx + 1];
1254                                 // special: 0x215F "1/"
1255                                 else if (cp == 0x215F)
1256                                         targetCP = '1';
1257                                 else if (velems.Length > 1 &&
1258                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1259                                         // skip them, except for CJK ideograph compat
1260                                         targetCP = 0;
1261
1262                                 if (targetCP != 0) {
1263                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1264                                         if (entry == null) {
1265                                                 entry = new Hashtable ();
1266                                                 nfkdMap [targetCP] = entry;
1267                                         }
1268                                         entry [(byte) decompType [cp]] = cp;
1269                                 }
1270                         }
1271                         // numeric values
1272                         if (values [5].Length > 0)
1273                                 decimalValue [cp] = decimal.Parse (values [5]);
1274                         else if (values [6].Length > 0)
1275                                 decimalValue [cp] = decimal.Parse (values [6]);
1276                         else if (values [7].Length > 0) {
1277                                 string decstr = values [7];
1278                                 idx = decstr.IndexOf ('/');
1279                                 if (cp == 0x215F) // special. "1/"
1280                                         decimalValue [cp] = 0x1;
1281                                 else if (idx > 0)
1282                                         // m/n
1283                                         decimalValue [cp] =
1284                                                 decimal.Parse (decstr.Substring (0, idx))
1285                                                 / decimal.Parse (decstr.Substring (idx + 1));
1286                                 else if (decstr [0] == '(' &&
1287                                         decstr [decstr.Length - 1] == ')')
1288                                         // (n)
1289                                         decimalValue [cp] =
1290                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1291                                 else if (decstr [decstr.Length - 1] == '.')
1292                                         // n.
1293                                         decimalValue [cp] =
1294                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1295                                 else
1296                                         decimalValue [cp] = decimal.Parse (decstr);
1297                         }
1298                 }
1299
1300                 void ParseDerivedCoreProperties (string filename)
1301                 {
1302                         // IsUppercase
1303                         using (StreamReader file =
1304                                 new StreamReader (filename)) {
1305                                 for (int line = 1; file.Peek () >= 0; line++) {
1306                                         try {
1307                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1308                                         } catch (Exception) {
1309                                                 Console.Error.WriteLine ("**** At line " + line);
1310                                                 throw;
1311                                         }
1312                                 }
1313                         }
1314                 }
1315
1316                 void ProcessDerivedCorePropLine (string s)
1317                 {
1318                         int idx = s.IndexOf ('#');
1319                         if (idx >= 0)
1320                                 s = s.Substring (0, idx);
1321                         idx = s.IndexOf (';');
1322                         if (idx < 0)
1323                                 return;
1324                         string cpspec = s.Substring (0, idx);
1325                         idx = cpspec.IndexOf ("..");
1326                         NumberStyles nf = NumberStyles.HexNumber |
1327                                 NumberStyles.AllowTrailingWhite;
1328                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1329                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1330                         string value = s.Substring (cpspec.Length + 1).Trim ();
1331
1332                         // FIXME: use index
1333                         if (cp > char.MaxValue)
1334                                 return;
1335
1336                         switch (value) {
1337                         case "Uppercase":
1338                                 for (int x = cp; x <= cpEnd; x++)
1339                                         isUppercase [x] = true;
1340                                 break;
1341                         }
1342                 }
1343
1344                 void ParseScripts (string filename)
1345                 {
1346                         ArrayList gurmukhi = new ArrayList ();
1347                         ArrayList gujarati = new ArrayList ();
1348                         ArrayList georgian = new ArrayList ();
1349                         ArrayList thaana = new ArrayList ();
1350
1351                         using (StreamReader file =
1352                                 new StreamReader (filename)) {
1353                                 while (file.Peek () >= 0) {
1354                                         string s = file.ReadLine ();
1355                                         int idx = s.IndexOf ('#');
1356                                         if (idx >= 0)
1357                                                 s = s.Substring (0, idx);
1358                                         idx = s.IndexOf (';');
1359                                         if (idx < 0)
1360                                                 continue;
1361
1362                                         string cpspec = s.Substring (0, idx);
1363                                         idx = cpspec.IndexOf ("..");
1364                                         NumberStyles nf = NumberStyles.HexNumber |
1365                                                 NumberStyles.AllowTrailingWhite;
1366                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1367                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1368                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1369
1370                                         // FIXME: use index
1371                                         if (cp > char.MaxValue)
1372                                                 continue;
1373
1374                                         switch (value) {
1375                                         case "Gurmukhi":
1376                                                 for (int x = cp; x <= cpEnd; x++)
1377                                                         if (!IsIgnorable (x))
1378                                                                 gurmukhi.Add ((char) x);
1379                                                 break;
1380                                         case "Gujarati":
1381                                                 for (int x = cp; x <= cpEnd; x++)
1382                                                         if (!IsIgnorable (x))
1383                                                                 gujarati.Add ((char) x);
1384                                                 break;
1385                                         case "Georgian":
1386                                                 for (int x = cp; x <= cpEnd; x++)
1387                                                         if (!IsIgnorable (x))
1388                                                                 georgian.Add ((char) x);
1389                                                 break;
1390                                         case "Thaana":
1391                                                 for (int x = cp; x <= cpEnd; x++)
1392                                                         if (!IsIgnorable (x))
1393                                                                 thaana.Add ((char) x);
1394                                                 break;
1395                                         }
1396                                 }
1397                         }
1398                         gurmukhi.Sort (UCAComparer.Instance);
1399                         gujarati.Sort (UCAComparer.Instance);
1400                         georgian.Sort (UCAComparer.Instance);
1401                         thaana.Sort (UCAComparer.Instance);
1402                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1403                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1404                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1405                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1406                 }
1407
1408                 void ParseJISOrder (string filename)
1409                 {
1410                         int line = 1;
1411                         try {
1412                                 using (StreamReader file =
1413                                         new StreamReader (filename)) {
1414                                         for (;file.Peek () >= 0; line++)
1415                                                 ProcessJISOrderLine (file.ReadLine ());
1416                                 }
1417                         } catch (Exception) {
1418                                 Console.Error.WriteLine ("---- line {0}", line);
1419                                 throw;
1420                         }
1421                 }
1422
1423                 char [] ws = new char [] {'\t', ' '};
1424
1425                 void ProcessJISOrderLine (string s)
1426                 {
1427                         int idx = s.IndexOf ('#');
1428                         if (idx >= 0)
1429                                 s = s.Substring (0, idx).Trim ();
1430                         if (s.Length == 0)
1431                                 return;
1432                         idx = s.IndexOfAny (ws);
1433                         if (idx < 0)
1434                                 return;
1435                         // They start with "0x" so cut them out.
1436                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1437                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1438                         jisJapanese.Add (new JISCharacter (cp, jis));
1439                 }
1440
1441                 void ParseCJK (string zhXML, string jaXML, string koXML)
1442                 {
1443                         XmlDocument doc = new XmlDocument ();
1444                         doc.XmlResolver = null;
1445                         int v;
1446                         string s;
1447                         string category;
1448                         int offset;
1449                         ushort [] arr;
1450
1451                         // Chinese Simplified
1452                         category = "chs";
1453                         arr = cjkCHS;
1454                         offset = 0;//char.MaxValue - arr.Length;
1455                         doc.Load (zhXML);
1456                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1457                         v = 0x8008;
1458                         foreach (char c in s) {
1459                                 if (c < '\u3100')
1460                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1461                                 else {
1462                                         arr [(int) c - offset] = (ushort) v++;
1463                                         if (v % 256 == 0)
1464                                                 v += 2;
1465                                 }
1466                         }
1467
1468                         // Chinese Traditional
1469                         category = "cht";
1470                         arr = cjkCHT;
1471                         offset = 0;//char.MaxValue - arr.Length;
1472                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1473                         v = 0x8002;
1474                         foreach (char c in s) {
1475                                 if (c < '\u4E00')
1476                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1477                                 else {
1478                                         arr [(int) c - offset] = (ushort) v++;
1479                                         if (v % 256 == 0)
1480                                                 v += 2;
1481                                 }
1482                         }
1483
1484                         // Japanese
1485                         category = "ja";
1486                         arr = cjkJA;
1487                         offset = 0;//char.MaxValue - arr.Length;
1488
1489                         // SPECIAL CASES
1490                         arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1491                         arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1492                         arr [0x337E] = 0x8005;
1493                         arr [0x337D] = 0x8006;
1494                         arr [0x337C] = 0x8007;
1495
1496                         v = 0x8008;
1497                         foreach (JISCharacter jc in jisJapanese) {
1498                                 if (jc.JIS < 0x8800)
1499                                         continue;
1500                                 char c = (char) jc.CP;
1501
1502                                 if (c < '\u4E00')
1503                                         // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1504                                         continue;
1505                                 else {
1506                                         arr [(int) c - offset] = (ushort) v++;
1507                                         if (v % 256 == 0)
1508                                                 v += 2;
1509
1510                                         // SPECIAL CASES:
1511                                         if (c == '\u662D') // U+337C
1512                                                 continue;
1513                                         if (c == '\u5927') // U+337D
1514                                                 continue;
1515                                         if (c == '\u5E73') // U+337B
1516                                                 continue;
1517                                         if (c == '\u660E') // U+337E
1518                                                 continue;
1519                                         if (c == '\u9686') // U+F9DC
1520                                                 continue;
1521
1522                                         // FIXME: there are still remaining
1523                                         // characters after U+FA0C.
1524 //                                      for (int k = 0; k < char.MaxValue; k++) {
1525                                         for (int k = 0; k < '\uFA0D'; k++) {
1526                                                 if (decompIndex [k] == 0 || IsIgnorable (k))
1527                                                         continue;
1528                                                 if (decompValues [decompIndex [k]] == c /*&&
1529                                                         decompLength [k] == 1*/ ||
1530                                                         decompLength [k] == 3 &&
1531                                                         decompValues [decompIndex [k] + 1] == c) {
1532                                                         arr [k - offset] = (ushort) v++;
1533                                                         if (v % 256 == 0)
1534                                                                 v += 2;
1535                                                 }
1536                                         }
1537                                 }
1538                         }
1539
1540                         // Korean
1541                         // Korean weight is somewhat complex. It first shifts
1542                         // Hangul category from 52-x to 80-x (they are anyways
1543                         // computed). CJK ideographs are placed at secondary
1544                         // weight, like XX YY 01 zz 01, where XX and YY are
1545                         // corresponding "reset" value and zz is 41,43,45...
1546                         //
1547                         // Unlike chs,cht and ja, Korean value is a combined
1548                         // ushort which is computed as category
1549                         //
1550                         category = "ko";
1551                         arr = cjkKO;
1552                         offset = 0;//char.MaxValue - arr.Length;
1553                         doc.Load (koXML);
1554                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1555                                 XmlElement sc = (XmlElement) reset.NextSibling;
1556                                 // compute "category" and "level 1" for the
1557                                 // target "reset" Hangle syllable
1558                                 char rc = reset.InnerText [0];
1559                                 int ri = ((int) rc - 0xAC00) + 1;
1560                                 ushort p = (ushort)
1561                                         ((ri / 254) * 256 + (ri % 254) + 2);
1562                                 // Place the characters after the target.
1563                                 s = sc.InnerText;
1564                                 v = 0x41;
1565                                 foreach (char c in s) {
1566                                         arr [(int) c - offset] = p;
1567                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1568                                         v += 2;
1569                                 }
1570                         }
1571                 }
1572
1573                 #endregion
1574
1575                 #region Generation
1576
1577                 void FillIgnorables ()
1578                 {
1579                         for (int i = 0; i <= char.MaxValue; i++) {
1580                                 if (Char.GetUnicodeCategory ((char) i) ==
1581                                         UnicodeCategory.OtherNotAssigned)
1582                                         continue;
1583                                 if (IsIgnorable (i))
1584                                         ignorableFlags [i] |= 1;
1585                                 if (IsIgnorableSymbol (i))
1586                                         ignorableFlags [i] |= 2;
1587                                 if (IsIgnorableNonSpacing (i))
1588                                         ignorableFlags [i] |= 4;
1589                         }
1590                 }
1591
1592                 void ModifyUnidata ()
1593                 {
1594                         // Modify some decomposition equivalence
1595                         for (int i = 0xFE31; i <= 0xFE34; i++) {
1596                                 decompType [i] = 0;
1597                                 decompIndex [i] = 0;
1598                                 decompLength [i] = 0;
1599                         }
1600                         decompType [0x037E] = 0;
1601                         decompIndex [0x037E] = 0;
1602                         decompLength [0x037E] = 0;
1603
1604                         // Hangzhou numbers
1605                         for (int i = 0x3021; i <= 0x3029; i++)
1606                                 diacritical [i] = 0x4E;
1607                         // Korean parens numbers
1608                         for (int i = 0x3200; i <= 0x321C; i++)
1609                                 diacritical [i] = 0xA;
1610                         for (int i = 0x3260; i <= 0x327B; i++)
1611                                 diacritical [i] = 0xC;
1612
1613                         // LAMESPEC: these remapping should not be done.
1614                         // Windows have incorrect CJK compat mappings.
1615                         decompValues [decompIndex [0x32A9]] = 0x91AB;
1616                         decompLength [0x323B] = 1;
1617                         decompValues [decompIndex [0x323B]] = 0x5B78;
1618                         decompValues [decompIndex [0x32AB]] = 0x5B78;
1619                         decompValues [decompIndex [0x32A2]] = 0x5BEB;
1620                         decompLength [0x3238] = 1;
1621                         decompValues [decompIndex [0x3238]] = 0x52DE;
1622                         decompValues [decompIndex [0x3298]] = 0x52DE;
1623
1624                         // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
1625                         decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1626                         decompValues [decompIndex [0xFA0C]] = 0x5140;
1627                         decompLength [0xFA0C] = 1;
1628                         decompIndex [0xF929] = decompLength [0xF929] = 0;
1629
1630                         decompValues [decompIndex [0xF92C]] = 0x90DE;
1631                 }
1632
1633                 void ModifyParsedValues ()
1634                 {
                             // Applies hand-written adjustments to two tables:
                             // the per-code-point diacritical weights, and the
                             // sortableCharNames list (renames and removals).
                             //
1635                         // Some Cyrillic diacritical weights. They seem to be
1636                         // based on old character names, so it's quicker to
1637                         // set them directly here.
                             // Each pair of adjacent code points below shares
                             // one weight.
1638                         diacritical [0x0496] = diacritical [0x0497] = 7;
1639                         diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1640                         diacritical [0x049A] = diacritical [0x049B] = 0x17;
1641                         diacritical [0x049C] = diacritical [0x049D] = 9;
1642                         diacritical [0x049E] = diacritical [0x049F] = 4;
1643                         diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1644                         diacritical [0x04A2] = diacritical [0x04A3] = 7;
1645                         diacritical [0x04A4] = diacritical [0x04A5] = 8;
1646
1647                         // number, secondary weights
                             // numberSecondaryWeightBounds is read as consecutive
                             // (start, end) pairs with an exclusive end. Every
                             // code point in a pair's range for which
                             // Char.IsNumber is true receives the current weight;
                             // the weight starts at 0x38 and is incremented once
                             // per pair.
                             // NOTE(review): an odd-length bounds array would make
                             // numarr [i + 1] read past the end -- presumably the
                             // array always has an even length; confirm at its
                             // definition.
1648                         byte weight = 0x38;
1649                         int [] numarr = numberSecondaryWeightBounds;
1650                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1651                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1652                                         if (Char.IsNumber ((char) cp))
1653                                                 diacritical [cp] = weight;
1654
1655                         // Update name part of named characters
                             // Each element is a DictionaryEntry keyed by the
                             // code point. Depending on the code point an entry
                             // is given a replacement name, removed from the
                             // list, or left untouched.
1656                         for (int i = 0; i < sortableCharNames.Count; i++) {
1657                                 DictionaryEntry de =
1658                                         (DictionaryEntry) sortableCharNames [i];
1659                                 int cp = (int) de.Key;
                                     // Stays null unless this code point has a
                                     // replacement name below.
1660                                 string renamed = null;
1661                                 switch (cp) {
1662                                 case 0x2101: renamed = "A_1"; break;
1663                                 case 0x33C3: renamed = "A_2"; break;
1664                                 case 0x2105: renamed = "C_1"; break;
1665                                 case 0x2106: renamed = "C_2"; break;
1666                                 case 0x211E: renamed = "R1"; break;
1667                                 case 0x211F: renamed = "R2"; break;
1668                                 // Remove some of them!
                                     // The code points listed below are dropped
                                     // from sortableCharNames entirely.
1669                                 case 0x2103:
1670                                 case 0x2109:
1671                                 case 0x2116:
1672                                 case 0x2117:
1673                                 case 0x2118:
1674                                 case 0x2125:
1675                                 case 0x2127:
1676                                 case 0x2129:
1677                                 case 0x212E:
1678                                 case 0x2132:
                                             // RemoveAt shifts the following
                                             // entries down by one, so step i
                                             // back; the loop's i++ then lands
                                             // on the entry that moved into
                                             // this slot.
1679                                         sortableCharNames.RemoveAt (i);
1680                                         i--;
1681                                         continue;
1682                                 }
                                     // Replace the entry in place, keeping the
                                     // same code point as its key.
1683                                 if (renamed != null)
1684                                         sortableCharNames [i] =
1685                                                 new DictionaryEntry (cp, renamed);
1686                         }
1687                 }
1688
1689                 void GenerateCore ()
1690                 {
1691                         UnicodeCategory uc;
1692
1693                         #region Specially ignored // 01
1694                         // This will raise "Defined" flag up.
1695                         // FIXME: Check If it is really fine. Actually for
1696                         // Japanese voice marks this code does remapping.
1697                         foreach (char c in specialIgnore)
1698                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1699                         #endregion
1700
1701                         #region Extenders (FF FF)
1702                         fillIndex [0xFF] = 0xFF;
1703                         char [] specialBiggest = new char [] {
1704                                 '\u3005', '\u3031', '\u3032', '\u309D',
1705                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1706                                 '\uFE7C', '\uFE7D', '\uFF70'};
1707                         foreach (char c in specialBiggest)
1708                                 AddCharMap (c, 0xFF, 0);
1709                         #endregion
1710
1711                         #region Variable weights
1712                         // Controls : 06 03 - 06 3D
1713                         fillIndex [0x6] = 3;
1714                         for (int i = 0; i < 65536; i++) {
1715                                 if (IsIgnorable (i))
1716                                         continue;
1717                                 char c = (char) i;
1718                                 uc = Char.GetUnicodeCategory (c);
1719                                 // NEL is whitespace but not ignored here.
1720                                 if (uc == UnicodeCategory.Control &&
1721                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1722                                         AddCharMap (c, 6, 1);
1723                         }
1724
1725                         // Apostrophe 06 80
1726                         fillIndex [0x6] = 0x80;
1727                         AddCharMap ('\'', 6, 0);
1728                         AddCharMap ('\uFF07', 6, 1);
1729                         AddCharMap ('\uFE63', 6, 1);
1730
1731                         // SPECIAL CASE: fill FE32 here prior to it being added
1732                         // at 2013. Windows does not always respect NFKD.
1733                         map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1734
1735                         // Hyphen/Dash : 06 81 - 06 90
1736                         for (int i = 0; i < char.MaxValue; i++) {
1737                                 if (!IsIgnorable (i) &&
1738                                         Char.GetUnicodeCategory ((char) i) ==
1739                                         UnicodeCategory.DashPunctuation) {
1740                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1741                                         if (i == 0x2011) {
1742                                                 // SPECIAL: add 2027 and 2043
1743                                                 // Maybe they are regarded the
1744                                                 // same hyphens in "central"
1745                                                 // position.
1746                                                 AddCharMap ('\u2027', 6, 1);
1747                                                 AddCharMap ('\u2043', 6, 1);
1748                                         }
1749                                 }
1750                         }
1751                         // They are regarded as primarily equivalent to '-'
1752                         map [0x208B] = new CharMapEntry (6, 0x82, 0);
1753                         map [0x207B] = new CharMapEntry (6, 0x82, 0);
1754                         map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1755
1756                         // Arabic variable weight chars 06 A0 -
1757                         fillIndex [6] = 0xA0;
1758                         // vowels
1759                         for (int i = 0x64B; i <= 0x650; i++)
1760                                 AddArabicCharMap ((char) i);
1761                         // sukun
1762                         AddCharMapGroup ('\u0652', 6, 1, 0);
1763                         // shadda
1764                         AddCharMapGroup ('\u0651', 6, 1, 0);
1765                         #endregion
1766
1767
1768                         #region Nonspacing marks // 01
1769                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1770
1771                         // Combining diacritical marks: 01 DC -
1772
1773                         fillIndex [0x1] = 0x41;
1774                         for (int i = 0x030E; i <= 0x0326; i++)
1775                                 if (!IsIgnorable (i))
1776                                         AddCharMap ((char) i, 0x1, 1);
1777                         for (int i = 0x0329; i <= 0x0334; i++)
1778                                 if (!IsIgnorable (i))
1779                                         AddCharMap ((char) i, 0x1, 1);
1780                         fillIndex [0x1]++;
1781                         for (int i = 0x0339; i <= 0x0341; i++)
1782                                 if (!IsIgnorable (i))
1783                                         AddCharMap ((char) i, 0x1, 1);
1784                         fillIndex [0x1] = 0x74;
1785                         for (int i = 0x0346; i <= 0x0348; i++)
1786                                 if (!IsIgnorable (i))
1787                                         AddCharMap ((char) i, 0x1, 1);
1788                         for (int i = 0x02BE; i <= 0x02BF; i++)
1789                                 if (!IsIgnorable (i))
1790                                         AddCharMap ((char) i, 0x1, 1);
1791                         for (int i = 0x02C1; i <= 0x02C5; i++)
1792                                 if (!IsIgnorable (i))
1793                                         AddCharMap ((char) i, 0x1, 1);
1794                         for (int i = 0x02CE; i <= 0x02CF; i++)
1795                                 if (!IsIgnorable (i))
1796                                         AddCharMap ((char) i, 0x1, 1);
1797                         fillIndex [0x1]++;
1798                         for (int i = 0x02D1; i <= 0x02D3; i++)
1799                                 if (!IsIgnorable (i))
1800                                         AddCharMap ((char) i, 0x1, 1);
1801                         AddCharMap ('\u02DE', 0x1, 1);
1802                         for (int i = 0x02E4; i <= 0x02E9; i++)
1803                                 if (!IsIgnorable (i))
1804                                         AddCharMap ((char) i, 0x1, 1);
1805
1806                         // FIXME: needs more love here (it should eliminate
1807                         // all the hacky code above).
1808                         for (int i = 0x0300; i < 0x0370; i++)
1809                                 if (!IsIgnorable (i) && diacritical [i] != 0
1810                                         /* especiall here*/ && !map [i].Defined)
1811                                         map [i] = new CharMapEntry (
1812                                                 0x1, 0x1, diacritical [i]);
1813
1814                         // Cyrillic and Armenian nonspacing mark
1815                         fillIndex [0x1] = 0x94;
1816                         for (int i = 0x400; i < 0x580; i++)
1817                                 if (!IsIgnorable (i) &&
1818                                         Char.GetUnicodeCategory ((char) i) ==
1819                                         UnicodeCategory.NonSpacingMark)
1820                                         AddCharMap ((char) i, 1, 1);
1821
1822                         fillIndex [0x1] = 0x8D;
1823                         // syriac dotted nonspacing marks (1)
1824                         AddCharMap ('\u0740', 0x1, 1);
1825                         AddCharMap ('\u0741', 0x1, 1);
1826                         AddCharMap ('\u0742', 0x1, 1);
1827                         // syriac oblique nonspacing marks
1828                         AddCharMap ('\u0747', 0x1, 1);
1829                         AddCharMap ('\u0748', 0x1, 1);
1830                         // syriac dotted nonspacing marks (2)
1831                         fillIndex [0x1] = 0x94; // this reset is mandatory
1832                         AddCharMap ('\u0732', 0x1, 1);
1833                         AddCharMap ('\u0735', 0x1, 1);
1834                         AddCharMap ('\u0738', 0x1, 1);
1835                         AddCharMap ('\u0739', 0x1, 1);
1836                         AddCharMap ('\u073C', 0x1, 1);
1837                         // SPECIAL CASES: superscripts
1838                         AddCharMap ('\u073F', 0x1, 1);
1839                         AddCharMap ('\u0711', 0x1, 1);
1840                         // syriac "DOTS"
1841                         for (int i = 0x0743; i <= 0x0746; i++)
1842                                 AddCharMap ((char) i, 0x1, 1);
1843                         for (int i = 0x0730; i <= 0x0780; i++)
1844                                 if (!map [i].Defined &&
1845                                         Char.GetUnicodeCategory ((char) i) ==
1846                                         UnicodeCategory.NonSpacingMark)
1847                                         AddCharMap ((char) i, 0x1, 1);
1848
1849                         // LAMESPEC: It should not stop at '\u20E1'. There are
1850                         // a few more characters (that however results in
1851                         // overflow of level 2 unless we start before 0xDD).
1852                         fillIndex [0x1] = 0xDD;
1853                         for (int i = 0x20D0; i <= 0x20DC; i++)
1854                                 AddCharMap ((char) i, 0x1, 1);
1855                         fillIndex [0x1] = 0xEC;
1856                         for (int i = 0x20DD; i <= 0x20E1; i++)
1857                                 AddCharMap ((char) i, 0x1, 1);
1858                         fillIndex [0x1] = 0x7;
1859                         for (int i = 0x302A; i <= 0x302D; i++)
1860                                 AddCharMap ((char) i, 0x1, 1);
1861                         fillIndex [0x1] = 0x50; // I wonder how they are sorted
1862                         for (int i = 0x02D4; i <= 0x02D7; i++)
1863                                 AddCharMap ((char) i, 0x1, 1);
1864
1865                         // They are not part of Nonspacing marks, but have
1866                         // only diacritical weight.
1867                         for (int i = 0x3099; i <= 0x309C; i++)
1868                                 map [i] = new CharMapEntry (1, 1, 1);
1869                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
1870                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
1871                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1872                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1873                         for (int i = 0x30FC; i <= 0x30FE; i++)
1874                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1875
1876                         fillIndex [0x1] = 0xA;
1877                         for (int i = 0x0951; i <= 0x0954; i++)
1878                                 AddCharMap ((char) i, 0x1, 2);
1879
1880                         #endregion
1881
1882
1883                         #region Whitespaces // 07 03 -
1884                         fillIndex [0x7] = 0x2;
1885                         AddCharMap (' ', 0x7, 2);
1886                         AddCharMap ('\u00A0', 0x7, 1);
1887                         for (int i = 9; i <= 0xD; i++)
1888                                 AddCharMap ((char) i, 0x7, 1);
1889                         for (int i = 0x2000; i <= 0x200B; i++)
1890                                 AddCharMap ((char) i, 0x7, 1);
1891
1892                         fillIndex [0x7] = 0x17;
1893                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1894                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1895
1896                         // Characters which used to represent layout control.
1897                         // LAMESPEC: Windows developers seem to have thought
1898                         // that those characters are kind of whitespaces,
1899                         // while they aren't.
1900                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1901                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1902
1903                         #endregion
1904
1905                         // category 09 - continued symbols from 08
1906                         fillIndex [0x9] = 2;
1907                         // misc tech mark
1908                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1909                                 AddCharMap ((char) cp, 0x9, 1, 0);
1910
1911                         // arrows
1912                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
1913                         foreach (DictionaryEntry de in arrowValues) {
1914                                 int idx = (int) de.Value;
1915                                 int cp = (int) de.Key;
1916                                 if (map [cp].Defined)
1917                                         continue;
1918                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1919                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1920                                 arrowLv2 [idx]++;
1921                         }
1922                         // boxes
1923                         byte [] boxLv2 = new byte [128];
1924                         // 0-63 will be used for those offsets are positive,
1925                         // and 64-127 are for negative ones.
1926                         for (int i = 0; i < boxLv2.Length; i++)
1927                                 boxLv2 [i] = 3;
1928                         foreach (DictionaryEntry de in boxValues) {
1929                                 int cp = (int) de.Key;
1930                                 int off = (int) de.Value;
1931                                 if (map [cp].Defined)
1932                                         continue;
1933                                 if (off < 0) {
1934                                         fillIndex [0x9] = (byte) (0xE5 + off);
1935                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
1936                                 }
1937                                 else {
1938                                         fillIndex [0x9] = (byte) (0xE5 + off);
1939                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1940                                 }
1941                         }
1942                         // Some special characters (slanted)
1943                         fillIndex [0x9] = 0xF4;
1944                         AddCharMap ('\u2571', 0x9, 3);
1945                         AddCharMap ('\u2572', 0x9, 3);
1946                         AddCharMap ('\u2573', 0x9, 3);
1947
1948                         // FIXME: implement 0A
1949                         #region Symbols
1950                         fillIndex [0xA] = 2;
1951                         // byte currency symbols
1952                         for (int cp = 0; cp < 0x100; cp++) {
1953                                 uc = Char.GetUnicodeCategory ((char) cp);
1954                                 if (!IsIgnorable (cp) &&
1955                                         uc == UnicodeCategory.CurrencySymbol &&
1956                                         cp != '$')
1957                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1958                         }
1959                         // byte other symbols
1960                         for (int cp = 0; cp < 0x100; cp++) {
1961                                 if (cp == 0xA6)
1962                                         continue; // SPECIAL: skip FIXME: why?
1963                                 uc = Char.GetUnicodeCategory ((char) cp);
1964                                 if (!IsIgnorable (cp) &&
1965                                         uc == UnicodeCategory.OtherSymbol ||
1966                                         cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
1967                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1968                         }
1969                         // U+30FB here
1970                         AddCharMapGroup ('\u30FB', 0xA, 1, 0);
1971
1972                         for (int cp = 0x2020; cp <= 0x2031; cp++)
1973                                 if (Char.IsPunctuation ((char) cp))
1974                                         AddCharMap ((char) cp, 0xA, 1, 0);
1975                         // SPECIAL CASES: why?
1976                         AddCharMap ('\u203B', 0xA, 1, 0);
1977                         AddCharMap ('\u2040', 0xA, 1, 0);
1978                         AddCharMap ('\u2041', 0xA, 1, 0);
1979                         AddCharMap ('\u2042', 0xA, 1, 0);
1980
1981                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1982                                 AddCharMap ((char) cp, 0xA, 1, 0);
1983
1984                         // 3004 is skipped at first...
1985                         for (int cp = 0x3010; cp <= 0x3040; cp++)
1986                                 if (Char.IsSymbol ((char) cp))
1987                                         AddCharMap ((char) cp, 0xA, 1, 0);
1988                         // SPECIAL CASES: added here
1989                         AddCharMap ('\u3004', 0xA, 1, 0);
1990                         AddCharMap ('\u327F', 0xA, 1, 0);
1991
1992                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1993                                 AddCharMap ((char) cp, 0xA, 1, 0);
1994                         // Dingbats
1995                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1996                                 if (Char.IsSymbol ((char) cp))
1997                                         AddCharMap ((char) cp, 0xA, 1, 0);
1998                         // OCR
1999                         for (int i = 0x2440; i < 0x2460; i++)
2000                                 AddCharMap ((char) i, 0xA, 1, 0);
2001
2002                         // SPECIAL CASES: why?
2003                         AddCharMap ('\u0E3F', 0xA, 1, 0);
2004                         AddCharMap ('\u2117', 0xA, 1, 0);
2005                         AddCharMap ('\u20AC', 0xA, 1, 0);
2006                         #endregion
2007
2008                         #region Numbers // 0C 02 - 0C E1
2009                         fillIndex [0xC] = 2;
2010
2011                         // 9F8 : Bengali "one less than the denominator"
2012                         AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2013
2014                         ArrayList numbers = new ArrayList ();
2015                         for (int i = 0; i < 65536; i++)
2016                                 if (!IsIgnorable (i) &&
2017                                         Char.IsNumber ((char) i) &&
2018                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2019                                         numbers.Add (i);
2020
2021                         ArrayList numberValues = new ArrayList ();
2022                         foreach (int i in numbers)
2023                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2024                         // SPECIAL CASE: Cyrillic Thousand sign
2025                         numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2026                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2027
2028 //foreach (DictionaryEntry de in numberValues)
2029 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2030
2031                         // FIXME: fillIndex adjustment lines are too
2032                         // complicated. It must be simpler.
2033                         decimal prevValue = -1;
2034                         foreach (DictionaryEntry de in numberValues) {
2035                                 int cp = (int) de.Key;
2036                                 decimal currValue = (decimal) de.Value;
2037                                 bool addnew = false;
2038                                 if (prevValue < currValue &&
2039                                         prevValue - (int) prevValue == 0 &&
2040                                         prevValue >= 1) {
2041
2042                                         addnew = true;
2043                                         // Process Hangzhou and Roman numbers
2044
2045                                         // There are some SPECIAL cases.
2046                                         if (currValue != 4) // no increment for 4
2047                                                 fillIndex [0xC]++;
2048
2049                                         int xcp;
2050                                         if (currValue <= 13) {
2051                                                 if (currValue == 4)
2052                                                         fillIndex [0xC]++;
2053                                                 // SPECIAL CASE
2054                                                 if (currValue == 11)
2055                                                         AddCharMap ('\u0BF0', 0xC, 1);
2056                                                 xcp = (int) prevValue + 0x2160 - 1;
2057                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2058                                                 xcp = (int) prevValue + 0x2170 - 1;
2059                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2060                                                 fillIndex [0xC]++;
2061                                         }
2062                                         if (currValue < 12)
2063                                                 fillIndex [0xC]++;
2064                                         if (currValue <= 10) {
2065                                                 xcp = (int) prevValue + 0x3021 - 1;
2066                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2067                                                 fillIndex [0xC]++;
2068                                         }
2069                                 }
2070                                 if (prevValue < currValue)
2071                                         prevValue = currValue;
2072                                 if (map [cp].Defined)
2073                                         continue;
2074                                 // Hangzhou and Roman numerals are added later
2075                                 // (the code is above)
2076                                 if (0x3021 <= cp && cp < 0x302A
2077                                         || 0x2160 <= cp && cp < 0x216C
2078                                         || 0x2170 <= cp && cp < 0x217C)
2079                                         continue;
2080
2081                                 if (cp == 0x215B) // FIXME: why?
2082                                         fillIndex [0xC] += 2;
2083                                 else if (cp == 0x3021) // FIXME: why?
2084                                         fillIndex [0xC]++;
2085                                 if (addnew || cp <= '9') {
2086                                         int mod = (int) currValue - 1;
2087                                         int xcp;
2088                                         if (1 <= currValue && currValue <= 11) {
2089                                                 xcp = mod + 0x2776;
2090                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2091                                                 xcp = mod + 0x2780;
2092                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2093                                                 xcp = mod + 0x278A;
2094                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2095                                         }
2096                                         if (1 <= currValue && currValue <= 20) {
2097                                                 xcp = mod + 0x2460;
2098                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2099                                                 xcp = mod + 0x2474;
2100                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2101                                                 xcp = mod + 0x2488;
2102                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2103                                         }
2104                                 }
2105                                 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2106                                         fillIndex [0xC]++;
2107                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2108
2109                                 switch (cp) {
2110                                 // Maybe Bengali digit numbers do not increase
2111                                 // indexes, but 0x09E6 does.
2112                                 case 0x09E7: case 0x09E8: case 0x09E9:
2113                                 case 0x09EA:
2114                                 // SPECIAL CASES
2115                                 case 0x0BF0: case 0x2180: case 0x2181:
2116                                         break;
2117                                 // SPECIAL CASE
2118                                 case 0x0BF1:
2119                                         fillIndex [0xC]++;
2120                                         break;
2121                                 default:
2122                                         if (currValue < 11 || currValue == 1000)
2123                                                 fillIndex [0xC]++;
2124                                         break;
2125                                 }
2126
2127                                 // Add special cases that are not regarded as
2128                                 // numbers in UnicodeCategory speak.
2129                                 if (cp == '5') {
2130                                         // TONE FIVE
2131                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2132                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2133                                 }
2134                                 else if (cp == '2' || cp == '6') // FIXME: why?
2135                                         fillIndex [0xC]++;
2136                         }
2137
2138                         // 221E: infinity
2139                         fillIndex [0xC] = 0xFF;
2140                         AddCharMap ('\u221E', 0xC, 1);
2141                         #endregion
2142
2143                         #region Letters and NonSpacing Marks (general)
2144
2145                         // ASCII Latin alphabets
2146                         for (int i = 0; i < alphabets.Length; i++)
2147                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2148
2149                         // non-ASCII Latin alphabets
2150                         // FIXME: there is no such characters that are placed
2151                         // *after* "alphabets" array items. This is nothing
2152                         // more than a hack that creates dummy weight for
2153                         // primary characters.
2154                         for (int i = 0x0080; i < 0x0300; i++) {
2155                                 if (!Char.IsLetter ((char) i))
2156                                         continue;
2157                                 // For those Latin Letters which has NFKD are
2158                                 // not added as independent primary character.
2159                                 if (decompIndex [i] != 0)
2160                                         continue;
2161                                 // SPECIAL CASES:
2162                                 // 1.some alphabets have primarily
2163                                 //   equivalent ASCII alphabets.
2164                                 // 2.some have independent primary weights,
2165                                 //   but inside a-to-z range.
2166                                 // 3.there are some expanded characters that
2167                                 //   are not part of Unicode Standard NFKD.
2168                                 // 4. some characters are letter in IsLetter
2169                                 //   but not in sortkeys (maybe unicode version
2170                                 //   difference caused it).
2171                                 switch (i) {
2172                                 // 1. skipping them does not make sense
2173 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2174 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2175 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2176 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2177 //                              case 0x19B: case 0x19C:
2178                                 // 2. skipping them does not make sense
2179 //                              case 0x14A: // Ng
2180 //                              case 0x14B: // ng
2181                                 // 3.
2182                                 case 0xC6: // AE
2183                                 case 0xE6: // ae
2184                                 case 0xDE: // Icelandic Thorn
2185                                 case 0xFE: // Icelandic Thorn
2186                                 case 0xDF: // German ss
2187                                 case 0xFF: // German ss
2188                                 // 4.
2189                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2190                                 // not classified yet
2191 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2192 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2193 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2194 //                              case 0x1DD:
2195                                         continue;
2196                                 }
2197                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2198                         }
2199
2200                         // Greek and Coptic
2201                         fillIndex [0xF] = 02;
2202                         for (int i = 0x0380; i < 0x0390; i++)
2203                                 if (Char.IsLetter ((char) i))
2204                                         AddLetterMap ((char) i, 0xF, 1);
2205                         fillIndex [0xF] = 02;
2206                         for (int i = 0x0391; i < 0x03CF; i++)
2207                                 if (Char.IsLetter ((char) i))
2208                                         AddLetterMap ((char) i, 0xF, 1);
2209                         fillIndex [0xF] = 0x40;
2210                         for (int i = 0x03D0; i < 0x0400; i++)
2211                                 if (Char.IsLetter ((char) i))
2212                                         AddLetterMap ((char) i, 0xF, 1);
2213
2214                         // Cyrillic.
2215                         // Cyrillic letters are sorted like Latin letters, i.e.
2216                         // culture-specific letters are interleaved within the
2217                         // standard Cyrillic sequence.
2218                         //
2219                         // We can't use UCA here; it has different sorting.
2220                         char [] orderedCyrillic = new char [] {
2221                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2222                                 '\u0452', // DJE for Serbocroatian
2223                                 '\u0435',
2224                                 '\u0454', // IE for Ukrainian
2225                                 '\u0436', '\u0437',
2226                                 '\u0455', // DZE
2227                                 '\u0438',
2228                                 '\u0456', // Byelorussian-Ukrainian I
2229                                 '\u0457', // YI
2230                                 '\u0439',
2231                                 '\u0458', // JE
2232                                 '\u043A', '\u043B',
2233                                 '\u0459', // LJE
2234                                 '\u043C', '\u043D',
2235                                 '\u045A', // NJE
2236                                 '\u043E',
2237                                 // 4E9 goes here.
2238                                 '\u043F', '\u0440', '\u0441', '\u0442',
2239                                 '\u045B', // TSHE for Serbocroatian
2240                                 '\u0443',
2241                                 '\u045E', // Short U for Byelorussian
2242                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2243                                 '\u0444', '\u0445', '\u0446', '\u0447',
2244                                 '\u045F', // DZHE
2245                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2246                                 '\u044D', '\u044E', '\u044F'};
2247
2248                         // Some characters map to basic Cyrillic letters; see
2249                         // the character names in UnicodeData.txt for the
2250                         // sources. Here I simply declare an equivalence array:
2251                         // entry i is the base letter for U+0490 + i * 2,
2252                         // i.e. the small letters (U+0491, ...) are skipped.
2253                         char [] cymap_src = new char [] {
2254                                 '\u0433', '\u0433', '\u0433', '\u0436',
2255                                 '\u0437', '\u043A', '\u043A', '\u043A',
2256                                 '\u043A', '\u043D', '\u043D', '\u043F',
2257                                 '\u0445', '\u0441', '\u0442', '\u0443',
2258                                 '\u0443', '\u0445', '\u0446', '\u0447',
2259                                 '\u0447', '\u0432', '\u0435', '\u0435',
2260                                 '\u0406', '\u0436', '\u043A', '\u043D',
2261                                 '\u0447', '\u0435'};
2262
2263                         fillIndex [0x10] = 0x8D;
2264                         for (int i = 0x0460; i < 0x0481; i++) {
2265                                 if (Char.IsLetter ((char) i)) {
2266                                         if (i == 0x0476)
2267                                                 // U+476/477 have the same
2268                                                 // primary weight as U+474/475.
2269                                                 fillIndex [0x10] -= 3;
2270                                         AddLetterMap ((char) i, 0x10, 3);
2271                                 }
2272                         }
2273
2274                         fillIndex [0x10] = 0x6;
2275                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2276                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2277                                 if (!IsIgnorable ((int) c) &&
2278                                         Char.IsLetter (c) &&
2279                                         !map [c].Defined) {
2280                                         AddLetterMap (c, 0x10, 0);
2281                                         fillIndex [0x10] += 3;
2282                                 }
2283                         }
2284
2285                         for (int i = 0; i < cymap_src.Length; i++) {
2286                                 char c = cymap_src [i];
2287                                 fillIndex [0x10] = map [c].Level1;
2288                                 int c2 = 0x0490 + i * 2;
2289                                 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2290                         }
2291
2292                         // Armenian
2293                         fillIndex [0x11] = 0x3;
2294                         fillIndex [0x1] = 0x98;
2295                         for (int i = 0x0531; i < 0x0586; i++) {
2296                                 if (i == 0x0559 || i == 0x55A)
2297                                         AddCharMap ((char) i, 1, 1);
2298                                 if (Char.IsLetter ((char) i))
2299                                         AddLetterMap ((char) i, 0x11, 1);
2300                         }
2301
2302                         // Hebrew
2303                         // -Letters
2304                         fillIndex [0x12] = 0x2;
2305                         for (int i = 0x05D0; i < 0x05FF; i++)
2306                                 if (Char.IsLetter ((char) i))
2307                                         AddLetterMap ((char) i, 0x12, 1);
2308                         // -Accents
2309                         fillIndex [0x1] = 0x3;
2310                         for (int i = 0x0591; i <= 0x05C2; i++) {
2311                                 if (i == 0x05A3 || i == 0x05BB)
2312                                         fillIndex [0x1]++;
2313                                 if (i != 0x05BE)
2314                                         AddCharMap ((char) i, 0x1, 1);
2315                         }
2316
2317                         // Arabic
2318                         fillIndex [0x1] = 0x8E;
2319                         fillIndex [0x13] = 0x3;
2320                         for (int i = 0x0621; i <= 0x064A; i++) {
2321                                 // Abjad
2322                                 if (Char.GetUnicodeCategory ((char) i)
2323                                         != UnicodeCategory.OtherLetter) {
2324                                         // FIXME: arabic nonspacing marks are
2325                                         // in different order.
2326                                         AddCharMap ((char) i, 0x1, 1);
2327                                         continue;
2328                                 }
2329 //                              map [i] = new CharMapEntry (0x13,
2330 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2331                                 fillIndex [0x13] =
2332                                         (byte) arabicLetterPrimaryValues [i];
2333                                 byte formDiacritical = 8; // default
2334                                 // SPECIAL CASES:
2335                                 switch (i) {
2336                                 case 0x0622: formDiacritical = 9; break;
2337                                 case 0x0623: formDiacritical = 0xA; break;
2338                                 case 0x0624: formDiacritical = 5; break;
2339                                 case 0x0625: formDiacritical = 0xB; break;
2340                                 case 0x0626: formDiacritical = 7; break;
2341                                 case 0x0649: formDiacritical = 5; break;
2342                                 case 0x064A: formDiacritical = 7; break;
2343                                 }
2344                                 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2345                         }
2346                         for (int i = 0x0670; i < 0x0673; i++)
2347                                 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2348                         fillIndex [0x13] = 0x84;
2349                         for (int i = 0x0674; i < 0x06D6; i++)
2350                                 if (Char.IsLetter ((char) i))
2351                                         AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2352
2353                         // Devanagari
2354
2355                         // FIXME: this could be fixed in more decent way
2356                         for (int i = 0x0958; i <= 0x095F; i++)
2357                                 diacritical [i] = 8;
2358
2359                         // FIXME: it does seem straight codepoint mapping.
2360                         fillIndex [0x14] = 04;
2361                         for (int i = 0x0901; i < 0x0905; i++)
2362                                 if (!IsIgnorable (i))
2363                                         AddLetterMap ((char) i, 0x14, 2);
2364                         fillIndex [0x14] = 0xB;
2365                         for (int i = 0x0905; i < 0x093A; i++) {
2366                                 if (i == 0x0928)
2367                                         AddCharMap ('\u0929', 0x14, 0, 8);
2368                                 if (i == 0x0930)
2369                                         AddCharMap ('\u0931', 0x14, 0, 8);
2370                                 if (i == 0x0933)
2371                                         AddCharMap ('\u0934', 0x14, 0, 8);
2372                                 if (Char.IsLetter ((char) i))
2373                                         AddLetterMap ((char) i, 0x14, 4);
2374                                 if (i == 0x090B)
2375                                         AddCharMap ('\u0960', 0x14, 4);
2376                                 if (i == 0x090C)
2377                                         AddCharMap ('\u0961', 0x14, 4);
2378                         }
2379                         fillIndex [0x14] = 0xDA;
2380                         for (int i = 0x093E; i < 0x0945; i++)
2381                                 if (!IsIgnorable (i))
2382                                         AddLetterMap ((char) i, 0x14, 2);
2383                         fillIndex [0x14] = 0xEC;
2384                         for (int i = 0x0945; i < 0x094F; i++)
2385                                 if (!IsIgnorable (i))
2386                                         AddLetterMap ((char) i, 0x14, 2);
2387
2388                         // Bengali
2389                         // -Letters
2390                         fillIndex [0x15] = 02;
2391                         for (int i = 0x0980; i < 0x9FF; i++) {
2392                                 if (IsIgnorable (i))
2393                                         continue;
2394                                 if (i == 0x09E0)
2395                                         fillIndex [0x15] = 0x3B;
2396                                 switch (Char.GetUnicodeCategory ((char) i)) {
2397                                 case UnicodeCategory.NonSpacingMark:
2398                                 case UnicodeCategory.DecimalDigitNumber:
2399                                 case UnicodeCategory.OtherNumber:
2400                                         continue;
2401                                 }
2402                                 AddLetterMap ((char) i, 0x15, 1);
2403                         }
2404                         // -Signs
2405                         fillIndex [0x1] = 0x3;
2406                         for (int i = 0x0981; i < 0x0A00; i++)
2407                                 if (Char.GetUnicodeCategory ((char) i) ==
2408                                         UnicodeCategory.NonSpacingMark)
2409                                         AddCharMap ((char) i, 0x1, 1);
2410
2411                         // Gurmukhi. orderedGurmukhi is from UCA
2412                         // FIXME: it does not look equivalent to UCA.
2413                         fillIndex [0x16] = 04;
2414                         fillIndex [0x1] = 3;
2415                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2416                                 char c = orderedGurmukhi [i];
2417                                 if (IsIgnorable ((int) c))
2418                                         continue;
2419                                 if (IsIgnorableNonSpacing (c)) {
2420                                         AddLetterMap (c, 0x1, 1);
2421                                         continue;
2422                                 }
2423                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2424                                         '\u0A66' <= c && c <= '\u0A71')
2425                                         continue;
2426                                 // SPECIAL CASES
2427                                 byte shift = 4;
2428                                 switch (c) {
2429                                 case '\u0A33': case '\u0A36': case '\u0A16':
2430                                 case '\u0A17': case '\u0A5B': case '\u0A5E':
2431                                         shift = 0;
2432                                         break;
2433                                 }
2434                                 if (c == '\u0A3E') // Skip
2435                                         fillIndex [0x16] = 0xC0;
2436                                 AddLetterMap (c, 0x16, shift);
2437                         }
2438
2439                         // Gujarati. orderedGujarati is from UCA
2440                         fillIndex [0x17] = 0x4;
2441                         // nonspacing marks
2442                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2443                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2444                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2445                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2446                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2447                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2448                         // letters go first.
2449                         for (int i = 0; i < orderedGujarati.Length; i++) {
2450                                 // SPECIAL CASE
2451                                 char c = orderedGujarati [i];
2452                                 if (Char.IsLetter (c)) {
2453                                         // SPECIAL CASES
2454                                         if (c == '\u0AB3' || c == '\u0A32')
2455                                                 continue;
2456                                         if (c == '\u0A33') {
2457                                                 AddCharMap ('\u0A32', 0x17, 0);
2458                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2459                                                 continue;
2460                                         }
2461                                         if (c == '\u0A8B')
2462                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2463                                         AddCharMap (c, 0x17, 4);
2464
2465                                         if (c == '\u0AB9')
2466                                                 AddCharMap ('\u0AB3', 0x17, 6);
2467                                 }
2468                         }
2469                         // non-letters
2470                         byte gujaratiShift = 4;
2471                         fillIndex [0x17] = 0xC0;
2472                         for (int i = 0; i < orderedGujarati.Length; i++) {
2473                                 char c = orderedGujarati [i];
2474                                 if (fillIndex [0x17] == 0xCC)
2475                                         gujaratiShift = 3;
2476                                 if (!Char.IsLetter (c)) {
2477                                         // SPECIAL CASES
2478                                         if (c == '\u0A82')
2479                                                 AddCharMap ('\u0A81', 0x17, 2);
2480                                         if (c == '\u0AC2')
2481                                                 fillIndex [0x17]++;
2482                                         AddLetterMap (c, 0x17, gujaratiShift);
2483                                 }
2484                         }
2485
2486                         // Oriya
2487                         fillIndex [0x1] = 03;
2488                         fillIndex [0x18] = 02;
2489                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2490                                 switch (Char.GetUnicodeCategory ((char) i)) {
2491                                 case UnicodeCategory.NonSpacingMark:
2492                                 case UnicodeCategory.DecimalDigitNumber:
2493                                         AddLetterMap ((char) i, 0x1, 1);
2494                                         continue;
2495                                 }
2496                                 AddLetterMap ((char) i, 0x18, 1);
2497                         }
2498
2499                         // Tamil
2500                         fillIndex [0x19] = 2;
2501                         AddCharMap ('\u0BD7', 0x19, 0);
2502                         fillIndex [0x19] = 0xA;
2503                         // vowels
2504                         for (int i = 0x0B82; i <= 0x0B94; i++)
2505                                 if (!IsIgnorable ((char) i))
2506                                         AddCharMap ((char) i, 0x19, 2);
2507                         // special vowel
2508                         fillIndex [0x19] = 0x28;
2509                         // The array for Tamil consonants is a constant.
2510                         // Windows has almost the same sequence as TAM from
2511                         // tamilnet, but it differs a bit in Grantha.
2512                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2513                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2514                         // combining marks
2515                         fillIndex [0x19] = 0x82;
2516                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2517                                 if (Char.GetUnicodeCategory ((char) i) ==
2518                                         UnicodeCategory.SpacingCombiningMark
2519                                         || i == 0x0BC0)
2520                                         AddLetterMap ((char) i, 0x19, 2);
2521
2522                         // Telugu
2523                         fillIndex [0x1A] = 0x4;
2524                         for (int i = 0x0C00; i < 0x0C62; i++) {
2525                                 if (i == 0x0C55 || i == 0x0C56)
2526                                         continue; // skip
2527                                 AddCharMap ((char) i, 0x1A, 3);
2528                                 char supp = (i == 0x0C0B) ? '\u0C60':
2529                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2530                                 if (supp == char.MinValue)
2531                                         continue;
2532                                 AddCharMap (supp, 0x1A, 3);
2533                         }
2534
2535                         // Kannada
2536                         fillIndex [0x1B] = 4;
2537                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2538                                 if (i == 0x0CD5 || i == 0x0CD6)
2539                                         continue; // ignore
2540                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2541                                         continue; // shift after 0xCB9
2542                                 AddCharMap ((char) i, 0x1B, 3);
2543                                 if (i == 0x0CB9) {
2544                                         // SPECIAL CASES: but why?
2545                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2546                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2547                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2548                                 }
2549                                 if (i == 0x0CB2)
2550                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2551                         }
2552
2553                         // Malayalam
2554                         fillIndex [0x1C] = 2;
2555                         fillIndex [0x1] = 3;
2556                         for (int i = 0x0D02; i < 0x0D61; i++) {
2557                                 // FIXME: I avoided MSCompatUnicodeTable usage
2558                                 // here (it results in recursion). So check if
2559                                 // using NonSpacingMark makes sense or not.
2560                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2561 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2562                                         AddCharMap ((char) i, 0x1C, 1);
2563                                 else if (!IsIgnorable ((char) i))
2564                                         AddCharMap ((char) i, 1, 1);
2565                         }
2566
2567                         // Thai ... note that it breaks 0x1E wall after E2B!
2568                         // Also, all Thai characters have level 2 value 3.
2569                         fillIndex [0x1E] = 2;
2570                         fillIndex [0x1] = 3;
2571                         for (int i = 0xE40; i <= 0xE44; i++)
2572                                 AddCharMap ((char) i, 0x1E, 1, 3);
2573                         for (int i = 0xE01; i < 0xE2B; i++)
2574                                 AddCharMap ((char) i, 0x1E, 6, 3);
2575                         fillIndex [0x1F] = 5;
2576                         for (int i = 0xE2B; i < 0xE30; i++)
2577                                 AddCharMap ((char) i, 0x1F, 6, 3);
2578                         fillIndex [0x1F] = 0x1E;
2579                         for (int i = 0xE30; i < 0xE3B; i++)
2580                                 AddCharMap ((char) i, 0x1F, 1, 3);
2581                         // some Thai characters remain.
2582                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2583                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2584                         foreach (char c in specialThai)
2585                                 AddCharMap (c, 0x1F, 1, 3);
2586
2587                         for (int i = 0xE00; i < 0xE80; i++)
2588                                 if (Char.GetUnicodeCategory ((char) i) ==
2589                                         UnicodeCategory.NonSpacingMark)
2590                                         AddCharMap ((char) i, 1, 1);
2591
2592                         // Lao
2593                         fillIndex [0x1F] = 2;
2594                         fillIndex [0x1] = 3;
2595                         for (int i = 0xE80; i < 0xEDF; i++) {
2596                                 if (IsIgnorable ((char) i))
2597                                         continue;
2598                                 else if (Char.IsLetter ((char) i))
2599                                         AddCharMap ((char) i, 0x1F, 1);
2600                                 else if (Char.GetUnicodeCategory ((char) i) ==
2601                                         UnicodeCategory.NonSpacingMark)
2602                                         AddCharMap ((char) i, 1, 1);
2603                         }
2604
2605                         // Georgian. orderedGeorgian is from UCA DUCET.
2606                         fillIndex [0x21] = 5;
2607                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2608                                 char c = orderedGeorgian [i];
2609                                 if (map [(int) c].Defined)
2610                                         continue;
2611                                 AddCharMap (c, 0x21, 0);
2612                                 if (c < '\u10F6')
2613                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2614                                 fillIndex [0x21] += 5;
2615                         }
2616
2617                         // Japanese Kana.
2618                         fillIndex [0x22] = 2;
2619                         int kanaOffset = 0x3041;
2620                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2621
2622                         for (int gyo = 0; gyo < 9; gyo++) {
2623                                 for (int dan = 0; dan < 5; dan++) {
2624                                         if (gyo == 7 && dan % 2 == 1) {
2625                                                 // 'ya'-gyo
2626                                                 fillIndex [0x22]++;
2627                                                 kanaOffset -= 2; // There is no space for yi and ye.
2628                                                 continue;
2629                                         }
2630                                         int cp = kanaOffset + dan * kanaLines [gyo];
2631                                         // small lines (a-gyo, ya-gyo)
2632                                         if (gyo == 0 || gyo == 7) {
2633                                                 AddKanaMap (cp, 1); // small
2634                                                 AddKanaMap (cp + 1, 1);
2635                                         }
2636                                         else
2637                                                 AddKanaMap (cp, kanaLines [gyo]);
2638                                         fillIndex [0x22]++;
2639
2640                                         if (cp == 0x30AB) {
2641                                                 // add small 'ka' (before normal one)
2642                                                 AddKanaMap (0x30F5, 1);
2643                                                 kanaOffset++;
2644                                         }
2645                                         if (cp == 0x30B1) {
2646                                                 // add small 'ke' (before normal one)
2647                                                 AddKanaMap (0x30F6, 1);
2648                                                 kanaOffset++;
2649                                         }
2650                                         if (cp == 0x3061) {
2651                                                 // add small 'Tsu' (before normal one)
2652                                                 AddKanaMap (0x3063, 1);
2653                                                 kanaOffset++;
2654                                         }
2655                                 }
2656                                 fillIndex [0x22] += 3;
2657                                 kanaOffset += 5 * kanaLines [gyo];
2658                         }
2659
2660                         // Wa-gyo is mostly special, so I just add it manually.
2661                         AddLetterMap ((char) 0x308E, 0x22, 0);
2662                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2663                         AddLetterMap ((char) 0x308F, 0x22, 0);
2664                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2665                         fillIndex [0x22]++;
2666                         AddLetterMap ((char) 0x3090, 0x22, 0);
2667                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2668                         fillIndex [0x22] += 2;
2669                         // no "Wu" in Japanese.
2670                         AddLetterMap ((char) 0x3091, 0x22, 0);
2671                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2672                         fillIndex [0x22]++;
2673                         AddLetterMap ((char) 0x3092, 0x22, 0);
2674                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2675                         // Nn
2676                         fillIndex [0x22] = 0x80;
2677                         AddLetterMap ((char) 0x3093, 0x22, 0);
2678                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2679
2680                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2681                                 map [0x30A6].Level1, 3);// voiced hiragana U
2682                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2683                                 map [0x30A6].Level1, 3);// voiced katakana U
2684
2685                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2686                                 map [0x30AB].Level1, 0);// small katakana Ka
2687                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2688                                 map [0x30B1].Level1, 0);// small katakana Ke
2689                         // voiced Wa lines
2690                         for (int i = 0x30F7; i < 0x30FB; i++)
2691                                 map [i] = new CharMapEntry (map [i - 8].Category,
2692                                         map [i - 8].Level1,
2693                                         3);
2694
2695                         // JIS Japanese square chars.
2696                         fillIndex [0x22] = 0x97;
2697                         jisJapanese.Sort (JISComparer.Instance);
2698                         foreach (JISCharacter j in jisJapanese)
2699                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2700                                         AddCharMap ((char) j.CP, 0x22, 1);
2701                         // non-JIS Japanese square chars.
2702                         nonJisJapanese.Sort (NonJISComparer.Instance);
2703                         foreach (NonJISCharacter j in nonJisJapanese)
2704                                 AddCharMap ((char) j.CP, 0x22, 1);
2705
2706                         // Bopomofo
2707                         fillIndex [0x23] = 0x02;
2708                         for (int i = 0x3105; i <= 0x312C; i++)
2709                                 AddCharMap ((char) i, 0x23, 1);
2710
2711                         // Estrangela: ancient Syriac
2712                         fillIndex [0x24] = 0x0B;
2713                         // FIXME: is 0x71E really alternative form?
2714                         ArrayList syriacAlternatives = new ArrayList (
2715                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2716                         for (int i = 0x0710; i <= 0x072C; i++) {
2717                                 if (i == 0x0711) // NonSpacingMark
2718                                         continue;
2719                                 if (syriacAlternatives.Contains (i))
2720                                         continue;
2721                                 AddCharMap ((char) i, 0x24, 4);
2722                                 // FIXME: why?
2723                                 if (i == 0x721)
2724                                         fillIndex [0x24]++;
2725                         }
2726                         foreach (int cp in syriacAlternatives)
2727                                 map [cp] = new CharMapEntry (0x24,
2728                                         (byte) (map [cp - 1].Level1 + 2),
2729                                         0);
2730                         // FIXME: Syriac NonSpacingMark should go here.
2731
2732                         // Thaana
2733                         // FIXME: it turned out that it does not look like UCA
2734                         fillIndex [0x24] = 0x6E;
2735                         fillIndex [0x1] = 0xAC;
2736                         for (int i = 0; i < orderedThaana.Length; i++) {
2737                                 char c = orderedThaana [i];
2738                                 if (IsIgnorableNonSpacing ((int) c))
2739                                         AddCharMap (c, 1, 1);
2740                                 AddCharMap (c, 0x24, 2);
2741                                 if (c == '\u0782') // SPECIAL CASE: why?
2742                                         fillIndex [0x24] += 2;
2743                         }
2744                         #endregion
2745
2746                         // FIXME: Add more culture-specific letters (that are
2747                         // not supported in Windows collation) here.
2748
2749                         // Surrogate ... they are computed.
2750
2751                         #region Hangul
2752                         // Hangul.
2753                         //
2754                         // Unlike UCA Windows Hangul sequence mixes Jongseong
2755                         // with Choseong sequence as well as Jungseong,
2756                         // adjusted to have the same primary weight for the
2757                         // same base character. So it is impossible to compute
2758                         // those sort keys.
2759                         //
2760                         // Here I introduce an ordered sequence of mixed
2761                         // 'commands' and 'characters' that is similar to
2762                         // LDML text:
2763                         //      - ',' increases primary weight.
2764                         //      - [A B] means a range, increasing index
2765                         //      - {A B} means a range, without increasing index
2766                         //      - '=' is no operation (it means the characters
2767                         //        of both sides have the same weight).
2768                         //      - '>' inserts a Hangul Syllable block that
2769                         //        contains 0x251 characters.
2770                         //      - '<' decreases the index
2771                         //      - '0'-'9' means skip count
2772                         //      - whitespaces are ignored
2773                         //
2774
2775                         string hangulSequence =
2776                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2777                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2778                         + "<{\u1113 \u1116}, \u3165,"
2779                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2780                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2781                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2782                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2783                                 + "[\u11D1 \u11D2], \u11B2,"
2784                                 + "[\u11D3 \u11D5], \u11B3,"
2785                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2786                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2787                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2788                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2789                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2790                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2791                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2792                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2793                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2794                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2795                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2796                         + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2797                                 + "\u11F1,, \u11F2,,,"
2798                                 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2799                         + "<\u114D, \u110D,,  >"
2800                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2801                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2802                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2803                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2804                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2805                                 + "[\u11F5 \u11F8]"
2806                         ;
2807
2808                         byte hangulCat = 0x52;
2809                         fillIndex [hangulCat] = 0x2;
2810
2811                         int syllableBlock = 0;
2812                         for (int n = 0; n < hangulSequence.Length; n++) {
2813                                 char c = hangulSequence [n];
2814                                 int start, end;
2815                                 if (Char.IsWhiteSpace (c))
2816                                         continue;
2817                                 switch (c) {
2818                                 case '=':
2819                                         break; // NOP
2820                                 case ',':
2821                                         IncrementSequentialIndex (ref hangulCat);
2822                                         break;
2823                                 case '<':
2824                                         if (fillIndex [hangulCat] == 2)
2825                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2826                                         fillIndex [hangulCat]--;
2827                                         break;
2828                                 case '>':
2829                                         IncrementSequentialIndex (ref hangulCat);
2830                                         for (int l = 0; l < 0x15; l++)
2831                                                 for (int v = 0; v < 0x1C; v++) {
2832                                                         AddCharMap (
2833                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2834                                                         IncrementSequentialIndex (ref hangulCat);
2835                                                 }
2836                                         syllableBlock++;
2837                                         break;
2838                                 case '[':
2839                                         start = hangulSequence [n + 1];
2840                                         end = hangulSequence [n + 3];
2841                                         for (int i = start; i <= end; i++) {
2842                                                 AddCharMap ((char) i, hangulCat, 0);
2843                                                 if (end > i)
2844                                                         IncrementSequentialIndex (ref hangulCat);
2845                                         }
2846                                         n += 4; // consumes 5 characters for this operation
2847                                         break;
2848                                 case '{':
2849                                         start = hangulSequence [n + 1];
2850                                         end = hangulSequence [n + 3];
2851                                         for (int i = start; i <= end; i++)
2852                                                 AddCharMap ((char) i, hangulCat, 0);
2853                                         n += 4; // consumes 5 characters for this operation
2854                                         break;
2855                                 default:
2856                                         AddCharMap (c, hangulCat, 0);
2857                                         break;
2858                                 }
2859                         }
2860
2861                         // Some Jamo NFKD.
2862                         for (int i = 0x3200; i < 0x3300; i++) {
2863                                 if (IsIgnorable (i) || map [i].Defined)
2864                                         continue;
2865                                 int ch = 0;
2866                                 // w/ bracket
2867                                 if (decompLength [i] == 4 &&
2868                                         decompValues [decompIndex [i]] == '(')
2869                                         ch = decompIndex [i] + 1;
2870                                 // circled
2871                                 else if (decompLength [i] == 2 &&
2872                                         decompValues [decompIndex [i] + 1] == '\u1161')
2873                                         ch = decompIndex [i];
2874                                 else if (decompLength [i] == 1)
2875                                         ch = decompIndex [i];
2876                                 else
2877                                         continue;
2878                                 ch = decompValues [ch];
2879                                 if (ch < 0x1100 || 0x1200 < ch &&
2880                                         ch < 0xAC00 || 0xD800 < ch)
2881                                         continue;
2882
2883                                 // SPECIAL CASE ?
2884                                 int offset = i < 0x3260 ? 1 : 0;
2885                                 if (0x326E <= i && i <= 0x3273)
2886                                         offset = 1;
2887
2888                                 map [i] = new CharMapEntry (map [ch].Category,
2889                                         (byte) (map [ch].Level1 + offset),
2890                                         map [ch].Level2);
2891 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2892                         }
2893
2894
2895                         #endregion
2896
2897                         // Letterlike characters and CJK compatibility square
2898                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2899                         int [] counts = new int ['Z' - 'A' + 1];
2900                         char [] namedChars = new char [sortableCharNames.Count];
2901                         int nCharNames = 0;
2902                         foreach (DictionaryEntry de in sortableCharNames) {
2903                                 counts [((string) de.Value) [0] - 'A']++;
2904                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2905                         }
2906                         nCharNames = 0; // reset
2907                         for (int a = 0; a < counts.Length; a++) {
2908                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2909                                 for (int i = 0; i < counts [a]; i++)
2910 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2911                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2912                         }
2913
2914                         // CJK unified ideograph.
2915                         byte cjkCat = 0x9E;
2916                         fillIndex [cjkCat] = 0x2;
2917                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2918                                 if (!IsIgnorable (cp))
2919                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2920                         // CJK Extensions go here.
2921                         // LAMESPEC: With this Windows-style CJK layout, it is
2922                         // impossible to add more CJK ideographs, i.e. 0x9FA6-
2923                         // 0x9FBB can never be added w/o breaking compat.
2924                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2925                                 if (!IsIgnorable (cp))
2926                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2927
2928                         // PrivateUse ... computed.
2929                         // remaining Surrogate ... computed.
2930
2931                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2932                         // non-alphanumeric ASCII except for: + - < = > '
2933                         for (int i = 0x21; i < 0x7F; i++) {
2934                                 // SPECIAL CASE: 02C6 looks regarded as
2935                                 // equivalent to '^', which does not conform
2936                                 // to Unicode standard character database.
2937                                 if (i == 0x005B)
2938                                         AddCharMap ('\u2045', 0x7, 0, 0x1C);
2939                                 if (i == 0x005D)
2940                                         AddCharMap ('\u2046', 0x7, 0, 0x1C);
2941                                 if (i == 0x005E)
2942                                         AddCharMap ('\u02C6', 0x7, 0, 3);
2943                                 if (i == 0x0060)
2944                                         AddCharMap ('\u02CB', 0x7, 0, 3);
2945
2946                                 if (Char.IsLetterOrDigit ((char) i)
2947                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2948                                         continue; // they are not added here.
2949
2950                                 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2951                                 // Insert 3001 after ',' and 3002 after '.'
2952                                 if (i == 0x2C)
2953                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2954                                 else if (i == 0x2E)
2955                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2956                                 else if (i == 0x3A)
2957                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2958                         }
2959                         #endregion
2960
2961                         #region 07 - Punctuations and something else
2962                         for (int i = 0xA0; i < char.MaxValue; i++) {
2963                                 if (IsIgnorable (i))
2964                                         continue;
2965
2966                                 // FIXME: actually those reset should not be
2967                                 // done but here I put for easy goal.
2968                                 if (i == 0x05C3)
2969                                         fillIndex [0x7]++;
2970                                 if (i == 0x0700)
2971                                         fillIndex [0x7] = 0xE2;
2972                                 if (i == 0x2016)
2973                                         fillIndex [0x7] = 0x77;
2974                                 if (i == 0x3008)
2975                                         fillIndex [0x7] = 0x93;
2976
2977                                 if (0x02C8 <= i && i <= 0x02CD)
2978                                         continue; // nonspacing marks
2979
2980                                 // SPECIAL CASE: maybe they could be allocated
2981                                 // dummy NFKD mapping and no special processing
2982                                 // would be required here.
2983                                 if (i == 0x00AF)
2984                                         AddCharMap ('\u02C9', 0x7, 0, 3);
2985                                 if (i == 0x00B4)
2986                                         AddCharMap ('\u02CA', 0x7, 0, 3);
2987                                 if (i == 0x02C7)
2988                                         AddCharMap ('\u02D8', 0x7, 0, 3);
2989
2990                                 // SPECIAL CASES:
2991                                 switch (i) {
2992                                 case 0xAB: // 08
2993                                 case 0xB7: // 0A
2994                                 case 0xBB: // 08
2995                                 case 0x02B9: // 01
2996                                 case 0x02BA: // 01
2997                                 case 0x2329: // 09
2998                                 case 0x232A: // 09
2999                                         continue;
3000                                 }
3001
3002                                 switch (Char.GetUnicodeCategory ((char) i)) {
3003                                 case UnicodeCategory.OtherPunctuation:
3004                                 case UnicodeCategory.ClosePunctuation:
3005                                 case UnicodeCategory.OpenPunctuation:
3006                                 case UnicodeCategory.ConnectorPunctuation:
3007                                 case UnicodeCategory.InitialQuotePunctuation:
3008                                 case UnicodeCategory.FinalQuotePunctuation:
3009                                 case UnicodeCategory.ModifierSymbol:
3010                                         // SPECIAL CASES: // 0xA
3011                                         if (0x2020 <= i && i <= 0x2031)
3012                                                 continue;
3013                                         if (i == 0x3003) // added later
3014                                                 continue;
3015                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3016                                         break;
3017                                 default:
3018                                         if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3019                                                 goto case UnicodeCategory.OtherPunctuation;
3020                                         break;
3021                                 }
3022                         }
3023
3024                         // Control pictures
3025                         // FIXME: it should not need to reset level 1, but
3026                         // it's for easy goal.
3027                         fillIndex [0x7] = 0xB6;
3028                         for (int i = 0x2400; i <= 0x2424; i++)
3029                                 AddCharMap ((char) i, 0x7, 1, 0);
3030
3031                         // FIXME: what are they?
3032                         AddCharMap ('\u3003', 0x7, 1);
3033                         AddCharMap ('\u3006', 0x7, 1);
3034                         AddCharMap ('\u02D0', 0x7, 1);
3035                         AddCharMap ('\u10FB', 0x7, 1);
3036                         AddCharMap ('\u0950', 0x7, 1);
3037                         AddCharMap ('\u093D', 0x7, 1);
3038                         AddCharMap ('\u0964', 0x7, 1);
3039                         AddCharMap ('\u0965', 0x7, 1);
3040                         AddCharMap ('\u0970', 0x7, 1);
3041
3042                         #endregion
3043
3044                         #region category 08 - symbols
3045                         fillIndex [0x8] = 2;
3046                         // Here Windows mapping is not straightforward. It is
3047                         // not based on computation but seems manual sorting.
3048                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
3049                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
3050                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
3051                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
3052                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
3053                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
3054                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
3055                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
3056                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
3057                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
3058                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
3059                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
3060                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
3061
3062                         for (int cp = 0; cp < 0x2300; cp++) {
3063                                 if (cp == 0xAC) // SPECIAL CASE: skip
3064                                         continue;
3065                                 if (cp == 0x200) {
3066                                         cp = 0x2200; // skip to 2200
3067                                         fillIndex [0x8] = 0x21;
3068                                 }
3069                                 if (cp == 0x2295)
3070                                         fillIndex [0x8] = 0x3;
3071                                 if (cp == 0x22A2)
3072                                         fillIndex [0x8] = 0xAB;
3073                                 if (cp == 0x22B2)
3074                                         fillIndex [0x8] = 0xB9;
3075                                 if (!map [cp].Defined &&
3076 //                                      Char.GetUnicodeCategory ((char) cp) ==
3077 //                                      UnicodeCategory.MathSymbol)
3078                                         Char.IsSymbol ((char) cp))
3079                                         AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
3080                                 // SPECIAL CASES: no idea why Windows sorts as such
3081                                 switch (cp) {
3082                                 case 0x3E:
3083                                         AddCharMap ('\u227B', 0x8, 1, 0);
3084                                         AddCharMap ('\u22B1', 0x8, 1, 0);
3085                                         break;
3086                                 case 0xB1:
3087                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
3088                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
3089                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
3090                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
3091                                         break;
3092                                 case 0xF7:
3093                                         AddCharMap ('\u01C0', 0x8, 1, 0);
3094                                         AddCharMap ('\u01C1', 0x8, 1, 0);
3095                                         AddCharMap ('\u01C2', 0x8, 1, 0);
3096                                         break;
3097                                 }
3098                         }
3099                         #endregion
3100
3101                         #region Hack!
3102
3103                         // Characters w/ diacritical marks (NFKD)
3104                         for (int i = 0; i <= char.MaxValue; i++) {
3105                                 if (map [i].Defined || IsIgnorable (i))
3106                                         continue;
3107                                 if (decompIndex [i] == 0)
3108                                         continue;
3109
3110                                 int start = decompIndex [i];
3111                                 int primaryChar = decompValues [start];
3112                                 int secondary = diacritical [i];
3113                                 bool skip = false;
3114                                 int length = decompLength [i];
3115                                 // special processing for parenthesized ones.
3116                                 if (length == 3 &&
3117                                         decompValues [start] == '(' &&
3118                                         decompValues [start + 2] == ')') {
3119                                         primaryChar = decompValues [start + 1];
3120                                         length = 1;
3121                                 }
3122
3123                                 if (map [primaryChar].Level1 == 0)
3124                                         continue;
3125
3126                                 for (int l = 1; l < length; l++) {
3127                                         int c = decompValues [start + l];
3128                                         if (map [c].Level1 != 0)
3129                                                 skip = true;
3130                                         secondary += diacritical [c];
3131                                 }
3132                                 if (skip)
3133                                         continue;
3134                                 map [i] = new CharMapEntry (
3135                                         map [primaryChar].Category,
3136                                         map [primaryChar].Level1,
3137                                         (byte) secondary);
3138
3139                         }
3140
3141                         // Diacritical weight adjustment
3142
3143                         // Arabic Hamzah
3144                         diacritical [0x624] = 0x5;
3145                         diacritical [0x626] = 0x7;
3146                         diacritical [0x622] = 0x9;
3147                         diacritical [0x623] = 0xA;
3148                         diacritical [0x625] = 0xB;
3149                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
3150                         diacritical [0x64A] = 0x7; // Yaa'
3151
3152                         for (int i = 0; i < char.MaxValue; i++) {
3153                                 byte mod = 0;
3154                                 byte cat = map [i].Category;
3155                                 switch (cat) {
3156                                 case 0xE: // Latin diacritics
3157                                 case 0x22: // Japanese: circled characters
3158                                         mod = diacritical [i];
3159                                         break;
3160                                 case 0x13: // Arabic
3161                                         if (diacritical [i] == 0 && i >= 0xFE8D)
3162                                                 mod = 0x8; // default for arabic
3163                                         break;
3164                                 }
3165                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
3166                                         mod = diacritical [i];
3167                                 if (mod > 0)
3168                                         map [i] = new CharMapEntry (
3169                                                 cat, map [i].Level1, mod);
3170                         }
3171
3172                         // FIXME: this is half a hack, but characters that are
3173                         // NonSpacingMark and still undefined at this point are
3174                         // likely to be nonspacing.
3175                         for (int i = 0; i < char.MaxValue; i++) {
3176                                 if (map [i].Defined ||
3177                                         IsIgnorable (i))
3178                                         continue;
3179                                 switch (i) {
3180                                 // SPECIAL CASES.
3181                                 case 0x02B9:
3182                                 case 0x02BA:
3183                                         break;
3184                                 default:
3185                                         if (Char.GetUnicodeCategory ((char) i) !=
3186                                         UnicodeCategory.NonSpacingMark)
3187                                                 continue;
3188                                         break;
3189                                 }
3190                                 if (diacritical [i] != 0)
3191                                         map [i] = new CharMapEntry (1, 1, diacritical [i]);
3192                                 else
3193                                         AddCharMap ((char) i, 1, 1);
3194                         }
3195
3196                         #endregion
3197                 }
3198
3199                 private void IncrementSequentialIndex (ref byte hangulCat)
3200                 {
3201                         // Step the fill index of the current category; a result of 0
3202                         // means the index overflowed, so this category is used up.
3203                         if (++fillIndex [hangulCat] != 0)
3204                                 return;
3205                         fillIndex [++hangulCat] = 0x2; // move the caller on to the next category, starting at 0x2
3206                 }
3207
3208                 // Resets fillIndex [category] to alphaWeight, then maps c and every
3209                 // code point that latinMap lists for c through AddLetterMap ().
3210                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3211                 {
3212                         fillIndex [category] = alphaWeight;
3213                         AddLetterMap (c, category, 0);
3214                         // No ArrayList registered for c means there is nothing more to map.
3215                         ArrayList related = latinMap [c] as ArrayList;
3216                         if (related != null) {
3217                                 foreach (int codePoint in related)
3218                                         AddLetterMap ((char) codePoint, category, 0);
3219                         }
3220                 }
3221
3222                 private void AddKanaMap (int i, byte voices)
3223                 {
3224                         // Maps `voices` code points from i (Hiragana) plus each one's +0x60 Katakana.
3225                         for (byte n = 0; n < voices; n++) {
3226                                 // level2 passed on: 0 for the first code point, n + 2 afterwards.
3227                                 byte level2 = n == 0 ? (byte) 0 : (byte) (n + 2);
3228                                 char hiragana = (char) (i + n);
3229                                 AddLetterMapCore (hiragana, 0x22, 0, level2, false);
3230                                 AddLetterMapCore ((char) (hiragana + 0x60), 0x22, 0, level2, false);
3231                         }
3232                 }
3233
3234                 private void AddLetterMap (char c, byte category, byte updateCount) // shorthand for AddLetterMapCore ()
3235                 {
3236                         AddLetterMapCore (c, category, updateCount, 0, true); // level2 = 0, deferLevel2 = true
3237                 }
3238
3239                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3240                 {
3241                         // Small form first: it is the only call that hands updateCount to AddCharMapGroup ().
3242                         char small = ToSmallForm (c);
3243                         if (small != c)
3244                                 AddCharMapGroup (small, category, updateCount, level2, deferLevel2);
3245
3246                         char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
3247                         if (lower != c && !map [(int) lower].Defined)
3248                                 AddLetterMapCore (lower, category, 0, level2, deferLevel2);
3249
3250                         // Ignorable or already-defined characters change neither the map nor the index.
3251                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3252                                 return;
3253                         // Map c itself, then advance this category's fill index by updateCount.
3254                         AddCharMapGroup (c, category, 0, level2, deferLevel2);
3255                         fillIndex [category] += updateCount;
3256                 }
3257
// Overload of AddCharMap() that passes 0 as the `alt` weight.
// Returns whatever the four-argument overload returns.
private bool AddCharMap (char c, byte category, byte increment)
{
        const byte noAlt = 0;

        return AddCharMap (c, category, increment, noAlt);
}
3262
// Stores a map entry for c using the current fillIndex value of
// `category`, then advances that fillIndex by `increment`.
// Returns false, changing nothing, when c is ignorable or already
// mapped; returns true otherwise.
//
// The entry is normally built as (category, fillIndex, alt). For
// category 1 the last two arguments are swapped, i.e. the entry is
// built as (category, alt, fillIndex).
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
        int cp = (int) c;
        if (IsIgnorable (cp) || map [cp].Defined)
                return false; // do nothing

        if (category == 1)
                map [cp] = new CharMapEntry (category, alt, fillIndex [category]);
        else
                map [cp] = new CharMapEntry (category, fillIndex [category], alt);

        fillIndex [category] += increment;
        return true;
}
3273
//
// AddCharMapGroup() adds characters to the table in the order
// below ("+" marks the points where the weight increases):
//      (<small> +)
//      itself
//      <fraction>
//      <full> | <super> | <sub>
//      <circle> | <wide> (| <narrow>)
//      +
//      (vertical +)
//
// level2 is fixed (does not increase).
//
// This array lists the decomposition types whose characters are
// added right after the character itself, before the weight is
// increased, in the order they are looked up.
int [] sameWeightItems = {
        DecompositionFraction,
        DecompositionFull,
        DecompositionSuper,
        DecompositionSub,
        DecompositionCircle,
        DecompositionWide,
        DecompositionNarrow,
};
// Overload of AddCharMapGroup() that uses the given level2 value
// as is (deferLevel2 is false).
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
        const bool deferLevel2 = false;

        AddCharMapGroup (c, category, updateCount, level2, deferLevel2);
}
3299
// Registers c together with the variants recorded for it in nfkdMap.
// Does nothing when c is already mapped.
//
// Order of registration:
//   1. the <small> variant, if any (advances the index by updateCount);
//   2. c itself;
//   3. every variant whose type is listed in sameWeightItems;
//   4. fillIndex [category] += updateCount;
//   5. the <vertical> variant, if any (advances the index again).
//
// When deferLevel2 is set the incoming level2 is replaced by the
// `diacritical` table value: for c up front, for each step 3 variant
// as it is added, and for the small/vertical variants only if level2
// is 0 at that point. Note that a value picked up in step 3 stays in
// effect for step 5.
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
        int cp = (int) c;
        if (map [cp].Defined)
                return;

        if (deferLevel2)
                level2 = diacritical [cp];

        // Look up the two variants that get a weight of their own.
        // char.MinValue stands for "no such variant".
        char smallForm = char.MinValue;
        char verticalForm = char.MinValue;
        Hashtable variants = (Hashtable) nfkdMap [cp];
        if (variants != null) {
                object boxedSmall = variants [(byte) DecompositionSmall];
                if (boxedSmall != null)
                        smallForm = (char) ((int) boxedSmall);
                object boxedVertical = variants [(byte) DecompositionVertical];
                if (boxedVertical != null)
                        verticalForm = (char) ((int) boxedVertical);
        }

        // <small> updates index
        if (smallForm != char.MinValue) {
                if (deferLevel2 && level2 == 0)
                        level2 = diacritical [smallForm];
                AddCharMap (smallForm, category, updateCount, level2);
        }

        // itself
        AddCharMap (c, category, 0, level2);

        // variants sharing the weight of the character itself
        if (variants != null) {
                for (int n = 0; n < sameWeightItems.Length; n++) {
                        object boxedVariant = variants [(byte) sameWeightItems [n]];
                        if (boxedVariant == null)
                                continue;
                        int variant = (int) boxedVariant;
                        if (deferLevel2)
                                level2 = diacritical [variant];
                        AddCharMap ((char) variant, category, 0, level2);
                }
        }

        // update index here.
        fillIndex [category] += updateCount;

        if (verticalForm != char.MinValue) {
                if (deferLevel2 && level2 == 0)
                        level2 = diacritical [verticalForm];
                AddCharMap (verticalForm, category, updateCount, level2);
        }
}
3350
// Registers c at the current position of `category` (with no index
// increment and alt weight 0) and then moves to the next position
// via IncrementSequentialIndex(), which may change `category`.
private void AddCharMapCJK (char c, ref byte category)
{
        AddCharMap (c, category, 0, 0);
        IncrementSequentialIndex (ref category);

        // Special. I wonder why but Windows skips 9E F9, so step
        // over that position once more.
        bool atSkippedPosition =
                category == 0x9E && fillIndex [category] == 0xF9;
        if (atSkippedPosition)
                IncrementSequentialIndex (ref category);
}
3360
// Registers the CJK character c through AddCharMapCJK(), followed by
// its variants:
//   - for four specific ideographs, a fixed list of characters from
//     the 3200-32B0 area, added directly after c (see LAMESPEC);
//   - then every variant recorded in nfkdMap under the keys
//     0 .. 0x12, except those in F900-FAD9 and except the
//     characters already handled by the fixed lists.
private void AddCharMapGroupCJK (char c, ref byte category)
{
        AddCharMapCJK (c, ref category);

        // LAMESPEC: on Windows some of CJK characters in 3200-32B0
        // are incorrectly mapped. They mix Chinese and Japanese
        // Kanji when ordering those characters.
        switch (c) {
        case '\u5B78':
                AddCharMapCJK ('\u32AB', ref category);
                AddCharMapCJK ('\u323B', ref category);
                break;
        case '\u52DE':
                AddCharMapCJK ('\u3298', ref category);
                AddCharMapCJK ('\u3238', ref category);
                break;
        case '\u5BEB':
                AddCharMapCJK ('\u32A2', ref category);
                break;
        case '\u91AB':
                // Especially this mapping order totally does
                // not make sense to me.
                AddCharMapCJK ('\u32A9', ref category);
                break;
        }

        Hashtable variants = (Hashtable) nfkdMap [(int) c];
        if (variants == null)
                return;

        for (byte kind = 0; kind <= 0x12; kind++) {
                object boxed = variants [kind];
                if (boxed == null)
                        continue;
                int variant = (int) boxed;

                // Special: they are ignored in this area.
                // FIXME: check if it is sane
                bool inIgnoredArea = variant >= 0xF900 && variant <= 0xFAD9;

                // Already added by the fixed lists above.
                bool addedAbove =
                        variant == 0x32A2 || variant == 0x3298 ||
                        variant == 0x3238 || variant == 0x32A9 ||
                        variant == 0x323B || variant == 0x32AB;

                if (!inIgnoredArea && !addedAbove)
                        AddCharMapCJK ((char) variant, ref category);
        }
}
3407
// For now it is only for 0x7 category.
//
// Registers c together with every not-yet-mapped character whose
// decomposition consists of exactly c. Does nothing when c is already
// mapped. The decomposition tables are scanned three times:
//   1. <small> variants come first; if at least one was found the
//      index is advanced by updateCount afterwards;
//   2. <sub>, <super>, <wide> and <narrow> variants get the same
//      weight as c;
//   3. c itself is added (advancing the index), followed by the
//      variants of any other decomposition type, each of which
//      advances the index as well.
private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
{
        int target = (int) c;
        if (map [target].Defined)
                return;

        // Process in advance (lower primary weight)
        bool smallFound = false;
        for (int cp = 0; cp < char.MaxValue; cp++) {
                if (map [cp].Defined || decompLength [cp] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [cp]]) != target)
                        continue;
                if (decompType [cp] == DecompositionSmall) {
                        smallFound = true;
                        AddCharMap ((char) cp, category, 0, level2);
                }
        }
        if (smallFound)
                fillIndex [category] = (byte)
                        (fillIndex [category] + updateCount);

        // Identical weight
        for (int cp = 0; cp < char.MaxValue; cp++) {
                if (map [cp].Defined || decompLength [cp] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [cp]]) != target)
                        continue;
                int type = decompType [cp];
                if (type == DecompositionSub ||
                        type == DecompositionSuper ||
                        type == DecompositionWide ||
                        type == DecompositionNarrow)
                        AddCharMap ((char) cp, category, 0, level2);
        }

        // itself
        AddCharMap (c, category, updateCount, level2);

        // Since nfkdMap is problematic to have two or more
        // NFKD to an identical character, here I iterate all.
        for (int cp = 0; cp < char.MaxValue; cp++) {
                if (map [cp].Defined || decompLength [cp] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [cp]]) != target)
                        continue;
                int type = decompType [cp];
                bool coveredByEarlierPass =
                        type == DecompositionWide ||
                        type == DecompositionNarrow ||
                        type == DecompositionSmall ||
                        type == DecompositionSub ||
                        type == DecompositionSuper;
                if (!coveredByEarlierPass)
                        AddCharMap ((char) cp, category, updateCount, level2);
        }
}
3473
// Registers c in category 6 and gives the same weight to every
// character whose decomposition ends with c. The category index is
// advanced by one afterwards, whether or not anything was added.
private void AddArabicCharMap (char c)
{
        const byte category = 6;
        const byte updateCount = 1;
        const byte level2 = 0;

        // itself
        AddCharMap (c, category, 0, level2);

        // Since nfkdMap is problematic to have two or more
        // NFKD to an identical character, here I iterate all.
        for (int cp = 0; cp < char.MaxValue; cp++) {
                if (decompLength [cp] != 0) {
                        // position of the last decomposition element
                        int last = decompIndex [cp] + decompLength [cp] - 1;
                        if ((int) (decompValues [last]) == (int) c)
                                AddCharMap ((char) cp, category,
                                        0, level2);
                }
        }
        fillIndex [category] += updateCount;
}
3495
// Returns the first element of the decomposition of c when its
// decomposition type is DecompositionSmall; otherwise returns c
// unchanged (see ToDecomposed()).
char ToSmallForm (char c)
{
        const bool useLastElement = false;

        return ToDecomposed (c, DecompositionSmall, useLastElement);
}
3500
// Returns one element of the decomposition of c, provided the
// decomposition type of c equals d: the last element when `tail`
// is set, the first one otherwise. Returns c itself when its
// decomposition type is different.
char ToDecomposed (char c, byte d, bool tail)
{
        int cp = (int) c;
        if (decompType [cp] != d)
                return c;

        int offset = tail ? decompLength [cp] - 1 : 0;
        return (char) decompValues [decompIndex [cp] + offset];
}
3510
// Tells whether jisJapanese contains an entry whose CP equals cp.
bool ExistsJIS (int cp)
{
        bool found = false;
        foreach (JISCharacter entry in jisJapanese) {
                if (entry.CP == cp) {
                        found = true;
                        break;
                }
        }
        return found;
}
3518
3519                 #endregion
3520
3521                 #region Level 3 properties (Case/Width)
3522
// Returns the level 3 weight of c: the value computed by
// ComputeLevel3WeightRaw() plus 2, or 0 when the raw value is 0.
private byte ComputeLevel3Weight (char c)
{
        byte raw = ComputeLevel3WeightRaw (c);
        if (raw == 0)
                return raw;
        return (byte) (raw + 2);
}
3528
// Computes the raw level 3 (case/width) weight of c. The caller
// (ComputeLevel3Weight) adds 2 to non-zero results to obtain the
// sortkey value.
//
// The checks are evaluated top to bottom and the first matching
// range or character decides the result. Characters that match none
// of the fixed rules get a value combined from their decomposition
// type (<wide>, <sub>, <super>), the "small capital" flag (8) and
// the uppercase flag (0x10).
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
        // CJK compat
        if ('\u3192' <= c && c <= '\u319F')
                return 0;

        // They have <narrow> NFKD mapping, and on Windows
        // those narrow characters are regarded as "normal",
        // thus those characters themselves are regarded as
        // "wide". grep "<narrow>" and you can pick them up
        // (ignoring Kana, Hangul etc.)
        switch (c) {
        case '\u3002':
        case '\u300C':
        case '\u300D':
        case '\u3001':
        case '\u30FB':
        case '\u2502':
        case '\u2190':
        case '\u2191':
        case '\u2192':
        case '\u2193':
        case '\u25A0':
        case '\u25CB':
                return 1;
        }
        // Korean
        if ('\u11A8' <= c && c <= '\u11F9')
                return 2;
        if ('\uFFA0' <= c && c <= '\uFFDC')
                return 4;
        if ('\u3130' <= c && c <= '\u3164')
                return 5;
        if ('\u3165' <= c && c <= '\u318E')
                return 4;
        // Georgian Capital letters
        if ('\u10A0' <= c && c <= '\u10C5')
                return 0x10;
        // numbers
        if ('\u2776' <= c && c <= '\u277F')
                return 4;
        if ('\u2780' <= c && c <= '\u2789')
                return 8;
        // U+2776..U+2789 have already returned above, so only
        // U+278A..U+2793 can get here.
        if ('\u278A' <= c && c <= '\u2793')
                return 0xC;
        if ('\u2160' <= c && c <= '\u216F')
                return 0x10;
        if ('\u2181' <= c && c <= '\u2182')
                return 0x10;
        // Letterlike symbols U+2135..U+2138 (grouped with Arabic in
        // the original comment).
        if ('\u2135' <= c && c <= '\u2138')
                return 4;
        // Arabic: in these two ranges the weight follows the code
        // point modulo 4. A switch avoids allocating a lookup array
        // on every call.
        if ('\uFEB5' <= c && c < '\uFEED' ||
                '\uFEF1' <= c && c < '\uFEF5') {
                switch (c % 4) {
                case 0:
                        return 0x18;
                case 1:
                        return 0;
                case 2:
                        return 0x8;
                default:
                        return 0x10;
                }
        }
        if ('\uFE80' <= c && c < '\uFF00') {
                // 2(Isolated)/8(Final)/0x18(Medial)
                switch (decompType [(int) c]) {
                case DecompositionIsolated:
                        return 2;
                case DecompositionFinal:
                        return 8;
                case DecompositionMedial:
                        return 0x18;
                }
                // other decomposition types fall through to the
                // generic rules below
        }

        // actually I dunno the reason why they have weights.
        switch (c) {
        case '\u01BC':
                return 0x10;
        case '\u06A9':
                return 0x20;
        case '\u06AA':
                return 0x28;
        // Gurmukhi
        case '\u0A39':
        case '\u0A59':
        case '\u0A5A':
        case '\u0A5B':
        case '\u0A5E':
                return 0x10;
        }

        // Start value for a few individual characters; the flags
        // below are OR-ed on top of it.
        byte ret = 0;
        switch (c) {
        case '\u03C2':
        case '\u2104':
        case '\u212B':
                ret = 8;
                break;
        case '\uFE42':
                ret = 0xA;
                break;
        }

        // misc: for these three types the decomposition type value
        // itself is used as a flag.
        switch (decompType [(int) c]) {
        case DecompositionWide: // <wide>
        case DecompositionSub: // <sub>
        case DecompositionSuper: // <super>
                ret |= decompType [(int) c];
                break;
        }
        if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
                ret |= 8;
        if (isUppercase [(int) c]) // DerivedCoreProperties
                ret |= 0x10;

        return ret;
}
3641
3642                 #endregion
3643
3644                 #region IsIgnorable
3645 /*
3646                 static bool IsIgnorable (int i)
3647                 {
3648                         if (unicodeAge [i] >= 3.1)
3649                                 return true;
3650                         switch (char.GetUnicodeCategory ((char) i)) {
3651                         case UnicodeCategory.OtherNotAssigned:
3652                         case UnicodeCategory.Format:
3653                                 return true;
3654                         }
3655                         return false;
3656                 }
3657 */
3658
// Tells whether the code point i is ignored (gets no sort key).
//
// The decision is made in three steps:
//   1. individually listed code points (ignored, or explicitly not
//      ignored although they sit inside one of the ranges below);
//   2. hard-coded ranges that are ignored as a whole;
//   3. the Unicode category: Format and OtherNotAssigned are
//      ignored, everything else is not.
//
// FIXME: In the future use DerivedAge.txt to examine character
// versions and set those ones that have higher version than
// 1.0 as ignorable.
static bool IsIgnorable (int i)
{
        switch (i) {
        case 0:
        // I guess, those characters are added between
        // Unicode 1.0 (LCMapString) and Unicode 3.1
        // (UnicodeCategory), so they used to be
        // something like OtherNotAssigned as of Unicode 1.1.
        case 0x2df: case 0x387:
        case 0x3d7: case 0x3d8: case 0x3d9:
        case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
        case 0x400: case 0x40d: case 0x450: case 0x45d:
        case 0x587: case 0x58a: case 0x5c4: case 0x640:
        case 0x653: case 0x654: case 0x655: case 0x66d:
        case 0xb56:
        case 0x1e9b: case 0x202f: case 0x20ad:
        case 0x20ae: case 0x20af:
        case 0x20e2: case 0x20e3:
        case 0x2139: case 0x213a: case 0x2183:
        case 0x2425: case 0x2426: case 0x2619:
        case 0x2670: case 0x2671: case 0x3007:
        case 0x3190: case 0x3191:
        case 0xfffc: case 0xfffd:
                return true;
        // Exceptional characters that would otherwise be caught
        // by the range checks below. Those ranges are not quite
        // correct (these characters should not be ignored), and
        // most of the exceptions unfortunately fall inside them.
        case 0x4d8: case 0x4d9:
        case 0x4e8: case 0x4e9:
        case 0x70F:
        case 0x3036: case 0x303f:
        case 0x337b: case 0xfb1e:
                return false;
        }

        // Whole scripts and script extensions.
        if ((i >= 0x0D82 && i <= 0x0DF4)        // Sinhala
                || (i >= 0x0F00 && i <= 0x0FD1) // Tibetan
                || (i >= 0x1000 && i <= 0x1059) // Myanmar
                // Ethiopic, Cherokee, Canadian Syllabics, Ogham,
                // Runic, Tagalog, Hanunoo, Philippine, Buhid,
                // Tagbanwa, Khmer and Mongolian
                || (i >= 0x1200 && i <= 0x1DFF)
                || (i >= 0x1F00 && i <= 0x1FFF) // Greek extension
                || (i >= 0x2800 && i <= 0x28FF) // Braille
                || (i >= 0x2E80 && i <= 0x2EF3) // CJK radicals
                || (i >= 0x2F00 && i <= 0x2FD5) // Kangxi radicals
                || (i >= 0x2FF0 && i <= 0x2FFB) // Ideographic description
                || (i >= 0x31A0 && i <= 0x31B7) // Bopomofo letter and final
                || (i >= 0x25F0 && i <= 0x25F7) // White square with quadrant
                // Ideographic telegraph symbols.
                || (i >= 0x32C0 && i <= 0x32CB)
                || (i >= 0x3358 && i <= 0x3370)
                || (i >= 0x33E0 && i <= 0x33FF)
                // The whole YI characters.
                || (i >= 0xA000 && i <= 0xA48C)
                || (i >= 0xA490 && i <= 0xA4C6)
                || (i >= 0xFB13 && i <= 0xFB17) // Armenian small ligatures
                || (i >= 0xFB1D && i <= 0xFE2F) // hebrew, arabic, variation selector
                || (i >= 0xFEF5 && i <= 0xFEFC)) // Arabic ligatures
                return true;

        // FIXME: why are they excluded?
        if ((i >= 0x01F6 && i <= 0x01F9)
                || (i >= 0x0218 && i <= 0x0233)
                || (i >= 0x02A9 && i <= 0x02AD)
                || (i >= 0x02EA && i <= 0x02EE)
                || (i >= 0x0349 && i <= 0x036F)
                || (i >= 0x0488 && i <= 0x048F)
                || (i >= 0x04D0 && i <= 0x04FF)
                || (i >= 0x0500 && i <= 0x050F) // actually it matters only for 2.0
                || (i >= 0x06D6 && i <= 0x06ED)
                || (i >= 0x06FA && i <= 0x06FE))
                return true;

        if ((i >= 0x2048 && i <= 0x204D)
                || (i >= 0x20e4 && i <= 0x20ea)
                || (i >= 0x213C && i <= 0x214B)
                || (i >= 0x21EB && i <= 0x21FF)
                || (i >= 0x22F2 && i <= 0x22FF)
                || (i >= 0x237B && i <= 0x239A)
                || (i >= 0x239B && i <= 0x23CF)
                || (i >= 0x24EB && i <= 0x24FF)
                || (i >= 0x2596 && i <= 0x259F)
                || (i >= 0x25F8 && i <= 0x25FF)
                || (i >= 0x2672 && i <= 0x2689)
                || (i >= 0x2768 && i <= 0x2775)
                || (i >= 0x27d0 && i <= 0x27ff)
                || (i >= 0x2900 && i <= 0x2aff))
                return true;

        if ((i >= 0x3033 && i <= 0x303F)
                || (i >= 0x31F0 && i <= 0x31FF)
                || (i >= 0x3250 && i <= 0x325F)
                || (i >= 0x32B1 && i <= 0x32BF)
                || (i >= 0x3371 && i <= 0x337B)
                || (i >= 0xFA30 && i <= 0xFA6A))
                return true;

        // Format and unassigned characters are ignored by nature.
        // Everything else, including PrivateUse and Surrogate, is
        // not ignorable.
        UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
        return uc == UnicodeCategory.Format
                || uc == UnicodeCategory.OtherNotAssigned;
}
3786
3787                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3788
3789                 /*
3790                 public static void Main ()
3791                 {
3792                         for (int i = 0; i <= char.MaxValue; i++)
3793                                 Dump (i, IsIgnorable (i));
3794                 }
3795
3796                 static void Dump (int i, bool ignore)
3797                 {
3798                         switch (Char.GetUnicodeCategory ((char) i)) {
3799                         case UnicodeCategory.PrivateUse:
3800                         case UnicodeCategory.Surrogate:
3801                                 return; // check nothing
3802                         }
3803
3804                         string s1 = "";
3805                         string s2 = new string ((char) i, 10);
3806                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3807                         if ((ret == 0) == ignore)
3808                                 return;
3809                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3810                 }
3811                 */
3812                 #endregion // IsIgnorable
3813
3814                 #region IsIgnorableSymbol
// Returns true when code point i is classified as an ignorable symbol.
//
// Rules are tried in order and the first one that matches decides:
//   1. anything IsIgnorable () accepts is ignorable;
//   2. explicit per-code-point exceptions (both directions);
//   3. a few letter ranges that are always ignorable;
//   4. rules keyed on the Unicode general category of (char) i.
//
// i is narrowed to char for every System.Char query, so it is expected
// to be a single UTF-16 code unit.
// The commented-out driver that follows this method can be used to
// compare the result against CompareOptions.IgnoreSymbols.
static bool IsIgnorableSymbol (int i)
{
        if (IsIgnorable (i))
                return true;

        // Per-code-point exceptions. The group comments name the Unicode
        // category the listed code points were collected under.
        switch (i) {
        // *Letter
        case 0x00b5: case 0x01C0: case 0x01C1:
        case 0x01C2: case 0x01C3: case 0x01F6:
        case 0x01F7: case 0x01F8: case 0x01F9:
        case 0x02D0: case 0x02EE: case 0x037A:
        case 0x03D7: case 0x03F3:
        case 0x0400: case 0x040d:
        case 0x0450: case 0x045d:
        case 0x048C: case 0x048D:
        case 0x048E: case 0x048F:
        case 0x0587: case 0x0640: case 0x06E5:
        case 0x06E6: case 0x06FA: case 0x06FB:
        case 0x06FC: case 0x093D: case 0x0950:
        case 0x1E9B: case 0x2139: case 0x3006:
        case 0x3033: case 0x3034: case 0x3035:
        case 0xFE7E: case 0xFE7F:
        // OtherNumber
        case 0x16EE: case 0x16EF: case 0x16F0:
        // LetterNumber
        case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
        case 0x3007: // IDEOGRAPHIC NUMBER ZERO
        case 0x3038: // HANGZHOU NUMERAL TEN
        case 0x3039: // HANGZHOU NUMERAL TWENTY
        case 0x303a: // HANGZHOU NUMERAL THIRTY
        // OtherSymbol
        case 0x2117:
        case 0x327F:
                return true;
        // ModifierSymbol
        case 0x02B9: case 0x02BA: case 0x02C2:
        case 0x02C3: case 0x02C4: case 0x02C5:
        case 0x02C8: case 0x02CC: case 0x02CD:
        case 0x02CE: case 0x02CF: case 0x02D2:
        case 0x02D3: case 0x02D4: case 0x02D5:
        case 0x02D6: case 0x02D7: case 0x02DE:
        case 0x02E5: case 0x02E6: case 0x02E7:
        case 0x02E8: case 0x02E9:
        case 0x309B: case 0x309C:
        // OtherPunctuation
        case 0x055A: // Armenian APOSTROPHE
        case 0x05C0: // Hebrew Punct
        case 0x0E4F: // Thai FONGMAN
        case 0x0E5A: // Thai ANGKHANKHU
        case 0x0E5B: // Thai KHOMUT
        // CurrencySymbol
        case 0x09F2: // Bengali Rupee Mark
        case 0x09F3: // Bengali Rupee Sign
        // MathSymbol
        case 0x221e: // INF.
        // OtherSymbol
        case 0x0482:
        case 0x09FA:
        case 0x0B70:
                return false;
        }

        // *Letter
        if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
#if NET_2_0
                || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
                || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
#endif
        )
                return true;

        char c = (char) i;
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.Surrogate:
                return false; // inconsistent

        case UnicodeCategory.SpacingCombiningMark:
        case UnicodeCategory.EnclosingMark:
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.PrivateUse:
                // Only the Arabic NonSpacingMark range is ignorable here.
                return 0x064B <= i && i <= 0x0652;

        case UnicodeCategory.Format:
        case UnicodeCategory.OtherNotAssigned:
                return true;

        default:
                // OtherSymbol ranges that are never ignorable. (The
                // letter-or-digit members of these ranges were already
                // non-ignorable through the rule right below, so no
                // separate test is needed for them.)
                if (
                        // latin in a circle
                        0x249A <= i && i <= 0x24E9
                        || 0x2100 <= i && i <= 0x2132
                        // Japanese
                        || 0x3196 <= i && i <= 0x31A0
                        // Korean
                        || 0x3200 <= i && i <= 0x321C
                        // Chinese/Japanese
                        || 0x322A <= i && i <= 0x3243
                        // CJK
                        || 0x3260 <= i && i <= 0x32B0
                        || 0x32D0 <= i && i <= 0x3357
                        || 0x337B <= i && i <= 0x33DD
                )
                        return false;

                // This "Digit" rule is a mystery: it filters some
                // symbols out before the generic tests below.
                if (Char.IsLetterOrDigit (c) || Char.IsNumber (c))
                        return false;

                // FIXME: should check more
                return Char.IsControl (c)
                        || Char.IsSeparator (c)
                        || Char.IsPunctuation (c)
                        || Char.IsSymbol (c);
        }
}
3943
3944                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3945 /*
3946                 public static void Main ()
3947                 {
3948                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3949                         for (int i = 0; i <= char.MaxValue; i++) {
3950                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3951                                 if (uc == UnicodeCategory.Surrogate)
3952                                         continue;
3953
3954                                 bool ret = IsIgnorableSymbol (i);
3955
3956                                 string s1 = "TEST ";
3957                                 string s2 = "TEST " + (char) i;
3958
3959                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3960
3961                                 if (ret != (result == 0))
3962                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3963                                                 ret ? "should not ignore" :
3964                                                         "should ignore",
3965                                                 i,(char) i, uc);
3966                         }
3967                 }
3968 */
3969                 #endregion
3970
3971                 #region NonSpacing
// Returns true when code point i is classified as an ignorable
// nonspacing character. Rules are tried in order; the first match wins:
//   1. anything IsIgnorable () accepts            -> true
//   2. explicit per-code-point exceptions          -> true or false
//   3. ranges that are always ignorable            -> true
//   4. ranges that are never ignorable             -> false
//   5. everything else: true only for UnicodeCategory.NonSpacingMark.
static bool IsIgnorableNonSpacing (int i)
{
        if (IsIgnorable (i))
                return true;

        switch (i) {
        // Single code points that are never ignorable.
        case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
        case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
        case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
        case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
        case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
        case 0x0CCD: case 0x0E4E:
                return false;
        // Single code points that are always ignorable.
        case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
        case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
        case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
                return true;
        }

        bool alwaysIgnored =
                (0x02B9 <= i && i <= 0x02C5)
                || (0x02CC <= i && i <= 0x02D7)
                || (0x02E4 <= i && i <= 0x02EF)
                || (0x20DD <= i && i <= 0x20E0);
        if (alwaysIgnored)
                return true;

        bool neverIgnored =
                (0x064B <= i && i <= 0x0652)
                || (0x0941 <= i && i <= 0x0948)
                || (0x0AC1 <= i && i <= 0x0ACD)
                || (0x0C3E <= i && i <= 0x0C4F)
                || (0x0E31 <= i && i <= 0x0E3F);
        if (neverIgnored)
                return false;

        // No special case applied: decide by the general category.
        UnicodeCategory category = Char.GetUnicodeCategory ((char) i);
        return category == UnicodeCategory.NonSpacingMark;
}
4009
4010                 // We can reuse IsIgnorableSymbol testcode
4011                 // for IsIgnorableNonSpacing.
4012                 #endregion
4013         }
4014
// One map entry holding a category byte and the level 1 / level 2
// weight bytes of a character.
// Defined tells a constructed entry apart from a default-initialized
// one: the constructor sets it, the implicit default leaves it false.
struct CharMapEntry
{
        public byte Category;
        public byte Level1;
        public byte Level2; // It is always single byte.
        public bool Defined;

        // Builds a defined entry from its three bytes.
        public CharMapEntry (byte category, byte level1, byte level2)
        {
                this.Defined = true;
                this.Category = category;
                this.Level1 = level1;
                this.Level2 = level2;
        }
}
4030
// Immutable pair of a code point (CP) and the JIS value it is paired
// with (JIS). JISComparer orders instances by the JIS member.
class JISCharacter
{
        public readonly int CP;
        public readonly int JIS;

        // cp is stored in CP, cpJIS in JIS; neither is validated.
        public JISCharacter (int cp, int cpJIS)
        {
                this.CP = cp;
                this.JIS = cpJIS;
        }
}
4042
// Orders JISCharacter instances by ascending JIS value.
class JISComparer : IComparer
{
        // Shared instance; the comparer keeps no state.
        public static readonly JISComparer Instance =
                new JISComparer ();

        // Returns a negative value, zero or a positive value when o1's
        // JIS value is less than, equal to or greater than o2's.
        // Both arguments must be JISCharacter instances; any other type
        // makes the cast throw InvalidCastException.
        public int Compare (object o1, object o2)
        {
                JISCharacter j1 = (JISCharacter) o1;
                JISCharacter j2 = (JISCharacter) o2;
                // CompareTo instead of "j1.JIS - j2.JIS": the difference
                // of two ints can overflow and report the wrong sign.
                return j1.JIS.CompareTo (j2.JIS);
        }
}
4055
// Immutable pair of a code point (CP) and a name string (Name).
// NonJISComparer orders instances by the Name member.
class NonJISCharacter
{
        public readonly int CP;
        public readonly string Name;

        // Stores both arguments as given; name is not checked for null.
        public NonJISCharacter (int cp, string name)
        {
                this.CP = cp;
                this.Name = name;
        }
}
4067
// Orders NonJISCharacter instances by the ordinal (code unit) order of
// their Name strings.
class NonJISComparer : IComparer
{
        // Shared instance; the comparer keeps no state.
        public static readonly NonJISComparer Instance =
                new NonJISComparer ();

        // Both arguments must be NonJISCharacter instances; the result
        // is whatever string.CompareOrdinal reports for the two names.
        public int Compare (object o1, object o2)
        {
                NonJISCharacter left = (NonJISCharacter) o1;
                NonJISCharacter right = (NonJISCharacter) o2;
                string leftName = left.Name;
                string rightName = right.Name;
                return string.CompareOrdinal (leftName, rightName);
        }
}
4080
// Orders DictionaryEntry items whose Value is a boxed decimal and whose
// Key is a boxed int: ascending by Value first, then ascending by Key.
class DecimalDictionaryValueComparer : IComparer
{
        // The only instance; the constructor is private.
        public static readonly DecimalDictionaryValueComparer Instance
                = new DecimalDictionaryValueComparer ();

        private DecimalDictionaryValueComparer ()
        {
        }

        // Returns a negative value, zero or a positive value following
        // the usual IComparer sign convention.
        // Both arguments must be DictionaryEntry values with a decimal
        // Value and an int Key, otherwise unboxing throws
        // InvalidCastException.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // FIXME: in case of 0, compare decomposition categories
                int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
                if (ret != 0)
                        return ret;
                // Equal values: fall back to the keys. CompareTo is used
                // instead of "i1 - i2" because the difference of two ints
                // can overflow and report the wrong sign.
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                return i1.CompareTo (i2);
        }
}
4103
// Orders DictionaryEntry items whose Value is a string and whose Key is
// a boxed int: by Value first, then ascending by Key.
class StringDictionaryValueComparer : IComparer
{
        // The only instance; the constructor is private.
        public static readonly StringDictionaryValueComparer Instance
                = new StringDictionaryValueComparer ();

        private StringDictionaryValueComparer ()
        {
        }

        // Returns a negative value, zero or a positive value following
        // the usual IComparer sign convention.
        // Both arguments must be DictionaryEntry values with a string
        // Value and an int Key, otherwise the casts throw
        // InvalidCastException.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // Case-sensitive linguistic comparison pinned to the
                // invariant culture. The two-argument String.Compare uses
                // the current thread culture, which would make the
                // resulting order depend on the machine running the tool.
                int ret = String.Compare ((string) e1.Value, (string) e2.Value,
                        false, CultureInfo.InvariantCulture);
                if (ret != 0)
                        return ret;
                // Equal values: fall back to the keys. CompareTo is used
                // instead of "i1 - i2" because the difference of two ints
                // can overflow and report the wrong sign.
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                return i1.CompareTo (i2);
        }
}
4125
// Orders boxed char values by the sort keys that CollationElementTable
// reports for them.
class UCAComparer : IComparer
{
        // The only instance; the constructor is private.
        public static readonly UCAComparer Instance
                = new UCAComparer ();

        private UCAComparer ()
        {
        }

        // Walks the sort keys both characters have in common, index by
        // index, and returns the first non-zero difference found when
        // comparing Primary, Secondary, Thirtiary and Quarternary in
        // that order. When all shared keys are equal, the character
        // with fewer sort keys sorts first.
        // Both arguments must be boxed char values.
        public int Compare (object o1, object o2)
        {
                char left = (char) o1;
                char right = (char) o2;

                int leftCount = CollationElementTable.GetSortKeyCount (left);
                int rightCount = CollationElementTable.GetSortKeyCount (right);
                int shared = Math.Min (leftCount, rightCount);

                for (int n = 0; n < shared; n++) {
                        SortKeyValue a = CollationElementTable.GetSortKey (left, n);
                        SortKeyValue b = CollationElementTable.GetSortKey (right, n);
                        int diff;
                        // Short-circuit: stop at the first level that differs.
                        if ((diff = a.Primary - b.Primary) != 0
                                || (diff = a.Secondary - b.Secondary) != 0
                                || (diff = a.Thirtiary - b.Thirtiary) != 0
                                || (diff = a.Quarternary - b.Quarternary) != 0)
                                return diff;
                }
                return leftCount - rightCount;
        }
}
4163
// Collects tailoring items (diacritical maps, sort key maps and
// replacement maps) for one LCID and serializes them into a flat char
// array.
//
// Serialized layout of one item, as produced by the nested map types
// ('\0' is a char with value 0):
//   SortKeyMap     : 01, source chars, '\0', 4 sort key values, '\0'
//   DiacriticalMap : 02, target, replace
//   ReplacementMap : 03, source chars, '\0', replacement chars, '\0'
class Tailoring
{
        int lcid;
        int alias;
        bool frenchSort;
        // ITailoringMap instances in the order they were added.
        ArrayList items = new ArrayList ();

        // Creates a tailoring whose Alias is 0.
        public Tailoring (int lcid)
                : this (lcid, 0)
        {
        }

        public Tailoring (int lcid, int alias)
        {
                this.alias = alias;
                this.lcid = lcid;
        }

        // Value given to the constructor as lcid.
        public int LCID {
                get { return this.lcid; }
        }

        // Value given to the constructor as alias (0 when omitted).
        public int Alias {
                get { return this.alias; }
        }

        // Plain flag; this class only stores it.
        public bool FrenchSort {
                get { return this.frenchSort; }
                set { this.frenchSort = value; }
        }

        public void AddDiacriticalMap (byte target, byte replace)
        {
                ITailoringMap map = new DiacriticalMap (target, replace);
                items.Add (map);
        }

        // sortkey must hold at least 4 bytes by the time the items are
        // serialized; only the first 4 are written.
        public void AddSortKeyMap (string source, byte [] sortkey)
        {
                ITailoringMap map = new SortKeyMap (source, sortkey);
                items.Add (map);
        }

        public void AddReplacementMap (string source, string replace)
        {
                ITailoringMap map = new ReplacementMap (source, replace);
                items.Add (map);
        }

        // Returns the serialized forms of all added items, concatenated
        // in insertion order. An empty array is returned when nothing
        // has been added.
        public char [] ItemToCharArray ()
        {
                // First pass: serialize every item and sum up the sizes.
                ArrayList chunks = new ArrayList (items.Count);
                int total = 0;
                foreach (ITailoringMap map in items) {
                        char [] chunk = map.ToCharArray ();
                        chunks.Add (chunk);
                        total += chunk.Length;
                }

                // Second pass: lay the chunks out back to back.
                char [] result = new char [total];
                int offset = 0;
                foreach (char [] chunk in chunks) {
                        Array.Copy (chunk, 0, result, offset, chunk.Length);
                        offset += chunk.Length;
                }
                return result;
        }

        // Something that can serialize itself into the char layout
        // described at the top of the class.
        interface ITailoringMap
        {
                char [] ToCharArray ();
        }

        class DiacriticalMap : ITailoringMap
        {
                public readonly byte Target;
                public readonly byte Replace;

                public DiacriticalMap (byte target, byte replace)
                {
                        this.Target = target;
                        this.Replace = replace;
                }

                // Always three chars: kind, target, replace.
                public char [] ToCharArray ()
                {
                        return new char [] {
                                (char) 02, // kind:DiacriticalMap
                                (char) Target,
                                (char) Replace
                        };
                }
        }

        class SortKeyMap : ITailoringMap
        {
                public readonly string Source;
                public readonly byte [] SortKey;

                public SortKeyMap (string source, byte [] sortkey)
                {
                        this.Source = source;
                        this.SortKey = sortkey;
                }

                public char [] ToCharArray ()
                {
                        int length = Source.Length;
                        // kind + source + terminator + 4 key values + one
                        // trailing element that is left as '\0'.
                        char [] buffer = new char [length + 7];
                        buffer [0] = (char) 01; // kind:SortKeyMap
                        Source.CopyTo (0, buffer, 1, length);
                        // buffer [length + 1] stays '\0' and terminates
                        // the source; the key values follow it.
                        int keyStart = length + 2;
                        for (int n = 0; n < 4; n++)
                                buffer [keyStart + n] = (char) SortKey [n];
                        return buffer;
                }
        }

        class ReplacementMap : ITailoringMap
        {
                public readonly string Source;
                public readonly string Replace;

                public ReplacementMap (string source, string replace)
                {
                        this.Source = source;
                        this.Replace = replace;
                }

                public char [] ToCharArray ()
                {
                        int sourceLength = Source.Length;
                        int replaceLength = Replace.Length;
                        // kind + source + terminator + replacement +
                        // terminator; both terminators are the '\0'
                        // values a new array starts out with.
                        char [] buffer = new char [sourceLength + replaceLength + 3];
                        buffer [0] = (char) 03; // kind:ReplaceMap
                        Source.CopyTo (0, buffer, 1, sourceLength);
                        Replace.CopyTo (0, buffer, sourceLength + 2, replaceLength);
                        return buffer;
                }
        }
}
4295 }