mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those that are computed and those that
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, a separate index table should be prepared for composite characters.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27 #define Binary
  28
  29 using System;
  30 using System.IO;
  31 using System.Collections;
  32 using System.Globalization;
  33 using System.Text;
  34 using System.Xml;
  35
  36 namespace Mono.Globalization.Unicode
  37 {
  38         internal class MSCompatSortKeyTableGenerator
  39         {
  40                 public static void Main (string [] args)
  41                 {
  42                         new MSCompatSortKeyTableGenerator ().Run (args);
  43                 }
  44
  45                 const int DecompositionWide = 1; // fixed
  46                 const int DecompositionSub = 2; // fixed
  47                 const int DecompositionSmall = 3;
  48                 const int DecompositionIsolated = 4;
  49                 const int DecompositionInitial = 5;
  50                 const int DecompositionFinal = 6;
  51                 const int DecompositionMedial = 7;
  52                 const int DecompositionNoBreak = 8;
  53                 const int DecompositionVertical = 9;
  54                 const int DecompositionFraction = 0xA;
  55                 const int DecompositionFont = 0xB;
  56                 const int DecompositionSuper = 0xC; // fixed
  57                 const int DecompositionFull = 0xE;
  58                 const int DecompositionNarrow = 0xD;
  59                 const int DecompositionCircle = 0xF;
  60                 const int DecompositionSquare = 0x10;
  61                 const int DecompositionCompat = 0x11;
  62                 const int DecompositionCanonical = 0x12;
  63
  64                 TextWriter Result = Console.Out;
  65
  66                 byte [] fillIndex = new byte [256]; // by category
  67                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  68
  69                 char [] specialIgnore = new char [] {
  70                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  71                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  72                         };
  73
  74                 // FIXME: need more love (as always)
  75                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  76                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  77                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  78                         '\u0292', '\u01BE', '\u0298'};
  79                 byte [] alphaWeights = new byte [] {
  80                         2, 9, 0xA, 0x1A, 0x21,
  81                         0x23, 0x25, 0x2C, 0x32, 0x35,
  82                         0x36, 0x48, 0x51, 0x70, 0x7C,
  83                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  84                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  85                         0xA9, 0xAA, 0xB3, 0xB4};
  86
  87                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  88                 bool [] isUppercase = new bool [char.MaxValue + 1];
  89
  90                 byte [] decompType = new byte [char.MaxValue + 1];
  91                 int [] decompIndex = new int [char.MaxValue + 1];
  92                 int [] decompLength = new int [char.MaxValue + 1];
  93                 int [] decompValues;
  94                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  95
  96                 byte [] diacritical = new byte [char.MaxValue + 1];
  97
  98                 string [] diacritics = new string [] {
  99                         // LATIN
 100                         "WITH VERTICAL LINE ABOVE;",
 101                         "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
 102                         "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
 103                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 104                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
 105                         "WITH OGONEK;", "WITH CEDILLA;",
 106                         //
 107                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 108                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
 109                         "STROKE OVERLAY",
 110                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 111                         " DIAERESIS AND GRAVE;",
 112                         " BREVE AND ACUTE;",
 113                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 114                         " MACRON AND ACUTE;",
 115                         " MACRON AND GRAVE;",
 116                         //
 117                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 118                         " RING ABOVE AND ACUTE",
 119                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 120                         " CIRCUMFLEX AND TILDE",
 121                         " TILDE AND DIAERESIS",
 122                         " STROKE AND ACUTE",
 123                         " BREVE AND TILDE",
 124                         " CEDILLA AND BREVE",
 125                         " OGONEK AND MACRON",
 126                         //
 127                         "WITH OVERLINE",
 128                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 129                         " DOUBLE GRAVE;",
 130                         " INVERTED BREVE",
 131                         "ROMAN NUMERAL",
 132                         " PRECEDED BY APOSTROPHE",
 133                         "WITH HORN;",
 134                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 135                         " PALATAL HOOK",
 136                         " DOT BELOW;",
 137                         " RETROFLEX;", "DIAERESIS BELOW",
 138                         " RING BELOW",
 139                         //
 140                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 141                         " BREVE BELOW;", " HORN AND GRAVE",
 142                         " TILDE BELOW",
 143                         " TOPBAR",
 144                         " DOT BELOW AND DOT ABOVE",
 145                         " RIGHT HALF RING", " HORN AND TILDE",
 146                         " CIRCUMFLEX AND DOT BELOW",
 147                         " BREVE AND DOT BELOW",
 148                         " DOT BELOW AND MACRON",
 149                         " HORN AND HOOK ABOVE",
 150                         " HORN AND DOT",
 151                         // CIRCLED, PARENTHESIZED and so on
 152                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
 153                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
 154                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 155                         };
 156                 byte [] diacriticWeights = new byte [] {
 157                         // LATIN.
 158                         5,
 159                         0xF, 0xE, 0x12,
 160                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 161                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
 162                         //
 163                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 164                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 165                         //
 166                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 167                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 168                         //
 169                         0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
 170                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 171                         //
 172                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
 173                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 174                         0x95, 0xAA,
 175                         // CIRCLED, PARENTHESIZED and so on.
 176                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
 177                         0xF3, 0xF3, 0xF3
 178                         };
 179
 180                 int [] numberSecondaryWeightBounds = new int [] {
 181                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 182                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 183                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 184                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 185                         0xE50, 0xE60, 0xED0, 0xEE0
 186                         };
 187
 188                 char [] orderedCyrillic;
 189                 char [] orderedGurmukhi;
 190                 char [] orderedGujarati;
 191                 char [] orderedGeorgian;
 192                 char [] orderedThaana;
 193
 194                 static readonly char [] orderedTamilConsonants = new char [] {
 195                         // based on traditional Tamil consonants, except for
 196                         // Grantha (where Microsoft breaks traditionalism).
 197                         // http://www.angelfire.com/empire/thamizh/padanGaL
 198                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
 199                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
 200                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
 201                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
 202                         '\u0BB7', '\u0BB9'};
 203
 204                 // cp -> character name (only for some characters)
 205                 ArrayList sortableCharNames = new ArrayList ();
 206
 207                 // cp -> arrow value (int)
 208                 ArrayList arrowValues = new ArrayList ();
 209
 210                 // cp -> box value (int)
 211                 ArrayList boxValues = new ArrayList ();
 212
 213                 // cp -> level1 value
 214                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 215                 Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
 216
 217                 // letterName -> cp
 218                 Hashtable arabicNameMap = new Hashtable ();
 219                 Hashtable cyrillicNameMap = new Hashtable ();
 220
 221                 // cp -> Hashtable [decompType] -> cp
 222                 Hashtable nfkdMap = new Hashtable ();
 223
 224                 // Latin letter -> ArrayList [int]
 225                 Hashtable latinMap = new Hashtable ();
 226
 227                 ArrayList jisJapanese = new ArrayList ();
 228                 ArrayList nonJisJapanese = new ArrayList ();
 229
 230                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 231                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 232                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 233                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 234                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 235
 236                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 237
 238                 static double [] unicodeAge = new double [char.MaxValue + 1];
 239
 240                 ArrayList tailorings = new ArrayList ();
 241
 242                 void Run (string [] args)
 243                 {
 244                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 245                         ParseSources (dirname);
 246                         Console.Error.WriteLine ("parse done.");
 247
 248                         ModifyParsedValues ();
 249                         GenerateCore ();
 250                         Console.Error.WriteLine ("generation done.");
 251                         Serialize ();
 252                         Console.Error.WriteLine ("serialization done.");
 253 /*
 254 StreamWriter sw = new StreamWriter ("agelog.txt");
 255 for (int i = 0; i < char.MaxValue; i++) {
 256 bool shouldBe = false;
 257 switch (Char.GetUnicodeCategory ((char) i)) {
 258 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 259         shouldBe = true; break;
 260 }
 261 if (unicodeAge [i] >= 3.1)
 262         shouldBe = true;
 263 //if (IsIgnorable (i) != shouldBe)
 264 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 265 }
 266 sw.Close ();
 267 */
 268                 }
 269
 270                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 271                 {
 272                         return (byte []) CodePointIndexer.CompressArray  (
 273                                 source, typeof (byte), i);
 274                 }
 275
 276                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 277                 {
 278                         return (ushort []) CodePointIndexer.CompressArray  (
 279                                 source, typeof (ushort), i);
 280                 }
 281
 282                 void Serialize ()
 283                 {
 284                         // Tailorings
 285                         SerializeTailorings ();
 286
 287                         byte [] categories = new byte [map.Length];
 288                         byte [] level1 = new byte [map.Length];
 289                         byte [] level2 = new byte [map.Length];
 290                         byte [] level3 = new byte [map.Length];
 291                         ushort [] widthCompat = new ushort [map.Length];
 292                         for (int i = 0; i < map.Length; i++) {
 293                                 categories [i] = map [i].Category;
 294                                 level1 [i] = map [i].Level1;
 295                                 level2 [i] = map [i].Level2;
 296                                 level3 [i] = ComputeLevel3Weight ((char) i);
 297                                 switch (decompType [i]) {
 298                                 case DecompositionNarrow:
 299                                 case DecompositionWide:
 300                                 case DecompositionSuper:
 301                                 case DecompositionSub:
 302                                         // they are always 1 char
 303                                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
 304                                         break;
 305                                 }
 306                         }
 307
 308                         // compress
 309                         ignorableFlags = CompressArray (ignorableFlags,
 310                                 MSCompatUnicodeTableUtil.Ignorable);
 311                         categories = CompressArray (categories,
 312                                 MSCompatUnicodeTableUtil.Category);
 313                         level1 = CompressArray (level1,
 314                                 MSCompatUnicodeTableUtil.Level1);
 315                         level2 = CompressArray (level2,
 316                                 MSCompatUnicodeTableUtil.Level2);
 317                         level3 = CompressArray (level3,
 318                                 MSCompatUnicodeTableUtil.Level3);
 319                         widthCompat = (ushort []) CodePointIndexer.CompressArray (
 320                                 widthCompat, typeof (ushort),
 321                                 MSCompatUnicodeTableUtil.WidthCompat);
 322                         cjkCHS = CompressArray (cjkCHS,
 323                                 MSCompatUnicodeTableUtil.CjkCHS);
 324                         cjkCHT = CompressArray (cjkCHT,
 325                                 MSCompatUnicodeTableUtil.Cjk);
 326                         cjkJA = CompressArray (cjkJA,
 327                                 MSCompatUnicodeTableUtil.Cjk);
 328                         cjkKO = CompressArray (cjkKO,
 329                                 MSCompatUnicodeTableUtil.Cjk);
 330                         cjkKOlv2 = CompressArray (cjkKOlv2,
 331                                 MSCompatUnicodeTableUtil.Cjk);
 332
 333                         // Ignorables
 334                         Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
 335 #if Binary
 336                         MemoryStream ms = new MemoryStream ();
 337                         BinaryWriter binary = new BinaryWriter (ms);
 338                         binary.Write (ignorableFlags.Length);
 339 #endif
 340                         for (int i = 0; i < ignorableFlags.Length; i++) {
 341                                 byte value = ignorableFlags [i];
 342                                 if (value < 10)
 343                                         Result.Write ("{0},", value);
 344                                 else
 345                                         Result.Write ("0x{0:X02},", value);
 346 #if Binary
 347                                 binary.Write (value);
 348 #endif
 349                                 if ((i & 0xF) == 0xF)
 350                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 351                         }
 352                         Result.WriteLine ("};");
 353                         Result.WriteLine ();
 354
 355                         // Primary category
 356                         Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
 357 #if Binary
 358                         binary.Write (categories.Length);
 359 #endif
 360                         for (int i = 0; i < categories.Length; i++) {
 361                                 byte value = categories [i];
 362                                 if (value < 10)
 363                                         Result.Write ("{0},", value);
 364                                 else
 365                                         Result.Write ("0x{0:X02},", value);
 366 #if Binary
 367                                 binary.Write (value);
 368 #endif
 369                                 if ((i & 0xF) == 0xF)
 370                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 371                         }
 372                         Result.WriteLine ("};");
 373                         Result.WriteLine ();
 374
 375                         // Primary weight value
 376                         Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
 377 #if Binary
 378                         binary.Write (level1.Length);
 379 #endif
 380                         for (int i = 0; i < level1.Length; i++) {
 381                                 byte value = level1 [i];
 382                                 if (value < 10)
 383                                         Result.Write ("{0},", value);
 384                                 else
 385                                         Result.Write ("0x{0:X02},", value);
 386 #if Binary
 387                                 binary.Write (value);
 388 #endif
 389                                 if ((i & 0xF) == 0xF)
 390                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 391                         }
 392                         Result.WriteLine ("};");
 393                         Result.WriteLine ();
 394
 395                         // Secondary weight
 396                         Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
 397 #if Binary
 398                         binary.Write (level2.Length);
 399 #endif
 400                         for (int i = 0; i < level2.Length; i++) {
 401                                 byte value = level2 [i];
 402                                 if (value < 10)
 403                                         Result.Write ("{0},", value);
 404                                 else
 405                                         Result.Write ("0x{0:X02},", value);
 406 #if Binary
 407                                 binary.Write (value);
 408 #endif
 409                                 if ((i & 0xF) == 0xF)
 410                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 411                         }
 412                         Result.WriteLine ("};");
 413                         Result.WriteLine ();
 414
 415                         // Thirtiary weight
 416                         Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
 417 #if Binary
 418                         binary.Write (level3.Length);
 419 #endif
 420                         for (int i = 0; i < level3.Length; i++) {
 421                                 byte value = level3 [i];
 422                                 if (value < 10)
 423                                         Result.Write ("{0},", value);
 424                                 else
 425                                         Result.Write ("0x{0:X02},", value);
 426 #if Binary
 427                                 binary.Write (value);
 428 #endif
 429                                 if ((i & 0xF) == 0xF)
 430                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 431                         }
 432                         Result.WriteLine ("};");
 433                         Result.WriteLine ();
 434
 435                         // Width insensitivity mappings
 436                         // (for now it is more lightweight than dumping the
 437                         // entire NFKD table).
 438                         Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
 439 #if Binary
 440                         binary.Write (widthCompat.Length);
 441 #endif
 442                         for (int i = 0; i < widthCompat.Length; i++) {
 443                                 ushort value = widthCompat [i];
 444                                 if (value < 10)
 445                                         Result.Write ("{0},", value);
 446                                 else
 447                                         Result.Write ("0x{0:X02},", value);
 448 #if Binary
 449                                 binary.Write (value);
 450 #endif
 451                                 if ((i & 0xF) == 0xF)
 452                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 453                         }
 454                         Result.WriteLine ("};");
 455                         Result.WriteLine ();
 456 #if Binary
 457                         using (FileStream fs = File.Create ("../collation.core.bin")) {
 458                                 byte [] array = ms.ToArray ();
 459                                 fs.Write (array, 0, array.Length);
 460                         }
 461 #endif
 462
 463                         // CJK
 464                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 465                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 466                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 467                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 468                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 469                 }
 470
 471                 void SerializeCJK (string name, ushort [] cjk, int max)
 472                 {
 473                         int offset = 0;//char.MaxValue - cjk.Length;
 474                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 475 #if Binary
 476                         MemoryStream ms = new MemoryStream ();
 477                         BinaryWriter binary = new BinaryWriter (ms);
 478 #endif
 479                         for (int i = 0; i < cjk.Length; i++) {
 480                                 if (i + offset == max)
 481                                         break;
 482                                 ushort value = cjk [i];
 483                                 if (value < 10)
 484                                         Result.Write ("{0},", value);
 485                                 else
 486                                         Result.Write ("0x{0:X04},", value);
 487 #if Binary
 488                                 binary.Write (value);
 489 #endif
 490                                 if ((i & 0xF) == 0xF)
 491                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 492                         }
 493                         Result.WriteLine ("};");
 494                         Result.WriteLine ();
 495 #if Binary
 496                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 497                                 byte [] array = ms.ToArray ();
 498                                 fs.Write (array, 0, array.Length);
 499                         }
 500 #endif
 501                 }
 502
 503                 void SerializeCJK (string name, byte [] cjk, int max)
 504                 {
 505                         int offset = 0;//char.MaxValue - cjk.Length;
 506                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 507 #if Binary
 508                         MemoryStream ms = new MemoryStream ();
 509                         BinaryWriter binary = new BinaryWriter (ms);
 510 #endif
 511                         for (int i = 0; i < cjk.Length; i++) {
 512                                 if (i + offset == max)
 513                                         break;
 514                                 byte value = cjk [i];
 515                                 if (value < 10)
 516                                         Result.Write ("{0},", value);
 517                                 else
 518                                         Result.Write ("0x{0:X02},", value);
 519 #if Binary
 520                                 binary.Write (value);
 521 #endif
 522                                 if ((i & 0xF) == 0xF)
 523                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 524                         }
 525                         Result.WriteLine ("};");
 526                         Result.WriteLine ();
 527 #if Binary
 528                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 529                                 byte [] array = ms.ToArray ();
 530                                 fs.Write (array, 0, array.Length);
 531                         }
 532 #endif
 533                 }
 534
 535                 void SerializeTailorings ()
 536                 {
 537                         Hashtable indexes = new Hashtable ();
 538                         Hashtable counts = new Hashtable ();
 539                         Result.WriteLine ("static char [] tailorings = new char [] {");
 540                         int count = 0;
 541 #if Binary
 542                         MemoryStream ms = new MemoryStream ();
 543                         BinaryWriter binary = new BinaryWriter (ms);
 544 #endif
 545                         foreach (Tailoring t in tailorings) {
 546                                 if (t.Alias != 0)
 547                                         continue;
 548                                 Result.Write ("/*{0}*/", t.LCID);
 549                                 indexes.Add (t.LCID, count);
 550                                 char [] values = t.ItemToCharArray ();
 551                                 counts.Add (t.LCID, values.Length);
 552                                 foreach (char c in values) {
 553                                         Result.Write ("'\\x{0:X}', ", (int) c);
 554                                         if (++count % 16 == 0)
 555                                                 Result.WriteLine (" // {0:X04}", count - 16);
 556 #if Binary
 557                                         binary.Write ((ushort) c);
 558 #endif
 559                                 }
 560                         }
 561                         Result.WriteLine ("};");
 562
 563                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 564 #if Binary
 565                         byte [] rawdata = ms.ToArray ();
 566                         ms = new MemoryStream ();
 567                         binary = new BinaryWriter (ms);
 568                         binary.Write (tailorings.Count);
 569 #endif
 570                         foreach (Tailoring t in tailorings) {
 571                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 572                                 if (!indexes.ContainsKey (target)) {
 573                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
 574                                         continue;
 575                                 }
 576                                 int idx = (int) indexes [target];
 577                                 int cnt = (int) counts [target];
 578                                 bool french = t.FrenchSort;
 579                                 if (t.Alias != 0)
 580                                         foreach (Tailoring t2 in tailorings)
 581                                                 if (t2.LCID == t.LCID)
 582                                                         french = t2.FrenchSort;
 583                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 584 #if Binary
 585                                 binary.Write (t.LCID);
 586                                 binary.Write (idx);
 587                                 binary.Write (cnt);
 588                                 binary.Write (french);
 589 #endif
 590                         }
 591                         Result.WriteLine ("};");
 592 #if Binary
 593                         binary.Write ((byte) 0xFF);
 594                         binary.Write ((byte) 0xFF);
 595                         binary.Write (rawdata.Length / 2);
 596                         binary.Write (rawdata, 0, rawdata.Length);
 597
 598
 599                         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
 600                                 byte [] array = ms.ToArray ();
 601                                 fs.Write (array, 0, array.Length);
 602                         }
 603 #endif
 604                 }
 605
 606                 #region Parse
 607
 608                 void ParseSources (string dirname)
 609                 {
 610                         string unidata =
 611                                 dirname + "/UnicodeData.txt";
 612                         string derivedCoreProps =
 613                                 dirname + "/DerivedCoreProperties.txt";
 614                         string scripts =
 615                                 dirname + "/Scripts.txt";
 616                         string cp932 =
 617                                 dirname + "/CP932.TXT";
 618                         string derivedAge =
 619                                 dirname + "/DerivedAge.txt";
 620                         string chXML = dirname + "/common/collation/zh.xml";
 621                         string jaXML = dirname + "/common/collation/ja.xml";
 622                         string koXML = dirname + "/common/collation/ko.xml";
 623
 624                         ParseDerivedAge (derivedAge);
 625
 626                         FillIgnorables ();
 627
 628                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 629                         ParseUnidata (unidata);
 630                         ParseDerivedCoreProperties (derivedCoreProps);
 631                         ParseScripts (scripts);
 632                         ParseCJK (chXML, jaXML, koXML);
 633
 634                         ParseTailorings ("mono-tailoring-source.txt");
 635                 }
 636
 637                 void ParseTailorings (string filename)
 638                 {
 639                         Tailoring t = null;
 640                         int line = 0;
 641                         using (StreamReader sr = new StreamReader (filename)) {
 642                                 try {
 643                                         while (sr.Peek () >= 0) {
 644                                                 line++;
 645                                                 ProcessTailoringLine (ref t,
 646                                                         sr.ReadLine ().Trim ());
 647                                         }
 648                                 } catch (Exception) {
 649                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 650                                         throw;
 651                                 }
 652                         }
 653                 }
 654
 655                 // For now this is enough.
 656                 string ParseTailoringSourceValue (string s)
 657                 {
 658                         StringBuilder sb = new StringBuilder ();
 659                         for (int i = 0; i < s.Length; i++) {
 660                                 if (s.StartsWith ("\\u")) {
 661                                         sb.Append ((char) int.Parse (
 662                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 663                                                 1);
 664                                         i += 5;
 665                                 }
 666                         else
 667                                 sb.Append (s [i]);
 668                         }
 669                         return sb.ToString ();
 670                 }
 671
 672                 void ProcessTailoringLine (ref Tailoring t, string s)
 673                 {
 674                         int idx = s.IndexOf ('#');
 675                         if (idx > 0)
 676                                 s = s.Substring (0, idx).Trim ();
 677                         if (s.Length == 0 || s [0] == '#')
 678                                 return;
 679                         if (s [0] == '@') {
 680                                 idx = s.IndexOf ('=');
 681                                 if (idx > 0)
 682                                         t = new Tailoring (
 683                                                 int.Parse (s.Substring (1, idx - 1)),
 684                                                 int.Parse (s.Substring (idx + 1)));
 685                                 else
 686                                         t = new Tailoring (int.Parse (s.Substring (1)));
 687                                 tailorings.Add (t);
 688                                 return;
 689                         }
 690                         if (s.StartsWith ("*FrenchSort")) {
 691                                 t.FrenchSort = true;
 692                                 return;
 693                         }
 694                         string d = "*Diacritical";
 695                         if (s.StartsWith (d)) {
 696                                 idx = s.IndexOf ("->");
 697                                 t.AddDiacriticalMap (
 698                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 699                                                 NumberStyles.HexNumber),
 700                                         byte.Parse (s.Substring (idx + 2).Trim (),
 701                                                 NumberStyles.HexNumber));
 702                                 return;
 703                         }
 704                         idx = s.IndexOf (':');
 705                         if (idx > 0) {
 706                                 string source = s.Substring (0, idx).Trim ();
 707                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 708                                 byte [] b = new byte [4];
 709                                 for (int i = 0; i < 4; i++) {
 710                                         if (l [i] == "*")
 711                                                 b [i] = 0;
 712                                         else
 713                                                 b [i] = byte.Parse (l [i],
 714                                                         NumberStyles.HexNumber);
 715                                 }
 716                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 717                                         b);
 718                         }
 719                         idx = s.IndexOf ('=');
 720                         if (idx > 0)
 721                                 t.AddReplacementMap (
 722                                         ParseTailoringSourceValue (
 723                                                 s.Substring (0, idx).Trim ()),
 724                                         ParseTailoringSourceValue (
 725                                                 s.Substring (idx + 1).Trim ()));
 726                 }
 727
 728                 void ParseDerivedAge (string filename)
 729                 {
 730                         using (StreamReader file =
 731                                 new StreamReader (filename)) {
 732                                 while (file.Peek () >= 0) {
 733                                         string s = file.ReadLine ();
 734                                         int idx = s.IndexOf ('#');
 735                                         if (idx >= 0)
 736                                                 s = s.Substring (0, idx);
 737                                         idx = s.IndexOf (';');
 738                                         if (idx < 0)
 739                                                 continue;
 740
 741                                         string cpspec = s.Substring (0, idx);
 742                                         idx = cpspec.IndexOf ("..");
 743                                         NumberStyles nf = NumberStyles.HexNumber |
 744                                                 NumberStyles.AllowTrailingWhite;
 745                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 746                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 747                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 748
 749                                         // FIXME: use index
 750                                         if (cp > char.MaxValue)
 751                                                 continue;
 752
 753                                         double v = double.Parse (value);
 754                                         for (int i = cp; i <= cpEnd; i++)
 755                                                 unicodeAge [i] = v;
 756                                 }
 757                         }
 758                         unicodeAge [0] = double.MaxValue; // never be supported
 759                 }
 760
 761                 void ParseUnidata (string filename)
 762                 {
 763                         ArrayList decompValues = new ArrayList ();
 764                         using (StreamReader unidata =
 765                                 new StreamReader (filename)) {
 766                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 767                                         try {
 768                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 769                                         } catch (Exception) {
 770                                                 Console.Error.WriteLine ("**** At line " + line);
 771                                                 throw;
 772                                         }
 773                                 }
 774                         }
 775                         this.decompValues = (int [])
 776                                 decompValues.ToArray (typeof (int));
 777                 }
 778
                // State kept across calls of ProcessUnidataLine(): the Latin
                // letter it recognized most recently. It is used as the
                // mapping target for Latin characters whose own target cannot
                // be derived from their name, and it is set back to
                // char.MinValue when the recognized letter is 'Z'.
                char previousLatinTarget = char.MinValue;
                // One counter per letter 'A'..'Z' (indexed by letter - 'A').
                // ProcessUnidataLine() increments a counter and adds 0x7C to
                // it to obtain the diacritical weight it assigns to a
                // character.
                byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
 781
 782                 void ProcessUnidataLine (string s, ArrayList decompValues)
 783                 {
 784                         int idx = s.IndexOf ('#');
 785                         if (idx >= 0)
 786                                 s = s.Substring (0, idx);
 787                         idx = s.IndexOf (';');
 788                         if (idx < 0)
 789                                 return;
 790                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 791                         string [] values = s.Substring (idx + 1).Split (';');
 792
 793                         // FIXME: use index
 794                         if (cp > char.MaxValue)
 795                                 return;
 796                         if (IsIgnorable (cp))
 797                                 return;
 798
 799                         string name = values [0];
 800
 801                         // SPECIAL CASE: rename some characters for diacritical
 802                         // remapping. FIXME: why are they different?
 803                         // FIXME: it's still not working.
 804                         if (cp == 0x018B || cp == 0x018C)
 805                                 name = name.Replace ("TOPBAR", "STROKE");
 806
 807                         // isSmallCapital
 808                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 809                                 isSmallCapital [cp] = true;
 810
 811                         // latin mapping by character name
 812                         if (s.IndexOf ("LATIN") >= 0) {
 813                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 814                                 int offset = lidx + 15;
 815                                 if (lidx < 0) {
 816                                         lidx = s.IndexOf ("LETTER TURNED ");
 817                                         offset = lidx + 14;
 818                                 }
 819                                 if (lidx < 0) {
 820                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 821                                         offset = lidx + 15;
 822                                 }
 823                                 if (lidx < 0) {
 824                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 825                                         offset = lidx + 14;
 826                                 }
 827                                 if (lidx < 0) {
 828                                         lidx = s.IndexOf ("LETTER ");
 829                                         offset = lidx + 7;
 830                                 }
 831                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 832                                 char n = s [offset + 1];
 833                                 char target = char.MinValue;
 834                                 if ('A' <= c && c <= 'Z' &&
 835                                         (n == ' ') || n == ';') {
 836                                         target = c;
 837                                         // FIXME: After 'Z', I cannot reset this state.
 838                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 839                                 }
 840
 841                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 842                                         target = 'A';
 843                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 844                                         target = 'B';
 845                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 846                                         target = 'C';
 847                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 848                                         target = 'E';
 849                                 else if (s.Substring (offset).StartsWith ("ENG"))
 850                                         target = 'N';
 851                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 852                                         target = 'O';
 853                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 854                                         target = 'R';
 855                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 856                                         target = 'S';
 857                                 else if (s.Substring (offset).StartsWith ("ESH"))
 858                                         target = 'S';
 859
 860                                 if (target == char.MinValue)
 861                                         target = previousLatinTarget;
 862
 863                                 if (target != char.MinValue) {
 864                                         ArrayList entry = (ArrayList) latinMap [target];
 865                                         if (entry == null) {
 866                                                 entry = new ArrayList ();
 867                                                 latinMap [target] = entry;
 868                                         }
 869                                         entry.Add (cp);
 870                                         // FIXME: This secondary weight is hack.
 871                                         // They are here because they must not
 872                                         // be identical to the corresponding
 873                                         // ASCII latins.
 874                                         if (c != target && diacritical [cp] == 0) {
 875                                                 diacriticalOffset [c - 'A']++;
 876                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
 877                                         }
 878                                 }
 879                         }
 880
 881                         // Arrow names
 882                         if (0x2000 <= cp && cp < 0x3000) {
 883                                 int value = 0;
 884                                 // SPECIAL CASES. FIXME: why?
 885                                 switch (cp) {
 886                                 case 0x21C5: value = -1; break; // E2
 887                                 case 0x261D: value = 1; break;
 888                                 case 0x27A6: value = 3; break;
 889                                 case 0x21B0: value = 7; break;
 890                                 case 0x21B1: value = 3; break;
 891                                 case 0x21B2: value = 7; break;
 892                                 case 0x21B4: value = 5; break;
 893                                 case 0x21B5: value = 7; break;
 894                                 case 0x21B9: value = -1; break; // E1
 895                                 case 0x21CF: value = 7; break;
 896                                 case 0x21D0: value = 3; break;
 897                                 }
 898                                 string [] arrowTargets = new string [] {
 899                                         "",
 900                                         "UPWARDS",
 901                                         "NORTH EAST",
 902                                         "RIGHTWARDS",
 903                                         "SOUTH EAST",
 904                                         "DOWNWARDS",
 905                                         "SOUTH WEST",
 906                                         "LEFTWARDS",
 907                                         "NORTH WEST",
 908                                         };
 909                                 if (value == 0)
 910                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 911                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 912                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 913                                                         s.IndexOf (" OVER") < 0
 914                                                 )
 915                                                         value = i;
 916                                 if (value > 0)
 917                                         arrowValues.Add (new DictionaryEntry (
 918                                                 cp, value));
 919                         }
 920
 921                         // Box names
 922                         if (0x2500 <= cp && cp < 0x2600) {
 923                                 int value = 0;
 924                                 // flags:
 925                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 926                                 // [h,rl] [r] [l]
 927                                 // [v,ud] [u] [d]
 928                                 // [dr] [dl] [ur] [ul]
 929                                 // [vr,udr] [vl,vdl]
 930                                 // [hd,rld] [hu,rlu]
 931                                 // [hv,udrl,rlv,udh]
 932                                 ArrayList flags = new ArrayList (new int [] {
 933                                         32, 8 + 4, 8, 4,
 934                                         16, 1 + 2, 1, 2,
 935                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 936                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 937                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 938                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 939                                         });
 940                                 byte [] offsets = new byte [] {
 941                                         0, 0, 1, 2,
 942                                         3, 3, 4, 5,
 943                                         6, 7, 8, 9,
 944                                         10, 10, 11, 11,
 945                                         12, 12, 13, 13,
 946                                         14, 14, 14, 14};
 947                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
 948                                         int flag = 0;
 949                                         if (s.IndexOf (" UP") >= 0)
 950                                                 flag |= 1;
 951                                         if (s.IndexOf (" DOWN") >= 0)
 952                                                 flag |= 2;
 953                                         if (s.IndexOf (" RIGHT") >= 0)
 954                                                 flag |= 4;
 955                                         if (s.IndexOf (" LEFT") >= 0)
 956                                                 flag |= 8;
 957                                         if (s.IndexOf (" VERTICAL") >= 0)
 958                                                 flag |= 16;
 959                                         if (s.IndexOf (" HORIZONTAL") >= 0)
 960                                                 flag |= 32;
 961
 962                                         int fidx = flags.IndexOf (flag);
 963                                         value = fidx < 0 ? fidx : offsets [fidx];
 964                                 } else if (s.IndexOf ("BLOCK") >= 0) {
 965                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
 966                                                 value = 0x12;
 967                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
 968                                                 value = 0x13;
 969                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
 970                                                 value = 0x14;
 971                                         else if (s.IndexOf ("HALF") >= 0)
 972                                                 value = 0x15;
 973                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
 974                                                 value = 0x16;
 975                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
 976                                                 value = 0x17;
 977                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
 978                                                 value = 0x18;
 979                                         else
 980                                                 value = 0x19;
 981                                 }
 982                                 else if (s.IndexOf ("SHADE") >= 0)
 983                                         value = 0x19;
 984                                 else if (s.IndexOf ("SQUARE") >= 0)
 985                                         value = 0xBC - 0xE5;
 986                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
 987                                         value = 0xBE - 0xE5;
 988                                 else if (s.IndexOf ("RECTANGLE") >= 0)
 989                                         value = 0xBD - 0xE5;
 990                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
 991                                         value = 0xBF - 0xE5;
 992                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
 993                                         if (s.IndexOf ("UP-POINTING") >= 0)
 994                                                 value = 0xC0 - 0xE5;
 995                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
 996                                                 value = 0xC1 - 0xE5;
 997                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
 998                                                 value = 0xC2 - 0xE5;
 999                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1000                                                 value = 0xC3 - 0xE5;
1001                                 }
1002                                 else if (s.IndexOf ("POINTER") >= 0) {
1003                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1004                                                 value = 0xC4 - 0xE5;
1005                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1006                                                 value = 0xC5 - 0xE5;
1007                                 }
1008                                 else if (s.IndexOf ("DIAMOND") >= 0)
1009                                         value = 0xC6 - 0xE5;
1010                                 else if (s.IndexOf ("FISHEYE") >= 0)
1011                                         value = 0xC7 - 0xE5;
1012                                 else if (s.IndexOf ("LOZENGE") >= 0)
1013                                         value = 0xC8 - 0xE5;
1014                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1015                                         value = 0xC9 - 0xE5;
1016                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1017                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1018                                                 value = 0xCA - 0xE5;
1019                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1020                                                 value = 0xCB - 0xE5;
1021                                         else
1022                                                 value = 0xC9 - 0xE5;
1023                                 }
1024                                 if (0x25DA <= cp && cp <= 0x25E5)
1025                                         value = 0xCD + cp - 0x25DA - 0xE5;
1026
1027                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1028                                 switch (cp) {
1029                                 case 0x2571: value = 0xF; break;
1030                                 case 0x2572: value = 0x10; break;
1031                                 case 0x2573: value = 0x11; break;
1032                                 }
1033                                 if (value != 0)
1034                                         boxValues.Add (new DictionaryEntry (
1035                                                 cp, value));
1036                         }
1037
1038                         // For some characters store the name and sort later
1039                         // to determine sorting.
1040                         if (0x2100 <= cp && cp <= 0x213F &&
1041                                 Char.IsSymbol ((char) cp))
1042                                 sortableCharNames.Add (
1043                                         new DictionaryEntry (cp, name));
1044                         else if (0x3380 <= cp && cp <= 0x33DD)
1045                                 sortableCharNames.Add (new DictionaryEntry (
1046                                         cp, name.Substring (7)));
1047
1048                         // diacritical weights by character name
1049 if (diacritics.Length != diacriticWeights.Length)
1050 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1051                         for (int d = 0; d < diacritics.Length; d++) {
1052                                 if (s.IndexOf (diacritics [d]) > 0) {
1053                                         diacritical [cp] += diacriticWeights [d];
1054                                         if (s.IndexOf ("COMBINING") >= 0)
1055                                                 diacritical [cp] -= (byte) 2;
1056                                         continue;
1057                                 }
1058                                 // also process "COMBINING blah" here
1059                                 // For now it is limited to cp < 0x0370
1060 //                              if (cp < 0x0300 || cp >= 0x0370)
1061 //                                      continue;
1062                                 string tmp = diacritics [d].TrimEnd (';');
1063                                 if (tmp.IndexOf ("WITH ") == 0)
1064                                         tmp = tmp.Substring (4);
1065                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1066                                 if (name == tmp)
1067                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1068 //if (name == tmp)
1069 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1070                         }
1071                         // Two-step grep required for it.
1072                         if (s.IndexOf ("FULL STOP") > 0 &&
1073                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1074                                 diacritical [cp] |= 0xF4;
1075
1076                         // Cyrillic letter name
1077                         if (0x0430 <= cp && cp <= 0x0486 &&
1078                                 Char.IsLetter ((char) cp)) {
1079                                 byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
1080                                 // Get primary letter name i.e.
1081                                 // XXX part of CYRILLIC LETTER XXX yyy
1082                                 // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
1083                                 string letterName =
1084                                         name.Substring (name.IndexOf ("LETTER ") + 7);
1085                                 int tmpIdx = letterName.IndexOf (' ');
1086                                 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1087 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1088                                 if (cyrillicNameMap.ContainsKey (letterName))
1089                                         value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
1090                                 else
1091                                         cyrillicNameMap [letterName] = cp;
1092
1093                                 cyrillicLetterPrimaryValues [cp] = value;
1094                         }
1095
1096                         // Arabic letter name
1097                         if (0x0621 <= cp && cp <= 0x064A &&
1098                                 Char.GetUnicodeCategory ((char) cp)
1099                                 == UnicodeCategory.OtherLetter) {
1100                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1101                                 switch (cp) {
1102                                 case 0x0621:
1103                                 case 0x0624:
1104                                 case 0x0626:
1105                                         // hamza, waw, yeh ... special cases.
1106                                         value = 0x07;
1107                                         break;
1108                                 case 0x0649:
1109                                 case 0x064A:
1110                                         value = 0x77; // special cases.
1111                                         break;
1112                                 default:
1113                                         // Get primary letter name i.e.
1114                                         // XXX part of ARABIC LETTER XXX yyy
1115                                         // e.g. that of "TEH MARBUTA" is "TEH".
1116                                         string letterName =
1117                                                 (cp == 0x0640) ?
1118                                                 // 0x0640 is special: it does
1119                                                 // not start with ARABIC LETTER
1120                                                 name :
1121                                                 name.Substring (14);
1122                                         int tmpIdx = letterName.IndexOf (' ');
1123                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1124 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1125                                         if (arabicNameMap.ContainsKey (letterName))
1126                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1127                                         else
1128                                                 arabicNameMap [letterName] = cp;
1129                                         break;
1130                                 }
1131                                 arabicLetterPrimaryValues [cp] = value;
1132                         }
1133
1134                         // Japanese square letter
1135                         if (0x3300 <= cp && cp <= 0x3357)
1136                                 if (!ExistsJIS (cp))
1137                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1138
1139                         // normalizationType
1140                         string decomp = values [4];
1141                         idx = decomp.IndexOf ('<');
1142                         if (idx >= 0) {
1143                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1144                                 case "full":
1145                                         decompType [cp] = DecompositionFull;
1146                                         break;
1147                                 case "sub":
1148                                         decompType [cp] = DecompositionSub;
1149                                         break;
1150                                 case "super":
1151                                         decompType [cp] = DecompositionSuper;
1152                                         break;
1153                                 case "small":
1154                                         decompType [cp] = DecompositionSmall;
1155                                         break;
1156                                 case "isolated":
1157                                         decompType [cp] = DecompositionIsolated;
1158                                         break;
1159                                 case "initial":
1160                                         decompType [cp] = DecompositionInitial;
1161                                         break;
1162                                 case "final":
1163                                         decompType [cp] = DecompositionFinal;
1164                                         break;
1165                                 case "medial":
1166                                         decompType [cp] = DecompositionMedial;
1167                                         break;
1168                                 case "noBreak":
1169                                         decompType [cp] = DecompositionNoBreak;
1170                                         break;
1171                                 case "compat":
1172                                         decompType [cp] = DecompositionCompat;
1173                                         break;
1174                                 case "fraction":
1175                                         decompType [cp] = DecompositionFraction;
1176                                         break;
1177                                 case "font":
1178                                         decompType [cp] = DecompositionFont;
1179                                         break;
1180                                 case "circle":
1181                                         decompType [cp] = DecompositionCircle;
1182                                         break;
1183                                 case "square":
1184                                         decompType [cp] = DecompositionSquare;
1185                                         break;
1186                                 case "wide":
1187                                         decompType [cp] = DecompositionWide;
1188                                         break;
1189                                 case "narrow":
1190                                         decompType [cp] = DecompositionNarrow;
1191                                         break;
1192                                 case "vertical":
1193                                         decompType [cp] = DecompositionVertical;
1194                                         break;
1195                                 default:
1196                                         throw new Exception ("Support NFKD type : " + decomp);
1197                                 }
1198                         }
1199                         else
1200                                 decompType [cp] = DecompositionCanonical;
1201                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1202                         if (decomp.Length > 0) {
1203
1204                                 string [] velems = decomp.Split (' ');
1205                                 int didx = decompValues.Count;
1206                                 decompIndex [cp] = didx;
1207                                 foreach (string v in velems)
1208                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1209                                 decompLength [cp] = velems.Length;
1210
1211                                 // [decmpType] -> this_cp
1212                                 int targetCP = (int) decompValues [didx];
1213                                 // for "(x)" it specially maps to 'x' .
1214                                 // FIXME: check if it is sane
1215                                 if (velems.Length == 3 &&
1216                                         (int) decompValues [didx] == '(' &&
1217                                         (int) decompValues [didx + 2] == ')')
1218                                         targetCP = (int) decompValues [didx + 1];
1219                                 // special: 0x215F "1/"
1220                                 else if (cp == 0x215F)
1221                                         targetCP = '1';
1222                                 else if (velems.Length > 1 &&
1223                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1224                                         // skip them, except for CJK ideograph compat
1225                                         targetCP = 0;
1226
1227                                 if (targetCP != 0) {
1228                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1229                                         if (entry == null) {
1230                                                 entry = new Hashtable ();
1231                                                 nfkdMap [targetCP] = entry;
1232                                         }
1233                                         entry [(byte) decompType [cp]] = cp;
1234                                 }
1235                         }
1236                         // numeric values
1237                         if (values [5].Length > 0)
1238                                 decimalValue [cp] = decimal.Parse (values [5]);
1239                         else if (values [6].Length > 0)
1240                                 decimalValue [cp] = decimal.Parse (values [6]);
1241                         else if (values [7].Length > 0) {
1242                                 string decstr = values [7];
1243                                 idx = decstr.IndexOf ('/');
1244                                 if (cp == 0x215F) // special. "1/"
1245                                         decimalValue [cp] = 0x1;
1246                                 else if (idx > 0)
1247                                         // m/n
1248                                         decimalValue [cp] =
1249                                                 decimal.Parse (decstr.Substring (0, idx))
1250                                                 / decimal.Parse (decstr.Substring (idx + 1));
1251                                 else if (decstr [0] == '(' &&
1252                                         decstr [decstr.Length - 1] == ')')
1253                                         // (n)
1254                                         decimalValue [cp] =
1255                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1256                                 else if (decstr [decstr.Length - 1] == '.')
1257                                         // n.
1258                                         decimalValue [cp] =
1259                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1260                                 else
1261                                         decimalValue [cp] = decimal.Parse (decstr);
1262                         }
1263                 }
1264
1265                 void ParseDerivedCoreProperties (string filename)
1266                 {
1267                         // IsUppercase
1268                         using (StreamReader file =
1269                                 new StreamReader (filename)) {
1270                                 for (int line = 1; file.Peek () >= 0; line++) {
1271                                         try {
1272                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1273                                         } catch (Exception) {
1274                                                 Console.Error.WriteLine ("**** At line " + line);
1275                                                 throw;
1276                                         }
1277                                 }
1278                         }
1279                 }
1280
1281                 void ProcessDerivedCorePropLine (string s)
1282                 {
1283                         int idx = s.IndexOf ('#');
1284                         if (idx >= 0)
1285                                 s = s.Substring (0, idx);
1286                         idx = s.IndexOf (';');
1287                         if (idx < 0)
1288                                 return;
1289                         string cpspec = s.Substring (0, idx);
1290                         idx = cpspec.IndexOf ("..");
1291                         NumberStyles nf = NumberStyles.HexNumber |
1292                                 NumberStyles.AllowTrailingWhite;
1293                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1294                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1295                         string value = s.Substring (cpspec.Length + 1).Trim ();
1296
1297                         // FIXME: use index
1298                         if (cp > char.MaxValue)
1299                                 return;
1300
1301                         switch (value) {
1302                         case "Uppercase":
1303                                 for (int x = cp; x <= cpEnd; x++)
1304                                         isUppercase [x] = true;
1305                                 break;
1306                         }
1307                 }
1308
1309                 void ParseScripts (string filename)
1310                 {
1311                         ArrayList cyrillic = new ArrayList ();
1312                         ArrayList gurmukhi = new ArrayList ();
1313                         ArrayList gujarati = new ArrayList ();
1314                         ArrayList georgian = new ArrayList ();
1315                         ArrayList thaana = new ArrayList ();
1316
1317                         using (StreamReader file =
1318                                 new StreamReader (filename)) {
1319                                 while (file.Peek () >= 0) {
1320                                         string s = file.ReadLine ();
1321                                         int idx = s.IndexOf ('#');
1322                                         if (idx >= 0)
1323                                                 s = s.Substring (0, idx);
1324                                         idx = s.IndexOf (';');
1325                                         if (idx < 0)
1326                                                 continue;
1327
1328                                         string cpspec = s.Substring (0, idx);
1329                                         idx = cpspec.IndexOf ("..");
1330                                         NumberStyles nf = NumberStyles.HexNumber |
1331                                                 NumberStyles.AllowTrailingWhite;
1332                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1333                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1334                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1335
1336                                         // FIXME: use index
1337                                         if (cp > char.MaxValue)
1338                                                 continue;
1339
1340                                         switch (value) {
1341                                         case "Cyrillic":
1342                                                 for (int x = cp; x <= cpEnd; x++)
1343                                                         if (!IsIgnorable (x))
1344                                                                 cyrillic.Add ((char) x);
1345                                                 break;
1346                                         case "Gurmukhi":
1347                                                 for (int x = cp; x <= cpEnd; x++)
1348                                                         if (!IsIgnorable (x))
1349                                                                 gurmukhi.Add ((char) x);
1350                                                 break;
1351                                         case "Gujarati":
1352                                                 for (int x = cp; x <= cpEnd; x++)
1353                                                         if (!IsIgnorable (x))
1354                                                                 gujarati.Add ((char) x);
1355                                                 break;
1356                                         case "Georgian":
1357                                                 for (int x = cp; x <= cpEnd; x++)
1358                                                         if (!IsIgnorable (x))
1359                                                                 georgian.Add ((char) x);
1360                                                 break;
1361                                         case "Thaana":
1362                                                 for (int x = cp; x <= cpEnd; x++)
1363                                                         if (!IsIgnorable (x))
1364                                                                 thaana.Add ((char) x);
1365                                                 break;
1366                                         }
1367                                 }
1368                         }
1369                         cyrillic.Sort (UCAComparer.Instance);
1370                         gurmukhi.Sort (UCAComparer.Instance);
1371                         gujarati.Sort (UCAComparer.Instance);
1372                         georgian.Sort (UCAComparer.Instance);
1373                         thaana.Sort (UCAComparer.Instance);
1374                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1375                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1376                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1377                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1378                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1379                 }
1380
1381                 void ParseJISOrder (string filename)
1382                 {
1383                         int line = 1;
1384                         try {
1385                                 using (StreamReader file =
1386                                         new StreamReader (filename)) {
1387                                         for (;file.Peek () >= 0; line++)
1388                                                 ProcessJISOrderLine (file.ReadLine ());
1389                                 }
1390                         } catch (Exception) {
1391                                 Console.Error.WriteLine ("---- line {0}", line);
1392                                 throw;
1393                         }
1394                 }
1395
1396                 char [] ws = new char [] {'\t', ' '};
1397
1398                 void ProcessJISOrderLine (string s)
1399                 {
1400                         int idx = s.IndexOf ('#');
1401                         if (idx >= 0)
1402                                 s = s.Substring (0, idx).Trim ();
1403                         if (s.Length == 0)
1404                                 return;
1405                         idx = s.IndexOfAny (ws);
1406                         if (idx < 0)
1407                                 return;
1408                         // They start with "0x" so cut them out.
1409                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1410                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1411                         jisJapanese.Add (new JISCharacter (cp, jis));
1412                 }
1413
1414                 void ParseCJK (string zhXML, string jaXML, string koXML)
1415                 {
1416                         XmlDocument doc = new XmlDocument ();
1417                         doc.XmlResolver = null;
1418                         int v;
1419                         string s;
1420                         string category;
1421                         int offset;
1422                         ushort [] arr;
1423
1424                         // Chinese Simplified
1425                         category = "chs";
1426                         arr = cjkCHS;
1427                         offset = 0;//char.MaxValue - arr.Length;
1428                         doc.Load (zhXML);
1429                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1430                         v = 0x8008;
1431                         foreach (char c in s) {
1432                                 if (c < '\u3100')
1433                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1434                                 else {
1435                                         arr [(int) c - offset] = (ushort) v++;
1436                                         if (v % 256 == 0)
1437                                                 v += 2;
1438                                 }
1439                         }
1440
1441                         // Chinese Traditional
1442                         category = "cht";
1443                         arr = cjkCHT;
1444                         offset = 0;//char.MaxValue - arr.Length;
1445                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1446                         v = 0x8002;
1447                         foreach (char c in s) {
1448                                 if (c < '\u4E00')
1449                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1450                                 else {
1451                                         arr [(int) c - offset] = (ushort) v++;
1452                                         if (v % 256 == 0)
1453                                                 v += 2;
1454                                 }
1455                         }
1456
1457                         // Japanese
1458                         category = "ja";
1459                         arr = cjkJA;
1460                         offset = 0;//char.MaxValue - arr.Length;
1461                         doc.Load (jaXML);
1462                         s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1463                         v = 0x8008;
1464                         foreach (char c in s) {
1465                                 if (c < '\u4E00')
1466                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1467                                 else {
1468                                         arr [(int) c - offset] = (ushort) v++;
1469                                         if (v % 256 == 0)
1470                                                 v += 2;
1471                                 }
1472                         }
1473
1474                         // Korean
1475                         // Korean weight is somewhat complex. It first shifts
1476                         // Hangul category from 52-x to 80-x (they are anyways
1477                         // computed). CJK ideographs are placed at secondary
1478                         // weight, like XX YY 01 zz 01, where XX and YY are
1479                         // corresponding "reset" value and zz is 41,43,45...
1480                         //
1481                         // Unlike chs,cht and ja, Korean value is a combined
1482                         // ushort which is computed as category
1483                         //
1484                         category = "ko";
1485                         arr = cjkKO;
1486                         offset = 0;//char.MaxValue - arr.Length;
1487                         doc.Load (koXML);
1488                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1489                                 XmlElement sc = (XmlElement) reset.NextSibling;
1490                                 // compute "category" and "level 1" for the
1491                                 // target "reset" Hangle syllable
1492                                 char rc = reset.InnerText [0];
1493                                 int ri = ((int) rc - 0xAC00) + 1;
1494                                 ushort p = (ushort)
1495                                         ((ri / 254) * 256 + (ri % 254) + 2);
1496                                 // Place the characters after the target.
1497                                 s = sc.InnerText;
1498                                 v = 0x41;
1499                                 foreach (char c in s) {
1500                                         arr [(int) c - offset] = p;
1501                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1502                                         v += 2;
1503                                 }
1504                         }
1505                 }
1506
1507                 #endregion
1508
1509                 #region Generation
1510
1511                 void FillIgnorables ()
1512                 {
1513                         for (int i = 0; i <= char.MaxValue; i++) {
1514                                 if (Char.GetUnicodeCategory ((char) i) ==
1515                                         UnicodeCategory.OtherNotAssigned)
1516                                         continue;
1517                                 if (IsIgnorable (i))
1518                                         ignorableFlags [i] |= 1;
1519                                 if (IsIgnorableSymbol (i))
1520                                         ignorableFlags [i] |= 2;
1521                                 if (IsIgnorableNonSpacing (i))
1522                                         ignorableFlags [i] |= 4;
1523                         }
1524                 }
1525
1526                 void ModifyParsedValues ()
1527                 {
1528                         // number, secondary weights
1529                         byte weight = 0x38;
1530                         int [] numarr = numberSecondaryWeightBounds;
1531                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1532                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1533                                         if (Char.IsNumber ((char) cp))
1534                                                 diacritical [cp] = weight;
1535
1536                         // Modify some decomposition equivalence
1537                         decompType [0xFE31] = 0;
1538                         decompIndex [0xFE31] = 0;
1539                         decompLength [0xFE31] = 0;
1540                         decompType [0xFE32] = 0;
1541                         decompIndex [0xFE32] = 0;
1542                         decompLength [0xFE32] = 0;
1543
1544                         // Korean parens numbers
1545                         for (int i = 0x3200; i <= 0x321C; i++)
1546                                 diacritical [i] = 0xA;
1547                         for (int i = 0x3260; i <= 0x327B; i++)
1548                                 diacritical [i] = 0xC;
1549
1550                         // Update name part of named characters
1551                         for (int i = 0; i < sortableCharNames.Count; i++) {
1552                                 DictionaryEntry de =
1553                                         (DictionaryEntry) sortableCharNames [i];
1554                                 int cp = (int) de.Key;
1555                                 string renamed = null;
1556                                 switch (cp) {
1557                                 case 0x2101: renamed = "A_1"; break;
1558                                 case 0x33C3: renamed = "A_2"; break;
1559                                 case 0x2105: renamed = "C_1"; break;
1560                                 case 0x2106: renamed = "C_2"; break;
1561                                 case 0x211E: renamed = "R1"; break;
1562                                 case 0x211F: renamed = "R2"; break;
1563                                 // Remove some of them!
1564                                 case 0x2103:
1565                                 case 0x2109:
1566                                 case 0x2116:
1567                                 case 0x2117:
1568                                 case 0x2118:
1569                                 case 0x2125:
1570                                 case 0x2127:
1571                                 case 0x2129:
1572                                 case 0x212E:
1573                                 case 0x2132:
1574                                         sortableCharNames.RemoveAt (i);
1575                                         i--;
1576                                         continue;
1577                                 }
1578                                 if (renamed != null)
1579                                         sortableCharNames [i] =
1580                                                 new DictionaryEntry (cp, renamed);
1581                         }
1582                 }
1583
1584                 void GenerateCore ()
1585                 {
1586                         UnicodeCategory uc;
1587
1588                         #region Specially ignored // 01
1589                         // This will raise "Defined" flag up.
1590                         foreach (char c in specialIgnore)
1591                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1592                         #endregion
1593
1594
1595                         #region Variable weights
1596                         // Controls : 06 03 - 06 3D
1597                         fillIndex [6] = 3;
1598                         for (int i = 0; i < 65536; i++) {
1599                                 if (IsIgnorable (i))
1600                                         continue;
1601                                 char c = (char) i;
1602                                 uc = Char.GetUnicodeCategory (c);
1603                                 // NEL is whitespace but not ignored here.
1604                                 if (uc == UnicodeCategory.Control &&
1605                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1606                                         AddCharMap (c, 6, 1);
1607                         }
1608
1609                         // Apostrophe 06 80
1610                         fillIndex [6] = 0x80;
1611                         AddCharMapGroup ('\'', 6, 1, 0);
1612                         AddCharMap ('\uFE63', 6, 1);
1613
1614                         // Hyphen/Dash : 06 81 - 06 90
1615                         for (int i = 0; i < char.MaxValue; i++) {
1616                                 if (!IsIgnorable (i) &&
1617                                         Char.GetUnicodeCategory ((char) i) ==
1618                                         UnicodeCategory.DashPunctuation) {
1619                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1620                                         if (i == 0x2011) {
1621                                                 // SPECIAL: add 2027 and 2043
1622                                                 // Maybe they are regarded the
1623                                                 // same hyphens in "central"
1624                                                 // position.
1625                                                 AddCharMap ('\u2027', 6, 1);
1626                                                 AddCharMap ('\u2043', 6, 1);
1627                                         }
1628                                 }
1629                         }
1630
1631                         // Arabic variable weight chars 06 A0 -
1632                         fillIndex [6] = 0xA0;
1633                         // vowels
1634                         for (int i = 0x64B; i <= 0x650; i++)
1635                                 AddArabicCharMap ((char) i);
1636                         // sukun
1637                         AddCharMapGroup ('\u0652', 6, 1, 0);
1638                         // shadda
1639                         AddCharMapGroup ('\u0651', 6, 1, 0);
1640                         #endregion
1641
1642
1643                         #region Nonspacing marks // 01
1644                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1645
1646                         // Combining diacritical marks: 01 DC -
1647
1648                         fillIndex [0x1] = 0x41;
1649                         for (int i = 0x030E; i <= 0x0326; i++)
1650                                 if (!IsIgnorable (i))
1651                                         AddCharMap ((char) i, 0x1, 1);
1652                         for (int i = 0x0329; i <= 0x0334; i++)
1653                                 if (!IsIgnorable (i))
1654                                         AddCharMap ((char) i, 0x1, 1);
1655                         for (int i = 0x0339; i <= 0x0341; i++)
1656                                 if (!IsIgnorable (i))
1657                                         AddCharMap ((char) i, 0x1, 1);
1658                         fillIndex [0x1] = 0x72;
1659                         for (int i = 0x0346; i <= 0x0348; i++)
1660                                 if (!IsIgnorable (i))
1661                                         AddCharMap ((char) i, 0x1, 1);
1662                         for (int i = 0x02BE; i <= 0x02BF; i++)
1663                                 if (!IsIgnorable (i))
1664                                         AddCharMap ((char) i, 0x1, 1);
1665                         for (int i = 0x02C1; i <= 0x02C5; i++)
1666                                 if (!IsIgnorable (i))
1667                                         AddCharMap ((char) i, 0x1, 1);
1668                         for (int i = 0x02CE; i <= 0x02CF; i++)
1669                                 if (!IsIgnorable (i))
1670                                         AddCharMap ((char) i, 0x1, 1);
1671                         for (int i = 0x02D1; i <= 0x02D3; i++)
1672                                 if (!IsIgnorable (i))
1673                                         AddCharMap ((char) i, 0x1, 1);
1674                         AddCharMap ('\u02DE', 0x1, 1);
1675                         for (int i = 0x02E4; i <= 0x02E9; i++)
1676                                 if (!IsIgnorable (i))
1677                                         AddCharMap ((char) i, 0x1, 1);
1678
1679                         // FIXME: needs more love here (it should eliminate
1680                         // all the hacky code above).
1681                         for (int i = 0x0300; i < 0x0370; i++)
1682                                 if (!IsIgnorable (i) && diacritical [i] != 0
1683                                         /* especiall here*/ && !map [i].Defined)
1684                                         map [i] = new CharMapEntry (
1685                                                 0x1, 0x1, diacritical [i]);
1686
1687                         fillIndex [0x1] = 0xAC;
1688                         for (int i = 0x07A6; i <= 0x07B0; i++)
1689                                 if (!IsIgnorable (i))
1690                                         AddCharMap ((char) i, 0x1, 1);
1691
1692                         fillIndex [0x1] = 0x0C;
1693                         for (int i = 0x0EC8; i <= 0x0ECD; i++)
1694                                 if (!IsIgnorable (i))
1695                                         AddCharMap ((char) i, 0x1, 1);
1696
1697                         // LAMESPEC: It should not stop at '\u20E1'. There are
1698                         // a few more characters (which, however, would result in
1699                         // an overflow of level 2 unless we start before 0xDD).
1700                         fillIndex [0x1] = 0xDC;
1701                         for (int i = 0x20d0; i <= 0x20e1; i++)
1702                                 AddCharMap ((char) i, 0x1, 1);
1703                         #endregion
1704
1705
1706                         #region Whitespaces // 07 03 -
1707                         fillIndex [0x7] = 0x2;
1708                         AddCharMap (' ', 0x7, 2);
1709                         AddCharMap ('\u00A0', 0x7, 1);
1710                         for (int i = 9; i <= 0xD; i++)
1711                                 AddCharMap ((char) i, 0x7, 1);
1712                         for (int i = 0x2000; i <= 0x200B; i++)
1713                                 AddCharMap ((char) i, 0x7, 1);
1714
1715                         fillIndex [0x7] = 0x17;
1716                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1717                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1718
1719                         // Characters which are used to represent layout control.
1720                         // LAMESPEC: Windows developers seem to have thought
1721                         // that those characters are kind of whitespaces,
1722                         // while they aren't.
1723                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1724                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1725                         #endregion
1726
1727                         // category 09 - continued symbols from 08
1728                         fillIndex [0x9] = 2;
1729                         // misc tech mark
1730                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1731                                 AddCharMap ((char) cp, 0x9, 1, 0);
1732
1733                         // arrows
1734                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1735                         foreach (DictionaryEntry de in arrowValues) {
1736                                 int idx = (int) de.Value;
1737                                 int cp = (int) de.Key;
1738                                 if (map [cp].Defined)
1739                                         continue;
1740                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1741                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1742                                 arrowLv2 [idx]++;
1743                         }
1744                         // boxes
1745                         byte [] boxLv2 = new byte [128];
1746                         for (int i = 0; i < boxLv2.Length; i++)
1747                                 boxLv2 [i] = 3;
1748                         foreach (DictionaryEntry de in boxValues) {
1749                                 int cp = (int) de.Key;
1750                                 int off = (int) de.Value;
1751                                 if (map [cp].Defined)
1752                                         continue;
1753                                 if (off < 0) {
1754                                         fillIndex [0x9] = (byte) (0xE5 + off);
1755                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1756                                 }
1757                                 else {
1758                                         fillIndex [0x9] = (byte) (0xE5 + off);
1759                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1760                                 }
1761                         }
1762                         // Some special characters (slanted)
1763                         fillIndex [0x9] = 0xF4;
1764                         AddCharMap ('\u2571', 0x9, 3);
1765                         AddCharMap ('\u2572', 0x9, 3);
1766                         AddCharMap ('\u2573', 0x9, 3);
1767
1768                         // FIXME: implement 0A
1769                         #region Symbols
1770                         fillIndex [0xA] = 2;
1771                         // byte currency symbols
1772                         for (int cp = 0; cp < 0x100; cp++) {
1773                                 uc = Char.GetUnicodeCategory ((char) cp);
1774                                 if (!IsIgnorable (cp) &&
1775                                         uc == UnicodeCategory.CurrencySymbol &&
1776                                         cp != '$' ||
1777                                         cp == 0xAC)
1778                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1779                         }
1780                         // byte other symbols
1781                         for (int cp = 0; cp < 0x100; cp++) {
1782                                 if (cp == 0xA6)
1783                                         continue; // SPECIAL: skip FIXME: why?
1784                                 uc = Char.GetUnicodeCategory ((char) cp);
1785                                 if (!IsIgnorable (cp) &&
1786                                         uc == UnicodeCategory.OtherSymbol)
1787                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1788                         }
1789
1790                         fillIndex [0xA] = 0x1C; // FIXME: it won't be needed
1791                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1792                                 AddCharMap ((char) cp, 0xA, 1, 0);
1793                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1794                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1795                                 AddCharMap ((char) cp, 0xA, 1, 0);
1796                         // Dingbats
1797                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1798                                 if (Char.IsSymbol ((char) cp))
1799                                         AddCharMap ((char) cp, 0xA, 1, 0);
1800                         // OCR
1801                         for (int i = 0x2440; i < 0x2460; i++)
1802                                 AddCharMap ((char) i, 0xA, 1, 0);
1803
1804                         #endregion
1805
1806                         #region Numbers // 0C 02 - 0C E1
1807                         fillIndex [0xC] = 2;
1808
1809                         // 9F8 : Bengali "one less than the denominator"
1810                         AddCharMap ('\u09F8', 0xC, 1);
1811
1812                         ArrayList numbers = new ArrayList ();
1813                         for (int i = 0; i < 65536; i++)
1814                                 if (!IsIgnorable (i) &&
1815                                         Char.IsNumber ((char) i) &&
1816                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1817                                         numbers.Add (i);
1818
1819                         ArrayList numberValues = new ArrayList ();
1820                         foreach (int i in numbers)
1821                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1822                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1823
1824 //foreach (DictionaryEntry de in numberValues)
1825 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1826
1827                         decimal prevValue = -1;
1828                         foreach (DictionaryEntry de in numberValues) {
1829                                 int cp = (int) de.Key;
1830                                 decimal currValue = (decimal) de.Value;
1831                                 bool addnew = false;
1832                                 if (prevValue < currValue &&
1833                                         prevValue - (int) prevValue == 0 &&
1834                                         prevValue >= 1) {
1835
1836                                         addnew = true;
1837                                         // Process Hangzhou and Roman numbers
1838
1839                                         // There are some SPECIAL cases.
1840                                         if (currValue != 4) // no increment for 4
1841                                                 fillIndex [0xC]++;
1842
1843                                         int xcp;
1844                                         if (currValue <= 10) {
1845                                                 xcp = (int) prevValue + 0x2170 - 1;
1846                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1847                                                 xcp = (int) prevValue + 0x2160 - 1;
1848                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1849                                                 fillIndex [0xC] += 2;
1850                                                 xcp = (int) prevValue + 0x3021 - 1;
1851                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1852                                                 fillIndex [0xC]++;
1853                                         }
1854                                         else if (currValue == 11)
1855                                                 fillIndex [0xC]++;
1856                                 }
1857                                 if (prevValue < currValue)
1858                                         prevValue = currValue;
1859                                 if (map [cp].Defined)
1860                                         continue;
1861                                 // HangZhou and Roman are added later
1862                                 // (code is above)
1863                                 else if (0x3021 <= cp && cp < 0x302A
1864                                         || 0x2160 <= cp && cp < 0x216A
1865                                         || 0x2170 <= cp && cp < 0x217A)
1866                                         continue;
1867
1868                                 if (cp ==  0x215B) // FIXME: why?
1869                                         fillIndex [0xC] += 2;
1870                                 else if (cp == 0x3021) // FIXME: why?
1871                                         fillIndex [0xC]++;
1872                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1873                                 if (addnew || cp <= '9') {
1874                                         int mod = (int) currValue - 1;
1875                                         int xcp;
1876                                         if (1 <= currValue && currValue <= 10) {
1877                                                 xcp = mod + 0x2776;
1878                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1879                                                 xcp = mod + 0x2780;
1880                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1881                                                 xcp = mod + 0x278A;
1882                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1883                                         }
1884                                         if (1 <= currValue && currValue <= 20) {
1885                                                 xcp = mod + 0x2460;
1886                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1887                                                 xcp = mod + 0x2474;
1888                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1889                                                 xcp = mod + 0x2488;
1890                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1891                                         }
1892                                 }
1893
1894                                 if (cp != 0x09E7 && cp != 0x09EA)
1895                                         fillIndex [0xC]++;
1896
1897                                 // Add special cases that are not regarded as
1898                                 // numbers in UnicodeCategory speak.
1899                                 if (cp == '5') {
1900                                         // TONE FIVE
1901                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1902                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1903                                 }
1904                                 else if (cp == '6') // FIXME: why?
1905                                         fillIndex [0xC]++;
1906                         }
1907
1908                         // 221E: infinity
1909                         fillIndex [0xC] = 0xFF;
1910                         AddCharMap ('\u221E', 0xC, 1);
1911                         #endregion
1912
1913                         #region Letters and NonSpacing Marks (general)
1914
1915                         // ASCII Latin alphabets
1916                         for (int i = 0; i < alphabets.Length; i++)
1917                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1918
1919
1920                         // non-ASCII Latin alphabets
1921                         // FIXME: there are no such characters that are placed
1922                         // *after* "alphabets" array items. This is nothing
1923                         // more than a hack that creates dummy weights for
1924                         // primary characters.
1925                         for (int i = 0x0080; i < 0x0300; i++) {
1926                                 if (!Char.IsLetter ((char) i))
1927                                         continue;
1928                                 // Those Latin letters which have NFKD are
1929                                 // not added as independent primary characters.
1930                                 if (decompIndex [i] != 0)
1931                                         continue;
1932                                 // SPECIAL CASES:
1933                                 // 1.some alphabets have primarily
1934                                 //   equivalent ASCII alphabets.
1935                                 // 2.some have independent primary weights,
1936                                 //   but inside a-to-z range.
1937                                 // 3.there are some expanded characters that
1938                                 //   are not part of Unicode Standard NFKD.
1939                                 // 4. some characters are letter in IsLetter
1940                                 //   but not in sortkeys (maybe unicode version
1941                                 //   difference caused it).
1942                                 switch (i) {
1943                                 // 1. skipping them does not make sense
1944 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
1945 //                              case 0x184: case 0x185: case 0x186: case 0x189:
1946 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
1947 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
1948 //                              case 0x19B: case 0x19C:
1949                                 // 2. skipping them does not make sense
1950 //                              case 0x14A: // Ng
1951 //                              case 0x14B: // ng
1952                                 // 3.
1953                                 case 0xC6: // AE
1954                                 case 0xE6: // ae
1955                                 case 0xDE: // Icelandic Thorn
1956                                 case 0xFE: // Icelandic Thorn
1957                                 case 0xDF: // German ss
1958                                 case 0xFF: // German ss
1959                                 // 4.
1960                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1961                                 // not classified yet
1962 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1963 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1964 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1965 //                              case 0x1DD:
1966                                         continue;
1967                                 }
1968                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
1969                         }
1970
1971                         // Greek and Coptic
1972                         fillIndex [0xF] = 02;
1973                         for (int i = 0x0380; i < 0x0390; i++)
1974                                 if (Char.IsLetter ((char) i))
1975                                         AddLetterMap ((char) i, 0xF, 1);
1976                         fillIndex [0xF] = 02;
1977                         for (int i = 0x0391; i < 0x03CF; i++)
1978                                 if (Char.IsLetter ((char) i))
1979                                         AddLetterMap ((char) i, 0xF, 1);
1980                         fillIndex [0xF] = 0x40;
1981                         for (int i = 0x03D0; i < 0x0400; i++)
1982                                 if (Char.IsLetter ((char) i))
1983                                         AddLetterMap ((char) i, 0xF, 1);
1984
1985                         // Cyrillic - character name order
1986                         fillIndex [0x10] = 0x6;
1987 //*
1988 for (int i = 0; i < orderedCyrillic.Length; i++)
1989 Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
1990
1991                         // table which is mostly from UCA DUCET.
1992                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1993                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
1994                                 if (!IsIgnorable ((int) c) &&
1995                                         c <= '\u045C' &&
1996                                         Char.IsLetter (c)) {
1997                                         AddLetterMap (c, 0x10, 0);
1998                                         fillIndex [0x10] += 3;
1999                                 }
2000                         }
2001                         /*
2002                         for (int i = 0x0460; i < 0x0481; i++) {
2003                                 if (Char.IsLetter ((char) i)) {
2004                                         AddLetterMap ((char) i, 0x10, 0);
2005                                         fillIndex [0x10] += 3;
2006                                 }
2007                         }
2008                         */
2009 /*
2010                         for (int i = 0x0400; i <= 0x0486; i++) {
2011                                 if (!Char.IsLetter ((char) i)) {
2012 //                                      AddCharMap ((char) i, 0x1, 1);
2013                                         continue;
2014                                 }
2015                                 if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
2016                                         Console.Error.WriteLine ("no value for {0:x04}", i);
2017                                         continue;
2018                                 }
2019                                 fillIndex [0x10] =
2020                                         (byte) cyrillicLetterPrimaryValues [i];
2021                                 AddLetterMap ((char) i, 0x10, 0);
2022                         }
2023 */
2024
2025                         // Armenian
2026                         fillIndex [0x11] = 0x3;
2027                         for (int i = 0x0531; i < 0x0586; i++)
2028                                 if (Char.IsLetter ((char) i))
2029                                         AddLetterMap ((char) i, 0x11, 1);
2030
2031                         // Hebrew
2032                         // -Letters
2033                         fillIndex [0x12] = 0x3;
2034                         for (int i = 0x05D0; i < 0x05FF; i++)
2035                                 if (Char.IsLetter ((char) i))
2036                                         AddLetterMap ((char) i, 0x12, 1);
2037                         // -Accents
2038                         fillIndex [0x1] = 0x3;
2039                         for (int i = 0x0591; i <= 0x05C2; i++)
2040                                 if (i != 0x05BE)
2041                                         AddCharMap ((char) i, 0x1, 1);
2042
2043                         // Arabic
2044                         fillIndex [0x1] = 0x8E;
2045                         fillIndex [0x13] = 0x3;
2046                         for (int i = 0x0621; i <= 0x064A; i++) {
2047                                 // Abjad
2048                                 if (Char.GetUnicodeCategory ((char) i)
2049                                         != UnicodeCategory.OtherLetter) {
2050                                         // FIXME: arabic nonspacing marks are
2051                                         // in different order.
2052                                         AddCharMap ((char) i, 0x1, 1);
2053                                         continue;
2054                                 }
2055 //                              map [i] = new CharMapEntry (0x13,
2056 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2057                                 fillIndex [0x13] =
2058                                         (byte) arabicLetterPrimaryValues [i];
2059                                 AddLetterMap ((char) i, 0x13, 0);
2060                         }
2061                         fillIndex [0x13] = 0x84;
2062                         for (int i = 0x0674; i < 0x06D6; i++)
2063                                 if (Char.IsLetter ((char) i))
2064                                         AddLetterMap ((char) i, 0x13, 1);
2065
2066                         // Devanagari
2067                         // FIXME: it does seem straight codepoint mapping.
2068                         fillIndex [0x14] = 04;
2069                         for (int i = 0x0901; i < 0x0905; i++)
2070                                 if (!IsIgnorable (i))
2071                                         AddLetterMap ((char) i, 0x14, 2);
2072                         fillIndex [0x14] = 0xB;
2073                         for (int i = 0x0905; i < 0x093A; i++) {
2074                                 if (i == 0x0928)
2075                                         AddCharMap ('\u0929', 0x14, 0, 8);
2076                                 if (i == 0x0930)
2077                                         AddCharMap ('\u0931', 0x14, 0, 8);
2078                                 if (i == 0x0933)
2079                                         AddCharMap ('\u0934', 0x14, 0, 8);
2080                                 if (Char.IsLetter ((char) i))
2081                                         AddLetterMap ((char) i, 0x14, 4);
2082                                 if (i == 0x090B)
2083                                         AddCharMap ('\u0960', 0x14, 4);
2084                                 if (i == 0x090C)
2085                                         AddCharMap ('\u0961', 0x14, 4);
2086                         }
2087                         fillIndex [0x14] = 0xDA;
2088                         for (int i = 0x093E; i < 0x0945; i++)
2089                                 if (!IsIgnorable (i))
2090                                         AddLetterMap ((char) i, 0x14, 2);
2091                         fillIndex [0x14] = 0xEC;
2092                         for (int i = 0x0945; i < 0x094F; i++)
2093                                 if (!IsIgnorable (i))
2094                                         AddLetterMap ((char) i, 0x14, 2);
2095
2096                         // Bengali
2097                         // -Letters
2098                         fillIndex [0x15] = 02;
2099                         for (int i = 0x0980; i < 0x9FF; i++) {
2100                                 if (IsIgnorable (i))
2101                                         continue;
2102                                 if (i == 0x09E0)
2103                                         fillIndex [0x15] = 0x3B;
2104                                 switch (Char.GetUnicodeCategory ((char) i)) {
2105                                 case UnicodeCategory.NonSpacingMark:
2106                                 case UnicodeCategory.DecimalDigitNumber:
2107                                 case UnicodeCategory.OtherNumber:
2108                                         continue;
2109                                 }
2110                                 AddLetterMap ((char) i, 0x15, 1);
2111                         }
2112                         // -Signs
2113                         fillIndex [0x1] = 0x3;
2114                         for (int i = 0x0981; i < 0x0A00; i++)
2115                                 if (Char.GetUnicodeCategory ((char) i) ==
2116                                         UnicodeCategory.NonSpacingMark)
2117                                         AddCharMap ((char) i, 0x1, 1);
2118
2119                         // Gurmukhi. orderedGurmukhi is from UCA
2120                         // FIXME: it does not look equivalent to UCA.
2121                         fillIndex [0x16] = 04;
2122                         fillIndex [0x1] = 3;
2123                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2124                                 char c = orderedGurmukhi [i];
2125                                 if (IsIgnorable ((int) c))
2126                                         continue;
2127                                 if (IsIgnorableNonSpacing (c)) {
2128                                         AddLetterMap (c, 0x1, 1);
2129                                         continue;
2130                                 }
2131                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2132                                         '\u0A66' <= c && c <= '\u0A71')
2133                                         continue;
2134                                 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2135                                 byte shift = 4;
2136                                 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2137                                         shift = 0;
2138                                 AddLetterMap (c, 0x16, shift);
2139                         }
2140
2141                         // Gujarati. orderedGujarati is from UCA
2142                         fillIndex [0x17] = 0x4;
2143                         // nonspacing marks
2144                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2145                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2146                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2147                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2148                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2149                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2150                         // letters go first.
2151                         for (int i = 0; i < orderedGujarati.Length; i++) {
2152                                 // SPECIAL CASE
2153                                 char c = orderedGujarati [i];
2154                                 if (Char.IsLetter (c)) {
2155                                         // SPECIAL CASES
2156                                         if (c == '\u0AB3' || c == '\u0A32')
2157                                                 continue;
2158                                         if (c == '\u0A33') {
2159                                                 AddCharMap ('\u0A32', 0x17, 0);
2160                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2161                                                 continue;
2162                                         }
2163                                         if (c == '\u0A8B')
2164                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2165                                         AddCharMap (c, 0x17, 4);
2166
2167                                         if (c == '\u0AB9')
2168                                                 AddCharMap ('\u0AB3', 0x17, 6);
2169                                 }
2170                         }
2171                         // non-letters
2172                         byte gujaratiShift = 4;
2173                         fillIndex [0x17] = 0xC0;
2174                         for (int i = 0; i < orderedGujarati.Length; i++) {
2175                                 char c = orderedGujarati [i];
2176                                 if (fillIndex [0x17] == 0xCC)
2177                                         gujaratiShift = 3;
2178                                 if (!Char.IsLetter (c)) {
2179                                         // SPECIAL CASES
2180                                         if (c == '\u0A82')
2181                                                 AddCharMap ('\u0A81', 0x17, 2);
2182                                         if (c == '\u0AC2')
2183                                                 fillIndex [0x17]++;
2184                                         AddLetterMap (c, 0x17, gujaratiShift);
2185                                 }
2186                         }
2187
2188                         // Oriya
2189                         fillIndex [0x1] = 03;
2190                         fillIndex [0x18] = 02;
2191                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2192                                 switch (Char.GetUnicodeCategory ((char) i)) {
2193                                 case UnicodeCategory.NonSpacingMark:
2194                                 case UnicodeCategory.DecimalDigitNumber:
2195                                         AddLetterMap ((char) i, 0x1, 1);
2196                                         continue;
2197                                 }
2198                                 AddLetterMap ((char) i, 0x18, 1);
2199                         }
2200
2201                         // Tamil
2202                         fillIndex [0x19] = 2;
2203                         AddCharMap ('\u0BD7', 0x19, 0);
2204                         fillIndex [0x19] = 0xA;
2205                         // vowels
2206                         for (int i = 0x0B82; i <= 0x0B94; i++)
2207                                 if (!IsIgnorable ((char) i))
2208                                         AddCharMap ((char) i, 0x19, 2);
2209                         // special vowel
2210                         fillIndex [0x19] = 0x28;
2211                         // The array for Tamil consonants is a constant.
2212                         // Windows uses almost the same sequence as TAM from
2213                         // tamilnet, but it differs slightly in Grantha.
2214                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2215                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2216                         // combining marks
2217                         fillIndex [0x19] = 0x82;
2218                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2219                                 if (Char.GetUnicodeCategory ((char) i) ==
2220                                         UnicodeCategory.SpacingCombiningMark
2221                                         || i == 0x0BC0)
2222                                         AddLetterMap ((char) i, 0x19, 2);
2223
2224                         // Telugu
2225                         fillIndex [0x1A] = 0x4;
2226                         for (int i = 0x0C00; i < 0x0C62; i++) {
2227                                 if (i == 0x0C55 || i == 0x0C56)
2228                                         continue; // skip
2229                                 AddCharMap ((char) i, 0x1A, 3);
2230                                 char supp = (i == 0x0C0B) ? '\u0C60':
2231                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2232                                 if (supp == char.MinValue)
2233                                         continue;
2234                                 AddCharMap (supp, 0x1A, 3);
2235                         }
2236
2237                         // Kannada
2238                         fillIndex [0x1B] = 4;
2239                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2240                                 if (i == 0x0CD5 || i == 0x0CD6)
2241                                         continue; // ignore
2242                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2243                                         continue; // shift after 0xCB9
2244                                 AddCharMap ((char) i, 0x1B, 3);
2245                                 if (i == 0x0CB9) {
2246                                         // SPECIAL CASES: but why?
2247                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2248                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2249                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2250                                 }
2251                                 if (i == 0x0CB2)
2252                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2253                         }
2254
2255                         // Malayalam
2256                         fillIndex [0x1C] = 2;
2257                         for (int i = 0x0D02; i < 0x0D61; i++)
2258                                 // FIXME: I avoided MSCompatUnicodeTable usage
2259                                 // here (it results in recursion). So check if
2260                                 // using NonSpacingMark makes sense or not.
2261                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2262 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2263                                         AddCharMap ((char) i, 0x1C, 1);
2264
2265                         // Thai ... note that it breaks 0x1E wall after E2B!
2266                         // Also, all Thai characters have level 2 value 3.
2267                         fillIndex [0x1E] = 2;
2268                         for (int i = 0xE40; i <= 0xE44; i++)
2269                                 AddCharMap ((char) i, 0x1E, 1, 3);
2270                         for (int i = 0xE01; i < 0xE2B; i++)
2271                                 AddCharMap ((char) i, 0x1E, 6, 3);
2272                         fillIndex [0x1F] = 5;
2273                         for (int i = 0xE2B; i < 0xE30; i++)
2274                                 AddCharMap ((char) i, 0x1F, 6, 3);
2275                         fillIndex [0x1F] = 0x1E;
2276                         for (int i = 0xE30; i < 0xE3B; i++)
2277                                 AddCharMap ((char) i, 0x1F, 1, 3);
2278                         // some Thai characters remain.
2279                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2280                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2281                         foreach (char c in specialThai)
2282                                 AddCharMap (c, 0x1F, 1);
2283
2284                         // Lao
2285                         fillIndex [0x1F] = 2;
2286                         for (int i = 0xE80; i < 0xEDF; i++)
2287                                 if (Char.IsLetter ((char) i))
2288                                         AddCharMap ((char) i, 0x1F, 1);
2289
2290                         // Georgian. orderedGeorgian is from UCA DUCET.
2291                         fillIndex [0x21] = 5;
2292                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2293                                 char c = orderedGeorgian [i];
2294                                 if (map [(int) c].Defined)
2295                                         continue;
2296                                 AddCharMap (c, 0x21, 0);
2297                                 if (c < '\u10F6')
2298                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2299                                 fillIndex [0x21] += 5;
2300                         }
2301
2302                         // Japanese Kana.
2303                         fillIndex [0x22] = 2;
2304                         int kanaOffset = 0x3041;
2305                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2306
2307                         for (int gyo = 0; gyo < 9; gyo++) {
2308                                 for (int dan = 0; dan < 5; dan++) {
2309                                         if (gyo == 7 && dan % 2 == 1) {
2310                                                 // 'ya'-gyo
2311                                                 fillIndex [0x22]++;
2312                                                 kanaOffset -= 2; // There is no space for yi and ye.
2313                                                 continue;
2314                                         }
2315                                         int cp = kanaOffset + dan * kanaLines [gyo];
2316                                         // small lines (a-gyo, ya-gyo)
2317                                         if (gyo == 0 || gyo == 7) {
2318                                                 AddKanaMap (cp, 1); // small
2319                                                 AddKanaMap (cp + 1, 1);
2320                                         }
2321                                         else
2322                                                 AddKanaMap (cp, kanaLines [gyo]);
2323                                         fillIndex [0x22]++;
2324
2325                                         if (cp == 0x30AB) {
2326                                                 // add small 'ka' (before normal one)
2327                                                 AddKanaMap (0x30F5, 1);
2328                                                 kanaOffset++;
2329                                         }
2330                                         if (cp == 0x30B1) {
2331                                                 // add small 'ke' (before normal one)
2332                                                 AddKanaMap (0x30F6, 1);
2333                                                 kanaOffset++;
2334                                         }
2335                                         if (cp == 0x3061) {
2336                                                 // add small 'Tsu' (before normal one)
2337                                                 AddKanaMap (0x3063, 1);
2338                                                 kanaOffset++;
2339                                         }
2340                                 }
2341                                 fillIndex [0x22] += 3;
2342                                 kanaOffset += 5 * kanaLines [gyo];
2343                         }
2344
2345                         // Wa-gyo is almost special, so I just manually add.
2346                         AddLetterMap ((char) 0x308E, 0x22, 0);
2347                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2348                         AddLetterMap ((char) 0x308F, 0x22, 0);
2349                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2350                         fillIndex [0x22]++;
2351                         AddLetterMap ((char) 0x3090, 0x22, 0);
2352                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2353                         fillIndex [0x22] += 2;
2354                         // no "Wu" in Japanese.
2355                         AddLetterMap ((char) 0x3091, 0x22, 0);
2356                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2357                         fillIndex [0x22]++;
2358                         AddLetterMap ((char) 0x3092, 0x22, 0);
2359                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2360                         // Nn
2361                         fillIndex [0x22] = 0x80;
2362                         AddLetterMap ((char) 0x3093, 0x22, 0);
2363                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2364
2365                         // JIS Japanese square chars.
2366                         fillIndex [0x22] = 0x97;
2367                         jisJapanese.Sort (JISComparer.Instance);
2368                         foreach (JISCharacter j in jisJapanese)
2369                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2370                                         AddCharMap ((char) j.CP, 0x22, 1);
2371                         // non-JIS Japanese square chars.
2372                         nonJisJapanese.Sort (NonJISComparer.Instance);
2373                         foreach (NonJISCharacter j in nonJisJapanese)
2374                                 AddCharMap ((char) j.CP, 0x22, 1);
2375
2376                         // Bopomofo
2377                         fillIndex [0x23] = 0x02;
2378                         for (int i = 0x3105; i <= 0x312C; i++)
2379                                 AddCharMap ((char) i, 0x23, 1);
2380
2381                         // Estrangela: ancient Syriac
2382                         fillIndex [0x24] = 0x0B;
2383                         // FIXME: is 0x71E really alternative form?
2384                         ArrayList syriacAlternatives = new ArrayList (
2385                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2386                         for (int i = 0x0710; i <= 0x072C; i++) {
2387                                 if (i == 0x0711) // NonSpacingMark
2388                                         continue;
2389                                 if (syriacAlternatives.Contains (i))
2390                                         continue;
2391                                 AddCharMap ((char) i, 0x24, 4);
2392                                 // FIXME: why?
2393                                 if (i == 0x721)
2394                                         fillIndex [0x24]++;
2395                         }
2396                         foreach (int cp in syriacAlternatives)
2397                                 map [cp] = new CharMapEntry (0x24,
2398                                         (byte) (map [cp - 1].Level1 + 2),
2399                                         0);
2400                         // FIXME: Syriac NonSpacingMark should go here.
2401
2402                         // Thaana
2403                         // FIXME: it turned out that it does not look like UCA
2404                         fillIndex [0x24] = 0x6E;
2405                         for (int i = 0; i < orderedThaana.Length; i++) {
2406                                 char c = orderedThaana [i];
2407                                 if (IsIgnorableNonSpacing ((int) c))
2408                                         continue;
2409                                 AddCharMap (c, 0x24, 2);
2410                                 if (c == '\u0782') // SPECIAL CASE: why?
2411                                         fillIndex [0x24] += 2;
2412                         }
2413                         #endregion
2414
2415                         // FIXME: Add more culture-specific letters (that are
2416                         // not supported in Windows collation) here.
2417
2418                         // Surrogate ... they are computed.
2419
2420                         #region Hangul
2421                         // Hangul.
2422                         //
2423                         // Unlike UCA, the Windows Hangul sequence mixes Jongseong
2424                         // with the Choseong sequence as well as Jungseong,
2425                         // adjusted to have the same primary weight for the
2426                         // same base character. So it is impossible to compute
2427                         // those sort keys.
2428                         //
2429                         // Here I introduce an ordered sequence of mixed
2430                         // 'commands' and 'characters' that is similar to
2431                         // LDML text:
2432                         //      - ',' increases primary weight.
2433                         //      - [A B] means a range, increasing index
2434                         //      - {A B} means a range, without increasing index
2435                         //      - '=' is no operation (it means the characters
2436                         //        of both sides have the same weight).
2437                         //      - '>' inserts a Hangul Syllable block that
2438                         //        contains 0x24C (0x15 * 0x1C) characters.
2439                         //      - '<' decreases the index
2440                         //      - '0'-'9' means skip count
2441                         //      - whitespaces are ignored
2442                         //
2443
2444                         string hangulSequence =
2445                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2446                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2447                         + "<{\u1113 \u1116}, \u3165,"
2448                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2449                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2450                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2451                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2452                                 + "[\u11D1 \u11D2], \u11B2,"
2453                                 + "[\u11D3 \u11D5], \u11B3,"
2454                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2455                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2456                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2457                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2458                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2459                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2460                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2461                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2462                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2463                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2464                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2465                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2466                                 + "\u11F1,, \u11F2,,,"
2467                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2468                         + "<\u114D, \u110D,,  >"
2469                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2470                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2471                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2472                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2473                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2474                                 + "[\u11F5 \u11F8]"
2475                         ;
2476
2477                         byte hangulCat = 0x52;
2478                         fillIndex [hangulCat] = 0x2;
2479
2480                         int syllableBlock = 0;
2481                         for (int n = 0; n < hangulSequence.Length; n++) {
2482                                 char c = hangulSequence [n];
2483                                 int start, end;
2484                                 if (Char.IsWhiteSpace (c))
2485                                         continue;
2486                                 switch (c) {
2487                                 case '=':
2488                                         break; // NOP
2489                                 case ',':
2490                                         IncrementSequentialIndex (ref hangulCat);
2491                                         break;
2492                                 case '<':
2493                                         if (fillIndex [hangulCat] == 2)
2494                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2495                                         fillIndex [hangulCat]--;
2496                                         break;
2497                                 case '>':
2498                                         IncrementSequentialIndex (ref hangulCat);
2499                                         for (int l = 0; l < 0x15; l++)
2500                                                 for (int v = 0; v < 0x1C; v++) {
2501                                                         AddCharMap (
2502                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2503                                                         IncrementSequentialIndex (ref hangulCat);
2504                                                 }
2505                                         syllableBlock++;
2506                                         break;
2507                                 case '[':
2508                                         start = hangulSequence [n + 1];
2509                                         end = hangulSequence [n + 3];
2510                                         for (int i = start; i <= end; i++) {
2511                                                 AddCharMap ((char) i, hangulCat, 0);
2512                                                 if (end > i)
2513                                                         IncrementSequentialIndex (ref hangulCat);
2514                                         }
2515                                         n += 4; // consumes 5 characters for this operation
2516                                         break;
2517                                 case '{':
2518                                         start = hangulSequence [n + 1];
2519                                         end = hangulSequence [n + 3];
2520                                         for (int i = start; i <= end; i++)
2521                                                 AddCharMap ((char) i, hangulCat, 0);
2522                                         n += 4; // consumes 5 characters for this operation
2523                                         break;
2524                                 default:
2525                                         AddCharMap (c, hangulCat, 0);
2526                                         break;
2527                                 }
2528                         }
2529
2530                         // Some Jamo NFKD.
2531                         for (int i = 0x3200; i < 0x3300; i++) {
2532                                 if (IsIgnorable (i) || map [i].Defined)
2533                                         continue;
2534                                 int ch = 0;
2535                                 // w/ bracket
2536                                 if (decompLength [i] == 4 &&
2537                                         decompValues [decompIndex [i]] == '(')
2538                                         ch = decompIndex [i] + 1;
2539                                 // circled
2540                                 else if (decompLength [i] == 2 &&
2541                                         decompValues [decompIndex [i] + 1] == '\u1161')
2542                                         ch = decompIndex [i];
2543                                 else if (decompLength [i] == 1)
2544                                         ch = decompIndex [i];
2545                                 else
2546                                         continue;
2547                                 ch = decompValues [ch];
2548                                 if (ch < 0x1100 || 0x1200 < ch &&
2549                                         ch < 0xAC00 || 0xD800 < ch)
2550                                         continue;
2551
2552                                 // SPECIAL CASE ?
2553                                 int offset = i < 0x3260 ? 1 : 0;
2554                                 if (0x326E <= i && i <= 0x3273)
2555                                         offset = 1;
2556
2557                                 map [i] = new CharMapEntry (map [ch].Category,
2558                                         (byte) (map [ch].Level1 + offset),
2559                                         map [ch].Level2);
2560 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2561                         }
2562
2563
2564                         #endregion
2565
2566                         // Letterlike characters and CJK compatibility square
2567                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2568                         int [] counts = new int ['Z' - 'A' + 1];
2569                         char [] namedChars = new char [sortableCharNames.Count];
2570                         int nCharNames = 0;
2571                         foreach (DictionaryEntry de in sortableCharNames) {
2572                                 counts [((string) de.Value) [0] - 'A']++;
2573                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2574                         }
2575                         nCharNames = 0; // reset
2576                         for (int a = 0; a < counts.Length; a++) {
2577                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2578                                 for (int i = 0; i < counts [a]; i++)
2579 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2580                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2581                         }
2582
2583                         // CJK unified ideograph.
2584                         byte cjkCat = 0x9E;
2585                         fillIndex [cjkCat] = 0x2;
2586                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2587                                 if (!IsIgnorable (cp))
2588                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2589                         // CJK Extensions go here.
2590                         // LAMESPEC: With this Windows style CJK layout, it is
2591                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2592                         // 0x9FBB can never be added w/o breaking compat.
2593                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2594                                 if (!IsIgnorable (cp))
2595                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2596
2597                         // PrivateUse ... computed.
2598                         // remaining Surrogate ... computed.
2599
2600                         #region Special "biggest" area (FF FF)
2601                         fillIndex [0xFF] = 0xFF;
2602                         char [] specialBiggest = new char [] {
2603                                 '\u3005', '\u3031', '\u3032', '\u309D',
2604                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2605                                 '\uFE7C', '\uFE7D', '\uFF70'};
2606                         foreach (char c in specialBiggest)
2607                                 AddCharMap (c, 0xFF, 0);
2608                         #endregion
2609
2610                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2611                         // non-alphanumeric ASCII except for: + - < = > '
2612                         for (int i = 0x21; i < 0x7F; i++) {
2613                                 if (Char.IsLetterOrDigit ((char) i)
2614                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2615                                         continue; // they are not added here.
2616                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2617                                 // Insert 3001 after ',' and 3002 after '.'
2618                                 if (i == 0x2C)
2619                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2620                                 else if (i == 0x2E) {
2621                                         fillIndex [0x7]--;
2622                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2623                                 }
2624                                 else if (i == 0x3A)
2625                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2626                         }
2627                         #endregion
2628
2629                         #region 07 - Punctuations and something else
2630                         for (int i = 0xA0; i < char.MaxValue; i++) {
2631                                 if (IsIgnorable (i))
2632                                         continue;
2633
2634                                 // FIXME: strictly speaking these resets should not
2635                                 // be done, but they are here as a shortcut.
2636                                 if (i == 0x0700)
2637                                         fillIndex [0x7] = 0xE2;
2638                                 if (i == 0x2016)
2639                                         fillIndex [0x7] = 0x77;
2640
2641                                 // SPECIAL CASES:
2642                                 switch (i) {
2643                                 case 0xAB: // 08
2644                                 case 0xB7: // 0A
2645                                 case 0xBB: // 08
2646                                 case 0x2329: // 09
2647                                 case 0x232A: // 09
2648                                         continue;
2649                                 }
2650
2651                                 switch (Char.GetUnicodeCategory ((char) i)) {
2652                                 case UnicodeCategory.OtherPunctuation:
2653                                 case UnicodeCategory.ClosePunctuation:
2654                                 case UnicodeCategory.OpenPunctuation:
2655                                 case UnicodeCategory.InitialQuotePunctuation:
2656                                 case UnicodeCategory.FinalQuotePunctuation:
2657                                 case UnicodeCategory.ModifierSymbol:
2658                                         // SPECIAL CASES: // 0xA
2659                                         if (0x2020 <= i && i <= 0x2042)
2660                                                 continue;
2661                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2662                                         break;
2663                                 default:
2664                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2665                                                 goto case UnicodeCategory.OtherPunctuation;
2666                                         break;
2667                                 }
2668                         }
2669                         // Control pictures
2670                         // FIXME: it should not be necessary to reset level 1,
2671                         // but it is done here as a shortcut.
2672                         fillIndex [0x7] = 0xB6;
2673                         for (int i = 0x2400; i <= 0x2421; i++)
2674                                 AddCharMap ((char) i, 0x7, 1, 0);
2675                         #endregion
2676
2677                         // FIXME: for 07 xx we need more love.
2678
2679                         // Characters w/ diacritical marks (NFKD)
2680                         for (int i = 0; i <= char.MaxValue; i++) {
2681                                 if (map [i].Defined || IsIgnorable (i))
2682                                         continue;
2683                                 if (decompIndex [i] == 0)
2684                                         continue;
2685
2686                                 int start = decompIndex [i];
2687                                 int primaryChar = decompValues [start];
2688                                 int secondary = 0;
2689                                 bool skip = false;
2690                                 int length = decompLength [i];
2691                                 // special processing for parenthesized ones.
2692                                 if (length == 3 &&
2693                                         decompValues [start] == '(' &&
2694                                         decompValues [start + 2] == ')') {
2695                                         primaryChar = decompValues [start + 1];
2696                                         length = 1;
2697                                 }
2698
2699                                 if (map [primaryChar].Level1 == 0)
2700                                         continue;
2701
2702                                 for (int l = 1; l < length; l++) {
2703                                         int c = decompValues [start + l];
2704                                         if (map [c].Level1 != 0)
2705                                                 skip = true;
2706                                         secondary += diacritical [c];
2707                                 }
2708                                 if (skip)
2709                                         continue;
2710                                 map [i] = new CharMapEntry (
2711                                         map [primaryChar].Category,
2712                                         map [primaryChar].Level1,
2713                                         (byte) secondary);
2714
2715                         }
2716
2717                         // category 08 - symbols
2718                         fillIndex [0x8] = 2;
2719                         // Here the Windows mapping is not straightforward. It is
2720                         // not based on computation but seems to be manually sorted.
2721                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
2722                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2723                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2724                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2725                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2726                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2727                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2728                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2729                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2730                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2731                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2732                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2733                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2734
2735                         for (int cp = 0; cp < 0x2300; cp++) {
2736                                 if (cp == 0xAC) // SPECIAL CASE: skip
2737                                         continue;
2738                                 if (cp == 0x200) {
2739                                         cp = 0x2200; // skip to 2200
2740                                         fillIndex [0x8] = 0x21;
2741                                 }
2742                                 if (cp == 0x2295)
2743                                         fillIndex [0x8] = 0x3;
2744                                 if (!map [cp].Defined &&
2745 //                                      Char.GetUnicodeCategory ((char) cp) ==
2746 //                                      UnicodeCategory.MathSymbol)
2747                                         Char.IsSymbol ((char) cp))
2748                                         AddCharMapGroup ((char) cp, 0x8, 1, 0);
2749                                 // SPECIAL CASES: no idea why Windows sorts as such
2750                                 switch (cp) {
2751                                 case 0x3E:
2752                                         AddCharMap ('\u227B', 0x8, 1, 0);
2753                                         AddCharMap ('\u22B1', 0x8, 1, 0);
2754                                         break;
2755                                 case 0xB1:
2756                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2757                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
2758                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2759                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
2760                                         break;
2761                                 case 0xF7:
2762                                         AddCharMap ('\u01C0', 0x8, 1, 0);
2763                                         AddCharMap ('\u01C1', 0x8, 1, 0);
2764                                         AddCharMap ('\u01C2', 0x8, 1, 0);
2765                                         break;
2766                                 }
2767                         }
2768
2769                         #region Level2 adjustment
2770                         // Arabic Hamzah
2771                         diacritical [0x624] = 0x5;
2772                         diacritical [0x626] = 0x7;
2773                         diacritical [0x622] = 0x9;
2774                         diacritical [0x623] = 0xA;
2775                         diacritical [0x625] = 0xB;
2776                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2777                         diacritical [0x64A] = 0x7; // Yaa'
2778
2779                         for (int i = 0; i < char.MaxValue; i++) {
2780                                 byte mod = 0;
2781                                 byte cat = map [i].Category;
2782                                 switch (cat) {
2783                                 case 0xE: // Latin diacritics
2784                                 case 0x22: // Japanese: circled characters
2785                                         mod = diacritical [i];
2786                                         break;
2787                                 case 0x13: // Arabic
2788                                         if (diacritical [i] == 0 && i >= 0xFE8D)
2789                                                 mod = 0x8; // default for arabic
2790                                         break;
2791                                 }
2792                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2793                                         mod = diacritical [i];
2794                                 if (mod > 0)
2795                                         map [i] = new CharMapEntry (
2796                                                 cat, map [i].Level1, mod);
2797                         }
2798                         #endregion
2799
2800                         // FIXME: this is hack but those NonSpacingMark
2801                         // characters and still undefined are likely to
2802                         // be nonspacing.
2803                         for (int i = 0; i < char.MaxValue; i++)
2804                                 if (!map [i].Defined &&
2805                                         !IsIgnorable (i) &&
2806                                         Char.GetUnicodeCategory ((char) i) ==
2807                                         UnicodeCategory.NonSpacingMark)
2808                                         AddCharMap ((char) i, 1, 1);
2809
2810                         // FIXME: this is hack but those Symbol characters
2811                         // are likely to fall into 0xA category.
2812                         for (int i = 0; i < char.MaxValue; i++)
2813                                 if (!map [i].Defined &&
2814                                         !IsIgnorable (i) &&
2815                                         Char.IsSymbol ((char) i))
2816                                         AddCharMap ((char) i, 0xA, 1);
2817                 }
2818
// Advances the fill index of the category referenced by hangulCat by one.
// When the index wraps around to 0 (presumably a byte stepping past 0xFF;
// confirm against the declaration of fillIndex) the category is treated as
// full: hangulCat moves on to the next category and the fill index of that
// category restarts at 0x2.
private void IncrementSequentialIndex (ref byte hangulCat)
{
        // The wrap-around is intentional. Stating it explicitly keeps the
        // generator working when it is built with overflow checking
        // enabled, where the increment would throw instead of wrapping.
        unchecked {
                fillIndex [hangulCat]++;
                if (fillIndex [hangulCat] != 0)
                        return;

                // overflown: continue in the next category.
                hangulCat++;
                fillIndex [hangulCat] = 0x2;
        }
}
2827
// Resets the fill index of category to alphaWeight and calls
// AddLetterMap() with an update count of 0, first for c and then for
// every code point listed for c in latinMap (if there is such a list).
private void AddAlphaMap (char c, byte category, byte alphaWeight)
{
        const byte noUpdate = 0;

        fillIndex [category] = alphaWeight;
        AddLetterMap (c, category, noUpdate);

        ArrayList related = latinMap [c] as ArrayList;
        if (related != null) {
                foreach (int codePoint in related) {
                        char relatedChar = (char) codePoint;
                        AddLetterMap (relatedChar, category, noUpdate);
                }
        }
}
2841
// Maps `voices` consecutive characters starting at i, and for each of them
// the character 0x60 above it, into category 0x22 with an update count
// of 0. The first character of the run gets level 2 weight 0; the one at
// offset n (n > 0) gets n + 2.
private void AddKanaMap (int i, byte voices)
{
        const byte kanaCategory = 0x22;
        const int katakanaOffset = 0x60;

        byte offset = 0;
        while (offset < voices) {
                char hiragana = (char) (i + offset);
                char katakana = (char) (hiragana + katakanaOffset);

                byte level2 = 0;
                if (offset > 0)
                        level2 = (byte) (offset + 2);

                // Hiragana
                AddLetterMapCore (hiragana, kanaCategory, 0, level2);
                // Katakana
                AddLetterMapCore (katakana, kanaCategory, 0, level2);

                offset++;
        }
}
2853
// Shorthand for AddLetterMapCore() with the level 2 weight fixed to 0.
private void AddLetterMap (char c, byte category, byte updateCount)
{
        const byte noLevel2 = 0;

        AddLetterMapCore (c, category, updateCount, noLevel2);
}
2858
// Maps a letter and the characters derived from it, in this order:
//      1. the character ToSmallForm() returns for c, when it differs
//         from c (added as a group, using updateCount);
//      2. the invariant-culture lowercase of c, when it differs from c
//         and is not mapped yet (recursively, with update count 0);
//      3. c itself (added as a group, with update count 0).
// The fill index of category is advanced by updateCount only when step 3
// actually ran, i.e. c was neither ignorable nor already defined.
private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
{
        // <small> updates index
        char smallForm = ToSmallForm (c);
        if (smallForm != c)
                AddCharMapGroup (smallForm, category, updateCount, level2);

        char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
        if (lower != c && !map [(int) lower].Defined)
                AddLetterMapCore (lower, category, 0, level2);

        // Nothing to add (and no index update) for ignorable or
        // already mapped characters.
        if (IsIgnorable ((int) c) || map [(int) c].Defined)
                return;

        AddCharMapGroup (c, category, 0, level2);
        fillIndex [category] += updateCount;
}
2877
// Shorthand for the four-argument AddCharMap() with alt fixed to 0.
// Returns what that overload returns.
private bool AddCharMap (char c, byte category, byte increment)
{
        const byte noAlt = 0;

        bool added = AddCharMap (c, category, increment, noAlt);
        return added;
}
2882
// Creates the map entry for c from the current fill index of category and
// then advances that index by increment.
//
// For category 1 the fill index is stored as the third CharMapEntry
// argument and alt as the second; for every other category it is the
// other way round.
//
// Returns false, changing nothing, when c is ignorable or already
// defined; true otherwise.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
        int codePoint = (int) c;

        if (IsIgnorable (codePoint))
                return false; // do nothing
        if (map [codePoint].Defined)
                return false; // do nothing

        if (category == 1)
                map [codePoint] = new CharMapEntry (category,
                        alt, fillIndex [category]);
        else
                map [codePoint] = new CharMapEntry (category,
                        fillIndex [category], alt);

        fillIndex [category] += increment;
        return true;
}
2893
// Maps c into category and then repeats the same steps for the character
// ToFullWidthTail() yields, for as long as that differs from the
// character just handled. At each step the character returned by
// ToSmallFormTail() is added first when it differs. Every addition uses
// updateCount as increment and 0 as alt.
private void AddCharMapGroupTail (char c, byte category, byte updateCount)
{
        char current = c;

        for (;;) {
                char small = ToSmallFormTail (current);
                if (small != current)
                        AddCharMap (small, category, updateCount, 0);

                // itself
                AddCharMap (current, category, updateCount, 0);

                // <full>
                char full = ToFullWidthTail (current);
                if (full == current)
                        return;
                current = full;
        }
}
2906
//
// Adds a character and its variants (as recorded for it in nfkdMap) to
// the table in the order below. "+" marks a point where the fill index
// of the category is increased by updateCount:
//      (<small> +)
//      itself
//      <fraction>
//      <full> | <super> | <sub>
//      <circle> | <wide> (| <narrow>)
//      +
//      (vertical +)
//
// level2 is fixed (does not increase). The <small> variant is added
// through the three-argument AddCharMap(), i.e. with alt 0 rather than
// level2.
//
int [] sameWeightItems = new int [] {
        DecompositionFraction,
        DecompositionFull,
        DecompositionSuper,
        DecompositionSub,
        DecompositionCircle,
        DecompositionWide,
        DecompositionNarrow,
        };

// Does nothing when c is already defined in the map.
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
        int codePoint = (int) c;
        if (map [codePoint].Defined)
                return;

        Hashtable nfkd = (Hashtable) nfkdMap [codePoint];

        // char.MinValue doubles as "no such variant".
        object smallBoxed = nfkd == null ? null : nfkd [(byte) DecompositionSmall];
        object verticalBoxed = nfkd == null ? null : nfkd [(byte) DecompositionVertical];
        char small = smallBoxed == null ? char.MinValue : (char) ((int) smallBoxed);
        char vertical = verticalBoxed == null ? char.MinValue : (char) ((int) verticalBoxed);

        // <small> updates index
        if (small != char.MinValue)
                AddCharMap (small, category, updateCount);

        // itself
        AddCharMap (c, category, 0, level2);

        // variants that share the weight of c
        if (nfkd != null) {
                for (int n = 0; n < sameWeightItems.Length; n++) {
                        object variant = nfkd [(byte) sameWeightItems [n]];
                        if (variant == null)
                                continue;
                        AddCharMap ((char) ((int) variant), category, 0, level2);
                }
        }

        // update index here.
        fillIndex [category] += updateCount;

        if (vertical != char.MinValue)
                AddCharMap (vertical, category, updateCount, level2);
}
2966
// Maps c at the current position of category and steps to the next
// sequential position; category is updated when the step moves on to the
// following category.
private void AddCharMapCJK (char c, ref byte category)
{
        const byte skippedCategory = 0x9E;
        const byte skippedIndex = 0xF9;

        AddCharMap (c, category, 0, 0);
        IncrementSequentialIndex (ref category);

        // Special. I wonder why but Windows skips 9E F9.
        bool onSkippedSlot = category == skippedCategory &&
                fillIndex [category] == skippedIndex;
        if (onSkippedSlot)
                IncrementSequentialIndex (ref category);
}
2976
// Maps the CJK character c followed by its variants, each at its own
// sequential position (see AddCharMapCJK()).
//
// A few characters of the 3200-32B0 area are hard-wired directly after
// specific ideographs; the remaining variants come from the nfkdMap entry
// of c, taken in ascending key order 0..0x12.
private void AddCharMapGroupCJK (char c, ref byte category)
{
        AddCharMapCJK (c, ref category);

        // LAMESPEC: on Windows some of CJK characters in 3200-32B0 are
        // incorrectly mapped. They mix Chinese and Japanese Kanji when
        // ordering those characters, so their positions are forced here.
        switch (c) {
        case '\u5B78':
                AddCharMapCJK ('\u32AB', ref category);
                AddCharMapCJK ('\u323B', ref category);
                break;
        case '\u52DE':
                AddCharMapCJK ('\u3298', ref category);
                AddCharMapCJK ('\u3238', ref category);
                break;
        case '\u5BEB':
                AddCharMapCJK ('\u32A2', ref category);
                break;
        case '\u91AB':
                // Especially this mapping order totally does
                // not make sense to me.
                AddCharMapCJK ('\u32A9', ref category);
                break;
        }

        Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
        if (nfkd == null)
                return;

        for (byte key = 0; key <= 0x12; key++) {
                object boxed = nfkd [key];
                if (boxed == null)
                        continue;
                int variant = (int) boxed;

                // Special: they are ignored in this area.
                // FIXME: check if it is sane
                bool inIgnoredRange = 0xF900 <= variant && variant <= 0xFAD9;

                // The characters placed by the switch above must not be
                // added a second time from here.
                bool hardWired =
                        variant == 0x32A2 || variant == 0x3298 ||
                        variant == 0x3238 || variant == 0x32A9 ||
                        variant == 0x323B || variant == 0x32AB;

                if (!inIgnoredRange && !hardWired)
                        AddCharMapCJK ((char) variant, ref category);
        }
}
3023
// For now it is only for 0x7 category.
//
// Adds, in this order and each with updateCount as increment:
//      the <small> variant of c (except U+2024),
//      c itself,
//      every character whose decomposition is the single character c and
//      whose decomposition type is DecompositionCompat,
//      the vertical variant of c (except U+FE33 and U+FE34).
// Unlike AddCharMapGroup() this does not return early when c is already
// defined, and it does not touch fillIndex directly.
private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
{
        int target = (int) c;

        // char.MinValue doubles as "no such variant".
        char small = char.MinValue;
        char vertical = char.MinValue;

        Hashtable nfkd = (Hashtable) nfkdMap [target];
        if (nfkd != null) {
                object boxed = nfkd [(byte) DecompositionSmall];
                if (boxed != null)
                        small = (char) ((int) boxed);
                boxed = nfkd [(byte) DecompositionVertical];
                if (boxed != null)
                        vertical = (char) ((int) boxed);
        }

        // <small> updates index
        // SPECIAL CASE excluded (FIXME: why?)
        if (small != char.MinValue && small != '\u2024')
                AddCharMap (small, category, updateCount);

        // itself
        AddCharMap (c, category, updateCount, level2);

        // Since nfkdMap is problematic to have two or more
        // NFKD to an identical character, here I iterate all.
        for (int candidate = 0; candidate < char.MaxValue; candidate++) {
                if (decompLength [candidate] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [candidate]]) != target)
                        continue;
                if (decompType [candidate] == DecompositionCompat)
                        AddCharMap ((char) candidate, category, updateCount, level2);
        }

        // SPECIAL CASE excluded (FIXME: why?)
        if (vertical != char.MinValue &&
                vertical != '\uFE33' && vertical != '\uFE34')
                AddCharMap (vertical, category, updateCount, level2);
}
3066
// Maps c and every character whose decomposition ends with c at the
// current fill index of category 6 (level 2 weight 0), then advances that
// fill index by one.
private void AddArabicCharMap (char c)
{
        const byte arabicCategory = 6;
        const byte noLevel2 = 0;
        const byte step = 1;

        int target = (int) c;

        // itself
        AddCharMap (c, arabicCategory, 0, noLevel2);

        // Since nfkdMap is problematic to have two or more
        // NFKD to an identical character, here I iterate all.
        for (int candidate = 0; candidate < char.MaxValue; candidate++) {
                int length = decompLength [candidate];
                if (length == 0)
                        continue;

                // last element of the decomposition
                int last = decompIndex [candidate] + length - 1;
                if ((int) (decompValues [last]) != target)
                        continue;

                AddCharMap ((char) candidate, arabicCategory, 0, noLevel2);
        }

        fillIndex [arabicCategory] += step;
}
3088
// Returns the first character of the decomposition of c when its
// decomposition type is DecompositionFull; otherwise returns c unchanged.
char ToFullWidth (char c)
{
        const bool useTail = false;

        return ToDecomposed (c, DecompositionFull, useTail);
}
3093
// Returns the last character of the decomposition of c when its
// decomposition type is DecompositionFull; otherwise returns c unchanged.
char ToFullWidthTail (char c)
{
        const bool useTail = true;

        return ToDecomposed (c, DecompositionFull, useTail);
}
3098
// Returns the first character of the decomposition of c when its
// decomposition type is DecompositionSmall; otherwise returns c unchanged.
char ToSmallForm (char c)
{
        const bool useTail = false;

        return ToDecomposed (c, DecompositionSmall, useTail);
}
3103
// Returns the last character of the decomposition of c when its
// decomposition type is DecompositionSmall; otherwise returns c unchanged.
char ToSmallFormTail (char c)
{
        const bool useTail = true;

        return ToDecomposed (c, DecompositionSmall, useTail);
}
3108
// Looks up the decomposition of c, but only when its decomposition type
// equals d; any other character is returned as is.
//      tail == false: the first element of the decomposition is returned.
//      tail == true:  the last element of the decomposition is returned.
char ToDecomposed (char c, byte d, bool tail)
{
        int codePoint = (int) c;

        if (decompType [codePoint] == d) {
                int offset = tail ? decompLength [codePoint] - 1 : 0;
                return (char) decompValues [decompIndex [codePoint] + offset];
        }
        return c;
}
3118
// Tells whether jisJapanese contains an entry whose CP equals cp.
bool ExistsJIS (int cp)
{
        bool found = false;

        foreach (JISCharacter entry in jisJapanese) {
                if (entry.CP != cp)
                        continue;
                found = true;
                break;
        }
        return found;
}
3126
3127                 #endregion
3128
3129                 #region Level 3 properties (Case/Width)
3130
// Returns the level 3 weight for c: the value computed by
// ComputeLevel3WeightRaw() plus 2, except that a raw weight of 0 stays 0.
private byte ComputeLevel3Weight (char c)
{
        byte raw = ComputeLevel3WeightRaw (c);

        if (raw == 0)
                return raw;
        return (byte) (raw + 2);
}
3136
// Computes the raw level 3 (case/width) weight of c. ComputeLevel3Weight()
// adds 2 to every non-zero result.
//
// Characters in the ranges and special cases tested first get a fixed
// weight and return immediately; the tests are ordered, so the first
// match wins. Every other character gets a bit mask built from:
//      - a few hard-wired characters (8 or 0xC),
//      - the decomposition type value itself for <wide>, <sub>, <super>,
//      - 8 for small capitals,
//      - 0x10 for uppercase characters.
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
        // CJK compat
        if ('\u3192' <= c && c <= '\u319F')
                return 0;
        // Japanese reading marks
        if (c == '\u3001' || c == '\u3002')
                return 2;
        // Korean
        if ('\u11A8' <= c && c <= '\u11F9')
                return 2;
        if ('\uFFA0' <= c && c <= '\uFFDC')
                return 4;
        if ('\u3130' <= c && c <= '\u3164')
                return 5;
        if ('\u3165' <= c && c <= '\u318E')
                return 4;
        // Georgian Capital letters
        if ('\u10A0' <= c && c <= '\u10C5')
                return 0x10;
        // numbers
        if ('\u2776' <= c && c <= '\u277F')
                return 4;
        if ('\u2780' <= c && c <= '\u2789')
                return 8;
        // This range used to be written as U+2776..U+2793, overlapping the
        // two tests above; only U+278A..U+2793 could ever reach it, so the
        // bound now says so. Results are unchanged.
        if ('\u278A' <= c && c <= '\u2793')
                return 0xC;
        if ('\u2160' <= c && c <= '\u216F')
                return 0x10;
        if ('\u2181' <= c && c <= '\u2182')
                return 0x18;
        // Hebrew letter-like symbols (ALEF SYMBOL .. DALET SYMBOL)
        if ('\u2135' <= c && c <= '\u2138')
                return 4;
        // Arabic presentation forms
        if ('\uFE80' <= c && c < '\uFF00') {
                // 2(Isolated)/8(Final)/0x18(Medial)
                // Any other decomposition type falls through to the
                // generic computation below.
                switch (decompType [(int) c]) {
                case DecompositionIsolated:
                        return 2;
                case DecompositionFinal:
                        return 8;
                case DecompositionMedial:
                        return 0x18;
                }
        }

        // actually I dunno the reason why they have weights.
        switch (c) {
        case '\u01BC':
                return 0x10;
        case '\u06A9':
                return 0x20;
        case '\u06AA':
                return 0x28;
        }

        // From here on the weight is accumulated as a bit mask.
        byte ret = 0;
        switch (c) {
        case '\u03C2':
        case '\u2104':
        case '\u212B':
                ret |= 8;
                break;
        case '\uFE42':
                ret |= 0xC;
                break;
        }

        // misc
        switch (decompType [(int) c]) {
        case DecompositionWide: // <wide>
        case DecompositionSub: // <sub>
        case DecompositionSuper: // <super>
                // the decomposition type value itself is the weight bit(s)
                ret |= decompType [(int) c];
                break;
        }
        if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
                ret |= 8;
        if (isUppercase [(int) c]) // DerivedCoreProperties
                ret |= 0x10;

        return ret;
}
3220
3221                 #endregion
3222
3223                 #region IsIgnorable
3224 /*
3225                 static bool IsIgnorable (int i)
3226                 {
3227                         if (unicodeAge [i] >= 3.1)
3228                                 return true;
3229                         switch (char.GetUnicodeCategory ((char) i)) {
3230                         case UnicodeCategory.OtherNotAssigned:
3231                         case UnicodeCategory.Format:
3232                                 return true;
3233                         }
3234                         return false;
3235                 }
3236 */
3237
3238                 // FIXME: In the future use DerivedAge.txt to examine character
3239                 // versions and set those ones that have higher version than
3240                 // 1.0 as ignorable.
3241                 static bool IsIgnorable (int i)
3242                 {
3243                         switch (i) {
3244                         case 0:
3245                         // I guess, those characters are added between
3246                         // Unicode 1.0 (LCMapString) and Unicode 3.1
3247                         // (UnicodeCategory), so they used to be
3248                         // something like OtherNotAssigned as of Unicode 1.1.
3249                         case 0x2df: case 0x387:
3250                         case 0x3d7: case 0x3d8: case 0x3d9:
3251                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3252                         case 0x400: case 0x40d: case 0x450: case 0x45d:
3253                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
3254                         case 0x653: case 0x654: case 0x655: case 0x66d:
3255                         case 0xb56:
3256                         case 0x1e9b: case 0x202f: case 0x20ad:
3257                         case 0x20ae: case 0x20af:
3258                         case 0x20e2: case 0x20e3:
3259                         case 0x2139: case 0x213a: case 0x2183:
3260                         case 0x2425: case 0x2426: case 0x2619:
3261                         case 0x2670: case 0x2671: case 0x3007:
3262                         case 0x3190: case 0x3191:
3263                         case 0xfffc: case 0xfffd:
3264                                 return true;
3265                         // exceptional characters filtered by the
3266                         // following conditions. Originally those exceptional
3267                         // ranges are incorrect (they should not be ignored)
3268                         // and most of those characters are unfortunately in
3269                         // those ranges.
3270                         case 0x4d8: case 0x4d9:
3271                         case 0x4e8: case 0x4e9:
3272                         case 0x70F:
3273                         case 0x3036: case 0x303f:
3274                         case 0x337b: case 0xfb1e:
3275                                 return false;
3276                         }
3277
3278                         if (
3279                                 // The whole Sinhala characters.
3280                                 0x0D82 <= i && i <= 0x0DF4
3281                                 // The whole Tibetan characters.
3282                                 || 0x0F00 <= i && i <= 0x0FD1
3283                                 // The whole Myanmar characters.
3284                                 || 0x1000 <= i && i <= 0x1059
3285                                 // The whole Etiopic, Cherokee,
3286                                 // Canadian Syllablic, Ogham, Runic,
3287                                 // Tagalog, Hanunoo, Philippine,
3288                                 // Buhid, Tagbanwa, Khmer and Mongorian
3289                                 // characters.
3290                                 || 0x1200 <= i && i <= 0x1DFF
3291                                 // Greek extension characters.
3292                                 || 0x1F00 <= i && i <= 0x1FFF
3293                                 // The whole Braille characters.
3294                                 || 0x2800 <= i && i <= 0x28FF
3295                                 // CJK radical characters.
3296                                 || 0x2E80 <= i && i <= 0x2EF3
3297                                 // Kangxi radical characters.
3298                                 || 0x2F00 <= i && i <= 0x2FD5
3299                                 // Ideographic description characters.
3300                                 || 0x2FF0 <= i && i <= 0x2FFB
3301                                 // Bopomofo letter and final
3302                                 || 0x31A0 <= i && i <= 0x31B7
3303                                 // White square with quadrant characters.
3304                                 || 0x25F0 <= i && i <= 0x25F7
3305                                 // Ideographic telegraph symbols.
3306                                 || 0x32C0 <= i && i <= 0x32CB
3307                                 || 0x3358 <= i && i <= 0x3370
3308                                 || 0x33E0 <= i && i <= 0x33FF
3309                                 // The whole YI characters.
3310                                 || 0xA000 <= i && i <= 0xA48C
3311                                 || 0xA490 <= i && i <= 0xA4C6
3312                                 // American small ligatures
3313                                 || 0xFB13 <= i && i <= 0xFB17
3314                                 // hebrew, arabic, variation selector.
3315                                 || 0xFB1D <= i && i <= 0xFE2F
3316                                 // Arabic ligatures.
3317                                 || 0xFEF5 <= i && i <= 0xFEFC
3318                                 // FIXME: why are they excluded?
3319                                 || 0x01F6 <= i && i <= 0x01F9
3320                                 || 0x0218 <= i && i <= 0x0233
3321                                 || 0x02A9 <= i && i <= 0x02AD
3322                                 || 0x02EA <= i && i <= 0x02EE
3323                                 || 0x0349 <= i && i <= 0x036F
3324                                 || 0x0488 <= i && i <= 0x048F
3325                                 || 0x04D0 <= i && i <= 0x04FF
3326                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3327                                 || 0x06D6 <= i && i <= 0x06ED
3328                                 || 0x06FA <= i && i <= 0x06FE
3329                                 || 0x2048 <= i && i <= 0x204D
3330                                 || 0x20e4 <= i && i <= 0x20ea
3331                                 || 0x213C <= i && i <= 0x214B
3332                                 || 0x21EB <= i && i <= 0x21FF
3333                                 || 0x22F2 <= i && i <= 0x22FF
3334                                 || 0x237B <= i && i <= 0x239A
3335                                 || 0x239B <= i && i <= 0x23CF
3336                                 || 0x24EB <= i && i <= 0x24FF
3337                                 || 0x2596 <= i && i <= 0x259F
3338                                 || 0x25F8 <= i && i <= 0x25FF
3339                                 || 0x2672 <= i && i <= 0x2689
3340                                 || 0x2768 <= i && i <= 0x2775
3341                                 || 0x27d0 <= i && i <= 0x27ff
3342                                 || 0x2900 <= i && i <= 0x2aff
3343                                 || 0x3033 <= i && i <= 0x303F
3344                                 || 0x31F0 <= i && i <= 0x31FF
3345                                 || 0x3250 <= i && i <= 0x325F
3346                                 || 0x32B1 <= i && i <= 0x32BF
3347                                 || 0x3371 <= i && i <= 0x337B
3348                                 || 0xFA30 <= i && i <= 0xFA6A
3349                         )
3350                                 return true;
3351
3352                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3353                         switch (uc) {
3354                         case UnicodeCategory.PrivateUse:
3355                         case UnicodeCategory.Surrogate:
3356                                 return false;
3357                         // ignored by nature
3358                         case UnicodeCategory.Format:
3359                         case UnicodeCategory.OtherNotAssigned:
3360                                 return true;
3361                         default:
3362                                 return false;
3363                         }
3364                 }
3365
3366                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3367
3368                 /*
3369                 public static void Main ()
3370                 {
3371                         for (int i = 0; i <= char.MaxValue; i++)
3372                                 Dump (i, IsIgnorable (i));
3373                 }
3374
3375                 static void Dump (int i, bool ignore)
3376                 {
3377                         switch (Char.GetUnicodeCategory ((char) i)) {
3378                         case UnicodeCategory.PrivateUse:
3379                         case UnicodeCategory.Surrogate:
3380                                 return; // check nothing
3381                         }
3382
3383                         string s1 = "";
3384                         string s2 = new string ((char) i, 10);
3385                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3386                         if ((ret == 0) == ignore)
3387                                 return;
3388                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3389                 }
3390                 */
3391                 #endregion // IsIgnorable
3392
3393                 #region IsIgnorableSymbol
// Tells whether code point i is dropped when symbols are ignored.
//
// Evaluation order (first match wins):
//   1. anything IsIgnorable() accepts is ignorable here as well;
//   2. explicit per-code-point overrides (both directions);
//   3. a few whole ranges that are always ignorable;
//   4. rules keyed on the Unicode general category;
//   5. for every remaining category, Char.Is* classification.
//
// The commented-out driver kept after this method compares the
// result against CompareOptions.IgnoreSymbols under MS.NET.
static bool IsIgnorableSymbol (int i)
{
        // 1. Completely ignorable characters.
        if (IsIgnorable (i))
                return true;

        // 2. Individual overrides.
        switch (i) {
        // ---- forced ignorable ----
        // *Letter
        case 0x00B5: case 0x01C0: case 0x01C1: case 0x01C2:
        case 0x01C3: case 0x01F6: case 0x01F7: case 0x01F8:
        case 0x01F9: case 0x02D0: case 0x02EE: case 0x037A:
        case 0x03D7: case 0x03F3: case 0x0400: case 0x040D:
        case 0x0450: case 0x045D: case 0x048C: case 0x048D:
        case 0x048E: case 0x048F: case 0x0587: case 0x0640:
        case 0x06E5: case 0x06E6: case 0x06FA: case 0x06FB:
        case 0x06FC: case 0x093D: case 0x0950: case 0x1E9B:
        case 0x2139: case 0x3006: case 0x3033: case 0x3034:
        case 0x3035: case 0xFE7E: case 0xFE7F:
        // OtherNumber
        case 0x16EE: case 0x16EF: case 0x16F0:
        // LetterNumber
        case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
        case 0x3007: // IDEOGRAPHIC NUMBER ZERO
        case 0x3038: // HANGZHOU NUMERAL TEN
        case 0x3039: // HANGZHOU NUMERAL TWENTY
        case 0x303A: // HANGZHOU NUMERAL THIRTY
        // OtherSymbol
        case 0x2117: case 0x327F:
                return true;

        // ---- forced NOT ignorable ----
        // ModifierSymbol
        case 0x02B9: case 0x02BA: case 0x02C2: case 0x02C3:
        case 0x02C4: case 0x02C5: case 0x02C8: case 0x02CC:
        case 0x02CD: case 0x02CE: case 0x02CF: case 0x02D2:
        case 0x02D3: case 0x02D4: case 0x02D5: case 0x02D6:
        case 0x02D7: case 0x02DE: case 0x02E5: case 0x02E6:
        case 0x02E7: case 0x02E8: case 0x02E9: case 0x309B:
        case 0x309C:
        // OtherPunctuation
        case 0x055A: // Armenian apostrophe
        case 0x05C0: // Hebrew punctuation
        case 0x0E4F: // Thai FONGMAN
        case 0x0E5A: // Thai ANGKHANKHU
        case 0x0E5B: // Thai KHOMUT
        // CurrencySymbol
        case 0x09F2: // Bengali Rupee Mark
        case 0x09F3: // Bengali Rupee Sign
        // MathSymbol
        case 0x221E: // INFINITY
        // OtherSymbol
        case 0x0482: case 0x09FA: case 0x0B70:
                return false;
        }

        // 3. Letter ranges that are ignorable as a whole.
        if (0xFE70 <= i && i < 0xFE7C) // ARABIC LIGATURES B
                return true;
#if NET_2_0
        if (0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
                || 0xFA30 <= i && i < 0xFA70) // CJK COMPAT
                return true;
#endif

        // 4. Category driven rules.
        char c = (char) i;
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.Surrogate:
                return false; // inconsistent

        case UnicodeCategory.SpacingCombiningMark:
        case UnicodeCategory.EnclosingMark:
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.PrivateUse:
                // Only the Arabic non-spacing marks are ignorable.
                return 0x064B <= i && i <= 0x0652;

        case UnicodeCategory.Format:
        case UnicodeCategory.OtherNotAssigned:
                return true;
        }

        // 5. Every other category.
        // OtherSymbols blocks that must be kept even though
        // Char.IsSymbol() would flag their members.
        bool keptSymbolBlock =
                // latin in a circle
                0x249A <= i && i <= 0x24E9
                || 0x2100 <= i && i <= 0x2132
                // Japanese
                || 0x3196 <= i && i <= 0x31A0
                // Korean
                || 0x3200 <= i && i <= 0x321C
                // Chinese/Japanese
                || 0x322A <= i && i <= 0x3243
                // CJK
                || 0x3260 <= i && i <= 0x32B0
                || 0x32D0 <= i && i <= 0x3357
                || 0x337B <= i && i <= 0x33DD;

        // Letters, digits and other numbers are never ignorable;
        // the "Digit" part of this rule is a mystery, but it
        // filters some symbols out.
        if (keptSymbolBlock
                || Char.IsLetterOrDigit (c)
                || Char.IsNumber (c))
                return false;

        // FIXME: should check more
        return Char.IsControl (c)
                || Char.IsSeparator (c)
                || Char.IsPunctuation (c)
                || Char.IsSymbol (c);
}
3522
3523                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3524 /*
3525                 public static void Main ()
3526                 {
3527                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3528                         for (int i = 0; i <= char.MaxValue; i++) {
3529                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3530                                 if (uc == UnicodeCategory.Surrogate)
3531                                         continue;
3532
3533                                 bool ret = IsIgnorableSymbol (i);
3534
3535                                 string s1 = "TEST ";
3536                                 string s2 = "TEST " + (char) i;
3537
3538                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3539
3540                                 if (ret != (result == 0))
3541                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3542                                                 ret ? "should not ignore" :
3543                                                         "should ignore",
3544                                                 i,(char) i, uc);
3545                         }
3546                 }
3547 */
3548                 #endregion
3549
3550                 #region NonSpacing
// Tells whether code point i is treated as an ignorable
// non-spacing character.
//
// Evaluation order (first match wins):
//   1. anything IsIgnorable() accepts;
//   2. explicit per-code-point overrides (both directions);
//   3. ranges that are always ignorable;
//   4. ranges that are never ignorable;
//   5. otherwise: ignorable iff the category is NonSpacingMark.
//
// NOTE(review): presumably models CompareOptions.IgnoreNonSpace
// under MS.NET -- confirm against the runtime.
static bool IsIgnorableNonSpacing (int i)
{
        if (IsIgnorable (i))
                return true;

        switch (i) {
        // forced ignorable
        case 0x02C8: case 0x02DE: case 0x0559:
        case 0x055A: case 0x05C0: case 0x0ABD:
        case 0x0CD5: case 0x0CD6: case 0x309B:
        case 0x309C: case 0xFF9E: case 0xFF9F:
                return true;

        // forced NOT ignorable
        case 0x02D0: case 0x0670: case 0x0901:
        case 0x0902: case 0x094D: case 0x0962:
        case 0x0963: case 0x0A41: case 0x0A42:
        case 0x0A47: case 0x0A48: case 0x0A4B:
        case 0x0A4C: case 0x0A81: case 0x0A82:
        case 0x0B82: case 0x0BC0: case 0x0CBF:
        case 0x0CC6: case 0x0CCC: case 0x0CCD:
        case 0x0E4E:
                return false;
        }

        bool alwaysIgnored =
                0x02B9 <= i && i <= 0x02C5
                || 0x02CC <= i && i <= 0x02D7
                || 0x02E4 <= i && i <= 0x02EF
                || 0x20DD <= i && i <= 0x20E0;
        if (alwaysIgnored)
                return true;

        bool neverIgnored =
                0x064B <= i && i <= 0x0652
                || 0x0941 <= i && i <= 0x0948
                || 0x0AC1 <= i && i <= 0x0ACD
                || 0x0C3E <= i && i <= 0x0C4F
                || 0x0E31 <= i && i <= 0x0E3F;
        if (neverIgnored)
                return false;

        // Fall back to the general category.
        UnicodeCategory category = Char.GetUnicodeCategory ((char) i);
        return category == UnicodeCategory.NonSpacingMark;
}
3588
3589                 // We can reuse IsIgnorableSymbol testcode
3590                 // for IsIgnorableNonSpacing.
3591                 #endregion
3592         }
3593
// One character-map record: a category byte plus level 1 and
// level 2 weight bytes. A default-constructed entry has
// Defined == false; the three-argument constructor sets it.
struct CharMapEntry
{
        public byte Category;
        public byte Level1;
        public byte Level2; // It is always single byte.
        public bool Defined;

        // Creates an entry holding the given values and flags it
        // as defined.
        public CharMapEntry (byte category, byte level1, byte level2)
        {
                this.Defined = true;
                this.Category = category;
                this.Level1 = level1;
                this.Level2 = level2;
        }
}
3609
// Immutable pair of two integer codes: CP and JIS.
// NOTE(review): presumably CP is the Unicode code point and JIS
// its JIS code -- confirm against the callers.
class JISCharacter
{
        public readonly int CP;
        public readonly int JIS;

        // Stores cp in CP and cpJIS in JIS.
        public JISCharacter (int cp, int cpJIS)
        {
                this.JIS = cpJIS;
                this.CP = cp;
        }
}
3621
// Orders JISCharacter instances by their JIS field.
class JISComparer : IComparer
{
        // Shared instance.
        public static readonly JISComparer Instance = new JISComparer ();

        // Returns the difference of the two JIS values: negative
        // when o1 sorts first, zero when equal, positive otherwise.
        // Both arguments must be JISCharacter instances.
        public int Compare (object o1, object o2)
        {
                JISCharacter left = (JISCharacter) o1;
                JISCharacter right = (JISCharacter) o2;
                int delta = left.JIS - right.JIS;
                return delta;
        }
}
3634
// Immutable pair of an integer code (CP) and a name string.
class NonJISCharacter
{
        public readonly int CP;
        public readonly string Name;

        // Stores cp in CP and name in Name.
        public NonJISCharacter (int cp, string name)
        {
                this.Name = name;
                this.CP = cp;
        }
}
3646
// Orders NonJISCharacter instances by ordinal comparison of
// their Name fields.
class NonJISComparer : IComparer
{
        // Shared instance.
        public static readonly NonJISComparer Instance = new NonJISComparer ();

        // Both arguments must be NonJISCharacter instances; the
        // result is that of String.CompareOrdinal on the names.
        public int Compare (object o1, object o2)
        {
                NonJISCharacter left = (NonJISCharacter) o1;
                NonJISCharacter right = (NonJISCharacter) o2;
                string leftName = left.Name;
                string rightName = right.Name;
                return String.CompareOrdinal (leftName, rightName);
        }
}
3659
// Orders DictionaryEntry items by their decimal Value and, when
// the values are equal, by their int Key.
class DecimalDictionaryValueComparer : IComparer
{
        // The only instance; the constructor is private.
        public static readonly DecimalDictionaryValueComparer Instance =
                new DecimalDictionaryValueComparer ();

        private DecimalDictionaryValueComparer ()
        {
        }

        // Both arguments must be boxed DictionaryEntry values whose
        // Value is a decimal. Keys are unboxed as int only when the
        // values tie, in which case the key difference is returned.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry left = (DictionaryEntry) o1;
                DictionaryEntry right = (DictionaryEntry) o2;

                // FIXME: in case of 0, compare decomposition categories
                int byValue = Decimal.Compare (
                        (decimal) left.Value, (decimal) right.Value);

                return byValue != 0
                        ? byValue
                        : (int) left.Key - (int) right.Key;
        }
}
3682
// Orders DictionaryEntry items by their string Value (using
// String.Compare, i.e. the two-argument overload) and, when the
// values are equal, by their int Key.
class StringDictionaryValueComparer : IComparer
{
        // The only instance; the constructor is private.
        public static readonly StringDictionaryValueComparer Instance =
                new StringDictionaryValueComparer ();

        private StringDictionaryValueComparer ()
        {
        }

        // Both arguments must be boxed DictionaryEntry values whose
        // Value is a string. Keys are unboxed as int only when the
        // values tie, in which case the key difference is returned.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry left = (DictionaryEntry) o1;
                DictionaryEntry right = (DictionaryEntry) o2;

                int byValue = String.Compare (
                        (string) left.Value, (string) right.Value);

                return byValue != 0
                        ? byValue
                        : (int) left.Key - (int) right.Key;
        }
}
3704
// Orders boxed chars by the sort keys CollationElementTable
// reports for them.
class UCAComparer : IComparer
{
        // The only instance; the constructor is private.
        public static readonly UCAComparer Instance = new UCAComparer ();

        private UCAComparer ()
        {
        }

        // Both arguments must be boxed chars. The sort keys of the
        // two characters are compared position by position over the
        // shorter key count, looking at Primary, Secondary,
        // Thirtiary and Quarternary in that order; the first
        // non-zero difference is returned. If the shared positions
        // are all equal, the character with fewer sort keys sorts
        // first.
        public int Compare (object o1, object o2)
        {
                char c1 = (char) o1;
                char c2 = (char) o2;

                int count1 = CollationElementTable.GetSortKeyCount (c1);
                int count2 = CollationElementTable.GetSortKeyCount (c2);
                int shared = count1 > count2 ? count2 : count1;

                for (int pos = 0; pos < shared; pos++) {
                        SortKeyValue key1 = CollationElementTable.GetSortKey (c1, pos);
                        SortKeyValue key2 = CollationElementTable.GetSortKey (c2, pos);

                        int diff = key1.Primary - key2.Primary;
                        if (diff == 0)
                                diff = key1.Secondary - key2.Secondary;
                        if (diff == 0)
                                diff = key1.Thirtiary - key2.Thirtiary;
                        if (diff == 0)
                                diff = key1.Quarternary - key2.Quarternary;
                        if (diff != 0)
                                return diff;
                }

                return count1 - count2;
        }
}
3742
// Collects the tailoring maps registered for one LCID and
// flattens them into a single char array.
//
// Serialized form of each map (first char is the kind tag):
//   0x01 SortKeyMap     : tag, source chars, NUL, 4 sort key
//                         bytes (one per char), NUL
//   0x02 DiacriticalMap : tag, target, replace
//   0x03 ReplacementMap : tag, source chars, NUL, replacement
//                         chars, NUL
class Tailoring
{
        int lcid;
        int alias;
        bool frenchSort;
        // ITailoringMap instances, in registration order.
        ArrayList maps = new ArrayList ();

        // Creates a tailoring for lcid with alias 0.
        public Tailoring (int lcid)
                : this (lcid, 0)
        {
        }

        // Creates a tailoring for lcid with the given alias.
        public Tailoring (int lcid, int alias)
        {
                this.alias = alias;
                this.lcid = lcid;
        }

        public int LCID {
                get {
                        return lcid;
                }
        }

        public int Alias {
                get {
                        return alias;
                }
        }

        public bool FrenchSort {
                get {
                        return frenchSort;
                }
                set {
                        frenchSort = value;
                }
        }

        // Registers a diacritical map entry (kind 0x02).
        public void AddDiacriticalMap (byte target, byte replace)
        {
                ITailoringMap map = new DiacriticalMap (target, replace);
                maps.Add (map);
        }

        // Registers a sort key map entry (kind 0x01). Only the
        // first four bytes of sortkey are serialized.
        public void AddSortKeyMap (string source, byte [] sortkey)
        {
                ITailoringMap map = new SortKeyMap (source, sortkey);
                maps.Add (map);
        }

        // Registers a replacement map entry (kind 0x03).
        public void AddReplacementMap (string source, string replace)
        {
                ITailoringMap map = new ReplacementMap (source, replace);
                maps.Add (map);
        }

        // Returns the serialized forms of all registered maps,
        // concatenated in registration order. An empty array is
        // returned when nothing was registered.
        public char [] ItemToCharArray ()
        {
                char [][] chunks = new char [maps.Count][];
                int total = 0;
                for (int n = 0; n < chunks.Length; n++) {
                        chunks [n] = ((ITailoringMap) maps [n]).ToCharArray ();
                        total += chunks [n].Length;
                }

                char [] flat = new char [total];
                int offset = 0;
                foreach (char [] chunk in chunks) {
                        Array.Copy (chunk, 0, flat, offset, chunk.Length);
                        offset += chunk.Length;
                }
                return flat;
        }

        // A map that knows how to serialize itself.
        interface ITailoringMap
        {
                char [] ToCharArray ();
        }

        class DiacriticalMap : ITailoringMap
        {
                public readonly byte Target;
                public readonly byte Replace;

                public DiacriticalMap (byte target, byte replace)
                {
                        this.Target = target;
                        this.Replace = replace;
                }

                // Layout: 0x02, Target, Replace.
                public char [] ToCharArray ()
                {
                        return new char [] {
                                (char) 2, // kind:DiacriticalMap
                                (char) Target,
                                (char) Replace
                        };
                }
        }

        class SortKeyMap : ITailoringMap
        {
                public readonly string Source;
                public readonly byte [] SortKey;

                public SortKeyMap (string source, byte [] sortkey)
                {
                        this.Source = source;
                        this.SortKey = sortkey;
                }

                // Layout: 0x01, Source, NUL, SortKey [0..3], NUL.
                public char [] ToCharArray ()
                {
                        int len = Source.Length;
                        // tag + source + NUL + 4 key chars + NUL
                        char [] buf = new char [len + 7];
                        buf [0] = (char) 1; // kind:SortKeyMap
                        Source.CopyTo (0, buf, 1, len);
                        // buf [len + 1] stays '\0': null terminate
                        int keyStart = len + 2;
                        for (int k = 0; k < 4; k++)
                                buf [keyStart + k] = (char) SortKey [k];
                        // the last slot stays '\0' as well
                        return buf;
                }
        }

        class ReplacementMap : ITailoringMap
        {
                public readonly string Source;
                public readonly string Replace;

                public ReplacementMap (string source, string replace)
                {
                        this.Source = source;
                        this.Replace = replace;
                }

                // Layout: 0x03, Source, NUL, Replace, NUL.
                public char [] ToCharArray ()
                {
                        int srcLen = Source.Length;
                        int repLen = Replace.Length;
                        char [] buf = new char [srcLen + repLen + 3];
                        buf [0] = (char) 3; // kind:ReplaceMap
                        Source.CopyTo (0, buf, 1, srcLen);
                        // buf [srcLen + 1] stays '\0': null terminate
                        Replace.CopyTo (0, buf, srcLen + 2, repLen);
                        // the last slot stays '\0': null terminate
                        return buf;
                }
        }
}
3874 }