mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
// There are two kinds of sort keys: those that are computed and those that
// are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
// Also, for composite characters it should prepare a different index table.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14 #define Binary
  15
  16 using System;
  17 using System.IO;
  18 using System.Collections;
  19 using System.Globalization;
  20 using System.Text;
  21 using System.Xml;
  22
  23 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
  24
  25 namespace Mono.Globalization.Unicode
  26 {
  27         internal class MSCompatSortKeyTableGenerator
  28         {
  29                 public static void Main (string [] args)
  30                 {
  31                         new MSCompatSortKeyTableGenerator ().Run (args);
  32                 }
  33
                // Numeric codes for decomposition kinds.
                // NOTE(review): presumably these are the values stored in
                // decompType [] -- confirm against the parsing code.
                // Full (0xE) is declared before Narrow (0xD); the values themselves
                // are distinct, only the declaration order is non-numeric.
                const int DecompositionWide = 1; // fixed
                const int DecompositionSub = 2; // fixed
                const int DecompositionSmall = 3;
                const int DecompositionIsolated = 4;
                const int DecompositionInitial = 5;
                const int DecompositionFinal = 6;
                const int DecompositionMedial = 7;
                const int DecompositionNoBreak = 8;
                const int DecompositionVertical = 9;
                const int DecompositionFraction = 0xA;
                const int DecompositionFont = 0xB;
                const int DecompositionSuper = 0xC; // fixed
                const int DecompositionFull = 0xE;
                const int DecompositionNarrow = 0xD;
                const int DecompositionCircle = 0xF;
                const int DecompositionSquare = 0x10;
                const int DecompositionCompat = 0x11;
                const int DecompositionCanonical = 0x12;
  52
                // Output sinks: generated C# text goes to standard output; the C
                // writer starts out as a null sink.
                TextWriter CSResult = Console.Out;
                TextWriter CResult = TextWriter.Null;

                byte [] fillIndex = new byte [256]; // by category
                // One entry per UTF-16 code unit (0 .. char.MaxValue).
                CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];

                // NOTE(review): presumably characters that get special "ignore"
                // handling -- confirm at the use site.
                char [] specialIgnore = new char [] {
                        '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
                        '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
                        };

                // FIXME: need more love (as always)
                char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
                        'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
                        'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                        '\u0292', '\u01BE', '\u0298'};
                // Same length as alphabets (29 entries each).
                // NOTE(review): presumably alphaWeights [i] is the weight of
                // alphabets [i] -- confirm at the use site.
                byte [] alphaWeights = new byte [] {
                        2, 9, 0xA, 0x1A, 0x21,
                        0x23, 0x25, 0x2C, 0x32, 0x35,
                        0x36, 0x48, 0x51, 0x70, 0x7C,
                        0x7E, 0x89, 0x8A, 0x91, 0x99,
                        0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
                        0xA9, 0xAA, 0xB3, 0xB4};

                // Per-code-unit flags, indexed by char value.
                bool [] isSmallCapital = new bool [char.MaxValue + 1];
                bool [] isUppercase = new bool [char.MaxValue + 1];

                // Per-code-unit decomposition data, indexed by char value.
                // NOTE(review): presumably decompIndex/decompLength address a
                // slice of decompValues -- confirm against the parsing code.
                byte [] decompType = new byte [char.MaxValue + 1];
                int [] decompIndex = new int [char.MaxValue + 1];
                int [] decompLength = new int [char.MaxValue + 1];
                int [] decompValues;
                decimal [] decimalValue = new decimal [char.MaxValue + 1];

                // Per-code-unit byte table, indexed by char value.
                byte [] diacritical = new byte [char.MaxValue + 1];
  87
                // Name fragments paired with the weights in diacriticWeights below;
                // both arrays hold 97 entries and must be kept the same length.
                // NOTE(review): presumably matched against Unicode character names
                // (the trailing ';' looks like a field terminator) -- confirm at the
                // use site.
                string [] diacritics = new string [] {
                        // LATIN, CYRILLIC etc.
                        "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
                        "ABKHASIAN",
                        "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
                        "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
                        "WITH ACUTE;", "WITH GRAVE;",
                        //
                        "WITH DOT ABOVE;", " MIDDLE DOT;",
                        "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
                        "WITH DIALYTIKA;",
                        "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
                        "DIALYTIKA TONOS", "DIALYTIKA AND TONOS",
                        "ABKHASIAN CHE WITH DESCENDER",
                        "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
                        "WITH OGONEK;", "WITH CEDILLA;",
                        //
                        " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
                        "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
                        "STROKE OVERLAY",
                        " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
                        " DIAERESIS AND GRAVE;",
                        " BREVE AND ACUTE;",
                        " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
                        " MACRON AND ACUTE;",
                        " MACRON AND GRAVE;",
                        //
                        " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
                        " RING ABOVE AND ACUTE",
                        " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
                        " CIRCUMFLEX AND TILDE",
                        " TILDE AND DIAERESIS",
                        " STROKE AND ACUTE",
                        " BREVE AND TILDE",
                        " CEDILLA AND BREVE",
                        " OGONEK AND MACRON",
                        // 0x40
                        "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE",
                        "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
                        " DOUBLE GRAVE",
                        " INVERTED BREVE",
                        "ROMAN NUMERAL",
                        " PRECEDED BY APOSTROPHE",
                        "WITH HORN;",
                        " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
                        " PALATAL HOOK",
                        " DOT BELOW;",
                        " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK",
                        " RING BELOW", "LOW VERTICAL LINE",
                        //
                        " CIRCUMFLEX BELOW", "HORN AND ACUTE",
                        " BREVE BELOW;", " HORN AND GRAVE",
                        " LOW MACRON",
                        " TILDE BELOW",
                        " TOPBAR",
                        " DOT BELOW AND DOT ABOVE",
                        " RIGHT HALF RING", " HORN AND TILDE",
                        " CIRCUMFLEX AND DOT BELOW",
                        " BREVE AND DOT BELOW",
                        " DOT BELOW AND MACRON",
                        " TONE TWO",
                        " HORN AND HOOK ABOVE",
                        " HORN AND DOT",
                        // CIRCLED, PARENTHESIZED and so on
                        "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
                        "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
                        "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
                        };
                // Weight bytes, one per entry of diacritics above (97 entries),
                // grouped by the same section comments.
                byte [] diacriticWeights = new byte [] {
                        // LATIN.
                        3, 3, 3, 5, 5, 5, 5,
                        0xE, 0xF,
                        0xE, 0xF,
                        //
                        0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
                        0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
                        //
                        0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
                        0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
                        //
                        0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
                        0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
                        //
                        0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
                        0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59,
                        0x5A, 0x5A,
                        //
                        0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68,
                        0x69, 0x69, 0x6A, 0x6D, 0x6E,
                        0x87, 0x95, 0xAA,
                        // CIRCLED, PARENTHESIZED and so on.
                        0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
                        0xF3, 0xF3, 0xF3
                        };
 182
                // Code point boundaries, 28 values in ascending order.
                // NOTE(review): presumably consumed as consecutive (start, end)
                // pairs delimiting digit ranges -- confirm at the use site.
                int [] numberSecondaryWeightBounds = new int [] {
                        0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
                        0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
                        0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
                        0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
                        0xE50, 0xE60, 0xED0, 0xEE0
                        };

                // Per-script character orderings; not initialized here.
                char [] orderedGurmukhi;
                char [] orderedGujarati;
                char [] orderedGeorgian;
                char [] orderedThaana;

                // Tamil consonants (22 entries) in the order described below.
                static readonly char [] orderedTamilConsonants = new char [] {
                        // based on traditional Tamil consonants, except for
                        // Grantha (where Microsoft breaks traditionalism).
                        // http://www.angelfire.com/empire/thamizh/padanGaL
                        '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
                        '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
                        '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
                        '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
                        '\u0BB7', '\u0BB9'};
 205
                // cp -> character name (only for some characters)
                ArrayList sortableCharNames = new ArrayList ();

                // cp -> arrow value (int)
                ArrayList arrowValues = new ArrayList ();

                // cp -> box value (int)
                ArrayList boxValues = new ArrayList ();

                // cp -> level1 value
                Hashtable arabicLetterPrimaryValues = new Hashtable ();

                // letterName -> cp
                Hashtable arabicNameMap = new Hashtable ();

                // cp -> Hashtable [decompType] -> cp
                Hashtable nfkdMap = new Hashtable ();

                // Latin letter -> ArrayList [int]
                Hashtable latinMap = new Hashtable ();

                // NOTE(review): presumably Japanese characters split by whether
                // they belong to a JIS set -- confirm where these are filled.
                ArrayList jisJapanese = new ArrayList ();
                ArrayList nonJisJapanese = new ArrayList ();

                // Per-code-unit CJK tables. Serialize () replaces each with its
                // compressed form; the ushort tables are then dumped as a high byte
                // (labelled "category") followed by a low byte (labelled "level 1").
                ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
                ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
                ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
                ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
                byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];

                // Per-code-unit flags; compressed and dumped by Serialize ().
                byte [] ignorableFlags = new byte [char.MaxValue + 1];

                // NOTE(review): presumably the Unicode version in which each code
                // unit was introduced -- confirm where this is filled.
                static double [] unicodeAge = new double [char.MaxValue + 1];

                // Holds Tailoring instances (enumerated as such by
                // SerializeTailorings ()).
                ArrayList tailorings = new ArrayList ();
 241
 242                 void Run (string [] args)
 243                 {
 244                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 245                         ParseSources (dirname);
 246                         Console.Error.WriteLine ("parse done.");
 247
 248                         ModifyParsedValues ();
 249                         GenerateCore ();
 250                         Console.Error.WriteLine ("generation done.");
 251                         CResult = new StreamWriter ("collation-tables.h", false);
 252                         Serialize ();
 253                         CResult.Close ();
 254                         Console.Error.WriteLine ("serialization done.");
 255 /*
 256 StreamWriter sw = new StreamWriter ("agelog.txt");
 257 for (int i = 0; i < char.MaxValue; i++) {
 258 bool shouldBe = false;
 259 switch (Char.GetUnicodeCategory ((char) i)) {
 260 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 261         shouldBe = true; break;
 262 }
 263 if (unicodeAge [i] >= 3.1)
 264         shouldBe = true;
 265 //if (IsIgnorable (i) != shouldBe)
 266 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 267 }
 268 sw.Close ();
 269 */
 270                 }
 271
 272                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 273                 {
 274                         return (byte []) CodePointIndexer.CompressArray  (
 275                                 source, typeof (byte), i);
 276                 }
 277
 278                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 279                 {
 280                         return (ushort []) CodePointIndexer.CompressArray  (
 281                                 source, typeof (ushort), i);
 282                 }
 283
                // No-op: the body is empty, so the value is discarded.
                // NOTE(review): looks like an unfinished or unused stub -- confirm
                // there are no callers before removing it.
                void WriteByte (byte value)
                {

                }
 288
 289                 void Serialize ()
 290                 {
 291                         // Tailorings
 292                         SerializeTailorings ();
 293
 294                         byte [] categories = new byte [map.Length];
 295                         byte [] level1 = new byte [map.Length];
 296                         byte [] level2 = new byte [map.Length];
 297                         byte [] level3 = new byte [map.Length];
 298 // widthCompat is now removed from the mapping table.
 299 // If it turned out that it is still required, grep this source and uncomment
 300 // widthCompat related lines. FIXME: remove those lines in the future.
 301 //                      ushort [] widthCompat = new ushort [map.Length];
 302                         for (int i = 0; i < map.Length; i++) {
 303                                 categories [i] = map [i].Category;
 304                                 level1 [i] = map [i].Level1;
 305                                 level2 [i] = map [i].Level2;
 306                                 level3 [i] = ComputeLevel3Weight ((char) i);
 307 /*
 308                                 // For Japanese Half-width characters, don't
 309                                 // map widthCompat. It is IgnoreKanaType that
 310                                 // handles those width differences.
 311                                 if (0xFF6D <= i && i <= 0xFF9D)
 312                                         continue;
 313                                 switch (decompType [i]) {
 314                                 case DecompositionNarrow:
 315                                 case DecompositionWide:
 316                                 case DecompositionSuper:
 317                                 case DecompositionSub:
 318                                         // they are always 1 char
 319                                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
 320                                         break;
 321                                 }
 322 */
 323                         }
 324
 325                         // compress
 326                         ignorableFlags = CompressArray (ignorableFlags,
 327                                 UUtil.Ignorable);
 328                         categories = CompressArray (categories, UUtil.Category);
 329                         level1 = CompressArray (level1, UUtil.Level1);
 330                         level2 = CompressArray (level2, UUtil.Level2);
 331                         level3 = CompressArray (level3, UUtil.Level3);
 332 //                      widthCompat = (ushort []) CodePointIndexer.CompressArray (
 333 //                              widthCompat, typeof (ushort), UUtil.WidthCompat);
 334                         cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
 335                         cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
 336                         cjkJA = CompressArray (cjkJA, UUtil.Cjk);
 337                         cjkKO = CompressArray (cjkKO, UUtil.Cjk);
 338                         cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);
 339
 340                         // Ignorables
 341                         CResult.WriteLine ("static const guint8* collation_table_ignorableFlags [] = {");
 342                         CSResult.WriteLine ("static readonly byte [] ignorableFlagsArr = new byte [] {");
 343 #if Binary
 344                         MemoryStream ms = new MemoryStream ();
 345                         BinaryWriter binary = new BinaryWriter (ms);
 346                         binary.Write (UUtil.ResourceVersion);
 347                         binary.Write (ignorableFlags.Length);
 348 #endif
 349                         for (int i = 0; i < ignorableFlags.Length; i++) {
 350                                 byte value = ignorableFlags [i];
 351                                 if (value < 10)
 352                                         CSResult.Write ("{0},", value);
 353                                 else
 354                                         CSResult.Write ("0x{0:X02},", value);
 355                                 CResult.Write ("{0},", value);
 356 #if Binary
 357                                 binary.Write (value);
 358 #endif
 359                                 if ((i & 0xF) == 0xF) {
 360                                         CSResult.WriteLine ("// {0:X04}",
 361                                                 UUtil.Ignorable.ToCodePoint (i - 0xF));
 362                                         CResult.WriteLine ();
 363                                 }
 364                         }
 365                         CSResult.WriteLine ("};");
 366                         CSResult.WriteLine ();
 367
 368                         // Primary category
 369                         CResult.WriteLine ("static const guint8* collation_table_category [] = {");
 370                         CSResult.WriteLine ("static readonly byte [] categoriesArr = new byte [] {");
 371 #if Binary
 372                         binary.Write (categories.Length);
 373 #endif
 374                         for (int i = 0; i < categories.Length; i++) {
 375                                 byte value = categories [i];
 376                                 if (value < 10)
 377                                         CSResult.Write ("{0},", value);
 378                                 else
 379                                         CSResult.Write ("0x{0:X02},", value);
 380                                 CResult.Write ("{0},", value);
 381 #if Binary
 382                                 binary.Write (value);
 383 #endif
 384                                 if ((i & 0xF) == 0xF) {
 385                                         CSResult.WriteLine ("// {0:X04}",
 386                                                 UUtil.Category.ToCodePoint (i - 0xF));
 387                                         CResult.WriteLine ();
 388                                 }
 389                         }
 390                         CResult.WriteLine ("};");
 391                         CSResult.WriteLine ("};");
 392                         CSResult.WriteLine ();
 393
 394                         // Primary weight value
 395                         CResult.WriteLine ("static const guint8* collation_table_level1 [] = {");
 396                         CSResult.WriteLine ("static readonly byte [] level1Arr = new byte [] {");
 397 #if Binary
 398                         binary.Write (level1.Length);
 399 #endif
 400                         for (int i = 0; i < level1.Length; i++) {
 401                                 byte value = level1 [i];
 402                                 if (value < 10)
 403                                         CSResult.Write ("{0},", value);
 404                                 else
 405                                         CSResult.Write ("0x{0:X02},", value);
 406                                 CResult.Write ("{0},", value);
 407 #if Binary
 408                                 binary.Write (value);
 409 #endif
 410                                 if ((i & 0xF) == 0xF) {
 411                                         CSResult.WriteLine ("// {0:X04}",
 412                                                 UUtil.Level1.ToCodePoint (i - 0xF));
 413                                         CResult.WriteLine ();
 414                                 }
 415                         }
 416                         CResult.WriteLine ("0};");
 417                         CSResult.WriteLine ("};");
 418                         CSResult.WriteLine ();
 419
 420                         // Secondary weight
 421                         CResult.WriteLine ("static const guint8* collation_table_level2 [] = {");
 422                         CSResult.WriteLine ("static readonly byte [] level2Arr = new byte [] {");
 423 #if Binary
 424                         binary.Write (level2.Length);
 425 #endif
 426                         for (int i = 0; i < level2.Length; i++) {
 427                                 byte value = level2 [i];
 428                                 if (value < 10)
 429                                         CSResult.Write ("{0},", value);
 430                                 else
 431                                         CSResult.Write ("0x{0:X02},", value);
 432                                 CResult.Write ("{0},", value);
 433 #if Binary
 434                                 binary.Write (value);
 435 #endif
 436                                 if ((i & 0xF) == 0xF) {
 437                                         CSResult.WriteLine ("// {0:X04}",
 438                                                 UUtil.Level2.ToCodePoint (i - 0xF));
 439                                         CResult.WriteLine ();
 440                                 }
 441                         }
 442                         CResult.WriteLine ("0};");
 443                         CSResult.WriteLine ("};");
 444                         CSResult.WriteLine ();
 445
 446                         // Thirtiary weight
 447                         CResult.WriteLine ("static const guint8* collation_table_level3 [] = {");
 448                         CSResult.WriteLine ("static readonly byte [] level3Arr = new byte [] {");
 449 #if Binary
 450                         binary.Write (level3.Length);
 451 #endif
 452                         for (int i = 0; i < level3.Length; i++) {
 453                                 byte value = level3 [i];
 454                                 if (value < 10)
 455                                         CSResult.Write ("{0},", value);
 456                                 else
 457                                         CSResult.Write ("0x{0:X02},", value);
 458                                 CResult.Write ("{0},", value);
 459 #if Binary
 460                                 binary.Write (value);
 461 #endif
 462                                 if ((i & 0xF) == 0xF) {
 463                                         CSResult.WriteLine ("// {0:X04}",
 464                                                 UUtil.Level3.ToCodePoint (i - 0xF));
 465                                         CResult.WriteLine ();
 466                                 }
 467                         }
 468                         CResult.WriteLine ("0};");
 469                         CSResult.WriteLine ("};");
 470                         CSResult.WriteLine ();
 471
 472 /*
 473                         // Width insensitivity mappings
 474                         // (for now it is more lightweight than dumping the
 475                         // entire NFKD table).
 476                         CResult.WriteLine ("static const guint16* widthCompat [] = {");
 477                         CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {");
 478 #if Binary
 479                         binary.Write (widthCompat.Length);
 480 #endif
 481                         for (int i = 0; i < widthCompat.Length; i++) {
 482                                 ushort value = widthCompat [i];
 483                                 if (value < 10)
 484                                         CSResult.Write ("{0},", value);
 485                                 else
 486                                         CSResult.Write ("0x{0:X02},", value);
 487                                 CResult.Write ("{0},", value);
 488 #if Binary
 489                                 binary.Write (value);
 490 #endif
 491                                 if ((i & 0xF) == 0xF) {
 492                                         CSResult.WriteLine ("// {0:X04}",
 493                                                 UUtil.WidthCompat.ToCodePoint (i - 0xF));
 494                                         CResult.WriteLine ();
 495                                 }
 496                         }
 497                         CResult.WriteLine ("0};");
 498                         CSResult.WriteLine ("};");
 499                         CSResult.WriteLine ();
 500 */
 501
 502 #if Binary
 503                         using (FileStream fs = File.Create ("../collation.core.bin")) {
 504                                 byte [] array = ms.ToArray ();
 505                                 fs.Write (array, 0, array.Length);
 506                         }
 507 #endif
 508
 509                         // CJK
 510                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 511                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 512                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 513                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 514                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 515                 }
 516
 517                 void SerializeCJK (string name, ushort [] cjk, int max_unused)
 518                 {
 519                         CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length);
 520                         CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length);
 521
 522                         CResult.WriteLine ("static const guint8* collation_table_collation_cjk_{0} [] = {{", name);
 523                         CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
 524 #if Binary
 525                         MemoryStream ms = new MemoryStream ();
 526                         BinaryWriter binary = new BinaryWriter (ms);
 527                         binary.Write (UUtil.ResourceVersion);
 528                         binary.Write (cjk.Length); // the actual size is *2.
 529 #endif
 530                         // category
 531                         for (int i = 0; i < cjk.Length; i++) {
 532 //                              if (i == max)
 533 //                                      break;
 534                                 byte value = (byte) (cjk [i] >> 8);
 535                                 if (value < 10)
 536                                         CSResult.Write ("{0},", value);
 537                                 else
 538                                         CSResult.Write ("0x{0:X02},", value);
 539                                 CResult.Write ("{0},", value);
 540 #if Binary
 541                                 binary.Write (value);
 542 #endif
 543                                 if ((i & 0xF) == 0xF) {
 544                                         CSResult.WriteLine ("// {0:X04}", i - 0xF);
 545                                         CResult.WriteLine ();
 546                                 }
 547                         }
 548
 549                         // level 1
 550                         for (int i = 0; i < cjk.Length; i++) {
 551 //                              if (i == max)
 552 //                                      break;
 553                                 byte value = (byte) (cjk [i] & 0xFF);
 554                                 if (value < 10)
 555                                         CSResult.Write ("{0},", value);
 556                                 else
 557                                         CSResult.Write ("0x{0:X02},", value);
 558                                 CResult.Write ("{0},", value);
 559 #if Binary
 560                                 binary.Write (value);
 561 #endif
 562                                 if ((i & 0xF) == 0xF) {
 563                                         CSResult.WriteLine ("// {0:X04}", i - 0xF);
 564                                         CResult.WriteLine ();
 565                                 }
 566                         }
 567
 568                         CResult.WriteLine ("0};");
 569                         CSResult.WriteLine ("};");
 570                         CSResult.WriteLine ();
 571 #if Binary
 572                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 573                                 byte [] array = ms.ToArray ();
 574                                 fs.Write (array, 0, array.Length);
 575                         }
 576 #endif
 577                 }
 578
 579                 void SerializeCJK (string name, byte [] cjk, int max)
 580                 {
 581                         CResult.WriteLine ("static const guint8* collation_table_collation_cjk_{0} [] = {{", name);
 582                         CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
 583 #if Binary
 584                         MemoryStream ms = new MemoryStream ();
 585                         BinaryWriter binary = new BinaryWriter (ms);
 586                         binary.Write (UUtil.ResourceVersion);
 587 #endif
 588                         for (int i = 0; i < cjk.Length; i++) {
 589                                 if (i == max)
 590                                         break;
 591                                 byte value = cjk [i];
 592                                 if (value < 10)
 593                                         CSResult.Write ("{0},", value);
 594                                 else
 595                                         CSResult.Write ("0x{0:X02},", value);
 596                                 CResult.Write ("{0},", value);
 597 #if Binary
 598                                 binary.Write (value);
 599 #endif
 600                                 if ((i & 0xF) == 0xF) {
 601                                         CSResult.WriteLine ("// {0:X04}", i - 0xF);
 602                                         CResult.WriteLine ();
 603                                 }
 604                         }
 605                         CResult.WriteLine ("0};");
 606                         CSResult.WriteLine ("};");
 607                         CSResult.WriteLine ();
 608 #if Binary
 609                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 610                                 byte [] array = ms.ToArray ();
 611                                 fs.Write (array, 0, array.Length);
 612                         }
 613 #endif
 614                 }
 615
 616                 void SerializeTailorings ()
 617                 {
 618                         Hashtable indexes = new Hashtable ();
 619                         Hashtable counts = new Hashtable ();
 620                         CResult.WriteLine ("static const guint16*collation_table_tailoring = {");
 621                         CSResult.WriteLine ("static char [] tailorings = new char [] {");
 622                         int count = 0;
 623 #if Binary
 624                         MemoryStream ms = new MemoryStream ();
 625                         BinaryWriter binary = new BinaryWriter (ms);
 626                         // Here we don't need to output resource version.
 627                         // This is cached.
 628 #endif
 629                         foreach (Tailoring t in tailorings) {
 630                                 if (t.Alias != 0)
 631                                         continue;
 632                                 CResult.Write ("/*{0}*/", t.LCID);
 633                                 CSResult.Write ("/*{0}*/", t.LCID);
 634                                 indexes.Add (t.LCID, count);
 635                                 char [] values = t.ItemToCharArray ();
 636                                 counts.Add (t.LCID, values.Length);
 637                                 foreach (char c in values) {
 638                                         CSResult.Write ("'\\x{0:X}', ", (int) c);
 639                                         CResult.Write ("{0},", (int) c);
 640                                         if (++count % 16 == 0) {
 641                                                 CSResult.WriteLine (" // {0:X04}", count - 16);
 642                                                 CResult.WriteLine ();
 643                                         }
 644 #if Binary
 645                                         binary.Write ((ushort) c);
 646 #endif
 647                                 }
 648                         }
 649                         CResult.WriteLine ("0};");
 650                         CSResult.WriteLine ("};");
 651
 652                         CResult.WriteLine ("static const int collation_tailoring_count = {0};", tailorings.Count);
 653                         CResult.WriteLine ("static const int* collation_tailoring_infos = {");
 654                         CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 655 #if Binary
 656                         byte [] rawdata = ms.ToArray ();
 657                         ms = new MemoryStream ();
 658                         binary = new BinaryWriter (ms);
 659                         binary.Write (UUtil.ResourceVersion);
 660                         binary.Write (tailorings.Count);
 661 #endif
 662                         foreach (Tailoring t in tailorings) {
 663                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 664                                 if (!indexes.ContainsKey (target)) {
 665                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
 666                                         continue;
 667                                 }
 668                                 int idx = (int) indexes [target];
 669                                 int cnt = (int) counts [target];
 670                                 bool french = t.FrenchSort;
 671                                 if (t.Alias != 0)
 672                                         foreach (Tailoring t2 in tailorings)
 673                                                 if (t2.LCID == t.LCID)
 674                                                         french = t2.FrenchSort;
 675                                 CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 676                                 CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0);
 677 #if Binary
 678                                 binary.Write (t.LCID);
 679                                 binary.Write (idx);
 680                                 binary.Write (cnt);
 681                                 binary.Write (french);
 682 #endif
 683                         }
 684                         CResult.WriteLine ("0};");
 685                         CSResult.WriteLine ("};");
 686 #if Binary
 687                         binary.Write ((byte) 0xFF);
 688                         binary.Write ((byte) 0xFF);
 689                         binary.Write (rawdata.Length / 2);
 690                         binary.Write (rawdata, 0, rawdata.Length);
 691
 692
 693                         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
 694                                 byte [] array = ms.ToArray ();
 695                                 fs.Write (array, 0, array.Length);
 696                         }
 697 #endif
 698                 }
 699
 700                 #region Parse
 701
 702                 void ParseSources (string dirname)
 703                 {
 704                         string unidata =
 705                                 dirname + "/UnicodeData.txt";
 706                         string derivedCoreProps =
 707                                 dirname + "/DerivedCoreProperties.txt";
 708                         string scripts =
 709                                 dirname + "/Scripts.txt";
 710                         string cp932 =
 711                                 dirname + "/CP932.TXT";
 712                         string derivedAge =
 713                                 dirname + "/DerivedAge.txt";
 714                         string chXML = dirname + "/common/collation/zh.xml";
 715                         string jaXML = dirname + "/common/collation/ja.xml";
 716                         string koXML = dirname + "/common/collation/ko.xml";
 717
 718                         ParseDerivedAge (derivedAge);
 719
 720                         FillIgnorables ();
 721
 722                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 723                         ParseUnidata (unidata);
 724                         ModifyUnidata ();
 725                         ParseDerivedCoreProperties (derivedCoreProps);
 726                         ParseScripts (scripts);
 727                         ParseCJK (chXML, jaXML, koXML);
 728
 729                         ParseTailorings ("mono-tailoring-source.txt");
 730                 }
 731
 732                 void ParseTailorings (string filename)
 733                 {
 734                         Tailoring t = null;
 735                         int line = 0;
 736                         using (StreamReader sr = new StreamReader (filename)) {
 737                                 try {
 738                                         while (sr.Peek () >= 0) {
 739                                                 line++;
 740                                                 ProcessTailoringLine (ref t,
 741                                                         sr.ReadLine ().Trim ());
 742                                         }
 743                                 } catch (Exception) {
 744                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 745                                         throw;
 746                                 }
 747                         }
 748                 }
 749
 750                 // For now this is enough.
 751                 string ParseTailoringSourceValue (string s)
 752                 {
 753                         StringBuilder sb = new StringBuilder ();
 754                         for (int i = 0; i < s.Length; i++) {
 755                                 if (i + 5 < s.Length &&
 756                                         s [i] == '\\' && s [i + 1] == 'u') {
 757                                         sb.Append (
 758                                                 (char) int.Parse (
 759                                                         s.Substring (i + 2, 4),
 760                                                         NumberStyles.HexNumber),
 761                                                 1);
 762                                         i += 5;
 763                                 }
 764                                 else
 765                                         sb.Append (s [i]);
 766                         }
 767                         return sb.ToString ();
 768                 }
 769
 770                 void ProcessTailoringLine (ref Tailoring t, string s)
 771                 {
 772                         int idx = s.IndexOf ('#');
 773                         if (idx > 0)
 774                                 s = s.Substring (0, idx).Trim ();
 775                         if (s.Length == 0 || s [0] == '#')
 776                                 return;
 777                         if (s [0] == '@') {
 778                                 idx = s.IndexOf ('=');
 779                                 if (idx > 0)
 780                                         t = new Tailoring (
 781                                                 int.Parse (s.Substring (1, idx - 1)),
 782                                                 int.Parse (s.Substring (idx + 1)));
 783                                 else
 784                                         t = new Tailoring (int.Parse (s.Substring (1)));
 785                                 tailorings.Add (t);
 786                                 return;
 787                         }
 788                         if (s.StartsWith ("*FrenchSort")) {
 789                                 t.FrenchSort = true;
 790                                 return;
 791                         }
 792                         string d = "*Diacritical";
 793                         if (s.StartsWith (d)) {
 794                                 idx = s.IndexOf ("->");
 795                                 t.AddDiacriticalMap (
 796                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 797                                                 NumberStyles.HexNumber),
 798                                         byte.Parse (s.Substring (idx + 2).Trim (),
 799                                                 NumberStyles.HexNumber));
 800                                 return;
 801                         }
 802                         idx = s.IndexOf (':');
 803                         if (idx > 0) {
 804                                 string source = s.Substring (0, idx).Trim ();
 805                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 806                                 byte [] b = new byte [4];
 807                                 for (int i = 0; i < 4; i++) {
 808                                         if (l [i] == "*")
 809                                                 b [i] = 0;
 810                                         else
 811                                                 b [i] = byte.Parse (l [i],
 812                                                         NumberStyles.HexNumber);
 813                                 }
 814                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 815                                         b);
 816                         }
 817                         idx = s.IndexOf ('=');
 818                         if (idx > 0)
 819                                 t.AddReplacementMap (
 820                                         ParseTailoringSourceValue (
 821                                                 s.Substring (0, idx).Trim ()),
 822                                         ParseTailoringSourceValue (
 823                                                 s.Substring (idx + 1).Trim ()));
 824                 }
 825
 826                 void ParseDerivedAge (string filename)
 827                 {
 828                         using (StreamReader file =
 829                                 new StreamReader (filename)) {
 830                                 while (file.Peek () >= 0) {
 831                                         string s = file.ReadLine ();
 832                                         int idx = s.IndexOf ('#');
 833                                         if (idx >= 0)
 834                                                 s = s.Substring (0, idx);
 835                                         idx = s.IndexOf (';');
 836                                         if (idx < 0)
 837                                                 continue;
 838
 839                                         string cpspec = s.Substring (0, idx);
 840                                         idx = cpspec.IndexOf ("..");
 841                                         NumberStyles nf = NumberStyles.HexNumber |
 842                                                 NumberStyles.AllowTrailingWhite;
 843                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 844                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 845                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 846
 847                                         // FIXME: use index
 848                                         if (cp > char.MaxValue)
 849                                                 continue;
 850
 851                                         double v = double.Parse (value);
 852                                         for (int i = cp; i <= cpEnd; i++)
 853                                                 unicodeAge [i] = v;
 854                                 }
 855                         }
 856                         unicodeAge [0] = double.MaxValue; // never be supported
 857                 }
 858
 859                 void ParseUnidata (string filename)
 860                 {
 861                         ArrayList decompValues = new ArrayList ();
 862                         using (StreamReader unidata =
 863                                 new StreamReader (filename)) {
 864                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 865                                         try {
 866                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 867                                         } catch (Exception) {
 868                                                 Console.Error.WriteLine ("**** At line " + line);
 869                                                 throw;
 870                                         }
 871                                 }
 872                         }
 873                         this.decompValues = (int [])
 874                                 decompValues.ToArray (typeof (int));
 875                 }
 876
                     // State kept across ProcessUnidataLine() calls.
                     //
                     // Latin base letter taken from the most recent character name that
                     // yielded one; a later Latin entry with no target of its own falls
                     // back to it. char.MinValue means "none", and it is reset to that
                     // after 'Z'.
                     char previousLatinTarget = char.MinValue;
                     // One counter per letter 'A'..'Z', indexed by (letter - 'A').
                     // ProcessUnidataLine() increments it and adds 0x7C to obtain the
                     // value it stores in the diacritical table.
                     byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
 879
 880                 void ProcessUnidataLine (string s, ArrayList decompValues)
 881                 {
 882                         int idx = s.IndexOf ('#');
 883                         if (idx >= 0)
 884                                 s = s.Substring (0, idx);
 885                         idx = s.IndexOf (';');
 886                         if (idx < 0)
 887                                 return;
 888                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 889                         string [] values = s.Substring (idx + 1).Split (';');
 890
 891                         // FIXME: use index
 892                         if (cp > char.MaxValue)
 893                                 return;
 894                         if (IsIgnorable (cp))
 895                                 return;
 896
 897                         string name = values [0];
 898
 899                         // SPECIAL CASE: rename some characters for diacritical
 900                         // remapping. FIXME: why are they different?
 901                         // FIXME: it's still not working.
 902                         if (cp == 0x018B || cp == 0x018C)
 903                                 name = name.Replace ("TOPBAR", "STROKE");
 904
 905                         // isSmallCapital
 906                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 907                                 isSmallCapital [cp] = true;
 908
 909                         // latin mapping by character name
 910                         if (s.IndexOf ("LATIN") >= 0) {
 911                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 912                                 int offset = lidx + 15;
 913                                 if (lidx < 0) {
 914                                         lidx = s.IndexOf ("LETTER TURNED ");
 915                                         offset = lidx + 14;
 916                                 }
 917                                 if (lidx < 0) {
 918                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 919                                         offset = lidx + 15;
 920                                 }
 921                                 if (lidx < 0) {
 922                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 923                                         offset = lidx + 14;
 924                                 }
 925                                 if (lidx < 0) {
 926                                         lidx = s.IndexOf ("LETTER ");
 927                                         offset = lidx + 7;
 928                                 }
 929                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 930                                 char n = s [offset + 1];
 931                                 char target = char.MinValue;
 932                                 if ('A' <= c && c <= 'Z' &&
 933                                         (n == ' ') || n == ';') {
 934                                         target = c;
 935                                         // FIXME: After 'Z', I cannot reset this state.
 936                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 937                                 }
 938
 939                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 940                                         target = 'A';
 941                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 942                                         target = 'B';
 943                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 944                                         target = 'C';
 945                                 else if (s.Substring (offset).StartsWith ("ETH"))
 946                                         target = 'D';
 947                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 948                                         target = 'E';
 949                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 950                                         target = 'O';
 951                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 952                                         target = 'R';
 953                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 954                                         target = 'S';
 955                                 else if (s.Substring (offset).StartsWith ("ESH"))
 956                                         target = 'S';
 957                                 else if (s.Substring (offset).StartsWith ("OUNCE"))
 958                                         target = 'Z';
 959
 960                                 // For remaining IPA chars, direct mapping is
 961                                 // much faster.
 962                                 switch (cp) {
 963                                 case 0x0166: case 0x0167:
 964                                         // Though they are 'T', they have different weight
 965                                         target = char.MinValue; break;
 966                                 case 0x0299: target = 'B'; break;
 967                                 case 0x029A: target = 'E'; break;
 968                                 case 0x029B: target = 'G'; break;
 969                                 case 0x029C: target = 'H'; break;
 970                                 case 0x029D: target = 'J'; break;
 971                                 case 0x029E: target = 'K'; break;
 972                                 case 0x029F: target = 'L'; break;
 973                                 case 0x02A0: target = 'Q'; break;
 974                                 case 0x02A7: target = 'T'; break;
 975                                 case 0x02A8: target = 'T'; break;
 976                                 }
 977
 978                                 if (target == char.MinValue)
 979                                         target = previousLatinTarget;
 980
 981                                 if (target != char.MinValue) {
 982                                         ArrayList entry = (ArrayList) latinMap [target];
 983                                         if (entry == null) {
 984                                                 entry = new ArrayList ();
 985                                                 latinMap [target] = entry;
 986                                         }
 987                                         entry.Add (cp);
 988                                         // FIXME: This secondary weight is hack.
 989                                         // They are here because they must not
 990                                         // be identical to the corresponding
 991                                         // ASCII latins.
 992                                         if (c != target && diacritical [cp] == 0) {
 993                                                 diacriticalOffset [c - 'A']++;
 994                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
 995                                         }
 996                                 }
 997                         }
 998
 999                         // Arrow names
1000                         if (0x2000 <= cp && cp < 0x3000) {
1001                                 int value = 0;
1002                                 // SPECIAL CASES. FIXME: why?
1003                                 switch (cp) {
1004                                 case 0x21C5: value = -1; break; // E2
1005                                 case 0x261D: value = 1; break;
1006                                 case 0x27A6: value = 3; break;
1007                                 case 0x21B0: value = 7; break;
1008                                 case 0x21B1: value = 3; break;
1009                                 case 0x21B2: value = 7; break;
1010                                 case 0x21B4: value = 5; break;
1011                                 case 0x21B5: value = 7; break;
1012                                 case 0x21B9: value = -1; break; // E1
1013                                 case 0x21CF: value = 7; break;
1014                                 case 0x21D0: value = 3; break;
1015                                 }
1016                                 string [] arrowTargets = new string [] {
1017                                         "",
1018                                         "UPWARDS",
1019                                         "NORTH EAST",
1020                                         "RIGHTWARDS",
1021                                         "SOUTH EAST",
1022                                         "DOWNWARDS",
1023                                         "SOUTH WEST",
1024                                         "LEFTWARDS",
1025                                         "NORTH WEST",
1026                                         "LEFT RIGHT",
1027                                         "UP DOWN",
1028                                         };
1029                                 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
1030                                         s.IndexOf ("LEFTWARDS") >= 0)
1031                                         value = 0xE1 - 0xD8;
1032                                 else if (s.IndexOf ("UPWARDS") >= 0 &&
1033                                         s.IndexOf ("DOWNWARDS") >= 0)
1034                                         value = 0xE2 - 0xD8;
1035                                 else if (s.IndexOf ("ARROW") >= 0 &&
1036                                         s.IndexOf ("COMBINING") < 0 &&
1037                                         s.IndexOf ("CLOCKWISE") >= 0)
1038                                         value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
1039                                 if (value == 0)
1040                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
1041                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
1042                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
1043                                                         s.IndexOf (" OVER") < 0
1044                                                 )
1045                                                         value = i;
1046                                 if (value > 0)
1047                                         arrowValues.Add (new DictionaryEntry (
1048                                                 cp, value));
1049                         }
1050
1051                         // Box names
1052                         if (0x2500 <= cp && cp < 0x2600) {
1053                                 int value = int.MinValue;
1054                                 // flags:
1055                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
1056                                 // [h,rl] [r] [l]
1057                                 // [v,ud] [u] [d]
1058                                 // [dr] [dl] [ur] [ul]
1059                                 // [vr,udr] [vl,vdl]
1060                                 // [hd,rld] [hu,rlu]
1061                                 // [hv,udrl,rlv,udh]
1062                                 ArrayList flags = new ArrayList (new int [] {
1063                                         32, 8 + 4, 8, 4,
1064                                         16, 1 + 2, 1, 2,
1065                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
1066                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
1067                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
1068                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
1069                                         });
1070                                 byte [] offsets = new byte [] {
1071                                         0, 0, 1, 2,
1072                                         3, 3, 4, 5,
1073                                         6, 7, 8, 9,
1074                                         10, 10, 11, 11,
1075                                         12, 12, 13, 13,
1076                                         14, 14, 14, 14};
1077                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
1078                                         int flag = 0;
1079                                         if (s.IndexOf (" UP") >= 0)
1080                                                 flag |= 1;
1081                                         if (s.IndexOf (" DOWN") >= 0)
1082                                                 flag |= 2;
1083                                         if (s.IndexOf (" RIGHT") >= 0)
1084                                                 flag |= 4;
1085                                         if (s.IndexOf (" LEFT") >= 0)
1086                                                 flag |= 8;
1087                                         if (s.IndexOf (" VERTICAL") >= 0)
1088                                                 flag |= 16;
1089                                         if (s.IndexOf (" HORIZONTAL") >= 0)
1090                                                 flag |= 32;
1091
1092                                         int fidx = flags.IndexOf (flag);
1093                                         if (fidx >= 0)
1094                                                 value = offsets [fidx];
1095                                 } else if (s.IndexOf ("BLOCK") >= 0) {
1096                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
1097                                                 value = 0x12;
1098                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
1099                                                 value = 0x13;
1100                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1101                                                 value = 0x14;
1102                                         else if (s.IndexOf ("HALF") >= 0)
1103                                                 value = 0x15;
1104                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1105                                                 value = 0x16;
1106                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
1107                                                 value = 0x17;
1108                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1109                                                 value = 0x18;
1110                                         else
1111                                                 value = 0x19;
1112                                 }
1113                                 else if (s.IndexOf ("SHADE") >= 0)
1114                                         value = 0x19;
1115                                 else if (s.IndexOf ("SQUARE") >= 0)
1116                                         value = 0xBC - 0xE5;
1117                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1118                                         value = 0xBE - 0xE5;
1119                                 else if (s.IndexOf ("RECTANGLE") >= 0)
1120                                         value = 0xBD - 0xE5;
1121                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1122                                         value = 0xBF - 0xE5;
1123                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
1124                                         if (s.IndexOf ("UP-POINTING") >= 0)
1125                                                 value = 0xC0 - 0xE5;
1126                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1127                                                 value = 0xC1 - 0xE5;
1128                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1129                                                 value = 0xC2 - 0xE5;
1130                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1131                                                 value = 0xC3 - 0xE5;
1132                                 }
1133                                 else if (s.IndexOf ("POINTER") >= 0) {
1134                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1135                                                 value = 0xC4 - 0xE5;
1136                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1137                                                 value = 0xC5 - 0xE5;
1138                                 }
1139                                 else if (s.IndexOf ("DIAMOND") >= 0)
1140                                         value = 0xC6 - 0xE5;
1141                                 else if (s.IndexOf ("FISHEYE") >= 0)
1142                                         value = 0xC7 - 0xE5;
1143                                 else if (s.IndexOf ("LOZENGE") >= 0)
1144                                         value = 0xC8 - 0xE5;
1145                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1146                                         value = 0xC9 - 0xE5;
1147                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1148                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1149                                                 value = 0xCA - 0xE5;
1150                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1151                                                 value = 0xCB - 0xE5;
1152                                         else
1153                                                 value = 0xC9 - 0xE5;
1154                                 }
1155                                 else if (s.IndexOf ("BULLET") >= 0)
1156                                         value = 0xCC - 0xE5;
1157                                 if (0x25DA <= cp && cp <= 0x25E5)
1158                                         value = 0xCD + cp - 0x25DA - 0xE5;
1159
1160                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1161                                 switch (cp) {
1162                                 case 0x2571: value = 0xF; break;
1163                                 case 0x2572: value = 0x10; break;
1164                                 case 0x2573: value = 0x11; break;
1165                                 }
1166                                 if (value != int.MinValue)
1167                                         boxValues.Add (new DictionaryEntry (
1168                                                 cp, value));
1169                         }
1170
1171                         // For some characters store the name and sort later
1172                         // to determine sorting.
1173                         if (0x2100 <= cp && cp <= 0x213F &&
1174                                 Char.IsSymbol ((char) cp))
1175                                 sortableCharNames.Add (
1176                                         new DictionaryEntry (cp, name));
1177                         else if (0x3380 <= cp && cp <= 0x33DD)
1178                                 sortableCharNames.Add (new DictionaryEntry (
1179                                         cp, name.Substring (7)));
1180
1181                         if (Char.GetUnicodeCategory ((char) cp) ==
1182                                 UnicodeCategory.MathSymbol) {
1183                                 if (name.StartsWith ("CIRCLED "))
1184                                         diacritical [cp] = 0xEE;
1185                                 if (name.StartsWith ("SQUARED "))
1186                                         diacritical [cp] = 0xEF;
1187                         }
1188
1189                         // diacritical weights by character name
1190 if (diacritics.Length != diacriticWeights.Length)
1191 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1192                         for (int d = diacritics.Length - 1; d >= 0; d--) {
1193                                 if (s.IndexOf (diacritics [d]) > 0) {
1194                                         diacritical [cp] += diacriticWeights [d];
1195                                         if (s.IndexOf ("COMBINING") >= 0)
1196                                                 diacritical [cp] -= (byte) 2;
1197                                         break;
1198                                 }
1199                                 // also process "COMBINING blah" here
1200                                 // For now it is limited to cp < 0x0370
1201 //                              if (cp < 0x0300 || cp >= 0x0370)
1202 //                                      continue;
1203                                 string tmp = diacritics [d].TrimEnd (';');
1204                                 if (tmp.IndexOf ("WITH ") == 0)
1205                                         tmp = tmp.Substring (4);
1206                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1207                                 if (name == tmp) {
1208                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1209                                         break;
1210                                 }
1211 //if (name == tmp)
1212 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1213                         }
1214                         // Two-step grep required for it.
1215                         if (s.IndexOf ("FULL STOP") > 0 &&
1216                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1217                                 diacritical [cp] |= 0xF4;
1218                         if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1219                                 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1220                                         s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1221
1222                         // Arabic letter name
1223                         if (0x0621 <= cp && cp <= 0x064A &&
1224                                 Char.GetUnicodeCategory ((char) cp)
1225                                 == UnicodeCategory.OtherLetter) {
1226                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1227                                 switch (cp) {
1228                                 case 0x0621:
1229                                 case 0x0624:
1230                                 case 0x0626:
1231                                         // hamza, waw, yeh ... special cases.
1232                                         value = 0x07;
1233                                         break;
1234                                 case 0x0649:
1235                                 case 0x064A:
1236                                         value = 0x77; // special cases.
1237                                         break;
1238                                 default:
1239                                         // Get primary letter name i.e.
1240                                         // XXX part of ARABIC LETTER XXX yyy
1241                                         // e.g. that of "TEH MARBUTA" is "TEH".
1242                                         string letterName =
1243                                                 (cp == 0x0640) ?
1244                                                 // 0x0640 is special: it does
1245                                                 // not start with ARABIC LETTER
1246                                                 name :
1247                                                 name.Substring (14);
1248                                         int tmpIdx = letterName.IndexOf (' ');
1249                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1250 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1251                                         if (arabicNameMap.ContainsKey (letterName))
1252                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1253                                         else
1254                                                 arabicNameMap [letterName] = cp;
1255                                         break;
1256                                 }
1257                                 arabicLetterPrimaryValues [cp] = value;
1258                         }
1259
1260                         // Japanese square letter
1261                         if (0x3300 <= cp && cp <= 0x3357)
1262                                 if (!ExistsJIS (cp))
1263                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1264
1265                         // normalizationType
1266                         string decomp = values [4];
1267                         idx = decomp.IndexOf ('<');
1268                         if (idx >= 0) {
1269                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1270                                 case "full":
1271                                         decompType [cp] = DecompositionFull;
1272                                         break;
1273                                 case "sub":
1274                                         decompType [cp] = DecompositionSub;
1275                                         break;
1276                                 case "super":
1277                                         decompType [cp] = DecompositionSuper;
1278                                         break;
1279                                 case "small":
1280                                         decompType [cp] = DecompositionSmall;
1281                                         break;
1282                                 case "isolated":
1283                                         decompType [cp] = DecompositionIsolated;
1284                                         break;
1285                                 case "initial":
1286                                         decompType [cp] = DecompositionInitial;
1287                                         break;
1288                                 case "final":
1289                                         decompType [cp] = DecompositionFinal;
1290                                         break;
1291                                 case "medial":
1292                                         decompType [cp] = DecompositionMedial;
1293                                         break;
1294                                 case "noBreak":
1295                                         decompType [cp] = DecompositionNoBreak;
1296                                         break;
1297                                 case "compat":
1298                                         decompType [cp] = DecompositionCompat;
1299                                         break;
1300                                 case "fraction":
1301                                         decompType [cp] = DecompositionFraction;
1302                                         break;
1303                                 case "font":
1304                                         decompType [cp] = DecompositionFont;
1305                                         break;
1306                                 case "circle":
1307                                         decompType [cp] = DecompositionCircle;
1308                                         break;
1309                                 case "square":
1310                                         decompType [cp] = DecompositionSquare;
1311                                         break;
1312                                 case "wide":
1313                                         decompType [cp] = DecompositionWide;
1314                                         break;
1315                                 case "narrow":
1316                                         decompType [cp] = DecompositionNarrow;
1317                                         break;
1318                                 case "vertical":
1319                                         decompType [cp] = DecompositionVertical;
1320                                         break;
1321                                 default:
1322                                         throw new Exception ("Support NFKD type : " + decomp);
1323                                 }
1324                         }
1325                         else
1326                                 decompType [cp] = DecompositionCanonical;
1327                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1328                         if (decomp.Length > 0) {
1329
1330                                 string [] velems = decomp.Split (' ');
1331                                 int didx = decompValues.Count;
1332                                 decompIndex [cp] = didx;
1333                                 foreach (string v in velems)
1334                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1335                                 decompLength [cp] = velems.Length;
1336
1337                                 // [decmpType] -> this_cp
1338                                 int targetCP = (int) decompValues [didx];
1339                                 // for "(x)" it specially maps to 'x' .
1340                                 // FIXME: check if it is sane
1341                                 if (velems.Length == 3 &&
1342                                         (int) decompValues [didx] == '(' &&
1343                                         (int) decompValues [didx + 2] == ')')
1344                                         targetCP = (int) decompValues [didx + 1];
1345                                 // special: 0x215F "1/"
1346                                 else if (cp == 0x215F)
1347                                         targetCP = '1';
1348                                 else if (velems.Length > 1 &&
1349                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1350                                         // skip them, except for CJK ideograph compat
1351                                         targetCP = 0;
1352
1353                                 if (targetCP != 0) {
1354                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1355                                         if (entry == null) {
1356                                                 entry = new Hashtable ();
1357                                                 nfkdMap [targetCP] = entry;
1358                                         }
1359                                         entry [(byte) decompType [cp]] = cp;
1360                                 }
1361                         }
1362                         // numeric values
1363                         if (values [5].Length > 0)
1364                                 decimalValue [cp] = decimal.Parse (values [5]);
1365                         else if (values [6].Length > 0)
1366                                 decimalValue [cp] = decimal.Parse (values [6]);
1367                         else if (values [7].Length > 0) {
1368                                 string decstr = values [7];
1369                                 idx = decstr.IndexOf ('/');
1370                                 if (cp == 0x215F) // special. "1/"
1371                                         decimalValue [cp] = 0x1;
1372                                 else if (idx > 0)
1373                                         // m/n
1374                                         decimalValue [cp] =
1375                                                 decimal.Parse (decstr.Substring (0, idx))
1376                                                 / decimal.Parse (decstr.Substring (idx + 1));
1377                                 else if (decstr [0] == '(' &&
1378                                         decstr [decstr.Length - 1] == ')')
1379                                         // (n)
1380                                         decimalValue [cp] =
1381                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1382                                 else if (decstr [decstr.Length - 1] == '.')
1383                                         // n.
1384                                         decimalValue [cp] =
1385                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1386                                 else
1387                                         decimalValue [cp] = decimal.Parse (decstr);
1388                         }
1389                 }
1390
// Reads the property file named by 'filename' and feeds every line to
// ProcessDerivedCorePropLine (), which records the "Uppercase" ranges.
// If a line fails, its 1-based number is written to stderr and the
// exception is rethrown unchanged.
void ParseDerivedCoreProperties (string filename)
{
        using (StreamReader reader = new StreamReader (filename)) {
                int lineNumber = 1;
                while (reader.Peek () >= 0) {
                        try {
                                string text = reader.ReadLine ();
                                ProcessDerivedCorePropLine (text);
                        } catch (Exception) {
                                Console.Error.WriteLine ("**** At line " + lineNumber);
                                throw;
                        }
                        lineNumber++;
                }
        }
}
1406
// Handles one line of the form "XXXX[..YYYY] ; Property # comment".
// Everything after '#' is discarded; lines without ';' are ignored.
// For the "Uppercase" property every code point of the (inclusive)
// range is flagged in isUppercase. Ranges that start above
// char.MaxValue are skipped.
void ProcessDerivedCorePropLine (string s)
{
        int hash = s.IndexOf ('#');
        string content = hash < 0 ? s : s.Substring (0, hash);

        int semicolon = content.IndexOf (';');
        if (semicolon < 0)
                return;

        string range = content.Substring (0, semicolon);
        int dots = range.IndexOf ("..");
        NumberStyles hex = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;

        // A single code point is treated as a range of length one.
        int first, last;
        if (dots < 0) {
                first = int.Parse (range, hex);
                last = first;
        } else {
                first = int.Parse (range.Substring (0, dots), hex);
                last = int.Parse (range.Substring (dots + 2), hex);
        }
        string property = content.Substring (range.Length + 1).Trim ();

        // FIXME: use index
        if (first > char.MaxValue)
                return;

        if (property != "Uppercase")
                return;
        for (int cp = first; cp <= last; cp++) {
                isUppercase [cp] = true;
        }
}
1434
// Reads the script-assignment file named by 'filename' (lines of the
// form "XXXX[..YYYY] ; Script # comment") and collects the
// non-ignorable BMP characters of the Gurmukhi, Gujarati, Georgian and
// Thaana scripts. Each collection is then sorted with UCAComparer and
// stored in the matching ordered* array.
void ParseScripts (string filename)
{
        ArrayList gurmukhiChars = new ArrayList ();
        ArrayList gujaratiChars = new ArrayList ();
        ArrayList georgianChars = new ArrayList ();
        ArrayList thaanaChars = new ArrayList ();

        using (StreamReader reader = new StreamReader (filename)) {
                while (reader.Peek () >= 0) {
                        string text = reader.ReadLine ();

                        // Drop the trailing comment, then require a ';' separator.
                        int hash = text.IndexOf ('#');
                        if (hash >= 0)
                                text = text.Substring (0, hash);
                        int semicolon = text.IndexOf (';');
                        if (semicolon < 0)
                                continue;

                        string range = text.Substring (0, semicolon);
                        int dots = range.IndexOf ("..");
                        NumberStyles hex = NumberStyles.HexNumber |
                                NumberStyles.AllowTrailingWhite;
                        int first, last;
                        if (dots < 0) {
                                first = int.Parse (range, hex);
                                last = first;
                        } else {
                                first = int.Parse (range.Substring (0, dots), hex);
                                last = int.Parse (range.Substring (dots + 2), hex);
                        }
                        string script = text.Substring (range.Length + 1).Trim ();

                        // FIXME: use index
                        if (first > char.MaxValue)
                                continue;

                        // Pick the collection for this script; other scripts
                        // are not collected.
                        ArrayList target;
                        switch (script) {
                        case "Gurmukhi":
                                target = gurmukhiChars;
                                break;
                        case "Gujarati":
                                target = gujaratiChars;
                                break;
                        case "Georgian":
                                target = georgianChars;
                                break;
                        case "Thaana":
                                target = thaanaChars;
                                break;
                        default:
                                target = null;
                                break;
                        }
                        if (target == null)
                                continue;

                        for (int cp = first; cp <= last; cp++) {
                                if (IsIgnorable (cp))
                                        continue;
                                target.Add ((char) cp);
                        }
                }
        }

        gurmukhiChars.Sort (UCAComparer.Instance);
        gujaratiChars.Sort (UCAComparer.Instance);
        georgianChars.Sort (UCAComparer.Instance);
        thaanaChars.Sort (UCAComparer.Instance);
        orderedGurmukhi = (char []) gurmukhiChars.ToArray (typeof (char));
        orderedGujarati = (char []) gujaratiChars.ToArray (typeof (char));
        orderedGeorgian = (char []) georgianChars.ToArray (typeof (char));
        orderedThaana = (char []) thaanaChars.ToArray (typeof (char));
}
1498
// Reads the JIS order file named by 'filename' and passes each line to
// ProcessJISOrderLine (). On any failure the 1-based number of the line
// being processed is written to stderr and the exception is rethrown.
void ParseJISOrder (string filename)
{
        int lineNumber = 1;
        try {
                using (StreamReader reader = new StreamReader (filename)) {
                        while (reader.Peek () >= 0) {
                                ProcessJISOrderLine (reader.ReadLine ());
                                lineNumber++;
                        }
                }
        } catch (Exception) {
                Console.Error.WriteLine ("---- line {0}", lineNumber);
                throw;
        }
}
1513
// Characters that separate the two columns of a JIS order line.
char [] ws = new char [] {'\t', ' '};

// Parses one line of the JIS order file, "0xJJJJ<ws>0xUUUU [# comment]",
// and appends the (code point, JIS code) pair to jisJapanese.
// Empty lines, comment-only lines and lines without a column separator
// are ignored. Malformed hex values still raise from int.Parse ().
void ProcessJISOrderLine (string s)
{
        int idx = s.IndexOf ('#');
        if (idx >= 0)
                s = s.Substring (0, idx);
        // Trim unconditionally. Previously the line was trimmed only when
        // it contained a comment, so a whitespace-only line or a line with
        // leading whitespace made Substring (2, idx - 2) below throw
        // ArgumentOutOfRangeException.
        s = s.Trim ();
        if (s.Length == 0)
                return;
        idx = s.IndexOfAny (ws);
        if (idx < 0)
                return;
        // They start with "0x" so cut them out.
        int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
        int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
        jisJapanese.Add (new JISCharacter (cp, jis));
}
1531
// Fills the per-culture CJK weight tables:
//   - cjkCHS from the 'pinyin' <pc> rule of zhXML,
//   - cjkCHT from the 'stroke' <pc> rule of the same document,
//   - cjkJA from the jisJapanese list (in list order),
//   - cjkKO and cjkKOlv2 from the <reset> rules of koXML.
// jaXML is accepted for interface compatibility but is not read here.
void ParseCJK (string zhXML, string jaXML, string koXML)
{
        XmlDocument ldml = new XmlDocument ();
        ldml.XmlResolver = null;

        // Chinese Simplified
        ldml.Load (zhXML);
        FillCjkFromPcRule (ldml,
                "/ldml/collations/collation[@type='pinyin']/rules/pc",
                "chs", cjkCHS, 0x8008, '\u3100');

        // Chinese Traditional (same document, different collation type)
        FillCjkFromPcRule (ldml,
                "/ldml/collations/collation[@type='stroke']/rules/pc",
                "cht", cjkCHT, 0x8002, '\u4E00');

        // Japanese
        ushort [] ja = cjkJA;

        // SPECIAL CASES
        ja [0x4EDD] = 0x8002; // Chinese repetition mark?
        ja [0x337B] = 0x8004; // Those 4 characters are Gengou
        ja [0x337E] = 0x8005;
        ja [0x337D] = 0x8006;
        ja [0x337C] = 0x8007;

        int weight = 0x8008;
        foreach (JISCharacter jc in jisJapanese) {
                if (jc.JIS < 0x8800)
                        continue;
                char ideograph = (char) jc.CP;

                // Characters below U+4E00 are skipped without a warning.
                if (ideograph < '\u4E00')
                        continue;

                AssignNextCjkWeight (ja, (int) ideograph, ref weight);

                // SPECIAL CASES: for these ideographs the scan for
                // decomposing characters below is not performed.
                switch (ideograph) {
                case '\u662D': // U+337C
                case '\u5927': // U+337D
                case '\u5E73': // U+337B
                case '\u660E': // U+337E
                case '\u9686': // U+F9DC
                        continue;
                }

                // Give the next weights to every non-ignorable character
                // whose decomposition starts with this ideograph, or is
                // three elements long with the ideograph in the middle.
                // FIXME: there are still remaining
                // characters after U+FA0C.
                for (int k = 0; k < '\uFA0D'; k++) {
                        if (decompIndex [k] == 0 || IsIgnorable (k))
                                continue;
                        if (decompValues [decompIndex [k]] == ideograph ||
                                (decompLength [k] == 3 &&
                                 decompValues [decompIndex [k] + 1] == ideograph))
                                AssignNextCjkWeight (ja, k, ref weight);
                }
        }

        // Korean
        // Korean weight is somewhat complex. It first shifts
        // Hangul category from 52-x to 80-x (they are anyways
        // computed). CJK ideographs are placed at secondary
        // weight, like XX YY 01 zz 01, where XX and YY are
        // corresponding "reset" value and zz is 41,43,45...
        //
        // Unlike chs, cht and ja, the Korean value is a combined
        // ushort derived from the "reset" Hangul syllable.
        ushort [] ko = cjkKO;
        ldml.Load (koXML);
        foreach (XmlElement reset in ldml.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                XmlElement following = (XmlElement) reset.NextSibling;
                // Compute "category" and "level 1" for the target
                // "reset" Hangul syllable.
                char syllable = reset.InnerText [0];
                int ordinal = ((int) syllable - 0xAC00) + 1;
                ushort primary = (ushort)
                        ((ordinal / 254) * 256 + (ordinal % 254) + 2);
                // Place the characters after the target; their level 2
                // values run 0x41, 0x43, 0x45, ...
                string members = following.InnerText;
                int secondary = 0x41;
                foreach (char c in members) {
                        ko [(int) c] = primary;
                        cjkKOlv2 [(int) c] = (byte) secondary;
                        secondary += 2;
                }
        }
}

// Assigns consecutive weights, starting at firstWeight, to the
// characters listed in the text of the node selected by 'xpath'.
// Characters below lowestAccepted get no weight; a warning naming
// 'category' is written to stderr for each of them instead.
static void FillCjkFromPcRule (XmlDocument doc, string xpath, string category, ushort [] table, int firstWeight, char lowestAccepted)
{
        string chars = doc.SelectSingleNode (xpath).InnerText;
        int weight = firstWeight;
        foreach (char c in chars) {
                if (c >= lowestAccepted) {
                        AssignNextCjkWeight (table, (int) c, ref weight);
                        continue;
                }
                Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, weight);
        }
}

// Stores the current weight at table [index] and advances it. When the
// advanced weight lands on a multiple of 256 it is bumped by 2, so low
// bytes 0x00 and 0x01 are never handed out.
// NOTE(review): presumably those bytes are reserved in sort keys - confirm.
static void AssignNextCjkWeight (ushort [] table, int index, ref int weight)
{
        table [index] = (ushort) weight++;
        if (weight % 256 == 0)
                weight += 2;
}
1663
1664                 #endregion
1665
1666                 #region Generation
1667
1668                 void FillIgnorables ()
1669                 {
1670                         for (int i = 0; i <= char.MaxValue; i++) {
1671                                 if (Char.GetUnicodeCategory ((char) i) ==
1672                                         UnicodeCategory.OtherNotAssigned)
1673                                         continue;
1674                                 if (IsIgnorable (i))
1675                                         ignorableFlags [i] |= 1;
1676                                 if (IsIgnorableSymbol (i))
1677                                         ignorableFlags [i] |= 2;
1678                                 if (IsIgnorableNonSpacing (i))
1679                                         ignorableFlags [i] |= 4;
1680                         }
1681                 }
1682
1683                 void ModifyUnidata ()
1684                 {
1685                         ArrayList decompValues = new ArrayList (this.decompValues);
1686
1687                         // Hebrew uppercase letters.
1688                         foreach (int i in new int []
1689                                 {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6})
1690                                 isUppercase [i] = true;
1691
1692
1693                         // Modify some decomposition equivalence
1694                         for (int i = 0xFE31; i <= 0xFE34; i++) {
1695                                 decompType [i] = 0;
1696                                 decompIndex [i] = 0;
1697                                 decompLength [i] = 0;
1698                         }
1699                         decompType [0x037E] = 0;
1700                         decompIndex [0x037E] = 0;
1701                         decompLength [0x037E] = 0;
1702
1703                         // Hangzhou numbers
1704                         for (int i = 0x3021; i <= 0x3029; i++)
1705                                 diacritical [i] = 0x4E;
1706                         // Korean parens numbers
1707                         for (int i = 0x3200; i <= 0x321C; i++)
1708                                 diacritical [i] = 0xA;
1709                         for (int i = 0x3260; i <= 0x327B; i++)
1710                                 diacritical [i] = 0xC;
1711
1712                         // LAMESPEC: these remapping should not be done.
1713                         // Windows have incorrect CJK compat mappings.
1714                         decompValues [decompIndex [0x32A9]] = 0x91AB;
1715                         decompLength [0x323B] = 1;
1716                         decompValues [decompIndex [0x323B]] = 0x5B78;
1717                         decompValues [decompIndex [0x32AB]] = 0x5B78;
1718                         decompValues [decompIndex [0x32A2]] = 0x5BEB;
1719                         decompLength [0x3238] = 1;
1720                         decompValues [decompIndex [0x3238]] = 0x52DE;
1721                         decompValues [decompIndex [0x3298]] = 0x52DE;
1722
1723                         // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
1724                         decompIndex [0xFA0C] = decompValues.Count;
1725                         decompValues.Add ((int) 0x5140);
1726                         decompLength [0xFA0C] = 1;
1727                         decompIndex [0xF929] = decompLength [0xF929] = 0;
1728
1729                         decompValues [decompIndex [0xF92C]] = 0x90DE;
1730
1731                         decompIndex [0x2125] = decompValues.Count;
1732                         decompValues.Add ((int) 0x005A);
1733                         decompLength [0x2125] = 1;
1734                         decompType [0x2125] = DecompositionFont;
1735
1736                         this.decompValues = decompValues.ToArray (typeof (int)) as int [];
1737                 }
1738
1739                 void ModifyParsedValues ()
1740                 {
1741                         // Sometimes STROKE don't work fine
1742                         diacritical [0xD8] = diacritical [0xF8] = 0x21;
1743                         diacritical [0x141] = diacritical [0x142] = 0x1F;
1744                         // FIXME: why?
1745                         diacritical [0xAA] = diacritical [0xBA] = 3;
1746                         diacritical [0xD0] = diacritical [0xF0] = 0x68;
1747                         diacritical [0x131] = 3;
1748                         diacritical [0x138] = 3;
1749                         // TOPBAR does not work as an identifier for the weight
1750                         diacritical [0x182] = diacritical [0x183] = 0x68; // B
1751                         diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D
1752                         // TONE TWO
1753                         diacritical [0x1A7] = diacritical [0x1A8] = 0x87;
1754                         // TONE SIX
1755                         diacritical [0x184] = diacritical [0x185] = 0x87;
1756                         // OPEN E
1757                         diacritical [0x190] = diacritical [0x25B] = 0x7B;
1758                         // There are many letters w/ diacritical weight 0x7B
1759                         diacritical [0x0192] = diacritical [0x0194] =
1760                         diacritical [0x0195] = diacritical [0x0196] =
1761                         diacritical [0x019C] = diacritical [0x019E] =
1762                         diacritical [0x01A6] = diacritical [0x01B1] =
1763                         diacritical [0x01B2] = diacritical [0x01BF] = 0x7B;
1764                         // ... as well as 0x7C
1765                         diacritical [0x01A2] = diacritical [0x01A3] = 0x7C;
1766
1767                         // <font> NFKD characters seem to have diacritical
1768                         // weight as 3,4,5... but the order does not look
1769                         // by codepoint and I have no idea how they are sorted.
1770                         diacritical [0x210E] = 3;
1771                         diacritical [0x210F] = 0x68;
1772                         diacritical [0x2110] = 4;
1773                         diacritical [0x2111] = 5;
1774                         diacritical [0x2112] = 4;
1775                         diacritical [0x2113] = 4;
1776                         diacritical [0x211B] = 4;
1777                         diacritical [0x211C] = 5;
1778
1779                         // some cyrillic diacritical weight. They seem to be
1780                         // based on old character names, so it's quicker to
1781                         // set them directly here.
1782                         // FIXME: they are by mostly unknown reason
1783                         diacritical [0x0496] = diacritical [0x0497] = 7;
1784                         diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1785                         diacritical [0x049A] = diacritical [0x049B] = 0x17;
1786                         diacritical [0x049C] = diacritical [0x049D] = 9;
1787                         diacritical [0x049E] = diacritical [0x049F] = 4;
1788                         diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1789                         diacritical [0x04A2] = diacritical [0x04A3] = 7;
1790                         diacritical [0x04A4] = diacritical [0x04A5] = 8;
1791                         diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA?
1792                         diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2
1793                         diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U?
1794                         diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC
1795                         diacritical [0x04B4] = diacritical [0x04B5] = 3;
1796                         diacritical [0x04B6] = 8;
1797                         diacritical [0x04B7] = 7;
1798                         diacritical [0x04B8] = diacritical [0x04B9] = 9;
1799                         diacritical [0x04BA] = diacritical [0x04BB] = 9;
1800
1801                         // number, secondary weights
1802                         byte weight = 0x38;
1803                         int [] numarr = numberSecondaryWeightBounds;
1804                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1805                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1806                                         if (Char.IsNumber ((char) cp))
1807                                                 diacritical [cp] = weight;
1808
1809                         // Gurmukhi special letters' diacritical weight
1810                         for (int i = 0x0A50; i < 0x0A60; i++)
1811                                 diacritical [i] = 4;
1812                         // Oriya special letters' diacritical weight
1813                         for (int i = 0x0B5C; i < 0x0B60; i++)
1814                                 diacritical [i] = 6;
1815
1816                         // Update name part of named characters
1817                         for (int i = 0; i < sortableCharNames.Count; i++) {
1818                                 DictionaryEntry de =
1819                                         (DictionaryEntry) sortableCharNames [i];
1820                                 int cp = (int) de.Key;
1821                                 string renamed = null;
1822                                 switch (cp) {
1823                                 case 0x2101: renamed = "A_1"; break;
1824                                 case 0x33C3: renamed = "A_2"; break;
1825                                 case 0x2105: renamed = "C_1"; break;
1826                                 case 0x2106: renamed = "C_2"; break;
1827                                 case 0x211E: renamed = "R1"; break;
1828                                 case 0x211F: renamed = "R2"; break;
1829                                 // Remove some of them!
1830                                 case 0x2103:
1831                                 case 0x2109:
1832                                 case 0x2116:
1833                                 case 0x2117:
1834                                 case 0x2118:
1835                                 case 0x2125:
1836                                 case 0x2127:
1837                                 case 0x2129:
1838                                 case 0x212E:
1839                                 case 0x2132:
1840                                         sortableCharNames.RemoveAt (i);
1841                                         i--;
1842                                         continue;
1843                                 }
1844                                 if (renamed != null)
1845                                         sortableCharNames [i] =
1846                                                 new DictionaryEntry (cp, renamed);
1847                         }
1848                 }
1849
1850                 void GenerateCore ()
1851                 {
1852                         UnicodeCategory uc;
1853
1854                         #region Specially ignored // 01
1855                         // This will raise "Defined" flag up.
1856                         // FIXME: Check If it is really fine. Actually for
1857                         // Japanese voice marks this code does remapping.
1858                         foreach (char c in specialIgnore)
1859                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1860                         #endregion
1861
1862                         #region Extenders (FF FF)
1863                         fillIndex [0xFF] = 0xFF;
1864                         char [] specialBiggest = new char [] {
1865                                 '\u3005', '\u3031', '\u3032', '\u309D',
1866                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1867                                 '\uFE7C', '\uFE7D', '\uFF70'};
1868                         foreach (char c in specialBiggest)
1869                                 AddCharMap (c, 0xFF, 0);
1870                         #endregion
1871
1872                         #region Variable weights
1873                         // Controls : 06 03 - 06 3D
1874                         fillIndex [0x6] = 3;
1875                         for (int i = 0; i < 65536; i++) {
1876                                 if (IsIgnorable (i))
1877                                         continue;
1878                                 char c = (char) i;
1879                                 uc = Char.GetUnicodeCategory (c);
1880                                 // NEL is whitespace but not ignored here.
1881                                 if (uc == UnicodeCategory.Control &&
1882                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1883                                         AddCharMap (c, 6, 1);
1884                         }
1885
1886                         // Apostrophe 06 80
1887                         fillIndex [0x6] = 0x80;
1888                         AddCharMap ('\'', 6, 0);
1889                         AddCharMap ('\uFF07', 6, 1);
1890                         AddCharMap ('\uFE63', 6, 1);
1891
1892                         // SPECIAL CASE: fill FE32 here in prior to be added
1893                         // at 2013. Windows does not always respect NFKD.
1894                         map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1895
1896                         // Hyphen/Dash : 06 81 - 06 90
1897                         for (int i = 0; i < char.MaxValue; i++) {
1898                                 if (!IsIgnorable (i) &&
1899                                         Char.GetUnicodeCategory ((char) i) ==
1900                                         UnicodeCategory.DashPunctuation) {
1901                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1902                                         if (i == 0x2011) {
1903                                                 // SPECIAL: add 2027 and 2043
1904                                                 // Maybe they are regarded the
1905                                                 // same hyphens in "central"
1906                                                 // position.
1907                                                 AddCharMap ('\u2027', 6, 1);
1908                                                 AddCharMap ('\u2043', 6, 1);
1909                                         }
1910                                 }
1911                         }
1912                         // They are regarded as primarily equivalent to '-'
1913                         map [0x208B] = new CharMapEntry (6, 0x82, 0);
1914                         map [0x207B] = new CharMapEntry (6, 0x82, 0);
1915                         map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1916
1917                         // Arabic variable weight chars 06 A0 -
1918                         fillIndex [6] = 0xA0;
1919                         // vowels
1920                         for (int i = 0x64B; i <= 0x650; i++)
1921                                 AddArabicCharMap ((char) i, 6, 1, 0);
1922                         // sukun
1923                         AddCharMapGroup ('\u0652', 6, 1, 0);
1924                         // shadda
1925                         AddCharMapGroup ('\u0651', 6, 1, 0);
1926                         #endregion
1927
1928
1929                         #region Nonspacing marks // 01
1930                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1931
1932                         // Combining diacritical marks: 01 DC -
1933
1934                         fillIndex [0x1] = 0x41;
1935                         for (int i = 0x030E; i <= 0x0326; i++)
1936                                 if (!IsIgnorable (i))
1937                                         AddCharMap ((char) i, 0x1, 1);
1938                         for (int i = 0x0329; i <= 0x0334; i++)
1939                                 if (!IsIgnorable (i))
1940                                         AddCharMap ((char) i, 0x1, 1);
1941                         fillIndex [0x1]++;
1942                         for (int i = 0x0339; i <= 0x0341; i++)
1943                                 if (!IsIgnorable (i))
1944                                         AddCharMap ((char) i, 0x1, 1);
1945                         fillIndex [0x1] = 0x74;
1946                         for (int i = 0x0346; i <= 0x0348; i++)
1947                                 if (!IsIgnorable (i))
1948                                         AddCharMap ((char) i, 0x1, 1);
1949                         for (int i = 0x02BE; i <= 0x02BF; i++)
1950                                 if (!IsIgnorable (i))
1951                                         AddCharMap ((char) i, 0x1, 1);
1952                         for (int i = 0x02C1; i <= 0x02C5; i++)
1953                                 if (!IsIgnorable (i))
1954                                         AddCharMap ((char) i, 0x1, 1);
1955                         for (int i = 0x02CE; i <= 0x02CF; i++)
1956                                 if (!IsIgnorable (i))
1957                                         AddCharMap ((char) i, 0x1, 1);
1958                         fillIndex [0x1]++;
1959                         for (int i = 0x02D1; i <= 0x02D3; i++)
1960                                 if (!IsIgnorable (i))
1961                                         AddCharMap ((char) i, 0x1, 1);
1962                         AddCharMap ('\u02DE', 0x1, 1);
1963                         for (int i = 0x02E4; i <= 0x02E9; i++)
1964                                 if (!IsIgnorable (i))
1965                                         AddCharMap ((char) i, 0x1, 1);
1966
1967
1968                         // FIXME: needs more love here (it should eliminate
1969                         // all the hacky code above).
1970                         for (int i = 0x0300; i < 0x0370; i++)
1971                                 if (!IsIgnorable (i) && diacritical [i] != 0
1972                                         && !map [i].Defined)
1973                                         map [i] = new CharMapEntry (
1974                                                 0x1, 0x1, diacritical [i]);
1975
1976                         // Cyrillic and Armenian nonspacing mark
1977                         fillIndex [0x1] = 0x94;
1978                         for (int i = 0x400; i < 0x580; i++)
1979                                 if (!IsIgnorable (i) &&
1980                                         Char.GetUnicodeCategory ((char) i) ==
1981                                         UnicodeCategory.NonSpacingMark)
1982                                         AddCharMap ((char) i, 1, 1);
1983
1984                         fillIndex [0x1] = 0x8D;
1985                         // syriac dotted nonspacing marks (1)
1986                         AddCharMap ('\u0740', 0x1, 1);
1987                         AddCharMap ('\u0741', 0x1, 1);
1988                         AddCharMap ('\u0742', 0x1, 1);
1989                         // syriac oblique nonspacing marks
1990                         AddCharMap ('\u0747', 0x1, 1);
1991                         AddCharMap ('\u0748', 0x1, 1);
1992                         // syriac dotted nonspacing marks (2)
1993                         fillIndex [0x1] = 0x94; // this reset is mandatory
1994                         AddCharMap ('\u0732', 0x1, 1);
1995                         AddCharMap ('\u0735', 0x1, 1);
1996                         AddCharMap ('\u0738', 0x1, 1);
1997                         AddCharMap ('\u0739', 0x1, 1);
1998                         AddCharMap ('\u073C', 0x1, 1);
1999                         // SPECIAL CASES: superscripts
2000                         AddCharMap ('\u073F', 0x1, 1);
2001                         AddCharMap ('\u0711', 0x1, 1);
2002                         // syriac "DOTS"
2003                         for (int i = 0x0743; i <= 0x0746; i++)
2004                                 AddCharMap ((char) i, 0x1, 1);
2005                         for (int i = 0x0730; i <= 0x0780; i++)
2006                                 if (!map [i].Defined &&
2007                                         Char.GetUnicodeCategory ((char) i) ==
2008                                         UnicodeCategory.NonSpacingMark)
2009                                         AddCharMap ((char) i, 0x1, 1);
2010
2011                         // LAMESPEC: It should not stop at '\u20E1'. There are
2012                         // a few more characters (that however results in
2013                         // overflow of level 2 unless we start before 0xDD).
2014                         fillIndex [0x1] = 0xDD;
2015                         for (int i = 0x20D0; i <= 0x20DC; i++)
2016                                 AddCharMap ((char) i, 0x1, 1);
2017                         fillIndex [0x1] = 0xEC;
2018                         for (int i = 0x20DD; i <= 0x20E1; i++)
2019                                 AddCharMap ((char) i, 0x1, 1);
2020                         fillIndex [0x1] = 0x4;
2021                         AddCharMap ('\u0CD5', 0x1, 1);
2022                         AddCharMap ('\u0CD6', 0x1, 1);
2023                         AddCharMap ('\u093C', 0x1, 1);
2024                         for (int i = 0x302A; i <= 0x302D; i++)
2025                                 AddCharMap ((char) i, 0x1, 1);
2026                         AddCharMap ('\u0C55', 0x1, 1);
2027                         AddCharMap ('\u0C56', 0x1, 1);
2028
2029                         fillIndex [0x1] = 0x50; // I wonder how they are sorted
2030                         for (int i = 0x02D4; i <= 0x02D7; i++)
2031                                 AddCharMap ((char) i, 0x1, 1);
2032
2033                         // They are not part of Nonspacing marks, but have
2034                         // only diacritical weight.
2035                         for (int i = 0x3099; i <= 0x309C; i++)
2036                                 map [i] = new CharMapEntry (1, 1, 1);
2037                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
2038                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
2039                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
2040                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
2041                         for (int i = 0x30FC; i <= 0x30FE; i++)
2042                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
2043
2044                         fillIndex [0x1] = 0xA;
2045                         for (int i = 0x0951; i <= 0x0954; i++)
2046                                 AddCharMap ((char) i, 0x1, 2);
2047
2048                         #endregion
2049
2050
2051                         #region Whitespaces // 07 03 -
2052                         fillIndex [0x7] = 0x2;
2053                         AddCharMap (' ', 0x7, 2);
2054                         AddCharMap ('\u00A0', 0x7, 1);
2055                         for (int i = 9; i <= 0xD; i++)
2056                                 AddCharMap ((char) i, 0x7, 1);
2057                         for (int i = 0x2000; i <= 0x200B; i++)
2058                                 AddCharMap ((char) i, 0x7, 1);
2059
2060                         fillIndex [0x7] = 0x17;
2061                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
2062                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
2063
2064                         // Characters which used to represent layout control.
2065                         // LAMESPEC: Windows developers seem to have thought
2066                         // that those characters are kind of whitespaces,
2067                         // while they aren't.
2068                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
2069                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
2070
2071                         #endregion
2072
2073                         // category 09 - continued symbols from 08
2074                         fillIndex [0x9] = 2;
2075                         // misc tech mark
2076                         for (int cp = 0x2300; cp <= 0x237A; cp++)
2077                                 AddCharMap ((char) cp, 0x9, 1, 0);
2078
2079                         // arrows
2080                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
2081                         foreach (DictionaryEntry de in arrowValues) {
2082                                 int idx = (int) de.Value;
2083                                 int cp = (int) de.Key;
2084                                 if (map [cp].Defined)
2085                                         continue;
2086                                 fillIndex [0x9] = (byte) (0xD8 + idx);
2087                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
2088                                 arrowLv2 [idx]++;
2089                         }
2090                         // boxes
2091                         byte [] boxLv2 = new byte [128];
2092                         // 0-63 will be used for those offsets are positive,
2093                         // and 64-127 are for negative ones.
2094                         for (int i = 0; i < boxLv2.Length; i++)
2095                                 boxLv2 [i] = 3;
2096                         foreach (DictionaryEntry de in boxValues) {
2097                                 int cp = (int) de.Key;
2098                                 int off = (int) de.Value;
2099                                 if (map [cp].Defined)
2100                                         continue;
2101                                 if (off < 0) {
2102                                         fillIndex [0x9] = (byte) (0xE5 + off);
2103                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
2104                                 }
2105                                 else {
2106                                         fillIndex [0x9] = (byte) (0xE5 + off);
2107                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
2108                                 }
2109                         }
2110                         // Some special characters (slanted)
2111                         fillIndex [0x9] = 0xF4;
2112                         AddCharMap ('\u2571', 0x9, 3);
2113                         AddCharMap ('\u2572', 0x9, 3);
2114                         AddCharMap ('\u2573', 0x9, 3);
2115
2116                         // FIXME: implement 0A
2117                         #region Symbols
2118                         fillIndex [0xA] = 2;
2119                         // byte currency symbols
2120                         for (int cp = 0; cp < 0x100; cp++) {
2121                                 uc = Char.GetUnicodeCategory ((char) cp);
2122                                 if (!IsIgnorable (cp) &&
2123                                         uc == UnicodeCategory.CurrencySymbol &&
2124                                         cp != '$')
2125                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
2126                         }
2127                         // byte other symbols
2128                         for (int cp = 0; cp < 0x100; cp++) {
2129                                 if (cp == 0xA6)
2130                                         continue; // SPECIAL: skip FIXME: why?
2131                                 uc = Char.GetUnicodeCategory ((char) cp);
2132                                 if (!IsIgnorable (cp) &&
2133                                         uc == UnicodeCategory.OtherSymbol ||
2134                                         cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
2135                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
2136                         }
2137                         // U+30FB here
2138                         AddCharMapGroup ('\u30FB', 0xA, 1, 0);
2139
2140                         for (int cp = 0x2020; cp <= 0x2031; cp++)
2141                                 if (Char.IsPunctuation ((char) cp))
2142                                         AddCharMap ((char) cp, 0xA, 1, 0);
2143                         // SPECIAL CASES: why?
2144                         AddCharMap ('\u203B', 0xA, 1, 0);
2145                         AddCharMap ('\u2040', 0xA, 1, 0);
2146                         AddCharMap ('\u2041', 0xA, 1, 0);
2147                         AddCharMap ('\u2042', 0xA, 1, 0);
2148
2149                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
2150                                 AddCharMap ((char) cp, 0xA, 1, 0);
2151
2152                         // 3004 is skipped at first...
2153                         for (int cp = 0x3010; cp <= 0x3040; cp++)
2154                                 if (Char.IsSymbol ((char) cp))
2155                                         AddCharMap ((char) cp, 0xA, 1, 0);
2156                         // SPECIAL CASES: added here
2157                         AddCharMap ('\u3004', 0xA, 1, 0);
2158                         AddCharMap ('\u327F', 0xA, 1, 0);
2159
2160                         for (int cp = 0x2600; cp <= 0x2613; cp++)
2161                                 AddCharMap ((char) cp, 0xA, 1, 0);
2162                         // Dingbats
2163                         for (int cp = 0x2620; cp <= 0x2770; cp++)
2164                                 if (Char.IsSymbol ((char) cp))
2165                                         AddCharMap ((char) cp, 0xA, 1, 0);
2166                         // OCR
2167                         for (int i = 0x2440; i < 0x2460; i++)
2168                                 AddCharMap ((char) i, 0xA, 1, 0);
2169
2170                         // SPECIAL CASES: why?
2171                         AddCharMap ('\u0E3F', 0xA, 1, 0);
2172                         AddCharMap ('\u2117', 0xA, 1, 0);
2173                         AddCharMap ('\u20AC', 0xA, 1, 0);
2174                         #endregion
2175
2176                         #region Numbers // 0C 02 - 0C E1
2177                         fillIndex [0xC] = 2;
2178
2179                         // 9F8 : Bengali "one less than the denominator"
2180                         AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2181
2182                         ArrayList numbers = new ArrayList ();
2183                         for (int i = 0; i < 65536; i++)
2184                                 if (!IsIgnorable (i) &&
2185                                         Char.IsNumber ((char) i) &&
2186                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2187                                         numbers.Add (i);
2188
2189                         ArrayList numberValues = new ArrayList ();
2190                         foreach (int i in numbers)
2191                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2192                         // SPECIAL CASE: Cyrillic Thousand sign
2193                         numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2194                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2195
2196 //foreach (DictionaryEntry de in numberValues)
2197 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2198
2199                         // FIXME: the fillIndex adjustment lines are too
2200                         // complicated. They should be simpler.
2201                         decimal prevValue = -1;
2202                         foreach (DictionaryEntry de in numberValues) {
2203                                 int cp = (int) de.Key;
2204                                 decimal currValue = (decimal) de.Value;
2205                                 bool addnew = false;
2206                                 if (prevValue < currValue &&
2207                                         prevValue - (int) prevValue == 0 &&
2208                                         prevValue >= 1) {
2209
2210                                         addnew = true;
2211                                         // Process Hangzhou and Roman numbers
2212
2213                                         // There are some SPECIAL cases.
2214                                         if (currValue != 4) // no increment for 4
2215                                                 fillIndex [0xC]++;
2216
2217                                         int xcp;
2218                                         if (currValue <= 13) {
2219                                                 if (currValue == 4)
2220                                                         fillIndex [0xC]++;
2221                                                 // SPECIAL CASE
2222                                                 if (currValue == 11)
2223                                                         AddCharMap ('\u0BF0', 0xC, 1);
2224                                                 xcp = (int) prevValue + 0x2160 - 1;
2225                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2226                                                 xcp = (int) prevValue + 0x2170 - 1;
2227                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2228                                                 fillIndex [0xC]++;
2229                                         }
2230                                         if (currValue < 12)
2231                                                 fillIndex [0xC]++;
2232                                         if (currValue <= 10) {
2233                                                 xcp = (int) prevValue + 0x3021 - 1;
2234                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2235                                                 fillIndex [0xC]++;
2236                                         }
2237                                 }
2238                                 if (prevValue < currValue)
2239                                         prevValue = currValue;
2240                                 if (map [cp].Defined)
2241                                         continue;
2242                                 // Hangzhou and Roman numerals are added later
2243                                 // (by the code above).
2244                                 if (0x3021 <= cp && cp < 0x302A
2245                                         || 0x2160 <= cp && cp < 0x216C
2246                                         || 0x2170 <= cp && cp < 0x217C)
2247                                         continue;
2248
2249                                 if (cp == 0x215B) // FIXME: why?
2250                                         fillIndex [0xC] += 2;
2251                                 else if (cp == 0x3021) // FIXME: why?
2252                                         fillIndex [0xC]++;
2253                                 if (addnew || cp <= '9') {
2254                                         int mod = (int) currValue - 1;
2255                                         int xcp;
2256                                         if (1 <= currValue && currValue <= 11) {
2257                                                 xcp = mod + 0x2776;
2258                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2259                                                 xcp = mod + 0x2780;
2260                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2261                                                 xcp = mod + 0x278A;
2262                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2263                                         }
2264                                         if (1 <= currValue && currValue <= 20) {
2265                                                 xcp = mod + 0x2460;
2266                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2267                                                 xcp = mod + 0x2474;
2268                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2269                                                 xcp = mod + 0x2488;
2270                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2271                                         }
2272                                 }
2273                                 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2274                                         fillIndex [0xC]++;
2275                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2276
2277                                 switch (cp) {
2278                                 // Maybe Bengali digit numbers do not increase
2279                                 // indexes, but 0x09E6 does.
2280                                 case 0x09E7: case 0x09E8: case 0x09E9:
2281                                 case 0x09EA:
2282                                 // SPECIAL CASES
2283                                 case 0x0BF0: case 0x2180: case 0x2181:
2284                                         break;
2285                                 // SPECIAL CASE
2286                                 case 0x0BF1:
2287                                         fillIndex [0xC]++;
2288                                         break;
2289                                 default:
2290                                         if (currValue < 11 || currValue == 1000)
2291                                                 fillIndex [0xC]++;
2292                                         break;
2293                                 }
2294
2295                                 // Add special cases that are not regarded as
2296                                 // numbers in UnicodeCategory speak.
2297                                 if (cp == '5') {
2298                                         // TONE FIVE
2299                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2300                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2301                                 }
2302                                 else if (cp == '2' || cp == '6') // FIXME: why?
2303                                         fillIndex [0xC]++;
2304                         }
2305
2306                         // 221E: infinity
2307                         fillIndex [0xC] = 0xFF;
2308                         AddCharMap ('\u221E', 0xC, 1);
2309                         #endregion
2310
2311                         #region Letters and NonSpacing Marks (general)
2312
2313                         // ASCII Latin alphabets
2314                         for (int i = 0; i < alphabets.Length; i++)
2315                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2316
2317                         // non-ASCII Latin alphabets
2318                         // FIXME: there are no characters that are placed
2319                         // *after* the "alphabets" array items. This is nothing
2320                         // more than a hack that creates dummy weights for
2321                         // primary characters.
2322                         for (int i = 0x0080; i < 0x0300; i++) {
2323                                 if (!Char.IsLetter ((char) i))
2324                                         continue;
2325                                 // Latin letters that have an NFKD mapping are
2326                                 // not added as independent primary characters.
2327                                 if (decompIndex [i] != 0)
2328                                         continue;
2329                                 // SPECIAL CASES:
2330                                 // 1.some alphabets have primarily
2331                                 //   equivalent ASCII alphabets.
2332                                 // 2.some have independent primary weights,
2333                                 //   but inside a-to-z range.
2334                                 // 3.there are some expanded characters that
2335                                 //   are not part of Unicode Standard NFKD.
2336                                 // 4. some characters are letters per IsLetter
2337                                 //   but not in sortkeys (maybe a Unicode version
2338                                 //   difference caused it).
2339                                 switch (i) {
2340                                 // 1. skipping them does not make sense
2341 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2342 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2343 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2344 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2345 //                              case 0x19B: case 0x19C:
2346                                 // 2. skipping them does not make sense
2347 //                              case 0x14A: // Ng
2348 //                              case 0x14B: // ng
2349                                 // 3.
2350                                 case 0xC6: // AE
2351                                 case 0xE6: // ae
2352                                 case 0xDE: // Icelandic Thorn
2353                                 case 0xFE: // Icelandic Thorn
2354                                 case 0xDF: // German ss
2355                                 case 0xFF: // German ss
2356                                 // 4.
2357                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2358                                 // not classified yet
2359 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2360 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2361 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2362 //                              case 0x1DD:
2363                                         continue;
2364                                 }
2365                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2366                         }
2367
2368                         // IPA extensions
2369                         // FIXME: this results in values that are not equivalent
2370                         // to Windows, but is safer for comparison.
2371                         char [] ipaArray = new char [0x300 - 0x250 + 0x20];
2372                         for (int i = 0x40; i < 0x60; i++)
2373                                 if (Char.IsLetter ((char) i))
2374                                         ipaArray [i - 0x40] = (char) (i);
2375                         for (int i = 0x250; i < 0x300; i++)
2376                                 if (Char.IsLetter ((char) i))
2377                                         ipaArray [i - 0x250 + 0x20] = (char) i;
2378                         Array.Sort (ipaArray, UCAComparer.Instance);
2379                         int targetASCII = 0;
2380                         byte latinDiacritical = 0x7B;
2381                         foreach (char c in ipaArray) {
2382                                 if (c <= 'Z') {
2383                                         targetASCII = c;
2384                                         latinDiacritical = 0x7B;
2385                                 }
2386                                 else
2387                                         map [(int) c] = new CharMapEntry (
2388                                                 0xE,
2389                                                 map [targetASCII].Level1,
2390                                                 latinDiacritical++);
2391                         }
2392
2393                         // Greek and Coptic
2394
2395                         // FIXME: this is (mysterious and) incomplete.
2396                         for (int i = 0x0380; i < 0x0400; i++)
2397                                 if (diacritical [i] == 0 &&
2398                                         decompLength [i] == 1 &&
2399                                         decompType [i] == DecompositionCompat)
2400                                         diacritical [i] = 3;
2401
2402                         fillIndex [0xF] = 2;
2403                         for (int i = 0x0391; i < 0x03AA; i++)
2404                                 if (i != 0x03A2)
2405                                         AddCharMap ((char) i, 0xF, 1,
2406                                                 diacritical [i]);
2407                         fillIndex [0xF] = 2;
2408                         for (int i = 0x03B1; i < 0x03CA; i++)
2409                                 if (i != 0x03C2)
2410                                         AddCharMap ((char) i, 0xF, 1,
2411                                                 diacritical [i]);
2412                         // Final Sigma
2413                         map [0x03C2] = new CharMapEntry (0xF,
2414                                 map [0x03C3].Level1, map [0x03C3].Level2);
2415
2416                         fillIndex [0xF] = 0x40;
2417                         for (int i = 0x03DA; i < 0x03F0; i++)
2418                                 AddCharMap ((char) i, 0xF,
2419                                         (byte) (i % 2 == 0 ? 0 : 2),
2420                                         diacritical [i]);
2421
2422                         // NFKD
2423                         for (int i = 0x0386; i <= 0x0400; i++)
2424                                 FillLetterNFKD (i, true, true);
2425
2426                         // Cyrillic.
2427                         // Cyrillic letters are sorted like Latin letters, i.e.
2428                         // with culture-specific letters interleaved within the
2429                         // standard Cyrillic sequence.
2430                         //
2431                         // We can't use UCA here; it has different sorting.
2432                         char [] orderedCyrillic = new char [] {
2433                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2434                                 '\u0452', // DJE for Serbocroatian
2435                                 '\u0435',
2436                                 '\u0454', // IE for Ukrainian
2437                                 '\u0436', '\u0437',
2438                                 '\u0455', // DZE
2439                                 '\u0438',
2440                                 '\u0456', // Byelorussian-Ukrainian I
2441                                 '\u0457', // YI
2442                                 '\u0439',
2443                                 '\u0458', // JE
2444                                 '\u043A', '\u043B',
2445                                 '\u0459', // LJE
2446                                 '\u043C', '\u043D',
2447                                 '\u045A', // NJE
2448                                 '\u043E',
2449                                 // 4E9 goes here.
2450                                 '\u043F', '\u0440', '\u0441', '\u0442',
2451                                 '\u045B', // TSHE for Serbocroatian
2452                                 '\u0443',
2453                                 '\u045E', // Short U for Byelorussian
2454                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2455                                 '\u0444', '\u0445', '\u0446', '\u0447',
2456                                 '\u045F', // DZHE
2457                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2458                                 '\u044D', '\u044E', '\u044F'};
2459
2460                         // For some characters, here is a map to basic Cyrillic
2461                         // letters. See the UnicodeData.txt character names for
2462                         // the sources. Here I simply declare an equivalence array.
2463                         // The entries map from U+0490 (and U+0491) onwards,
2464                         // skipping small letters.
2465                         char [] cymap_src = new char [] {
2466                                 '\u0433', '\u0433', '\u0433', '\u0436',
2467                                 '\u0437', '\u043A', '\u043A', '\u043A',
2468                                 '\u043A', '\u043D', '\u043D', '\u043F',
2469                                 '\u0445', '\u0441', '\u0442', '\u0443',
2470                                 '\u0443', '\u0445', '\u0446', '\u0447',
2471                                 '\u0447', '\u0432', '\u0435', '\u0435',
2472                                 '\u0406', '\u0436', '\u043A', '\u043D',
2473                                 '\u0447', '\u0435'};
2474
2475                         fillIndex [0x10] = 0x8D;
2476                         for (int i = 0x0460; i < 0x0481; i++) {
2477                                 if (Char.IsLetter ((char) i)) {
2478                                         if (i == 0x0476)
2479                                                 // U+476/477 have the same
2480                                                 // primary weight as U+474/475.
2481                                                 fillIndex [0x10] -= 3;
2482                                         AddLetterMap ((char) i, 0x10, 3);
2483                                 }
2484                         }
2485
2486                         fillIndex [0x10] = 0x6;
2487                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2488                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2489                                 if (!IsIgnorable ((int) c) &&
2490                                         Char.IsLetter (c) &&
2491                                         !map [c].Defined) {
2492                                         AddLetterMap (c, 0x10, 0);
2493                                         fillIndex [0x10] += 3;
2494                                 }
2495                         }
2496
2497                         // NFKD
2498                         for (int i = 0x0401; i <= 0x045F; i++)
2499                                 FillLetterNFKD (i, false, false);
2500
2501                         for (int i = 0; i < cymap_src.Length; i++) {
2502                                 char c = cymap_src [i];
2503                                 fillIndex [0x10] = map [c].Level1;
2504                                 int c2 = 0x0490 + i * 2;
2505                                 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2506                         }
2507
2508                         // Armenian
2509                         fillIndex [0x11] = 0x3;
2510                         fillIndex [0x1] = 0x98;
2511                         for (int i = 0x0531; i < 0x0586; i++) {
2512                                 if (i == 0x0559 || i == 0x55A)
2513                                         AddCharMap ((char) i, 1, 1);
2514                                 if (Char.IsLetter ((char) i))
2515                                         AddLetterMap ((char) i, 0x11, 1);
2516                         }
2517
2518                         // Hebrew
2519                         // -Letters
2520                         fillIndex [0x12] = 0x2;
2521                         for (int i = 0x05D0; i < 0x05FF; i++)
2522                                 if (Char.IsLetter ((char) i)) {
2523                                         if (isUppercase [i]) {
2524                                                 fillIndex [0x12]--;
2525                                                 AddLetterMap ((char) i, 0x12, 2);
2526                                         }
2527                                         else
2528                                                 AddLetterMap ((char) i, 0x12, 1);
2529                                 }
2530                         // -Accents
2531                         fillIndex [0x1] = 0x3;
2532                         for (int i = 0x0591; i <= 0x05C2; i++) {
2533                                 if (i == 0x05A3 || i == 0x05BB)
2534                                         fillIndex [0x1]++;
2535                                 if (i != 0x05BE)
2536                                         AddCharMap ((char) i, 0x1, 1);
2537                         }
2538
2539                         // Arabic
2540                         fillIndex [0x1] = 0x8E;
2541                         fillIndex [0x13] = 0x3;
2542                         for (int i = 0x0621; i <= 0x064A; i++) {
2543                                 // Abjad
2544                                 if (Char.GetUnicodeCategory ((char) i)
2545                                         != UnicodeCategory.OtherLetter) {
2546                                         // FIXME: arabic nonspacing marks are
2547                                         // in different order.
2548                                         AddCharMap ((char) i, 0x1, 1);
2549                                         continue;
2550                                 }
2551 //                              map [i] = new CharMapEntry (0x13,
2552 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2553                                 fillIndex [0x13] =
2554                                         (byte) arabicLetterPrimaryValues [i];
2555                                 byte formDiacritical = 8; // default
2556                                 // SPECIAL CASES:
2557                                 switch (i) {
2558                                 case 0x0622: formDiacritical = 9; break;
2559                                 case 0x0623: formDiacritical = 0xA; break;
2560                                 case 0x0624: formDiacritical = 5; break;
2561                                 case 0x0625: formDiacritical = 0xB; break;
2562                                 case 0x0626: formDiacritical = 7; break;
2563                                 case 0x0649: formDiacritical = 5; break;
2564                                 case 0x064A: formDiacritical = 7; break;
2565                                 }
2566 //                              AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2567                                 AddArabicCharMap ((char) i, 0x13, 1, formDiacritical);
2568                         }
2569                         for (int i = 0x0670; i < 0x0673; i++)
2570                                 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2571                         fillIndex [0x13] = 0x84;
2572                         for (int i = 0x0674; i < 0x06D6; i++)
2573                                 if (Char.IsLetter ((char) i))
2574                                         AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2575
2576                         // Devanagari
2577
2578                         // FIXME: this could be fixed in more decent way
2579                         for (int i = 0x0958; i <= 0x095F; i++)
2580                                 diacritical [i] = 8;
2581
2582                         // FIXME: it does seem straight codepoint mapping.
2583                         fillIndex [0x14] = 04;
2584                         for (int i = 0x0901; i < 0x0905; i++)
2585                                 if (!IsIgnorable (i))
2586                                         AddLetterMap ((char) i, 0x14, 2);
2587                         fillIndex [0x14] = 0xB;
2588                         for (int i = 0x0905; i < 0x093A; i++) {
2589                                 if (i == 0x0928)
2590                                         AddCharMap ('\u0929', 0x14, 0, 8);
2591                                 if (i == 0x0930)
2592                                         AddCharMap ('\u0931', 0x14, 0, 8);
2593                                 if (i == 0x0933)
2594                                         AddCharMap ('\u0934', 0x14, 0, 8);
2595                                 if (Char.IsLetter ((char) i))
2596                                         AddLetterMap ((char) i, 0x14, 4);
2597                                 if (i == 0x090B)
2598                                         AddCharMap ('\u0960', 0x14, 4);
2599                                 if (i == 0x090C)
2600                                         AddCharMap ('\u0961', 0x14, 4);
2601                         }
2602                         fillIndex [0x14] = 0xDA;
2603                         for (int i = 0x093E; i < 0x0945; i++)
2604                                 if (!IsIgnorable (i))
2605                                         AddLetterMap ((char) i, 0x14, 2);
2606                         fillIndex [0x14] = 0xEC;
2607                         for (int i = 0x0945; i < 0x094F; i++)
2608                                 if (!IsIgnorable (i))
2609                                         AddLetterMap ((char) i, 0x14, 2);
2610
2611                         // Bengali
2612                         // -Letters
2613                         fillIndex [0x15] = 02;
2614                         for (int i = 0x0980; i < 0x9FF; i++) {
2615                                 if (IsIgnorable (i))
2616                                         continue;
2617                                 if (i == 0x09E0)
2618                                         fillIndex [0x15] = 0x3B;
2619                                 switch (Char.GetUnicodeCategory ((char) i)) {
2620                                 case UnicodeCategory.NonSpacingMark:
2621                                 case UnicodeCategory.DecimalDigitNumber:
2622                                 case UnicodeCategory.OtherNumber:
2623                                         continue;
2624                                 }
2625                                 AddLetterMap ((char) i, 0x15, 1);
2626                         }
2627                         // -Signs
2628                         fillIndex [0x1] = 0x3;
2629                         for (int i = 0x0981; i < 0x0A00; i++)
2630                                 if (Char.GetUnicodeCategory ((char) i) ==
2631                                         UnicodeCategory.NonSpacingMark)
2632                                         AddCharMap ((char) i, 0x1, 1);
2633
2634                         // Gurmukhi. orderedGurmukhi is from UCA
2635                         // FIXME: it does not look equivalent to UCA.
2636                         fillIndex [0x16] = 04;
2637                         fillIndex [0x1] = 3;
2638                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2639                                 char c = orderedGurmukhi [i];
2640                                 if (IsIgnorable ((int) c))
2641                                         continue;
2642                                 if (IsIgnorableNonSpacing (c)) {
2643                                         AddLetterMap (c, 0x1, 1);
2644                                         continue;
2645                                 }
2646                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2647                                         '\u0A66' <= c && c <= '\u0A71')
2648                                         continue;
2649                                 // SPECIAL CASES
2650                                 byte shift = 4;
2651                                 switch (c) {
2652                                 case '\u0A33': case '\u0A36': case '\u0A16':
2653                                 case '\u0A17': case '\u0A5B': case '\u0A5E':
2654                                         shift = 0;
2655                                         break;
2656                                 }
2657                                 if (c == '\u0A3E') // Skip
2658                                         fillIndex [0x16] = 0xC0;
2659                                 AddLetterMap (c, 0x16, shift);
2660                         }
2661
2662                         // Gujarati. orderedGujarati is from UCA
2663                         fillIndex [0x17] = 0x4;
2664                         // nonspacing marks
2665                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2666                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2667                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2668                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2669                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2670                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2671                         // letters go first.
2672                         for (int i = 0; i < orderedGujarati.Length; i++) {
2673                                 // SPECIAL CASE
2674                                 char c = orderedGujarati [i];
2675                                 if (Char.IsLetter (c)) {
2676                                         // SPECIAL CASES
2677                                         if (c == '\u0AB3' || c == '\u0A32')
2678                                                 continue;
2679                                         if (c == '\u0A33') {
2680                                                 AddCharMap ('\u0A32', 0x17, 0);
2681                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2682                                                 continue;
2683                                         }
2684                                         if (c == '\u0A8B')
2685                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2686                                         AddCharMap (c, 0x17, 4);
2687
2688                                         if (c == '\u0AB9')
2689                                                 AddCharMap ('\u0AB3', 0x17, 6);
2690                                 }
2691                         }
2692                         // non-letters
2693                         byte gujaratiShift = 4;
2694                         fillIndex [0x17] = 0xC0;
2695                         for (int i = 0; i < orderedGujarati.Length; i++) {
2696                                 char c = orderedGujarati [i];
2697                                 if (fillIndex [0x17] == 0xCC)
2698                                         gujaratiShift = 3;
2699                                 if (!Char.IsLetter (c)) {
2700                                         // SPECIAL CASES
2701                                         if (c == '\u0A82')
2702                                                 AddCharMap ('\u0A81', 0x17, 2);
2703                                         if (c == '\u0AC2')
2704                                                 fillIndex [0x17]++;
2705                                         AddLetterMap (c, 0x17, gujaratiShift);
2706                                 }
2707                         }
2708
2709                         // Oriya
2710                         fillIndex [0x1] = 03;
2711                         fillIndex [0x18] = 02;
2712                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2713                                 switch (Char.GetUnicodeCategory ((char) i)) {
2714                                 case UnicodeCategory.NonSpacingMark:
2715                                 case UnicodeCategory.DecimalDigitNumber:
2716                                         AddLetterMap ((char) i, 0x1, 1);
2717                                         continue;
2718                                 }
2719                                 AddLetterMapCore ((char) i, 0x18, 1, 0, true);
2720                         }
2721
2722                         // Tamil
2723                         fillIndex [0x19] = 2;
2724                         AddCharMap ('\u0BD7', 0x19, 0);
2725                         fillIndex [0x19] = 0xA;
2726                         // vowels
2727                         for (int i = 0x0B82; i <= 0x0B94; i++)
2728                                 if (!IsIgnorable ((char) i))
2729                                         AddCharMap ((char) i, 0x19, 2);
2730                         // special vowel
2731                         fillIndex [0x19] = 0x28;
2732                         // The array for Tamil consonants is a constant.
2733                         // Windows has a sequence very similar to TAM from
2734                         // tamilnet, but it differs slightly in Grantha.
2735                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2736                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2737                         // combining marks
2738                         fillIndex [0x19] = 0x82;
2739                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2740                                 if (Char.GetUnicodeCategory ((char) i) ==
2741                                         UnicodeCategory.SpacingCombiningMark
2742                                         || i == 0x0BC0)
2743                                         AddLetterMap ((char) i, 0x19, 2);
2744
2745                         // Telugu
2746                         fillIndex [0x1A] = 0x4;
2747                         for (int i = 0x0C00; i < 0x0C62; i++) {
2748                                 if (i == 0x0C55 || i == 0x0C56)
2749                                         continue; // skip
2750                                 AddCharMap ((char) i, 0x1A, 3);
2751                                 char supp = (i == 0x0C0B) ? '\u0C60':
2752                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2753                                 if (supp == char.MinValue)
2754                                         continue;
2755                                 AddCharMap (supp, 0x1A, 3);
2756                         }
2757
2758                         // Kannada
2759                         fillIndex [0x1B] = 4;
2760                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2761                                 if (i == 0x0CD5 || i == 0x0CD6)
2762                                         continue; // ignore
2763                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2764                                         continue; // shift after 0xCB9
2765                                 AddCharMap ((char) i, 0x1B, 3);
2766                                 if (i == 0x0CB9) {
2767                                         // SPECIAL CASES: but why?
2768                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2769                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2770                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2771                                 }
2772                                 if (i == 0x0CB2)
2773                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2774                         }
2775
2776                         // Malayalam
2777                         fillIndex [0x1C] = 2;
2778                         fillIndex [0x1] = 3;
2779                         for (int i = 0x0D02; i < 0x0D61; i++) {
2780                                 // FIXME: I avoided MSCompatUnicodeTable usage
2781                                 // here (it results in recursion). So check if
2782                                 // using NonSpacingMark makes sense or not.
2783                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2784 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2785                                         AddCharMap ((char) i, 0x1C, 1);
2786                                 else if (!IsIgnorable ((char) i))
2787                                         AddCharMap ((char) i, 1, 1);
2788                         }
2789
2790                         // Thai ... note that it breaks 0x1E wall after E2B!
2791                         // Also, all Thai characters have level 2 value 3.
2792                         fillIndex [0x1E] = 2;
2793                         fillIndex [0x1] = 3;
2794                         for (int i = 0xE40; i <= 0xE44; i++)
2795                                 AddCharMap ((char) i, 0x1E, 1, 3);
2796                         for (int i = 0xE01; i < 0xE2B; i++)
2797                                 AddCharMap ((char) i, 0x1E, 6, 3);
2798                         fillIndex [0x1F] = 5;
2799                         for (int i = 0xE2B; i < 0xE30; i++)
2800                                 AddCharMap ((char) i, 0x1F, 6, 3);
2801                         fillIndex [0x1F] = 0x1E;
2802                         for (int i = 0xE30; i < 0xE3B; i++)
2803                                 AddCharMap ((char) i, 0x1F, 1, 3);
2804                         // some Thai characters remain.
2805                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2806                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2807                         foreach (char c in specialThai)
2808                                 AddCharMap (c, 0x1F, 1, 3);
2809
2810                         for (int i = 0xE00; i < 0xE80; i++)
2811                                 if (Char.GetUnicodeCategory ((char) i) ==
2812                                         UnicodeCategory.NonSpacingMark)
2813                                         AddCharMap ((char) i, 1, 1);
2814
2815                         // Lao
2816                         fillIndex [0x1F] = 2;
2817                         fillIndex [0x1] = 3;
2818                         for (int i = 0xE80; i < 0xEDF; i++) {
2819                                 if (IsIgnorable ((char) i))
2820                                         continue;
2821                                 else if (Char.IsLetter ((char) i))
2822                                         AddCharMap ((char) i, 0x1F, 1);
2823                                 else if (Char.GetUnicodeCategory ((char) i) ==
2824                                         UnicodeCategory.NonSpacingMark)
2825                                         AddCharMap ((char) i, 1, 1);
2826                         }
2827
2828                         // Georgian. orderedGeorgian is from UCA DUCET.
2829                         fillIndex [0x21] = 5;
2830                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2831                                 char c = orderedGeorgian [i];
2832                                 if (map [(int) c].Defined)
2833                                         continue;
2834                                 AddCharMap (c, 0x21, 0);
2835                                 if (c < '\u10F6')
2836                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2837                                 fillIndex [0x21] += 5;
2838                         }
2839
2840                         // Japanese Kana.
2841                         fillIndex [0x22] = 2;
2842                         int kanaOffset = 0x3041;
2843                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2844
2845                         for (int gyo = 0; gyo < 9; gyo++) {
2846                                 for (int dan = 0; dan < 5; dan++) {
2847                                         if (gyo == 7 && dan % 2 == 1) {
2848                                                 // 'ya'-gyo
2849                                                 fillIndex [0x22]++;
2850                                                 kanaOffset -= 2; // There is no space for yi and ye.
2851                                                 continue;
2852                                         }
2853                                         int cp = kanaOffset + dan * kanaLines [gyo];
2854                                         // small lines (a-gyo, ya-gyo)
2855                                         if (gyo == 0 || gyo == 7) {
2856                                                 AddKanaMap (cp, 1); // small
2857                                                 AddKanaMap (cp + 1, 1);
2858                                         }
2859                                         else
2860                                                 AddKanaMap (cp, kanaLines [gyo]);
2861                                         fillIndex [0x22]++;
2862
2863                                         if (cp == 0x30AB) {
2864                                                 // add small 'ka' (before normal one)
2865                                                 AddKanaMap (0x30F5, 1);
2866                                                 kanaOffset++;
2867                                         }
2868                                         if (cp == 0x30B1) {
2869                                                 // add small 'ke' (before normal one)
2870                                                 AddKanaMap (0x30F6, 1);
2871                                                 kanaOffset++;
2872                                         }
2873                                         if (cp == 0x3061) {
2874                                                 // add small 'Tsu' (before normal one)
2875                                                 AddKanaMap (0x3063, 1);
2876                                                 kanaOffset++;
2877                                         }
2878                                 }
2879                                 fillIndex [0x22] += 3;
2880                                 kanaOffset += 5 * kanaLines [gyo];
2881                         }
2882
2883                         // Wa-gyo is almost entirely special-cased, so its entries are added manually.
2884                         AddLetterMap ((char) 0x308E, 0x22, 0);
2885                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2886                         AddLetterMap ((char) 0x308F, 0x22, 0);
2887                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2888                         fillIndex [0x22]++;
2889                         AddLetterMap ((char) 0x3090, 0x22, 0);
2890                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2891                         fillIndex [0x22] += 2;
2892                         // no "Wu" in Japanese.
2893                         AddLetterMap ((char) 0x3091, 0x22, 0);
2894                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2895                         fillIndex [0x22]++;
2896                         AddLetterMap ((char) 0x3092, 0x22, 0);
2897                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2898                         // Nn
2899                         fillIndex [0x22] = 0x80;
2900                         AddLetterMap ((char) 0x3093, 0x22, 0);
2901                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2902
2903                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2904                                 map [0x30A6].Level1, 3);// voiced hiragana U
2905                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2906                                 map [0x30A6].Level1, 3);// voiced katakana U
2907
2908                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2909                                 map [0x30AB].Level1, 0);// small katakana Ka
2910                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2911                                 map [0x30B1].Level1, 0);// small katakana Ke
2912                         // voiced Wa lines
2913                         for (int i = 0x30F7; i < 0x30FB; i++)
2914                                 map [i] = new CharMapEntry (map [i - 8].Category,
2915                                         map [i - 8].Level1,
2916                                         3);
2917
2918                         // JIS Japanese square chars.
2919                         fillIndex [0x22] = 0x97;
2920                         jisJapanese.Sort (JISComparer.Instance);
2921                         foreach (JISCharacter j in jisJapanese)
2922                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2923                                         AddCharMap ((char) j.CP, 0x22, 1);
2924                         // non-JIS Japanese square chars.
2925                         nonJisJapanese.Sort (NonJISComparer.Instance);
2926                         foreach (NonJISCharacter j in nonJisJapanese)
2927                                 AddCharMap ((char) j.CP, 0x22, 1);
2928
2929                         // Bopomofo
2930                         fillIndex [0x23] = 0x02;
2931                         for (int i = 0x3105; i <= 0x312C; i++)
2932                                 AddCharMap ((char) i, 0x23, 1);
2933
2934                         // Estrangela: ancient Syriac
2935                         fillIndex [0x24] = 0x0B;
2936                         // FIXME: is 0x71E really alternative form?
2937                         ArrayList syriacAlternatives = new ArrayList (
2938                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2939                         for (int i = 0x0710; i <= 0x072C; i++) {
2940                                 if (i == 0x0711) // NonSpacingMark
2941                                         continue;
2942                                 if (syriacAlternatives.Contains (i))
2943                                         continue;
2944                                 AddCharMap ((char) i, 0x24, 4);
2945                                 // FIXME: why?
2946                                 if (i == 0x721)
2947                                         fillIndex [0x24]++;
2948                         }
2949                         foreach (int cp in syriacAlternatives)
2950                                 map [cp] = new CharMapEntry (0x24,
2951                                         (byte) (map [cp - 1].Level1 + 2),
2952                                         0);
2953                         // FIXME: Syriac NonSpacingMark should go here.
2954
2955                         // Thaana
2956                         // FIXME: it turned out that it does not look like UCA
2957                         fillIndex [0x24] = 0x6E;
2958                         fillIndex [0x1] = 0xAC;
2959                         for (int i = 0; i < orderedThaana.Length; i++) {
2960                                 char c = orderedThaana [i];
2961                                 if (IsIgnorableNonSpacing ((int) c))
2962                                         AddCharMap (c, 1, 1);
2963                                 AddCharMap (c, 0x24, 2);
2964                                 if (c == '\u0782') // SPECIAL CASE: why?
2965                                         fillIndex [0x24] += 2;
2966                         }
2967                         #endregion
2968
2969                         // FIXME: Add more culture-specific letters (that are
2970                         // not supported in Windows collation) here.
2971
2972                         // Surrogate ... they are computed.
2973
2974                         #region Hangul
2975                         // Hangul.
2976                         //
2977                         // Unlike UCA, the Windows Hangul sequence mixes Jongseong
2978                         // with the Choseong sequence as well as Jungseong,
2979                         // adjusted to have the same primary weight for the
2980                         // same base character. So it is impossible to compute
2981                         // those sort keys.
2982                         //
2983                         // Here I introduce an ordered sequence of mixed
2984                         // 'commands' and 'characters' that is similar to
2985                         // LDML text:
2986                         //      - ',' increases primary weight.
2987                         //      - [A B] means a range, increasing index
2988                         //      - {A B} means a range, without increasing index
2989                         //      - '=' is no operation (it means the characters
2990                         //        of both sides have the same weight).
2991                         //      - '>' inserts a Hangul Syllable block that
2992                         //        contains 0x24C (0x15 * 0x1C) characters.
2993                         //      - '<' decreases the index
2994                         //      - '0'-'9' means skip count
2995                         //      - whitespaces are ignored
2996                         //
2997
2998                         string hangulSequence =
2999                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
3000                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
3001                         + "<{\u1113 \u1116}, \u3165,"
3002                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
3003                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
3004                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
3005                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
3006                                 + "[\u11D1 \u11D2], \u11B2,"
3007                                 + "[\u11D3 \u11D5], \u11B3,"
3008                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
3009                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
3010                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
3011                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
3012                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
3013                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
3014                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
3015                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
3016                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
3017                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
3018                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
3019                         + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
3020                                 + "\u11F1,, \u11F2,,,"
3021                                 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
3022                         + "<\u114D, \u110D,,  >"
3023                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
3024                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
3025                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
3026                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
3027                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
3028                                 + "[\u11F5 \u11F8]"
3029                         ;
3030
3031                         byte hangulCat = 0x52;
3032                         fillIndex [hangulCat] = 0x2;
3033
3034                         int syllableBlock = 0;
3035                         for (int n = 0; n < hangulSequence.Length; n++) {
3036                                 char c = hangulSequence [n];
3037                                 int start, end;
3038                                 if (Char.IsWhiteSpace (c))
3039                                         continue;
3040                                 switch (c) {
3041                                 case '=':
3042                                         break; // NOP
3043                                 case ',':
3044                                         IncrementSequentialIndex (ref hangulCat);
3045                                         break;
3046                                 case '<':
3047                                         if (fillIndex [hangulCat] == 2)
3048                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
3049                                         fillIndex [hangulCat]--;
3050                                         break;
3051                                 case '>':
3052                                         IncrementSequentialIndex (ref hangulCat);
3053                                         for (int l = 0; l < 0x15; l++)
3054                                                 for (int v = 0; v < 0x1C; v++) {
3055                                                         AddCharMap (
3056                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
3057                                                         IncrementSequentialIndex (ref hangulCat);
3058                                                 }
3059                                         syllableBlock++;
3060                                         break;
3061                                 case '[':
3062                                         start = hangulSequence [n + 1];
3063                                         end = hangulSequence [n + 3];
3064                                         for (int i = start; i <= end; i++) {
3065                                                 AddCharMap ((char) i, hangulCat, 0);
3066                                                 if (end > i)
3067                                                         IncrementSequentialIndex (ref hangulCat);
3068                                         }
3069                                         n += 4; // consumes 5 characters for this operation
3070                                         break;
3071                                 case '{':
3072                                         start = hangulSequence [n + 1];
3073                                         end = hangulSequence [n + 3];
3074                                         for (int i = start; i <= end; i++)
3075                                                 AddCharMap ((char) i, hangulCat, 0);
3076                                         n += 4; // consumes 5 characters for this operation
3077                                         break;
3078                                 default:
3079                                         AddCharMap (c, hangulCat, 0);
3080                                         break;
3081                                 }
3082                         }
3083
3084                         // Some Jamo NFKD.
3085                         for (int i = 0x3200; i < 0x3300; i++) {
3086                                 if (IsIgnorable (i) || map [i].Defined)
3087                                         continue;
3088                                 int ch = 0;
3089                                 // w/ bracket
3090                                 if (decompLength [i] == 4 &&
3091                                         decompValues [decompIndex [i]] == '(')
3092                                         ch = decompIndex [i] + 1;
3093                                 // circled
3094                                 else if (decompLength [i] == 2 &&
3095                                         decompValues [decompIndex [i] + 1] == '\u1161')
3096                                         ch = decompIndex [i];
3097                                 else if (decompLength [i] == 1)
3098                                         ch = decompIndex [i];
3099                                 else
3100                                         continue;
3101                                 ch = decompValues [ch];
3102                                 if (ch < 0x1100 || 0x1200 < ch &&
3103                                         ch < 0xAC00 || 0xD800 < ch)
3104                                         continue;
3105
3106                                 // SPECIAL CASE ?
3107                                 int offset = i < 0x3260 ? 1 : 0;
3108                                 if (0x326E <= i && i <= 0x3273)
3109                                         offset = 1;
3110
3111                                 map [i] = new CharMapEntry (map [ch].Category,
3112                                         (byte) (map [ch].Level1 + offset),
3113                                         map [ch].Level2);
3114 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
3115                         }
3116
3117
3118                         #endregion
3119
3120                         // Letterlike characters and CJK compatibility square
3121                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
3122                         int [] counts = new int ['Z' - 'A' + 1];
3123                         char [] namedChars = new char [sortableCharNames.Count];
3124                         int nCharNames = 0;
3125                         foreach (DictionaryEntry de in sortableCharNames) {
3126                                 counts [((string) de.Value) [0] - 'A']++;
3127                                 namedChars [nCharNames++] = (char) ((int) de.Key);
3128                         }
3129                         nCharNames = 0; // reset
3130                         for (int a = 0; a < counts.Length; a++) {
3131                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
3132                                 for (int i = 0; i < counts [a]; i++)
3133 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
3134                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
3135                         }
3136
3137                         // CJK unified ideograph.
3138                         byte cjkCat = 0x9E;
3139                         fillIndex [cjkCat] = 0x2;
3140                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
3141                                 if (!IsIgnorable (cp))
3142                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
3143                         // CJK Extensions go here.
3144                         // LAMESPEC: With this Windows style CJK layout, it is
3145                         // impossible to add more CJK ideographs, i.e. 0x9FA6-
3146                         // 0x9FBB can never be added w/o breaking compat.
3147                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
3148                                 if (!IsIgnorable (cp))
3149                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
3150
3151                         // PrivateUse ... computed.
3152                         // remaining Surrogate ... computed.
3153
3154                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
3155                         // non-alphanumeric ASCII except for: + - < = > '
3156                         for (int i = 0x21; i < 0x7F; i++) {
3157                                 // SPECIAL CASE: 02C6 seems to be regarded as
3158                                 // equivalent to '^', which does not conform
3159                                 // to the Unicode Character Database.
3160                                 if (i == 0x005B)
3161                                         AddCharMap ('\u2045', 0x7, 0, 0x1C);
3162                                 if (i == 0x005D)
3163                                         AddCharMap ('\u2046', 0x7, 0, 0x1C);
3164                                 if (i == 0x005E)
3165                                         AddCharMap ('\u02C6', 0x7, 0, 3);
3166                                 if (i == 0x0060)
3167                                         AddCharMap ('\u02CB', 0x7, 0, 3);
3168
3169                                 if (Char.IsLetterOrDigit ((char) i)
3170                                         || "+-<=>'".IndexOf ((char) i) >= 0)
3171                                         continue; // they are not added here.
3172
3173                                 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3174                                 // Insert 3001 after ',' and 3002 after '.'
3175                                 if (i == 0x2C)
3176                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
3177                                 else if (i == 0x2E)
3178                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
3179                                 else if (i == 0x3A)
3180                                         AddCharMap ('\uFE30', 0x7, 1, 0);
3181                         }
3182                         #endregion
3183
3184                         #region 07 - Punctuations and something else
3185                         for (int i = 0xA0; i < char.MaxValue; i++) {
3186                                 if (IsIgnorable (i))
3187                                         continue;
3188
3189                                 // FIXME: actually these resets should not be
3190                                 // done, but they are put here as a shortcut.
3191                                 if (i == 0x05C3)
3192                                         fillIndex [0x7]++;
3193                                 if (i == 0x0700)
3194                                         fillIndex [0x7] = 0xE2;
3195                                 if (i == 0x2016)
3196                                         fillIndex [0x7] = 0x77;
3197                                 if (i == 0x3008)
3198                                         fillIndex [0x7] = 0x93;
3199
3200                                 if (0x02C8 <= i && i <= 0x02CD)
3201                                         continue; // nonspacing marks
3202
3203                                 // SPECIAL CASE: maybe they could be allocated
3204                                 // dummy NFKD mapping and no special processing
3205                                 // would be required here.
3206                                 if (i == 0x00AF)
3207                                         AddCharMap ('\u02C9', 0x7, 0, 3);
3208                                 if (i == 0x00B4)
3209                                         AddCharMap ('\u02CA', 0x7, 0, 3);
3210                                 if (i == 0x02C7)
3211                                         AddCharMap ('\u02D8', 0x7, 0, 3);
3212
3213                                 // SPECIAL CASES:
3214                                 switch (i) {
3215                                 case 0xAB: // 08
3216                                 case 0xB7: // 0A
3217                                 case 0xBB: // 08
3218                                 case 0x02B9: // 01
3219                                 case 0x02BA: // 01
3220                                 case 0x2329: // 09
3221                                 case 0x232A: // 09
3222                                         continue;
3223                                 }
3224
3225                                 switch (Char.GetUnicodeCategory ((char) i)) {
3226                                 case UnicodeCategory.OtherPunctuation:
3227                                 case UnicodeCategory.ClosePunctuation:
3228                                 case UnicodeCategory.OpenPunctuation:
3229                                 case UnicodeCategory.ConnectorPunctuation:
3230                                 case UnicodeCategory.InitialQuotePunctuation:
3231                                 case UnicodeCategory.FinalQuotePunctuation:
3232                                 case UnicodeCategory.ModifierSymbol:
3233                                         // SPECIAL CASES: // 0xA
3234                                         if (0x2020 <= i && i <= 0x2031)
3235                                                 continue;
3236                                         if (i == 0x3003) // added later
3237                                                 continue;
3238                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3239                                         break;
3240                                 default:
3241                                         if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3242                                                 goto case UnicodeCategory.OtherPunctuation;
3243                                         break;
3244                                 }
3245                         }
3246
3247                         // Control pictures
3248                         // FIXME: it should not need to reset level 1, but
3249                         // it's for easy goal.
3250                         fillIndex [0x7] = 0xB6;
3251                         for (int i = 0x2400; i <= 0x2424; i++)
3252                                 AddCharMap ((char) i, 0x7, 1, 0);
3253
3254                         // FIXME: what are they?
3255                         AddCharMap ('\u3003', 0x7, 1);
3256                         AddCharMap ('\u3006', 0x7, 1);
3257                         AddCharMap ('\u02D0', 0x7, 1);
3258                         AddCharMap ('\u10FB', 0x7, 1);
3259                         AddCharMap ('\u0950', 0x7, 1);
3260                         AddCharMap ('\u093D', 0x7, 1);
3261                         AddCharMap ('\u0964', 0x7, 1);
3262                         AddCharMap ('\u0965', 0x7, 1);
3263                         AddCharMap ('\u0970', 0x7, 1);
3264
3265                         #endregion
3266
3267                         #region category 08 - symbols
3268                         fillIndex [0x8] = 2;
3269                         // Here Windows mapping is not straightforward. It is
3270                         // not based on computation but seems manual sorting.
3271                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
3272                         AddCharMapGroup ('\u2212', 0x8, 1); // minus
3273                         AddCharMapGroup ('\u229D', 0x8, 1); // minus
3274                         AddCharMapGroup ('\u2297', 0x8, 1); // mul
3275                         AddCharMapGroup ('\u2044', 0x8, 1); // div
3276                         AddCharMapGroup ('\u2215', 0x8, 0); // div
3277                         AddCharMapGroup ('\u2298', 0x8, 1); // div slash
3278                         AddCharMapGroup ('\u2217', 0x8, 0); // mul
3279                         AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper
3280                         AddCharMapGroup ('\u2218', 0x8, 0); // ring
3281                         AddCharMapGroup ('\u229A', 0x8, 1); // ring
3282                         AddCharMapGroup ('\u2219', 0x8, 0); // bullet
3283                         AddCharMapGroup ('\u2299', 0x8, 1); // dot oper
3284                         AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus
3285                         AddCharMapGroup ('\u003C', 0x8, 1); // <
3286                         AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation
3287                         AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation
3288
3289                         for (int cp = 0; cp < 0x2300; cp++) {
3290                                 if (cp == 0xAC) // SPECIAL CASE: skip
3291                                         continue;
3292                                 if (cp == 0x200) {
3293                                         cp = 0x2200; // skip to 2200
3294                                         fillIndex [0x8] = 0x21;
3295                                 }
3296                                 if (cp == 0x2295)
3297                                         fillIndex [0x8] = 0x3;
3298                                 if (cp == 0x22A2)
3299                                         fillIndex [0x8] = 0xAB;
3300                                 if (cp == 0x22B2)
3301                                         fillIndex [0x8] = 0xB9;
3302                                 if (!map [cp].Defined &&
3303 //                                      Char.GetUnicodeCategory ((char) cp) ==
3304 //                                      UnicodeCategory.MathSymbol)
3305                                         Char.IsSymbol ((char) cp))
3306                                         AddCharMapGroup ((char) cp, 0x8, 1);
3307                                 // SPECIAL CASES: no idea why Windows sorts as such
3308                                 switch (cp) {
3309                                 case 0x3E:
3310                                         AddCharMap ('\u227B', 0x8, 1, 0);
3311                                         AddCharMap ('\u22B1', 0x8, 1, 0);
3312                                         break;
3313                                 case 0xB1:
3314                                         AddCharMapGroup ('\u00AB', 0x8, 1);
3315                                         AddCharMapGroup ('\u226A', 0x8, 1);
3316                                         AddCharMapGroup ('\u00BB', 0x8, 1);
3317                                         AddCharMapGroup ('\u226B', 0x8, 1);
3318                                         break;
3319                                 case 0xF7:
3320                                         AddCharMap ('\u01C0', 0x8, 1, 0);
3321                                         AddCharMap ('\u01C1', 0x8, 1, 0);
3322                                         AddCharMap ('\u01C2', 0x8, 1, 0);
3323                                         break;
3324                                 }
3325                         }
3326                         #endregion
3327
3328                         #region Hack!
3329
3330                         // Characters w/ diacritical marks (NFKD)
3331                         for (int i = 0; i <= char.MaxValue; i++) {
3332                                 if (map [i].Defined || IsIgnorable (i))
3333                                         continue;
3334                                 if (decompIndex [i] == 0)
3335                                         continue;
3336
3337                                 int start = decompIndex [i];
3338                                 int primaryChar = decompValues [start];
3339                                 int secondary = diacritical [i];
3340                                 bool skip = false;
3341                                 int length = decompLength [i];
3342                                 // special processing for parenthesized ones.
3343                                 if (length == 3 &&
3344                                         decompValues [start] == '(' &&
3345                                         decompValues [start + 2] == ')') {
3346                                         primaryChar = decompValues [start + 1];
3347                                         length = 1;
3348                                 }
3349
3350                                 if (map [primaryChar].Level1 == 0)
3351                                         continue;
3352
3353                                 for (int l = 1; l < length; l++) {
3354                                         int c = decompValues [start + l];
3355                                         if (map [c].Level1 != 0)
3356                                                 skip = true;
3357                                         secondary += diacritical [c];
3358                                 }
3359                                 if (skip)
3360                                         continue;
3361                                 map [i] = new CharMapEntry (
3362                                         map [primaryChar].Category,
3363                                         map [primaryChar].Level1,
3364                                         (byte) secondary);
3365
3366                         }
3367
3368                         // Diacritical weight adjustment
3369
3370                         // Arabic Hamzah
3371                         diacritical [0x624] = 0x5;
3372                         diacritical [0x626] = 0x7;
3373                         diacritical [0x622] = 0x9;
3374                         diacritical [0x623] = 0xA;
3375                         diacritical [0x625] = 0xB;
3376                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
3377                         diacritical [0x64A] = 0x7; // Yaa'
3378
3379                         for (int i = 0; i < char.MaxValue; i++) {
3380                                 byte mod = 0;
3381                                 byte cat = map [i].Category;
3382                                 switch (cat) {
3383                                 case 0xE: // Latin diacritics
3384                                 case 0x22: // Japanese: circled characters
3385                                         mod = diacritical [i];
3386                                         break;
3387                                 case 0x13: // Arabic
3388                                         if (i == 0x0621)
3389                                                 break; // 0
3390                                         if (diacritical [i] == 0 && decompLength [i] != 0)
3391                                                 diacritical [i] = map [decompValues [decompIndex [i]]].Level2;
3392                                         if (diacritical [i] == 0 && i >= 0xFE8D)
3393                                                 mod = 0x8; // default for arabic
3394                                         break;
3395                                 }
3396                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
3397                                         mod = diacritical [i];
3398                                 if (mod > 0)
3399                                         map [i] = new CharMapEntry (
3400                                                 cat, map [i].Level1, mod);
3401                         }
3402
3403                         // FIXME: this is halfly hack but those NonSpacingMark
3404                         // characters and still undefined are likely to
3405                         // be nonspacing.
3406                         for (int i = 0; i < char.MaxValue; i++) {
3407                                 if (map [i].Defined ||
3408                                         IsIgnorable (i))
3409                                         continue;
3410                                 switch (i) {
3411                                 // SPECIAL CASES.
3412                                 case 0x02B9:
3413                                 case 0x02BA:
3414                                         break;
3415                                 default:
3416                                         if (Char.GetUnicodeCategory ((char) i) !=
3417                                         UnicodeCategory.NonSpacingMark)
3418                                                 continue;
3419                                         break;
3420                                 }
3421                                 if (diacritical [i] != 0)
3422                                         map [i] = new CharMapEntry (1, 1, diacritical [i]);
3423                                 else
3424                                         AddCharMap ((char) i, 1, 1);
3425                         }
3426
3427                         #endregion
3428                 }
3429
3430                 TextInfo ti = CultureInfo.InvariantCulture.TextInfo;
3431
3432                 private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap)
3433                 {
3434                         if (map [i].Defined)
3435                                 return;
3436                         int up = (int) ti.ToUpper ((char) i);
3437                         if (checkUpper && map [up].Category == 0xF) {
3438                                 if (i == up)
3439                                         return;
3440                                 FillLetterNFKD (up, checkUpper, greekRemap);
3441                                 map [i] = new CharMapEntry (0xF,
3442                                         map [up].Level1,
3443                                         map [up].Level2);
3444                         } else {
3445                                 int idx = decompIndex [i];
3446                                 if (idx == 0)
3447                                         return;
3448                                 int primary = decompValues [decompIndex [i]];
3449                                 FillLetterNFKD (primary, checkUpper, greekRemap);
3450
3451                                 int lv2 = map [primary].Level2;
3452                                 byte off = 0;
3453                                 for (int l = 1; l < decompLength [i]; l++) {
3454                                         int tmp = decompValues [idx + l];
3455                                         if (map [tmp].Category != 1)
3456                                                 return;
3457                                         if (greekRemap && map [tmp].Level2 == 0xC)
3458                                                 off += 3;
3459                                         else
3460                                                 off += map [tmp].Level2;
3461                                 }
3462                                 if (off > 0) {
3463                                         if (lv2 == 0)
3464                                                 lv2 += 2;
3465                                         lv2 += off;
3466                                 }
3467                                 // ... but override if the value already exists.
3468                                 if (diacritical [i] != 0)
3469                                         lv2 = diacritical [i];
3470                                 map [i] = new CharMapEntry (
3471                                         map [primary].Category,
3472                                         map [primary].Level1,
3473                                         (byte) lv2);
3474                         }
3475                 }
3476
3477                 private void IncrementSequentialIndex (ref byte hangulCat)
3478                 {
3479                         fillIndex [hangulCat]++;
3480                         if (fillIndex [hangulCat] == 0) { // overflown
3481                                 hangulCat++;
3482                                 fillIndex [hangulCat] = 0x2;
3483                         }
3484                 }
3485
3486                 // Reset fillIndex to fixed value and call AddLetterMap().
3487                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3488                 {
3489                         fillIndex [category] = alphaWeight;
3490                         AddLetterMap (c, category, 0);
3491
3492                         ArrayList al = latinMap [c] as ArrayList;
3493                         if (al == null)
3494                                 return;
3495
3496                         foreach (int cp in al)
3497                                 AddLetterMap ((char) cp, category, 0);
3498                 }
3499
3500                 private void AddKanaMap (int i, byte voices)
3501                 {
3502                         for (byte b = 0; b < voices; b++) {
3503                                 char c = (char) (i + b);
3504                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
3505                                 // Hiragana
3506                                 AddLetterMapCore (c, 0x22, 0, arg, false);
3507                                 // Katakana
3508                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
3509                         }
3510                 }
3511
3512                 private void AddLetterMap (char c, byte category, byte updateCount)
3513                 {
3514                         AddLetterMapCore (c, category, updateCount, 0, true);
3515                 }
3516
3517                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3518                 {
3519                         char c2;
3520                         // <small> updates index
3521                         c2 = ToSmallForm (c);
3522                         if (c2 != c)
3523                                 AddCharMapGroup (c2, category, updateCount, level2, deferLevel2);
3524                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3525                         if (c2 != c && !map [(int) c2].Defined)
3526                                 AddLetterMapCore (c2, category, 0, level2, deferLevel2);
3527                         bool doUpdate = true;
3528                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3529                                 doUpdate = false;
3530                         else
3531                                 AddCharMapGroup (c, category, 0, level2, deferLevel2);
3532                         if (doUpdate)
3533                                 fillIndex [category] += updateCount;
3534                 }
3535
3536                 private bool AddCharMap (char c, byte category, byte increment)
3537                 {
3538                         return AddCharMap (c, category, increment, 0);
3539                 }
3540
3541                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3542                 {
3543                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3544                                 return false; // do nothing
3545                         map [(int) c] = new CharMapEntry (category,
3546                                 category == 1 ? alt : fillIndex [category],
3547                                 category == 1 ? fillIndex [category] : alt);
3548                         fillIndex [category] += increment;
3549                         return true;
3550                 }
3551
3552                 //
3553                 // Adds characters to table in the order below
3554                 // (+ increases weight):
3555                 //      (<small> +)
3556                 //      itself
3557                 //      <fraction>
3558                 //      <full> | <super> | <sub>
3559                 //      <circle> | <wide> (| <narrow>)
3560                 //      +
3561                 //      (vertical +)
3562                 //
3563                 // level2 is fixed (does not increase).
3564                 int [] sameWeightItems = new int [] {
3565                         DecompositionFraction,
3566                         DecompositionFull,
3567                         DecompositionSuper,
3568                         DecompositionSub,
3569                         DecompositionCircle,
3570                         DecompositionWide,
3571                         DecompositionNarrow,
3572                         };
3573                 private void AddCharMapGroup (char c, byte category, byte updateCount)
3574                 {
3575                         AddCharMapGroup (c, category, updateCount, 0, true);
3576                 }
3577
3578                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3579                 {
3580                         AddCharMapGroup (c, category, updateCount, level2, false);
3581                 }
3582
3583                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3584                 {
3585                         if (map [(int) c].Defined)
3586                                 return;
3587
3588                         if (deferLevel2)
3589                                 level2 = diacritical [(int) c];
3590
3591                         char small = char.MinValue;
3592                         char vertical = char.MinValue;
3593                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3594                         if (nfkd != null) {
3595                                 object smv = nfkd [(byte) DecompositionSmall];
3596                                 if (smv != null)
3597                                         small = (char) ((int) smv);
3598                                 object vv = nfkd [(byte) DecompositionVertical];
3599                                 if (vv != null)
3600                                         vertical = (char) ((int) vv);
3601                         }
3602
3603                         // <small> updates index
3604                         if (small != char.MinValue) {
3605                                 if (level2 == 0 && deferLevel2)
3606                                         level2 = diacritical [small];
3607                                 AddCharMap (small, category, updateCount, level2);
3608                         }
3609
3610                         // itself
3611                         AddCharMap (c, category, 0, level2);
3612
3613                         if (nfkd != null) {
3614                                 foreach (int weight in sameWeightItems) {
3615                                         object wv = nfkd [(byte) weight];
3616                                         if (wv != null) {
3617                                                 if (deferLevel2)
3618                                                         level2 = diacritical [(int) wv];
3619                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
3620                                         }
3621                                 }
3622                         }
3623
3624                         // update index here.
3625                         fillIndex [category] += updateCount;
3626
3627                         if (vertical != char.MinValue) {
3628                                 if (level2 == 0 && deferLevel2)
3629                                         level2 = diacritical [vertical];
3630                                 AddCharMap (vertical, category, updateCount, level2);
3631                         }
3632                 }
3633
3634                 private void AddCharMapCJK (char c, ref byte category)
3635                 {
3636                         AddCharMap (c, category, 0, 0);
3637                         IncrementSequentialIndex (ref category);
3638
3639                         // Special. I wonder why but Windows skips 9E F9.
3640                         if (category == 0x9E && fillIndex [category] == 0xF9)
3641                                 IncrementSequentialIndex (ref category);
3642                 }
3643
3644                 private void AddCharMapGroupCJK (char c, ref byte category)
3645                 {
3646                         AddCharMapCJK (c, ref category);
3647
3648                         // LAMESPEC: see below.
3649                         if (c == '\u5B78') {
3650                                 AddCharMapCJK ('\u32AB', ref category);
3651                                 AddCharMapCJK ('\u323B', ref category);
3652                         }
3653                         if (c == '\u52DE') {
3654                                 AddCharMapCJK ('\u3298', ref category);
3655                                 AddCharMapCJK ('\u3238', ref category);
3656                         }
3657                         if (c == '\u5BEB')
3658                                 AddCharMapCJK ('\u32A2', ref category);
3659                         if (c == '\u91AB')
3660                                 // Especially this mapping order totally does
3661                                 // not make sense to me.
3662                                 AddCharMapCJK ('\u32A9', ref category);
3663
3664                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3665                         if (nfkd == null)
3666                                 return;
3667                         for (byte weight = 0; weight <= 0x12; weight++) {
3668                                 object wv = nfkd [weight];
3669                                 if (wv == null)
3670                                         continue;
3671                                 int w = (int) wv;
3672
3673                                 // Special: they are ignored in this area.
3674                                 // FIXME: check if it is sane
3675                                 if (0xF900 <= w && w <= 0xFAD9)
3676                                         continue;
3677                                 // LAMESPEC: on Windows some of CJK characters
3678                                 // in 3200-32B0 are incorrectly mapped. They
3679                                 // mix Chinise and Japanese Kanji when
3680                                 // ordering those characters.
3681                                 switch (w) {
3682                                 case 0x32A2: case 0x3298: case 0x3238:
3683                                 case 0x32A9: case 0x323B: case 0x32AB:
3684                                         continue;
3685                                 }
3686
3687                                 AddCharMapCJK ((char) w, ref category);
3688                         }
3689                 }
3690
3691                 // For now it is only for 0x7 category.
3692                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3693                 {
3694                         if (map [(int) c].Defined)
3695                                 return;
3696
3697                         bool updateWeight = false;
3698                         // Process in advance (lower primary weight)
3699                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3700                                 if (!map [c2].Defined &&
3701                                         decompLength [c2] == 1 &&
3702                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3703                                         switch (decompType [c2]) {
3704                                         case DecompositionSmall:
3705                                                 updateWeight = true;
3706                                                 AddCharMap ((char) c2, category,
3707                                                         0, level2);
3708                                                 break;
3709                                         }
3710                                 }
3711                         }
3712                         if (updateWeight)
3713                                 fillIndex [category] = (byte)
3714                                         (fillIndex [category] + updateCount);
3715
3716                         // Identical weight
3717                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3718                                 if (!map [c2].Defined &&
3719                                         decompLength [c2] == 1 &&
3720                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3721                                         switch (decompType [c2]) {
3722                                         case DecompositionSub:
3723                                         case DecompositionSuper:
3724                                         case DecompositionWide:
3725                                         case DecompositionNarrow:
3726                                                 AddCharMap ((char) c2, category,
3727                                                         0, level2);
3728                                                 break;
3729                                         }
3730                                 }
3731                         }
3732
3733                         // itself
3734                         AddCharMap (c, category, updateCount, level2);
3735
3736                         // Since nfkdMap is problematic to have two or more
3737                         // NFKD to an identical character, here I iterate all.
3738                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3739                                 if (!map [c2].Defined &&
3740                                         decompLength [c2] == 1 &&
3741                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3742                                         switch (decompType [c2]) {
3743                                         case DecompositionWide:
3744                                         case DecompositionNarrow:
3745                                         case DecompositionSmall:
3746                                         case DecompositionSub:
3747                                         case DecompositionSuper:
3748                                                 continue;
3749                                         default:
3750                                                 AddCharMap ((char) c2, category, updateCount, level2);
3751                                                 break;
3752                                         }
3753                                 }
3754                         }
3755                 }
3756
3757                 private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2)
3758                 {
3759                         // itself
3760                         AddCharMap (c, category, 0, level2);
3761
3762                         // Since nfkdMap is problematic to have two or more
3763                         // NFKD to an identical character, here I iterate all.
3764                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3765                                 if (decompLength [c2] == 0)
3766                                         continue;
3767                                 int idx = decompIndex [c2] + decompLength [c2] - 1;
3768                                 if ((int) (decompValues [idx]) == (int) c)
3769                                         AddCharMap ((char) c2, category,
3770                                                 0, level2);
3771                         }
3772                         fillIndex [category] += updateCount;
3773                 }
3774
3775                 char ToSmallForm (char c)
3776                 {
3777                         return ToDecomposed (c, DecompositionSmall, false);
3778                 }
3779
3780                 char ToDecomposed (char c, byte d, bool tail)
3781                 {
3782                         if (decompType [(int) c] != d)
3783                                 return c;
3784                         int idx = decompIndex [(int) c];
3785                         if (tail)
3786                                 idx += decompLength [(int) c] - 1;
3787                         return (char) decompValues [idx];
3788                 }
3789
3790                 bool ExistsJIS (int cp)
3791                 {
3792                         foreach (JISCharacter j in jisJapanese)
3793                                 if (j.CP == cp)
3794                                         return true;
3795                         return false;
3796                 }
3797
3798                 #endregion
3799
3800                 #region Level 3 properties (Case/Width)
3801
3802                 private byte ComputeLevel3Weight (char c)
3803                 {
3804                         byte b = ComputeLevel3WeightRaw (c);
3805                         return b > 0 ? (byte) (b + 2) : b;
3806                 }
3807
                     // Computes the raw level 3 (case/width) weight of c.
                     //
                     // The checks below are evaluated strictly in order and the first
                     // one that matches returns, so overlapping ranges are resolved by
                     // position. When no special case applies, the weight is built as a
                     // bit mask from the decomposition type and the isSmallCapital /
                     // isUppercase tables.
                     //
                     // ComputeLevel3Weight() adds 2 to every non-zero result.
                     private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
                     {
                             // CJK compat
                             if ('\u3192' <= c && c <= '\u319F')
                                     return 0;

                             // They have <narrow> NFKD mapping, and on Windows
                             // those narrow characters are regarded as "normal",
                             // thus those characters themselves are regarded as
                             // "wide". grep "<narrow>" and you can pick them up
                             // (ignoring Kana, Hangul etc.)
                             switch (c) {
                             case '\u3002':
                             case '\u300C':
                             case '\u300D':
                             case '\u3001':
                             case '\u30FB':
                             case '\u2502':
                             case '\u2190':
                             case '\u2191':
                             case '\u2192':
                             case '\u2193':
                             case '\u25A0':
                             case '\u25CB':
                                     return 1;
                             }
                             // Korean
                             if ('\u11A8' <= c && c <= '\u11F9')
                                     return 2;
                             if ('\uFFA0' <= c && c <= '\uFFDC')
                                     return 4;
                             if ('\u3130' <= c && c <= '\u3164')
                                     return 5;
                             if ('\u3165' <= c && c <= '\u318E')
                                     return 4;
                             // Georgian Capital letters
                             if ('\u10A0' <= c && c <= '\u10C5')
                                     return 0x10;
                             // numbers
                             if ('\u2776' <= c && c <= '\u277F')
                                     return 4;
                             if ('\u2780' <= c && c <= '\u2789')
                                     return 8;
                             // The two checks above already returned for U+2776..U+2789,
                             // so this range effectively covers U+278A..U+2793 only.
                             if ('\u2776' <= c && c <= '\u2793')
                                     return 0xC;
                             if ('\u2160' <= c && c <= '\u216F')
                                     return 0x10;
                             if ('\u2181' <= c && c <= '\u2182')
                                     return 0x10;
                             // U+2135..U+2138 are ALEF SYMBOL..DALET SYMBOL, i.e. Hebrew
                             // letterlike symbols (this range used to be labelled "Arabic").
                             if ('\u2135' <= c && c <= '\u2138')
                                     return 4;
                             // I believe that Windows has a bug on setting level 3
                             // weight here. NFKD results in different values.
                             // Both bounds are exclusive: U+FE81..U+FEFF.
                             if ('\uFE80' < c && c < '\uFF00') {
                                     // 0(Isolated, would be 2)/8(Final)/0x18(Medial)/0x10(Initial).
                                     // Any other decomposition type falls through to the
                                     // checks below.
                                     switch (decompType [(int) c]) {
                                     case DecompositionIsolated:
                                             return 0; // 2;
                                     case DecompositionFinal:
                                             return 8;
                                     case DecompositionMedial:
                                             return 0x18;
                                     case DecompositionInitial:
                                             return 0x10;
                                     }
                             }

                             // I have no idea why those symbols have level 3 weight
                             if (c == '\u2104' || c == '\u212B')
                                     return 0x18;
                             // U+2135..U+2138 and U+212B have already returned above, so
                             // this range yields 0x10 for the remaining code points only.
                             if ('\u211E' <= c && c <= '\u212B')
                                     return 0x10;

                             // actually I dunno the reason why they have weights.
                             switch (c) {
                             case '\u01BC':
                                     return 0x10;
                             case '\u06A9':
                                     return 0x20;
                             case '\u06AA':
                                     return 0x28;
                             // Gurmukhi
                             case '\u0A39':
                             case '\u0A59':
                             case '\u0A5A':
                             case '\u0A5B':
                             case '\u0A5E':
                                     return 0x10;
                             }

                             // Initial value that the flag bits below are OR-ed into.
                             byte ret = 0;
                             switch (c) {
                             case '\u03C2':
                             // NOTE(review): '\u212B' can never reach this switch because
                             // it returned 0x18 above; this label is dead code. Confirm
                             // which of the two values was intended.
                             case '\u212B':
                                     ret = 8;
                                     break;
                             case '\uFE42':
                                     ret = 0xA;
                                     break;
                             }

                             // misc
                             // For these three types the numeric decomposition type value
                             // itself is OR-ed into the weight.
                             switch (decompType [(int) c]) {
                             case DecompositionWide: // <wide>
                             case DecompositionSub: // <sub>
                             case DecompositionSuper: // <super>
                                     ret |= decompType [(int) c];
                                     break;
                             }
                             if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
                                     ret |= 8;
                             if (isUppercase [(int) c]) // DerivedCoreProperties
                                     ret |= 0x10;

                             return ret;
                     }
3925
3926                 #endregion
3927
3928                 #region IsIgnorable
3929 /*
3930                 static bool IsIgnorable (int i)
3931                 {
3932                         if (unicodeAge [i] >= 3.1)
3933                                 return true;
3934                         switch (char.GetUnicodeCategory ((char) i)) {
3935                         case UnicodeCategory.OtherNotAssigned:
3936                         case UnicodeCategory.Format:
3937                                 return true;
3938                         }
3939                         return false;
3940                 }
3941 */
3942
3943                 // FIXME: In the future use DerivedAge.txt to examine character
3944                 // versions and set those ones that have higher version than
3945                 // 1.0 as ignorable.
                     // Tells whether the code point i is completely ignorable.
                     //
                     // The decision is made in three ordered steps:
                     //   1. an explicit list of code points (first group: ignorable,
                     //      second group: exceptions that are NOT ignorable even though
                     //      a later step would otherwise say so),
                     //   2. a list of hard-coded ranges that are ignorable as a whole,
                     //   3. the Unicode category: Format and OtherNotAssigned are
                     //      ignorable, everything else is not.
                     static bool IsIgnorable (int i)
                     {
                             switch (i) {
                             case 0:
                             // I guess, those characters are added between
                             // Unicode 1.0 (LCMapString) and Unicode 3.1
                             // (UnicodeCategory), so they used to be
                             // something like OtherNotAssigned as of Unicode 1.1.
                             case 0x2df: case 0x387:
                             case 0x3d7: case 0x3d8: case 0x3d9:
                             case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
                             case 0x400: case 0x40d: case 0x450: case 0x45d:
                             case 0x587: case 0x58a: case 0x5c4: case 0x640:
                             case 0x653: case 0x654: case 0x655: case 0x66d:
                             case 0xb56:
                             case 0x1e9b: case 0x202f: case 0x20ad:
                             case 0x20ae: case 0x20af:
                             case 0x20e2: case 0x20e3:
                             case 0x2139: case 0x213a: case 0x2183:
                             case 0x2425: case 0x2426: case 0x2619:
                             case 0x2670: case 0x2671: case 0x3007:
                             case 0x3190: case 0x3191:
                             case 0xfffc: case 0xfffd:
                                     return true;
                             // Exceptional characters that the conditions below
                             // would otherwise filter out. The ranges below are
                             // strictly speaking too wide (these characters must
                             // not be ignored), and most of these exceptions
                             // unfortunately lie inside those ranges.
                             case 0x4d8: case 0x4d9:
                             case 0x4e8: case 0x4e9:
                             case 0x70F:
                             case 0x3036: case 0x303f:
                             case 0x337b: case 0xfb1e:
                                     return false;
                             }

                             // Ranges that are ignorable as a whole.
                             if (
                                     // The whole Sinhala characters.
                                     0x0D82 <= i && i <= 0x0DF4
                                     // The whole Tibetan characters.
                                     || 0x0F00 <= i && i <= 0x0FD1
                                     // The whole Myanmar characters.
                                     || 0x1000 <= i && i <= 0x1059
                                     // The whole Ethiopic, Cherokee,
                                     // Canadian Syllabic, Ogham, Runic,
                                     // Tagalog, Hanunoo, Philippine,
                                     // Buhid, Tagbanwa, Khmer and Mongolian
                                     // characters.
                                     || 0x1200 <= i && i <= 0x1DFF
                                     // Greek extension characters.
                                     || 0x1F00 <= i && i <= 0x1FFF
                                     // The whole Braille characters.
                                     || 0x2800 <= i && i <= 0x28FF
                                     // CJK radical characters.
                                     || 0x2E80 <= i && i <= 0x2EF3
                                     // Kangxi radical characters.
                                     || 0x2F00 <= i && i <= 0x2FD5
                                     // Ideographic description characters.
                                     || 0x2FF0 <= i && i <= 0x2FFB
                                     // Bopomofo letter and final
                                     || 0x31A0 <= i && i <= 0x31B7
                                     // White square with quadrant characters.
                                     || 0x25F0 <= i && i <= 0x25F7
                                     // Ideographic telegraph symbols.
                                     || 0x32C0 <= i && i <= 0x32CB
                                     || 0x3358 <= i && i <= 0x3370
                                     || 0x33E0 <= i && i <= 0x33FF
                                     // The whole YI characters.
                                     || 0xA000 <= i && i <= 0xA48C
                                     || 0xA490 <= i && i <= 0xA4C6
                                     // Armenian small ligatures
                                     || 0xFB13 <= i && i <= 0xFB17
                                     // hebrew, arabic, variation selector.
                                     || 0xFB1D <= i && i <= 0xFE2F
                                     // Arabic ligatures.
                                     || 0xFEF5 <= i && i <= 0xFEFC
                                     // FIXME: why are they excluded?
                                     || 0x01F6 <= i && i <= 0x01F9
                                     || 0x0218 <= i && i <= 0x0233
                                     || 0x02A9 <= i && i <= 0x02AD
                                     || 0x02EA <= i && i <= 0x02EE
                                     || 0x0349 <= i && i <= 0x036F
                                     || 0x0488 <= i && i <= 0x048F
                                     || 0x04D0 <= i && i <= 0x04FF
                                     || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
                                     || 0x06D6 <= i && i <= 0x06ED
                                     || 0x06FA <= i && i <= 0x06FE
                                     || 0x2048 <= i && i <= 0x204D
                                     || 0x20e4 <= i && i <= 0x20ea
                                     || 0x213C <= i && i <= 0x214B
                                     || 0x21EB <= i && i <= 0x21FF
                                     || 0x22F2 <= i && i <= 0x22FF
                                     || 0x237B <= i && i <= 0x239A
                                     || 0x239B <= i && i <= 0x23CF
                                     || 0x24EB <= i && i <= 0x24FF
                                     || 0x2596 <= i && i <= 0x259F
                                     || 0x25F8 <= i && i <= 0x25FF
                                     || 0x2672 <= i && i <= 0x2689
                                     || 0x2768 <= i && i <= 0x2775
                                     || 0x27d0 <= i && i <= 0x27ff
                                     || 0x2900 <= i && i <= 0x2aff
                                     || 0x3033 <= i && i <= 0x303F
                                     || 0x31F0 <= i && i <= 0x31FF
                                     || 0x3250 <= i && i <= 0x325F
                                     || 0x32B1 <= i && i <= 0x32BF
                                     || 0x3371 <= i && i <= 0x337B
                                     || 0xFA30 <= i && i <= 0xFA6A
                             )
                                     return true;

                             // Everything else is decided by the Unicode category that
                             // the runtime reports for the code point.
                             UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
                             switch (uc) {
                             case UnicodeCategory.PrivateUse:
                             case UnicodeCategory.Surrogate:
                                     return false;
                             // ignored by nature
                             case UnicodeCategory.Format:
                             case UnicodeCategory.OtherNotAssigned:
                                     return true;
                             default:
                                     return false;
                             }
                     }
4070
4071                 // To check IsIgnorable sanity, try the driver below under MS.NET.
4072
4073                 /*
4074                 public static void Main ()
4075                 {
4076                         for (int i = 0; i <= char.MaxValue; i++)
4077                                 Dump (i, IsIgnorable (i));
4078                 }
4079
4080                 static void Dump (int i, bool ignore)
4081                 {
4082                         switch (Char.GetUnicodeCategory ((char) i)) {
4083                         case UnicodeCategory.PrivateUse:
4084                         case UnicodeCategory.Surrogate:
4085                                 return; // check nothing
4086                         }
4087
4088                         string s1 = "";
4089                         string s2 = new string ((char) i, 10);
4090                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
4091                         if ((ret == 0) == ignore)
4092                                 return;
4093                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
4094                 }
4095                 */
4096                 #endregion // IsIgnorable
4097
4098                 #region IsIgnorableSymbol
                     // Tells whether the code point i is ignored as a symbol.
                     //
                     // Everything that IsIgnorable() reports is ignorable here too.
                     // After that the decision is made, in order, by an explicit list
                     // of code points (first group: true, second group: false), a few
                     // hard-coded ranges, and finally the Unicode category together
                     // with the Char.Is*() classification helpers.
                     static bool IsIgnorableSymbol (int i)
                     {
                             if (IsIgnorable (i))
                                     return true;

                             switch (i) {
                             // *Letter
                             case 0x00b5: case 0x01C0: case 0x01C1:
                             case 0x01C2: case 0x01C3: case 0x01F6:
                             case 0x01F7: case 0x01F8: case 0x01F9:
                             case 0x02D0: case 0x02EE: case 0x037A:
                             case 0x03D7: case 0x03F3:
                             case 0x0400: case 0x040d:
                             case 0x0450: case 0x045d:
                             case 0x048C: case 0x048D:
                             case 0x048E: case 0x048F:
                             case 0x0587: case 0x0640: case 0x06E5:
                             case 0x06E6: case 0x06FA: case 0x06FB:
                             case 0x06FC: case 0x093D: case 0x0950:
                             case 0x1E9B: case 0x2139: case 0x3006:
                             case 0x3033: case 0x3034: case 0x3035:
                             case 0xFE7E: case 0xFE7F:
                             // OtherNumber
                             case 0x16EE: case 0x16EF: case 0x16F0:
                             // LetterNumber
                             case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
                             case 0x3007: // IDEOGRAPHIC NUMBER ZERO
                             case 0x3038: // HANGZHOU NUMERAL TEN
                             case 0x3039: // HANGZHOU NUMERAL TWENTY
                             case 0x303a: // HANGZHOU NUMERAL THIRTY
                             // OtherSymbol
                             case 0x2117:
                             case 0x327F:
                                     return true;
                             // The code points below are explicitly NOT ignorable.
                             // ModifierSymbol
                             case 0x02B9: case 0x02BA: case 0x02C2:
                             case 0x02C3: case 0x02C4: case 0x02C5:
                             case 0x02C8: case 0x02CC: case 0x02CD:
                             case 0x02CE: case 0x02CF: case 0x02D2:
                             case 0x02D3: case 0x02D4: case 0x02D5:
                             case 0x02D6: case 0x02D7: case 0x02DE:
                             case 0x02E5: case 0x02E6: case 0x02E7:
                             case 0x02E8: case 0x02E9:
                             case 0x309B: case 0x309C:
                             // OtherPunctuation
                             case 0x055A: // Armenian Apos
                             case 0x05C0: // Hebrew Punct
                             case 0x0E4F: // Thai FONGMAN
                             case 0x0E5A: // Thai ANGKHANKHU
                             case 0x0E5B: // Thai KHOMUT
                             // CurrencySymbol
                             case 0x09F2: // Bengali Rupee Mark
                             case 0x09F3: // Bengali Rupee Sign
                             // MathSymbol
                             case 0x221e: // INF.
                             // OtherSymbol
                             case 0x0482:
                             case 0x09FA:
                             case 0x0B70:
                                     return false;
                             }

                             // *Letter
                             // Note that the first range excludes its upper bound (0xFE7C).
                             if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
     #if NET_2_0
                                     || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
                                     || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
     #endif
                             )
                                     return true;

                             UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
                             switch (uc) {
                             case UnicodeCategory.Surrogate:
                                     return false; // inconsistent

                             case UnicodeCategory.SpacingCombiningMark:
                             case UnicodeCategory.EnclosingMark:
                             case UnicodeCategory.NonSpacingMark:
                             case UnicodeCategory.PrivateUse:
                                     // NonSpacingMark
                                     // Marks and private use characters are kept, except
                                     // for the Arabic range below.
                                     if (0x064B <= i && i <= 0x0652) // Arabic
                                             return true;
                                     return false;

                             case UnicodeCategory.Format:
                             case UnicodeCategory.OtherNotAssigned:
                                     return true;

                             default:
                                     // Inside the ranges below, a character that is
                                     // neither a letter nor a digit is kept (not ignored).
                                     bool use = false;
                                     // OtherSymbols
                                     if (
                                             // latin in a circle
                                             0x249A <= i && i <= 0x24E9
                                             || 0x2100 <= i && i <= 0x2132
                                             // Japanese
                                             || 0x3196 <= i && i <= 0x31A0
                                             // Korean
                                             || 0x3200 <= i && i <= 0x321C
                                             // Chinese/Japanese
                                             || 0x322A <= i && i <= 0x3243
                                             // CJK
                                             || 0x3260 <= i && i <= 0x32B0
                                             || 0x32D0 <= i && i <= 0x3357
                                             || 0x337B <= i && i <= 0x33DD
                                     )
                                             use = !Char.IsLetterOrDigit ((char) i);
                                     if (use)
                                             return false;

                                     // This "Digit" rule is mystery.
                                     // It filters some symbols out.
                                     if (Char.IsLetterOrDigit ((char) i))
                                             return false;
                                     if (Char.IsNumber ((char) i))
                                             return false;
                                     if (Char.IsControl ((char) i)
                                             || Char.IsSeparator ((char) i)
                                             || Char.IsPunctuation ((char) i))
                                             return true;
                                     if (Char.IsSymbol ((char) i))
                                             return true;

                                     // FIXME: should check more
                                     return false;
                             }
                     }
4227
4228                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
4229 /*
4230                 public static void Main ()
4231                 {
4232                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
4233                         for (int i = 0; i <= char.MaxValue; i++) {
4234                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4235                                 if (uc == UnicodeCategory.Surrogate)
4236                                         continue;
4237
4238                                 bool ret = IsIgnorableSymbol (i);
4239
4240                                 string s1 = "TEST ";
4241                                 string s2 = "TEST " + (char) i;
4242
4243                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
4244
4245                                 if (ret != (result == 0))
4246                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
4247                                                 ret ? "should not ignore" :
4248                                                         "should ignore",
4249                                                 i,(char) i, uc);
4250                         }
4251                 }
4252 */
4253                 #endregion
4254
4255                 #region NonSpacing
4256                 static bool IsIgnorableNonSpacing (int i)
4257                 {
4258                         if (IsIgnorable (i))
4259                                 return true;
4260
4261                         switch (i) {
4262                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
4263                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
4264                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
4265                                 return true;
4266                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
4267                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
4268                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
4269                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
4270                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
4271                         case 0x0CCD: case 0x0E4E:
4272                                 return false;
4273                         }
4274
4275                         if (0x02b9 <= i && i <= 0x02c5
4276                                 || 0x02cc <= i && i <= 0x02d7
4277                                 || 0x02e4 <= i && i <= 0x02ef
4278                                 || 0x20DD <= i && i <= 0x20E0
4279                         )
4280                                 return true;
4281
4282                         if (0x064B <= i && i <= 0x00652
4283                                 || 0x0941 <= i && i <= 0x0948
4284                                 || 0x0AC1 <= i && i <= 0x0ACD
4285                                 || 0x0C3E <= i && i <= 0x0C4F
4286                                 || 0x0E31 <= i && i <= 0x0E3F
4287                         )
4288                                 return false;
4289
4290                         return Char.GetUnicodeCategory ((char) i) ==
4291                                 UnicodeCategory.NonSpacingMark;
4292                 }
4293
4294                 // We can reuse IsIgnorableSymbol testcode
4295                 // for IsIgnorableNonSpacing.
4296                 #endregion
4297         }
4298
4299         struct CharMapEntry
4300         {
4301                 public byte Category;
4302                 public byte Level1;
4303                 public byte Level2; // It is always single byte.
4304                 public bool Defined;
4305
4306                 public CharMapEntry (byte category, byte level1, byte level2)
4307                 {
4308                         Category = category;
4309                         Level1 = level1;
4310                         Level2 = level2;
4311                         Defined = true;
4312                 }
4313         }
4314
4315         class JISCharacter
4316         {
4317                 public readonly int CP;
4318                 public readonly int JIS;
4319
4320                 public JISCharacter (int cp, int cpJIS)
4321                 {
4322                         CP = cp;
4323                         JIS = cpJIS;
4324                 }
4325         }
4326
4327         class JISComparer : IComparer
4328         {
4329                 public static readonly JISComparer Instance =
4330                         new JISComparer ();
4331
4332                 public int Compare (object o1, object o2)
4333                 {
4334                         JISCharacter j1 = (JISCharacter) o1;
4335                         JISCharacter j2 = (JISCharacter) o2;
4336                         return j1.JIS - j2.JIS;
4337                 }
4338         }
4339
4340         class NonJISCharacter
4341         {
4342                 public readonly int CP;
4343                 public readonly string Name;
4344
4345                 public NonJISCharacter (int cp, string name)
4346                 {
4347                         CP = cp;
4348                         Name = name;
4349                 }
4350         }
4351
4352         class NonJISComparer : IComparer
4353         {
4354                 public static readonly NonJISComparer Instance =
4355                         new NonJISComparer ();
4356
4357                 public int Compare (object o1, object o2)
4358                 {
4359                         NonJISCharacter j1 = (NonJISCharacter) o1;
4360                         NonJISCharacter j2 = (NonJISCharacter) o2;
4361                         return string.CompareOrdinal (j1.Name, j2.Name);
4362                 }
4363         }
4364
4365         class DecimalDictionaryValueComparer : IComparer
4366         {
4367                 public static readonly DecimalDictionaryValueComparer Instance
4368                         = new DecimalDictionaryValueComparer ();
4369
4370                 private DecimalDictionaryValueComparer ()
4371                 {
4372                 }
4373
4374                 public int Compare (object o1, object o2)
4375                 {
4376                         DictionaryEntry e1 = (DictionaryEntry) o1;
4377                         DictionaryEntry e2 = (DictionaryEntry) o2;
4378                         // FIXME: in case of 0, compare decomposition categories
4379                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
4380                         if (ret != 0)
4381                                 return ret;
4382                         int i1 = (int) e1.Key;
4383                         int i2 = (int) e2.Key;
4384                         return i1 - i2;
4385                 }
4386         }
4387
4388         class StringDictionaryValueComparer : IComparer
4389         {
4390                 public static readonly StringDictionaryValueComparer Instance
4391                         = new StringDictionaryValueComparer ();
4392
4393                 private StringDictionaryValueComparer ()
4394                 {
4395                 }
4396
4397                 public int Compare (object o1, object o2)
4398                 {
4399                         DictionaryEntry e1 = (DictionaryEntry) o1;
4400                         DictionaryEntry e2 = (DictionaryEntry) o2;
4401                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
4402                         if (ret != 0)
4403                                 return ret;
4404                         int i1 = (int) e1.Key;
4405                         int i2 = (int) e2.Key;
4406                         return i1 - i2;
4407                 }
4408         }
4409
4410         class UCAComparer : IComparer
4411         {
4412                 public static readonly UCAComparer Instance
4413                         = new UCAComparer ();
4414
4415                 private UCAComparer ()
4416                 {
4417                 }
4418
4419                 public int Compare (object o1, object o2)
4420                 {
4421                         char i1 = (char) o1;
4422                         char i2 = (char) o2;
4423
4424                         int l1 = CollationElementTable.GetSortKeyCount (i1);
4425                         int l2 = CollationElementTable.GetSortKeyCount (i2);
4426                         int l = l1 > l2 ? l2 : l1;
4427
4428                         for (int i = 0; i < l; i++) {
4429                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4430                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
4431                                 int v = k1.Primary - k2.Primary;
4432                                 if (v != 0)
4433                                         return v;
4434                                 v = k1.Secondary - k2.Secondary;
4435                                 if (v != 0)
4436                                         return v;
4437                                 v = k1.Thirtiary - k2.Thirtiary;
4438                                 if (v != 0)
4439                                         return v;
4440                                 v = k1.Quarternary - k2.Quarternary;
4441                                 if (v != 0)
4442                                         return v;
4443                         }
4444                         return l1 - l2;
4445                 }
4446         }
4447
4448         class Tailoring
4449         {
4450                 int lcid;
4451                 int alias;
4452                 bool frenchSort;
4453                 ArrayList items = new ArrayList ();
4454
4455                 public Tailoring (int lcid)
4456                         : this (lcid, 0)
4457                 {
4458                 }
4459
4460                 public Tailoring (int lcid, int alias)
4461                 {
4462                         this.lcid = lcid;
4463                         this.alias = alias;
4464                 }
4465
4466                 public int LCID {
4467                         get { return lcid; }
4468                 }
4469
4470                 public int Alias {
4471                         get { return alias; }
4472                 }
4473
4474                 public bool FrenchSort {
4475                         get { return frenchSort; }
4476                         set { frenchSort = value; }
4477                 }
4478
4479                 public void AddDiacriticalMap (byte target, byte replace)
4480                 {
4481                         items.Add (new DiacriticalMap (target, replace));
4482                 }
4483
4484                 public void AddSortKeyMap (string source, byte [] sortkey)
4485                 {
4486                         items.Add (new SortKeyMap (source, sortkey));
4487                 }
4488
4489                 public void AddReplacementMap (string source, string replace)
4490                 {
4491                         items.Add (new ReplacementMap (source, replace));
4492                 }
4493
4494                 public char [] ItemToCharArray ()
4495                 {
4496                         ArrayList al = new ArrayList ();
4497                         foreach (ITailoringMap m in items)
4498                                 al.AddRange (m.ToCharArray ());
4499                         return al.ToArray (typeof (char)) as char [];
4500                 }
4501
4502                 interface ITailoringMap
4503                 {
4504                         char [] ToCharArray ();
4505                 }
4506
4507                 class DiacriticalMap : ITailoringMap
4508                 {
4509                         public readonly byte Target;
4510                         public readonly byte Replace;
4511
4512                         public DiacriticalMap (byte target, byte replace)
4513                         {
4514                                 Target = target;
4515                                 Replace = replace;
4516                         }
4517
4518                         public char [] ToCharArray ()
4519                         {
4520                                 char [] ret = new char [3];
4521                                 ret [0] = (char) 02; // kind:DiacriticalMap
4522                                 ret [1] = (char) Target;
4523                                 ret [2] = (char) Replace;
4524                                 return ret;
4525                         }
4526                 }
4527
4528                 class SortKeyMap : ITailoringMap
4529                 {
4530                         public readonly string Source;
4531                         public readonly byte [] SortKey;
4532
4533                         public SortKeyMap (string source, byte [] sortkey)
4534                         {
4535                                 Source = source;
4536                                 SortKey = sortkey;
4537                         }
4538
4539                         public char [] ToCharArray ()
4540                         {
4541                                 char [] ret = new char [Source.Length + 7];
4542                                 ret [0] = (char) 01; // kind:SortKeyMap
4543                                 for (int i = 0; i < Source.Length; i++)
4544                                         ret [i + 1] = Source [i];
4545                                 // null terminate
4546                                 for (int i = 0; i < 4; i++)
4547                                         ret [i + Source.Length + 2] = (char) SortKey [i];
4548                                 return ret;
4549                         }
4550                 }
4551
4552                 class ReplacementMap : ITailoringMap
4553                 {
4554                         public readonly string Source;
4555                         public readonly string Replace;
4556
4557                         public ReplacementMap (string source, string replace)
4558                         {
4559                                 Source = source;
4560                                 Replace = replace;
4561                         }
4562
4563                         public char [] ToCharArray ()
4564                         {
4565                                 char [] ret = new char [Source.Length + Replace.Length + 3];
4566                                 ret [0] = (char) 03; // kind:ReplaceMap
4567                                 int pos = 1;
4568                                 for (int i = 0; i < Source.Length; i++)
4569                                         ret [pos++] = Source [i];
4570                                 // null terminate
4571                                 pos++;
4572                                 for (int i = 0; i < Replace.Length; i++)
4573                                         ret [pos++] = Replace [i];
4574                                 // null terminate
4575                                 return ret;
4576                         }
4577                 }
4578         }
4579 }