mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
// There are two kinds of sort keys: those which are computed and those which
// are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
// Also, for composite characters it should prepare a different index table.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27 #define Binary
  28
  29 using System;
  30 using System.IO;
  31 using System.Collections;
  32 using System.Globalization;
  33 using System.Text;
  34 using System.Xml;
  35
  36 namespace Mono.Globalization.Unicode
  37 {
  38         internal class MSCompatSortKeyTableGenerator
  39         {
  40                 public static void Main (string [] args)
  41                 {
  42                         new MSCompatSortKeyTableGenerator ().Run (args);
  43                 }
  44
                // Numeric codes for the kinds of decomposition. Serialize compares
                // decompType entries against Narrow, Wide, Super and Sub.
                // NOTE(review): the "fixed" markers presumably mean those values
                // must not be renumbered -- confirm against the consumers.
                const int DecompositionWide = 1; // fixed
                const int DecompositionSub = 2; // fixed
                const int DecompositionSmall = 3;
                const int DecompositionIsolated = 4;
                const int DecompositionInitial = 5;
                const int DecompositionFinal = 6;
                const int DecompositionMedial = 7;
                const int DecompositionNoBreak = 8;
                const int DecompositionVertical = 9;
                const int DecompositionFraction = 0xA;
                const int DecompositionFont = 0xB;
                const int DecompositionSuper = 0xC; // fixed
                // Full (0xE) is declared before Narrow (0xD); the values are as
                // written, only the declaration order is out of numeric sequence.
                const int DecompositionFull = 0xE;
                const int DecompositionNarrow = 0xD;
                const int DecompositionCircle = 0xF;
                const int DecompositionSquare = 0x10;
                const int DecompositionCompat = 0x11;
                const int DecompositionCanonical = 0x12;
  63
                // Destination of the generated C# text; standard output by default.
                TextWriter Result = Console.Out;

                byte [] fillIndex = new byte [256]; // by category
                // One entry per UTF-16 code unit; Serialize reads Category, Level1
                // and Level2 from each entry.
                CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];

                // NOTE(review): how this list is consumed is not visible in this
                // part of the file.
                char [] specialIgnore = new char [] {
                        '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
                        '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
                        };

                // FIXME: need more love (as always)
                // alphabets and alphaWeights both hold 29 entries.
                // NOTE(review): presumably alphaWeights [n] is the weight of
                // alphabets [n] -- confirm at the point of use.
                char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
                        'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
                        'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                        '\u0292', '\u01BE', '\u0298'};
                byte [] alphaWeights = new byte [] {
                        2, 9, 0xA, 0x1A, 0x21,
                        0x23, 0x25, 0x2C, 0x32, 0x35,
                        0x36, 0x48, 0x51, 0x70, 0x7C,
                        0x7E, 0x89, 0x8A, 0x91, 0x99,
                        0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
                        0xA9, 0xAA, 0xB3, 0xB4};

                // Per-code-unit flags (one slot for every char value).
                bool [] isSmallCapital = new bool [char.MaxValue + 1];
                bool [] isUppercase = new bool [char.MaxValue + 1];

                // Decomposition data, one slot per char value. decompType holds
                // one of the Decomposition* codes; Serialize uses
                // decompIndex [cp] as an index into decompValues.
                byte [] decompType = new byte [char.MaxValue + 1];
                int [] decompIndex = new int [char.MaxValue + 1];
                int [] decompLength = new int [char.MaxValue + 1];
                // Not allocated here; assigned elsewhere in the file.
                int [] decompValues;
                decimal [] decimalValue = new decimal [char.MaxValue + 1];

                byte [] diacritical = new byte [char.MaxValue + 1];
  97
                // Fragments of character names that denote a diacritic or similar
                // modifier. Several end with ';' or start with a space, so the
                // exact spelling of every literal matters.
                // NOTE(review): presumably matched against UnicodeData.txt name
                // fields -- the matching code is not visible here.
                string [] diacritics = new string [] {
                        // LATIN, CYRILLIC etc.
                        "UPTURN", "DOUBLE-STRUCK",
                        "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
                        "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
                        "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
                        "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
                        " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
                        "WITH OGONEK;", "WITH CEDILLA;",
                        //
                        " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
                        "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
                        "STROKE OVERLAY",
                        " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
                        " DIAERESIS AND GRAVE;",
                        " BREVE AND ACUTE;",
                        " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
                        " MACRON AND ACUTE;",
                        " MACRON AND GRAVE;",
                        //
                        " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
                        " RING ABOVE AND ACUTE",
                        " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
                        " CIRCUMFLEX AND TILDE",
                        " TILDE AND DIAERESIS",
                        " STROKE AND ACUTE",
                        " BREVE AND TILDE",
                        " CEDILLA AND BREVE",
                        " OGONEK AND MACRON",
                        //
                        "WITH OVERLINE",
                        "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
                        " DOUBLE GRAVE",
                        " INVERTED BREVE",
                        "ROMAN NUMERAL",
                        " PRECEDED BY APOSTROPHE",
                        "WITH HORN;",
                        " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
                        " PALATAL HOOK",
                        " DOT BELOW;",
                        " RETROFLEX;", "DIAERESIS BELOW",
                        " RING BELOW",
                        //
                        " CIRCUMFLEX BELOW", "HORN AND ACUTE",
                        " BREVE BELOW;", " HORN AND GRAVE",
                        " TILDE BELOW",
                        " TOPBAR",
                        " DOT BELOW AND DOT ABOVE",
                        " RIGHT HALF RING", " HORN AND TILDE",
                        " CIRCUMFLEX AND DOT BELOW",
                        " BREVE AND DOT BELOW",
                        " DOT BELOW AND MACRON",
                        " TONE TWO",
                        " HORN AND HOOK ABOVE",
                        " HORN AND DOT",
                        // CIRCLED, PARENTHESIZED and so on
                        "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
                        "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
                        "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
                        };
                // Same number of entries as diacritics (87), grouped by the same
                // "//" separators.
                // NOTE(review): presumably diacriticWeights [n] belongs to
                // diacritics [n]; keep both tables in step when editing either.
                byte [] diacriticWeights = new byte [] {
                        // LATIN.
                        3, 3, 5, 5,
                        0xF, 0xE, 0x12,
                        0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
                        0x17, 0x19, 0x1A, 0x1B, 0x1C,
                        //
                        0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
                        0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
                        //
                        0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
                        0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
                        //
                        0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
                        0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
                        //
                        0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
                        0x69, 0x69, 0x6A, 0x6D, 0x6E,
                        0x87, 0x95, 0xAA,
                        // CIRCLED, PARENTHESIZED and so on.
                        0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
                        0xF3, 0xF3, 0xF3
                        };
 181
                // 28 ascending code point values.
                // NOTE(review): they look like (start, end) pairs of digit ranges
                // per script -- confirm where the table is consumed.
                int [] numberSecondaryWeightBounds = new int [] {
                        0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
                        0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
                        0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
                        0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
                        0xE50, 0xE60, 0xED0, 0xEE0
                        };

                // Per-script character orderings; not initialized here, so they
                // have to be assigned elsewhere in the file before use.
                char [] orderedGurmukhi;
                char [] orderedGujarati;
                char [] orderedGeorgian;
                char [] orderedThaana;

                static readonly char [] orderedTamilConsonants = new char [] {
                        // based on traditional Tamil consonants, except for
                        // Grantha (where Microsoft breaks traditionalism).
                        // http://www.angelfire.com/empire/thamizh/padanGaL
                        '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
                        '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
                        '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
                        '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
                        '\u0BB7', '\u0BB9'};
 204
                // Working collections filled while parsing. The "x -> y" notes
                // describe what each collection maps from and to.

                // cp -> character name (only for some characters)
                ArrayList sortableCharNames = new ArrayList ();

                // cp -> arrow value (int)
                ArrayList arrowValues = new ArrayList ();

                // cp -> box value (int)
                ArrayList boxValues = new ArrayList ();

                // cp -> level1 value
                Hashtable arabicLetterPrimaryValues = new Hashtable ();

                // letterName -> cp
                Hashtable arabicNameMap = new Hashtable ();

                // cp -> Hashtable [decompType] -> cp
                Hashtable nfkdMap = new Hashtable ();

                // Latin letter -> ArrayList [int]
                Hashtable latinMap = new Hashtable ();

                ArrayList jisJapanese = new ArrayList ();
                ArrayList nonJisJapanese = new ArrayList ();

                // CJK weight tables, one slot per char value. The trailing
                // "- 0x...." remarks are disabled size reductions; Serialize
                // compresses these arrays before writing them out.
                ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
                ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
                ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
                ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
                byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];

                // One flag byte per char value; compressed and emitted by Serialize.
                byte [] ignorableFlags = new byte [char.MaxValue + 1];

                // Static, unlike the other tables: one value per char value.
                static double [] unicodeAge = new double [char.MaxValue + 1];

                // Tailoring entries; SerializeTailorings enumerates this list.
                ArrayList tailorings = new ArrayList ();
 240
 241                 void Run (string [] args)
 242                 {
 243                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 244                         ParseSources (dirname);
 245                         Console.Error.WriteLine ("parse done.");
 246
 247                         ModifyParsedValues ();
 248                         GenerateCore ();
 249                         Console.Error.WriteLine ("generation done.");
 250                         Serialize ();
 251                         Console.Error.WriteLine ("serialization done.");
 252 /*
 253 StreamWriter sw = new StreamWriter ("agelog.txt");
 254 for (int i = 0; i < char.MaxValue; i++) {
 255 bool shouldBe = false;
 256 switch (Char.GetUnicodeCategory ((char) i)) {
 257 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 258         shouldBe = true; break;
 259 }
 260 if (unicodeAge [i] >= 3.1)
 261         shouldBe = true;
 262 //if (IsIgnorable (i) != shouldBe)
 263 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 264 }
 265 sw.Close ();
 266 */
 267                 }
 268
 269                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 270                 {
 271                         return (byte []) CodePointIndexer.CompressArray  (
 272                                 source, typeof (byte), i);
 273                 }
 274
 275                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 276                 {
 277                         return (ushort []) CodePointIndexer.CompressArray  (
 278                                 source, typeof (ushort), i);
 279                 }
 280
 281                 void Serialize ()
 282                 {
 283                         // Tailorings
 284                         SerializeTailorings ();
 285
 286                         byte [] categories = new byte [map.Length];
 287                         byte [] level1 = new byte [map.Length];
 288                         byte [] level2 = new byte [map.Length];
 289                         byte [] level3 = new byte [map.Length];
 290                         ushort [] widthCompat = new ushort [map.Length];
 291                         for (int i = 0; i < map.Length; i++) {
 292                                 categories [i] = map [i].Category;
 293                                 level1 [i] = map [i].Level1;
 294                                 level2 [i] = map [i].Level2;
 295                                 level3 [i] = ComputeLevel3Weight ((char) i);
 296                                 // For Japanese Half-width characters, don't
 297                                 // map widthCompat. It is IgnoreKanaType that
 298                                 // handles those width differences.
 299                                 if (0xFF6D <= i && i <= 0xFF9D)
 300                                         continue;
 301                                 switch (decompType [i]) {
 302                                 case DecompositionNarrow:
 303                                 case DecompositionWide:
 304                                 case DecompositionSuper:
 305                                 case DecompositionSub:
 306                                         // they are always 1 char
 307                                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
 308                                         break;
 309                                 }
 310                         }
 311
 312                         // compress
 313                         ignorableFlags = CompressArray (ignorableFlags,
 314                                 MSCompatUnicodeTableUtil.Ignorable);
 315                         categories = CompressArray (categories,
 316                                 MSCompatUnicodeTableUtil.Category);
 317                         level1 = CompressArray (level1,
 318                                 MSCompatUnicodeTableUtil.Level1);
 319                         level2 = CompressArray (level2,
 320                                 MSCompatUnicodeTableUtil.Level2);
 321                         level3 = CompressArray (level3,
 322                                 MSCompatUnicodeTableUtil.Level3);
 323                         widthCompat = (ushort []) CodePointIndexer.CompressArray (
 324                                 widthCompat, typeof (ushort),
 325                                 MSCompatUnicodeTableUtil.WidthCompat);
 326                         cjkCHS = CompressArray (cjkCHS,
 327                                 MSCompatUnicodeTableUtil.CjkCHS);
 328                         cjkCHT = CompressArray (cjkCHT,
 329                                 MSCompatUnicodeTableUtil.Cjk);
 330                         cjkJA = CompressArray (cjkJA,
 331                                 MSCompatUnicodeTableUtil.Cjk);
 332                         cjkKO = CompressArray (cjkKO,
 333                                 MSCompatUnicodeTableUtil.Cjk);
 334                         cjkKOlv2 = CompressArray (cjkKOlv2,
 335                                 MSCompatUnicodeTableUtil.Cjk);
 336
 337                         // Ignorables
 338                         Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
 339 #if Binary
 340                         MemoryStream ms = new MemoryStream ();
 341                         BinaryWriter binary = new BinaryWriter (ms);
 342                         binary.Write (ignorableFlags.Length);
 343 #endif
 344                         for (int i = 0; i < ignorableFlags.Length; i++) {
 345                                 byte value = ignorableFlags [i];
 346                                 if (value < 10)
 347                                         Result.Write ("{0},", value);
 348                                 else
 349                                         Result.Write ("0x{0:X02},", value);
 350 #if Binary
 351                                 binary.Write (value);
 352 #endif
 353                                 if ((i & 0xF) == 0xF)
 354                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 355                         }
 356                         Result.WriteLine ("};");
 357                         Result.WriteLine ();
 358
 359                         // Primary category
 360                         Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
 361 #if Binary
 362                         binary.Write (categories.Length);
 363 #endif
 364                         for (int i = 0; i < categories.Length; i++) {
 365                                 byte value = categories [i];
 366                                 if (value < 10)
 367                                         Result.Write ("{0},", value);
 368                                 else
 369                                         Result.Write ("0x{0:X02},", value);
 370 #if Binary
 371                                 binary.Write (value);
 372 #endif
 373                                 if ((i & 0xF) == 0xF)
 374                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 375                         }
 376                         Result.WriteLine ("};");
 377                         Result.WriteLine ();
 378
 379                         // Primary weight value
 380                         Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
 381 #if Binary
 382                         binary.Write (level1.Length);
 383 #endif
 384                         for (int i = 0; i < level1.Length; i++) {
 385                                 byte value = level1 [i];
 386                                 if (value < 10)
 387                                         Result.Write ("{0},", value);
 388                                 else
 389                                         Result.Write ("0x{0:X02},", value);
 390 #if Binary
 391                                 binary.Write (value);
 392 #endif
 393                                 if ((i & 0xF) == 0xF)
 394                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 395                         }
 396                         Result.WriteLine ("};");
 397                         Result.WriteLine ();
 398
 399                         // Secondary weight
 400                         Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
 401 #if Binary
 402                         binary.Write (level2.Length);
 403 #endif
 404                         for (int i = 0; i < level2.Length; i++) {
 405                                 byte value = level2 [i];
 406                                 if (value < 10)
 407                                         Result.Write ("{0},", value);
 408                                 else
 409                                         Result.Write ("0x{0:X02},", value);
 410 #if Binary
 411                                 binary.Write (value);
 412 #endif
 413                                 if ((i & 0xF) == 0xF)
 414                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 415                         }
 416                         Result.WriteLine ("};");
 417                         Result.WriteLine ();
 418
 419                         // Thirtiary weight
 420                         Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
 421 #if Binary
 422                         binary.Write (level3.Length);
 423 #endif
 424                         for (int i = 0; i < level3.Length; i++) {
 425                                 byte value = level3 [i];
 426                                 if (value < 10)
 427                                         Result.Write ("{0},", value);
 428                                 else
 429                                         Result.Write ("0x{0:X02},", value);
 430 #if Binary
 431                                 binary.Write (value);
 432 #endif
 433                                 if ((i & 0xF) == 0xF)
 434                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 435                         }
 436                         Result.WriteLine ("};");
 437                         Result.WriteLine ();
 438
 439                         // Width insensitivity mappings
 440                         // (for now it is more lightweight than dumping the
 441                         // entire NFKD table).
 442                         Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
 443 #if Binary
 444                         binary.Write (widthCompat.Length);
 445 #endif
 446                         for (int i = 0; i < widthCompat.Length; i++) {
 447                                 ushort value = widthCompat [i];
 448                                 if (value < 10)
 449                                         Result.Write ("{0},", value);
 450                                 else
 451                                         Result.Write ("0x{0:X02},", value);
 452 #if Binary
 453                                 binary.Write (value);
 454 #endif
 455                                 if ((i & 0xF) == 0xF)
 456                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 457                         }
 458                         Result.WriteLine ("};");
 459                         Result.WriteLine ();
 460 #if Binary
 461                         using (FileStream fs = File.Create ("../collation.core.bin")) {
 462                                 byte [] array = ms.ToArray ();
 463                                 fs.Write (array, 0, array.Length);
 464                         }
 465 #endif
 466
 467                         // CJK
 468                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 469                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 470                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 471                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 472                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 473                 }
 474
 475                 void SerializeCJK (string name, ushort [] cjk, int max)
 476                 {
 477                         int offset = 0;//char.MaxValue - cjk.Length;
 478                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 479 #if Binary
 480                         MemoryStream ms = new MemoryStream ();
 481                         BinaryWriter binary = new BinaryWriter (ms);
 482                         binary.Write (cjk.Length);
 483 #endif
 484                         for (int i = 0; i < cjk.Length; i++) {
 485                                 if (i + offset == max)
 486                                         break;
 487                                 ushort value = cjk [i];
 488                                 if (value < 10)
 489                                         Result.Write ("{0},", value);
 490                                 else
 491                                         Result.Write ("0x{0:X04},", value);
 492 #if Binary
 493                                 binary.Write (value);
 494 #endif
 495                                 if ((i & 0xF) == 0xF)
 496                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 497                         }
 498                         Result.WriteLine ("};");
 499                         Result.WriteLine ();
 500 #if Binary
 501                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 502                                 byte [] array = ms.ToArray ();
 503                                 fs.Write (array, 0, array.Length);
 504                         }
 505 #endif
 506                 }
 507
 508                 void SerializeCJK (string name, byte [] cjk, int max)
 509                 {
 510                         int offset = 0;//char.MaxValue - cjk.Length;
 511                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 512 #if Binary
 513                         MemoryStream ms = new MemoryStream ();
 514                         BinaryWriter binary = new BinaryWriter (ms);
 515 #endif
 516                         for (int i = 0; i < cjk.Length; i++) {
 517                                 if (i + offset == max)
 518                                         break;
 519                                 byte value = cjk [i];
 520                                 if (value < 10)
 521                                         Result.Write ("{0},", value);
 522                                 else
 523                                         Result.Write ("0x{0:X02},", value);
 524 #if Binary
 525                                 binary.Write (value);
 526 #endif
 527                                 if ((i & 0xF) == 0xF)
 528                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 529                         }
 530                         Result.WriteLine ("};");
 531                         Result.WriteLine ();
 532 #if Binary
 533                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 534                                 byte [] array = ms.ToArray ();
 535                                 fs.Write (array, 0, array.Length);
 536                         }
 537 #endif
 538                 }
 539
 540                 void SerializeTailorings ()
 541                 {
 542                         Hashtable indexes = new Hashtable ();
 543                         Hashtable counts = new Hashtable ();
 544                         Result.WriteLine ("static char [] tailorings = new char [] {");
 545                         int count = 0;
 546 #if Binary
 547                         MemoryStream ms = new MemoryStream ();
 548                         BinaryWriter binary = new BinaryWriter (ms);
 549 #endif
 550                         foreach (Tailoring t in tailorings) {
 551                                 if (t.Alias != 0)
 552                                         continue;
 553                                 Result.Write ("/*{0}*/", t.LCID);
 554                                 indexes.Add (t.LCID, count);
 555                                 char [] values = t.ItemToCharArray ();
 556                                 counts.Add (t.LCID, values.Length);
 557                                 foreach (char c in values) {
 558                                         Result.Write ("'\\x{0:X}', ", (int) c);
 559                                         if (++count % 16 == 0)
 560                                                 Result.WriteLine (" // {0:X04}", count - 16);
 561 #if Binary
 562                                         binary.Write ((ushort) c);
 563 #endif
 564                                 }
 565                         }
 566                         Result.WriteLine ("};");
 567
 568                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 569 #if Binary
 570                         byte [] rawdata = ms.ToArray ();
 571                         ms = new MemoryStream ();
 572                         binary = new BinaryWriter (ms);
 573                         binary.Write (tailorings.Count);
 574 #endif
 575                         foreach (Tailoring t in tailorings) {
 576                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 577                                 if (!indexes.ContainsKey (target)) {
 578                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
 579                                         continue;
 580                                 }
 581                                 int idx = (int) indexes [target];
 582                                 int cnt = (int) counts [target];
 583                                 bool french = t.FrenchSort;
 584                                 if (t.Alias != 0)
 585                                         foreach (Tailoring t2 in tailorings)
 586                                                 if (t2.LCID == t.LCID)
 587                                                         french = t2.FrenchSort;
 588                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 589 #if Binary
 590                                 binary.Write (t.LCID);
 591                                 binary.Write (idx);
 592                                 binary.Write (cnt);
 593                                 binary.Write (french);
 594 #endif
 595                         }
 596                         Result.WriteLine ("};");
 597 #if Binary
 598                         binary.Write ((byte) 0xFF);
 599                         binary.Write ((byte) 0xFF);
 600                         binary.Write (rawdata.Length / 2);
 601                         binary.Write (rawdata, 0, rawdata.Length);
 602
 603
 604                         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
 605                                 byte [] array = ms.ToArray ();
 606                                 fs.Write (array, 0, array.Length);
 607                         }
 608 #endif
 609                 }
 610
 611                 #region Parse
 612
 613                 void ParseSources (string dirname)
 614                 {
 615                         string unidata =
 616                                 dirname + "/UnicodeData.txt";
 617                         string derivedCoreProps =
 618                                 dirname + "/DerivedCoreProperties.txt";
 619                         string scripts =
 620                                 dirname + "/Scripts.txt";
 621                         string cp932 =
 622                                 dirname + "/CP932.TXT";
 623                         string derivedAge =
 624                                 dirname + "/DerivedAge.txt";
 625                         string chXML = dirname + "/common/collation/zh.xml";
 626                         string jaXML = dirname + "/common/collation/ja.xml";
 627                         string koXML = dirname + "/common/collation/ko.xml";
 628
 629                         ParseDerivedAge (derivedAge);
 630
 631                         FillIgnorables ();
 632
 633                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 634                         ParseUnidata (unidata);
 635                         ModifyUnidata ();
 636                         ParseDerivedCoreProperties (derivedCoreProps);
 637                         ParseScripts (scripts);
 638                         ParseCJK (chXML, jaXML, koXML);
 639
 640                         ParseTailorings ("mono-tailoring-source.txt");
 641                 }
 642
 643                 void ParseTailorings (string filename)
 644                 {
 645                         Tailoring t = null;
 646                         int line = 0;
 647                         using (StreamReader sr = new StreamReader (filename)) {
 648                                 try {
 649                                         while (sr.Peek () >= 0) {
 650                                                 line++;
 651                                                 ProcessTailoringLine (ref t,
 652                                                         sr.ReadLine ().Trim ());
 653                                         }
 654                                 } catch (Exception) {
 655                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 656                                         throw;
 657                                 }
 658                         }
 659                 }
 660
 661                 // For now this is enough.
 662                 string ParseTailoringSourceValue (string s)
 663                 {
 664                         StringBuilder sb = new StringBuilder ();
 665                         for (int i = 0; i < s.Length; i++) {
 666                                 if (s.StartsWith ("\\u")) {
 667                                         sb.Append ((char) int.Parse (
 668                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 669                                                 1);
 670                                         i += 5;
 671                                 }
 672                         else
 673                                 sb.Append (s [i]);
 674                         }
 675                         return sb.ToString ();
 676                 }
 677
 678                 void ProcessTailoringLine (ref Tailoring t, string s)
 679                 {
 680                         int idx = s.IndexOf ('#');
 681                         if (idx > 0)
 682                                 s = s.Substring (0, idx).Trim ();
 683                         if (s.Length == 0 || s [0] == '#')
 684                                 return;
 685                         if (s [0] == '@') {
 686                                 idx = s.IndexOf ('=');
 687                                 if (idx > 0)
 688                                         t = new Tailoring (
 689                                                 int.Parse (s.Substring (1, idx - 1)),
 690                                                 int.Parse (s.Substring (idx + 1)));
 691                                 else
 692                                         t = new Tailoring (int.Parse (s.Substring (1)));
 693                                 tailorings.Add (t);
 694                                 return;
 695                         }
 696                         if (s.StartsWith ("*FrenchSort")) {
 697                                 t.FrenchSort = true;
 698                                 return;
 699                         }
 700                         string d = "*Diacritical";
 701                         if (s.StartsWith (d)) {
 702                                 idx = s.IndexOf ("->");
 703                                 t.AddDiacriticalMap (
 704                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 705                                                 NumberStyles.HexNumber),
 706                                         byte.Parse (s.Substring (idx + 2).Trim (),
 707                                                 NumberStyles.HexNumber));
 708                                 return;
 709                         }
 710                         idx = s.IndexOf (':');
 711                         if (idx > 0) {
 712                                 string source = s.Substring (0, idx).Trim ();
 713                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 714                                 byte [] b = new byte [4];
 715                                 for (int i = 0; i < 4; i++) {
 716                                         if (l [i] == "*")
 717                                                 b [i] = 0;
 718                                         else
 719                                                 b [i] = byte.Parse (l [i],
 720                                                         NumberStyles.HexNumber);
 721                                 }
 722                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 723                                         b);
 724                         }
 725                         idx = s.IndexOf ('=');
 726                         if (idx > 0)
 727                                 t.AddReplacementMap (
 728                                         ParseTailoringSourceValue (
 729                                                 s.Substring (0, idx).Trim ()),
 730                                         ParseTailoringSourceValue (
 731                                                 s.Substring (idx + 1).Trim ()));
 732                 }
 733
 734                 void ParseDerivedAge (string filename)
 735                 {
 736                         using (StreamReader file =
 737                                 new StreamReader (filename)) {
 738                                 while (file.Peek () >= 0) {
 739                                         string s = file.ReadLine ();
 740                                         int idx = s.IndexOf ('#');
 741                                         if (idx >= 0)
 742                                                 s = s.Substring (0, idx);
 743                                         idx = s.IndexOf (';');
 744                                         if (idx < 0)
 745                                                 continue;
 746
 747                                         string cpspec = s.Substring (0, idx);
 748                                         idx = cpspec.IndexOf ("..");
 749                                         NumberStyles nf = NumberStyles.HexNumber |
 750                                                 NumberStyles.AllowTrailingWhite;
 751                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 752                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 753                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 754
 755                                         // FIXME: use index
 756                                         if (cp > char.MaxValue)
 757                                                 continue;
 758
 759                                         double v = double.Parse (value);
 760                                         for (int i = cp; i <= cpEnd; i++)
 761                                                 unicodeAge [i] = v;
 762                                 }
 763                         }
 764                         unicodeAge [0] = double.MaxValue; // never be supported
 765                 }
 766
 767                 void ParseUnidata (string filename)
 768                 {
 769                         ArrayList decompValues = new ArrayList ();
 770                         using (StreamReader unidata =
 771                                 new StreamReader (filename)) {
 772                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 773                                         try {
 774                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 775                                         } catch (Exception) {
 776                                                 Console.Error.WriteLine ("**** At line " + line);
 777                                                 throw;
 778                                         }
 779                                 }
 780                         }
 781                         this.decompValues = (int [])
 782                                 decompValues.ToArray (typeof (int));
 783                 }
 784
                // State kept between calls of ProcessUnidataLine (): the
                // character most recently picked as Latin mapping target from a
                // character name. It is reused for later LATIN lines for which
                // no target could be determined, and is set back to
                // char.MinValue once 'Z' has been picked.
                char previousLatinTarget = char.MinValue;
                // One counter per letter 'A'..'Z', indexed by (letter - 'A').
                // ProcessUnidataLine () increments a counter and stores
                // (counter + 0x7C) as the diacritical weight of a Latin
                // character that has none yet.
                byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
 787
 788                 void ProcessUnidataLine (string s, ArrayList decompValues)
 789                 {
 790                         int idx = s.IndexOf ('#');
 791                         if (idx >= 0)
 792                                 s = s.Substring (0, idx);
 793                         idx = s.IndexOf (';');
 794                         if (idx < 0)
 795                                 return;
 796                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 797                         string [] values = s.Substring (idx + 1).Split (';');
 798
 799                         // FIXME: use index
 800                         if (cp > char.MaxValue)
 801                                 return;
 802                         if (IsIgnorable (cp))
 803                                 return;
 804
 805                         string name = values [0];
 806
 807                         // SPECIAL CASE: rename some characters for diacritical
 808                         // remapping. FIXME: why are they different?
 809                         // FIXME: it's still not working.
 810                         if (cp == 0x018B || cp == 0x018C)
 811                                 name = name.Replace ("TOPBAR", "STROKE");
 812
 813                         // isSmallCapital
 814                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 815                                 isSmallCapital [cp] = true;
 816
 817                         // latin mapping by character name
 818                         if (s.IndexOf ("LATIN") >= 0) {
 819                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 820                                 int offset = lidx + 15;
 821                                 if (lidx < 0) {
 822                                         lidx = s.IndexOf ("LETTER TURNED ");
 823                                         offset = lidx + 14;
 824                                 }
 825                                 if (lidx < 0) {
 826                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 827                                         offset = lidx + 15;
 828                                 }
 829                                 if (lidx < 0) {
 830                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 831                                         offset = lidx + 14;
 832                                 }
 833                                 if (lidx < 0) {
 834                                         lidx = s.IndexOf ("LETTER ");
 835                                         offset = lidx + 7;
 836                                 }
 837                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 838                                 char n = s [offset + 1];
 839                                 char target = char.MinValue;
 840                                 if ('A' <= c && c <= 'Z' &&
 841                                         (n == ' ') || n == ';') {
 842                                         target = c;
 843                                         // FIXME: After 'Z', I cannot reset this state.
 844                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 845                                 }
 846
 847                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 848                                         target = 'A';
 849                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 850                                         target = 'B';
 851                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 852                                         target = 'C';
 853                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 854                                         target = 'E';
 855                                 else if (s.Substring (offset).StartsWith ("ENG"))
 856                                         target = 'N';
 857                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 858                                         target = 'O';
 859                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 860                                         target = 'R';
 861                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 862                                         target = 'S';
 863                                 else if (s.Substring (offset).StartsWith ("ESH"))
 864                                         target = 'S';
 865
 866                                 if (target == char.MinValue)
 867                                         target = previousLatinTarget;
 868
 869                                 if (target != char.MinValue) {
 870                                         ArrayList entry = (ArrayList) latinMap [target];
 871                                         if (entry == null) {
 872                                                 entry = new ArrayList ();
 873                                                 latinMap [target] = entry;
 874                                         }
 875                                         entry.Add (cp);
 876                                         // FIXME: This secondary weight is hack.
 877                                         // They are here because they must not
 878                                         // be identical to the corresponding
 879                                         // ASCII latins.
 880                                         if (c != target && diacritical [cp] == 0) {
 881                                                 diacriticalOffset [c - 'A']++;
 882                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
 883                                         }
 884                                 }
 885                         }
 886
 887                         // Arrow names
 888                         if (0x2000 <= cp && cp < 0x3000) {
 889                                 int value = 0;
 890                                 // SPECIAL CASES. FIXME: why?
 891                                 switch (cp) {
 892                                 case 0x21C5: value = -1; break; // E2
 893                                 case 0x261D: value = 1; break;
 894                                 case 0x27A6: value = 3; break;
 895                                 case 0x21B0: value = 7; break;
 896                                 case 0x21B1: value = 3; break;
 897                                 case 0x21B2: value = 7; break;
 898                                 case 0x21B4: value = 5; break;
 899                                 case 0x21B5: value = 7; break;
 900                                 case 0x21B9: value = -1; break; // E1
 901                                 case 0x21CF: value = 7; break;
 902                                 case 0x21D0: value = 3; break;
 903                                 }
 904                                 string [] arrowTargets = new string [] {
 905                                         "",
 906                                         "UPWARDS",
 907                                         "NORTH EAST",
 908                                         "RIGHTWARDS",
 909                                         "SOUTH EAST",
 910                                         "DOWNWARDS",
 911                                         "SOUTH WEST",
 912                                         "LEFTWARDS",
 913                                         "NORTH WEST",
 914                                         };
 915                                 if (value == 0)
 916                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 917                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 918                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 919                                                         s.IndexOf (" OVER") < 0
 920                                                 )
 921                                                         value = i;
 922                                 if (value > 0)
 923                                         arrowValues.Add (new DictionaryEntry (
 924                                                 cp, value));
 925                         }
 926
 927                         // Box names
 928                         if (0x2500 <= cp && cp < 0x2600) {
 929                                 int value = 0;
 930                                 // flags:
 931                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 932                                 // [h,rl] [r] [l]
 933                                 // [v,ud] [u] [d]
 934                                 // [dr] [dl] [ur] [ul]
 935                                 // [vr,udr] [vl,vdl]
 936                                 // [hd,rld] [hu,rlu]
 937                                 // [hv,udrl,rlv,udh]
 938                                 ArrayList flags = new ArrayList (new int [] {
 939                                         32, 8 + 4, 8, 4,
 940                                         16, 1 + 2, 1, 2,
 941                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 942                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 943                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 944                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 945                                         });
 946                                 byte [] offsets = new byte [] {
 947                                         0, 0, 1, 2,
 948                                         3, 3, 4, 5,
 949                                         6, 7, 8, 9,
 950                                         10, 10, 11, 11,
 951                                         12, 12, 13, 13,
 952                                         14, 14, 14, 14};
 953                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
 954                                         int flag = 0;
 955                                         if (s.IndexOf (" UP") >= 0)
 956                                                 flag |= 1;
 957                                         if (s.IndexOf (" DOWN") >= 0)
 958                                                 flag |= 2;
 959                                         if (s.IndexOf (" RIGHT") >= 0)
 960                                                 flag |= 4;
 961                                         if (s.IndexOf (" LEFT") >= 0)
 962                                                 flag |= 8;
 963                                         if (s.IndexOf (" VERTICAL") >= 0)
 964                                                 flag |= 16;
 965                                         if (s.IndexOf (" HORIZONTAL") >= 0)
 966                                                 flag |= 32;
 967
 968                                         int fidx = flags.IndexOf (flag);
 969                                         value = fidx < 0 ? fidx : offsets [fidx];
 970                                 } else if (s.IndexOf ("BLOCK") >= 0) {
 971                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
 972                                                 value = 0x12;
 973                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
 974                                                 value = 0x13;
 975                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
 976                                                 value = 0x14;
 977                                         else if (s.IndexOf ("HALF") >= 0)
 978                                                 value = 0x15;
 979                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
 980                                                 value = 0x16;
 981                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
 982                                                 value = 0x17;
 983                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
 984                                                 value = 0x18;
 985                                         else
 986                                                 value = 0x19;
 987                                 }
 988                                 else if (s.IndexOf ("SHADE") >= 0)
 989                                         value = 0x19;
 990                                 else if (s.IndexOf ("SQUARE") >= 0)
 991                                         value = 0xBC - 0xE5;
 992                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
 993                                         value = 0xBE - 0xE5;
 994                                 else if (s.IndexOf ("RECTANGLE") >= 0)
 995                                         value = 0xBD - 0xE5;
 996                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
 997                                         value = 0xBF - 0xE5;
 998                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
 999                                         if (s.IndexOf ("UP-POINTING") >= 0)
1000                                                 value = 0xC0 - 0xE5;
1001                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1002                                                 value = 0xC1 - 0xE5;
1003                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1004                                                 value = 0xC2 - 0xE5;
1005                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1006                                                 value = 0xC3 - 0xE5;
1007                                 }
1008                                 else if (s.IndexOf ("POINTER") >= 0) {
1009                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1010                                                 value = 0xC4 - 0xE5;
1011                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1012                                                 value = 0xC5 - 0xE5;
1013                                 }
1014                                 else if (s.IndexOf ("DIAMOND") >= 0)
1015                                         value = 0xC6 - 0xE5;
1016                                 else if (s.IndexOf ("FISHEYE") >= 0)
1017                                         value = 0xC7 - 0xE5;
1018                                 else if (s.IndexOf ("LOZENGE") >= 0)
1019                                         value = 0xC8 - 0xE5;
1020                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1021                                         value = 0xC9 - 0xE5;
1022                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1023                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1024                                                 value = 0xCA - 0xE5;
1025                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1026                                                 value = 0xCB - 0xE5;
1027                                         else
1028                                                 value = 0xC9 - 0xE5;
1029                                 }
1030                                 if (0x25DA <= cp && cp <= 0x25E5)
1031                                         value = 0xCD + cp - 0x25DA - 0xE5;
1032
1033                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1034                                 switch (cp) {
1035                                 case 0x2571: value = 0xF; break;
1036                                 case 0x2572: value = 0x10; break;
1037                                 case 0x2573: value = 0x11; break;
1038                                 }
1039                                 if (value != 0)
1040                                         boxValues.Add (new DictionaryEntry (
1041                                                 cp, value));
1042                         }
1043
1044                         // For some characters store the name and sort later
1045                         // to determine sorting.
1046                         if (0x2100 <= cp && cp <= 0x213F &&
1047                                 Char.IsSymbol ((char) cp))
1048                                 sortableCharNames.Add (
1049                                         new DictionaryEntry (cp, name));
1050                         else if (0x3380 <= cp && cp <= 0x33DD)
1051                                 sortableCharNames.Add (new DictionaryEntry (
1052                                         cp, name.Substring (7)));
1053
1054                         if (Char.GetUnicodeCategory ((char) cp) ==
1055                                 UnicodeCategory.MathSymbol) {
1056                                 if (name.StartsWith ("CIRCLED "))
1057                                         diacritical [cp] = 0xEE;
1058                                 if (name.StartsWith ("SQUARED "))
1059                                         diacritical [cp] = 0xEF;
1060                         }
1061
1062                         // diacritical weights by character name
1063 if (diacritics.Length != diacriticWeights.Length)
1064 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1065                         for (int d = 0; d < diacritics.Length; d++) {
1066                                 if (s.IndexOf (diacritics [d]) > 0) {
1067                                         diacritical [cp] += diacriticWeights [d];
1068                                         if (s.IndexOf ("COMBINING") >= 0)
1069                                                 diacritical [cp] -= (byte) 2;
1070                                         continue;
1071                                 }
1072                                 // also process "COMBINING blah" here
1073                                 // For now it is limited to cp < 0x0370
1074 //                              if (cp < 0x0300 || cp >= 0x0370)
1075 //                                      continue;
1076                                 string tmp = diacritics [d].TrimEnd (';');
1077                                 if (tmp.IndexOf ("WITH ") == 0)
1078                                         tmp = tmp.Substring (4);
1079                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1080                                 if (name == tmp)
1081                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1082 //if (name == tmp)
1083 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1084                         }
1085                         // Two-step grep required for it.
1086                         if (s.IndexOf ("FULL STOP") > 0 &&
1087                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1088                                 diacritical [cp] |= 0xF4;
1089
1090                         // Arabic letter name
1091                         if (0x0621 <= cp && cp <= 0x064A &&
1092                                 Char.GetUnicodeCategory ((char) cp)
1093                                 == UnicodeCategory.OtherLetter) {
1094                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1095                                 switch (cp) {
1096                                 case 0x0621:
1097                                 case 0x0624:
1098                                 case 0x0626:
1099                                         // hamza, waw, yeh ... special cases.
1100                                         value = 0x07;
1101                                         break;
1102                                 case 0x0649:
1103                                 case 0x064A:
1104                                         value = 0x77; // special cases.
1105                                         break;
1106                                 default:
1107                                         // Get primary letter name i.e.
1108                                         // XXX part of ARABIC LETTER XXX yyy
1109                                         // e.g. that of "TEH MARBUTA" is "TEH".
1110                                         string letterName =
1111                                                 (cp == 0x0640) ?
1112                                                 // 0x0640 is special: it does
1113                                                 // not start with ARABIC LETTER
1114                                                 name :
1115                                                 name.Substring (14);
1116                                         int tmpIdx = letterName.IndexOf (' ');
1117                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1118 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1119                                         if (arabicNameMap.ContainsKey (letterName))
1120                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1121                                         else
1122                                                 arabicNameMap [letterName] = cp;
1123                                         break;
1124                                 }
1125                                 arabicLetterPrimaryValues [cp] = value;
1126                         }
1127
1128                         // Japanese square letter
1129                         if (0x3300 <= cp && cp <= 0x3357)
1130                                 if (!ExistsJIS (cp))
1131                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1132
1133                         // normalizationType
1134                         string decomp = values [4];
1135                         idx = decomp.IndexOf ('<');
1136                         if (idx >= 0) {
1137                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1138                                 case "full":
1139                                         decompType [cp] = DecompositionFull;
1140                                         break;
1141                                 case "sub":
1142                                         decompType [cp] = DecompositionSub;
1143                                         break;
1144                                 case "super":
1145                                         decompType [cp] = DecompositionSuper;
1146                                         break;
1147                                 case "small":
1148                                         decompType [cp] = DecompositionSmall;
1149                                         break;
1150                                 case "isolated":
1151                                         decompType [cp] = DecompositionIsolated;
1152                                         break;
1153                                 case "initial":
1154                                         decompType [cp] = DecompositionInitial;
1155                                         break;
1156                                 case "final":
1157                                         decompType [cp] = DecompositionFinal;
1158                                         break;
1159                                 case "medial":
1160                                         decompType [cp] = DecompositionMedial;
1161                                         break;
1162                                 case "noBreak":
1163                                         decompType [cp] = DecompositionNoBreak;
1164                                         break;
1165                                 case "compat":
1166                                         decompType [cp] = DecompositionCompat;
1167                                         break;
1168                                 case "fraction":
1169                                         decompType [cp] = DecompositionFraction;
1170                                         break;
1171                                 case "font":
1172                                         decompType [cp] = DecompositionFont;
1173                                         break;
1174                                 case "circle":
1175                                         decompType [cp] = DecompositionCircle;
1176                                         break;
1177                                 case "square":
1178                                         decompType [cp] = DecompositionSquare;
1179                                         break;
1180                                 case "wide":
1181                                         decompType [cp] = DecompositionWide;
1182                                         break;
1183                                 case "narrow":
1184                                         decompType [cp] = DecompositionNarrow;
1185                                         break;
1186                                 case "vertical":
1187                                         decompType [cp] = DecompositionVertical;
1188                                         break;
1189                                 default:
1190                                         throw new Exception ("Support NFKD type : " + decomp);
1191                                 }
1192                         }
1193                         else
1194                                 decompType [cp] = DecompositionCanonical;
1195                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1196                         if (decomp.Length > 0) {
1197
1198                                 string [] velems = decomp.Split (' ');
1199                                 int didx = decompValues.Count;
1200                                 decompIndex [cp] = didx;
1201                                 foreach (string v in velems)
1202                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1203                                 decompLength [cp] = velems.Length;
1204
1205                                 // [decmpType] -> this_cp
1206                                 int targetCP = (int) decompValues [didx];
1207                                 // for "(x)" it specially maps to 'x' .
1208                                 // FIXME: check if it is sane
1209                                 if (velems.Length == 3 &&
1210                                         (int) decompValues [didx] == '(' &&
1211                                         (int) decompValues [didx + 2] == ')')
1212                                         targetCP = (int) decompValues [didx + 1];
1213                                 // special: 0x215F "1/"
1214                                 else if (cp == 0x215F)
1215                                         targetCP = '1';
1216                                 else if (velems.Length > 1 &&
1217                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1218                                         // skip them, except for CJK ideograph compat
1219                                         targetCP = 0;
1220
1221                                 if (targetCP != 0) {
1222                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1223                                         if (entry == null) {
1224                                                 entry = new Hashtable ();
1225                                                 nfkdMap [targetCP] = entry;
1226                                         }
1227                                         entry [(byte) decompType [cp]] = cp;
1228                                 }
1229                         }
1230                         // numeric values
1231                         if (values [5].Length > 0)
1232                                 decimalValue [cp] = decimal.Parse (values [5]);
1233                         else if (values [6].Length > 0)
1234                                 decimalValue [cp] = decimal.Parse (values [6]);
1235                         else if (values [7].Length > 0) {
1236                                 string decstr = values [7];
1237                                 idx = decstr.IndexOf ('/');
1238                                 if (cp == 0x215F) // special. "1/"
1239                                         decimalValue [cp] = 0x1;
1240                                 else if (idx > 0)
1241                                         // m/n
1242                                         decimalValue [cp] =
1243                                                 decimal.Parse (decstr.Substring (0, idx))
1244                                                 / decimal.Parse (decstr.Substring (idx + 1));
1245                                 else if (decstr [0] == '(' &&
1246                                         decstr [decstr.Length - 1] == ')')
1247                                         // (n)
1248                                         decimalValue [cp] =
1249                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1250                                 else if (decstr [decstr.Length - 1] == '.')
1251                                         // n.
1252                                         decimalValue [cp] =
1253                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1254                                 else
1255                                         decimalValue [cp] = decimal.Parse (decstr);
1256                         }
1257                 }
1258
// Reads the derived-core-properties data file named by `filename` and
// forwards each line to ProcessDerivedCorePropLine.
// If a line cannot be processed, its 1-based line number is written to
// stderr and the original exception is rethrown.
void ParseDerivedCoreProperties (string filename)
{
        using (StreamReader reader = new StreamReader (filename)) {
                int lineNumber = 1;
                while (reader.Peek () >= 0) {
                        try {
                                ProcessDerivedCorePropLine (reader.ReadLine ());
                        } catch (Exception) {
                                // Report where parsing stopped, then let the
                                // failure propagate untouched.
                                Console.Error.WriteLine ("**** At line " + lineNumber);
                                throw;
                        }
                        lineNumber++;
                }
        }
}
1274
// Handles a single line of the derived-core-properties data file.
// Expected shape: "XXXX[..YYYY] ; PropertyName # comment".
//  - everything from '#' onwards is discarded;
//  - lines without ';' are ignored;
//  - entries whose first code point is above char.MaxValue are ignored;
//  - only the "Uppercase" property is recorded (in isUppercase), every
//    other property name is parsed and then dropped.
void ProcessDerivedCorePropLine (string s)
{
        int commentStart = s.IndexOf ('#');
        string content = commentStart < 0 ? s : s.Substring (0, commentStart);

        int separator = content.IndexOf (';');
        if (separator < 0)
                return;

        string range = content.Substring (0, separator);
        NumberStyles hex = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;

        // A range is either a single code point or "first..last".
        int dots = range.IndexOf ("..");
        int first;
        int last;
        if (dots < 0) {
                first = int.Parse (range, hex);
                last = first;
        } else {
                first = int.Parse (range.Substring (0, dots), hex);
                last = int.Parse (range.Substring (dots + 2), hex);
        }
        string property = content.Substring (range.Length + 1).Trim ();

        // FIXME: use index
        if (first > char.MaxValue)
                return;

        if (property == "Uppercase") {
                for (int cp = first; cp <= last; cp++)
                        isUppercase [cp] = true;
        }
}
1302
// Collects the characters of the Gurmukhi, Gujarati, Georgian and Thaana
// scripts from a script-assignment data file whose lines look like
// "XXXX[..YYYY] ; ScriptName # comment".
// Characters for which IsIgnorable() returns true are left out, as are
// entries whose first code point is above char.MaxValue.
// Each collected set is sorted with UCAComparer.Instance and stored in
// orderedGurmukhi / orderedGujarati / orderedGeorgian / orderedThaana.
void ParseScripts (string filename)
{
        ArrayList gurmukhi = new ArrayList ();
        ArrayList gujarati = new ArrayList ();
        ArrayList georgian = new ArrayList ();
        ArrayList thaana = new ArrayList ();

        NumberStyles hex = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;

        using (StreamReader reader = new StreamReader (filename)) {
                while (reader.Peek () >= 0) {
                        string text = reader.ReadLine ();

                        // Drop the trailing comment, then require a ';'.
                        int commentStart = text.IndexOf ('#');
                        if (commentStart >= 0)
                                text = text.Substring (0, commentStart);
                        int separator = text.IndexOf (';');
                        if (separator < 0)
                                continue;

                        // Either a single code point or "first..last".
                        string range = text.Substring (0, separator);
                        int dots = range.IndexOf ("..");
                        int first;
                        int last;
                        if (dots < 0) {
                                first = int.Parse (range, hex);
                                last = first;
                        } else {
                                first = int.Parse (range.Substring (0, dots), hex);
                                last = int.Parse (range.Substring (dots + 2), hex);
                        }
                        string script = text.Substring (range.Length + 1).Trim ();

                        // FIXME: use index
                        if (first > char.MaxValue)
                                continue;

                        // Pick the list for this script; other scripts
                        // are of no interest here.
                        ArrayList bucket;
                        switch (script) {
                        case "Gurmukhi":
                                bucket = gurmukhi;
                                break;
                        case "Gujarati":
                                bucket = gujarati;
                                break;
                        case "Georgian":
                                bucket = georgian;
                                break;
                        case "Thaana":
                                bucket = thaana;
                                break;
                        default:
                                continue;
                        }

                        for (int x = first; x <= last; x++) {
                                if (IsIgnorable (x))
                                        continue;
                                bucket.Add ((char) x);
                        }
                }
        }

        gurmukhi.Sort (UCAComparer.Instance);
        gujarati.Sort (UCAComparer.Instance);
        georgian.Sort (UCAComparer.Instance);
        thaana.Sort (UCAComparer.Instance);

        orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
        orderedGujarati = (char []) gujarati.ToArray (typeof (char));
        orderedGeorgian = (char []) georgian.ToArray (typeof (char));
        orderedThaana = (char []) thaana.ToArray (typeof (char));
}
1366
// Reads the JIS ordering table named by `filename`, passing each line to
// ProcessJISOrderLine.
// On any failure (including failure to open the file) the current 1-based
// line counter is written to stderr and the exception is rethrown.
void ParseJISOrder (string filename)
{
        int lineNumber = 1;
        try {
                using (StreamReader reader = new StreamReader (filename)) {
                        while (reader.Peek () >= 0) {
                                ProcessJISOrderLine (reader.ReadLine ());
                                lineNumber++;
                        }
                }
        } catch (Exception) {
                Console.Error.WriteLine ("---- line {0}", lineNumber);
                throw;
        }
}
1381
// Column separators (tab or space) looked for by ProcessJISOrderLine.
char [] ws = new char [] {'\t', ' '};

// Handles one line of the JIS ordering table.
// Expected shape: "0xJJJJ<ws>0xUUUU   # comment", i.e. a JIS code followed
// by the corresponding Unicode code point, both written with a "0x"
// prefix.  Blank lines, comment-only lines and lines with a single column
// are ignored.  Every accepted pair is appended to jisJapanese.
// Malformed hex digits still raise the int.Parse exception, which the
// caller reports together with the line number.
void ProcessJISOrderLine (string s)
{
        int idx = s.IndexOf ('#');
        if (idx >= 0)
                s = s.Substring (0, idx);
        // Trim unconditionally.  Previously the line was trimmed only when
        // it contained a comment, so a whitespace-only or indented line
        // without '#' found a separator at index 0 or 1 and made
        // Substring (2, idx - 2) below throw ArgumentOutOfRangeException.
        s = s.Trim ();
        if (s.Length == 0)
                return;
        idx = s.IndexOfAny (ws);
        if (idx < 0)
                // Only one column: no Unicode mapping on this line.
                return;
        // They start with "0x" so cut them out.
        int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
        int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
        jisJapanese.Add (new JISCharacter (cp, jis));
}
1399
// Fills the culture-specific CJK weight tables (cjkCHS, cjkCHT, cjkJA,
// cjkKO and cjkKOlv2).
//   zhXML - LDML document providing both the 'pinyin' collation (used for
//           Chinese Simplified) and the 'stroke' collation (used for
//           Chinese Traditional).
//   jaXML - not read by this method; the Japanese order is taken from the
//           jisJapanese list instead.
//   koXML - LDML document whose <reset> rules place ideographs after a
//           Hangul syllable.
// For chs/cht/ja the weights are handed out sequentially; whenever the
// counter reaches a multiple of 256 it is bumped by 2.
void ParseCJK (string zhXML, string jaXML, string koXML)
{
        XmlDocument ldml = new XmlDocument ();
        ldml.XmlResolver = null;
        int weight;

        // Chinese Simplified
        ldml.Load (zhXML);
        string pinyinOrder = ldml.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
        weight = 0x8008;
        foreach (char c in pinyinOrder) {
                if (c < '\u3100') {
                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", "chs", (int) c, weight);
                        continue;
                }
                cjkCHS [(int) c] = (ushort) weight++;
                if (weight % 256 == 0)
                        weight += 2;
        }

        // Chinese Traditional (same document, 'stroke' collation)
        string strokeOrder = ldml.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
        weight = 0x8002;
        foreach (char c in strokeOrder) {
                if (c < '\u4E00') {
                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", "cht", (int) c, weight);
                        continue;
                }
                cjkCHT [(int) c] = (ushort) weight++;
                if (weight % 256 == 0)
                        weight += 2;
        }

        // Japanese

        // SPECIAL CASES
        cjkJA [0x4EDD] = 0x8002; // Chinese repetition mark?
        cjkJA [0x337B] = 0x8004; // Those 4 characters are Gengou
        cjkJA [0x337E] = 0x8005;
        cjkJA [0x337D] = 0x8006;
        cjkJA [0x337C] = 0x8007;

        // FIXME: there are still remaining characters after U+FA0C
        // (the scan below stops short of char.MaxValue).
        const int scanLimit = 0xFA0D;

        weight = 0x8008;
        foreach (JISCharacter jc in jisJapanese) {
                if (jc.JIS < 0x8800)
                        continue;
                char kanji = (char) jc.CP;

                if (kanji < '\u4E00') {
                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", "ja", (int) kanji, weight);
                        continue;
                }

                cjkJA [(int) kanji] = (ushort) weight++;
                if (weight % 256 == 0)
                        weight += 2;

                // SPECIAL CASES: these get their own weight but their
                // compatibility forms are not placed after them.
                switch (kanji) {
                case '\u662D': // U+337C
                case '\u5927': // U+337D
                case '\u5E73': // U+337B
                case '\u660E': // U+337E
                case '\u9686': // U+F9DC
                        continue;
                }

                // Give the following weights to every character whose
                // decomposition starts with this kanji, or whose
                // three-element decomposition has it in the middle.
                for (int k = 0; k < scanLimit; k++) {
                        int start = decompIndex [k];
                        if (start == 0 || IsIgnorable (k))
                                continue;
                        bool related = decompValues [start] == kanji ||
                                (decompLength [k] == 3 &&
                                 decompValues [start + 1] == kanji);
                        if (!related)
                                continue;
                        cjkJA [k] = (ushort) weight++;
                        if (weight % 256 == 0)
                                weight += 2;
                }
        }

        // Korean
        // Korean weight is somewhat complex. It first shifts
        // Hangul category from 52-x to 80-x (they are anyway
        // computed). CJK ideographs are placed at secondary
        // weight, like XX YY 01 zz 01, where XX and YY are the
        // corresponding "reset" value and zz is 41,43,45...
        //
        // Unlike chs, cht and ja, the Korean value is a combined
        // ushort computed from the reset syllable.
        ldml.Load (koXML);
        foreach (XmlElement reset in ldml.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                XmlElement followers = (XmlElement) reset.NextSibling;
                // compute "category" and "level 1" for the
                // target "reset" Hangul syllable
                char syllable = reset.InnerText [0];
                int ordinal = ((int) syllable - 0xAC00) + 1;
                ushort primary = (ushort)
                        ((ordinal / 254) * 256 + (ordinal % 254) + 2);
                // Place the characters after the target: same primary
                // value, secondary values 0x41, 0x43, 0x45, ...
                string placed = followers.InnerText;
                int secondary = 0x41;
                for (int i = 0; i < placed.Length; i++) {
                        int target = (int) placed [i];
                        cjkKO [target] = primary;
                        cjkKOlv2 [target] = (byte) secondary;
                        secondary += 2;
                }
        }
}
1530
1531                 #endregion
1532
1533                 #region Generation
1534
// Records the three "ignorable" predicates in ignorableFlags for every
// BMP code point that Char.GetUnicodeCategory does not report as
// OtherNotAssigned:
//   1 - IsIgnorable
//   2 - IsIgnorableSymbol
//   4 - IsIgnorableNonSpacing
void FillIgnorables ()
{
        for (int cp = 0; cp <= char.MaxValue; cp++) {
                UnicodeCategory category =
                        Char.GetUnicodeCategory ((char) cp);
                if (category != UnicodeCategory.OtherNotAssigned) {
                        if (IsIgnorable (cp))
                                ignorableFlags [cp] |= 1;
                        if (IsIgnorableSymbol (cp))
                                ignorableFlags [cp] |= 2;
                        if (IsIgnorableNonSpacing (cp))
                                ignorableFlags [cp] |= 4;
                }
        }
}
1549
// Applies hand-written corrections to the parsed decomposition data
// (decompType / decompIndex / decompLength / decompValues) and to the
// diacritical table.  The statements are order-sensitive: several of them
// write through decompIndex, and one entry's slot is reassigned to
// another code point.
void ModifyUnidata ()
{
        // Modify some decomposition equivalence: U+FE31 and U+FE32 lose
        // their decomposition entirely.
        for (int cp = 0xFE31; cp <= 0xFE32; cp++) {
                decompType [cp] = 0;
                decompIndex [cp] = 0;
                decompLength [cp] = 0;
        }

        // Korean parens numbers
        for (int cp = 0x3200; cp <= 0x321C; cp++)
                diacritical [cp] = 0xA;
        for (int cp = 0x3260; cp <= 0x327B; cp++)
                diacritical [cp] = 0xC;

        // LAMESPEC: these remapping should not be done.
        // Windows have incorrect CJK compat mappings.
        int slot;
        slot = decompIndex [0x32A9];
        decompValues [slot] = 0x91AB;

        decompLength [0x323B] = 1;
        slot = decompIndex [0x323B];
        decompValues [slot] = 0x5B78;
        slot = decompIndex [0x32AB];
        decompValues [slot] = 0x5B78;

        slot = decompIndex [0x32A2];
        decompValues [slot] = 0x5BEB;

        decompLength [0x3238] = 1;
        slot = decompIndex [0x3238];
        decompValues [slot] = 0x52DE;
        slot = decompIndex [0x3298];
        decompValues [slot] = 0x52DE;

        // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
        // U+FA0C borrows the room of U+F929 (being empty), and U+F929
        // is then left without a decomposition.
        int borrowed = decompIndex [0xF929];
        decompIndex [0xFA0C] = borrowed;
        decompValues [borrowed] = 0x5140;
        decompLength [0xFA0C] = 1;
        decompLength [0xF929] = 0;
        decompIndex [0xF929] = 0;

        slot = decompIndex [0xF92C];
        decompValues [slot] = 0x90DE;
}
1585
// Post-processes values gathered while parsing:
//  1. numberSecondaryWeightBounds is read as consecutive
//     (start, end) pairs; every code point in [start, end) for which
//     Char.IsNumber is true gets a diacritical weight.  The weight starts
//     at 0x38 and grows by one per pair.
//  2. A few entries of sortableCharNames are renamed and some are
//     removed altogether.
void ModifyParsedValues ()
{
        // number, secondary weights
        byte weight = 0x38;
        int [] bounds = numberSecondaryWeightBounds;
        int pairStart = 0;
        while (pairStart < bounds.Length) {
                for (int cp = bounds [pairStart]; cp < bounds [pairStart + 1]; cp++) {
                        if (!Char.IsNumber ((char) cp))
                                continue;
                        diacritical [cp] = weight;
                }
                pairStart += 2;
                weight++;
        }

        // Update name part of named characters
        int index = 0;
        while (index < sortableCharNames.Count) {
                DictionaryEntry entry =
                        (DictionaryEntry) sortableCharNames [index];
                int cp = (int) entry.Key;
                string renamed = null;
                bool remove = false;
                switch (cp) {
                case 0x2101: renamed = "A_1"; break;
                case 0x33C3: renamed = "A_2"; break;
                case 0x2105: renamed = "C_1"; break;
                case 0x2106: renamed = "C_2"; break;
                case 0x211E: renamed = "R1"; break;
                case 0x211F: renamed = "R2"; break;
                // Remove some of them!
                case 0x2103:
                case 0x2109:
                case 0x2116:
                case 0x2117:
                case 0x2118:
                case 0x2125:
                case 0x2127:
                case 0x2129:
                case 0x212E:
                case 0x2132:
                        remove = true;
                        break;
                }

                if (remove) {
                        // The next entry slides into this position, so
                        // the index must not advance.
                        sortableCharNames.RemoveAt (index);
                        continue;
                }
                if (renamed != null)
                        sortableCharNames [index] =
                                new DictionaryEntry (cp, renamed);
                index++;
        }
}
1629
1630                 void GenerateCore ()
1631                 {
1632                         UnicodeCategory uc;
1633
1634                         #region Specially ignored // 01
1635                         // This will raise "Defined" flag up.
1636                         foreach (char c in specialIgnore)
1637                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1638                         #endregion
1639
1640
1641                         #region Variable weights
1642                         // Controls : 06 03 - 06 3D
1643                         fillIndex [6] = 3;
1644                         for (int i = 0; i < 65536; i++) {
1645                                 if (IsIgnorable (i))
1646                                         continue;
1647                                 char c = (char) i;
1648                                 uc = Char.GetUnicodeCategory (c);
1649                                 // NEL is whitespace but not ignored here.
1650                                 if (uc == UnicodeCategory.Control &&
1651                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1652                                         AddCharMap (c, 6, 1);
1653                         }
1654
1655                         // Apostrophe 06 80
1656                         fillIndex [6] = 0x80;
1657                         AddCharMapGroup ('\'', 6, 1, 0);
1658                         AddCharMap ('\uFE63', 6, 1);
1659
1660                         // Hyphen/Dash : 06 81 - 06 90
1661                         for (int i = 0; i < char.MaxValue; i++) {
1662                                 if (!IsIgnorable (i) &&
1663                                         Char.GetUnicodeCategory ((char) i) ==
1664                                         UnicodeCategory.DashPunctuation) {
1665                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1666                                         if (i == 0x2011) {
1667                                                 // SPECIAL: add 2027 and 2043
1668                                                 // Maybe they are regarded the
1669                                                 // same hyphens in "central"
1670                                                 // position.
1671                                                 AddCharMap ('\u2027', 6, 1);
1672                                                 AddCharMap ('\u2043', 6, 1);
1673                                         }
1674                                 }
1675                         }
1676
1677                         // Arabic variable weight chars 06 A0 -
1678                         fillIndex [6] = 0xA0;
1679                         // vowels
1680                         for (int i = 0x64B; i <= 0x650; i++)
1681                                 AddArabicCharMap ((char) i);
1682                         // sukun
1683                         AddCharMapGroup ('\u0652', 6, 1, 0);
1684                         // shadda
1685                         AddCharMapGroup ('\u0651', 6, 1, 0);
1686                         #endregion
1687
1688
1689                         #region Nonspacing marks // 01
1690                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1691
1692                         // Combining diacritical marks: 01 DC -
1693
1694                         fillIndex [0x1] = 0x41;
1695                         for (int i = 0x030E; i <= 0x0326; i++)
1696                                 if (!IsIgnorable (i))
1697                                         AddCharMap ((char) i, 0x1, 1);
1698                         for (int i = 0x0329; i <= 0x0334; i++)
1699                                 if (!IsIgnorable (i))
1700                                         AddCharMap ((char) i, 0x1, 1);
1701                         for (int i = 0x0339; i <= 0x0341; i++)
1702                                 if (!IsIgnorable (i))
1703                                         AddCharMap ((char) i, 0x1, 1);
1704                         fillIndex [0x1] = 0x72;
1705                         for (int i = 0x0346; i <= 0x0348; i++)
1706                                 if (!IsIgnorable (i))
1707                                         AddCharMap ((char) i, 0x1, 1);
1708                         for (int i = 0x02BE; i <= 0x02BF; i++)
1709                                 if (!IsIgnorable (i))
1710                                         AddCharMap ((char) i, 0x1, 1);
1711                         for (int i = 0x02C1; i <= 0x02C5; i++)
1712                                 if (!IsIgnorable (i))
1713                                         AddCharMap ((char) i, 0x1, 1);
1714                         for (int i = 0x02CE; i <= 0x02CF; i++)
1715                                 if (!IsIgnorable (i))
1716                                         AddCharMap ((char) i, 0x1, 1);
1717                         for (int i = 0x02D1; i <= 0x02D3; i++)
1718                                 if (!IsIgnorable (i))
1719                                         AddCharMap ((char) i, 0x1, 1);
1720                         AddCharMap ('\u02DE', 0x1, 1);
1721                         for (int i = 0x02E4; i <= 0x02E9; i++)
1722                                 if (!IsIgnorable (i))
1723                                         AddCharMap ((char) i, 0x1, 1);
1724
1725                         // FIXME: needs more love here (it should eliminate
1726                         // all the hacky code above).
1727                         for (int i = 0x0300; i < 0x0370; i++)
1728                                 if (!IsIgnorable (i) && diacritical [i] != 0
1729                                         /* especiall here*/ && !map [i].Defined)
1730                                         map [i] = new CharMapEntry (
1731                                                 0x1, 0x1, diacritical [i]);
1732
1733                         fillIndex [0x1] = 0x8D;
1734                         // syriac dotted nonspacing marks (1)
1735                         AddCharMap ('\u0740', 0x1, 1);
1736                         AddCharMap ('\u0741', 0x1, 1);
1737                         AddCharMap ('\u0742', 0x1, 1);
1738                         // syriac oblique nonspacing marks
1739                         AddCharMap ('\u0747', 0x1, 1);
1740                         AddCharMap ('\u0748', 0x1, 1);
1741                         // syriac dotted nonspacing marks (2)
1742                         fillIndex [0x1] = 0x94; // this reset is mandatory
1743                         AddCharMap ('\u0732', 0x1, 1);
1744                         AddCharMap ('\u0735', 0x1, 1);
1745                         AddCharMap ('\u0738', 0x1, 1);
1746                         AddCharMap ('\u0739', 0x1, 1);
1747                         AddCharMap ('\u073C', 0x1, 1);
1748                         // SPECIAL CASES: superscripts
1749                         AddCharMap ('\u073F', 0x1, 1);
1750                         AddCharMap ('\u0711', 0x1, 1);
1751                         // syriac "DOTS"
1752                         for (int i = 0x0743; i <= 0x0746; i++)
1753                                 AddCharMap ((char) i, 0x1, 1);
1754                         for (int i = 0x0730; i <= 0x0780; i++)
1755                                 if (!map [i].Defined &&
1756                                         Char.GetUnicodeCategory ((char) i) ==
1757                                         UnicodeCategory.NonSpacingMark)
1758                                         AddCharMap ((char) i, 0x1, 1);
1759
1760                         // LAMESPEC: It should not stop at '\u20E1'. There are
1761                         // a few more characters (which, however, would result in
1762                         // overflow of level 2 unless we start before 0xDD).
1763                         fillIndex [0x1] = 0xDD;
1764                         for (int i = 0x20d0; i <= 0x20e1; i++)
1765                                 AddCharMap ((char) i, 0x1, 1);
1766
1767                         // They are not part of Nonspacing marks, but have
1768                         // only diacritical weight.
1769                         for (int i = 0x3099; i <= 0x309C; i++)
1770                                 map [i] = new CharMapEntry (1, 1, 1);
1771                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
1772                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
1773                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1774                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1775                         for (int i = 0x30FC; i <= 0x30FE; i++)
1776                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1777
1778                         #endregion
1779
1780
1781                         #region Whitespaces // 07 03 -
1782                         fillIndex [0x7] = 0x2;
1783                         AddCharMap (' ', 0x7, 2);
1784                         AddCharMap ('\u00A0', 0x7, 1);
1785                         for (int i = 9; i <= 0xD; i++)
1786                                 AddCharMap ((char) i, 0x7, 1);
1787                         for (int i = 0x2000; i <= 0x200B; i++)
1788                                 AddCharMap ((char) i, 0x7, 1);
1789
1790                         fillIndex [0x7] = 0x17;
1791                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1792                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1793
1794                         // Characters which are used to represent layout control.
1795                         // LAMESPEC: Windows developers seem to have thought
1796                         // that those characters are a kind of whitespace,
1797                         // while they aren't.
1798                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1799                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1800                         #endregion
1801
1802                         // category 09 - continued symbols from 08
1803                         fillIndex [0x9] = 2;
1804                         // misc tech mark
1805                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1806                                 AddCharMap ((char) cp, 0x9, 1, 0);
1807
1808                         // arrows
1809                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1810                         foreach (DictionaryEntry de in arrowValues) {
1811                                 int idx = (int) de.Value;
1812                                 int cp = (int) de.Key;
1813                                 if (map [cp].Defined)
1814                                         continue;
1815                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1816                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1817                                 arrowLv2 [idx]++;
1818                         }
1819                         // boxes
1820                         byte [] boxLv2 = new byte [128];
1821                         for (int i = 0; i < boxLv2.Length; i++)
1822                                 boxLv2 [i] = 3;
1823                         foreach (DictionaryEntry de in boxValues) {
1824                                 int cp = (int) de.Key;
1825                                 int off = (int) de.Value;
1826                                 if (map [cp].Defined)
1827                                         continue;
1828                                 if (off < 0) {
1829                                         fillIndex [0x9] = (byte) (0xE5 + off);
1830                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1831                                 }
1832                                 else {
1833                                         fillIndex [0x9] = (byte) (0xE5 + off);
1834                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1835                                 }
1836                         }
1837                         // Some special characters (slanted)
1838                         fillIndex [0x9] = 0xF4;
1839                         AddCharMap ('\u2571', 0x9, 3);
1840                         AddCharMap ('\u2572', 0x9, 3);
1841                         AddCharMap ('\u2573', 0x9, 3);
1842
1843                         // FIXME: implement 0A
1844                         #region Symbols
1845                         fillIndex [0xA] = 2;
1846                         // byte currency symbols
1847                         for (int cp = 0; cp < 0x100; cp++) {
1848                                 uc = Char.GetUnicodeCategory ((char) cp);
1849                                 if (!IsIgnorable (cp) &&
1850                                         uc == UnicodeCategory.CurrencySymbol &&
1851                                         cp != '$' ||
1852                                         cp == 0xAC)
1853                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1854                         }
1855                         // byte other symbols
1856                         for (int cp = 0; cp < 0x100; cp++) {
1857                                 if (cp == 0xA6)
1858                                         continue; // SPECIAL: skip FIXME: why?
1859                                 uc = Char.GetUnicodeCategory ((char) cp);
1860                                 if (!IsIgnorable (cp) &&
1861                                         uc == UnicodeCategory.OtherSymbol ||
1862                                         cp == '\u00B5' || cp == '\u00B7')
1863                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1864                         }
1865
1866                         fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
1867                         for (int cp = 0x2020; cp <= 0x2031; cp++)
1868                                 if (Char.IsPunctuation ((char) cp))
1869                                         AddCharMap ((char) cp, 0xA, 1, 0);
1870                         // SPECIAL CASES: why?
1871                         AddCharMap ('\u203B', 0xA, 1, 0);
1872                         AddCharMap ('\u2040', 0xA, 1, 0);
1873                         AddCharMap ('\u2041', 0xA, 1, 0);
1874                         AddCharMap ('\u2042', 0xA, 1, 0);
1875
1876                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1877                                 AddCharMap ((char) cp, 0xA, 1, 0);
1878                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1879                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1880                                 AddCharMap ((char) cp, 0xA, 1, 0);
1881                         // Dingbats
1882                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1883                                 if (Char.IsSymbol ((char) cp))
1884                                         AddCharMap ((char) cp, 0xA, 1, 0);
1885                         // OCR
1886                         for (int i = 0x2440; i < 0x2460; i++)
1887                                 AddCharMap ((char) i, 0xA, 1, 0);
1888
1889                         #endregion
1890
1891                         #region Numbers // 0C 02 - 0C E1
1892                         fillIndex [0xC] = 2;
1893
1894                         // 9F8 : Bengali "one less than the denominator"
1895                         AddCharMap ('\u09F8', 0xC, 1);
1896
1897                         ArrayList numbers = new ArrayList ();
1898                         for (int i = 0; i < 65536; i++)
1899                                 if (!IsIgnorable (i) &&
1900                                         Char.IsNumber ((char) i) &&
1901                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1902                                         numbers.Add (i);
1903
1904                         ArrayList numberValues = new ArrayList ();
1905                         foreach (int i in numbers)
1906                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1907                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1908
1909 //foreach (DictionaryEntry de in numberValues)
1910 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1911
1912                         decimal prevValue = -1;
1913                         foreach (DictionaryEntry de in numberValues) {
1914                                 int cp = (int) de.Key;
1915                                 decimal currValue = (decimal) de.Value;
1916                                 bool addnew = false;
1917                                 if (prevValue < currValue &&
1918                                         prevValue - (int) prevValue == 0 &&
1919                                         prevValue >= 1) {
1920
1921                                         addnew = true;
1922                                         // Process Hangzhou and Roman numbers
1923
1924                                         // There are some SPECIAL cases.
1925                                         if (currValue != 4) // no increment for 4
1926                                                 fillIndex [0xC]++;
1927
1928                                         int xcp;
1929                                         if (currValue <= 10) {
1930                                                 xcp = (int) prevValue + 0x2170 - 1;
1931                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1932                                                 xcp = (int) prevValue + 0x2160 - 1;
1933                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1934                                                 fillIndex [0xC] += 2;
1935                                                 xcp = (int) prevValue + 0x3021 - 1;
1936                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1937                                                 fillIndex [0xC]++;
1938                                         }
1939                                         else if (currValue == 11)
1940                                                 fillIndex [0xC]++;
1941                                 }
1942                                 if (prevValue < currValue)
1943                                         prevValue = currValue;
1944                                 if (map [cp].Defined)
1945                                         continue;
1946                                 // Hangzhou and Roman numerals are added later
1947                                 // (the code is above).
1948                                 else if (0x3021 <= cp && cp < 0x302A
1949                                         || 0x2160 <= cp && cp < 0x216A
1950                                         || 0x2170 <= cp && cp < 0x217A)
1951                                         continue;
1952
1953                                 if (cp ==  0x215B) // FIXME: why?
1954                                         fillIndex [0xC] += 2;
1955                                 else if (cp == 0x3021) // FIXME: why?
1956                                         fillIndex [0xC]++;
1957                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1958                                 if (addnew || cp <= '9') {
1959                                         int mod = (int) currValue - 1;
1960                                         int xcp;
1961                                         if (1 <= currValue && currValue <= 10) {
1962                                                 xcp = mod + 0x2776;
1963                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1964                                                 xcp = mod + 0x2780;
1965                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1966                                                 xcp = mod + 0x278A;
1967                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1968                                         }
1969                                         if (1 <= currValue && currValue <= 20) {
1970                                                 xcp = mod + 0x2460;
1971                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1972                                                 xcp = mod + 0x2474;
1973                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1974                                                 xcp = mod + 0x2488;
1975                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1976                                         }
1977                                 }
1978
1979                                 if (cp != 0x09E7 && cp != 0x09EA)
1980                                         fillIndex [0xC]++;
1981
1982                                 // Add special cases that are not regarded as
1983                                 // numbers in UnicodeCategory speak.
1984                                 if (cp == '5') {
1985                                         // TONE FIVE
1986                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1987                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1988                                 }
1989                                 else if (cp == '6') // FIXME: why?
1990                                         fillIndex [0xC]++;
1991                         }
1992
1993                         // 221E: infinity
1994                         fillIndex [0xC] = 0xFF;
1995                         AddCharMap ('\u221E', 0xC, 1);
1996                         #endregion
1997
1998                         #region Letters and NonSpacing Marks (general)
1999
2000                         // ASCII Latin alphabets
2001                         for (int i = 0; i < alphabets.Length; i++)
2002                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2003
2004                         // non-ASCII Latin alphabets
2005                         // FIXME: there are no characters that are placed
2006                         // *after* the "alphabets" array items. This is nothing
2007                         // more than a hack that creates dummy weights for
2008                         // primary characters.
2009                         for (int i = 0x0080; i < 0x0300; i++) {
2010                                 if (!Char.IsLetter ((char) i))
2011                                         continue;
2012                                 // Latin letters which have an NFKD mapping are
2013                                 // not added as independent primary characters.
2014                                 if (decompIndex [i] != 0)
2015                                         continue;
2016                                 // SPECIAL CASES:
2017                                 // 1.some alphabets have primarily
2018                                 //   equivalent ASCII alphabets.
2019                                 // 2.some have independent primary weights,
2020                                 //   but inside a-to-z range.
2021                                 // 3.there are some expanded characters that
2022                                 //   are not part of Unicode Standard NFKD.
2023                                 // 4. some characters are letter in IsLetter
2024                                 //   but not in sortkeys (maybe unicode version
2025                                 //   difference caused it).
2026                                 switch (i) {
2027                                 // 1. skipping them does not make sense
2028 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2029 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2030 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2031 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2032 //                              case 0x19B: case 0x19C:
2033                                 // 2. skipping them does not make sense
2034 //                              case 0x14A: // Ng
2035 //                              case 0x14B: // ng
2036                                 // 3.
2037                                 case 0xC6: // AE
2038                                 case 0xE6: // ae
2039                                 case 0xDE: // Icelandic Thorn
2040                                 case 0xFE: // Icelandic Thorn
2041                                 case 0xDF: // German ss
2042                                 case 0xFF: // German ss
2043                                 // 4.
2044                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2045                                 // not classified yet
2046 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2047 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2048 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2049 //                              case 0x1DD:
2050                                         continue;
2051                                 }
2052                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2053                         }
2054
2055                         // Greek and Coptic
2056                         fillIndex [0xF] = 02;
2057                         for (int i = 0x0380; i < 0x0390; i++)
2058                                 if (Char.IsLetter ((char) i))
2059                                         AddLetterMap ((char) i, 0xF, 1);
2060                         fillIndex [0xF] = 02;
2061                         for (int i = 0x0391; i < 0x03CF; i++)
2062                                 if (Char.IsLetter ((char) i))
2063                                         AddLetterMap ((char) i, 0xF, 1);
2064                         fillIndex [0xF] = 0x40;
2065                         for (int i = 0x03D0; i < 0x0400; i++)
2066                                 if (Char.IsLetter ((char) i))
2067                                         AddLetterMap ((char) i, 0xF, 1);
2068
2069                         // Cyrillic.
2070                         // Cyrillic letters are sorted like Latin letters i.e.
2071                         // containing culture-specific letters between the
2072                         // standard Cyrillic sequence.
2073                         //
2074                         // We can't use UCA here; it has different sorting.
2075                         char [] orderedCyrillic = new char [] {
2076                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2077                                 '\u0452', // DJE for Serbocroatian
2078                                 '\u0435',
2079                                 '\u0454', // IE for Ukrainian
2080                                 '\u0436', '\u0437',
2081                                 '\u0455', // DZE
2082                                 '\u0438',
2083                                 '\u0456', // Byelorussian-Ukrainian I
2084                                 '\u0457', // YI
2085                                 '\u0439',
2086                                 '\u0458', // JE
2087                                 '\u043A', '\u043B',
2088                                 '\u0459', // LJE
2089                                 '\u043C', '\u043D',
2090                                 '\u045A', // NJE
2091                                 '\u043E',
2092                                 // 4E9 goes here.
2093                                 '\u043F', '\u0440', '\u0441', '\u0442',
2094                                 '\u045B', // TSHE for Serbocroatian
2095                                 '\u0443',
2096                                 '\u045E', // Short U for Byelorussian
2097                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2098                                 '\u0444', '\u0445', '\u0446', '\u0447',
2099                                 '\u045F', // DZHE
2100                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2101                                 '\u044D', '\u044E', '\u044F'};
2102
2103                         // For some characters, here is a map to basic Cyrillic
2104                         // letters. See UnicodeData.txt character names for
2105                         // the sources. Here I simply declare an equivalence array.
2106                         // Its entries are mapped from U+490 (and U+491) onwards,
2107                         // skipping small letters.
2108                         char [] cymap_src = new char [] {
2109                                 '\u0433', '\u0433', '\u0433', '\u0436',
2110                                 '\u0437', '\u043A', '\u043A', '\u043A',
2111                                 '\u043A', '\u043D', '\u043D', '\u043F',
2112                                 '\u0445', '\u0441', '\u0442', '\u0443',
2113                                 '\u0443', '\u0445', '\u0446', '\u0447',
2114                                 '\u0447', '\u0432', '\u0435', '\u0435',
2115                                 '\u0406', '\u0436', '\u043A', '\u043D',
2116                                 '\u0447', '\u0435'};
2117
2118                         fillIndex [0x10] = 0x8D;
2119                         for (int i = 0x0460; i < 0x0481; i++) {
2120                                 if (Char.IsLetter ((char) i)) {
2121                                         if (i == 0x0476)
2122                                                 // U+476/477 have the same
2123                                                 // primary weight as U+474/475.
2124                                                 fillIndex [0x10] -= 3;
2125                                         AddLetterMap ((char) i, 0x10, 3);
2126                                 }
2127                         }
2128
2129                         fillIndex [0x10] = 0x6;
2130                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2131                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2132                                 if (!IsIgnorable ((int) c) &&
2133                                         Char.IsLetter (c) &&
2134                                         !map [c].Defined) {
2135                                         AddLetterMap (c, 0x10, 0);
2136                                         fillIndex [0x10] += 3;
2137                                 }
2138                         }
2139
2140                         for (int i = 0; i < cymap_src.Length; i++) {
2141                                 char c = cymap_src [i];
2142                                 fillIndex [0x10] = map [c].Level1;
2143                                 AddLetterMap ((char) (0x0490 + i * 2),
2144                                         0x10, 0);
2145                         }
2146
2147                         // Armenian
2148                         fillIndex [0x11] = 0x3;
2149                         for (int i = 0x0531; i < 0x0586; i++)
2150                                 if (Char.IsLetter ((char) i))
2151                                         AddLetterMap ((char) i, 0x11, 1);
2152
2153                         // Hebrew
2154                         // -Letters
2155                         fillIndex [0x12] = 0x2;
2156                         for (int i = 0x05D0; i < 0x05FF; i++)
2157                                 if (Char.IsLetter ((char) i))
2158                                         AddLetterMap ((char) i, 0x12, 1);
2159                         // -Accents
2160                         fillIndex [0x1] = 0x3;
2161                         for (int i = 0x0591; i <= 0x05C2; i++) {
2162                                 if (i == 0x05A3 || i == 0x05BB)
2163                                         fillIndex [0x1]++;
2164                                 if (i != 0x05BE)
2165                                         AddCharMap ((char) i, 0x1, 1);
2166                         }
2167
2168                         // Arabic
2169                         fillIndex [0x1] = 0x8E;
2170                         fillIndex [0x13] = 0x3;
2171                         for (int i = 0x0621; i <= 0x064A; i++) {
2172                                 // Abjad
2173                                 if (Char.GetUnicodeCategory ((char) i)
2174                                         != UnicodeCategory.OtherLetter) {
2175                                         // FIXME: arabic nonspacing marks are
2176                                         // in different order.
2177                                         AddCharMap ((char) i, 0x1, 1);
2178                                         continue;
2179                                 }
2180 //                              map [i] = new CharMapEntry (0x13,
2181 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2182                                 fillIndex [0x13] =
2183                                         (byte) arabicLetterPrimaryValues [i];
2184                                 byte formDiacritical = 8; // default
2185                                 // SPECIAL CASES:
2186                                 switch (i) {
2187                                 case 0x0622: formDiacritical = 9; break;
2188                                 case 0x0623: formDiacritical = 0xA; break;
2189                                 case 0x0624: formDiacritical = 5; break;
2190                                 case 0x0625: formDiacritical = 0xB; break;
2191                                 case 0x0626: formDiacritical = 7; break;
2192                                 case 0x0649: formDiacritical = 5; break;
2193                                 case 0x064A: formDiacritical = 7; break;
2194                                 }
2195                                 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical);
2196                         }
2197                         fillIndex [0x13] = 0x84;
2198                         for (int i = 0x0674; i < 0x06D6; i++)
2199                                 if (Char.IsLetter ((char) i))
2200                                         AddLetterMap ((char) i, 0x13, 1);
2201
2202                         // Devanagari
2203                         // FIXME: it does seem to be a straight codepoint mapping.
2204                         fillIndex [0x14] = 04;
2205                         for (int i = 0x0901; i < 0x0905; i++)
2206                                 if (!IsIgnorable (i))
2207                                         AddLetterMap ((char) i, 0x14, 2);
2208                         fillIndex [0x14] = 0xB;
2209                         for (int i = 0x0905; i < 0x093A; i++) {
2210                                 if (i == 0x0928)
2211                                         AddCharMap ('\u0929', 0x14, 0, 8);
2212                                 if (i == 0x0930)
2213                                         AddCharMap ('\u0931', 0x14, 0, 8);
2214                                 if (i == 0x0933)
2215                                         AddCharMap ('\u0934', 0x14, 0, 8);
2216                                 if (Char.IsLetter ((char) i))
2217                                         AddLetterMap ((char) i, 0x14, 4);
2218                                 if (i == 0x090B)
2219                                         AddCharMap ('\u0960', 0x14, 4);
2220                                 if (i == 0x090C)
2221                                         AddCharMap ('\u0961', 0x14, 4);
2222                         }
2223                         fillIndex [0x14] = 0xDA;
2224                         for (int i = 0x093E; i < 0x0945; i++)
2225                                 if (!IsIgnorable (i))
2226                                         AddLetterMap ((char) i, 0x14, 2);
2227                         fillIndex [0x14] = 0xEC;
2228                         for (int i = 0x0945; i < 0x094F; i++)
2229                                 if (!IsIgnorable (i))
2230                                         AddLetterMap ((char) i, 0x14, 2);
2231
2232                         // Bengali
2233                         // -Letters
2234                         fillIndex [0x15] = 02;
2235                         for (int i = 0x0980; i < 0x9FF; i++) {
2236                                 if (IsIgnorable (i))
2237                                         continue;
2238                                 if (i == 0x09E0)
2239                                         fillIndex [0x15] = 0x3B;
2240                                 switch (Char.GetUnicodeCategory ((char) i)) {
2241                                 case UnicodeCategory.NonSpacingMark:
2242                                 case UnicodeCategory.DecimalDigitNumber:
2243                                 case UnicodeCategory.OtherNumber:
2244                                         continue;
2245                                 }
2246                                 AddLetterMap ((char) i, 0x15, 1);
2247                         }
2248                         // -Signs
2249                         fillIndex [0x1] = 0x3;
2250                         for (int i = 0x0981; i < 0x0A00; i++)
2251                                 if (Char.GetUnicodeCategory ((char) i) ==
2252                                         UnicodeCategory.NonSpacingMark)
2253                                         AddCharMap ((char) i, 0x1, 1);
2254
2255                         // Gurmukhi. orderedGurmukhi is from UCA
2256                         // FIXME: it does not look equivalent to UCA.
2257                         fillIndex [0x16] = 04;
2258                         fillIndex [0x1] = 3;
2259                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2260                                 char c = orderedGurmukhi [i];
2261                                 if (IsIgnorable ((int) c))
2262                                         continue;
2263                                 if (IsIgnorableNonSpacing (c)) {
2264                                         AddLetterMap (c, 0x1, 1);
2265                                         continue;
2266                                 }
2267                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2268                                         '\u0A66' <= c && c <= '\u0A71')
2269                                         continue;
2270                                 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2271                                 byte shift = 4;
2272                                 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2273                                         shift = 0;
2274                                 AddLetterMap (c, 0x16, shift);
2275                         }
2276
2277                         // Gujarati. orderedGujarati is from UCA
2278                         fillIndex [0x17] = 0x4;
2279                         // nonspacing marks
2280                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2281                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2282                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2283                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2284                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2285                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2286                         // letters go first.
2287                         for (int i = 0; i < orderedGujarati.Length; i++) {
2288                                 // SPECIAL CASE
2289                                 char c = orderedGujarati [i];
2290                                 if (Char.IsLetter (c)) {
2291                                         // SPECIAL CASES
2292                                         if (c == '\u0AB3' || c == '\u0A32')
2293                                                 continue;
2294                                         if (c == '\u0A33') {
2295                                                 AddCharMap ('\u0A32', 0x17, 0);
2296                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2297                                                 continue;
2298                                         }
2299                                         if (c == '\u0A8B')
2300                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2301                                         AddCharMap (c, 0x17, 4);
2302
2303                                         if (c == '\u0AB9')
2304                                                 AddCharMap ('\u0AB3', 0x17, 6);
2305                                 }
2306                         }
2307                         // non-letters
2308                         byte gujaratiShift = 4;
2309                         fillIndex [0x17] = 0xC0;
2310                         for (int i = 0; i < orderedGujarati.Length; i++) {
2311                                 char c = orderedGujarati [i];
2312                                 if (fillIndex [0x17] == 0xCC)
2313                                         gujaratiShift = 3;
2314                                 if (!Char.IsLetter (c)) {
2315                                         // SPECIAL CASES
2316                                         if (c == '\u0A82')
2317                                                 AddCharMap ('\u0A81', 0x17, 2);
2318                                         if (c == '\u0AC2')
2319                                                 fillIndex [0x17]++;
2320                                         AddLetterMap (c, 0x17, gujaratiShift);
2321                                 }
2322                         }
2323
2324                         // Oriya
2325                         fillIndex [0x1] = 03;
2326                         fillIndex [0x18] = 02;
2327                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2328                                 switch (Char.GetUnicodeCategory ((char) i)) {
2329                                 case UnicodeCategory.NonSpacingMark:
2330                                 case UnicodeCategory.DecimalDigitNumber:
2331                                         AddLetterMap ((char) i, 0x1, 1);
2332                                         continue;
2333                                 }
2334                                 AddLetterMap ((char) i, 0x18, 1);
2335                         }
2336
2337                         // Tamil
2338                         fillIndex [0x19] = 2;
2339                         AddCharMap ('\u0BD7', 0x19, 0);
2340                         fillIndex [0x19] = 0xA;
2341                         // vowels
2342                         for (int i = 0x0B82; i <= 0x0B94; i++)
2343                                 if (!IsIgnorable ((char) i))
2344                                         AddCharMap ((char) i, 0x19, 2);
2345                         // special vowel
2346                         fillIndex [0x19] = 0x28;
2347                         // The array for Tamil consonants is a constant.
2348                         // Windows uses a sequence very similar to TAM from
2349                         // tamilnet, but slightly different in Grantha.
2350                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2351                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2352                         // combining marks
2353                         fillIndex [0x19] = 0x82;
2354                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2355                                 if (Char.GetUnicodeCategory ((char) i) ==
2356                                         UnicodeCategory.SpacingCombiningMark
2357                                         || i == 0x0BC0)
2358                                         AddLetterMap ((char) i, 0x19, 2);
2359
2360                         // Telugu
2361                         fillIndex [0x1A] = 0x4;
2362                         for (int i = 0x0C00; i < 0x0C62; i++) {
2363                                 if (i == 0x0C55 || i == 0x0C56)
2364                                         continue; // skip
2365                                 AddCharMap ((char) i, 0x1A, 3);
2366                                 char supp = (i == 0x0C0B) ? '\u0C60':
2367                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2368                                 if (supp == char.MinValue)
2369                                         continue;
2370                                 AddCharMap (supp, 0x1A, 3);
2371                         }
2372
2373                         // Kannada
2374                         fillIndex [0x1B] = 4;
2375                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2376                                 if (i == 0x0CD5 || i == 0x0CD6)
2377                                         continue; // ignore
2378                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2379                                         continue; // shift after 0xCB9
2380                                 AddCharMap ((char) i, 0x1B, 3);
2381                                 if (i == 0x0CB9) {
2382                                         // SPECIAL CASES: but why?
2383                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2384                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2385                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2386                                 }
2387                                 if (i == 0x0CB2)
2388                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2389                         }
2390
2391                         // Malayalam
2392                         fillIndex [0x1C] = 2;
2393                         fillIndex [0x1] = 3;
2394                         for (int i = 0x0D02; i < 0x0D61; i++) {
2395                                 // FIXME: I avoided MSCompatUnicodeTable usage
2396                                 // here (it results in recursion). So check if
2397                                 // using NonSpacingMark makes sense or not.
2398                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2399 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2400                                         AddCharMap ((char) i, 0x1C, 1);
2401                                 else if (!IsIgnorable ((char) i))
2402                                         AddCharMap ((char) i, 1, 1);
2403                         }
2404
2405                         // Thai ... note that it breaks 0x1E wall after E2B!
2406                         // Also, all Thai characters have level 2 value 3.
2407                         fillIndex [0x1E] = 2;
2408                         fillIndex [0x1] = 3;
2409                         for (int i = 0xE40; i <= 0xE44; i++)
2410                                 AddCharMap ((char) i, 0x1E, 1, 3);
2411                         for (int i = 0xE01; i < 0xE2B; i++)
2412                                 AddCharMap ((char) i, 0x1E, 6, 3);
2413                         fillIndex [0x1F] = 5;
2414                         for (int i = 0xE2B; i < 0xE30; i++)
2415                                 AddCharMap ((char) i, 0x1F, 6, 3);
2416                         fillIndex [0x1F] = 0x1E;
2417                         for (int i = 0xE30; i < 0xE3B; i++)
2418                                 AddCharMap ((char) i, 0x1F, 1, 3);
2419                         // Some Thai characters remain.
2420                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2421                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2422                         foreach (char c in specialThai)
2423                                 AddCharMap (c, 0x1F, 1, 3);
2424
2425                         for (int i = 0xE00; i < 0xE80; i++)
2426                                 if (Char.GetUnicodeCategory ((char) i) ==
2427                                         UnicodeCategory.NonSpacingMark)
2428                                         AddCharMap ((char) i, 1, 1);
2429
2430                         // Lao
2431                         fillIndex [0x1F] = 2;
2432                         fillIndex [0x1] = 3;
2433                         for (int i = 0xE80; i < 0xEDF; i++) {
2434                                 if (IsIgnorable ((char) i))
2435                                         continue;
2436                                 else if (Char.IsLetter ((char) i))
2437                                         AddCharMap ((char) i, 0x1F, 1);
2438                                 else if (Char.GetUnicodeCategory ((char) i) ==
2439                                         UnicodeCategory.NonSpacingMark)
2440                                         AddCharMap ((char) i, 1, 1);
2441                         }
2442
2443                         // Georgian. orderedGeorgian is from UCA DUCET.
2444                         fillIndex [0x21] = 5;
2445                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2446                                 char c = orderedGeorgian [i];
2447                                 if (map [(int) c].Defined)
2448                                         continue;
2449                                 AddCharMap (c, 0x21, 0);
2450                                 if (c < '\u10F6')
2451                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2452                                 fillIndex [0x21] += 5;
2453                         }
2454
2455                         // Japanese Kana.
2456                         fillIndex [0x22] = 2;
2457                         int kanaOffset = 0x3041;
2458                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2459
2460                         for (int gyo = 0; gyo < 9; gyo++) {
2461                                 for (int dan = 0; dan < 5; dan++) {
2462                                         if (gyo == 7 && dan % 2 == 1) {
2463                                                 // 'ya'-gyo
2464                                                 fillIndex [0x22]++;
2465                                                 kanaOffset -= 2; // There is no space for yi and ye.
2466                                                 continue;
2467                                         }
2468                                         int cp = kanaOffset + dan * kanaLines [gyo];
2469                                         // small lines (a-gyo, ya-gyo)
2470                                         if (gyo == 0 || gyo == 7) {
2471                                                 AddKanaMap (cp, 1); // small
2472                                                 AddKanaMap (cp + 1, 1);
2473                                         }
2474                                         else
2475                                                 AddKanaMap (cp, kanaLines [gyo]);
2476                                         fillIndex [0x22]++;
2477
2478                                         if (cp == 0x30AB) {
2479                                                 // add small 'ka' (before normal one)
2480                                                 AddKanaMap (0x30F5, 1);
2481                                                 kanaOffset++;
2482                                         }
2483                                         if (cp == 0x30B1) {
2484                                                 // add small 'ke' (before normal one)
2485                                                 AddKanaMap (0x30F6, 1);
2486                                                 kanaOffset++;
2487                                         }
2488                                         if (cp == 0x3061) {
2489                                                 // add small 'Tsu' (before normal one)
2490                                                 AddKanaMap (0x3063, 1);
2491                                                 kanaOffset++;
2492                                         }
2493                                 }
2494                                 fillIndex [0x22] += 3;
2495                                 kanaOffset += 5 * kanaLines [gyo];
2496                         }
2497
2498                         // Wa-gyo is almost entirely special, so its entries are added manually.
2499                         AddLetterMap ((char) 0x308E, 0x22, 0);
2500                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2501                         AddLetterMap ((char) 0x308F, 0x22, 0);
2502                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2503                         fillIndex [0x22]++;
2504                         AddLetterMap ((char) 0x3090, 0x22, 0);
2505                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2506                         fillIndex [0x22] += 2;
2507                         // no "Wu" in Japanese.
2508                         AddLetterMap ((char) 0x3091, 0x22, 0);
2509                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2510                         fillIndex [0x22]++;
2511                         AddLetterMap ((char) 0x3092, 0x22, 0);
2512                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2513                         // Nn
2514                         fillIndex [0x22] = 0x80;
2515                         AddLetterMap ((char) 0x3093, 0x22, 0);
2516                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2517
2518                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2519                                 map [0x30A6].Level1, 3);// voiced hiragana U
2520                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2521                                 map [0x30A6].Level1, 3);// voiced katakana U
2522
2523                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2524                                 map [0x30AB].Level1, 0);// small katakana Ka
2525                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2526                                 map [0x30B1].Level1, 0);// small katakana Ke
2527                         // voiced Wa lines
2528                         for (int i = 0x30F7; i < 0x30FB; i++)
2529                                 map [i] = new CharMapEntry (map [i - 8].Category,
2530                                         map [i - 8].Level1,
2531                                         3);
2532
2533                         // JIS Japanese square chars.
2534                         fillIndex [0x22] = 0x97;
2535                         jisJapanese.Sort (JISComparer.Instance);
2536                         foreach (JISCharacter j in jisJapanese)
2537                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2538                                         AddCharMap ((char) j.CP, 0x22, 1);
2539                         // non-JIS Japanese square chars.
2540                         nonJisJapanese.Sort (NonJISComparer.Instance);
2541                         foreach (NonJISCharacter j in nonJisJapanese)
2542                                 AddCharMap ((char) j.CP, 0x22, 1);
2543
2544                         // Bopomofo
2545                         fillIndex [0x23] = 0x02;
2546                         for (int i = 0x3105; i <= 0x312C; i++)
2547                                 AddCharMap ((char) i, 0x23, 1);
2548
2549                         // Estrangela: ancient Syriac
2550                         fillIndex [0x24] = 0x0B;
2551                         // FIXME: is 0x71E really alternative form?
2552                         ArrayList syriacAlternatives = new ArrayList (
2553                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2554                         for (int i = 0x0710; i <= 0x072C; i++) {
2555                                 if (i == 0x0711) // NonSpacingMark
2556                                         continue;
2557                                 if (syriacAlternatives.Contains (i))
2558                                         continue;
2559                                 AddCharMap ((char) i, 0x24, 4);
2560                                 // FIXME: why?
2561                                 if (i == 0x721)
2562                                         fillIndex [0x24]++;
2563                         }
2564                         foreach (int cp in syriacAlternatives)
2565                                 map [cp] = new CharMapEntry (0x24,
2566                                         (byte) (map [cp - 1].Level1 + 2),
2567                                         0);
2568                         // FIXME: Syriac NonSpacingMark should go here.
2569
2570                         // Thaana
2571                         // FIXME: it turned out that it does not look like UCA
2572                         fillIndex [0x24] = 0x6E;
2573                         for (int i = 0; i < orderedThaana.Length; i++) {
2574                                 char c = orderedThaana [i];
2575                                 if (IsIgnorableNonSpacing ((int) c))
2576                                         continue;
2577                                 AddCharMap (c, 0x24, 2);
2578                                 if (c == '\u0782') // SPECIAL CASE: why?
2579                                         fillIndex [0x24] += 2;
2580                         }
2581                         #endregion
2582
2583                         // FIXME: Add more culture-specific letters (that are
2584                         // not supported in Windows collation) here.
2585
2586                         // Surrogate ... they are computed.
2587
2588                         #region Hangul
2589                         // Hangul.
2590                         //
2591                         // Unlike UCA, the Windows Hangul sequence mixes Jongseong
2592                         // with the Choseong sequence as well as Jungseong,
2593                         // adjusted so that the same base character has the
2594                         // same primary weight. So it is impossible to compute
2595                         // those sort keys.
2596                         //
2597                         // Here I introduce an ordered sequence of mixed
2598                         // 'commands' and 'characters' that is similar to
2599                         // LDML text:
2600                         //      - ',' increases primary weight.
2601                         //      - [A B] means a range, increasing index
2602                         //      - {A B} means a range, without increasing index
2603                         //      - '=' is no operation (it means the characters
2604                         //        of both sides have the same weight).
2605                         //      - '>' inserts a Hangul Syllable block that
2606                         //        contains 0x24C (0x15 * 0x1C) characters.
2607                         //      - '<' decreases the index
2608                         //      - '0'-'9' means skip count (not handled by the loop below)
2609                         //      - whitespaces are ignored
2610                         //
2611
2612                         string hangulSequence =
2613                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2614                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2615                         + "<{\u1113 \u1116}, \u3165,"
2616                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2617                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2618                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2619                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2620                                 + "[\u11D1 \u11D2], \u11B2,"
2621                                 + "[\u11D3 \u11D5], \u11B3,"
2622                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2623                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2624                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2625                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2626                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2627                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2628                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2629                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2630                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2631                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2632                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2633                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2634                                 + "\u11F1,, \u11F2,,,"
2635                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2636                         + "<\u114D, \u110D,,  >"
2637                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2638                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2639                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2640                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2641                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2642                                 + "[\u11F5 \u11F8]"
2643                         ;
2644
2645                         byte hangulCat = 0x52;
2646                         fillIndex [hangulCat] = 0x2;
2647
2648                         int syllableBlock = 0;
2649                         for (int n = 0; n < hangulSequence.Length; n++) {
2650                                 char c = hangulSequence [n];
2651                                 int start, end;
2652                                 if (Char.IsWhiteSpace (c))
2653                                         continue;
2654                                 switch (c) {
2655                                 case '=':
2656                                         break; // NOP
2657                                 case ',':
2658                                         IncrementSequentialIndex (ref hangulCat);
2659                                         break;
2660                                 case '<':
2661                                         if (fillIndex [hangulCat] == 2)
2662                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2663                                         fillIndex [hangulCat]--;
2664                                         break;
2665                                 case '>':
2666                                         IncrementSequentialIndex (ref hangulCat);
2667                                         for (int l = 0; l < 0x15; l++)
2668                                                 for (int v = 0; v < 0x1C; v++) {
2669                                                         AddCharMap (
2670                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2671                                                         IncrementSequentialIndex (ref hangulCat);
2672                                                 }
2673                                         syllableBlock++;
2674                                         break;
2675                                 case '[':
2676                                         start = hangulSequence [n + 1];
2677                                         end = hangulSequence [n + 3];
2678                                         for (int i = start; i <= end; i++) {
2679                                                 AddCharMap ((char) i, hangulCat, 0);
2680                                                 if (end > i)
2681                                                         IncrementSequentialIndex (ref hangulCat);
2682                                         }
2683                                         n += 4; // consumes 5 characters for this operation
2684                                         break;
2685                                 case '{':
2686                                         start = hangulSequence [n + 1];
2687                                         end = hangulSequence [n + 3];
2688                                         for (int i = start; i <= end; i++)
2689                                                 AddCharMap ((char) i, hangulCat, 0);
2690                                         n += 4; // consumes 5 characters for this operation
2691                                         break;
2692                                 default:
2693                                         AddCharMap (c, hangulCat, 0);
2694                                         break;
2695                                 }
2696                         }
2697
2698                         // Some Jamo NFKD.
2699                         for (int i = 0x3200; i < 0x3300; i++) {
2700                                 if (IsIgnorable (i) || map [i].Defined)
2701                                         continue;
2702                                 int ch = 0;
2703                                 // w/ bracket
2704                                 if (decompLength [i] == 4 &&
2705                                         decompValues [decompIndex [i]] == '(')
2706                                         ch = decompIndex [i] + 1;
2707                                 // circled
2708                                 else if (decompLength [i] == 2 &&
2709                                         decompValues [decompIndex [i] + 1] == '\u1161')
2710                                         ch = decompIndex [i];
2711                                 else if (decompLength [i] == 1)
2712                                         ch = decompIndex [i];
2713                                 else
2714                                         continue;
2715                                 ch = decompValues [ch];
2716                                 if (ch < 0x1100 || 0x1200 < ch &&
2717                                         ch < 0xAC00 || 0xD800 < ch)
2718                                         continue;
2719
2720                                 // SPECIAL CASE ?
2721                                 int offset = i < 0x3260 ? 1 : 0;
2722                                 if (0x326E <= i && i <= 0x3273)
2723                                         offset = 1;
2724
2725                                 map [i] = new CharMapEntry (map [ch].Category,
2726                                         (byte) (map [ch].Level1 + offset),
2727                                         map [ch].Level2);
2728 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2729                         }
2730
2731
2732                         #endregion
2733
2734                         // Letterlike characters and CJK compatibility square
2735                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2736                         int [] counts = new int ['Z' - 'A' + 1];
2737                         char [] namedChars = new char [sortableCharNames.Count];
2738                         int nCharNames = 0;
2739                         foreach (DictionaryEntry de in sortableCharNames) {
2740                                 counts [((string) de.Value) [0] - 'A']++;
2741                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2742                         }
2743                         nCharNames = 0; // reset
2744                         for (int a = 0; a < counts.Length; a++) {
2745                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2746                                 for (int i = 0; i < counts [a]; i++)
2747 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2748                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2749                         }
2750
2751                         // CJK unified ideograph.
2752                         byte cjkCat = 0x9E;
2753                         fillIndex [cjkCat] = 0x2;
2754                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2755                                 if (!IsIgnorable (cp))
2756                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2757                         // CJK Extensions goes here.
2758                         // LAMESPEC: With this Windows style CJK layout, it is
2759                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2760                         // 0x9FBB can never be added w/o breaking compat.
2761                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2762                                 if (!IsIgnorable (cp))
2763                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2764
2765                         // PrivateUse ... computed.
2766                         // remaining Surrogate ... computed.
2767
2768                         #region Special "biggest" area (FF FF)
2769                         fillIndex [0xFF] = 0xFF;
2770                         char [] specialBiggest = new char [] {
2771                                 '\u3005', '\u3031', '\u3032', '\u309D',
2772                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2773                                 '\uFE7C', '\uFE7D', '\uFF70'};
2774                         foreach (char c in specialBiggest)
2775                                 AddCharMap (c, 0xFF, 0);
2776                         #endregion
2777
2778                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2779                         // non-alphanumeric ASCII except for: + - < = > '
2780                         for (int i = 0x21; i < 0x7F; i++) {
2781                                 if (Char.IsLetterOrDigit ((char) i)
2782                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2783                                         continue; // they are not added here.
2784                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2785                                 // Insert 3001 after ',' and 3002 after '.'
2786                                 if (i == 0x2C)
2787                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2788                                 else if (i == 0x2E)
2789                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2790                                 else if (i == 0x3A)
2791                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2792                         }
2793                         #endregion
2794
2795                         #region 07 - Punctuations and something else
2796                         for (int i = 0xA0; i < char.MaxValue; i++) {
2797                                 if (IsIgnorable (i))
2798                                         continue;
2799
2800                                 // FIXME: actually those reset should not be
2801                                 // done but here I put for easy goal.
2802                                 if (i == 0x0700)
2803                                         fillIndex [0x7] = 0xE2;
2804                                 if (i == 0x2016)
2805                                         fillIndex [0x7] = 0x77;
2806
2807                                 // SPECIAL CASES:
2808                                 switch (i) {
2809                                 case 0xAB: // 08
2810                                 case 0xB7: // 0A
2811                                 case 0xBB: // 08
2812                                 case 0x2329: // 09
2813                                 case 0x232A: // 09
2814                                         continue;
2815                                 }
2816
2817                                 switch (Char.GetUnicodeCategory ((char) i)) {
2818                                 case UnicodeCategory.OtherPunctuation:
2819                                 case UnicodeCategory.ClosePunctuation:
2820                                 case UnicodeCategory.OpenPunctuation:
2821                                 case UnicodeCategory.InitialQuotePunctuation:
2822                                 case UnicodeCategory.FinalQuotePunctuation:
2823                                 case UnicodeCategory.ModifierSymbol:
2824                                         // SPECIAL CASES: // 0xA
2825                                         if (0x2020 <= i && i <= 0x2031)
2826                                                 continue;
2827                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2828                                         break;
2829                                 default:
2830                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2831                                                 goto case UnicodeCategory.OtherPunctuation;
2832                                         break;
2833                                 }
2834                         }
2835                         // Control pictures
2836                         // FIXME: it should not need to reset level 1, but
2837                         // it's for easy goal.
2838                         fillIndex [0x7] = 0xB6;
2839                         for (int i = 0x2400; i <= 0x2421; i++)
2840                                 AddCharMap ((char) i, 0x7, 1, 0);
2841                         #endregion
2842
2843                         // FIXME: for 07 xx we need more love.
2844
2845                         // Characters w/ diacritical marks (NFKD)
2846                         for (int i = 0; i <= char.MaxValue; i++) {
2847                                 if (map [i].Defined || IsIgnorable (i))
2848                                         continue;
2849                                 if (decompIndex [i] == 0)
2850                                         continue;
2851
2852                                 int start = decompIndex [i];
2853                                 int primaryChar = decompValues [start];
2854                                 int secondary = 0;
2855                                 bool skip = false;
2856                                 int length = decompLength [i];
2857                                 // special processing for parenthesized ones.
2858                                 if (length == 3 &&
2859                                         decompValues [start] == '(' &&
2860                                         decompValues [start + 2] == ')') {
2861                                         primaryChar = decompValues [start + 1];
2862                                         length = 1;
2863                                 }
2864
2865                                 if (map [primaryChar].Level1 == 0)
2866                                         continue;
2867
2868                                 for (int l = 1; l < length; l++) {
2869                                         int c = decompValues [start + l];
2870                                         if (map [c].Level1 != 0)
2871                                                 skip = true;
2872                                         secondary += diacritical [c];
2873                                 }
2874                                 if (skip)
2875                                         continue;
2876                                 map [i] = new CharMapEntry (
2877                                         map [primaryChar].Category,
2878                                         map [primaryChar].Level1,
2879                                         (byte) secondary);
2880
2881                         }
2882
2883                         // category 08 - symbols
2884                         fillIndex [0x8] = 2;
2885                         // Here Windows mapping is not straightforward. It is
2886                         // not based on computation but seems manual sorting.
2887                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
2888                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2889                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2890                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2891                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2892                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2893                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2894                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2895                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2896                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2897                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2898                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2899                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2900
2901                         for (int cp = 0; cp < 0x2300; cp++) {
2902                                 if (cp == 0xAC) // SPECIAL CASE: skip
2903                                         continue;
2904                                 if (cp == 0x200) {
2905                                         cp = 0x2200; // skip to 2200
2906                                         fillIndex [0x8] = 0x21;
2907                                 }
2908                                 if (cp == 0x2295)
2909                                         fillIndex [0x8] = 0x3;
2910                                 if (cp == 0x22B2)
2911                                         fillIndex [0x8] = 0xB9;
2912                                 if (!map [cp].Defined &&
2913 //                                      Char.GetUnicodeCategory ((char) cp) ==
2914 //                                      UnicodeCategory.MathSymbol)
2915                                         Char.IsSymbol ((char) cp))
2916                                         AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
2917                                 // SPECIAL CASES: no idea why Windows sorts as such
2918                                 switch (cp) {
2919                                 case 0x3E:
2920                                         AddCharMap ('\u227B', 0x8, 1, 0);
2921                                         AddCharMap ('\u22B1', 0x8, 1, 0);
2922                                         break;
2923                                 case 0xB1:
2924                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2925                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
2926                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2927                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
2928                                         break;
2929                                 case 0xF7:
2930                                         AddCharMap ('\u01C0', 0x8, 1, 0);
2931                                         AddCharMap ('\u01C1', 0x8, 1, 0);
2932                                         AddCharMap ('\u01C2', 0x8, 1, 0);
2933                                         break;
2934                                 }
2935                         }
2936
2937                         #region Level2 adjustment
2938                         // Arabic Hamzah
2939                         diacritical [0x624] = 0x5;
2940                         diacritical [0x626] = 0x7;
2941                         diacritical [0x622] = 0x9;
2942                         diacritical [0x623] = 0xA;
2943                         diacritical [0x625] = 0xB;
2944                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2945                         diacritical [0x64A] = 0x7; // Yaa'
2946
2947                         for (int i = 0; i < char.MaxValue; i++) {
2948                                 byte mod = 0;
2949                                 byte cat = map [i].Category;
2950                                 switch (cat) {
2951                                 case 0xE: // Latin diacritics
2952                                 case 0x22: // Japanese: circled characters
2953                                         mod = diacritical [i];
2954                                         break;
2955                                 case 0x13: // Arabic
2956                                         if (diacritical [i] == 0 && i >= 0xFE8D)
2957                                                 mod = 0x8; // default for arabic
2958                                         break;
2959                                 }
2960                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2961                                         mod = diacritical [i];
2962                                 if (mod > 0)
2963                                         map [i] = new CharMapEntry (
2964                                                 cat, map [i].Level1, mod);
2965                         }
2966                         #endregion
2967
2968                         // FIXME: this is hack but those NonSpacingMark
2969                         // characters and still undefined are likely to
2970                         // be nonspacing.
2971                         for (int i = 0; i < char.MaxValue; i++)
2972                                 if (!map [i].Defined &&
2973                                         !IsIgnorable (i) &&
2974                                         Char.GetUnicodeCategory ((char) i) ==
2975                                         UnicodeCategory.NonSpacingMark)
2976                                         AddCharMap ((char) i, 1, 1);
2977
2978                         // FIXME: this is hack but those Symbol characters
2979                         // are likely to fall into 0xA category.
2980                         for (int i = 0; i < char.MaxValue; i++)
2981                                 if (!map [i].Defined &&
2982                                         !IsIgnorable (i) &&
2983                                         Char.IsSymbol ((char) i))
2984                                         AddCharMap ((char) i, 0xA, 1);
2985                 }
2986
2987                 private void IncrementSequentialIndex (ref byte hangulCat)
2988                 {
2989                         fillIndex [hangulCat]++;
2990                         if (fillIndex [hangulCat] == 0) { // overflown
2991                                 hangulCat++;
2992                                 fillIndex [hangulCat] = 0x2;
2993                         }
2994                 }
2995
2996                 // Reset fillIndex to fixed value and call AddLetterMap().
2997                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2998                 {
2999                         fillIndex [category] = alphaWeight;
3000                         AddLetterMap (c, category, 0);
3001
3002                         ArrayList al = latinMap [c] as ArrayList;
3003                         if (al == null)
3004                                 return;
3005
3006                         foreach (int cp in al)
3007                                 AddLetterMap ((char) cp, category, 0);
3008                 }
3009
3010                 private void AddKanaMap (int i, byte voices)
3011                 {
3012                         for (byte b = 0; b < voices; b++) {
3013                                 char c = (char) (i + b);
3014                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
3015                                 // Hiragana
3016                                 AddLetterMapCore (c, 0x22, 0, arg);
3017                                 // Katakana
3018                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
3019                         }
3020                 }
3021
3022                 private void AddLetterMap (char c, byte category, byte updateCount)
3023                 {
3024                         AddLetterMapCore (c, category, updateCount, 0);
3025                 }
3026
3027                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
3028                 {
3029                         char c2;
3030                         // <small> updates index
3031                         c2 = ToSmallForm (c);
3032                         if (c2 != c)
3033                                 AddCharMapGroup (c2, category, updateCount, level2);
3034                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3035                         if (c2 != c && !map [(int) c2].Defined)
3036                                 AddLetterMapCore (c2, category, 0, level2);
3037                         bool doUpdate = true;
3038                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3039                                 doUpdate = false;
3040                         else
3041                                 AddCharMapGroup (c, category, 0, level2);
3042                         if (doUpdate)
3043                                 fillIndex [category] += updateCount;
3044                 }
3045
3046                 private bool AddCharMap (char c, byte category, byte increment)
3047                 {
3048                         return AddCharMap (c, category, increment, 0);
3049                 }
3050
3051                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3052                 {
3053                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3054                                 return false; // do nothing
3055                         map [(int) c] = new CharMapEntry (category,
3056                                 category == 1 ? alt : fillIndex [category],
3057                                 category == 1 ? fillIndex [category] : alt);
3058                         fillIndex [category] += increment;
3059                         return true;
3060                 }
3061
3062                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
3063                 {
3064                         char c2 = ToSmallFormTail (c);
3065                         if (c2 != c)
3066                                 AddCharMap (c2, category, updateCount, 0);
3067                         // itself
3068                         AddCharMap (c, category, updateCount, 0);
3069                         // <full>
3070                         c2 = ToFullWidthTail (c);
3071                         if (c2 != c)
3072                                 AddCharMapGroupTail (c2, category, updateCount);
3073                 }
3074
3075                 //
3076                 // Adds characters to table in the order below
3077                 // (+ increases weight):
3078                 //      (<small> +)
3079                 //      itself
3080                 //      <fraction>
3081                 //      <full> | <super> | <sub>
3082                 //      <circle> | <wide> (| <narrow>)
3083                 //      +
3084                 //      (vertical +)
3085                 //
3086                 // level2 is fixed (does not increase).
3087                 int [] sameWeightItems = new int [] {
3088                         DecompositionFraction,
3089                         DecompositionFull,
3090                         DecompositionSuper,
3091                         DecompositionSub,
3092                         DecompositionCircle,
3093                         DecompositionWide,
3094                         DecompositionNarrow,
3095                         };
3096                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3097                 {
3098                         if (map [(int) c].Defined)
3099                                 return;
3100
3101                         char small = char.MinValue;
3102                         char vertical = char.MinValue;
3103                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3104                         if (nfkd != null) {
3105                                 object smv = nfkd [(byte) DecompositionSmall];
3106                                 if (smv != null)
3107                                         small = (char) ((int) smv);
3108                                 object vv = nfkd [(byte) DecompositionVertical];
3109                                 if (vv != null)
3110                                         vertical = (char) ((int) vv);
3111                         }
3112
3113                         // <small> updates index
3114                         if (small != char.MinValue)
3115                                 AddCharMap (small, category, updateCount);
3116
3117                         // itself
3118                         AddCharMap (c, category, 0, level2);
3119
3120                         if (nfkd != null) {
3121                                 foreach (int weight in sameWeightItems) {
3122                                         object wv = nfkd [(byte) weight];
3123                                         if (wv != null)
3124                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
3125                                 }
3126                         }
3127
3128                         // update index here.
3129                         fillIndex [category] += updateCount;
3130
3131                         if (vertical != char.MinValue)
3132                                 AddCharMap (vertical, category, updateCount, level2);
3133                 }
3134
3135                 private void AddCharMapCJK (char c, ref byte category)
3136                 {
3137                         AddCharMap (c, category, 0, 0);
3138                         IncrementSequentialIndex (ref category);
3139
3140                         // Special. I wonder why but Windows skips 9E F9.
3141                         if (category == 0x9E && fillIndex [category] == 0xF9)
3142                                 IncrementSequentialIndex (ref category);
3143                 }
3144
3145                 private void AddCharMapGroupCJK (char c, ref byte category)
3146                 {
3147                         AddCharMapCJK (c, ref category);
3148
3149                         // LAMESPEC: see below.
3150                         if (c == '\u5B78') {
3151                                 AddCharMapCJK ('\u32AB', ref category);
3152                                 AddCharMapCJK ('\u323B', ref category);
3153                         }
3154                         if (c == '\u52DE') {
3155                                 AddCharMapCJK ('\u3298', ref category);
3156                                 AddCharMapCJK ('\u3238', ref category);
3157                         }
3158                         if (c == '\u5BEB')
3159                                 AddCharMapCJK ('\u32A2', ref category);
3160                         if (c == '\u91AB')
3161                                 // Especially this mapping order totally does
3162                                 // not make sense to me.
3163                                 AddCharMapCJK ('\u32A9', ref category);
3164
3165                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3166                         if (nfkd == null)
3167                                 return;
3168                         for (byte weight = 0; weight <= 0x12; weight++) {
3169                                 object wv = nfkd [weight];
3170                                 if (wv == null)
3171                                         continue;
3172                                 int w = (int) wv;
3173
3174                                 // Special: they are ignored in this area.
3175                                 // FIXME: check if it is sane
3176                                 if (0xF900 <= w && w <= 0xFAD9)
3177                                         continue;
3178                                 // LAMESPEC: on Windows some of CJK characters
3179                                 // in 3200-32B0 are incorrectly mapped. They
3180                                 // mix Chinise and Japanese Kanji when
3181                                 // ordering those characters.
3182                                 switch (w) {
3183                                 case 0x32A2: case 0x3298: case 0x3238:
3184                                 case 0x32A9: case 0x323B: case 0x32AB:
3185                                         continue;
3186                                 }
3187
3188                                 AddCharMapCJK ((char) w, ref category);
3189                         }
3190                 }
3191
3192                 // For now it is only for 0x7 category.
3193                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3194                 {
3195                         char small = char.MinValue;
3196                         char vertical = char.MinValue;
3197                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3198                         if (nfkd != null) {
3199                                 object smv = nfkd [(byte) DecompositionSmall];
3200                                 if (smv != null)
3201                                         small = (char) ((int) smv);
3202                                 object vv = nfkd [(byte) DecompositionVertical];
3203                                 if (vv != null)
3204                                         vertical = (char) ((int) vv);
3205                         }
3206
3207                         // <small> updates index
3208                         if (small != char.MinValue)
3209                                 // SPECIAL CASE excluded (FIXME: why?)
3210                                 if (small != '\u2024')
3211                                         AddCharMap (small, category, updateCount);
3212
3213                         // itself
3214                         AddCharMap (c, category, updateCount, level2);
3215
3216                         // Since nfkdMap is problematic to have two or more
3217                         // NFKD to an identical character, here I iterate all.
3218                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3219                                 if (decompLength [c2] == 1 &&
3220                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3221                                         switch (decompType [c2]) {
3222                                         case DecompositionCompat:
3223                                                 AddCharMap ((char) c2, category, updateCount, level2);
3224                                                 break;
3225                                         }
3226                                 }
3227                         }
3228
3229                         if (vertical != char.MinValue)
3230                                 // SPECIAL CASE excluded (FIXME: why?)
3231                                 if (vertical != '\uFE33' && vertical != '\uFE34')
3232                                         AddCharMap (vertical, category, updateCount, level2);
3233                 }
3234
3235                 private void AddArabicCharMap (char c)
3236                 {
3237                         byte category = 6;
3238                         byte updateCount = 1;
3239                         byte level2 = 0;
3240
3241                         // itself
3242                         AddCharMap (c, category, 0, level2);
3243
3244                         // Since nfkdMap is problematic to have two or more
3245                         // NFKD to an identical character, here I iterate all.
3246                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3247                                 if (decompLength [c2] == 0)
3248                                         continue;
3249                                 int idx = decompIndex [c2] + decompLength [c2] - 1;
3250                                 if ((int) (decompValues [idx]) == (int) c)
3251                                         AddCharMap ((char) c2, category,
3252                                                 0, level2);
3253                         }
3254                         fillIndex [category] += updateCount;
3255                 }
3256
3257                 char ToFullWidth (char c)
3258                 {
3259                         return ToDecomposed (c, DecompositionFull, false);
3260                 }
3261
3262                 char ToFullWidthTail (char c)
3263                 {
3264                         return ToDecomposed (c, DecompositionFull, true);
3265                 }
3266
3267                 char ToSmallForm (char c)
3268                 {
3269                         return ToDecomposed (c, DecompositionSmall, false);
3270                 }
3271
3272                 char ToSmallFormTail (char c)
3273                 {
3274                         return ToDecomposed (c, DecompositionSmall, true);
3275                 }
3276
3277                 char ToDecomposed (char c, byte d, bool tail)
3278                 {
3279                         if (decompType [(int) c] != d)
3280                                 return c;
3281                         int idx = decompIndex [(int) c];
3282                         if (tail)
3283                                 idx += decompLength [(int) c] - 1;
3284                         return (char) decompValues [idx];
3285                 }
3286
3287                 bool ExistsJIS (int cp)
3288                 {
3289                         foreach (JISCharacter j in jisJapanese)
3290                                 if (j.CP == cp)
3291                                         return true;
3292                         return false;
3293                 }
3294
3295                 #endregion
3296
3297                 #region Level 3 properties (Case/Width)
3298
3299                 private byte ComputeLevel3Weight (char c)
3300                 {
3301                         byte b = ComputeLevel3WeightRaw (c);
3302                         return b > 0 ? (byte) (b + 2) : b;
3303                 }
3304
3305                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3306                 {
3307                         // CJK compat
3308                         if ('\u3192' <= c && c <= '\u319F')
3309                                 return 0;
3310                         // Japanese reading marks
3311                         if (c == '\u3001' || c == '\u3002')
3312                                 return 2;
3313                         // Korean
3314                         if ('\u11A8' <= c && c <= '\u11F9')
3315                                 return 2;
3316                         if ('\uFFA0' <= c && c <= '\uFFDC')
3317                                 return 4;
3318                         if ('\u3130' <= c && c <= '\u3164')
3319                                 return 5;
3320                         if ('\u3165' <= c && c <= '\u318E')
3321                                 return 4;
3322                         // Georgian Capital letters
3323                         if ('\u10A0' <= c && c <= '\u10C5')
3324                                 return 0x10;
3325                         // numbers
3326                         if ('\u2776' <= c && c <= '\u277F')
3327                                 return 4;
3328                         if ('\u2780' <= c && c <= '\u2789')
3329                                 return 8;
3330                         if ('\u2776' <= c && c <= '\u2793')
3331                                 return 0xC;
3332                         if ('\u2160' <= c && c <= '\u216F')
3333                                 return 0x10;
3334                         if ('\u2181' <= c && c <= '\u2182')
3335                                 return 0x18;
3336                         // Arabic
3337                         if ('\u2135' <= c && c <= '\u2138')
3338                                 return 4;
3339                         if ('\uFE80' <= c && c < '\uFF00') {
3340                                 // 2(Isolated)/8(Final)/0x18(Medial)
3341                                 switch (decompType [(int) c]) {
3342                                 case DecompositionIsolated:
3343                                         return 2;
3344                                 case DecompositionFinal:
3345                                         return 8;
3346                                 case DecompositionMedial:
3347                                         return 0x18;
3348                                 }
3349                         }
3350
3351                         // actually I dunno the reason why they have weights.
3352                         switch (c) {
3353                         case '\u01BC':
3354                                 return 0x10;
3355                         case '\u06A9':
3356                                 return 0x20;
3357                         case '\u06AA':
3358                                 return 0x28;
3359                         }
3360
3361                         byte ret = 0;
3362                         switch (c) {
3363                         case '\u03C2':
3364                         case '\u2104':
3365                         case '\u212B':
3366                                 ret |= 8;
3367                                 break;
3368                         case '\uFE42':
3369                                 ret |= 0xC;
3370                                 break;
3371                         }
3372
3373                         // misc
3374                         switch (decompType [(int) c]) {
3375                         case DecompositionWide: // <wide>
3376                         case DecompositionSub: // <sub>
3377                         case DecompositionSuper: // <super>
3378                                 ret |= decompType [(int) c];
3379                                 break;
3380                         }
3381                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3382                                 ret |= 8;
3383                         if (isUppercase [(int) c]) // DerivedCoreProperties
3384                                 ret |= 0x10;
3385
3386                         return ret;
3387                 }
3388
3389                 #endregion
3390
3391                 #region IsIgnorable
3392 /*
3393                 static bool IsIgnorable (int i)
3394                 {
3395                         if (unicodeAge [i] >= 3.1)
3396                                 return true;
3397                         switch (char.GetUnicodeCategory ((char) i)) {
3398                         case UnicodeCategory.OtherNotAssigned:
3399                         case UnicodeCategory.Format:
3400                                 return true;
3401                         }
3402                         return false;
3403                 }
3404 */
3405
3406                 // FIXME: In the future use DerivedAge.txt to examine character
3407                 // versions and set those ones that have higher version than
3408                 // 1.0 as ignorable.
3409                 static bool IsIgnorable (int i)
3410                 {
3411                         switch (i) {
3412                         case 0:
3413                         // I guess, those characters are added between
3414                         // Unicode 1.0 (LCMapString) and Unicode 3.1
3415                         // (UnicodeCategory), so they used to be
3416                         // something like OtherNotAssigned as of Unicode 1.1.
3417                         case 0x2df: case 0x387:
3418                         case 0x3d7: case 0x3d8: case 0x3d9:
3419                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3420                         case 0x400: case 0x40d: case 0x450: case 0x45d:
3421                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
3422                         case 0x653: case 0x654: case 0x655: case 0x66d:
3423                         case 0xb56:
3424                         case 0x1e9b: case 0x202f: case 0x20ad:
3425                         case 0x20ae: case 0x20af:
3426                         case 0x20e2: case 0x20e3:
3427                         case 0x2139: case 0x213a: case 0x2183:
3428                         case 0x2425: case 0x2426: case 0x2619:
3429                         case 0x2670: case 0x2671: case 0x3007:
3430                         case 0x3190: case 0x3191:
3431                         case 0xfffc: case 0xfffd:
3432                                 return true;
3433                         // exceptional characters filtered by the
3434                         // following conditions. Originally those exceptional
3435                         // ranges are incorrect (they should not be ignored)
3436                         // and most of those characters are unfortunately in
3437                         // those ranges.
3438                         case 0x4d8: case 0x4d9:
3439                         case 0x4e8: case 0x4e9:
3440                         case 0x70F:
3441                         case 0x3036: case 0x303f:
3442                         case 0x337b: case 0xfb1e:
3443                                 return false;
3444                         }
3445
3446                         if (
3447                                 // The whole Sinhala characters.
3448                                 0x0D82 <= i && i <= 0x0DF4
3449                                 // The whole Tibetan characters.
3450                                 || 0x0F00 <= i && i <= 0x0FD1
3451                                 // The whole Myanmar characters.
3452                                 || 0x1000 <= i && i <= 0x1059
3453                                 // The whole Etiopic, Cherokee,
3454                                 // Canadian Syllablic, Ogham, Runic,
3455                                 // Tagalog, Hanunoo, Philippine,
3456                                 // Buhid, Tagbanwa, Khmer and Mongorian
3457                                 // characters.
3458                                 || 0x1200 <= i && i <= 0x1DFF
3459                                 // Greek extension characters.
3460                                 || 0x1F00 <= i && i <= 0x1FFF
3461                                 // The whole Braille characters.
3462                                 || 0x2800 <= i && i <= 0x28FF
3463                                 // CJK radical characters.
3464                                 || 0x2E80 <= i && i <= 0x2EF3
3465                                 // Kangxi radical characters.
3466                                 || 0x2F00 <= i && i <= 0x2FD5
3467                                 // Ideographic description characters.
3468                                 || 0x2FF0 <= i && i <= 0x2FFB
3469                                 // Bopomofo letter and final
3470                                 || 0x31A0 <= i && i <= 0x31B7
3471                                 // White square with quadrant characters.
3472                                 || 0x25F0 <= i && i <= 0x25F7
3473                                 // Ideographic telegraph symbols.
3474                                 || 0x32C0 <= i && i <= 0x32CB
3475                                 || 0x3358 <= i && i <= 0x3370
3476                                 || 0x33E0 <= i && i <= 0x33FF
3477                                 // The whole YI characters.
3478                                 || 0xA000 <= i && i <= 0xA48C
3479                                 || 0xA490 <= i && i <= 0xA4C6
3480                                 // American small ligatures
3481                                 || 0xFB13 <= i && i <= 0xFB17
3482                                 // hebrew, arabic, variation selector.
3483                                 || 0xFB1D <= i && i <= 0xFE2F
3484                                 // Arabic ligatures.
3485                                 || 0xFEF5 <= i && i <= 0xFEFC
3486                                 // FIXME: why are they excluded?
3487                                 || 0x01F6 <= i && i <= 0x01F9
3488                                 || 0x0218 <= i && i <= 0x0233
3489                                 || 0x02A9 <= i && i <= 0x02AD
3490                                 || 0x02EA <= i && i <= 0x02EE
3491                                 || 0x0349 <= i && i <= 0x036F
3492                                 || 0x0488 <= i && i <= 0x048F
3493                                 || 0x04D0 <= i && i <= 0x04FF
3494                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3495                                 || 0x06D6 <= i && i <= 0x06ED
3496                                 || 0x06FA <= i && i <= 0x06FE
3497                                 || 0x2048 <= i && i <= 0x204D
3498                                 || 0x20e4 <= i && i <= 0x20ea
3499                                 || 0x213C <= i && i <= 0x214B
3500                                 || 0x21EB <= i && i <= 0x21FF
3501                                 || 0x22F2 <= i && i <= 0x22FF
3502                                 || 0x237B <= i && i <= 0x239A
3503                                 || 0x239B <= i && i <= 0x23CF
3504                                 || 0x24EB <= i && i <= 0x24FF
3505                                 || 0x2596 <= i && i <= 0x259F
3506                                 || 0x25F8 <= i && i <= 0x25FF
3507                                 || 0x2672 <= i && i <= 0x2689
3508                                 || 0x2768 <= i && i <= 0x2775
3509                                 || 0x27d0 <= i && i <= 0x27ff
3510                                 || 0x2900 <= i && i <= 0x2aff
3511                                 || 0x3033 <= i && i <= 0x303F
3512                                 || 0x31F0 <= i && i <= 0x31FF
3513                                 || 0x3250 <= i && i <= 0x325F
3514                                 || 0x32B1 <= i && i <= 0x32BF
3515                                 || 0x3371 <= i && i <= 0x337B
3516                                 || 0xFA30 <= i && i <= 0xFA6A
3517                         )
3518                                 return true;
3519
3520                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3521                         switch (uc) {
3522                         case UnicodeCategory.PrivateUse:
3523                         case UnicodeCategory.Surrogate:
3524                                 return false;
3525                         // ignored by nature
3526                         case UnicodeCategory.Format:
3527                         case UnicodeCategory.OtherNotAssigned:
3528                                 return true;
3529                         default:
3530                                 return false;
3531                         }
3532                 }
3533
3534                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3535
3536                 /*
3537                 public static void Main ()
3538                 {
3539                         for (int i = 0; i <= char.MaxValue; i++)
3540                                 Dump (i, IsIgnorable (i));
3541                 }
3542
3543                 static void Dump (int i, bool ignore)
3544                 {
3545                         switch (Char.GetUnicodeCategory ((char) i)) {
3546                         case UnicodeCategory.PrivateUse:
3547                         case UnicodeCategory.Surrogate:
3548                                 return; // check nothing
3549                         }
3550
3551                         string s1 = "";
3552                         string s2 = new string ((char) i, 10);
3553                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3554                         if ((ret == 0) == ignore)
3555                                 return;
3556                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3557                 }
3558                 */
3559                 #endregion // IsIgnorable
3560
3561                 #region IsIgnorableSymbol
3562                 static bool IsIgnorableSymbol (int i)
3563                 {
3564                         if (IsIgnorable (i))
3565                                 return true;
3566
3567                         switch (i) {
3568                         // *Letter
3569                         case 0x00b5: case 0x01C0: case 0x01C1:
3570                         case 0x01C2: case 0x01C3: case 0x01F6:
3571                         case 0x01F7: case 0x01F8: case 0x01F9:
3572                         case 0x02D0: case 0x02EE: case 0x037A:
3573                         case 0x03D7: case 0x03F3:
3574                         case 0x0400: case 0x040d:
3575                         case 0x0450: case 0x045d:
3576                         case 0x048C: case 0x048D:
3577                         case 0x048E: case 0x048F:
3578                         case 0x0587: case 0x0640: case 0x06E5:
3579                         case 0x06E6: case 0x06FA: case 0x06FB:
3580                         case 0x06FC: case 0x093D: case 0x0950:
3581                         case 0x1E9B: case 0x2139: case 0x3006:
3582                         case 0x3033: case 0x3034: case 0x3035:
3583                         case 0xFE7E: case 0xFE7F:
3584                         // OtherNumber
3585                         case 0x16EE: case 0x16EF: case 0x16F0:
3586                         // LetterNumber
3587                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3588                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3589                         case 0x3038: // HANGZHOU NUMERAL TEN
3590                         case 0x3039: // HANGZHOU NUMERAL TWENTY
3591                         case 0x303a: // HANGZHOU NUMERAL THIRTY
3592                         // OtherSymbol
3593                         case 0x2117:
3594                         case 0x327F:
3595                                 return true;
3596                         // ModifierSymbol
3597                         case 0x02B9: case 0x02BA: case 0x02C2:
3598                         case 0x02C3: case 0x02C4: case 0x02C5:
3599                         case 0x02C8: case 0x02CC: case 0x02CD:
3600                         case 0x02CE: case 0x02CF: case 0x02D2:
3601                         case 0x02D3: case 0x02D4: case 0x02D5:
3602                         case 0x02D6: case 0x02D7: case 0x02DE:
3603                         case 0x02E5: case 0x02E6: case 0x02E7:
3604                         case 0x02E8: case 0x02E9:
3605                         case 0x309B: case 0x309C:
3606                         // OtherPunctuation
3607                         case 0x055A: // American Apos
3608                         case 0x05C0: // Hebrew Punct
3609                         case 0x0E4F: // Thai FONGMAN
3610                         case 0x0E5A: // Thai ANGKHANKHU
3611                         case 0x0E5B: // Thai KHOMUT
3612                         // CurencySymbol
3613                         case 0x09F2: // Bengali Rupee Mark
3614                         case 0x09F3: // Bengali Rupee Sign
3615                         // MathSymbol
3616                         case 0x221e: // INF.
3617                         // OtherSymbol
3618                         case 0x0482:
3619                         case 0x09FA:
3620                         case 0x0B70:
3621                                 return false;
3622                         }
3623
3624                         // *Letter
3625                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3626 #if NET_2_0
3627                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3628                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3629 #endif
3630                         )
3631                                 return true;
3632
3633                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3634                         switch (uc) {
3635                         case UnicodeCategory.Surrogate:
3636                                 return false; // inconsistent
3637
3638                         case UnicodeCategory.SpacingCombiningMark:
3639                         case UnicodeCategory.EnclosingMark:
3640                         case UnicodeCategory.NonSpacingMark:
3641                         case UnicodeCategory.PrivateUse:
3642                                 // NonSpacingMark
3643                                 if (0x064B <= i && i <= 0x0652) // Arabic
3644                                         return true;
3645                                 return false;
3646
3647                         case UnicodeCategory.Format:
3648                         case UnicodeCategory.OtherNotAssigned:
3649                                 return true;
3650
3651                         default:
3652                                 bool use = false;
3653                                 // OtherSymbols
3654                                 if (
3655                                         // latin in a circle
3656                                         0x249A <= i && i <= 0x24E9
3657                                         || 0x2100 <= i && i <= 0x2132
3658                                         // Japanese
3659                                         || 0x3196 <= i && i <= 0x31A0
3660                                         // Korean
3661                                         || 0x3200 <= i && i <= 0x321C
3662                                         // Chinese/Japanese
3663                                         || 0x322A <= i && i <= 0x3243
3664                                         // CJK
3665                                         || 0x3260 <= i && i <= 0x32B0
3666                                         || 0x32D0 <= i && i <= 0x3357
3667                                         || 0x337B <= i && i <= 0x33DD
3668                                 )
3669                                         use = !Char.IsLetterOrDigit ((char) i);
3670                                 if (use)
3671                                         return false;
3672
3673                                 // This "Digit" rule is mystery.
3674                                 // It filters some symbols out.
3675                                 if (Char.IsLetterOrDigit ((char) i))
3676                                         return false;
3677                                 if (Char.IsNumber ((char) i))
3678                                         return false;
3679                                 if (Char.IsControl ((char) i)
3680                                         || Char.IsSeparator ((char) i)
3681                                         || Char.IsPunctuation ((char) i))
3682                                         return true;
3683                                 if (Char.IsSymbol ((char) i))
3684                                         return true;
3685
3686                                 // FIXME: should check more
3687                                 return false;
3688                         }
3689                 }
3690
3691                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3692 /*
3693                 public static void Main ()
3694                 {
3695                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3696                         for (int i = 0; i <= char.MaxValue; i++) {
3697                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3698                                 if (uc == UnicodeCategory.Surrogate)
3699                                         continue;
3700
3701                                 bool ret = IsIgnorableSymbol (i);
3702
3703                                 string s1 = "TEST ";
3704                                 string s2 = "TEST " + (char) i;
3705
3706                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3707
3708                                 if (ret != (result == 0))
3709                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3710                                                 ret ? "should not ignore" :
3711                                                         "should ignore",
3712                                                 i,(char) i, uc);
3713                         }
3714                 }
3715 */
3716                 #endregion
3717
3718                 #region NonSpacing
3719                 static bool IsIgnorableNonSpacing (int i)
3720                 {
3721                         if (IsIgnorable (i))
3722                                 return true;
3723
3724                         switch (i) {
3725                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3726                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3727                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3728                                 return true;
3729                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3730                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3731                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3732                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3733                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3734                         case 0x0CCD: case 0x0E4E:
3735                                 return false;
3736                         }
3737
3738                         if (0x02b9 <= i && i <= 0x02c5
3739                                 || 0x02cc <= i && i <= 0x02d7
3740                                 || 0x02e4 <= i && i <= 0x02ef
3741                                 || 0x20DD <= i && i <= 0x20E0
3742                         )
3743                                 return true;
3744
3745                         if (0x064B <= i && i <= 0x00652
3746                                 || 0x0941 <= i && i <= 0x0948
3747                                 || 0x0AC1 <= i && i <= 0x0ACD
3748                                 || 0x0C3E <= i && i <= 0x0C4F
3749                                 || 0x0E31 <= i && i <= 0x0E3F
3750                         )
3751                                 return false;
3752
3753                         return Char.GetUnicodeCategory ((char) i) ==
3754                                 UnicodeCategory.NonSpacingMark;
3755                 }
3756
3757                 // We can reuse IsIgnorableSymbol testcode
3758                 // for IsIgnorableNonSpacing.
3759                 #endregion
3760         }
3761
3762         struct CharMapEntry
3763         {
3764                 public byte Category;
3765                 public byte Level1;
3766                 public byte Level2; // It is always single byte.
3767                 public bool Defined;
3768
3769                 public CharMapEntry (byte category, byte level1, byte level2)
3770                 {
3771                         Category = category;
3772                         Level1 = level1;
3773                         Level2 = level2;
3774                         Defined = true;
3775                 }
3776         }
3777
3778         class JISCharacter
3779         {
3780                 public readonly int CP;
3781                 public readonly int JIS;
3782
3783                 public JISCharacter (int cp, int cpJIS)
3784                 {
3785                         CP = cp;
3786                         JIS = cpJIS;
3787                 }
3788         }
3789
3790         class JISComparer : IComparer
3791         {
3792                 public static readonly JISComparer Instance =
3793                         new JISComparer ();
3794
3795                 public int Compare (object o1, object o2)
3796                 {
3797                         JISCharacter j1 = (JISCharacter) o1;
3798                         JISCharacter j2 = (JISCharacter) o2;
3799                         return j1.JIS - j2.JIS;
3800                 }
3801         }
3802
3803         class NonJISCharacter
3804         {
3805                 public readonly int CP;
3806                 public readonly string Name;
3807
3808                 public NonJISCharacter (int cp, string name)
3809                 {
3810                         CP = cp;
3811                         Name = name;
3812                 }
3813         }
3814
3815         class NonJISComparer : IComparer
3816         {
3817                 public static readonly NonJISComparer Instance =
3818                         new NonJISComparer ();
3819
3820                 public int Compare (object o1, object o2)
3821                 {
3822                         NonJISCharacter j1 = (NonJISCharacter) o1;
3823                         NonJISCharacter j2 = (NonJISCharacter) o2;
3824                         return string.CompareOrdinal (j1.Name, j2.Name);
3825                 }
3826         }
3827
3828         class DecimalDictionaryValueComparer : IComparer
3829         {
3830                 public static readonly DecimalDictionaryValueComparer Instance
3831                         = new DecimalDictionaryValueComparer ();
3832
3833                 private DecimalDictionaryValueComparer ()
3834                 {
3835                 }
3836
3837                 public int Compare (object o1, object o2)
3838                 {
3839                         DictionaryEntry e1 = (DictionaryEntry) o1;
3840                         DictionaryEntry e2 = (DictionaryEntry) o2;
3841                         // FIXME: in case of 0, compare decomposition categories
3842                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3843                         if (ret != 0)
3844                                 return ret;
3845                         int i1 = (int) e1.Key;
3846                         int i2 = (int) e2.Key;
3847                         return i1 - i2;
3848                 }
3849         }
3850
3851         class StringDictionaryValueComparer : IComparer
3852         {
3853                 public static readonly StringDictionaryValueComparer Instance
3854                         = new StringDictionaryValueComparer ();
3855
3856                 private StringDictionaryValueComparer ()
3857                 {
3858                 }
3859
3860                 public int Compare (object o1, object o2)
3861                 {
3862                         DictionaryEntry e1 = (DictionaryEntry) o1;
3863                         DictionaryEntry e2 = (DictionaryEntry) o2;
3864                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3865                         if (ret != 0)
3866                                 return ret;
3867                         int i1 = (int) e1.Key;
3868                         int i2 = (int) e2.Key;
3869                         return i1 - i2;
3870                 }
3871         }
3872
3873         class UCAComparer : IComparer
3874         {
3875                 public static readonly UCAComparer Instance
3876                         = new UCAComparer ();
3877
3878                 private UCAComparer ()
3879                 {
3880                 }
3881
3882                 public int Compare (object o1, object o2)
3883                 {
3884                         char i1 = (char) o1;
3885                         char i2 = (char) o2;
3886
3887                         int l1 = CollationElementTable.GetSortKeyCount (i1);
3888                         int l2 = CollationElementTable.GetSortKeyCount (i2);
3889                         int l = l1 > l2 ? l2 : l1;
3890
3891                         for (int i = 0; i < l; i++) {
3892                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3893                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3894                                 int v = k1.Primary - k2.Primary;
3895                                 if (v != 0)
3896                                         return v;
3897                                 v = k1.Secondary - k2.Secondary;
3898                                 if (v != 0)
3899                                         return v;
3900                                 v = k1.Thirtiary - k2.Thirtiary;
3901                                 if (v != 0)
3902                                         return v;
3903                                 v = k1.Quarternary - k2.Quarternary;
3904                                 if (v != 0)
3905                                         return v;
3906                         }
3907                         return l1 - l2;
3908                 }
3909         }
3910
// Holds the tailoring entries registered for one LCID and flattens them
// into a single char array.
//
// Each entry is serialized as a kind marker followed by its payload:
//   0x01 (SortKeyMap):     source chars, 0, four sort key values, 0
//   0x02 (DiacriticalMap): target, replace
//   0x03 (ReplaceMap):     source chars, 0, replacement chars, 0
class Tailoring
{
        readonly int lcid;
        readonly int alias;
        readonly ArrayList items = new ArrayList ();
        bool frenchSort;

        // Creates a tailoring whose Alias is 0.
        public Tailoring (int lcid)
                : this (lcid, 0)
        {
        }

        // Creates a tailoring for lcid carrying the given alias value.
        public Tailoring (int lcid, int alias)
        {
                this.lcid = lcid;
                this.alias = alias;
        }

        // The LCID passed to the constructor.
        public int LCID {
                get { return lcid; }
        }

        // The alias passed to the constructor (0 when none was given).
        public int Alias {
                get { return alias; }
        }

        // Plain read/write flag; it is not part of ItemToCharArray() output.
        public bool FrenchSort {
                get { return frenchSort; }
                set { frenchSort = value; }
        }

        // Appends a DiacriticalMap entry (target -> replace).
        public void AddDiacriticalMap (byte target, byte replace)
        {
                ITailoringMap entry = new DiacriticalMap (target, replace);
                items.Add (entry);
        }

        // Appends a SortKeyMap entry; the first four elements of sortkey
        // are the ones that get serialized.
        public void AddSortKeyMap (string source, byte [] sortkey)
        {
                ITailoringMap entry = new SortKeyMap (source, sortkey);
                items.Add (entry);
        }

        // Appends a ReplacementMap entry (source -> replace).
        public void AddReplacementMap (string source, string replace)
        {
                ITailoringMap entry = new ReplacementMap (source, replace);
                items.Add (entry);
        }

        // Serializes every entry, in the order it was added, and returns
        // the concatenation. Returns an empty array when nothing was added.
        public char [] ItemToCharArray ()
        {
                // First pass: serialize each entry and learn the total size.
                char [][] chunks = new char [items.Count][];
                int total = 0;
                for (int n = 0; n < chunks.Length; n++) {
                        chunks [n] = ((ITailoringMap) items [n]).ToCharArray ();
                        total += chunks [n].Length;
                }

                // Second pass: lay the chunks out back to back.
                char [] flat = new char [total];
                int offset = 0;
                foreach (char [] chunk in chunks) {
                        chunk.CopyTo (flat, offset);
                        offset += chunk.Length;
                }
                return flat;
        }

        // Common shape of a tailoring entry: it knows how to serialize itself.
        interface ITailoringMap
        {
                char [] ToCharArray ();
        }

        // Entry that pairs a target byte with its replacement byte.
        class DiacriticalMap : ITailoringMap
        {
                public readonly byte Target;
                public readonly byte Replace;

                public DiacriticalMap (byte target, byte replace)
                {
                        Target = target;
                        Replace = replace;
                }

                // Layout: [0x02, Target, Replace].
                public char [] ToCharArray ()
                {
                        return new char [] {
                                (char) 2, // kind:DiacriticalMap
                                (char) Target,
                                (char) Replace
                        };
                }
        }

        // Entry that pairs a source string with explicit sort key bytes.
        class SortKeyMap : ITailoringMap
        {
                public readonly string Source;
                public readonly byte [] SortKey;

                public SortKeyMap (string source, byte [] sortkey)
                {
                        Source = source;
                        SortKey = sortkey;
                }

                // Layout: [0x01, Source..., 0, SortKey[0..3], 0].
                public char [] ToCharArray ()
                {
                        int srcLen = Source.Length;
                        // kind + source + terminator + 4 key values + one
                        // trailing slot that is never written (stays 0).
                        char [] buf = new char [srcLen + 7];
                        buf [0] = (char) 1; // kind:SortKeyMap
                        Source.CopyTo (0, buf, 1, srcLen);
                        // buf [srcLen + 1] is left at 0 to terminate the source.
                        int keyStart = srcLen + 2;
                        for (int k = 0; k < 4; k++)
                                buf [keyStart + k] = (char) SortKey [k];
                        return buf;
                }
        }

        // Entry that pairs a source string with a replacement string.
        class ReplacementMap : ITailoringMap
        {
                public readonly string Source;
                public readonly string Replace;

                public ReplacementMap (string source, string replace)
                {
                        Source = source;
                        Replace = replace;
                }

                // Layout: [0x03, Source..., 0, Replace..., 0].
                public char [] ToCharArray ()
                {
                        int srcLen = Source.Length;
                        int repLen = Replace.Length;
                        // kind + source + terminator + replacement + terminator
                        char [] buf = new char [srcLen + repLen + 3];
                        buf [0] = (char) 3; // kind:ReplaceMap
                        Source.CopyTo (0, buf, 1, srcLen);
                        // One slot is skipped after the source; it keeps its
                        // default 0 and acts as the terminator. The final slot
                        // after the replacement is likewise left at 0.
                        Replace.CopyTo (0, buf, srcLen + 2, repLen);
                        return buf;
                }
        }
}
4042 }