mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those which are computed and those which
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, a different index table should be prepared for composite characters.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27 #define Binary
  28
  29 using System;
  30 using System.IO;
  31 using System.Collections;
  32 using System.Globalization;
  33 using System.Text;
  34 using System.Xml;
  35
  36 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
  37
  38 namespace Mono.Globalization.Unicode
  39 {
  40         internal class MSCompatSortKeyTableGenerator
  41         {
  42                 public static void Main (string [] args)
  43                 {
  44                         new MSCompatSortKeyTableGenerator ().Run (args);
  45                 }
  46
  47                 const int DecompositionWide = 1; // fixed
  48                 const int DecompositionSub = 2; // fixed
  49                 const int DecompositionSmall = 3;
  50                 const int DecompositionIsolated = 4;
  51                 const int DecompositionInitial = 5;
  52                 const int DecompositionFinal = 6;
  53                 const int DecompositionMedial = 7;
  54                 const int DecompositionNoBreak = 8;
  55                 const int DecompositionVertical = 9;
  56                 const int DecompositionFraction = 0xA;
  57                 const int DecompositionFont = 0xB;
  58                 const int DecompositionSuper = 0xC; // fixed
  59                 const int DecompositionFull = 0xE;
  60                 const int DecompositionNarrow = 0xD;
  61                 const int DecompositionCircle = 0xF;
  62                 const int DecompositionSquare = 0x10;
  63                 const int DecompositionCompat = 0x11;
  64                 const int DecompositionCanonical = 0x12;
  65
  66                 TextWriter Result = Console.Out;
  67
  68                 byte [] fillIndex = new byte [256]; // by category
  69                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  70
  71                 char [] specialIgnore = new char [] {
  72                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  73                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  74                         };
  75
  76                 // FIXME: need more love (as always)
  77                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  78                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  79                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  80                         '\u0292', '\u01BE', '\u0298'};
  81                 byte [] alphaWeights = new byte [] {
  82                         2, 9, 0xA, 0x1A, 0x21,
  83                         0x23, 0x25, 0x2C, 0x32, 0x35,
  84                         0x36, 0x48, 0x51, 0x70, 0x7C,
  85                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  86                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  87                         0xA9, 0xAA, 0xB3, 0xB4};
  88
  89                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  90                 bool [] isUppercase = new bool [char.MaxValue + 1];
  91
  92                 byte [] decompType = new byte [char.MaxValue + 1];
  93                 int [] decompIndex = new int [char.MaxValue + 1];
  94                 int [] decompLength = new int [char.MaxValue + 1];
  95                 int [] decompValues;
  96                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  97
  98                 byte [] diacritical = new byte [char.MaxValue + 1];
  99
 100                 string [] diacritics = new string [] {
 101                         "DOUBLE VERTICAL LINE ABOVE",
 102                         "ABKHASIAN CHE WITH DESCENDER",
 103                         // LATIN, CYRILLIC etc.
 104                         "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
 105                         "ABKHASIAN",
 106                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
 107                         "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
 108                         "WITH ACUTE;", "WITH GRAVE;",
 109                         //
 110                         "WITH DOT ABOVE;", " MIDDLE DOT;",
 111                         "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
 112                         "WITH DIALYTIKA;",
 113                         "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 114                         "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
 115                         "WITH OGONEK;", "WITH CEDILLA;",
 116                         //
 117                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 118                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
 119                         "STROKE OVERLAY",
 120                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 121                         " DIAERESIS AND GRAVE;",
 122                         " BREVE AND ACUTE;",
 123                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 124                         " MACRON AND ACUTE;",
 125                         " MACRON AND GRAVE;",
 126                         //
 127                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 128                         " RING ABOVE AND ACUTE",
 129                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 130                         " CIRCUMFLEX AND TILDE",
 131                         " TILDE AND DIAERESIS",
 132                         " STROKE AND ACUTE",
 133                         " BREVE AND TILDE",
 134                         " CEDILLA AND BREVE",
 135                         " OGONEK AND MACRON",
 136                         //
 137                         "WITH OVERLINE",
 138                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 139                         " DOUBLE GRAVE",
 140                         " INVERTED BREVE",
 141                         "ROMAN NUMERAL",
 142                         " PRECEDED BY APOSTROPHE",
 143                         "WITH HORN;",
 144                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 145                         " PALATAL HOOK",
 146                         " DOT BELOW;",
 147                         " RETROFLEX;", "DIAERESIS BELOW",
 148                         " RING BELOW",
 149                         //
 150                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 151                         " BREVE BELOW;", " HORN AND GRAVE",
 152                         " TILDE BELOW",
 153                         " TOPBAR",
 154                         " DOT BELOW AND DOT ABOVE",
 155                         " RIGHT HALF RING", " HORN AND TILDE",
 156                         " CIRCUMFLEX AND DOT BELOW",
 157                         " BREVE AND DOT BELOW",
 158                         " DOT BELOW AND MACRON",
 159                         " TONE TWO",
 160                         " HORN AND HOOK ABOVE",
 161                         " HORN AND DOT",
 162                         // CIRCLED, PARENTHESIZED and so on
 163                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
 164                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
 165                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 166                         };
 167                 byte [] diacriticWeights = new byte [] {
 168                         // this is to pick U+30E (DOUBLE VERTICAL LINE ABOVE)
 169                         // before being picked as VERTICAL LINE ABOVE
 170                         41,
 171                         // this is to pick ABKHASIAN CHE WITH DESCENDER before
 172                         // being picked as ABKHASIAN
 173                         17,
 174                         // LATIN.
 175                         3, 3, 3, 5, 5, 5, 5,
 176                         0xE, 0xF,
 177                         0xE, 0xF,
 178                         //
 179                         0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
 180                         0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
 181                         //
 182                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 183                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 184                         //
 185                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 186                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 187                         //
 188                         0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
 189                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 190                         //
 191                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
 192                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 193                         0x87, 0x95, 0xAA,
 194                         // CIRCLED, PARENTHESIZED and so on.
 195                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
 196                         0xF3, 0xF3, 0xF3
 197                         };
 198
 199                 int [] numberSecondaryWeightBounds = new int [] {
 200                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 201                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 202                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 203                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 204                         0xE50, 0xE60, 0xED0, 0xEE0
 205                         };
 206
 207                 char [] orderedGurmukhi;
 208                 char [] orderedGujarati;
 209                 char [] orderedGeorgian;
 210                 char [] orderedThaana;
 211
 212                 static readonly char [] orderedTamilConsonants = new char [] {
 213                         // based on traditional Tamil consonants, except for
 214                         // Grantha (where Microsoft breaks traditionalism).
 215                         // http://www.angelfire.com/empire/thamizh/padanGaL
 216                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
 217                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
 218                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
 219                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
 220                         '\u0BB7', '\u0BB9'};
 221
 222                 // cp -> character name (only for some characters)
 223                 ArrayList sortableCharNames = new ArrayList ();
 224
 225                 // cp -> arrow value (int)
 226                 ArrayList arrowValues = new ArrayList ();
 227
 228                 // cp -> box value (int)
 229                 ArrayList boxValues = new ArrayList ();
 230
 231                 // cp -> level1 value
 232                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 233
 234                 // letterName -> cp
 235                 Hashtable arabicNameMap = new Hashtable ();
 236
 237                 // cp -> Hashtable [decompType] -> cp
 238                 Hashtable nfkdMap = new Hashtable ();
 239
 240                 // Latin letter -> ArrayList [int]
 241                 Hashtable latinMap = new Hashtable ();
 242
 243                 ArrayList jisJapanese = new ArrayList ();
 244                 ArrayList nonJisJapanese = new ArrayList ();
 245
 246                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 247                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 248                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 249                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 250                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 251
 252                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 253
 254                 static double [] unicodeAge = new double [char.MaxValue + 1];
 255
 256                 ArrayList tailorings = new ArrayList ();
 257
 258                 void Run (string [] args)
 259                 {
 260                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 261                         ParseSources (dirname);
 262                         Console.Error.WriteLine ("parse done.");
 263
 264                         ModifyParsedValues ();
 265                         GenerateCore ();
 266                         Console.Error.WriteLine ("generation done.");
 267                         Serialize ();
 268                         Console.Error.WriteLine ("serialization done.");
 269 /*
 270 StreamWriter sw = new StreamWriter ("agelog.txt");
 271 for (int i = 0; i < char.MaxValue; i++) {
 272 bool shouldBe = false;
 273 switch (Char.GetUnicodeCategory ((char) i)) {
 274 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 275         shouldBe = true; break;
 276 }
 277 if (unicodeAge [i] >= 3.1)
 278         shouldBe = true;
 279 //if (IsIgnorable (i) != shouldBe)
 280 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 281 }
 282 sw.Close ();
 283 */
 284                 }
 285
 286                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 287                 {
 288                         return (byte []) CodePointIndexer.CompressArray  (
 289                                 source, typeof (byte), i);
 290                 }
 291
 292                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 293                 {
 294                         return (ushort []) CodePointIndexer.CompressArray  (
 295                                 source, typeof (ushort), i);
 296                 }
 297
 298                 void Serialize ()
 299                 {
 300                         // Tailorings
 301                         SerializeTailorings ();
 302
 303                         byte [] categories = new byte [map.Length];
 304                         byte [] level1 = new byte [map.Length];
 305                         byte [] level2 = new byte [map.Length];
 306                         byte [] level3 = new byte [map.Length];
 307                         ushort [] widthCompat = new ushort [map.Length];
 308                         for (int i = 0; i < map.Length; i++) {
 309                                 categories [i] = map [i].Category;
 310                                 level1 [i] = map [i].Level1;
 311                                 level2 [i] = map [i].Level2;
 312                                 level3 [i] = ComputeLevel3Weight ((char) i);
 313                                 // For Japanese Half-width characters, don't
 314                                 // map widthCompat. It is IgnoreKanaType that
 315                                 // handles those width differences.
 316                                 if (0xFF6D <= i && i <= 0xFF9D)
 317                                         continue;
 318                                 switch (decompType [i]) {
 319                                 case DecompositionNarrow:
 320                                 case DecompositionWide:
 321                                 case DecompositionSuper:
 322                                 case DecompositionSub:
 323                                         // they are always 1 char
 324                                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
 325                                         break;
 326                                 }
 327                         }
 328
 329                         // compress
 330                         ignorableFlags = CompressArray (ignorableFlags,
 331                                 UUtil.Ignorable);
 332                         categories = CompressArray (categories, UUtil.Category);
 333                         level1 = CompressArray (level1, UUtil.Level1);
 334                         level2 = CompressArray (level2, UUtil.Level2);
 335                         level3 = CompressArray (level3, UUtil.Level3);
 336                         widthCompat = (ushort []) CodePointIndexer.CompressArray (
 337                                 widthCompat, typeof (ushort), UUtil.WidthCompat);
 338                         cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
 339                         cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
 340                         cjkJA = CompressArray (cjkJA, UUtil.Cjk);
 341                         cjkKO = CompressArray (cjkKO, UUtil.Cjk);
 342                         cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);
 343
 344                         // Ignorables
 345                         Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
 346 #if Binary
 347                         MemoryStream ms = new MemoryStream ();
 348                         BinaryWriter binary = new BinaryWriter (ms);
 349                         binary.Write (ignorableFlags.Length);
 350 #endif
 351                         for (int i = 0; i < ignorableFlags.Length; i++) {
 352                                 byte value = ignorableFlags [i];
 353                                 if (value < 10)
 354                                         Result.Write ("{0},", value);
 355                                 else
 356                                         Result.Write ("0x{0:X02},", value);
 357 #if Binary
 358                                 binary.Write (value);
 359 #endif
 360                                 if ((i & 0xF) == 0xF)
 361                                         Result.WriteLine ("// {0:X04}",
 362                                                 UUtil.Ignorable.ToCodePoint (i - 0xF));
 363                         }
 364                         Result.WriteLine ("};");
 365                         Result.WriteLine ();
 366
 367                         // Primary category
 368                         Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
 369 #if Binary
 370                         binary.Write (categories.Length);
 371 #endif
 372                         for (int i = 0; i < categories.Length; i++) {
 373                                 byte value = categories [i];
 374                                 if (value < 10)
 375                                         Result.Write ("{0},", value);
 376                                 else
 377                                         Result.Write ("0x{0:X02},", value);
 378 #if Binary
 379                                 binary.Write (value);
 380 #endif
 381                                 if ((i & 0xF) == 0xF)
 382                                         Result.WriteLine ("// {0:X04}",
 383                                                 UUtil.Category.ToCodePoint (i - 0xF));
 384                         }
 385                         Result.WriteLine ("};");
 386                         Result.WriteLine ();
 387
 388                         // Primary weight value
 389                         Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
 390 #if Binary
 391                         binary.Write (level1.Length);
 392 #endif
 393                         for (int i = 0; i < level1.Length; i++) {
 394                                 byte value = level1 [i];
 395                                 if (value < 10)
 396                                         Result.Write ("{0},", value);
 397                                 else
 398                                         Result.Write ("0x{0:X02},", value);
 399 #if Binary
 400                                 binary.Write (value);
 401 #endif
 402                                 if ((i & 0xF) == 0xF)
 403                                         Result.WriteLine ("// {0:X04}",
 404                                                 UUtil.Level1.ToCodePoint (i - 0xF));
 405                         }
 406                         Result.WriteLine ("};");
 407                         Result.WriteLine ();
 408
 409                         // Secondary weight
 410                         Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
 411 #if Binary
 412                         binary.Write (level2.Length);
 413 #endif
 414                         for (int i = 0; i < level2.Length; i++) {
 415                                 byte value = level2 [i];
 416                                 if (value < 10)
 417                                         Result.Write ("{0},", value);
 418                                 else
 419                                         Result.Write ("0x{0:X02},", value);
 420 #if Binary
 421                                 binary.Write (value);
 422 #endif
 423                                 if ((i & 0xF) == 0xF)
 424                                         Result.WriteLine ("// {0:X04}",
 425                                                 UUtil.Level2.ToCodePoint (i - 0xF));
 426                         }
 427                         Result.WriteLine ("};");
 428                         Result.WriteLine ();
 429
 430                         // Thirtiary weight
 431                         Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
 432 #if Binary
 433                         binary.Write (level3.Length);
 434 #endif
 435                         for (int i = 0; i < level3.Length; i++) {
 436                                 byte value = level3 [i];
 437                                 if (value < 10)
 438                                         Result.Write ("{0},", value);
 439                                 else
 440                                         Result.Write ("0x{0:X02},", value);
 441 #if Binary
 442                                 binary.Write (value);
 443 #endif
 444                                 if ((i & 0xF) == 0xF)
 445                                         Result.WriteLine ("// {0:X04}",
 446                                                 UUtil.Level3.ToCodePoint (i - 0xF));
 447                         }
 448                         Result.WriteLine ("};");
 449                         Result.WriteLine ();
 450
 451                         // Width insensitivity mappings
 452                         // (for now it is more lightweight than dumping the
 453                         // entire NFKD table).
 454                         Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
 455 #if Binary
 456                         binary.Write (widthCompat.Length);
 457 #endif
 458                         for (int i = 0; i < widthCompat.Length; i++) {
 459                                 ushort value = widthCompat [i];
 460                                 if (value < 10)
 461                                         Result.Write ("{0},", value);
 462                                 else
 463                                         Result.Write ("0x{0:X02},", value);
 464 #if Binary
 465                                 binary.Write (value);
 466 #endif
 467                                 if ((i & 0xF) == 0xF)
 468                                         Result.WriteLine ("// {0:X04}",
 469                                                 UUtil.WidthCompat.ToCodePoint (i - 0xF));
 470                         }
 471                         Result.WriteLine ("};");
 472                         Result.WriteLine ();
 473 #if Binary
 474                         using (FileStream fs = File.Create ("../collation.core.bin")) {
 475                                 byte [] array = ms.ToArray ();
 476                                 fs.Write (array, 0, array.Length);
 477                         }
 478 #endif
 479
 480                         // CJK
 481                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 482                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 483                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 484                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 485                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 486                 }
 487
 488                 void SerializeCJK (string name, ushort [] cjk, int max)
 489                 {
 490                         int offset = 0;//char.MaxValue - cjk.Length;
 491                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 492 #if Binary
 493                         MemoryStream ms = new MemoryStream ();
 494                         BinaryWriter binary = new BinaryWriter (ms);
 495                         binary.Write (cjk.Length);
 496 #endif
 497                         for (int i = 0; i < cjk.Length; i++) {
 498                                 if (i + offset == max)
 499                                         break;
 500                                 ushort value = cjk [i];
 501                                 if (value < 10)
 502                                         Result.Write ("{0},", value);
 503                                 else
 504                                         Result.Write ("0x{0:X04},", value);
 505 #if Binary
 506                                 binary.Write (value);
 507 #endif
 508                                 if ((i & 0xF) == 0xF)
 509                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 510                         }
 511                         Result.WriteLine ("};");
 512                         Result.WriteLine ();
 513 #if Binary
 514                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 515                                 byte [] array = ms.ToArray ();
 516                                 fs.Write (array, 0, array.Length);
 517                         }
 518 #endif
 519                 }
 520
 521                 void SerializeCJK (string name, byte [] cjk, int max)
 522                 {
 523                         int offset = 0;//char.MaxValue - cjk.Length;
 524                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 525 #if Binary
 526                         MemoryStream ms = new MemoryStream ();
 527                         BinaryWriter binary = new BinaryWriter (ms);
 528 #endif
 529                         for (int i = 0; i < cjk.Length; i++) {
 530                                 if (i + offset == max)
 531                                         break;
 532                                 byte value = cjk [i];
 533                                 if (value < 10)
 534                                         Result.Write ("{0},", value);
 535                                 else
 536                                         Result.Write ("0x{0:X02},", value);
 537 #if Binary
 538                                 binary.Write (value);
 539 #endif
 540                                 if ((i & 0xF) == 0xF)
 541                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 542                         }
 543                         Result.WriteLine ("};");
 544                         Result.WriteLine ();
 545 #if Binary
 546                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 547                                 byte [] array = ms.ToArray ();
 548                                 fs.Write (array, 0, array.Length);
 549                         }
 550 #endif
 551                 }
 552
 553                 void SerializeTailorings ()
 554                 {
 555                         Hashtable indexes = new Hashtable ();
 556                         Hashtable counts = new Hashtable ();
 557                         Result.WriteLine ("static char [] tailorings = new char [] {");
 558                         int count = 0;
 559 #if Binary
 560                         MemoryStream ms = new MemoryStream ();
 561                         BinaryWriter binary = new BinaryWriter (ms);
 562 #endif
 563                         foreach (Tailoring t in tailorings) {
 564                                 if (t.Alias != 0)
 565                                         continue;
 566                                 Result.Write ("/*{0}*/", t.LCID);
 567                                 indexes.Add (t.LCID, count);
 568                                 char [] values = t.ItemToCharArray ();
 569                                 counts.Add (t.LCID, values.Length);
 570                                 foreach (char c in values) {
 571                                         Result.Write ("'\\x{0:X}', ", (int) c);
 572                                         if (++count % 16 == 0)
 573                                                 Result.WriteLine (" // {0:X04}", count - 16);
 574 #if Binary
 575                                         binary.Write ((ushort) c);
 576 #endif
 577                                 }
 578                         }
 579                         Result.WriteLine ("};");
 580
 581                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 582 #if Binary
 583                         byte [] rawdata = ms.ToArray ();
 584                         ms = new MemoryStream ();
 585                         binary = new BinaryWriter (ms);
 586                         binary.Write (tailorings.Count);
 587 #endif
 588                         foreach (Tailoring t in tailorings) {
 589                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 590                                 if (!indexes.ContainsKey (target)) {
 591                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
 592                                         continue;
 593                                 }
 594                                 int idx = (int) indexes [target];
 595                                 int cnt = (int) counts [target];
 596                                 bool french = t.FrenchSort;
 597                                 if (t.Alias != 0)
 598                                         foreach (Tailoring t2 in tailorings)
 599                                                 if (t2.LCID == t.LCID)
 600                                                         french = t2.FrenchSort;
 601                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 602 #if Binary
 603                                 binary.Write (t.LCID);
 604                                 binary.Write (idx);
 605                                 binary.Write (cnt);
 606                                 binary.Write (french);
 607 #endif
 608                         }
 609                         Result.WriteLine ("};");
 610 #if Binary
 611                         binary.Write ((byte) 0xFF);
 612                         binary.Write ((byte) 0xFF);
 613                         binary.Write (rawdata.Length / 2);
 614                         binary.Write (rawdata, 0, rawdata.Length);
 615
 616
 617                         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
 618                                 byte [] array = ms.ToArray ();
 619                                 fs.Write (array, 0, array.Length);
 620                         }
 621 #endif
 622                 }
 623
 624                 #region Parse
 625
 626                 void ParseSources (string dirname)
 627                 {
 628                         string unidata =
 629                                 dirname + "/UnicodeData.txt";
 630                         string derivedCoreProps =
 631                                 dirname + "/DerivedCoreProperties.txt";
 632                         string scripts =
 633                                 dirname + "/Scripts.txt";
 634                         string cp932 =
 635                                 dirname + "/CP932.TXT";
 636                         string derivedAge =
 637                                 dirname + "/DerivedAge.txt";
 638                         string chXML = dirname + "/common/collation/zh.xml";
 639                         string jaXML = dirname + "/common/collation/ja.xml";
 640                         string koXML = dirname + "/common/collation/ko.xml";
 641
 642                         ParseDerivedAge (derivedAge);
 643
 644                         FillIgnorables ();
 645
 646                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 647                         ParseUnidata (unidata);
 648                         ModifyUnidata ();
 649                         ParseDerivedCoreProperties (derivedCoreProps);
 650                         ParseScripts (scripts);
 651                         ParseCJK (chXML, jaXML, koXML);
 652
 653                         ParseTailorings ("mono-tailoring-source.txt");
 654                 }
 655
 656                 void ParseTailorings (string filename)
 657                 {
 658                         Tailoring t = null;
 659                         int line = 0;
 660                         using (StreamReader sr = new StreamReader (filename)) {
 661                                 try {
 662                                         while (sr.Peek () >= 0) {
 663                                                 line++;
 664                                                 ProcessTailoringLine (ref t,
 665                                                         sr.ReadLine ().Trim ());
 666                                         }
 667                                 } catch (Exception) {
 668                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 669                                         throw;
 670                                 }
 671                         }
 672                 }
 673
 674                 // For now this is enough.
 675                 string ParseTailoringSourceValue (string s)
 676                 {
 677                         StringBuilder sb = new StringBuilder ();
 678                         for (int i = 0; i < s.Length; i++) {
 679                                 if (i + 5 < s.Length &&
 680                                         s [i] == '\\' && s [i + 1] == 'u') {
 681                                         sb.Append (
 682                                                 (char) int.Parse (
 683                                                         s.Substring (i + 2, 4),
 684                                                         NumberStyles.HexNumber),
 685                                                 1);
 686                                         i += 5;
 687                                 }
 688                                 else
 689                                         sb.Append (s [i]);
 690                         }
 691                         return sb.ToString ();
 692                 }
 693
 694                 void ProcessTailoringLine (ref Tailoring t, string s)
 695                 {
 696                         int idx = s.IndexOf ('#');
 697                         if (idx > 0)
 698                                 s = s.Substring (0, idx).Trim ();
 699                         if (s.Length == 0 || s [0] == '#')
 700                                 return;
 701                         if (s [0] == '@') {
 702                                 idx = s.IndexOf ('=');
 703                                 if (idx > 0)
 704                                         t = new Tailoring (
 705                                                 int.Parse (s.Substring (1, idx - 1)),
 706                                                 int.Parse (s.Substring (idx + 1)));
 707                                 else
 708                                         t = new Tailoring (int.Parse (s.Substring (1)));
 709                                 tailorings.Add (t);
 710                                 return;
 711                         }
 712                         if (s.StartsWith ("*FrenchSort")) {
 713                                 t.FrenchSort = true;
 714                                 return;
 715                         }
 716                         string d = "*Diacritical";
 717                         if (s.StartsWith (d)) {
 718                                 idx = s.IndexOf ("->");
 719                                 t.AddDiacriticalMap (
 720                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 721                                                 NumberStyles.HexNumber),
 722                                         byte.Parse (s.Substring (idx + 2).Trim (),
 723                                                 NumberStyles.HexNumber));
 724                                 return;
 725                         }
 726                         idx = s.IndexOf (':');
 727                         if (idx > 0) {
 728                                 string source = s.Substring (0, idx).Trim ();
 729                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 730                                 byte [] b = new byte [4];
 731                                 for (int i = 0; i < 4; i++) {
 732                                         if (l [i] == "*")
 733                                                 b [i] = 0;
 734                                         else
 735                                                 b [i] = byte.Parse (l [i],
 736                                                         NumberStyles.HexNumber);
 737                                 }
 738                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 739                                         b);
 740                         }
 741                         idx = s.IndexOf ('=');
 742                         if (idx > 0)
 743                                 t.AddReplacementMap (
 744                                         ParseTailoringSourceValue (
 745                                                 s.Substring (0, idx).Trim ()),
 746                                         ParseTailoringSourceValue (
 747                                                 s.Substring (idx + 1).Trim ()));
 748                 }
 749
 750                 void ParseDerivedAge (string filename)
 751                 {
 752                         using (StreamReader file =
 753                                 new StreamReader (filename)) {
 754                                 while (file.Peek () >= 0) {
 755                                         string s = file.ReadLine ();
 756                                         int idx = s.IndexOf ('#');
 757                                         if (idx >= 0)
 758                                                 s = s.Substring (0, idx);
 759                                         idx = s.IndexOf (';');
 760                                         if (idx < 0)
 761                                                 continue;
 762
 763                                         string cpspec = s.Substring (0, idx);
 764                                         idx = cpspec.IndexOf ("..");
 765                                         NumberStyles nf = NumberStyles.HexNumber |
 766                                                 NumberStyles.AllowTrailingWhite;
 767                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 768                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 769                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 770
 771                                         // FIXME: use index
 772                                         if (cp > char.MaxValue)
 773                                                 continue;
 774
 775                                         double v = double.Parse (value);
 776                                         for (int i = cp; i <= cpEnd; i++)
 777                                                 unicodeAge [i] = v;
 778                                 }
 779                         }
 780                         unicodeAge [0] = double.MaxValue; // never be supported
 781                 }
 782
 783                 void ParseUnidata (string filename)
 784                 {
 785                         ArrayList decompValues = new ArrayList ();
 786                         using (StreamReader unidata =
 787                                 new StreamReader (filename)) {
 788                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 789                                         try {
 790                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 791                                         } catch (Exception) {
 792                                                 Console.Error.WriteLine ("**** At line " + line);
 793                                                 throw;
 794                                         }
 795                                 }
 796                         }
 797                         this.decompValues = (int [])
 798                                 decompValues.ToArray (typeof (int));
 799                 }
 800
 801                 char previousLatinTarget = char.MinValue; // letter last chosen by ProcessUnidataLine from a LATIN name; reused as target for entries that yield none (reset after 'Z')
 802                 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; // counters indexed by (letter - 'A'); ProcessUnidataLine increments one and adds 0x7C to form a diacritical weight
 803
 804                 void ProcessUnidataLine (string s, ArrayList decompValues)
 805                 {
 806                         int idx = s.IndexOf ('#');
 807                         if (idx >= 0)
 808                                 s = s.Substring (0, idx);
 809                         idx = s.IndexOf (';');
 810                         if (idx < 0)
 811                                 return;
 812                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 813                         string [] values = s.Substring (idx + 1).Split (';');
 814
 815                         // FIXME: use index
 816                         if (cp > char.MaxValue)
 817                                 return;
 818                         if (IsIgnorable (cp))
 819                                 return;
 820
 821                         string name = values [0];
 822
 823                         // SPECIAL CASE: rename some characters for diacritical
 824                         // remapping. FIXME: why are they different?
 825                         // FIXME: it's still not working.
 826                         if (cp == 0x018B || cp == 0x018C)
 827                                 name = name.Replace ("TOPBAR", "STROKE");
 828
 829                         // isSmallCapital
 830                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 831                                 isSmallCapital [cp] = true;
 832
 833                         // latin mapping by character name
 834                         if (s.IndexOf ("LATIN") >= 0) {
 835                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 836                                 int offset = lidx + 15;
 837                                 if (lidx < 0) {
 838                                         lidx = s.IndexOf ("LETTER TURNED ");
 839                                         offset = lidx + 14;
 840                                 }
 841                                 if (lidx < 0) {
 842                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 843                                         offset = lidx + 15;
 844                                 }
 845                                 if (lidx < 0) {
 846                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 847                                         offset = lidx + 14;
 848                                 }
 849                                 if (lidx < 0) {
 850                                         lidx = s.IndexOf ("LETTER ");
 851                                         offset = lidx + 7;
 852                                 }
 853                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 854                                 char n = s [offset + 1];
 855                                 char target = char.MinValue;
 856                                 if ('A' <= c && c <= 'Z' &&
 857                                         (n == ' ') || n == ';') {
 858                                         target = c;
 859                                         // FIXME: After 'Z', I cannot reset this state.
 860                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 861                                 }
 862
 863                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 864                                         target = 'A';
 865                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 866                                         target = 'B';
 867                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 868                                         target = 'C';
 869                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 870                                         target = 'E';
 871                                 else if (s.Substring (offset).StartsWith ("ENG"))
 872                                         target = 'N';
 873                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 874                                         target = 'O';
 875                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 876                                         target = 'R';
 877                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 878                                         target = 'S';
 879                                 else if (s.Substring (offset).StartsWith ("ESH"))
 880                                         target = 'S';
 881
 882                                 // For remaining IPA chars, direct mapping is
 883                                 // much faster.
 884                                 switch (cp) {
 885                                 case 0x0299: target = 'B'; break;
 886                                 case 0x029A: target = 'E'; break;
 887                                 case 0x029B: target = 'G'; break;
 888                                 case 0x029C: target = 'H'; break;
 889                                 case 0x029D: target = 'J'; break;
 890                                 case 0x029E: target = 'K'; break;
 891                                 case 0x029F: target = 'L'; break;
 892                                 case 0x02A0: target = 'Q'; break;
 893                                 case 0x02A7: target = 'T'; break;
 894                                 case 0x02A8: target = 'T'; break;
 895                                 }
 896
 897                                 if (target == char.MinValue)
 898                                         target = previousLatinTarget;
 899
 900                                 if (target != char.MinValue) {
 901                                         ArrayList entry = (ArrayList) latinMap [target];
 902                                         if (entry == null) {
 903                                                 entry = new ArrayList ();
 904                                                 latinMap [target] = entry;
 905                                         }
 906                                         entry.Add (cp);
 907                                         // FIXME: This secondary weight is hack.
 908                                         // They are here because they must not
 909                                         // be identical to the corresponding
 910                                         // ASCII latins.
 911                                         if (c != target && diacritical [cp] == 0) {
 912                                                 diacriticalOffset [c - 'A']++;
 913                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
 914                                         }
 915                                 }
 916                         }
 917
 918                         // Arrow names
 919                         if (0x2000 <= cp && cp < 0x3000) {
 920                                 int value = 0;
 921                                 // SPECIAL CASES. FIXME: why?
 922                                 switch (cp) {
 923                                 case 0x21C5: value = -1; break; // E2
 924                                 case 0x261D: value = 1; break;
 925                                 case 0x27A6: value = 3; break;
 926                                 case 0x21B0: value = 7; break;
 927                                 case 0x21B1: value = 3; break;
 928                                 case 0x21B2: value = 7; break;
 929                                 case 0x21B4: value = 5; break;
 930                                 case 0x21B5: value = 7; break;
 931                                 case 0x21B9: value = -1; break; // E1
 932                                 case 0x21CF: value = 7; break;
 933                                 case 0x21D0: value = 3; break;
 934                                 }
 935                                 string [] arrowTargets = new string [] {
 936                                         "",
 937                                         "UPWARDS",
 938                                         "NORTH EAST",
 939                                         "RIGHTWARDS",
 940                                         "SOUTH EAST",
 941                                         "DOWNWARDS",
 942                                         "SOUTH WEST",
 943                                         "LEFTWARDS",
 944                                         "NORTH WEST",
 945                                         "LEFT RIGHT",
 946                                         "UP DOWN",
 947                                         };
 948                                 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
 949                                         s.IndexOf ("LEFTWARDS") >= 0)
 950                                         value = 0xE1 - 0xD8;
 951                                 else if (s.IndexOf ("UPWARDS") >= 0 &&
 952                                         s.IndexOf ("DOWNWARDS") >= 0)
 953                                         value = 0xE2 - 0xD8;
 954                                 else if (s.IndexOf ("ARROW") >= 0 &&
 955                                         s.IndexOf ("COMBINING") < 0 &&
 956                                         s.IndexOf ("CLOCKWISE") >= 0)
 957                                         value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
 958                                 if (value == 0)
 959                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 960                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 961                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 962                                                         s.IndexOf (" OVER") < 0
 963                                                 )
 964                                                         value = i;
 965                                 if (value > 0)
 966                                         arrowValues.Add (new DictionaryEntry (
 967                                                 cp, value));
 968                         }
 969
 970                         // Box names
 971                         if (0x2500 <= cp && cp < 0x2600) {
 972                                 int value = int.MinValue;
 973                                 // flags:
 974                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 975                                 // [h,rl] [r] [l]
 976                                 // [v,ud] [u] [d]
 977                                 // [dr] [dl] [ur] [ul]
 978                                 // [vr,udr] [vl,vdl]
 979                                 // [hd,rld] [hu,rlu]
 980                                 // [hv,udrl,rlv,udh]
 981                                 ArrayList flags = new ArrayList (new int [] {
 982                                         32, 8 + 4, 8, 4,
 983                                         16, 1 + 2, 1, 2,
 984                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 985                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 986                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 987                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 988                                         });
 989                                 byte [] offsets = new byte [] {
 990                                         0, 0, 1, 2,
 991                                         3, 3, 4, 5,
 992                                         6, 7, 8, 9,
 993                                         10, 10, 11, 11,
 994                                         12, 12, 13, 13,
 995                                         14, 14, 14, 14};
 996                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
 997                                         int flag = 0;
 998                                         if (s.IndexOf (" UP") >= 0)
 999                                                 flag |= 1;
1000                                         if (s.IndexOf (" DOWN") >= 0)
1001                                                 flag |= 2;
1002                                         if (s.IndexOf (" RIGHT") >= 0)
1003                                                 flag |= 4;
1004                                         if (s.IndexOf (" LEFT") >= 0)
1005                                                 flag |= 8;
1006                                         if (s.IndexOf (" VERTICAL") >= 0)
1007                                                 flag |= 16;
1008                                         if (s.IndexOf (" HORIZONTAL") >= 0)
1009                                                 flag |= 32;
1010
1011                                         int fidx = flags.IndexOf (flag);
1012                                         if (fidx >= 0)
1013                                                 value = offsets [fidx];
1014                                 } else if (s.IndexOf ("BLOCK") >= 0) {
1015                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
1016                                                 value = 0x12;
1017                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
1018                                                 value = 0x13;
1019                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1020                                                 value = 0x14;
1021                                         else if (s.IndexOf ("HALF") >= 0)
1022                                                 value = 0x15;
1023                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1024                                                 value = 0x16;
1025                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
1026                                                 value = 0x17;
1027                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1028                                                 value = 0x18;
1029                                         else
1030                                                 value = 0x19;
1031                                 }
1032                                 else if (s.IndexOf ("SHADE") >= 0)
1033                                         value = 0x19;
1034                                 else if (s.IndexOf ("SQUARE") >= 0)
1035                                         value = 0xBC - 0xE5;
1036                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1037                                         value = 0xBE - 0xE5;
1038                                 else if (s.IndexOf ("RECTANGLE") >= 0)
1039                                         value = 0xBD - 0xE5;
1040                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1041                                         value = 0xBF - 0xE5;
1042                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
1043                                         if (s.IndexOf ("UP-POINTING") >= 0)
1044                                                 value = 0xC0 - 0xE5;
1045                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1046                                                 value = 0xC1 - 0xE5;
1047                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1048                                                 value = 0xC2 - 0xE5;
1049                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1050                                                 value = 0xC3 - 0xE5;
1051                                 }
1052                                 else if (s.IndexOf ("POINTER") >= 0) {
1053                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1054                                                 value = 0xC4 - 0xE5;
1055                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1056                                                 value = 0xC5 - 0xE5;
1057                                 }
1058                                 else if (s.IndexOf ("DIAMOND") >= 0)
1059                                         value = 0xC6 - 0xE5;
1060                                 else if (s.IndexOf ("FISHEYE") >= 0)
1061                                         value = 0xC7 - 0xE5;
1062                                 else if (s.IndexOf ("LOZENGE") >= 0)
1063                                         value = 0xC8 - 0xE5;
1064                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1065                                         value = 0xC9 - 0xE5;
1066                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1067                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1068                                                 value = 0xCA - 0xE5;
1069                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1070                                                 value = 0xCB - 0xE5;
1071                                         else
1072                                                 value = 0xC9 - 0xE5;
1073                                 }
1074                                 else if (s.IndexOf ("BULLET") >= 0)
1075                                         value = 0xCC - 0xE5;
1076                                 if (0x25DA <= cp && cp <= 0x25E5)
1077                                         value = 0xCD + cp - 0x25DA - 0xE5;
1078
1079                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1080                                 switch (cp) {
1081                                 case 0x2571: value = 0xF; break;
1082                                 case 0x2572: value = 0x10; break;
1083                                 case 0x2573: value = 0x11; break;
1084                                 }
1085                                 if (value != int.MinValue)
1086                                         boxValues.Add (new DictionaryEntry (
1087                                                 cp, value));
1088                         }
1089
1090                         // For some characters store the name and sort later
1091                         // to determine sorting.
1092                         if (0x2100 <= cp && cp <= 0x213F &&
1093                                 Char.IsSymbol ((char) cp))
1094                                 sortableCharNames.Add (
1095                                         new DictionaryEntry (cp, name));
1096                         else if (0x3380 <= cp && cp <= 0x33DD)
1097                                 sortableCharNames.Add (new DictionaryEntry (
1098                                         cp, name.Substring (7)));
1099
1100                         if (Char.GetUnicodeCategory ((char) cp) ==
1101                                 UnicodeCategory.MathSymbol) {
1102                                 if (name.StartsWith ("CIRCLED "))
1103                                         diacritical [cp] = 0xEE;
1104                                 if (name.StartsWith ("SQUARED "))
1105                                         diacritical [cp] = 0xEF;
1106                         }
1107
1108                         // diacritical weights by character name
1109 if (diacritics.Length != diacriticWeights.Length)
1110 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1111                         for (int d = 0; d < diacritics.Length; d++) {
1112                                 if (s.IndexOf (diacritics [d]) > 0) {
1113                                         diacritical [cp] += diacriticWeights [d];
1114                                         if (s.IndexOf ("COMBINING") >= 0)
1115                                                 diacritical [cp] -= (byte) 2;
1116                                         continue;
1117                                 }
1118                                 // also process "COMBINING blah" here
1119                                 // For now it is limited to cp < 0x0370
1120 //                              if (cp < 0x0300 || cp >= 0x0370)
1121 //                                      continue;
1122                                 string tmp = diacritics [d].TrimEnd (';');
1123                                 if (tmp.IndexOf ("WITH ") == 0)
1124                                         tmp = tmp.Substring (4);
1125                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1126                                 if (name == tmp) {
1127                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1128                                         break;
1129                                 }
1130 //if (name == tmp)
1131 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1132                         }
1133                         // Two-step grep required for it.
1134                         if (s.IndexOf ("FULL STOP") > 0 &&
1135                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1136                                 diacritical [cp] |= 0xF4;
1137                         if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1138                                 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1139                                         s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1140
1141                         // Arabic letter name
1142                         if (0x0621 <= cp && cp <= 0x064A &&
1143                                 Char.GetUnicodeCategory ((char) cp)
1144                                 == UnicodeCategory.OtherLetter) {
1145                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1146                                 switch (cp) {
1147                                 case 0x0621:
1148                                 case 0x0624:
1149                                 case 0x0626:
1150                                         // hamza, waw, yeh ... special cases.
1151                                         value = 0x07;
1152                                         break;
1153                                 case 0x0649:
1154                                 case 0x064A:
1155                                         value = 0x77; // special cases.
1156                                         break;
1157                                 default:
1158                                         // Get primary letter name i.e.
1159                                         // XXX part of ARABIC LETTER XXX yyy
1160                                         // e.g. that of "TEH MARBUTA" is "TEH".
1161                                         string letterName =
1162                                                 (cp == 0x0640) ?
1163                                                 // 0x0640 is special: it does
1164                                                 // not start with ARABIC LETTER
1165                                                 name :
1166                                                 name.Substring (14);
1167                                         int tmpIdx = letterName.IndexOf (' ');
1168                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1169 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1170                                         if (arabicNameMap.ContainsKey (letterName))
1171                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1172                                         else
1173                                                 arabicNameMap [letterName] = cp;
1174                                         break;
1175                                 }
1176                                 arabicLetterPrimaryValues [cp] = value;
1177                         }
1178
1179                         // Japanese square letter
1180                         if (0x3300 <= cp && cp <= 0x3357)
1181                                 if (!ExistsJIS (cp))
1182                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1183
1184                         // normalizationType
1185                         string decomp = values [4];
1186                         idx = decomp.IndexOf ('<');
1187                         if (idx >= 0) {
1188                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1189                                 case "full":
1190                                         decompType [cp] = DecompositionFull;
1191                                         break;
1192                                 case "sub":
1193                                         decompType [cp] = DecompositionSub;
1194                                         break;
1195                                 case "super":
1196                                         decompType [cp] = DecompositionSuper;
1197                                         break;
1198                                 case "small":
1199                                         decompType [cp] = DecompositionSmall;
1200                                         break;
1201                                 case "isolated":
1202                                         decompType [cp] = DecompositionIsolated;
1203                                         break;
1204                                 case "initial":
1205                                         decompType [cp] = DecompositionInitial;
1206                                         break;
1207                                 case "final":
1208                                         decompType [cp] = DecompositionFinal;
1209                                         break;
1210                                 case "medial":
1211                                         decompType [cp] = DecompositionMedial;
1212                                         break;
1213                                 case "noBreak":
1214                                         decompType [cp] = DecompositionNoBreak;
1215                                         break;
1216                                 case "compat":
1217                                         decompType [cp] = DecompositionCompat;
1218                                         break;
1219                                 case "fraction":
1220                                         decompType [cp] = DecompositionFraction;
1221                                         break;
1222                                 case "font":
1223                                         decompType [cp] = DecompositionFont;
1224                                         break;
1225                                 case "circle":
1226                                         decompType [cp] = DecompositionCircle;
1227                                         break;
1228                                 case "square":
1229                                         decompType [cp] = DecompositionSquare;
1230                                         break;
1231                                 case "wide":
1232                                         decompType [cp] = DecompositionWide;
1233                                         break;
1234                                 case "narrow":
1235                                         decompType [cp] = DecompositionNarrow;
1236                                         break;
1237                                 case "vertical":
1238                                         decompType [cp] = DecompositionVertical;
1239                                         break;
1240                                 default:
1241                                         throw new Exception ("Support NFKD type : " + decomp);
1242                                 }
1243                         }
1244                         else
1245                                 decompType [cp] = DecompositionCanonical;
1246                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1247                         if (decomp.Length > 0) {
1248
1249                                 string [] velems = decomp.Split (' ');
1250                                 int didx = decompValues.Count;
1251                                 decompIndex [cp] = didx;
1252                                 foreach (string v in velems)
1253                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1254                                 decompLength [cp] = velems.Length;
1255
1256                                 // [decmpType] -> this_cp
1257                                 int targetCP = (int) decompValues [didx];
1258                                 // for "(x)" it specially maps to 'x' .
1259                                 // FIXME: check if it is sane
1260                                 if (velems.Length == 3 &&
1261                                         (int) decompValues [didx] == '(' &&
1262                                         (int) decompValues [didx + 2] == ')')
1263                                         targetCP = (int) decompValues [didx + 1];
1264                                 // special: 0x215F "1/"
1265                                 else if (cp == 0x215F)
1266                                         targetCP = '1';
1267                                 else if (velems.Length > 1 &&
1268                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1269                                         // skip them, except for CJK ideograph compat
1270                                         targetCP = 0;
1271
1272                                 if (targetCP != 0) {
1273                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1274                                         if (entry == null) {
1275                                                 entry = new Hashtable ();
1276                                                 nfkdMap [targetCP] = entry;
1277                                         }
1278                                         entry [(byte) decompType [cp]] = cp;
1279                                 }
1280                         }
1281                         // numeric values
1282                         if (values [5].Length > 0)
1283                                 decimalValue [cp] = decimal.Parse (values [5]);
1284                         else if (values [6].Length > 0)
1285                                 decimalValue [cp] = decimal.Parse (values [6]);
1286                         else if (values [7].Length > 0) {
1287                                 string decstr = values [7];
1288                                 idx = decstr.IndexOf ('/');
1289                                 if (cp == 0x215F) // special. "1/"
1290                                         decimalValue [cp] = 0x1;
1291                                 else if (idx > 0)
1292                                         // m/n
1293                                         decimalValue [cp] =
1294                                                 decimal.Parse (decstr.Substring (0, idx))
1295                                                 / decimal.Parse (decstr.Substring (idx + 1));
1296                                 else if (decstr [0] == '(' &&
1297                                         decstr [decstr.Length - 1] == ')')
1298                                         // (n)
1299                                         decimalValue [cp] =
1300                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1301                                 else if (decstr [decstr.Length - 1] == '.')
1302                                         // n.
1303                                         decimalValue [cp] =
1304                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1305                                 else
1306                                         decimalValue [cp] = decimal.Parse (decstr);
1307                         }
1308                 }
1309
1310                 // Feeds each line to ProcessDerivedCorePropLine (); a failure logs its line number and rethrows.
1311                 void ParseDerivedCoreProperties (string filename)
1312                 {
1313                         int lineNo = 0; // 1-based number of the line being processed
1314                         using (StreamReader reader = new StreamReader (filename))
1315                                 while (reader.Peek () >= 0) {
1316                                         lineNo++;
1317                                         try {
1318                                                 ProcessDerivedCorePropLine (reader.ReadLine ());
1319                                         } catch (Exception) {
1320                                                 Console.Error.WriteLine ("**** At line " + lineNo);
1321                                                 throw;
1322                                         }
1323                                 }
1324                 }
1325
1326                 // Sets isUppercase for the code point(s) of one "XXXX[..YYYY] ; Uppercase # comment" line.
1327                 void ProcessDerivedCorePropLine (string s)
1328                 {
1329                         int hash = s.IndexOf ('#');
1330                         string data = hash < 0 ? s : s.Substring (0, hash);
1331                         int semicolon = data.IndexOf (';');
1332                         if (semicolon < 0)
1333                                 return; // nothing but comment or blanks on this line
1334                         // Left of ';' is one hex code point or a "first..last" range.
1335                         string cpspec = data.Substring (0, semicolon);
1336                         NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite;
1337                         int dots = cpspec.IndexOf ("..");
1338                         int cp, cpEnd;
1339                         if (dots < 0)
1340                                 cp = cpEnd = int.Parse (cpspec, nf);
1341                         else {
1342                                 cp = int.Parse (cpspec.Substring (0, dots), nf);
1343                                 cpEnd = int.Parse (cpspec.Substring (dots + 2), nf);
1344                         }
1345
1346                         // FIXME: use index. Ranges starting above char.MaxValue are skipped.
1347                         string value = data.Substring (semicolon + 1).Trim ();
1348                         if (cp > char.MaxValue || value != "Uppercase")
1349                                 return;
1350                         for (int x = cp; x <= cpEnd; x++)
1351                                 isUppercase [x] = true;
1352                 }
1353
1354                 // Reads a file of "XXXX[..YYYY] ; Script # comment" lines and collects
1355                 // the non-ignorable characters of the Gurmukhi, Gujarati, Georgian and
1356                 // Thaana scripts (ranges starting above char.MaxValue are skipped). Each
1357                 // set is then sorted with UCAComparer and stored in its ordered* field.
1358                 void ParseScripts (string filename)
1359                 {
1360                         ArrayList gurmukhi = new ArrayList ();
1361                         ArrayList gujarati = new ArrayList ();
1362                         ArrayList georgian = new ArrayList ();
1363                         ArrayList thaana = new ArrayList ();
1364                         NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite;
1365
1366                         using (StreamReader reader = new StreamReader (filename)) {
1367                                 while (reader.Peek () >= 0) {
1368                                         string text = reader.ReadLine ();
1369                                         // Strip the trailing comment; lines without ';' hold no entry.
1370                                         int hash = text.IndexOf ('#');
1371                                         if (hash >= 0)
1372                                                 text = text.Substring (0, hash);
1373                                         int semicolon = text.IndexOf (';');
1374                                         if (semicolon < 0)
1375                                                 continue;
1376
1377                                         // Left of ';' is one hex code point or a "first..last" range.
1378                                         string cpspec = text.Substring (0, semicolon);
1379                                         int dots = cpspec.IndexOf ("..");
1380                                         int cp, cpEnd;
1381                                         if (dots < 0)
1382                                                 cp = cpEnd = int.Parse (cpspec, nf);
1383                                         else {
1384                                                 cp = int.Parse (cpspec.Substring (0, dots), nf);
1385                                                 cpEnd = int.Parse (cpspec.Substring (dots + 2), nf);
1386                                         }
1387
1388                                         // FIXME: use index
1389                                         if (cp > char.MaxValue)
1390                                                 continue;
1391
1392                                         // Pick the list of the named script; any other script is skipped.
1393                                         ArrayList target;
1394                                         switch (text.Substring (semicolon + 1).Trim ()) {
1395                                         case "Gurmukhi": target = gurmukhi; break;
1396                                         case "Gujarati": target = gujarati; break;
1397                                         case "Georgian": target = georgian; break;
1398                                         case "Thaana": target = thaana; break;
1399                                         default: continue;
1400                                         }
1401                                         for (int x = cp; x <= cpEnd; x++)
1402                                                 if (!IsIgnorable (x))
1403                                                         target.Add ((char) x);
1404                                 }
1405                         }
1406
1407                         // Sort every list first, then publish each one as a char array.
1408                         gurmukhi.Sort (UCAComparer.Instance);
1409                         gujarati.Sort (UCAComparer.Instance);
1410                         georgian.Sort (UCAComparer.Instance);
1411                         thaana.Sort (UCAComparer.Instance);
1412                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1413                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1414                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1415                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1416                 }
1417
1418                 // Feeds each line to ProcessJISOrderLine (); any failure, opening the file included, is logged and rethrown.
1419                 void ParseJISOrder (string filename)
1420                 {
1421                         int done = 0; // lines fully processed so far
1422                         try {
1423                                 using (StreamReader reader = new StreamReader (filename))
1424                                         for (; reader.Peek () >= 0; done++)
1425                                                 ProcessJISOrderLine (reader.ReadLine ());
1426                         } catch (Exception) {
1427                                 // Report the 1-based number of the line being read or processed.
1428                                 Console.Error.WriteLine ("---- line {0}", done + 1);
1429                                 throw;
1430                         }
1431                 }
1432
1433                 char [] ws = new char [] {'\t', ' '}; // separators between the two columns
1434
1435                 // Parses one "0xJJJJ<ws>0xUUUU # comment" line and appends the resulting
1436                 // JISCharacter to jisJapanese; lines without two columns are ignored.
1437                 void ProcessJISOrderLine (string s)
1438                 {
1439                         int idx = s.IndexOf ('#');
1440                         // Trim even without a comment, so blank or padded lines cannot break the offsets below.
1441                         s = (idx < 0 ? s : s.Substring (0, idx)).Trim ();
1442                         idx = s.IndexOfAny (ws);
1443                         if (idx < 0)
1444                                 return;
1445                         // Both columns start with "0x", so cut that prefix out.
1446                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1447                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1448                         jisJapanese.Add (new JISCharacter (cp, jis));
1449                 }
1450
1451                 void ParseCJK (string zhXML, string jaXML, string koXML)
1452                 { // Fills cjkCHS, cjkCHT, cjkJA, cjkKO and cjkKOlv2; jaXML is not read in this method.
1453                         XmlDocument doc = new XmlDocument ();
1454                         doc.XmlResolver = null;
1455                         int v; // next weight value to assign
1456                         string s; // characters to weight, in order
1457                         string category; // label used in the warning messages
1458                         int offset; // always 0 below, so the tables are indexed by code point
1459                         ushort [] arr; // table currently being filled
1460
1461                         // Chinese Simplified: weights follow the "pinyin" collation rules of zhXML.
1462                         category = "chs";
1463                         arr = cjkCHS;
1464                         offset = 0;//char.MaxValue - arr.Length;
1465                         doc.Load (zhXML);
1466                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1467                         v = 0x8008;
1468                         foreach (char c in s) {
1469                                 if (c < '\u3100')
1470                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1471                                 else {
1472                                         arr [(int) c - offset] = (ushort) v++;
1473                                         if (v % 256 == 0) // skip values whose low byte would be 0 or 1
1474                                                 v += 2;
1475                                 }
1476                         }
1477
1478                         // Chinese Traditional: same scheme with the "stroke" rules of the same document.
1479                         category = "cht";
1480                         arr = cjkCHT;
1481                         offset = 0;//char.MaxValue - arr.Length;
1482                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1483                         v = 0x8002;
1484                         foreach (char c in s) {
1485                                 if (c < '\u4E00')
1486                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1487                                 else {
1488                                         arr [(int) c - offset] = (ushort) v++;
1489                                         if (v % 256 == 0) // skip values whose low byte would be 0 or 1
1490                                                 v += 2;
1491                                 }
1492                         }
1493
1494                         // Japanese: weights follow the jisJapanese list; no XML document is used.
1495                         category = "ja";
1496                         arr = cjkJA;
1497                         offset = 0;//char.MaxValue - arr.Length;
1498
1499                         // SPECIAL CASES: fixed values below 0x8008, where the computed ones start.
1500                         arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1501                         arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1502                         arr [0x337E] = 0x8005;
1503                         arr [0x337D] = 0x8006;
1504                         arr [0x337C] = 0x8007;
1505
1506                         v = 0x8008;
1507                         foreach (JISCharacter jc in jisJapanese) {
1508                                 if (jc.JIS < 0x8800)
1509                                         continue;
1510                                 char c = (char) jc.CP;
1511
1512                                 if (c < '\u4E00')
1513                                         // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1514                                         continue;
1515                                 else {
1516                                         arr [(int) c - offset] = (ushort) v++;
1517                                         if (v % 256 == 0) // skip values whose low byte would be 0 or 1
1518                                                 v += 2;
1519
1520                                         // SPECIAL CASES: characters decomposing to these get no weight here.
1521                                         if (c == '\u662D') // U+337C
1522                                                 continue;
1523                                         if (c == '\u5927') // U+337D
1524                                                 continue;
1525                                         if (c == '\u5E73') // U+337B
1526                                                 continue;
1527                                         if (c == '\u660E') // U+337E
1528                                                 continue;
1529                                         if (c == '\u9686') // U+F9DC
1530                                                 continue;
1531
1532                                         // FIXME: there are still remaining
1533                                         // characters after U+FA0C.
1534 //                                      for (int k = 0; k < char.MaxValue; k++) {
1535                                         for (int k = 0; k < '\uFA0D'; k++) {
1536                                                 if (decompIndex [k] == 0 || IsIgnorable (k))
1537                                                         continue;
1538                                                 if (decompValues [decompIndex [k]] == c /*&&
1539                                                         decompLength [k] == 1*/ ||
1540                                                         decompLength [k] == 3 &&
1541                                                         decompValues [decompIndex [k] + 1] == c) {
1542                                                         arr [k - offset] = (ushort) v++; // k decomposes to c: weight it right after c
1543                                                         if (v % 256 == 0)
1544                                                                 v += 2;
1545                                                 }
1546                                         }
1547                                 }
1548                         }
1549
1550                         // Korean
1551                         // Korean weight is somewhat complex. It first shifts
1552                         // Hangul category from 52-x to 80-x (they are anyways
1553                         // computed). CJK ideographs are placed at secondary
1554                         // weight, like XX YY 01 zz 01, where XX and YY are
1555                         // corresponding "reset" value and zz is 41,43,45...
1556                         //
1557                         // Unlike chs,cht and ja, Korean value is a combined
1558                         // ushort which is computed as category
1559                         //
1560                         category = "ko";
1561                         arr = cjkKO;
1562                         offset = 0;//char.MaxValue - arr.Length;
1563                         doc.Load (koXML);
1564                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1565                                 XmlElement sc = (XmlElement) reset.NextSibling; // node following <reset>
1566                                 // compute "category" and "level 1" for the
1567                                 // target "reset" Hangul syllable
1568                                 char rc = reset.InnerText [0];
1569                                 int ri = ((int) rc - 0xAC00) + 1;
1570                                 ushort p = (ushort)
1571                                         ((ri / 254) * 256 + (ri % 254) + 2); // ri split base 254: high byte, low byte + 2
1572                                 // Place the characters after the target.
1573                                 s = sc.InnerText;
1574                                 v = 0x41; // level 2 values run 0x41, 0x43, 0x45, ...
1575                                 foreach (char c in s) {
1576                                         arr [(int) c - offset] = p; // value derived from the reset character
1577                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1578                                         v += 2;
1579                                 }
1580                         }
1581                 }
1582
1583                 #endregion
1584
1585                 #region Generation
1586
1587                 // Flags every assigned BMP code point in ignorableFlags with the ignorable tests it passes.
1588                 void FillIgnorables ()
1589                 {
1590                         for (int cp = char.MinValue; cp <= char.MaxValue; cp++) {
1591                                 if (Char.GetUnicodeCategory ((char) cp) == UnicodeCategory.OtherNotAssigned)
1592                                         continue; // unassigned: flags are left alone
1593                                 if (IsIgnorable (cp))
1594                                         ignorableFlags [cp] |= 0x1;
1595                                 if (IsIgnorableSymbol (cp))
1596                                         ignorableFlags [cp] |= 0x2;
1597                                 if (IsIgnorableNonSpacing (cp))
1598                                         ignorableFlags [cp] |= 0x4;
1599                         }
1600                 }
1601
1602                 // Applies hand-made overrides to the parsed decomposition tables
1603                 // (decompType/decompIndex/decompLength/decompValues) and to diacritical.
1604                 void ModifyUnidata ()
1605                 {
1606                         // Modify some decomposition equivalence: these entries are zeroed.
1607                         int [] cleared = new int [] {0xFE31, 0xFE32, 0xFE33, 0xFE34, 0x037E};
1608                         foreach (int cp in cleared) {
1609                                 decompType [cp] = 0;
1610                                 decompIndex [cp] = 0;
1611                                 decompLength [cp] = 0;
1612                         }
1613
1614                         // Hangzhou numbers
1615                         for (int cp = 0x3021; cp < 0x302A; cp++)
1616                                 diacritical [cp] = 0x4E;
1617                         // Korean parens numbers
1618                         for (int cp = 0x3200; cp < 0x321D; cp++)
1619                                 diacritical [cp] = 0xA;
1620                         for (int cp = 0x3260; cp < 0x327C; cp++)
1621                                 diacritical [cp] = 0xC;
1622
1623                         // LAMESPEC: these remapping should not be done.
1624                         // Windows have incorrect CJK compat mappings.
1625                         // Pairs: character, value stored at its decompIndex slot.
1626                         int [] remapped = new int [] {
1627                                 0x32A9, 0x91AB, 0x323B, 0x5B78, 0x32AB, 0x5B78,
1628                                 0x32A2, 0x5BEB, 0x3238, 0x52DE, 0x3298, 0x52DE};
1629                         for (int i = 0; i < remapped.Length; i += 2)
1630                                 decompValues [decompIndex [remapped [i]]] = remapped [i + 1];
1631                         decompLength [0x323B] = decompLength [0x3238] = 1;
1632
1633                         // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
1634                         int borrowed = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1635                         decompIndex [0xFA0C] = borrowed;
1636                         decompValues [borrowed] = 0x5140;
1637                         decompLength [0xFA0C] = 1;
1638                         decompIndex [0xF929] = decompLength [0xF929] = 0;
1639
1640                         decompValues [decompIndex [0xF92C]] = 0x90DE;
1641                 }
1642
1643                 void ModifyParsedValues ()
1644                 {
1645                         // some cyrillic diacritical weight. They seem to be
1646                         // based on old character names, so it's quicker to
1647                         // set them directly here.
1648                         diacritical [0x0496] = diacritical [0x0497] = 7;
1649                         diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1650                         diacritical [0x049A] = diacritical [0x049B] = 0x17;
1651                         diacritical [0x049C] = diacritical [0x049D] = 9;
1652                         diacritical [0x049E] = diacritical [0x049F] = 4;
1653                         diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1654                         diacritical [0x04A2] = diacritical [0x04A3] = 7;
1655                         diacritical [0x04A4] = diacritical [0x04A5] = 8;
1656
1657                         // number, secondary weights
1658                         byte weight = 0x38;
1659                         int [] numarr = numberSecondaryWeightBounds;
1660                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1661                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1662                                         if (Char.IsNumber ((char) cp))
1663                                                 diacritical [cp] = weight;
1664
1665                         // Update name part of named characters
1666                         for (int i = 0; i < sortableCharNames.Count; i++) {
1667                                 DictionaryEntry de =
1668                                         (DictionaryEntry) sortableCharNames [i];
1669                                 int cp = (int) de.Key;
1670                                 string renamed = null;
1671                                 switch (cp) {
1672                                 case 0x2101: renamed = "A_1"; break;
1673                                 case 0x33C3: renamed = "A_2"; break;
1674                                 case 0x2105: renamed = "C_1"; break;
1675                                 case 0x2106: renamed = "C_2"; break;
1676                                 case 0x211E: renamed = "R1"; break;
1677                                 case 0x211F: renamed = "R2"; break;
1678                                 // Remove some of them!
1679                                 case 0x2103:
1680                                 case 0x2109:
1681                                 case 0x2116:
1682                                 case 0x2117:
1683                                 case 0x2118:
1684                                 case 0x2125:
1685                                 case 0x2127:
1686                                 case 0x2129:
1687                                 case 0x212E:
1688                                 case 0x2132:
1689                                         sortableCharNames.RemoveAt (i);
1690                                         i--;
1691                                         continue;
1692                                 }
1693                                 if (renamed != null)
1694                                         sortableCharNames [i] =
1695                                                 new DictionaryEntry (cp, renamed);
1696                         }
1697                 }
1698
1699                 void GenerateCore ()
1700                 {
1701                         UnicodeCategory uc;
1702
1703                         #region Specially ignored // 01
1704                         // This will raise "Defined" flag up.
1705                         // FIXME: Check If it is really fine. Actually for
1706                         // Japanese voice marks this code does remapping.
1707                         foreach (char c in specialIgnore)
1708                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1709                         #endregion
1710
1711                         #region Extenders (FF FF)
1712                         fillIndex [0xFF] = 0xFF;
1713                         char [] specialBiggest = new char [] {
1714                                 '\u3005', '\u3031', '\u3032', '\u309D',
1715                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1716                                 '\uFE7C', '\uFE7D', '\uFF70'};
1717                         foreach (char c in specialBiggest)
1718                                 AddCharMap (c, 0xFF, 0);
1719                         #endregion
1720
1721                         #region Variable weights
1722                         // Controls : 06 03 - 06 3D
1723                         fillIndex [0x6] = 3;
1724                         for (int i = 0; i < 65536; i++) {
1725                                 if (IsIgnorable (i))
1726                                         continue;
1727                                 char c = (char) i;
1728                                 uc = Char.GetUnicodeCategory (c);
1729                                 // NEL is whitespace but not ignored here.
1730                                 if (uc == UnicodeCategory.Control &&
1731                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1732                                         AddCharMap (c, 6, 1);
1733                         }
1734
1735                         // Apostrophe 06 80
1736                         fillIndex [0x6] = 0x80;
1737                         AddCharMap ('\'', 6, 0);
1738                         AddCharMap ('\uFF07', 6, 1);
1739                         AddCharMap ('\uFE63', 6, 1);
1740
1741                         // SPECIAL CASE: fill FE32 here in prior to be added
1742                         // at 2013. Windows does not always respect NFKD.
1743                         map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1744
1745                         // Hyphen/Dash : 06 81 - 06 90
1746                         for (int i = 0; i < char.MaxValue; i++) {
1747                                 if (!IsIgnorable (i) &&
1748                                         Char.GetUnicodeCategory ((char) i) ==
1749                                         UnicodeCategory.DashPunctuation) {
1750                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1751                                         if (i == 0x2011) {
1752                                                 // SPECIAL: add 2027 and 2043
1753                                                 // Maybe they are regarded the
1754                                                 // same hyphens in "central"
1755                                                 // position.
1756                                                 AddCharMap ('\u2027', 6, 1);
1757                                                 AddCharMap ('\u2043', 6, 1);
1758                                         }
1759                                 }
1760                         }
1761                         // They are regarded as primarily equivalent to '-'
1762                         map [0x208B] = new CharMapEntry (6, 0x82, 0);
1763                         map [0x207B] = new CharMapEntry (6, 0x82, 0);
1764                         map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1765
1766                         // Arabic variable weight chars 06 A0 -
1767                         fillIndex [6] = 0xA0;
1768                         // vowels
1769                         for (int i = 0x64B; i <= 0x650; i++)
1770                                 AddArabicCharMap ((char) i);
1771                         // sukun
1772                         AddCharMapGroup ('\u0652', 6, 1, 0);
1773                         // shadda
1774                         AddCharMapGroup ('\u0651', 6, 1, 0);
1775                         #endregion
1776
1777
1778                         #region Nonspacing marks // 01
1779                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1780
1781                         // Combining diacritical marks: 01 DC -
1782
1783                         fillIndex [0x1] = 0x41;
1784                         for (int i = 0x030E; i <= 0x0326; i++)
1785                                 if (!IsIgnorable (i))
1786                                         AddCharMap ((char) i, 0x1, 1);
1787                         for (int i = 0x0329; i <= 0x0334; i++)
1788                                 if (!IsIgnorable (i))
1789                                         AddCharMap ((char) i, 0x1, 1);
1790                         fillIndex [0x1]++;
1791                         for (int i = 0x0339; i <= 0x0341; i++)
1792                                 if (!IsIgnorable (i))
1793                                         AddCharMap ((char) i, 0x1, 1);
1794                         fillIndex [0x1] = 0x74;
1795                         for (int i = 0x0346; i <= 0x0348; i++)
1796                                 if (!IsIgnorable (i))
1797                                         AddCharMap ((char) i, 0x1, 1);
1798                         for (int i = 0x02BE; i <= 0x02BF; i++)
1799                                 if (!IsIgnorable (i))
1800                                         AddCharMap ((char) i, 0x1, 1);
1801                         for (int i = 0x02C1; i <= 0x02C5; i++)
1802                                 if (!IsIgnorable (i))
1803                                         AddCharMap ((char) i, 0x1, 1);
1804                         for (int i = 0x02CE; i <= 0x02CF; i++)
1805                                 if (!IsIgnorable (i))
1806                                         AddCharMap ((char) i, 0x1, 1);
1807                         fillIndex [0x1]++;
1808                         for (int i = 0x02D1; i <= 0x02D3; i++)
1809                                 if (!IsIgnorable (i))
1810                                         AddCharMap ((char) i, 0x1, 1);
1811                         AddCharMap ('\u02DE', 0x1, 1);
1812                         for (int i = 0x02E4; i <= 0x02E9; i++)
1813                                 if (!IsIgnorable (i))
1814                                         AddCharMap ((char) i, 0x1, 1);
1815
1816                         // FIXME: needs more love here (it should eliminate
1817                         // all the hacky code above).
1818                         for (int i = 0x0300; i < 0x0370; i++)
1819                                 if (!IsIgnorable (i) && diacritical [i] != 0
1820                                         /* especiall here*/ && !map [i].Defined)
1821                                         map [i] = new CharMapEntry (
1822                                                 0x1, 0x1, diacritical [i]);
1823
1824                         // Cyrillic and Armenian nonspacing mark
1825                         fillIndex [0x1] = 0x94;
1826                         for (int i = 0x400; i < 0x580; i++)
1827                                 if (!IsIgnorable (i) &&
1828                                         Char.GetUnicodeCategory ((char) i) ==
1829                                         UnicodeCategory.NonSpacingMark)
1830                                         AddCharMap ((char) i, 1, 1);
1831
1832                         fillIndex [0x1] = 0x8D;
1833                         // syriac dotted nonspacing marks (1)
1834                         AddCharMap ('\u0740', 0x1, 1);
1835                         AddCharMap ('\u0741', 0x1, 1);
1836                         AddCharMap ('\u0742', 0x1, 1);
1837                         // syriac oblique nonspacing marks
1838                         AddCharMap ('\u0747', 0x1, 1);
1839                         AddCharMap ('\u0748', 0x1, 1);
1840                         // syriac dotted nonspacing marks (2)
1841                         fillIndex [0x1] = 0x94; // this reset is mandatory
1842                         AddCharMap ('\u0732', 0x1, 1);
1843                         AddCharMap ('\u0735', 0x1, 1);
1844                         AddCharMap ('\u0738', 0x1, 1);
1845                         AddCharMap ('\u0739', 0x1, 1);
1846                         AddCharMap ('\u073C', 0x1, 1);
1847                         // SPECIAL CASES: superscripts
1848                         AddCharMap ('\u073F', 0x1, 1);
1849                         AddCharMap ('\u0711', 0x1, 1);
1850                         // syriac "DOTS"
1851                         for (int i = 0x0743; i <= 0x0746; i++)
1852                                 AddCharMap ((char) i, 0x1, 1);
1853                         for (int i = 0x0730; i <= 0x0780; i++)
1854                                 if (!map [i].Defined &&
1855                                         Char.GetUnicodeCategory ((char) i) ==
1856                                         UnicodeCategory.NonSpacingMark)
1857                                         AddCharMap ((char) i, 0x1, 1);
1858
1859                         // LAMESPEC: It should not stop at '\u20E1'. There are
1860                         // a few more characters (that however results in
1861                         // overflow of level 2 unless we start before 0xDD).
1862                         fillIndex [0x1] = 0xDD;
1863                         for (int i = 0x20D0; i <= 0x20DC; i++)
1864                                 AddCharMap ((char) i, 0x1, 1);
1865                         fillIndex [0x1] = 0xEC;
1866                         for (int i = 0x20DD; i <= 0x20E1; i++)
1867                                 AddCharMap ((char) i, 0x1, 1);
1868                         fillIndex [0x1] = 0x7;
1869                         for (int i = 0x302A; i <= 0x302D; i++)
1870                                 AddCharMap ((char) i, 0x1, 1);
1871                         fillIndex [0x1] = 0x50; // I wonder how they are sorted
1872                         for (int i = 0x02D4; i <= 0x02D7; i++)
1873                                 AddCharMap ((char) i, 0x1, 1);
1874
1875                         // They are not part of Nonspacing marks, but have
1876                         // only diacritical weight.
1877                         for (int i = 0x3099; i <= 0x309C; i++)
1878                                 map [i] = new CharMapEntry (1, 1, 1);
1879                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
1880                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
1881                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1882                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1883                         for (int i = 0x30FC; i <= 0x30FE; i++)
1884                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1885
1886                         fillIndex [0x1] = 0xA;
1887                         for (int i = 0x0951; i <= 0x0954; i++)
1888                                 AddCharMap ((char) i, 0x1, 2);
1889
1890                         #endregion
1891
1892
1893                         #region Whitespaces // 07 03 -
1894                         fillIndex [0x7] = 0x2;
1895                         AddCharMap (' ', 0x7, 2);
1896                         AddCharMap ('\u00A0', 0x7, 1);
1897                         for (int i = 9; i <= 0xD; i++)
1898                                 AddCharMap ((char) i, 0x7, 1);
1899                         for (int i = 0x2000; i <= 0x200B; i++)
1900                                 AddCharMap ((char) i, 0x7, 1);
1901
1902                         fillIndex [0x7] = 0x17;
1903                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1904                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1905
1906                         // Characters which used to represent layout control.
1907                         // LAMESPEC: Windows developers seem to have thought
1908                         // that those characters are kind of whitespaces,
1909                         // while they aren't.
1910                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1911                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1912
1913                         #endregion
1914
1915                         // category 09 - continued symbols from 08
1916                         fillIndex [0x9] = 2;
1917                         // misc tech mark
1918                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1919                                 AddCharMap ((char) cp, 0x9, 1, 0);
1920
1921                         // arrows
1922                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
1923                         foreach (DictionaryEntry de in arrowValues) {
1924                                 int idx = (int) de.Value;
1925                                 int cp = (int) de.Key;
1926                                 if (map [cp].Defined)
1927                                         continue;
1928                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1929                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1930                                 arrowLv2 [idx]++;
1931                         }
1932                         // boxes
1933                         byte [] boxLv2 = new byte [128];
1934                         // 0-63 will be used for those offsets are positive,
1935                         // and 64-127 are for negative ones.
1936                         for (int i = 0; i < boxLv2.Length; i++)
1937                                 boxLv2 [i] = 3;
1938                         foreach (DictionaryEntry de in boxValues) {
1939                                 int cp = (int) de.Key;
1940                                 int off = (int) de.Value;
1941                                 if (map [cp].Defined)
1942                                         continue;
1943                                 if (off < 0) {
1944                                         fillIndex [0x9] = (byte) (0xE5 + off);
1945                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
1946                                 }
1947                                 else {
1948                                         fillIndex [0x9] = (byte) (0xE5 + off);
1949                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1950                                 }
1951                         }
1952                         // Some special characters (slanted)
1953                         fillIndex [0x9] = 0xF4;
1954                         AddCharMap ('\u2571', 0x9, 3);
1955                         AddCharMap ('\u2572', 0x9, 3);
1956                         AddCharMap ('\u2573', 0x9, 3);
1957
1958                         // FIXME: implement 0A
1959                         #region Symbols
1960                         fillIndex [0xA] = 2;
1961                         // byte currency symbols
1962                         for (int cp = 0; cp < 0x100; cp++) {
1963                                 uc = Char.GetUnicodeCategory ((char) cp);
1964                                 if (!IsIgnorable (cp) &&
1965                                         uc == UnicodeCategory.CurrencySymbol &&
1966                                         cp != '$')
1967                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1968                         }
1969                         // byte other symbols
1970                         for (int cp = 0; cp < 0x100; cp++) {
1971                                 if (cp == 0xA6)
1972                                         continue; // SPECIAL: skip FIXME: why?
1973                                 uc = Char.GetUnicodeCategory ((char) cp);
1974                                 if (!IsIgnorable (cp) &&
1975                                         uc == UnicodeCategory.OtherSymbol ||
1976                                         cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
1977                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1978                         }
1979                         // U+30FB here
1980                         AddCharMapGroup ('\u30FB', 0xA, 1, 0);
1981
1982                         for (int cp = 0x2020; cp <= 0x2031; cp++)
1983                                 if (Char.IsPunctuation ((char) cp))
1984                                         AddCharMap ((char) cp, 0xA, 1, 0);
1985                         // SPECIAL CASES: why?
1986                         AddCharMap ('\u203B', 0xA, 1, 0);
1987                         AddCharMap ('\u2040', 0xA, 1, 0);
1988                         AddCharMap ('\u2041', 0xA, 1, 0);
1989                         AddCharMap ('\u2042', 0xA, 1, 0);
1990
1991                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1992                                 AddCharMap ((char) cp, 0xA, 1, 0);
1993
1994                         // 3004 is skipped at first...
1995                         for (int cp = 0x3010; cp <= 0x3040; cp++)
1996                                 if (Char.IsSymbol ((char) cp))
1997                                         AddCharMap ((char) cp, 0xA, 1, 0);
1998                         // SPECIAL CASES: added here
1999                         AddCharMap ('\u3004', 0xA, 1, 0);
2000                         AddCharMap ('\u327F', 0xA, 1, 0);
2001
2002                         for (int cp = 0x2600; cp <= 0x2613; cp++)
2003                                 AddCharMap ((char) cp, 0xA, 1, 0);
2004                         // Dingbats
2005                         for (int cp = 0x2620; cp <= 0x2770; cp++)
2006                                 if (Char.IsSymbol ((char) cp))
2007                                         AddCharMap ((char) cp, 0xA, 1, 0);
2008                         // OCR
2009                         for (int i = 0x2440; i < 0x2460; i++)
2010                                 AddCharMap ((char) i, 0xA, 1, 0);
2011
2012                         // SPECIAL CASES: why?
2013                         AddCharMap ('\u0E3F', 0xA, 1, 0);
2014                         AddCharMap ('\u2117', 0xA, 1, 0);
2015                         AddCharMap ('\u20AC', 0xA, 1, 0);
2016                         #endregion
2017
2018                         #region Numbers // 0C 02 - 0C E1
2019                         fillIndex [0xC] = 2;
2020
2021                         // 9F8 : Bengali "one less than the denominator"
2022                         AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2023
2024                         ArrayList numbers = new ArrayList ();
2025                         for (int i = 0; i < 65536; i++)
2026                                 if (!IsIgnorable (i) &&
2027                                         Char.IsNumber ((char) i) &&
2028                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2029                                         numbers.Add (i);
2030
2031                         ArrayList numberValues = new ArrayList ();
2032                         foreach (int i in numbers)
2033                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2034                         // SPECIAL CASE: Cyrillic Thousand sign
2035                         numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2036                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2037
2038 //foreach (DictionaryEntry de in numberValues)
2039 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2040
2041                         // FIXME: fillIndex adjustment lines are too
2042                         // complicated. It must be simpler.
2043                         decimal prevValue = -1;
2044                         foreach (DictionaryEntry de in numberValues) {
2045                                 int cp = (int) de.Key;
2046                                 decimal currValue = (decimal) de.Value;
2047                                 bool addnew = false;
2048                                 if (prevValue < currValue &&
2049                                         prevValue - (int) prevValue == 0 &&
2050                                         prevValue >= 1) {
2051
2052                                         addnew = true;
2053                                         // Process Hangzhou and Roman numbers
2054
2055                                         // There are some SPECIAL cases.
2056                                         if (currValue != 4) // no increment for 4
2057                                                 fillIndex [0xC]++;
2058
2059                                         int xcp;
2060                                         if (currValue <= 13) {
2061                                                 if (currValue == 4)
2062                                                         fillIndex [0xC]++;
2063                                                 // SPECIAL CASE
2064                                                 if (currValue == 11)
2065                                                         AddCharMap ('\u0BF0', 0xC, 1);
2066                                                 xcp = (int) prevValue + 0x2160 - 1;
2067                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2068                                                 xcp = (int) prevValue + 0x2170 - 1;
2069                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2070                                                 fillIndex [0xC]++;
2071                                         }
2072                                         if (currValue < 12)
2073                                                 fillIndex [0xC]++;
2074                                         if (currValue <= 10) {
2075                                                 xcp = (int) prevValue + 0x3021 - 1;
2076                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2077                                                 fillIndex [0xC]++;
2078                                         }
2079                                 }
2080                                 if (prevValue < currValue)
2081                                         prevValue = currValue;
2082                                 if (map [cp].Defined)
2083                                         continue;
2084                                 // HangZhou and Roman are add later
2085                                 // (code is above)
2086                                 if (0x3021 <= cp && cp < 0x302A
2087                                         || 0x2160 <= cp && cp < 0x216C
2088                                         || 0x2170 <= cp && cp < 0x217C)
2089                                         continue;
2090
2091                                 if (cp == 0x215B) // FIXME: why?
2092                                         fillIndex [0xC] += 2;
2093                                 else if (cp == 0x3021) // FIXME: why?
2094                                         fillIndex [0xC]++;
2095                                 if (addnew || cp <= '9') {
2096                                         int mod = (int) currValue - 1;
2097                                         int xcp;
2098                                         if (1 <= currValue && currValue <= 11) {
2099                                                 xcp = mod + 0x2776;
2100                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2101                                                 xcp = mod + 0x2780;
2102                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2103                                                 xcp = mod + 0x278A;
2104                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2105                                         }
2106                                         if (1 <= currValue && currValue <= 20) {
2107                                                 xcp = mod + 0x2460;
2108                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2109                                                 xcp = mod + 0x2474;
2110                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2111                                                 xcp = mod + 0x2488;
2112                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2113                                         }
2114                                 }
2115                                 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2116                                         fillIndex [0xC]++;
2117                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2118
2119                                 switch (cp) {
2120                                 // Maybe Bengali digit numbers do not increase
2121                                 // indexes, but 0x09E6 does.
2122                                 case 0x09E7: case 0x09E8: case 0x09E9:
2123                                 case 0x09EA:
2124                                 // SPECIAL CASES
2125                                 case 0x0BF0: case 0x2180: case 0x2181:
2126                                         break;
2127                                 // SPECIAL CASE
2128                                 case 0x0BF1:
2129                                         fillIndex [0xC]++;
2130                                         break;
2131                                 default:
2132                                         if (currValue < 11 || currValue == 1000)
2133                                                 fillIndex [0xC]++;
2134                                         break;
2135                                 }
2136
2137                                 // Add special cases that are not regarded as
2138                                 // numbers in UnicodeCategory speak.
2139                                 if (cp == '5') {
2140                                         // TONE FIVE
2141                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2142                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2143                                 }
2144                                 else if (cp == '2' || cp == '6') // FIXME: why?
2145                                         fillIndex [0xC]++;
2146                         }
2147
2148                         // 221E: infinity
2149                         fillIndex [0xC] = 0xFF;
2150                         AddCharMap ('\u221E', 0xC, 1);
2151                         #endregion
2152
2153                         #region Letters and NonSpacing Marks (general)
2154
2155                         // ASCII Latin alphabets
2156                         for (int i = 0; i < alphabets.Length; i++)
2157                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2158
2159                         // non-ASCII Latin alphabets
2160                         // FIXME: there is no such characters that are placed
2161                         // *after* "alphabets" array items. This is nothing
2162                         // more than a hack that creates dummy weight for
2163                         // primary characters.
2164                         for (int i = 0x0080; i < 0x0300; i++) {
2165                                 if (!Char.IsLetter ((char) i))
2166                                         continue;
2167                                 // For those Latin Letters which has NFKD are
2168                                 // not added as independent primary character.
2169                                 if (decompIndex [i] != 0)
2170                                         continue;
2171                                 // SPECIAL CASES:
2172                                 // 1.some alphabets have primarily
2173                                 //   equivalent ASCII alphabets.
2174                                 // 2.some have independent primary weights,
2175                                 //   but inside a-to-z range.
2176                                 // 3.there are some expanded characters that
2177                                 //   are not part of Unicode Standard NFKD.
2178                                 // 4. some characters are letter in IsLetter
2179                                 //   but not in sortkeys (maybe unicode version
2180                                 //   difference caused it).
2181                                 switch (i) {
2182                                 // 1. skipping them does not make sense
2183 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2184 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2185 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2186 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2187 //                              case 0x19B: case 0x19C:
2188                                 // 2. skipping them does not make sense
2189 //                              case 0x14A: // Ng
2190 //                              case 0x14B: // ng
2191                                 // 3.
2192                                 case 0xC6: // AE
2193                                 case 0xE6: // ae
2194                                 case 0xDE: // Icelandic Thorn
2195                                 case 0xFE: // Icelandic Thorn
2196                                 case 0xDF: // German ss
2197                                 case 0xFF: // y with diaeresis
2198                                 // 4.
2199                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2200                                 // not classified yet
2201 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2202 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2203 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2204 //                              case 0x1DD:
2205                                         continue;
2206                                 }
2207                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2208                         }
2209
2210                         // Greek and Coptic
2211                         fillIndex [0xF] = 02;
2212                         for (int i = 0x0380; i < 0x0390; i++)
2213                                 if (Char.IsLetter ((char) i))
2214                                         AddLetterMap ((char) i, 0xF, 1);
2215                         fillIndex [0xF] = 02;
2216                         for (int i = 0x0391; i < 0x03CF; i++)
2217                                 if (Char.IsLetter ((char) i))
2218                                         AddLetterMap ((char) i, 0xF, 1);
2219                         fillIndex [0xF] = 0x40;
2220                         for (int i = 0x03D0; i < 0x0400; i++)
2221                                 if (Char.IsLetter ((char) i))
2222                                         AddLetterMap ((char) i, 0xF, 1);
2223
2224                         // Cyrillic.
2225                         // Cyrillic letters are sorted like Latin letters, i.e.
2226                         // culture-specific letters are placed within the
2227                         // standard Cyrillic sequence.
2228                         //
2229                         // We can't use UCA here; it has different sorting.
2230                         char [] orderedCyrillic = new char [] {
2231                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2232                                 '\u0452', // DJE for Serbocroatian
2233                                 '\u0435',
2234                                 '\u0454', // IE for Ukrainian
2235                                 '\u0436', '\u0437',
2236                                 '\u0455', // DZE
2237                                 '\u0438',
2238                                 '\u0456', // Byelorussian-Ukrainian I
2239                                 '\u0457', // YI
2240                                 '\u0439',
2241                                 '\u0458', // JE
2242                                 '\u043A', '\u043B',
2243                                 '\u0459', // LJE
2244                                 '\u043C', '\u043D',
2245                                 '\u045A', // NJE
2246                                 '\u043E',
2247                                 // 4E9 goes here.
2248                                 '\u043F', '\u0440', '\u0441', '\u0442',
2249                                 '\u045B', // TSHE for Serbocroatian
2250                                 '\u0443',
2251                                 '\u045E', // Short U for Byelorussian
2252                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2253                                 '\u0444', '\u0445', '\u0446', '\u0447',
2254                                 '\u045F', // DZHE
2255                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2256                                 '\u044D', '\u044E', '\u044F'};
2257
2258                         // Some characters map to basic Cyrillic letters.
2259                         // See UnicodeData.txt character names for the
2260                         // sources. Here I simply declare an equivalence
2261                         // array: entry i is the base letter for
2262                         // U+0490 + 2 * i, so the small letters are skipped.
2263                         char [] cymap_src = new char [] {
2264                                 '\u0433', '\u0433', '\u0433', '\u0436',
2265                                 '\u0437', '\u043A', '\u043A', '\u043A',
2266                                 '\u043A', '\u043D', '\u043D', '\u043F',
2267                                 '\u0445', '\u0441', '\u0442', '\u0443',
2268                                 '\u0443', '\u0445', '\u0446', '\u0447',
2269                                 '\u0447', '\u0432', '\u0435', '\u0435',
2270                                 '\u0406', '\u0436', '\u043A', '\u043D',
2271                                 '\u0447', '\u0435'};
2272
2273                         fillIndex [0x10] = 0x8D;
2274                         for (int i = 0x0460; i < 0x0481; i++) {
2275                                 if (Char.IsLetter ((char) i)) {
2276                                         if (i == 0x0476)
2277                                                 // U+476/477 have the same
2278                                                 // primary weight as U+474/475.
2279                                                 fillIndex [0x10] -= 3;
2280                                         AddLetterMap ((char) i, 0x10, 3);
2281                                 }
2282                         }
2283
2284                         fillIndex [0x10] = 0x6;
2285                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2286                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2287                                 if (!IsIgnorable ((int) c) &&
2288                                         Char.IsLetter (c) &&
2289                                         !map [c].Defined) {
2290                                         AddLetterMap (c, 0x10, 0);
2291                                         fillIndex [0x10] += 3;
2292                                 }
2293                         }
2294
2295                         for (int i = 0; i < cymap_src.Length; i++) {
2296                                 char c = cymap_src [i];
2297                                 fillIndex [0x10] = map [c].Level1;
2298                                 int c2 = 0x0490 + i * 2;
2299                                 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2300                         }
2301
2302                         // Armenian
2303                         fillIndex [0x11] = 0x3;
2304                         fillIndex [0x1] = 0x98;
2305                         for (int i = 0x0531; i < 0x0586; i++) {
2306                                 if (i == 0x0559 || i == 0x55A)
2307                                         AddCharMap ((char) i, 1, 1);
2308                                 if (Char.IsLetter ((char) i))
2309                                         AddLetterMap ((char) i, 0x11, 1);
2310                         }
2311
2312                         // Hebrew
2313                         // -Letters
2314                         fillIndex [0x12] = 0x2;
2315                         for (int i = 0x05D0; i < 0x05FF; i++)
2316                                 if (Char.IsLetter ((char) i))
2317                                         AddLetterMap ((char) i, 0x12, 1);
2318                         // -Accents
2319                         fillIndex [0x1] = 0x3;
2320                         for (int i = 0x0591; i <= 0x05C2; i++) {
2321                                 if (i == 0x05A3 || i == 0x05BB)
2322                                         fillIndex [0x1]++;
2323                                 if (i != 0x05BE)
2324                                         AddCharMap ((char) i, 0x1, 1);
2325                         }
2326
2327                         // Arabic
2328                         fillIndex [0x1] = 0x8E;
2329                         fillIndex [0x13] = 0x3;
2330                         for (int i = 0x0621; i <= 0x064A; i++) {
2331                                 // Abjad
2332                                 if (Char.GetUnicodeCategory ((char) i)
2333                                         != UnicodeCategory.OtherLetter) {
2334                                         // FIXME: arabic nonspacing marks are
2335                                         // in different order.
2336                                         AddCharMap ((char) i, 0x1, 1);
2337                                         continue;
2338                                 }
2339 //                              map [i] = new CharMapEntry (0x13,
2340 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2341                                 fillIndex [0x13] =
2342                                         (byte) arabicLetterPrimaryValues [i];
2343                                 byte formDiacritical = 8; // default
2344                                 // SPECIAL CASES:
2345                                 switch (i) {
2346                                 case 0x0622: formDiacritical = 9; break;
2347                                 case 0x0623: formDiacritical = 0xA; break;
2348                                 case 0x0624: formDiacritical = 5; break;
2349                                 case 0x0625: formDiacritical = 0xB; break;
2350                                 case 0x0626: formDiacritical = 7; break;
2351                                 case 0x0649: formDiacritical = 5; break;
2352                                 case 0x064A: formDiacritical = 7; break;
2353                                 }
2354                                 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2355                         }
2356                         for (int i = 0x0670; i < 0x0673; i++)
2357                                 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2358                         fillIndex [0x13] = 0x84;
2359                         for (int i = 0x0674; i < 0x06D6; i++)
2360                                 if (Char.IsLetter ((char) i))
2361                                         AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2362
2363                         // Devanagari
2364
2365                         // FIXME: this could be fixed in more decent way
2366                         for (int i = 0x0958; i <= 0x095F; i++)
2367                                 diacritical [i] = 8;
2368
2369                         // FIXME: it does seem straight codepoint mapping.
2370                         fillIndex [0x14] = 04;
2371                         for (int i = 0x0901; i < 0x0905; i++)
2372                                 if (!IsIgnorable (i))
2373                                         AddLetterMap ((char) i, 0x14, 2);
2374                         fillIndex [0x14] = 0xB;
2375                         for (int i = 0x0905; i < 0x093A; i++) {
2376                                 if (i == 0x0928)
2377                                         AddCharMap ('\u0929', 0x14, 0, 8);
2378                                 if (i == 0x0930)
2379                                         AddCharMap ('\u0931', 0x14, 0, 8);
2380                                 if (i == 0x0933)
2381                                         AddCharMap ('\u0934', 0x14, 0, 8);
2382                                 if (Char.IsLetter ((char) i))
2383                                         AddLetterMap ((char) i, 0x14, 4);
2384                                 if (i == 0x090B)
2385                                         AddCharMap ('\u0960', 0x14, 4);
2386                                 if (i == 0x090C)
2387                                         AddCharMap ('\u0961', 0x14, 4);
2388                         }
2389                         fillIndex [0x14] = 0xDA;
2390                         for (int i = 0x093E; i < 0x0945; i++)
2391                                 if (!IsIgnorable (i))
2392                                         AddLetterMap ((char) i, 0x14, 2);
2393                         fillIndex [0x14] = 0xEC;
2394                         for (int i = 0x0945; i < 0x094F; i++)
2395                                 if (!IsIgnorable (i))
2396                                         AddLetterMap ((char) i, 0x14, 2);
2397
2398                         // Bengali
2399                         // -Letters
2400                         fillIndex [0x15] = 02;
2401                         for (int i = 0x0980; i < 0x9FF; i++) {
2402                                 if (IsIgnorable (i))
2403                                         continue;
2404                                 if (i == 0x09E0)
2405                                         fillIndex [0x15] = 0x3B;
2406                                 switch (Char.GetUnicodeCategory ((char) i)) {
2407                                 case UnicodeCategory.NonSpacingMark:
2408                                 case UnicodeCategory.DecimalDigitNumber:
2409                                 case UnicodeCategory.OtherNumber:
2410                                         continue;
2411                                 }
2412                                 AddLetterMap ((char) i, 0x15, 1);
2413                         }
2414                         // -Signs
2415                         fillIndex [0x1] = 0x3;
2416                         for (int i = 0x0981; i < 0x0A00; i++)
2417                                 if (Char.GetUnicodeCategory ((char) i) ==
2418                                         UnicodeCategory.NonSpacingMark)
2419                                         AddCharMap ((char) i, 0x1, 1);
2420
2421                         // Gurmukhi. orderedGurmukhi is from UCA
2422                         // FIXME: it does not look equivalent to UCA.
2423                         fillIndex [0x16] = 04;
2424                         fillIndex [0x1] = 3;
2425                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2426                                 char c = orderedGurmukhi [i];
2427                                 if (IsIgnorable ((int) c))
2428                                         continue;
2429                                 if (IsIgnorableNonSpacing (c)) {
2430                                         AddLetterMap (c, 0x1, 1);
2431                                         continue;
2432                                 }
2433                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2434                                         '\u0A66' <= c && c <= '\u0A71')
2435                                         continue;
2436                                 // SPECIAL CASES
2437                                 byte shift = 4;
2438                                 switch (c) {
2439                                 case '\u0A33': case '\u0A36': case '\u0A16':
2440                                 case '\u0A17': case '\u0A5B': case '\u0A5E':
2441                                         shift = 0;
2442                                         break;
2443                                 }
2444                                 if (c == '\u0A3E') // Skip
2445                                         fillIndex [0x16] = 0xC0;
2446                                 AddLetterMap (c, 0x16, shift);
2447                         }
2448
2449                         // Gujarati. orderedGujarati is from UCA
2450                         fillIndex [0x17] = 0x4;
2451                         // nonspacing marks
2452                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2453                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2454                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2455                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2456                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2457                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2458                         // letters go first.
2459                         for (int i = 0; i < orderedGujarati.Length; i++) {
2460                                 // SPECIAL CASE
2461                                 char c = orderedGujarati [i];
2462                                 if (Char.IsLetter (c)) {
2463                                         // SPECIAL CASES
2464                                         if (c == '\u0AB3' || c == '\u0A32')
2465                                                 continue;
2466                                         if (c == '\u0A33') {
2467                                                 AddCharMap ('\u0A32', 0x17, 0);
2468                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2469                                                 continue;
2470                                         }
2471                                         if (c == '\u0A8B')
2472                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2473                                         AddCharMap (c, 0x17, 4);
2474
2475                                         if (c == '\u0AB9')
2476                                                 AddCharMap ('\u0AB3', 0x17, 6);
2477                                 }
2478                         }
2479                         // non-letters
2480                         byte gujaratiShift = 4;
2481                         fillIndex [0x17] = 0xC0;
2482                         for (int i = 0; i < orderedGujarati.Length; i++) {
2483                                 char c = orderedGujarati [i];
2484                                 if (fillIndex [0x17] == 0xCC)
2485                                         gujaratiShift = 3;
2486                                 if (!Char.IsLetter (c)) {
2487                                         // SPECIAL CASES
2488                                         if (c == '\u0A82')
2489                                                 AddCharMap ('\u0A81', 0x17, 2);
2490                                         if (c == '\u0AC2')
2491                                                 fillIndex [0x17]++;
2492                                         AddLetterMap (c, 0x17, gujaratiShift);
2493                                 }
2494                         }
2495
2496                         // Oriya
2497                         fillIndex [0x1] = 03;
2498                         fillIndex [0x18] = 02;
2499                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2500                                 switch (Char.GetUnicodeCategory ((char) i)) {
2501                                 case UnicodeCategory.NonSpacingMark:
2502                                 case UnicodeCategory.DecimalDigitNumber:
2503                                         AddLetterMap ((char) i, 0x1, 1);
2504                                         continue;
2505                                 }
2506                                 AddLetterMap ((char) i, 0x18, 1);
2507                         }
2508
2509                         // Tamil
2510                         fillIndex [0x19] = 2;
2511                         AddCharMap ('\u0BD7', 0x19, 0);
2512                         fillIndex [0x19] = 0xA;
2513                         // vowels
2514                         for (int i = 0x0B82; i <= 0x0B94; i++)
2515                                 if (!IsIgnorable ((char) i))
2516                                         AddCharMap ((char) i, 0x19, 2);
2517                         // special vowel
2518                         fillIndex [0x19] = 0x28;
2519                         // The array for Tamil consonants is a constant.
2520                         // Windows has almost the same sequence as TAM from
2521                         // tamilnet, but it differs a bit in Grantha.
2522                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2523                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2524                         // combining marks
2525                         fillIndex [0x19] = 0x82;
2526                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2527                                 if (Char.GetUnicodeCategory ((char) i) ==
2528                                         UnicodeCategory.SpacingCombiningMark
2529                                         || i == 0x0BC0)
2530                                         AddLetterMap ((char) i, 0x19, 2);
2531
2532                         // Telugu
2533                         fillIndex [0x1A] = 0x4;
2534                         for (int i = 0x0C00; i < 0x0C62; i++) {
2535                                 if (i == 0x0C55 || i == 0x0C56)
2536                                         continue; // skip
2537                                 AddCharMap ((char) i, 0x1A, 3);
2538                                 char supp = (i == 0x0C0B) ? '\u0C60':
2539                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2540                                 if (supp == char.MinValue)
2541                                         continue;
2542                                 AddCharMap (supp, 0x1A, 3);
2543                         }
2544
2545                         // Kannada
2546                         fillIndex [0x1B] = 4;
2547                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2548                                 if (i == 0x0CD5 || i == 0x0CD6)
2549                                         continue; // ignore
2550                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2551                                         continue; // shift after 0xCB9
2552                                 AddCharMap ((char) i, 0x1B, 3);
2553                                 if (i == 0x0CB9) {
2554                                         // SPECIAL CASES: but why?
2555                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2556                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2557                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2558                                 }
2559                                 if (i == 0x0CB2)
2560                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2561                         }
2562
2563                         // Malayalam
2564                         fillIndex [0x1C] = 2;
2565                         fillIndex [0x1] = 3;
2566                         for (int i = 0x0D02; i < 0x0D61; i++) {
2567                                 // FIXME: I avoided MSCompatUnicodeTable usage
2568                                 // here (it results in recursion). So check if
2569                                 // using NonSpacingMark makes sense or not.
2570                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2571 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2572                                         AddCharMap ((char) i, 0x1C, 1);
2573                                 else if (!IsIgnorable ((char) i))
2574                                         AddCharMap ((char) i, 1, 1);
2575                         }
2576
2577                         // Thai ... note that it breaks 0x1E wall after E2B!
2578                         // Also, all Thai characters have level 2 value 3.
2579                         fillIndex [0x1E] = 2;
2580                         fillIndex [0x1] = 3;
2581                         for (int i = 0xE40; i <= 0xE44; i++)
2582                                 AddCharMap ((char) i, 0x1E, 1, 3);
2583                         for (int i = 0xE01; i < 0xE2B; i++)
2584                                 AddCharMap ((char) i, 0x1E, 6, 3);
2585                         fillIndex [0x1F] = 5;
2586                         for (int i = 0xE2B; i < 0xE30; i++)
2587                                 AddCharMap ((char) i, 0x1F, 6, 3);
2588                         fillIndex [0x1F] = 0x1E;
2589                         for (int i = 0xE30; i < 0xE3B; i++)
2590                                 AddCharMap ((char) i, 0x1F, 1, 3);
2591                         // some Thai characters remain.
2592                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2593                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2594                         foreach (char c in specialThai)
2595                                 AddCharMap (c, 0x1F, 1, 3);
2596
2597                         for (int i = 0xE00; i < 0xE80; i++)
2598                                 if (Char.GetUnicodeCategory ((char) i) ==
2599                                         UnicodeCategory.NonSpacingMark)
2600                                         AddCharMap ((char) i, 1, 1);
2601
2602                         // Lao
2603                         fillIndex [0x1F] = 2;
2604                         fillIndex [0x1] = 3;
2605                         for (int i = 0xE80; i < 0xEDF; i++) {
2606                                 if (IsIgnorable ((char) i))
2607                                         continue;
2608                                 else if (Char.IsLetter ((char) i))
2609                                         AddCharMap ((char) i, 0x1F, 1);
2610                                 else if (Char.GetUnicodeCategory ((char) i) ==
2611                                         UnicodeCategory.NonSpacingMark)
2612                                         AddCharMap ((char) i, 1, 1);
2613                         }
2614
2615                         // Georgian. orderedGeorgian is from UCA DUCET.
2616                         fillIndex [0x21] = 5;
2617                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2618                                 char c = orderedGeorgian [i];
2619                                 if (map [(int) c].Defined)
2620                                         continue;
2621                                 AddCharMap (c, 0x21, 0);
2622                                 if (c < '\u10F6')
2623                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2624                                 fillIndex [0x21] += 5;
2625                         }
2626
2627                         // Japanese Kana.
2628                         fillIndex [0x22] = 2;
2629                         int kanaOffset = 0x3041;
2630                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2631
2632                         for (int gyo = 0; gyo < 9; gyo++) {
2633                                 for (int dan = 0; dan < 5; dan++) {
2634                                         if (gyo == 7 && dan % 2 == 1) {
2635                                                 // 'ya'-gyo
2636                                                 fillIndex [0x22]++;
2637                                                 kanaOffset -= 2; // There is no space for yi and ye.
2638                                                 continue;
2639                                         }
2640                                         int cp = kanaOffset + dan * kanaLines [gyo];
2641                                         // small lines (a-gyo, ya-gyo)
2642                                         if (gyo == 0 || gyo == 7) {
2643                                                 AddKanaMap (cp, 1); // small
2644                                                 AddKanaMap (cp + 1, 1);
2645                                         }
2646                                         else
2647                                                 AddKanaMap (cp, kanaLines [gyo]);
2648                                         fillIndex [0x22]++;
2649
2650                                         if (cp == 0x30AB) {
2651                                                 // add small 'ka' (before normal one)
2652                                                 AddKanaMap (0x30F5, 1);
2653                                                 kanaOffset++;
2654                                         }
2655                                         if (cp == 0x30B1) {
2656                                                 // add small 'ke' (before normal one)
2657                                                 AddKanaMap (0x30F6, 1);
2658                                                 kanaOffset++;
2659                                         }
2660                                         if (cp == 0x3061) {
2661                                                 // add small 'Tsu' (before normal one)
2662                                                 AddKanaMap (0x3063, 1);
2663                                                 kanaOffset++;
2664                                         }
2665                                 }
2666                                 fillIndex [0x22] += 3;
2667                                 kanaOffset += 5 * kanaLines [gyo];
2668                         }
2669
2670                         // Wa-gyo is almost special, so I just manually add.
2671                         AddLetterMap ((char) 0x308E, 0x22, 0);
2672                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2673                         AddLetterMap ((char) 0x308F, 0x22, 0);
2674                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2675                         fillIndex [0x22]++;
2676                         AddLetterMap ((char) 0x3090, 0x22, 0);
2677                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2678                         fillIndex [0x22] += 2;
2679                         // no "Wu" in Japanese.
2680                         AddLetterMap ((char) 0x3091, 0x22, 0);
2681                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2682                         fillIndex [0x22]++;
2683                         AddLetterMap ((char) 0x3092, 0x22, 0);
2684                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2685                         // Nn
2686                         fillIndex [0x22] = 0x80;
2687                         AddLetterMap ((char) 0x3093, 0x22, 0);
2688                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2689
2690                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2691                                 map [0x30A6].Level1, 3);// voiced hiragana U
2692                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2693                                 map [0x30A6].Level1, 3);// voiced katakana U
2694
2695                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2696                                 map [0x30AB].Level1, 0);// small katakana Ka
2697                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2698                                 map [0x30B1].Level1, 0);// small katakana Ke
2699                         // voiced Wa lines
2700                         for (int i = 0x30F7; i < 0x30FB; i++)
2701                                 map [i] = new CharMapEntry (map [i - 8].Category,
2702                                         map [i - 8].Level1,
2703                                         3);
2704
2705                         // JIS Japanese square chars.
2706                         fillIndex [0x22] = 0x97;
2707                         jisJapanese.Sort (JISComparer.Instance);
2708                         foreach (JISCharacter j in jisJapanese)
2709                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2710                                         AddCharMap ((char) j.CP, 0x22, 1);
2711                         // non-JIS Japanese square chars.
2712                         nonJisJapanese.Sort (NonJISComparer.Instance);
2713                         foreach (NonJISCharacter j in nonJisJapanese)
2714                                 AddCharMap ((char) j.CP, 0x22, 1);
2715
2716                         // Bopomofo
2717                         fillIndex [0x23] = 0x02;
2718                         for (int i = 0x3105; i <= 0x312C; i++)
2719                                 AddCharMap ((char) i, 0x23, 1);
2720
2721                         // Estrangela: ancient Syriac
2722                         fillIndex [0x24] = 0x0B;
2723                         // FIXME: is 0x71E really alternative form?
2724                         ArrayList syriacAlternatives = new ArrayList (
2725                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2726                         for (int i = 0x0710; i <= 0x072C; i++) {
2727                                 if (i == 0x0711) // NonSpacingMark
2728                                         continue;
2729                                 if (syriacAlternatives.Contains (i))
2730                                         continue;
2731                                 AddCharMap ((char) i, 0x24, 4);
2732                                 // FIXME: why?
2733                                 if (i == 0x721)
2734                                         fillIndex [0x24]++;
2735                         }
2736                         foreach (int cp in syriacAlternatives)
2737                                 map [cp] = new CharMapEntry (0x24,
2738                                         (byte) (map [cp - 1].Level1 + 2),
2739                                         0);
2740                         // FIXME: Syriac NonSpacingMark should go here.
2741
2742                         // Thaana
2743                         // FIXME: it turned out that it does not look like UCA
2744                         fillIndex [0x24] = 0x6E;
2745                         fillIndex [0x1] = 0xAC;
2746                         for (int i = 0; i < orderedThaana.Length; i++) {
2747                                 char c = orderedThaana [i];
2748                                 if (IsIgnorableNonSpacing ((int) c))
2749                                         AddCharMap (c, 1, 1);
2750                                 AddCharMap (c, 0x24, 2);
2751                                 if (c == '\u0782') // SPECIAL CASE: why?
2752                                         fillIndex [0x24] += 2;
2753                         }
2754                         #endregion
2755
2756                         // FIXME: Add more culture-specific letters (that are
2757                         // not supported in Windows collation) here.
2758
2759                         // Surrogate ... they are computed.
2760
2761                         #region Hangul
2762                         // Hangul.
2763                         //
2764                         // Unlike UCA Windows Hangul sequence mixes Jongseong
2765                         // with Choseong sequence as well as Jungseong,
2766                         // adjusted to have the same primary weight for the
2767                         // same base character. So it is impossible to compute
2768                         // those sort keys.
2769                         //
2770                         // Here I introduce an ordered sequence of mixed
2771                         // 'commands' and 'characters' that is similar to
2772                         // LDML text:
2773                         //      - ',' increases primary weight.
2774                         //      - [A B] means a range, increasing index
2775                         //      - {A B} means a range, without increasing index
2776                         //      - '=' is no operation (it means the characters
2777                         //        of both sides have the same weight).
2778                         //      - '>' inserts a Hangul Syllable block that
2779                         //        contains 0x251 characters.
2780                         //      - '<' decreases the index
2781                         //      - '0'-'9' means skip count
2782                         //      - whitespaces are ignored
2783                         //
2784
2785                         string hangulSequence =
2786                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2787                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2788                         + "<{\u1113 \u1116}, \u3165,"
2789                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2790                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2791                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2792                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2793                                 + "[\u11D1 \u11D2], \u11B2,"
2794                                 + "[\u11D3 \u11D5], \u11B3,"
2795                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2796                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2797                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2798                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2799                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2800                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2801                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2802                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2803                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2804                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2805                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2806                         + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2807                                 + "\u11F1,, \u11F2,,,"
2808                                 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2809                         + "<\u114D, \u110D,,  >"
2810                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2811                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2812                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2813                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2814                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2815                                 + "[\u11F5 \u11F8]"
2816                         ;
2817
2818                         byte hangulCat = 0x52;
2819                         fillIndex [hangulCat] = 0x2;
2820
2821                         int syllableBlock = 0;
2822                         for (int n = 0; n < hangulSequence.Length; n++) {
2823                                 char c = hangulSequence [n];
2824                                 int start, end;
2825                                 if (Char.IsWhiteSpace (c))
2826                                         continue;
2827                                 switch (c) {
2828                                 case '=':
2829                                         break; // NOP
2830                                 case ',':
2831                                         IncrementSequentialIndex (ref hangulCat);
2832                                         break;
2833                                 case '<':
2834                                         if (fillIndex [hangulCat] == 2)
2835                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2836                                         fillIndex [hangulCat]--;
2837                                         break;
2838                                 case '>':
2839                                         IncrementSequentialIndex (ref hangulCat);
2840                                         for (int l = 0; l < 0x15; l++)
2841                                                 for (int v = 0; v < 0x1C; v++) {
2842                                                         AddCharMap (
2843                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2844                                                         IncrementSequentialIndex (ref hangulCat);
2845                                                 }
2846                                         syllableBlock++;
2847                                         break;
2848                                 case '[':
2849                                         start = hangulSequence [n + 1];
2850                                         end = hangulSequence [n + 3];
2851                                         for (int i = start; i <= end; i++) {
2852                                                 AddCharMap ((char) i, hangulCat, 0);
2853                                                 if (end > i)
2854                                                         IncrementSequentialIndex (ref hangulCat);
2855                                         }
2856                                         n += 4; // consumes 5 characters for this operation
2857                                         break;
2858                                 case '{':
2859                                         start = hangulSequence [n + 1];
2860                                         end = hangulSequence [n + 3];
2861                                         for (int i = start; i <= end; i++)
2862                                                 AddCharMap ((char) i, hangulCat, 0);
2863                                         n += 4; // consumes 5 characters for this operation
2864                                         break;
2865                                 default:
2866                                         AddCharMap (c, hangulCat, 0);
2867                                         break;
2868                                 }
2869                         }
2870
2871                         // Some Jamo NFKD.
2872                         for (int i = 0x3200; i < 0x3300; i++) {
2873                                 if (IsIgnorable (i) || map [i].Defined)
2874                                         continue;
2875                                 int ch = 0;
2876                                 // w/ bracket
2877                                 if (decompLength [i] == 4 &&
2878                                         decompValues [decompIndex [i]] == '(')
2879                                         ch = decompIndex [i] + 1;
2880                                 // circled
2881                                 else if (decompLength [i] == 2 &&
2882                                         decompValues [decompIndex [i] + 1] == '\u1161')
2883                                         ch = decompIndex [i];
2884                                 else if (decompLength [i] == 1)
2885                                         ch = decompIndex [i];
2886                                 else
2887                                         continue;
2888                                 ch = decompValues [ch];
2889                                 if (ch < 0x1100 || 0x1200 < ch &&
2890                                         ch < 0xAC00 || 0xD800 < ch)
2891                                         continue;
2892
2893                                 // SPECIAL CASE ?
2894                                 int offset = i < 0x3260 ? 1 : 0;
2895                                 if (0x326E <= i && i <= 0x3273)
2896                                         offset = 1;
2897
2898                                 map [i] = new CharMapEntry (map [ch].Category,
2899                                         (byte) (map [ch].Level1 + offset),
2900                                         map [ch].Level2);
2901 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2902                         }
2903
2904
2905                         #endregion
2906
2907                         // Letterlike characters and CJK compatibility square
2908                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2909                         int [] counts = new int ['Z' - 'A' + 1];
2910                         char [] namedChars = new char [sortableCharNames.Count];
2911                         int nCharNames = 0;
2912                         foreach (DictionaryEntry de in sortableCharNames) {
2913                                 counts [((string) de.Value) [0] - 'A']++;
2914                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2915                         }
2916                         nCharNames = 0; // reset
2917                         for (int a = 0; a < counts.Length; a++) {
2918                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2919                                 for (int i = 0; i < counts [a]; i++)
2920 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2921                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2922                         }
2923
2924                         // CJK unified ideograph.
2925                         byte cjkCat = 0x9E;
2926                         fillIndex [cjkCat] = 0x2;
2927                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2928                                 if (!IsIgnorable (cp))
2929                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2930                         // CJK Extensions goes here.
2931                         // LAMESPEC: With this Windows style CJK layout, it is
2932                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2933                         // 0x9FBB can never be added w/o breaking compat.
2934                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2935                                 if (!IsIgnorable (cp))
2936                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2937
2938                         // PrivateUse ... computed.
2939                         // remaining Surrogate ... computed.
2940
2941                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2942                         // non-alphanumeric ASCII except for: + - < = > '
2943                         for (int i = 0x21; i < 0x7F; i++) {
2944                                 // SPECIAL CASE: 02C6 looks regarded as
2945                                 // equivalent to '^', which does not conform
2946                                 // to Unicode standard character database.
2947                                 if (i == 0x005B)
2948                                         AddCharMap ('\u2045', 0x7, 0, 0x1C);
2949                                 if (i == 0x005D)
2950                                         AddCharMap ('\u2046', 0x7, 0, 0x1C);
2951                                 if (i == 0x005E)
2952                                         AddCharMap ('\u02C6', 0x7, 0, 3);
2953                                 if (i == 0x0060)
2954                                         AddCharMap ('\u02CB', 0x7, 0, 3);
2955
2956                                 if (Char.IsLetterOrDigit ((char) i)
2957                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2958                                         continue; // they are not added here.
2959
2960                                 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2961                                 // Insert 3001 after ',' and 3002 after '.'
2962                                 if (i == 0x2C)
2963                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2964                                 else if (i == 0x2E)
2965                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2966                                 else if (i == 0x3A)
2967                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2968                         }
2969                         #endregion
2970
2971                         #region 07 - Punctuations and something else
2972                         for (int i = 0xA0; i < char.MaxValue; i++) {
2973                                 if (IsIgnorable (i))
2974                                         continue;
2975
2976                                 // FIXME: actually those reset should not be
2977                                 // done but here I put for easy goal.
2978                                 if (i == 0x05C3)
2979                                         fillIndex [0x7]++;
2980                                 if (i == 0x0700)
2981                                         fillIndex [0x7] = 0xE2;
2982                                 if (i == 0x2016)
2983                                         fillIndex [0x7] = 0x77;
2984                                 if (i == 0x3008)
2985                                         fillIndex [0x7] = 0x93;
2986
2987                                 if (0x02C8 <= i && i <= 0x02CD)
2988                                         continue; // nonspacing marks
2989
2990                                 // SPECIAL CASE: maybe they could be allocated
2991                                 // dummy NFKD mapping and no special processing
2992                                 // would be required here.
2993                                 if (i == 0x00AF)
2994                                         AddCharMap ('\u02C9', 0x7, 0, 3);
2995                                 if (i == 0x00B4)
2996                                         AddCharMap ('\u02CA', 0x7, 0, 3);
2997                                 if (i == 0x02C7)
2998                                         AddCharMap ('\u02D8', 0x7, 0, 3);
2999
3000                                 // SPECIAL CASES:
3001                                 switch (i) {
3002                                 case 0xAB: // 08
3003                                 case 0xB7: // 0A
3004                                 case 0xBB: // 08
3005                                 case 0x02B9: // 01
3006                                 case 0x02BA: // 01
3007                                 case 0x2329: // 09
3008                                 case 0x232A: // 09
3009                                         continue;
3010                                 }
3011
3012                                 switch (Char.GetUnicodeCategory ((char) i)) {
3013                                 case UnicodeCategory.OtherPunctuation:
3014                                 case UnicodeCategory.ClosePunctuation:
3015                                 case UnicodeCategory.OpenPunctuation:
3016                                 case UnicodeCategory.ConnectorPunctuation:
3017                                 case UnicodeCategory.InitialQuotePunctuation:
3018                                 case UnicodeCategory.FinalQuotePunctuation:
3019                                 case UnicodeCategory.ModifierSymbol:
3020                                         // SPECIAL CASES: // 0xA
3021                                         if (0x2020 <= i && i <= 0x2031)
3022                                                 continue;
3023                                         if (i == 0x3003) // added later
3024                                                 continue;
3025                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3026                                         break;
3027                                 default:
3028                                         if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3029                                                 goto case UnicodeCategory.OtherPunctuation;
3030                                         break;
3031                                 }
3032                         }
3033
3034                         // Control pictures
3035                         // FIXME: it should not need to reset level 1, but
3036                         // it's for easy goal.
3037                         fillIndex [0x7] = 0xB6;
3038                         for (int i = 0x2400; i <= 0x2424; i++)
3039                                 AddCharMap ((char) i, 0x7, 1, 0);
3040
3041                         // FIXME: what are they?
3042                         AddCharMap ('\u3003', 0x7, 1);
3043                         AddCharMap ('\u3006', 0x7, 1);
3044                         AddCharMap ('\u02D0', 0x7, 1);
3045                         AddCharMap ('\u10FB', 0x7, 1);
3046                         AddCharMap ('\u0950', 0x7, 1);
3047                         AddCharMap ('\u093D', 0x7, 1);
3048                         AddCharMap ('\u0964', 0x7, 1);
3049                         AddCharMap ('\u0965', 0x7, 1);
3050                         AddCharMap ('\u0970', 0x7, 1);
3051
3052                         #endregion
3053
3054                         #region category 08 - symbols
3055                         fillIndex [0x8] = 2;
3056                         // Here Windows mapping is not straightforward. It is
3057                         // not based on computation but seems manual sorting.
3058                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
3059                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
3060                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
3061                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
3062                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
3063                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
3064                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
3065                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
3066                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
3067                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
3068                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
3069                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
3070                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
3071
3072                         for (int cp = 0; cp < 0x2300; cp++) {
3073                                 if (cp == 0xAC) // SPECIAL CASE: skip
3074                                         continue;
3075                                 if (cp == 0x200) {
3076                                         cp = 0x2200; // skip to 2200
3077                                         fillIndex [0x8] = 0x21;
3078                                 }
3079                                 if (cp == 0x2295)
3080                                         fillIndex [0x8] = 0x3;
3081                                 if (cp == 0x22A2)
3082                                         fillIndex [0x8] = 0xAB;
3083                                 if (cp == 0x22B2)
3084                                         fillIndex [0x8] = 0xB9;
3085                                 if (!map [cp].Defined &&
3086 //                                      Char.GetUnicodeCategory ((char) cp) ==
3087 //                                      UnicodeCategory.MathSymbol)
3088                                         Char.IsSymbol ((char) cp))
3089                                         AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
3090                                 // SPECIAL CASES: no idea why Windows sorts as such
3091                                 switch (cp) {
3092                                 case 0x3E:
3093                                         AddCharMap ('\u227B', 0x8, 1, 0);
3094                                         AddCharMap ('\u22B1', 0x8, 1, 0);
3095                                         break;
3096                                 case 0xB1:
3097                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
3098                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
3099                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
3100                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
3101                                         break;
3102                                 case 0xF7:
3103                                         AddCharMap ('\u01C0', 0x8, 1, 0);
3104                                         AddCharMap ('\u01C1', 0x8, 1, 0);
3105                                         AddCharMap ('\u01C2', 0x8, 1, 0);
3106                                         break;
3107                                 }
3108                         }
3109                         #endregion
3110
3111                         #region Hack!
3112
3113                         // Characters w/ diacritical marks (NFKD)
3114                         for (int i = 0; i <= char.MaxValue; i++) {
3115                                 if (map [i].Defined || IsIgnorable (i))
3116                                         continue;
3117                                 if (decompIndex [i] == 0)
3118                                         continue;
3119
3120                                 int start = decompIndex [i];
3121                                 int primaryChar = decompValues [start];
3122                                 int secondary = diacritical [i];
3123                                 bool skip = false;
3124                                 int length = decompLength [i];
3125                                 // special processing for parenthesized ones.
3126                                 if (length == 3 &&
3127                                         decompValues [start] == '(' &&
3128                                         decompValues [start + 2] == ')') {
3129                                         primaryChar = decompValues [start + 1];
3130                                         length = 1;
3131                                 }
3132
3133                                 if (map [primaryChar].Level1 == 0)
3134                                         continue;
3135
3136                                 for (int l = 1; l < length; l++) {
3137                                         int c = decompValues [start + l];
3138                                         if (map [c].Level1 != 0)
3139                                                 skip = true;
3140                                         secondary += diacritical [c];
3141                                 }
3142                                 if (skip)
3143                                         continue;
3144                                 map [i] = new CharMapEntry (
3145                                         map [primaryChar].Category,
3146                                         map [primaryChar].Level1,
3147                                         (byte) secondary);
3148
3149                         }
3150
3151                         // Diacritical weight adjustment
3152
3153                         // Arabic Hamzah
3154                         diacritical [0x624] = 0x5;
3155                         diacritical [0x626] = 0x7;
3156                         diacritical [0x622] = 0x9;
3157                         diacritical [0x623] = 0xA;
3158                         diacritical [0x625] = 0xB;
3159                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
3160                         diacritical [0x64A] = 0x7; // Yaa'
3161
3162                         for (int i = 0; i < char.MaxValue; i++) {
3163                                 byte mod = 0;
3164                                 byte cat = map [i].Category;
3165                                 switch (cat) {
3166                                 case 0xE: // Latin diacritics
3167                                 case 0x22: // Japanese: circled characters
3168                                         mod = diacritical [i];
3169                                         break;
3170                                 case 0x13: // Arabic
3171                                         if (diacritical [i] == 0 && i >= 0xFE8D)
3172                                                 mod = 0x8; // default for arabic
3173                                         break;
3174                                 }
3175                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
3176                                         mod = diacritical [i];
3177                                 if (mod > 0)
3178                                         map [i] = new CharMapEntry (
3179                                                 cat, map [i].Level1, mod);
3180                         }
3181
3182                         // FIXME: this is halfly hack but those NonSpacingMark
3183                         // characters and still undefined are likely to
3184                         // be nonspacing.
3185                         for (int i = 0; i < char.MaxValue; i++) {
3186                                 if (map [i].Defined ||
3187                                         IsIgnorable (i))
3188                                         continue;
3189                                 switch (i) {
3190                                 // SPECIAL CASES.
3191                                 case 0x02B9:
3192                                 case 0x02BA:
3193                                         break;
3194                                 default:
3195                                         if (Char.GetUnicodeCategory ((char) i) !=
3196                                         UnicodeCategory.NonSpacingMark)
3197                                                 continue;
3198                                         break;
3199                                 }
3200                                 if (diacritical [i] != 0)
3201                                         map [i] = new CharMapEntry (1, 1, diacritical [i]);
3202                                 else
3203                                         AddCharMap ((char) i, 1, 1);
3204                         }
3205
3206                         #endregion
3207                 }
3208
3209                 private void IncrementSequentialIndex (ref byte hangulCat)
3210                 {
3211                         fillIndex [hangulCat]++;
3212                         if (fillIndex [hangulCat] == 0) { // overflown
3213                                 hangulCat++;
3214                                 fillIndex [hangulCat] = 0x2;
3215                         }
3216                 }
3217
3218                 // Reset fillIndex to fixed value and call AddLetterMap().
3219                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3220                 {
3221                         fillIndex [category] = alphaWeight;
3222                         AddLetterMap (c, category, 0);
3223
3224                         ArrayList al = latinMap [c] as ArrayList;
3225                         if (al == null)
3226                                 return;
3227
3228                         foreach (int cp in al)
3229                                 AddLetterMap ((char) cp, category, 0);
3230                 }
3231
// Maps `voices` consecutive Hiragana starting at code point i, together
// with the character 0x60 above each of them (Katakana). All of them go
// to category 0x22 without advancing the index. The first one gets
// level 2 weight 0, the following ones get (offset + 2).
private void AddKanaMap (int i, byte voices)
{
        const byte kanaCategory = 0x22;

        for (int offset = 0; offset < voices; offset++) {
                char hiragana = (char) (i + offset);
                char katakana = (char) (hiragana + 0x60);
                byte level2 = offset > 0 ? (byte) (offset + 2) : (byte) 0;

                AddLetterMapCore (hiragana, kanaCategory, 0, level2, false);
                AddLetterMapCore (katakana, kanaCategory, 0, level2, false);
        }
}
3243
// Shortcut for AddLetterMapCore() with no explicit level 2 weight and
// with the level 2 lookup deferred.
private void AddLetterMap (char c, byte category, byte updateCount)
{
        const byte noLevel2 = 0;
        const bool deferLevel2 = true;

        AddLetterMapCore (c, category, updateCount, noLevel2, deferLevel2);
}
3248
// Maps a letter and the characters derived from it, in this order:
//      1. the <small> form of c (if it differs from c); it is passed
//         updateCount, so it may advance the index.
//      2. the invariant-culture lowercase of c, recursively, when it
//         differs from c and is not mapped yet (no index update).
//      3. c itself, unless it is ignorable or already mapped; only in
//         that case the category index is advanced by updateCount.
private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
        // <small> updates index
        char smallForm = ToSmallForm (c);
        if (smallForm != c)
                AddCharMapGroup (smallForm, category, updateCount, level2, deferLevel2);

        char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
        if (lower != c && !map [(int) lower].Defined)
                AddLetterMapCore (lower, category, 0, level2, deferLevel2);

        // Nothing to add (and no index update) for ignorable or
        // already mapped characters.
        if (IsIgnorable ((int) c) || map [(int) c].Defined)
                return;

        AddCharMapGroup (c, category, 0, level2, deferLevel2);
        fillIndex [category] += updateCount;
}
3267
// Same as the four-argument AddCharMap() with alt = 0.
private bool AddCharMap (char c, byte category, byte increment)
{
        const byte noAlt = 0;

        return AddCharMap (c, category, increment, noAlt);
}
3272
// Creates the map entry for c unless c is ignorable or already mapped,
// then advances the category's running index by `increment`.
// Returns true only when an entry was actually created.
//
// For category 1 the running index is passed as the third CharMapEntry
// argument and alt as the second; for any other category it is the
// other way round (presumably level 1 vs. level 2 -- CharMapEntry is
// declared elsewhere).
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
        int cp = (int) c;

        if (IsIgnorable (cp))
                return false; // do nothing
        if (map [cp].Defined)
                return false; // do nothing

        if (category == 1)
                map [cp] = new CharMapEntry (category, alt, fillIndex [category]);
        else
                map [cp] = new CharMapEntry (category, fillIndex [category], alt);

        fillIndex [category] += increment;
        return true;
}
3283
3284                 //
3285                 // Adds characters to table in the order below
3286                 // (+ increases weight):
3287                 //      (<small> +)
3288                 //      itself
3289                 //      <fraction>
3290                 //      <full> | <super> | <sub>
3291                 //      <circle> | <wide> (| <narrow>)
3292                 //      +
3293                 //      (vertical +)
3294                 //
3295                 // level2 is fixed (does not increase).
// NFKD decomposition kinds that AddCharMapGroup() maps with the same
// primary weight as the base character. They are visited in this
// order, so do not reorder the entries.
int [] sameWeightItems = {
        DecompositionFraction,
        DecompositionFull, DecompositionSuper, DecompositionSub,
        DecompositionCircle, DecompositionWide, DecompositionNarrow,
};
// Same as the five-argument AddCharMapGroup() with the given level2
// used as is (no deferred lookup).
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
        const bool deferLevel2 = false;

        AddCharMapGroup (c, category, updateCount, level2, deferLevel2);
}
3309
// Maps c together with its NFKD variants recorded in nfkdMap:
//      <small> variant (advances the index by updateCount),
//      c itself and every variant listed in sameWeightItems (no advance),
//      then the index is advanced by updateCount,
//      and finally the <vertical> variant (advances again).
// Does nothing when c is already mapped.
//
// When deferLevel2 is set, the level2 argument is replaced by
// diacritical [c], and is looked up again for variants as coded below.
// Note that the local level2 value is carried over from one step to
// the next, so the order of the steps matters.
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
        int cp = (int) c;
        if (map [cp].Defined)
                return;

        if (deferLevel2)
                level2 = diacritical [cp];

        // '\0' stands for "no such variant".
        char smallForm = '\0';
        char verticalForm = '\0';
        Hashtable variants = (Hashtable) nfkdMap [cp];
        if (variants != null) {
                object boxedSmall = variants [(byte) DecompositionSmall];
                if (boxedSmall != null)
                        smallForm = (char) ((int) boxedSmall);
                object boxedVertical = variants [(byte) DecompositionVertical];
                if (boxedVertical != null)
                        verticalForm = (char) ((int) boxedVertical);
        }

        // <small> updates index
        if (smallForm != '\0') {
                if (deferLevel2 && level2 == 0)
                        level2 = diacritical [smallForm];
                AddCharMap (smallForm, category, updateCount, level2);
        }

        // itself
        AddCharMap (c, category, 0, level2);

        // variants that share the weight of c
        if (variants != null) {
                for (int k = 0; k < sameWeightItems.Length; k++) {
                        object boxedVariant = variants [(byte) sameWeightItems [k]];
                        if (boxedVariant == null)
                                continue;
                        if (deferLevel2)
                                level2 = diacritical [(int) boxedVariant];
                        AddCharMap ((char) ((int) boxedVariant), category, 0, level2);
                }
        }

        // update index here.
        fillIndex [category] += updateCount;

        if (verticalForm != '\0') {
                if (deferLevel2 && level2 == 0)
                        level2 = diacritical [verticalForm];
                AddCharMap (verticalForm, category, updateCount, level2);
        }
}
3360
// Maps c at the current slot of `category` and moves on to the next
// sequential slot (which may change `category` itself).
private void AddCharMapCJK (char c, ref byte category)
{
        AddCharMap (c, category, 0, 0);
        IncrementSequentialIndex (ref category);

        // Special. I wonder why but Windows skips 9E F9.
        bool atSkippedSlot = category == 0x9E && fillIndex [category] == 0xF9;
        if (atSkippedSlot)
                IncrementSequentialIndex (ref category);
}
3370
// Maps a CJK character followed by its NFKD variants (decomposition
// kinds 0 to 0x12 found in nfkdMap), each one at its own sequential slot.
private void AddCharMapGroupCJK (char c, ref byte category)
{
        AddCharMapCJK (c, ref category);

        // LAMESPEC: on Windows some of CJK characters in 3200-32B0 are
        // incorrectly mapped (Chinese and Japanese Kanji are mixed when
        // ordering them). Those are placed by hand right after the
        // ideograph below, and skipped in the generic loop.
        switch (c) {
        case '\u5B78':
                AddCharMapCJK ('\u32AB', ref category);
                AddCharMapCJK ('\u323B', ref category);
                break;
        case '\u52DE':
                AddCharMapCJK ('\u3298', ref category);
                AddCharMapCJK ('\u3238', ref category);
                break;
        case '\u5BEB':
                AddCharMapCJK ('\u32A2', ref category);
                break;
        case '\u91AB':
                // Especially this mapping order totally does
                // not make sense to me.
                AddCharMapCJK ('\u32A9', ref category);
                break;
        }

        Hashtable variants = (Hashtable) nfkdMap [(int) c];
        if (variants == null)
                return;

        for (int kind = 0; kind <= 0x12; kind++) {
                // Keys of the per-character table are boxed bytes.
                object boxed = variants [(byte) kind];
                if (boxed == null)
                        continue;
                int variant = (int) boxed;

                // Special: they are ignored in this area.
                // FIXME: check if it is sane
                bool inSkippedArea = 0xF900 <= variant && variant <= 0xFAD9;
                if (inSkippedArea)
                        continue;

                // Already placed by hand above.
                bool placedByHand =
                        variant == 0x3238 || variant == 0x323B ||
                        variant == 0x3298 || variant == 0x32A2 ||
                        variant == 0x32A9 || variant == 0x32AB;
                if (placedByHand)
                        continue;

                AddCharMapCJK ((char) variant, ref category);
        }
}
3417
3418                 // For now it is only for 0x7 category.
// Maps c and every still-unmapped character whose decomposition is the
// single character c. The decomposition tables are scanned three times:
//      1. <small> forms, at the current index; if any was found the
//         index is then advanced by updateCount.
//      2. <sub>, <super>, <wide> and <narrow> forms, at the same index
//         as c.
//      (c itself is mapped here and advances the index by updateCount.)
//      3. every other decomposition kind; each mapped character
//         advances the index by updateCount.
// Does nothing when c is already mapped.
private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
{
        int target = (int) c;
        if (map [target].Defined)
                return;

        int cp;

        // Process in advance (lower primary weight)
        bool smallFormFound = false;
        for (cp = 0; cp < char.MaxValue; cp++) {
                if (map [cp].Defined)
                        continue;
                if (decompLength [cp] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [cp]]) != target)
                        continue;
                if (decompType [cp] == DecompositionSmall) {
                        smallFormFound = true;
                        AddCharMap ((char) cp, category, 0, level2);
                }
        }
        if (smallFormFound)
                fillIndex [category] = (byte)
                        (fillIndex [category] + updateCount);

        // Identical weight
        for (cp = 0; cp < char.MaxValue; cp++) {
                if (map [cp].Defined)
                        continue;
                if (decompLength [cp] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [cp]]) != target)
                        continue;
                switch (decompType [cp]) {
                case DecompositionSub:
                case DecompositionSuper:
                case DecompositionWide:
                case DecompositionNarrow:
                        AddCharMap ((char) cp, category, 0, level2);
                        break;
                }
        }

        // itself
        AddCharMap (c, category, updateCount, level2);

        // Since nfkdMap is problematic to have two or more
        // NFKD to an identical character, here I iterate all.
        for (cp = 0; cp < char.MaxValue; cp++) {
                if (map [cp].Defined)
                        continue;
                if (decompLength [cp] != 1)
                        continue;
                if ((int) (decompValues [decompIndex [cp]]) != target)
                        continue;

                bool handledEarlier = false;
                switch (decompType [cp]) {
                case DecompositionWide:
                case DecompositionNarrow:
                case DecompositionSmall:
                case DecompositionSub:
                case DecompositionSuper:
                        handledEarlier = true;
                        break;
                }
                if (!handledEarlier)
                        AddCharMap ((char) cp, category, updateCount, level2);
        }
}
3483
// Maps c into category 6, followed by every character whose
// decomposition ends with c, all at the same index and with level 2
// weight 0. The index is advanced by one afterwards.
private void AddArabicCharMap (char c)
{
        const byte arabicCategory = 6;
        const byte step = 1;
        const byte noLevel2 = 0;

        // itself
        AddCharMap (c, arabicCategory, 0, noLevel2);

        // Since nfkdMap is problematic to have two or more
        // NFKD to an identical character, here I iterate all.
        for (int cp = 0; cp < char.MaxValue; cp++) {
                if (decompLength [cp] != 0) {
                        int last = decompIndex [cp] + decompLength [cp] - 1;
                        if ((int) (decompValues [last]) == (int) c)
                                AddCharMap ((char) cp, arabicCategory, 0, noLevel2);
                }
        }

        fillIndex [arabicCategory] += step;
}
3505
// Returns the first decomposed character of c when c has a <small>
// decomposition; otherwise returns c unchanged.
char ToSmallForm (char c)
{
        const bool takeLast = false;

        return ToDecomposed (c, DecompositionSmall, takeLast);
}
3510
// Returns one character of the decomposition of c when the
// decomposition kind of c is d: the last one if `tail` is set, the
// first one otherwise. Returns c itself for any other kind.
char ToDecomposed (char c, byte d, bool tail)
{
        int cp = (int) c;
        if (decompType [cp] != d)
                return c;

        int offset = tail ? decompLength [cp] - 1 : 0;
        return (char) decompValues [decompIndex [cp] + offset];
}
3520
// Tells whether any entry of jisJapanese has the code point cp.
bool ExistsJIS (int cp)
{
        bool found = false;
        foreach (JISCharacter entry in jisJapanese) {
                if (entry.CP == cp) {
                        found = true;
                        break;
                }
        }
        return found;
}
3528
3529                 #endregion
3530
3531                 #region Level 3 properties (Case/Width)
3532
// Level 3 weight as stored in the sort key: zero stays zero, any
// other raw weight is shifted up by 2.
private byte ComputeLevel3Weight (char c)
{
        byte raw = ComputeLevel3WeightRaw (c);
        if (raw == 0)
                return raw;
        return (byte) (raw + 2);
}
3538
// Computes the raw level 3 (case/width) weight of c; the caller
// (ComputeLevel3Weight) adds 2 to non-zero results for the sortkey value.
//
// The checks are ordered: the first matching rule wins. Characters
// that match none of the fixed rules get a bit combination built from
// a few individual exceptions, their <wide>/<sub>/<super>
// decomposition kind, isSmallCapital (adds 8) and isUppercase (adds 0x10).
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
        // CJK compat
        if ('\u3192' <= c && c <= '\u319F')
                return 0;

        // They have <narrow> NFKD mapping, and on Windows
        // those narrow characters are regarded as "normal",
        // thus those characters themselves are regarded as
        // "wide". grep "<narrow>" and you can pick them up
        // (ignoring Kana, Hangul etc.)
        switch (c) {
        case '\u3002':
        case '\u300C':
        case '\u300D':
        case '\u3001':
        case '\u30FB':
        case '\u2502':
        case '\u2190':
        case '\u2191':
        case '\u2192':
        case '\u2193':
        case '\u25A0':
        case '\u25CB':
                return 1;
        }
        // Korean
        if ('\u11A8' <= c && c <= '\u11F9')
                return 2;
        if ('\uFFA0' <= c && c <= '\uFFDC')
                return 4;
        if ('\u3130' <= c && c <= '\u3164')
                return 5;
        if ('\u3165' <= c && c <= '\u318E')
                return 4;
        // Georgian Capital letters
        if ('\u10A0' <= c && c <= '\u10C5')
                return 0x10;
        // numbers
        if ('\u2776' <= c && c <= '\u277F')
                return 4;
        if ('\u2780' <= c && c <= '\u2789')
                return 8;
        // U+2776..U+2789 never get here (handled by the two checks
        // above), so only the remaining part of the block is tested.
        if ('\u278A' <= c && c <= '\u2793')
                return 0xC;
        if ('\u2160' <= c && c <= '\u216F')
                return 0x10;
        if ('\u2181' <= c && c <= '\u2182')
                return 0x10;
        // Labelled "Arabic" originally; U+2135..U+2138 are the
        // letterlike ALEF/BET/GIMEL/DALET symbols.
        if ('\u2135' <= c && c <= '\u2138')
                return 4;
        // Arabic presentation forms: within these two ranges the weight
        // depends only on the position of the code point modulo 4.
        // (A switch is used so that nothing is allocated per call.)
        if ('\uFEB5' <= c && c < '\uFEED' ||
                '\uFEF1' <= c && c < '\uFEF5') {
                switch (c % 4) {
                case 0:
                        return 0x18;
                case 1:
                        return 0;
                case 2:
                        return 0x8;
                default:
                        return 0x10;
                }
        }
        if ('\uFE80' <= c && c < '\uFF00') {
                // 2(Isolated)/8(Final)/0x18(Medial)
                // Any other decomposition kind falls through to the
                // generic rules below.
                switch (decompType [(int) c]) {
                case DecompositionIsolated:
                        return 2;
                case DecompositionFinal:
                        return 8;
                case DecompositionMedial:
                        return 0x18;
                }
        }

        // actually I dunno the reason why they have weights.
        switch (c) {
        case '\u01BC':
                return 0x10;
        case '\u06A9':
                return 0x20;
        case '\u06AA':
                return 0x28;
        // Gurmukhi
        case '\u0A39':
        case '\u0A59':
        case '\u0A5A':
        case '\u0A5B':
        case '\u0A5E':
                return 0x10;
        }

        // Individual exceptions that only seed the bit combination.
        byte ret = 0;
        switch (c) {
        case '\u03C2':
        case '\u2104':
        case '\u212B':
                ret = 8;
                break;
        case '\uFE42':
                ret = 0xA;
                break;
        }

        // misc: the numeric value of the decomposition kind itself is
        // OR-ed into the weight.
        switch (decompType [(int) c]) {
        case DecompositionWide: // <wide>
        case DecompositionSub: // <sub>
        case DecompositionSuper: // <super>
                ret |= decompType [(int) c];
                break;
        }
        if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
                ret |= 8;
        if (isUppercase [(int) c]) // DerivedCoreProperties
                ret |= 0x10;

        return ret;
}
3651
3652                 #endregion
3653
3654                 #region IsIgnorable
3655 /*
3656                 static bool IsIgnorable (int i)
3657                 {
3658                         if (unicodeAge [i] >= 3.1)
3659                                 return true;
3660                         switch (char.GetUnicodeCategory ((char) i)) {
3661                         case UnicodeCategory.OtherNotAssigned:
3662                         case UnicodeCategory.Format:
3663                                 return true;
3664                         }
3665                         return false;
3666                 }
3667 */
3668
3669                 // FIXME: In the future use DerivedAge.txt to examine character
3670                 // versions and set those ones that have higher version than
3671                 // 1.0 as ignorable.
// Tells whether code point i is treated as ignorable (it gets no map
// entry). The decision is made in this order:
//      1. individual characters that are never ignorable, even though
//         they sit inside one of the ignorable ranges below;
//      2. individual characters that are always ignorable;
//      3. whole ranges that are ignorable;
//      4. otherwise only Format and OtherNotAssigned characters
//         (as reported by Char.GetUnicodeCategory) are ignorable.
static bool IsIgnorable (int i)
{
        // exceptional characters filtered by the
        // following conditions. Originally those exceptional
        // ranges are incorrect (they should not be ignored)
        // and most of those characters are unfortunately in
        // those ranges.
        switch (i) {
        case 0x4d8: case 0x4d9:
        case 0x4e8: case 0x4e9:
        case 0x70F:
        case 0x3036: case 0x303f:
        case 0x337b: case 0xfb1e:
                return false;
        }

        switch (i) {
        case 0:
        // I guess, those characters are added between
        // Unicode 1.0 (LCMapString) and Unicode 3.1
        // (UnicodeCategory), so they used to be
        // something like OtherNotAssigned as of Unicode 1.1.
        case 0x2df: case 0x387:
        case 0x3d7: case 0x3d8: case 0x3d9:
        case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
        case 0x400: case 0x40d: case 0x450: case 0x45d:
        case 0x587: case 0x58a: case 0x5c4: case 0x640:
        case 0x653: case 0x654: case 0x655: case 0x66d:
        case 0xb56:
        case 0x1e9b: case 0x202f: case 0x20ad:
        case 0x20ae: case 0x20af:
        case 0x20e2: case 0x20e3:
        case 0x2139: case 0x213a: case 0x2183:
        case 0x2425: case 0x2426: case 0x2619:
        case 0x2670: case 0x2671: case 0x3007:
        case 0x3190: case 0x3191:
        case 0xfffc: case 0xfffd:
                return true;
        }

        // Scripts and blocks that are ignored as a whole.
        bool inIgnoredBlock =
                // The whole Sinhala characters.
                (0x0D82 <= i && i <= 0x0DF4)
                // The whole Tibetan characters.
                || (0x0F00 <= i && i <= 0x0FD1)
                // The whole Myanmar characters.
                || (0x1000 <= i && i <= 0x1059)
                // The whole Ethiopic, Cherokee,
                // Canadian Syllabics, Ogham, Runic,
                // Tagalog, Hanunoo, Philippine,
                // Buhid, Tagbanwa, Khmer and Mongolian
                // characters.
                || (0x1200 <= i && i <= 0x1DFF)
                // Greek extension characters.
                || (0x1F00 <= i && i <= 0x1FFF)
                // The whole Braille characters.
                || (0x2800 <= i && i <= 0x28FF)
                // CJK radical characters.
                || (0x2E80 <= i && i <= 0x2EF3)
                // Kangxi radical characters.
                || (0x2F00 <= i && i <= 0x2FD5)
                // Ideographic description characters.
                || (0x2FF0 <= i && i <= 0x2FFB)
                // Bopomofo letter and final
                || (0x31A0 <= i && i <= 0x31B7)
                // White square with quadrant characters.
                || (0x25F0 <= i && i <= 0x25F7)
                // Ideographic telegraph symbols.
                || (0x32C0 <= i && i <= 0x32CB)
                || (0x3358 <= i && i <= 0x3370)
                || (0x33E0 <= i && i <= 0x33FF)
                // The whole YI characters.
                || (0xA000 <= i && i <= 0xA48C)
                || (0xA490 <= i && i <= 0xA4C6)
                // Armenian small ligatures
                || (0xFB13 <= i && i <= 0xFB17)
                // hebrew, arabic, variation selector.
                || (0xFB1D <= i && i <= 0xFE2F)
                // Arabic ligatures.
                || (0xFEF5 <= i && i <= 0xFEFC);
        if (inIgnoredBlock)
                return true;

        // FIXME: why are they excluded?
        bool inUnexplainedRange =
                (0x01F6 <= i && i <= 0x01F9)
                || (0x0218 <= i && i <= 0x0233)
                || (0x02A9 <= i && i <= 0x02AD)
                || (0x02EA <= i && i <= 0x02EE)
                || (0x0349 <= i && i <= 0x036F)
                || (0x0488 <= i && i <= 0x048F)
                || (0x04D0 <= i && i <= 0x04FF)
                || (0x0500 <= i && i <= 0x050F) // actually it matters only for 2.0
                || (0x06D6 <= i && i <= 0x06ED)
                || (0x06FA <= i && i <= 0x06FE)
                || (0x2048 <= i && i <= 0x204D)
                || (0x20e4 <= i && i <= 0x20ea)
                || (0x213C <= i && i <= 0x214B)
                || (0x21EB <= i && i <= 0x21FF)
                || (0x22F2 <= i && i <= 0x22FF)
                || (0x237B <= i && i <= 0x239A)
                || (0x239B <= i && i <= 0x23CF)
                || (0x24EB <= i && i <= 0x24FF)
                || (0x2596 <= i && i <= 0x259F)
                || (0x25F8 <= i && i <= 0x25FF)
                || (0x2672 <= i && i <= 0x2689)
                || (0x2768 <= i && i <= 0x2775)
                || (0x27d0 <= i && i <= 0x27ff)
                || (0x2900 <= i && i <= 0x2aff)
                || (0x3033 <= i && i <= 0x303F)
                || (0x31F0 <= i && i <= 0x31FF)
                || (0x3250 <= i && i <= 0x325F)
                || (0x32B1 <= i && i <= 0x32BF)
                || (0x3371 <= i && i <= 0x337B)
                || (0xFA30 <= i && i <= 0xFA6A);
        if (inUnexplainedRange)
                return true;

        // ignored by nature; everything else (including PrivateUse
        // and Surrogate) is not ignorable.
        switch (Char.GetUnicodeCategory ((char) i)) {
        case UnicodeCategory.Format:
        case UnicodeCategory.OtherNotAssigned:
                return true;
        }
        return false;
}
3796
3797                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3798
3799                 /*
3800                 public static void Main ()
3801                 {
3802                         for (int i = 0; i <= char.MaxValue; i++)
3803                                 Dump (i, IsIgnorable (i));
3804                 }
3805
3806                 static void Dump (int i, bool ignore)
3807                 {
3808                         switch (Char.GetUnicodeCategory ((char) i)) {
3809                         case UnicodeCategory.PrivateUse:
3810                         case UnicodeCategory.Surrogate:
3811                                 return; // check nothing
3812                         }
3813
3814                         string s1 = "";
3815                         string s2 = new string ((char) i, 10);
3816                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3817                         if ((ret == 0) == ignore)
3818                                 return;
3819                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3820                 }
3821                 */
3822                 #endregion // IsIgnorable
3823
3824                 #region IsIgnorableSymbol
// Tells whether code point i counts as an ignorable symbol.
//
// Anything IsIgnorable() already accepts is ignorable here as well.
// After that the rules are applied in this order, first match wins:
//   1. explicit per-character decisions (the switch on i),
//   2. a few code point ranges that are always ignorable,
//   3. rules based on the UnicodeCategory / Char.IsXxx() classification.
//
// i is narrowed to char before it is classified.
static bool IsIgnorableSymbol (int i)
{
        if (IsIgnorable (i))
                return true;

        // 1. Explicit per-character decisions.
        switch (i) {
        // *Letter
        case 0x00B5: case 0x01C0: case 0x01C1: case 0x01C2:
        case 0x01C3: case 0x01F6: case 0x01F7: case 0x01F8:
        case 0x01F9: case 0x02D0: case 0x02EE: case 0x037A:
        case 0x03D7: case 0x03F3: case 0x0400: case 0x040D:
        case 0x0450: case 0x045D: case 0x048C: case 0x048D:
        case 0x048E: case 0x048F: case 0x0587: case 0x0640:
        case 0x06E5: case 0x06E6: case 0x06FA: case 0x06FB:
        case 0x06FC: case 0x093D: case 0x0950: case 0x1E9B:
        case 0x2139: case 0x3006: case 0x3033: case 0x3034:
        case 0x3035: case 0xFE7E: case 0xFE7F:
        // OtherNumber
        case 0x16EE: case 0x16EF: case 0x16F0:
        // LetterNumber
        case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
        case 0x3007: // IDEOGRAPHIC NUMBER ZERO
        case 0x3038: // HANGZHOU NUMERAL TEN
        case 0x3039: // HANGZHOU NUMERAL TWENTY
        case 0x303A: // HANGZHOU NUMERAL THIRTY
        // OtherSymbol
        case 0x2117:
        case 0x327F:
                return true;

        // ModifierSymbol
        case 0x02B9: case 0x02BA: case 0x02C2: case 0x02C3:
        case 0x02C4: case 0x02C5: case 0x02C8: case 0x02CC:
        case 0x02CD: case 0x02CE: case 0x02CF: case 0x02D2:
        case 0x02D3: case 0x02D4: case 0x02D5: case 0x02D6:
        case 0x02D7: case 0x02DE: case 0x02E5: case 0x02E6:
        case 0x02E7: case 0x02E8: case 0x02E9: case 0x309B:
        case 0x309C:
        // OtherPunctuation
        case 0x055A: // Armenian Apostrophe
        case 0x05C0: // Hebrew Punct
        case 0x0E4F: // Thai FONGMAN
        case 0x0E5A: // Thai ANGKHANKHU
        case 0x0E5B: // Thai KHOMUT
        // CurrencySymbol
        case 0x09F2: // Bengali Rupee Mark
        case 0x09F3: // Bengali Rupee Sign
        // MathSymbol
        case 0x221E: // INF.
        // OtherSymbol
        case 0x0482:
        case 0x09FA:
        case 0x0B70:
                return false;
        }

        // 2. Ranges that are ignorable as a whole (*Letter).
        bool letterRange = i >= 0xFE70 && i < 0xFE7C; // ARABIC LIGATURES B
#if NET_2_0
        letterRange = letterRange
                || (i >= 0x0501 && i <= 0x0510) // CYRILLIC KOMI
                || (i >= 0xFA30 && i < 0xFA70); // CJK COMPAT
#endif
        if (letterRange)
                return true;

        // 3. Category based rules.
        char c = (char) i;
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.Surrogate:
                return false; // inconsistent

        case UnicodeCategory.SpacingCombiningMark:
        case UnicodeCategory.EnclosingMark:
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.PrivateUse:
                // Of these, only the Arabic NonSpacingMark run is ignorable.
                return i >= 0x064B && i <= 0x0652;

        case UnicodeCategory.Format:
        case UnicodeCategory.OtherNotAssigned:
                return true;
        }

        // Every other category ends up here.

        // OtherSymbols blocks that are never ignored.
        bool keptSymbolRange =
                // latin in a circle
                (i >= 0x249A && i <= 0x24E9)
                || (i >= 0x2100 && i <= 0x2132)
                // Japanese
                || (i >= 0x3196 && i <= 0x31A0)
                // Korean
                || (i >= 0x3200 && i <= 0x321C)
                // Chinese/Japanese
                || (i >= 0x322A && i <= 0x3243)
                // CJK
                || (i >= 0x3260 && i <= 0x32B0)
                || (i >= 0x32D0 && i <= 0x3357)
                || (i >= 0x337B && i <= 0x33DD);

        // Letters and digits are never ignorable symbols, whether or not
        // they sit in one of the blocks above.
        // The "Digit" part of this rule is a mystery; it filters some
        // symbols out.
        if (Char.IsLetterOrDigit (c) || keptSymbolRange)
                return false;
        if (Char.IsNumber (c))
                return false;

        // FIXME: should check more
        return Char.IsControl (c)
                || Char.IsSeparator (c)
                || Char.IsPunctuation (c)
                || Char.IsSymbol (c);
}
3953
3954                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3955 /*
3956                 public static void Main ()
3957                 {
3958                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3959                         for (int i = 0; i <= char.MaxValue; i++) {
3960                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3961                                 if (uc == UnicodeCategory.Surrogate)
3962                                         continue;
3963
3964                                 bool ret = IsIgnorableSymbol (i);
3965
3966                                 string s1 = "TEST ";
3967                                 string s2 = "TEST " + (char) i;
3968
3969                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3970
3971                                 if (ret != (result == 0))
3972                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3973                                                 ret ? "should not ignore" :
3974                                                         "should ignore",
3975                                                 i,(char) i, uc);
3976                         }
3977                 }
3978 */
3979                 #endregion
3980
3981                 #region NonSpacing
// Tells whether code point i counts as an ignorable nonspacing character.
//
// Anything IsIgnorable() accepts is ignorable here too. Otherwise explicit
// per-character decisions are consulted first, then the always-ignorable
// ranges, then the never-ignorable ranges; whatever is left is ignorable
// exactly when its UnicodeCategory is NonSpacingMark.
static bool IsIgnorableNonSpacing (int i)
{
        if (IsIgnorable (i))
                return true;

        // Explicit per-character decisions override the range rules below
        // (e.g. 0x02D0 is not ignorable although 0x02CC-0x02D7 is).
        switch (i) {
        case 0x02C8: case 0x02DE: case 0x0559:
        case 0x055A: case 0x05C0: case 0x0ABD:
        case 0x0CD5: case 0x0CD6: case 0x309B:
        case 0x309C: case 0xFF9E: case 0xFF9F:
                return true;

        case 0x02D0: case 0x0670: case 0x0901:
        case 0x0902: case 0x094D: case 0x0962:
        case 0x0963: case 0x0A41: case 0x0A42:
        case 0x0A47: case 0x0A48: case 0x0A4B:
        case 0x0A4C: case 0x0A81: case 0x0A82:
        case 0x0B82: case 0x0BC0: case 0x0CBF:
        case 0x0CC6: case 0x0CCC: case 0x0CCD:
        case 0x0E4E:
                return false;
        }

        bool alwaysIgnored =
                (i >= 0x02B9 && i <= 0x02C5)
                || (i >= 0x02CC && i <= 0x02D7)
                || (i >= 0x02E4 && i <= 0x02EF)
                || (i >= 0x20DD && i <= 0x20E0);
        if (alwaysIgnored)
                return true;

        bool neverIgnored =
                (i >= 0x064B && i <= 0x0652)
                || (i >= 0x0941 && i <= 0x0948)
                || (i >= 0x0AC1 && i <= 0x0ACD)
                || (i >= 0x0C3E && i <= 0x0C4F)
                || (i >= 0x0E31 && i <= 0x0E3F);
        if (neverIgnored)
                return false;

        UnicodeCategory category = Char.GetUnicodeCategory ((char) i);
        return category == UnicodeCategory.NonSpacingMark;
}
4019
4020                 // We can reuse IsIgnorableSymbol testcode
4021                 // for IsIgnorableNonSpacing.
4022                 #endregion
4023         }
4024
// One character map entry: a category byte plus level 1 and level 2 bytes.
// Defined is false for a default (zero-initialized) entry and true for any
// entry built through the constructor.
struct CharMapEntry
{
        public byte Category;
        public byte Level1;
        public byte Level2; // It is always single byte.
        public bool Defined;

        // Creates a defined entry from the three given bytes.
        public CharMapEntry (byte category, byte level1, byte level2)
        {
                this.Defined = true;
                this.Category = category;
                this.Level1 = level1;
                this.Level2 = level2;
        }
}
4040
// Immutable pair of two int values: CP and JIS.
// NOTE(review): presumably CP is the Unicode code point and JIS the
// corresponding JIS code - confirm against the code that builds these.
class JISCharacter
{
        public readonly int CP;
        public readonly int JIS;

        // Stores cp in CP and cpJIS in JIS.
        public JISCharacter (int cp, int cpJIS)
        {
                this.JIS = cpJIS;
                this.CP = cp;
        }
}
4052
// IComparer that orders JISCharacter instances by ascending JIS value.
class JISComparer : IComparer
{
        public static readonly JISComparer Instance =
                new JISComparer ();

        // Both arguments must be JISCharacter instances.
        // Returns a negative number, zero or a positive number when o1's
        // JIS value is less than, equal to or greater than o2's.
        public int Compare (object o1, object o2)
        {
                JISCharacter j1 = (JISCharacter) o1;
                JISCharacter j2 = (JISCharacter) o2;
                // CompareTo rather than "j1.JIS - j2.JIS": the subtraction
                // wraps around on overflow and then reports the wrong sign.
                return j1.JIS.CompareTo (j2.JIS);
        }
}
4065
// Immutable pair of an int (CP) and a string (Name).
// NOTE(review): presumably CP is the code point and Name its character
// name - confirm against the code that builds these.
class NonJISCharacter
{
        public readonly int CP;
        public readonly string Name;

        // Stores cp in CP and name in Name.
        public NonJISCharacter (int cp, string name)
        {
                this.Name = name;
                this.CP = cp;
        }
}
4077
// IComparer that orders NonJISCharacter instances by their Name, using an
// ordinal (culture independent) string comparison.
class NonJISComparer : IComparer
{
        public static readonly NonJISComparer Instance =
                new NonJISComparer ();

        // Both arguments must be NonJISCharacter instances.
        // Returns the result of String.CompareOrdinal() on the two names.
        public int Compare (object o1, object o2)
        {
                NonJISCharacter left = (NonJISCharacter) o1;
                NonJISCharacter right = (NonJISCharacter) o2;
                string leftName = left.Name;
                string rightName = right.Name;
                return String.CompareOrdinal (leftName, rightName);
        }
}
4090
// IComparer for boxed DictionaryEntry items whose Value is a decimal and
// whose Key is an int. Entries are ordered by Value first; entries with
// equal values are ordered by Key, so the result is 0 only when both the
// value and the key match.
class DecimalDictionaryValueComparer : IComparer
{
        public static readonly DecimalDictionaryValueComparer Instance
                = new DecimalDictionaryValueComparer ();

        // Singleton: use Instance.
        private DecimalDictionaryValueComparer ()
        {
        }

        // Returns a negative number, zero or a positive number when o1
        // sorts before, together with or after o2. Only the sign of the
        // result is meaningful.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // FIXME: in case of 0, compare decomposition categories
                int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
                if (ret != 0)
                        return ret;
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                // CompareTo rather than "i1 - i2": the subtraction wraps
                // around on overflow and then reports the wrong sign.
                return i1.CompareTo (i2);
        }
}
4113
// IComparer for boxed DictionaryEntry items whose Value is a string and
// whose Key is an int. Entries are ordered by Value first; entries with
// equal values are ordered by Key.
class StringDictionaryValueComparer : IComparer
{
        public static readonly StringDictionaryValueComparer Instance
                = new StringDictionaryValueComparer ();

        // Singleton: use Instance.
        private StringDictionaryValueComparer ()
        {
        }

        // Returns a negative number, zero or a positive number when o1
        // sorts before, together with or after o2. Only the sign of the
        // result is meaningful.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // NOTE(review): String.Compare (string, string) is culture
                // sensitive, so the order may depend on the culture this
                // tool runs under - confirm that this is intended.
                int ret = String.Compare ((string) e1.Value, (string) e2.Value);
                if (ret != 0)
                        return ret;
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                // CompareTo rather than "i1 - i2": the subtraction wraps
                // around on overflow and then reports the wrong sign.
                return i1.CompareTo (i2);
        }
}
4135
// IComparer for boxed char values. Two characters are compared through the
// sort keys CollationElementTable reports for them: key by key, and within
// a key by Primary, Secondary, Thirtiary and Quarternary in that order.
// If all keys they have in common are equal, the character with fewer sort
// keys sorts first.
class UCAComparer : IComparer
{
        public static readonly UCAComparer Instance
                = new UCAComparer ();

        // Singleton: use Instance.
        private UCAComparer ()
        {
        }

        // Both arguments must be boxed chars.
        public int Compare (object o1, object o2)
        {
                char left = (char) o1;
                char right = (char) o2;

                int leftCount = CollationElementTable.GetSortKeyCount (left);
                int rightCount = CollationElementTable.GetSortKeyCount (right);
                int shared = leftCount < rightCount ? leftCount : rightCount;

                for (int n = 0; n < shared; n++) {
                        SortKeyValue a = CollationElementTable.GetSortKey (left, n);
                        SortKeyValue b = CollationElementTable.GetSortKey (right, n);

                        // The first level that differs decides.
                        int diff = a.Primary - b.Primary;
                        if (diff == 0)
                                diff = a.Secondary - b.Secondary;
                        if (diff == 0)
                                diff = a.Thirtiary - b.Thirtiary;
                        if (diff == 0)
                                diff = a.Quarternary - b.Quarternary;
                        if (diff != 0)
                                return diff;
                }

                return leftCount - rightCount;
        }
}
4173
// Holds the tailoring data registered for one LCID: an optional alias, the
// FrenchSort flag and an ordered list of maps (diacritical, sort key and
// replacement maps). ItemToCharArray() flattens the maps, in the order in
// which they were added, into a single char array.
class Tailoring
{
        int lcid;
        int alias;
        bool frenchSort;
        // ITailoringMap instances, in insertion order.
        ArrayList maps = new ArrayList ();

        // Creates a tailoring for lcid with Alias 0.
        public Tailoring (int lcid)
        {
                this.lcid = lcid;
                this.alias = 0;
        }

        public Tailoring (int lcid, int alias)
        {
                this.alias = alias;
                this.lcid = lcid;
        }

        public int LCID {
                get { return this.lcid; }
        }

        public int Alias {
                get { return this.alias; }
        }

        public bool FrenchSort {
                get { return this.frenchSort; }
                set { this.frenchSort = value; }
        }

        // Appends a DiacriticalMap (kind 2) for the given byte pair.
        public void AddDiacriticalMap (byte target, byte replace)
        {
                ITailoringMap entry = new DiacriticalMap (target, replace);
                maps.Add (entry);
        }

        // Appends a SortKeyMap (kind 1). Only the first four bytes of
        // sortkey are emitted by ItemToCharArray().
        public void AddSortKeyMap (string source, byte [] sortkey)
        {
                ITailoringMap entry = new SortKeyMap (source, sortkey);
                maps.Add (entry);
        }

        // Appends a ReplacementMap (kind 3).
        public void AddReplacementMap (string source, string replace)
        {
                ITailoringMap entry = new ReplacementMap (source, replace);
                maps.Add (entry);
        }

        // Returns the concatenation of ToCharArray() of every map, in the
        // order in which the maps were added. Empty when nothing was added.
        public char [] ItemToCharArray ()
        {
                char [][] pieces = new char [maps.Count][];
                int total = 0;
                for (int n = 0; n < pieces.Length; n++) {
                        pieces [n] = ((ITailoringMap) maps [n]).ToCharArray ();
                        total += pieces [n].Length;
                }

                char [] flat = new char [total];
                int offset = 0;
                foreach (char [] piece in pieces) {
                        piece.CopyTo (flat, offset);
                        offset += piece.Length;
                }
                return flat;
        }

        // A map that can serialize itself into the flattened char format.
        interface ITailoringMap
        {
                char [] ToCharArray ();
        }

        // Serialized as three chars: 2, Target, Replace.
        class DiacriticalMap : ITailoringMap
        {
                public readonly byte Target;
                public readonly byte Replace;

                public DiacriticalMap (byte target, byte replace)
                {
                        this.Target = target;
                        this.Replace = replace;
                }

                public char [] ToCharArray ()
                {
                        return new char [] {
                                (char) 2, // kind:DiacriticalMap
                                (char) Target,
                                (char) Replace
                        };
                }
        }

        // Serialized as: 1, the chars of Source, a 0 char, the first four
        // bytes of SortKey (one char each) and a trailing 0 char.
        class SortKeyMap : ITailoringMap
        {
                public readonly string Source;
                public readonly byte [] SortKey;

                public SortKeyMap (string source, byte [] sortkey)
                {
                        this.Source = source;
                        this.SortKey = sortkey;
                }

                public char [] ToCharArray ()
                {
                        int sourceLength = Source.Length;
                        char [] ret = new char [sourceLength + 7];
                        ret [0] = (char) 1; // kind:SortKeyMap
                        Source.CopyTo (0, ret, 1, sourceLength);
                        // ret [sourceLength + 1] is left as 0: it null
                        // terminates the source. The last slot stays 0 too.
                        int keyStart = sourceLength + 2;
                        for (int k = 0; k < 4; k++)
                                ret [keyStart + k] = (char) SortKey [k];
                        return ret;
                }
        }

        // Serialized as: 3, the chars of Source, a 0 char, the chars of
        // Replace and a trailing 0 char.
        class ReplacementMap : ITailoringMap
        {
                public readonly string Source;
                public readonly string Replace;

                public ReplacementMap (string source, string replace)
                {
                        this.Source = source;
                        this.Replace = replace;
                }

                public char [] ToCharArray ()
                {
                        char [] ret = new char [Source.Length + Replace.Length + 3];
                        ret [0] = (char) 3; // kind:ReplaceMap
                        Source.CopyTo (0, ret, 1, Source.Length);
                        // One slot is skipped after each string; both
                        // skipped slots stay 0 and act as null terminators.
                        Replace.CopyTo (0, ret, Source.Length + 2, Replace.Length);
                        return ret;
                }
        }
}
4305 }