mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
    3 // There are two kinds of sort keys: those that are computed and those that
    4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
    9 // Also, a separate index table should be prepared for composite characters.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27 #define Binary
  28
  29 using System;
  30 using System.IO;
  31 using System.Collections;
  32 using System.Globalization;
  33 using System.Text;
  34 using System.Xml;
  35
  36 namespace Mono.Globalization.Unicode
  37 {
  38         internal class MSCompatSortKeyTableGenerator
  39         {
  40                 public static void Main (string [] args)
  41                 {
  42                         new MSCompatSortKeyTableGenerator ().Run (args);
  43                 }
  44
  45                 const int DecompositionWide = 1; // fixed
  46                 const int DecompositionSub = 2; // fixed
  47                 const int DecompositionSmall = 3;
  48                 const int DecompositionIsolated = 4;
  49                 const int DecompositionInitial = 5;
  50                 const int DecompositionFinal = 6;
  51                 const int DecompositionMedial = 7;
  52                 const int DecompositionNoBreak = 8;
  53                 const int DecompositionVertical = 9;
  54                 const int DecompositionFraction = 0xA;
  55                 const int DecompositionFont = 0xB;
  56                 const int DecompositionSuper = 0xC; // fixed
  57                 const int DecompositionFull = 0xE;
  58                 const int DecompositionNarrow = 0xD;
  59                 const int DecompositionCircle = 0xF;
  60                 const int DecompositionSquare = 0x10;
  61                 const int DecompositionCompat = 0x11;
  62                 const int DecompositionCanonical = 0x12;
  63
  64                 TextWriter Result = Console.Out;
  65
  66                 byte [] fillIndex = new byte [256]; // by category
  67                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  68
  69                 char [] specialIgnore = new char [] {
  70                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  71                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  72                         };
  73
  74                 // FIXME: need more love (as always)
  75                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  76                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  77                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  78                         '\u0292', '\u01BE', '\u0298'};
  79                 byte [] alphaWeights = new byte [] {
  80                         2, 9, 0xA, 0x1A, 0x21,
  81                         0x23, 0x25, 0x2C, 0x32, 0x35,
  82                         0x36, 0x48, 0x51, 0x70, 0x7C,
  83                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  84                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  85                         0xA9, 0xAA, 0xB3, 0xB4};
  86
  87                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  88                 bool [] isUppercase = new bool [char.MaxValue + 1];
  89
  90                 byte [] decompType = new byte [char.MaxValue + 1];
  91                 int [] decompIndex = new int [char.MaxValue + 1];
  92                 int [] decompLength = new int [char.MaxValue + 1];
  93                 int [] decompValues;
  94                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  95
  96                 byte [] diacritical = new byte [char.MaxValue + 1];
  97
  98                 string [] diacritics = new string [] {
  99                         // LATIN, CYRILLIC etc.
 100                         "UPTURN", "DOUBLE-STRUCK",
 101                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
 102                         "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
 103                         "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
 104                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 105                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
 106                         "WITH OGONEK;", "WITH CEDILLA;",
 107                         //
 108                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 109                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
 110                         "STROKE OVERLAY",
 111                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 112                         " DIAERESIS AND GRAVE;",
 113                         " BREVE AND ACUTE;",
 114                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 115                         " MACRON AND ACUTE;",
 116                         " MACRON AND GRAVE;",
 117                         //
 118                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 119                         " RING ABOVE AND ACUTE",
 120                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 121                         " CIRCUMFLEX AND TILDE",
 122                         " TILDE AND DIAERESIS",
 123                         " STROKE AND ACUTE",
 124                         " BREVE AND TILDE",
 125                         " CEDILLA AND BREVE",
 126                         " OGONEK AND MACRON",
 127                         //
 128                         "WITH OVERLINE",
 129                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 130                         " DOUBLE GRAVE",
 131                         " INVERTED BREVE",
 132                         "ROMAN NUMERAL",
 133                         " PRECEDED BY APOSTROPHE",
 134                         "WITH HORN;",
 135                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 136                         " PALATAL HOOK",
 137                         " DOT BELOW;",
 138                         " RETROFLEX;", "DIAERESIS BELOW",
 139                         " RING BELOW",
 140                         //
 141                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 142                         " BREVE BELOW;", " HORN AND GRAVE",
 143                         " TILDE BELOW",
 144                         " TOPBAR",
 145                         " DOT BELOW AND DOT ABOVE",
 146                         " RIGHT HALF RING", " HORN AND TILDE",
 147                         " CIRCUMFLEX AND DOT BELOW",
 148                         " BREVE AND DOT BELOW",
 149                         " DOT BELOW AND MACRON",
 150                         " TONE TWO",
 151                         " HORN AND HOOK ABOVE",
 152                         " HORN AND DOT",
 153                         // CIRCLED, PARENTHESIZED and so on
 154                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
 155                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
 156                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 157                         };
 158                 byte [] diacriticWeights = new byte [] {
 159                         // LATIN.
 160                         3, 3, 5, 5,
 161                         0xF, 0xE, 0x12,
 162                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 163                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
 164                         //
 165                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 166                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 167                         //
 168                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 169                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 170                         //
 171                         0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
 172                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 173                         //
 174                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
 175                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 176                         0x87, 0x95, 0xAA,
 177                         // CIRCLED, PARENTHESIZED and so on.
 178                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
 179                         0xF3, 0xF3, 0xF3
 180                         };
 181
 182                 int [] numberSecondaryWeightBounds = new int [] {
 183                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 184                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 185                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 186                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 187                         0xE50, 0xE60, 0xED0, 0xEE0
 188                         };
 189
 190                 char [] orderedGurmukhi;
 191                 char [] orderedGujarati;
 192                 char [] orderedGeorgian;
 193                 char [] orderedThaana;
 194
 195                 static readonly char [] orderedTamilConsonants = new char [] {
 196                         // based on traditional Tamil consonants, except for
 197                         // Grantha (where Microsoft breaks traditionalism).
 198                         // http://www.angelfire.com/empire/thamizh/padanGaL
 199                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
 200                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
 201                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
 202                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
 203                         '\u0BB7', '\u0BB9'};
 204
 205                 // cp -> character name (only for some characters)
 206                 ArrayList sortableCharNames = new ArrayList ();
 207
 208                 // cp -> arrow value (int)
 209                 ArrayList arrowValues = new ArrayList ();
 210
 211                 // cp -> box value (int)
 212                 ArrayList boxValues = new ArrayList ();
 213
 214                 // cp -> level1 value
 215                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 216
 217                 // letterName -> cp
 218                 Hashtable arabicNameMap = new Hashtable ();
 219
 220                 // cp -> Hashtable [decompType] -> cp
 221                 Hashtable nfkdMap = new Hashtable ();
 222
 223                 // Latin letter -> ArrayList [int]
 224                 Hashtable latinMap = new Hashtable ();
 225
 226                 ArrayList jisJapanese = new ArrayList ();
 227                 ArrayList nonJisJapanese = new ArrayList ();
 228
 229                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 230                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 231                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 232                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 233                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 234
 235                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 236
 237                 static double [] unicodeAge = new double [char.MaxValue + 1];
 238
 239                 ArrayList tailorings = new ArrayList ();
 240
 241                 void Run (string [] args)
 242                 {
 243                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 244                         ParseSources (dirname);
 245                         Console.Error.WriteLine ("parse done.");
 246
 247                         ModifyParsedValues ();
 248                         GenerateCore ();
 249                         Console.Error.WriteLine ("generation done.");
 250                         Serialize ();
 251                         Console.Error.WriteLine ("serialization done.");
 252 /*
 253 StreamWriter sw = new StreamWriter ("agelog.txt");
 254 for (int i = 0; i < char.MaxValue; i++) {
 255 bool shouldBe = false;
 256 switch (Char.GetUnicodeCategory ((char) i)) {
 257 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 258         shouldBe = true; break;
 259 }
 260 if (unicodeAge [i] >= 3.1)
 261         shouldBe = true;
 262 //if (IsIgnorable (i) != shouldBe)
 263 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 264 }
 265 sw.Close ();
 266 */
 267                 }
 268
 269                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 270                 {
 271                         return (byte []) CodePointIndexer.CompressArray  (
 272                                 source, typeof (byte), i);
 273                 }
 274
 275                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 276                 {
 277                         return (ushort []) CodePointIndexer.CompressArray  (
 278                                 source, typeof (ushort), i);
 279                 }
 280
 281                 void Serialize ()
 282                 {
 283                         // Tailorings
 284                         SerializeTailorings ();
 285
 286                         byte [] categories = new byte [map.Length];
 287                         byte [] level1 = new byte [map.Length];
 288                         byte [] level2 = new byte [map.Length];
 289                         byte [] level3 = new byte [map.Length];
 290                         ushort [] widthCompat = new ushort [map.Length];
 291                         for (int i = 0; i < map.Length; i++) {
 292                                 categories [i] = map [i].Category;
 293                                 level1 [i] = map [i].Level1;
 294                                 level2 [i] = map [i].Level2;
 295                                 level3 [i] = ComputeLevel3Weight ((char) i);
 296                                 // For Japanese Half-width characters, don't
 297                                 // map widthCompat. It is IgnoreKanaType that
 298                                 // handles those width differences.
 299                                 if (0xFF6D <= i && i <= 0xFF9D)
 300                                         continue;
 301                                 switch (decompType [i]) {
 302                                 case DecompositionNarrow:
 303                                 case DecompositionWide:
 304                                 case DecompositionSuper:
 305                                 case DecompositionSub:
 306                                         // they are always 1 char
 307                                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
 308                                         break;
 309                                 }
 310                         }
 311
 312                         // compress
 313                         ignorableFlags = CompressArray (ignorableFlags,
 314                                 MSCompatUnicodeTableUtil.Ignorable);
 315                         categories = CompressArray (categories,
 316                                 MSCompatUnicodeTableUtil.Category);
 317                         level1 = CompressArray (level1,
 318                                 MSCompatUnicodeTableUtil.Level1);
 319                         level2 = CompressArray (level2,
 320                                 MSCompatUnicodeTableUtil.Level2);
 321                         level3 = CompressArray (level3,
 322                                 MSCompatUnicodeTableUtil.Level3);
 323                         widthCompat = (ushort []) CodePointIndexer.CompressArray (
 324                                 widthCompat, typeof (ushort),
 325                                 MSCompatUnicodeTableUtil.WidthCompat);
 326                         cjkCHS = CompressArray (cjkCHS,
 327                                 MSCompatUnicodeTableUtil.CjkCHS);
 328                         cjkCHT = CompressArray (cjkCHT,
 329                                 MSCompatUnicodeTableUtil.Cjk);
 330                         cjkJA = CompressArray (cjkJA,
 331                                 MSCompatUnicodeTableUtil.Cjk);
 332                         cjkKO = CompressArray (cjkKO,
 333                                 MSCompatUnicodeTableUtil.Cjk);
 334                         cjkKOlv2 = CompressArray (cjkKOlv2,
 335                                 MSCompatUnicodeTableUtil.Cjk);
 336
 337                         // Ignorables
 338                         Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
 339 #if Binary
 340                         MemoryStream ms = new MemoryStream ();
 341                         BinaryWriter binary = new BinaryWriter (ms);
 342                         binary.Write (ignorableFlags.Length);
 343 #endif
 344                         for (int i = 0; i < ignorableFlags.Length; i++) {
 345                                 byte value = ignorableFlags [i];
 346                                 if (value < 10)
 347                                         Result.Write ("{0},", value);
 348                                 else
 349                                         Result.Write ("0x{0:X02},", value);
 350 #if Binary
 351                                 binary.Write (value);
 352 #endif
 353                                 if ((i & 0xF) == 0xF)
 354                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 355                         }
 356                         Result.WriteLine ("};");
 357                         Result.WriteLine ();
 358
 359                         // Primary category
 360                         Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
 361 #if Binary
 362                         binary.Write (categories.Length);
 363 #endif
 364                         for (int i = 0; i < categories.Length; i++) {
 365                                 byte value = categories [i];
 366                                 if (value < 10)
 367                                         Result.Write ("{0},", value);
 368                                 else
 369                                         Result.Write ("0x{0:X02},", value);
 370 #if Binary
 371                                 binary.Write (value);
 372 #endif
 373                                 if ((i & 0xF) == 0xF)
 374                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 375                         }
 376                         Result.WriteLine ("};");
 377                         Result.WriteLine ();
 378
 379                         // Primary weight value
 380                         Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
 381 #if Binary
 382                         binary.Write (level1.Length);
 383 #endif
 384                         for (int i = 0; i < level1.Length; i++) {
 385                                 byte value = level1 [i];
 386                                 if (value < 10)
 387                                         Result.Write ("{0},", value);
 388                                 else
 389                                         Result.Write ("0x{0:X02},", value);
 390 #if Binary
 391                                 binary.Write (value);
 392 #endif
 393                                 if ((i & 0xF) == 0xF)
 394                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 395                         }
 396                         Result.WriteLine ("};");
 397                         Result.WriteLine ();
 398
 399                         // Secondary weight
 400                         Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
 401 #if Binary
 402                         binary.Write (level2.Length);
 403 #endif
 404                         for (int i = 0; i < level2.Length; i++) {
 405                                 byte value = level2 [i];
 406                                 if (value < 10)
 407                                         Result.Write ("{0},", value);
 408                                 else
 409                                         Result.Write ("0x{0:X02},", value);
 410 #if Binary
 411                                 binary.Write (value);
 412 #endif
 413                                 if ((i & 0xF) == 0xF)
 414                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 415                         }
 416                         Result.WriteLine ("};");
 417                         Result.WriteLine ();
 418
 419                         // Thirtiary weight
 420                         Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
 421 #if Binary
 422                         binary.Write (level3.Length);
 423 #endif
 424                         for (int i = 0; i < level3.Length; i++) {
 425                                 byte value = level3 [i];
 426                                 if (value < 10)
 427                                         Result.Write ("{0},", value);
 428                                 else
 429                                         Result.Write ("0x{0:X02},", value);
 430 #if Binary
 431                                 binary.Write (value);
 432 #endif
 433                                 if ((i & 0xF) == 0xF)
 434                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 435                         }
 436                         Result.WriteLine ("};");
 437                         Result.WriteLine ();
 438
 439                         // Width insensitivity mappings
 440                         // (for now it is more lightweight than dumping the
 441                         // entire NFKD table).
 442                         Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
 443 #if Binary
 444                         binary.Write (widthCompat.Length);
 445 #endif
 446                         for (int i = 0; i < widthCompat.Length; i++) {
 447                                 ushort value = widthCompat [i];
 448                                 if (value < 10)
 449                                         Result.Write ("{0},", value);
 450                                 else
 451                                         Result.Write ("0x{0:X02},", value);
 452 #if Binary
 453                                 binary.Write (value);
 454 #endif
 455                                 if ((i & 0xF) == 0xF)
 456                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 457                         }
 458                         Result.WriteLine ("};");
 459                         Result.WriteLine ();
 460 #if Binary
 461                         using (FileStream fs = File.Create ("../collation.core.bin")) {
 462                                 byte [] array = ms.ToArray ();
 463                                 fs.Write (array, 0, array.Length);
 464                         }
 465 #endif
 466
 467                         // CJK
 468                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 469                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 470                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 471                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 472                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 473                 }
 474
 475                 void SerializeCJK (string name, ushort [] cjk, int max)
 476                 {
 477                         int offset = 0;//char.MaxValue - cjk.Length;
 478                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 479 #if Binary
 480                         MemoryStream ms = new MemoryStream ();
 481                         BinaryWriter binary = new BinaryWriter (ms);
 482                         binary.Write (cjk.Length);
 483 #endif
 484                         for (int i = 0; i < cjk.Length; i++) {
 485                                 if (i + offset == max)
 486                                         break;
 487                                 ushort value = cjk [i];
 488                                 if (value < 10)
 489                                         Result.Write ("{0},", value);
 490                                 else
 491                                         Result.Write ("0x{0:X04},", value);
 492 #if Binary
 493                                 binary.Write (value);
 494 #endif
 495                                 if ((i & 0xF) == 0xF)
 496                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 497                         }
 498                         Result.WriteLine ("};");
 499                         Result.WriteLine ();
 500 #if Binary
 501                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 502                                 byte [] array = ms.ToArray ();
 503                                 fs.Write (array, 0, array.Length);
 504                         }
 505 #endif
 506                 }
 507
 508                 void SerializeCJK (string name, byte [] cjk, int max)
 509                 {
 510                         int offset = 0;//char.MaxValue - cjk.Length;
 511                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 512 #if Binary
 513                         MemoryStream ms = new MemoryStream ();
 514                         BinaryWriter binary = new BinaryWriter (ms);
 515 #endif
 516                         for (int i = 0; i < cjk.Length; i++) {
 517                                 if (i + offset == max)
 518                                         break;
 519                                 byte value = cjk [i];
 520                                 if (value < 10)
 521                                         Result.Write ("{0},", value);
 522                                 else
 523                                         Result.Write ("0x{0:X02},", value);
 524 #if Binary
 525                                 binary.Write (value);
 526 #endif
 527                                 if ((i & 0xF) == 0xF)
 528                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 529                         }
 530                         Result.WriteLine ("};");
 531                         Result.WriteLine ();
 532 #if Binary
 533                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 534                                 byte [] array = ms.ToArray ();
 535                                 fs.Write (array, 0, array.Length);
 536                         }
 537 #endif
 538                 }
 539
 540                 void SerializeTailorings ()
 541                 {
 542                         Hashtable indexes = new Hashtable ();
 543                         Hashtable counts = new Hashtable ();
 544                         Result.WriteLine ("static char [] tailorings = new char [] {");
 545                         int count = 0;
 546 #if Binary
 547                         MemoryStream ms = new MemoryStream ();
 548                         BinaryWriter binary = new BinaryWriter (ms);
 549 #endif
 550                         foreach (Tailoring t in tailorings) {
 551                                 if (t.Alias != 0)
 552                                         continue;
 553                                 Result.Write ("/*{0}*/", t.LCID);
 554                                 indexes.Add (t.LCID, count);
 555                                 char [] values = t.ItemToCharArray ();
 556                                 counts.Add (t.LCID, values.Length);
 557                                 foreach (char c in values) {
 558                                         Result.Write ("'\\x{0:X}', ", (int) c);
 559                                         if (++count % 16 == 0)
 560                                                 Result.WriteLine (" // {0:X04}", count - 16);
 561 #if Binary
 562                                         binary.Write ((ushort) c);
 563 #endif
 564                                 }
 565                         }
 566                         Result.WriteLine ("};");
 567
 568                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 569 #if Binary
 570                         byte [] rawdata = ms.ToArray ();
 571                         ms = new MemoryStream ();
 572                         binary = new BinaryWriter (ms);
 573                         binary.Write (tailorings.Count);
 574 #endif
 575                         foreach (Tailoring t in tailorings) {
 576                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 577                                 if (!indexes.ContainsKey (target)) {
 578                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
 579                                         continue;
 580                                 }
 581                                 int idx = (int) indexes [target];
 582                                 int cnt = (int) counts [target];
 583                                 bool french = t.FrenchSort;
 584                                 if (t.Alias != 0)
 585                                         foreach (Tailoring t2 in tailorings)
 586                                                 if (t2.LCID == t.LCID)
 587                                                         french = t2.FrenchSort;
 588                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 589 #if Binary
 590                                 binary.Write (t.LCID);
 591                                 binary.Write (idx);
 592                                 binary.Write (cnt);
 593                                 binary.Write (french);
 594 #endif
 595                         }
 596                         Result.WriteLine ("};");
 597 #if Binary
 598                         binary.Write ((byte) 0xFF);
 599                         binary.Write ((byte) 0xFF);
 600                         binary.Write (rawdata.Length / 2);
 601                         binary.Write (rawdata, 0, rawdata.Length);
 602
 603
 604                         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
 605                                 byte [] array = ms.ToArray ();
 606                                 fs.Write (array, 0, array.Length);
 607                         }
 608 #endif
 609                 }
 610
 611                 #region Parse
 612
 613                 void ParseSources (string dirname)
 614                 {
 615                         string unidata =
 616                                 dirname + "/UnicodeData.txt";
 617                         string derivedCoreProps =
 618                                 dirname + "/DerivedCoreProperties.txt";
 619                         string scripts =
 620                                 dirname + "/Scripts.txt";
 621                         string cp932 =
 622                                 dirname + "/CP932.TXT";
 623                         string derivedAge =
 624                                 dirname + "/DerivedAge.txt";
 625                         string chXML = dirname + "/common/collation/zh.xml";
 626                         string jaXML = dirname + "/common/collation/ja.xml";
 627                         string koXML = dirname + "/common/collation/ko.xml";
 628
 629                         ParseDerivedAge (derivedAge);
 630
 631                         FillIgnorables ();
 632
 633                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 634                         ParseUnidata (unidata);
 635                         ModifyUnidata ();
 636                         ParseDerivedCoreProperties (derivedCoreProps);
 637                         ParseScripts (scripts);
 638                         ParseCJK (chXML, jaXML, koXML);
 639
 640                         ParseTailorings ("mono-tailoring-source.txt");
 641                 }
 642
 643                 void ParseTailorings (string filename)
 644                 {
 645                         Tailoring t = null;
 646                         int line = 0;
 647                         using (StreamReader sr = new StreamReader (filename)) {
 648                                 try {
 649                                         while (sr.Peek () >= 0) {
 650                                                 line++;
 651                                                 ProcessTailoringLine (ref t,
 652                                                         sr.ReadLine ().Trim ());
 653                                         }
 654                                 } catch (Exception) {
 655                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 656                                         throw;
 657                                 }
 658                         }
 659                 }
 660
 661                 // For now this is enough.
 662                 string ParseTailoringSourceValue (string s)
 663                 {
 664                         StringBuilder sb = new StringBuilder ();
 665                         for (int i = 0; i < s.Length; i++) {
 666                                 if (s.StartsWith ("\\u")) {
 667                                         sb.Append ((char) int.Parse (
 668                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 669                                                 1);
 670                                         i += 5;
 671                                 }
 672                         else
 673                                 sb.Append (s [i]);
 674                         }
 675                         return sb.ToString ();
 676                 }
 677
 678                 void ProcessTailoringLine (ref Tailoring t, string s)
 679                 {
 680                         int idx = s.IndexOf ('#');
 681                         if (idx > 0)
 682                                 s = s.Substring (0, idx).Trim ();
 683                         if (s.Length == 0 || s [0] == '#')
 684                                 return;
 685                         if (s [0] == '@') {
 686                                 idx = s.IndexOf ('=');
 687                                 if (idx > 0)
 688                                         t = new Tailoring (
 689                                                 int.Parse (s.Substring (1, idx - 1)),
 690                                                 int.Parse (s.Substring (idx + 1)));
 691                                 else
 692                                         t = new Tailoring (int.Parse (s.Substring (1)));
 693                                 tailorings.Add (t);
 694                                 return;
 695                         }
 696                         if (s.StartsWith ("*FrenchSort")) {
 697                                 t.FrenchSort = true;
 698                                 return;
 699                         }
 700                         string d = "*Diacritical";
 701                         if (s.StartsWith (d)) {
 702                                 idx = s.IndexOf ("->");
 703                                 t.AddDiacriticalMap (
 704                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 705                                                 NumberStyles.HexNumber),
 706                                         byte.Parse (s.Substring (idx + 2).Trim (),
 707                                                 NumberStyles.HexNumber));
 708                                 return;
 709                         }
 710                         idx = s.IndexOf (':');
 711                         if (idx > 0) {
 712                                 string source = s.Substring (0, idx).Trim ();
 713                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 714                                 byte [] b = new byte [4];
 715                                 for (int i = 0; i < 4; i++) {
 716                                         if (l [i] == "*")
 717                                                 b [i] = 0;
 718                                         else
 719                                                 b [i] = byte.Parse (l [i],
 720                                                         NumberStyles.HexNumber);
 721                                 }
 722                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 723                                         b);
 724                         }
 725                         idx = s.IndexOf ('=');
 726                         if (idx > 0)
 727                                 t.AddReplacementMap (
 728                                         ParseTailoringSourceValue (
 729                                                 s.Substring (0, idx).Trim ()),
 730                                         ParseTailoringSourceValue (
 731                                                 s.Substring (idx + 1).Trim ()));
 732                 }
 733
 734                 void ParseDerivedAge (string filename)
 735                 {
 736                         using (StreamReader file =
 737                                 new StreamReader (filename)) {
 738                                 while (file.Peek () >= 0) {
 739                                         string s = file.ReadLine ();
 740                                         int idx = s.IndexOf ('#');
 741                                         if (idx >= 0)
 742                                                 s = s.Substring (0, idx);
 743                                         idx = s.IndexOf (';');
 744                                         if (idx < 0)
 745                                                 continue;
 746
 747                                         string cpspec = s.Substring (0, idx);
 748                                         idx = cpspec.IndexOf ("..");
 749                                         NumberStyles nf = NumberStyles.HexNumber |
 750                                                 NumberStyles.AllowTrailingWhite;
 751                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 752                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 753                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 754
 755                                         // FIXME: use index
 756                                         if (cp > char.MaxValue)
 757                                                 continue;
 758
 759                                         double v = double.Parse (value);
 760                                         for (int i = cp; i <= cpEnd; i++)
 761                                                 unicodeAge [i] = v;
 762                                 }
 763                         }
 764                         unicodeAge [0] = double.MaxValue; // never be supported
 765                 }
 766
 767                 void ParseUnidata (string filename)
 768                 {
 769                         ArrayList decompValues = new ArrayList ();
 770                         using (StreamReader unidata =
 771                                 new StreamReader (filename)) {
 772                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 773                                         try {
 774                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 775                                         } catch (Exception) {
 776                                                 Console.Error.WriteLine ("**** At line " + line);
 777                                                 throw;
 778                                         }
 779                                 }
 780                         }
 781                         this.decompValues = (int [])
 782                                 decompValues.ToArray (typeof (int));
 783                 }
 784
                // State kept between ProcessUnidataLine() calls.
                //
                // The Latin letter most recently derived from a character name;
                // it is used as the fallback target for a following LATIN line
                // whose name yields no letter. It goes back to char.MinValue
                // once 'Z' has been seen.
                char previousLatinTarget = char.MinValue;
                // One counter per letter 'A'..'Z' (indexed by letter - 'A').
                // ProcessUnidataLine() increments it and adds 0x7C to produce
                // the diacritical weight of a remapped Latin character.
                byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
 787
 788                 void ProcessUnidataLine (string s, ArrayList decompValues)
 789                 {
 790                         int idx = s.IndexOf ('#');
 791                         if (idx >= 0)
 792                                 s = s.Substring (0, idx);
 793                         idx = s.IndexOf (';');
 794                         if (idx < 0)
 795                                 return;
 796                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 797                         string [] values = s.Substring (idx + 1).Split (';');
 798
 799                         // FIXME: use index
 800                         if (cp > char.MaxValue)
 801                                 return;
 802                         if (IsIgnorable (cp))
 803                                 return;
 804
 805                         string name = values [0];
 806
 807                         // SPECIAL CASE: rename some characters for diacritical
 808                         // remapping. FIXME: why are they different?
 809                         // FIXME: it's still not working.
 810                         if (cp == 0x018B || cp == 0x018C)
 811                                 name = name.Replace ("TOPBAR", "STROKE");
 812
 813                         // isSmallCapital
 814                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 815                                 isSmallCapital [cp] = true;
 816
 817                         // latin mapping by character name
 818                         if (s.IndexOf ("LATIN") >= 0) {
 819                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 820                                 int offset = lidx + 15;
 821                                 if (lidx < 0) {
 822                                         lidx = s.IndexOf ("LETTER TURNED ");
 823                                         offset = lidx + 14;
 824                                 }
 825                                 if (lidx < 0) {
 826                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 827                                         offset = lidx + 15;
 828                                 }
 829                                 if (lidx < 0) {
 830                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 831                                         offset = lidx + 14;
 832                                 }
 833                                 if (lidx < 0) {
 834                                         lidx = s.IndexOf ("LETTER ");
 835                                         offset = lidx + 7;
 836                                 }
 837                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 838                                 char n = s [offset + 1];
 839                                 char target = char.MinValue;
 840                                 if ('A' <= c && c <= 'Z' &&
 841                                         (n == ' ') || n == ';') {
 842                                         target = c;
 843                                         // FIXME: After 'Z', I cannot reset this state.
 844                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 845                                 }
 846
 847                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 848                                         target = 'A';
 849                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 850                                         target = 'B';
 851                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 852                                         target = 'C';
 853                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 854                                         target = 'E';
 855                                 else if (s.Substring (offset).StartsWith ("ENG"))
 856                                         target = 'N';
 857                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 858                                         target = 'O';
 859                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 860                                         target = 'R';
 861                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 862                                         target = 'S';
 863                                 else if (s.Substring (offset).StartsWith ("ESH"))
 864                                         target = 'S';
 865
 866                                 if (target == char.MinValue)
 867                                         target = previousLatinTarget;
 868
 869                                 if (target != char.MinValue) {
 870                                         ArrayList entry = (ArrayList) latinMap [target];
 871                                         if (entry == null) {
 872                                                 entry = new ArrayList ();
 873                                                 latinMap [target] = entry;
 874                                         }
 875                                         entry.Add (cp);
 876                                         // FIXME: This secondary weight is hack.
 877                                         // They are here because they must not
 878                                         // be identical to the corresponding
 879                                         // ASCII latins.
 880                                         if (c != target && diacritical [cp] == 0) {
 881                                                 diacriticalOffset [c - 'A']++;
 882                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
 883                                         }
 884                                 }
 885                         }
 886
 887                         // Arrow names
 888                         if (0x2000 <= cp && cp < 0x3000) {
 889                                 int value = 0;
 890                                 // SPECIAL CASES. FIXME: why?
 891                                 switch (cp) {
 892                                 case 0x21C5: value = -1; break; // E2
 893                                 case 0x261D: value = 1; break;
 894                                 case 0x27A6: value = 3; break;
 895                                 case 0x21B0: value = 7; break;
 896                                 case 0x21B1: value = 3; break;
 897                                 case 0x21B2: value = 7; break;
 898                                 case 0x21B4: value = 5; break;
 899                                 case 0x21B5: value = 7; break;
 900                                 case 0x21B9: value = -1; break; // E1
 901                                 case 0x21CF: value = 7; break;
 902                                 case 0x21D0: value = 3; break;
 903                                 }
 904                                 string [] arrowTargets = new string [] {
 905                                         "",
 906                                         "UPWARDS",
 907                                         "NORTH EAST",
 908                                         "RIGHTWARDS",
 909                                         "SOUTH EAST",
 910                                         "DOWNWARDS",
 911                                         "SOUTH WEST",
 912                                         "LEFTWARDS",
 913                                         "NORTH WEST",
 914                                         };
 915                                 if (value == 0)
 916                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 917                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 918                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 919                                                         s.IndexOf (" OVER") < 0
 920                                                 )
 921                                                         value = i;
 922                                 if (value > 0)
 923                                         arrowValues.Add (new DictionaryEntry (
 924                                                 cp, value));
 925                         }
 926
 927                         // Box names
 928                         if (0x2500 <= cp && cp < 0x2600) {
 929                                 int value = 0;
 930                                 // flags:
 931                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 932                                 // [h,rl] [r] [l]
 933                                 // [v,ud] [u] [d]
 934                                 // [dr] [dl] [ur] [ul]
 935                                 // [vr,udr] [vl,vdl]
 936                                 // [hd,rld] [hu,rlu]
 937                                 // [hv,udrl,rlv,udh]
 938                                 ArrayList flags = new ArrayList (new int [] {
 939                                         32, 8 + 4, 8, 4,
 940                                         16, 1 + 2, 1, 2,
 941                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 942                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 943                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 944                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 945                                         });
 946                                 byte [] offsets = new byte [] {
 947                                         0, 0, 1, 2,
 948                                         3, 3, 4, 5,
 949                                         6, 7, 8, 9,
 950                                         10, 10, 11, 11,
 951                                         12, 12, 13, 13,
 952                                         14, 14, 14, 14};
 953                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
 954                                         int flag = 0;
 955                                         if (s.IndexOf (" UP") >= 0)
 956                                                 flag |= 1;
 957                                         if (s.IndexOf (" DOWN") >= 0)
 958                                                 flag |= 2;
 959                                         if (s.IndexOf (" RIGHT") >= 0)
 960                                                 flag |= 4;
 961                                         if (s.IndexOf (" LEFT") >= 0)
 962                                                 flag |= 8;
 963                                         if (s.IndexOf (" VERTICAL") >= 0)
 964                                                 flag |= 16;
 965                                         if (s.IndexOf (" HORIZONTAL") >= 0)
 966                                                 flag |= 32;
 967
 968                                         int fidx = flags.IndexOf (flag);
 969                                         value = fidx < 0 ? fidx : offsets [fidx];
 970                                 } else if (s.IndexOf ("BLOCK") >= 0) {
 971                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
 972                                                 value = 0x12;
 973                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
 974                                                 value = 0x13;
 975                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
 976                                                 value = 0x14;
 977                                         else if (s.IndexOf ("HALF") >= 0)
 978                                                 value = 0x15;
 979                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
 980                                                 value = 0x16;
 981                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
 982                                                 value = 0x17;
 983                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
 984                                                 value = 0x18;
 985                                         else
 986                                                 value = 0x19;
 987                                 }
 988                                 else if (s.IndexOf ("SHADE") >= 0)
 989                                         value = 0x19;
 990                                 else if (s.IndexOf ("SQUARE") >= 0)
 991                                         value = 0xBC - 0xE5;
 992                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
 993                                         value = 0xBE - 0xE5;
 994                                 else if (s.IndexOf ("RECTANGLE") >= 0)
 995                                         value = 0xBD - 0xE5;
 996                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
 997                                         value = 0xBF - 0xE5;
 998                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
 999                                         if (s.IndexOf ("UP-POINTING") >= 0)
1000                                                 value = 0xC0 - 0xE5;
1001                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1002                                                 value = 0xC1 - 0xE5;
1003                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1004                                                 value = 0xC2 - 0xE5;
1005                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1006                                                 value = 0xC3 - 0xE5;
1007                                 }
1008                                 else if (s.IndexOf ("POINTER") >= 0) {
1009                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1010                                                 value = 0xC4 - 0xE5;
1011                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1012                                                 value = 0xC5 - 0xE5;
1013                                 }
1014                                 else if (s.IndexOf ("DIAMOND") >= 0)
1015                                         value = 0xC6 - 0xE5;
1016                                 else if (s.IndexOf ("FISHEYE") >= 0)
1017                                         value = 0xC7 - 0xE5;
1018                                 else if (s.IndexOf ("LOZENGE") >= 0)
1019                                         value = 0xC8 - 0xE5;
1020                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1021                                         value = 0xC9 - 0xE5;
1022                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1023                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1024                                                 value = 0xCA - 0xE5;
1025                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1026                                                 value = 0xCB - 0xE5;
1027                                         else
1028                                                 value = 0xC9 - 0xE5;
1029                                 }
1030                                 if (0x25DA <= cp && cp <= 0x25E5)
1031                                         value = 0xCD + cp - 0x25DA - 0xE5;
1032
1033                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1034                                 switch (cp) {
1035                                 case 0x2571: value = 0xF; break;
1036                                 case 0x2572: value = 0x10; break;
1037                                 case 0x2573: value = 0x11; break;
1038                                 }
1039                                 if (value != 0)
1040                                         boxValues.Add (new DictionaryEntry (
1041                                                 cp, value));
1042                         }
1043
1044                         // For some characters store the name and sort later
1045                         // to determine sorting.
1046                         if (0x2100 <= cp && cp <= 0x213F &&
1047                                 Char.IsSymbol ((char) cp))
1048                                 sortableCharNames.Add (
1049                                         new DictionaryEntry (cp, name));
1050                         else if (0x3380 <= cp && cp <= 0x33DD)
1051                                 sortableCharNames.Add (new DictionaryEntry (
1052                                         cp, name.Substring (7)));
1053
1054                         if (Char.GetUnicodeCategory ((char) cp) ==
1055                                 UnicodeCategory.MathSymbol) {
1056                                 if (name.StartsWith ("CIRCLED "))
1057                                         diacritical [cp] = 0xEE;
1058                                 if (name.StartsWith ("SQUARED "))
1059                                         diacritical [cp] = 0xEF;
1060                         }
1061
1062                         // diacritical weights by character name
1063 if (diacritics.Length != diacriticWeights.Length)
1064 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1065                         for (int d = 0; d < diacritics.Length; d++) {
1066                                 if (s.IndexOf (diacritics [d]) > 0) {
1067                                         diacritical [cp] += diacriticWeights [d];
1068                                         if (s.IndexOf ("COMBINING") >= 0)
1069                                                 diacritical [cp] -= (byte) 2;
1070                                         continue;
1071                                 }
1072                                 // also process "COMBINING blah" here
1073                                 // For now it is limited to cp < 0x0370
1074 //                              if (cp < 0x0300 || cp >= 0x0370)
1075 //                                      continue;
1076                                 string tmp = diacritics [d].TrimEnd (';');
1077                                 if (tmp.IndexOf ("WITH ") == 0)
1078                                         tmp = tmp.Substring (4);
1079                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1080                                 if (name == tmp)
1081                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1082 //if (name == tmp)
1083 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1084                         }
1085                         // Two-step grep required for it.
1086                         if (s.IndexOf ("FULL STOP") > 0 &&
1087                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1088                                 diacritical [cp] |= 0xF4;
1089
1090                         // Arabic letter name
1091                         if (0x0621 <= cp && cp <= 0x064A &&
1092                                 Char.GetUnicodeCategory ((char) cp)
1093                                 == UnicodeCategory.OtherLetter) {
1094                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1095                                 switch (cp) {
1096                                 case 0x0621:
1097                                 case 0x0624:
1098                                 case 0x0626:
1099                                         // hamza, waw, yeh ... special cases.
1100                                         value = 0x07;
1101                                         break;
1102                                 case 0x0649:
1103                                 case 0x064A:
1104                                         value = 0x77; // special cases.
1105                                         break;
1106                                 default:
1107                                         // Get primary letter name i.e.
1108                                         // XXX part of ARABIC LETTER XXX yyy
1109                                         // e.g. that of "TEH MARBUTA" is "TEH".
1110                                         string letterName =
1111                                                 (cp == 0x0640) ?
1112                                                 // 0x0640 is special: it does
1113                                                 // not start with ARABIC LETTER
1114                                                 name :
1115                                                 name.Substring (14);
1116                                         int tmpIdx = letterName.IndexOf (' ');
1117                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1118 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1119                                         if (arabicNameMap.ContainsKey (letterName))
1120                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1121                                         else
1122                                                 arabicNameMap [letterName] = cp;
1123                                         break;
1124                                 }
1125                                 arabicLetterPrimaryValues [cp] = value;
1126                         }
1127
1128                         // Japanese square letter
1129                         if (0x3300 <= cp && cp <= 0x3357)
1130                                 if (!ExistsJIS (cp))
1131                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1132
1133                         // normalizationType
1134                         string decomp = values [4];
1135                         idx = decomp.IndexOf ('<');
1136                         if (idx >= 0) {
1137                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1138                                 case "full":
1139                                         decompType [cp] = DecompositionFull;
1140                                         break;
1141                                 case "sub":
1142                                         decompType [cp] = DecompositionSub;
1143                                         break;
1144                                 case "super":
1145                                         decompType [cp] = DecompositionSuper;
1146                                         break;
1147                                 case "small":
1148                                         decompType [cp] = DecompositionSmall;
1149                                         break;
1150                                 case "isolated":
1151                                         decompType [cp] = DecompositionIsolated;
1152                                         break;
1153                                 case "initial":
1154                                         decompType [cp] = DecompositionInitial;
1155                                         break;
1156                                 case "final":
1157                                         decompType [cp] = DecompositionFinal;
1158                                         break;
1159                                 case "medial":
1160                                         decompType [cp] = DecompositionMedial;
1161                                         break;
1162                                 case "noBreak":
1163                                         decompType [cp] = DecompositionNoBreak;
1164                                         break;
1165                                 case "compat":
1166                                         decompType [cp] = DecompositionCompat;
1167                                         break;
1168                                 case "fraction":
1169                                         decompType [cp] = DecompositionFraction;
1170                                         break;
1171                                 case "font":
1172                                         decompType [cp] = DecompositionFont;
1173                                         break;
1174                                 case "circle":
1175                                         decompType [cp] = DecompositionCircle;
1176                                         break;
1177                                 case "square":
1178                                         decompType [cp] = DecompositionSquare;
1179                                         break;
1180                                 case "wide":
1181                                         decompType [cp] = DecompositionWide;
1182                                         break;
1183                                 case "narrow":
1184                                         decompType [cp] = DecompositionNarrow;
1185                                         break;
1186                                 case "vertical":
1187                                         decompType [cp] = DecompositionVertical;
1188                                         break;
1189                                 default:
1190                                         throw new Exception ("Support NFKD type : " + decomp);
1191                                 }
1192                         }
1193                         else
1194                                 decompType [cp] = DecompositionCanonical;
1195                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1196                         if (decomp.Length > 0) {
1197
1198                                 string [] velems = decomp.Split (' ');
1199                                 int didx = decompValues.Count;
1200                                 decompIndex [cp] = didx;
1201                                 foreach (string v in velems)
1202                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1203                                 decompLength [cp] = velems.Length;
1204
1205                                 // [decmpType] -> this_cp
1206                                 int targetCP = (int) decompValues [didx];
1207                                 // for "(x)" it specially maps to 'x' .
1208                                 // FIXME: check if it is sane
1209                                 if (velems.Length == 3 &&
1210                                         (int) decompValues [didx] == '(' &&
1211                                         (int) decompValues [didx + 2] == ')')
1212                                         targetCP = (int) decompValues [didx + 1];
1213                                 // special: 0x215F "1/"
1214                                 else if (cp == 0x215F)
1215                                         targetCP = '1';
1216                                 else if (velems.Length > 1 &&
1217                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1218                                         // skip them, except for CJK ideograph compat
1219                                         targetCP = 0;
1220
1221                                 if (targetCP != 0) {
1222                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1223                                         if (entry == null) {
1224                                                 entry = new Hashtable ();
1225                                                 nfkdMap [targetCP] = entry;
1226                                         }
1227                                         entry [(byte) decompType [cp]] = cp;
1228                                 }
1229                         }
1230                         // numeric values
1231                         if (values [5].Length > 0)
1232                                 decimalValue [cp] = decimal.Parse (values [5]);
1233                         else if (values [6].Length > 0)
1234                                 decimalValue [cp] = decimal.Parse (values [6]);
1235                         else if (values [7].Length > 0) {
1236                                 string decstr = values [7];
1237                                 idx = decstr.IndexOf ('/');
1238                                 if (cp == 0x215F) // special. "1/"
1239                                         decimalValue [cp] = 0x1;
1240                                 else if (idx > 0)
1241                                         // m/n
1242                                         decimalValue [cp] =
1243                                                 decimal.Parse (decstr.Substring (0, idx))
1244                                                 / decimal.Parse (decstr.Substring (idx + 1));
1245                                 else if (decstr [0] == '(' &&
1246                                         decstr [decstr.Length - 1] == ')')
1247                                         // (n)
1248                                         decimalValue [cp] =
1249                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1250                                 else if (decstr [decstr.Length - 1] == '.')
1251                                         // n.
1252                                         decimalValue [cp] =
1253                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1254                                 else
1255                                         decimalValue [cp] = decimal.Parse (decstr);
1256                         }
1257                 }
1258
// Reads the text file at `filename` one line at a time and passes each
// line to ProcessDerivedCorePropLine (which records the "Uppercase"
// property). If processing a line throws, the 1-based line number is
// written to standard error and the exception is rethrown unchanged.
void ParseDerivedCoreProperties (string filename)
{
        using (StreamReader reader = new StreamReader (filename)) {
                int lineNumber = 0;
                while (reader.Peek () >= 0) {
                        lineNumber++;
                        try {
                                ProcessDerivedCorePropLine (reader.ReadLine ());
                        } catch (Exception) {
                                // Report where the failure happened, then let it propagate.
                                Console.Error.WriteLine ("**** At line " + lineNumber);
                                throw;
                        }
                }
        }
}
1274
// Handles a single property line of the form
//      XXXX ; Property # comment
//      XXXX..YYYY ; Property # comment
// Everything from '#' onward is discarded; lines without ';' are ignored.
// The code point(s) are parsed as hexadecimal. Only the "Uppercase"
// property is acted upon: isUppercase [x] is set for each x in the range.
// Entries whose first code point is above char.MaxValue are skipped.
void ProcessDerivedCorePropLine (string s)
{
        int commentStart = s.IndexOf ('#');
        if (commentStart >= 0)
                s = s.Substring (0, commentStart);

        int separator = s.IndexOf (';');
        if (separator < 0)
                return;

        string cpspec = s.Substring (0, separator);
        NumberStyles hex = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;

        // Either a single code point or a "first..last" range.
        int rangeMark = cpspec.IndexOf ("..");
        int first, last;
        if (rangeMark < 0) {
                first = int.Parse (cpspec, hex);
                last = first;
        } else {
                first = int.Parse (cpspec.Substring (0, rangeMark), hex);
                last = int.Parse (cpspec.Substring (rangeMark + 2), hex);
        }
        string property = s.Substring (cpspec.Length + 1).Trim ();

        // FIXME: use index
        if (first > char.MaxValue)
                return;

        if (property == "Uppercase") {
                for (int x = first; x <= last; x++)
                        isUppercase [x] = true;
        }
}
1302
// Reads a script-assignment file ("XXXX[..YYYY] ; ScriptName # comment"
// per line) and collects the non-ignorable characters (per IsIgnorable)
// that belong to the Gurmukhi, Gujarati, Georgian and Thaana scripts.
// Each collection is then sorted with UCAComparer.Instance and stored in
// orderedGurmukhi / orderedGujarati / orderedGeorgian / orderedThaana.
// Entries whose first code point is above char.MaxValue are skipped.
void ParseScripts (string filename)
{
        ArrayList gurmukhi = new ArrayList ();
        ArrayList gujarati = new ArrayList ();
        ArrayList georgian = new ArrayList ();
        ArrayList thaana = new ArrayList ();

        using (StreamReader reader = new StreamReader (filename)) {
                while (reader.Peek () >= 0) {
                        string s = reader.ReadLine ();

                        // Drop the trailing comment, if any.
                        int commentStart = s.IndexOf ('#');
                        if (commentStart >= 0)
                                s = s.Substring (0, commentStart);

                        int separator = s.IndexOf (';');
                        if (separator < 0)
                                continue;

                        string cpspec = s.Substring (0, separator);
                        int rangeMark = cpspec.IndexOf ("..");
                        NumberStyles hex = NumberStyles.HexNumber |
                                NumberStyles.AllowTrailingWhite;
                        int first = int.Parse (rangeMark < 0 ? cpspec : cpspec.Substring (0, rangeMark), hex);
                        int last = rangeMark < 0 ? first : int.Parse (cpspec.Substring (rangeMark + 2), hex);
                        string script = s.Substring (cpspec.Length + 1).Trim ();

                        // FIXME: use index
                        if (first > char.MaxValue)
                                continue;

                        // Pick the list for this script; other scripts are not collected.
                        ArrayList target;
                        switch (script) {
                        case "Gurmukhi":
                                target = gurmukhi;
                                break;
                        case "Gujarati":
                                target = gujarati;
                                break;
                        case "Georgian":
                                target = georgian;
                                break;
                        case "Thaana":
                                target = thaana;
                                break;
                        default:
                                continue;
                        }

                        for (int x = first; x <= last; x++) {
                                if (!IsIgnorable (x))
                                        target.Add ((char) x);
                        }
                }
        }

        gurmukhi.Sort (UCAComparer.Instance);
        gujarati.Sort (UCAComparer.Instance);
        georgian.Sort (UCAComparer.Instance);
        thaana.Sort (UCAComparer.Instance);
        orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
        orderedGujarati = (char []) gujarati.ToArray (typeof (char));
        orderedGeorgian = (char []) georgian.ToArray (typeof (char));
        orderedThaana = (char []) thaana.ToArray (typeof (char));
}
1366
// Reads the file at `filename` and hands every line to
// ProcessJISOrderLine. On any failure (including opening the file) the
// current 1-based line number is written to standard error and the
// exception is rethrown unchanged.
void ParseJISOrder (string filename)
{
        int lineNumber = 1;
        try {
                using (StreamReader reader = new StreamReader (filename)) {
                        while (reader.Peek () >= 0) {
                                ProcessJISOrderLine (reader.ReadLine ());
                                // Only advance once the line was processed successfully,
                                // so the reported number points at the failing line.
                                lineNumber++;
                        }
                }
        } catch (Exception) {
                Console.Error.WriteLine ("---- line {0}", lineNumber);
                throw;
        }
}
1381
// Column separators accepted by ProcessJISOrderLine.
char [] ws = new char [] {'\t', ' '};

// Handles one line of the JIS order file. After the '#' comment is
// removed, a line is expected to hold two "0x"-prefixed hexadecimal
// columns separated by tabs/spaces: the JIS code and the Unicode code
// point. Empty lines and lines with only one column are ignored.
// A JISCharacter (cp, jis) is appended to jisJapanese for each valid line.
void ProcessJISOrderLine (string s)
{
        int idx = s.IndexOf ('#');
        if (idx >= 0)
                s = s.Substring (0, idx);
        // Trim regardless of whether a comment was present. Previously
        // only commented lines were trimmed, so an uncommented line with
        // leading whitespace, only whitespace, or a lone first column
        // followed by trailing whitespace made the Substring calls
        // below throw.
        s = s.Trim ();
        if (s.Length == 0)
                return;
        idx = s.IndexOfAny (ws);
        if (idx < 0)
                return;
        // They start with "0x" so cut them out.
        int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
        int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
        jisJapanese.Add (new JISCharacter (cp, jis));
}
1399
// Fills the CJK weight tables cjkCHS, cjkCHT, cjkJA, cjkKO and cjkKOlv2.
//
//   zhXML - LDML file; its 'pinyin' collation feeds cjkCHS and its
//           'stroke' collation feeds cjkCHT.
//   jaXML - not read by this method; Japanese weights are derived from
//           the jisJapanese list and the decomposition tables instead.
//   koXML - LDML file whose reset rules feed cjkKO / cjkKOlv2.
//
// For chs, cht and ja the weights are handed out sequentially; whenever
// the running value reaches a multiple of 256 it is advanced by 2, so
// low bytes 00 and 01 are never assigned.
void ParseCJK (string zhXML, string jaXML, string koXML)
{
        const string omittedWarning = "---- warning: for {0} {1:X04} is omitted which should be {2:X04}";

        XmlDocument ldml = new XmlDocument ();
        ldml.XmlResolver = null;
        int weight;
        string members;
        string category;
        int offset;
        ushort [] table;

        // Chinese Simplified
        category = "chs";
        table = cjkCHS;
        offset = 0;//char.MaxValue - table.Length;
        ldml.Load (zhXML);
        members = ldml.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
        weight = 0x8008;
        foreach (char c in members) {
                if (c < '\u3100') {
                        // Below the accepted range: report it and do not consume a weight.
                        Console.Error.WriteLine (omittedWarning, category, (int) c, weight);
                        continue;
                }
                table [(int) c - offset] = (ushort) weight;
                weight++;
                if (weight % 256 == 0)
                        weight += 2;
        }

        // Chinese Traditional (same document, 'stroke' collation)
        category = "cht";
        table = cjkCHT;
        offset = 0;//char.MaxValue - table.Length;
        members = ldml.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
        weight = 0x8002;
        foreach (char c in members) {
                if (c < '\u4E00') {
                        Console.Error.WriteLine (omittedWarning, category, (int) c, weight);
                        continue;
                }
                table [(int) c - offset] = (ushort) weight;
                weight++;
                if (weight % 256 == 0)
                        weight += 2;
        }

        // Japanese
        category = "ja";
        table = cjkJA;
        offset = 0;//char.MaxValue - table.Length;

        // SPECIAL CASES
        table [0x4EDD] = 0x8002; // Chinese repetition mark?
        table [0x337B] = 0x8004; // Those 4 characters are Gengou
        table [0x337E] = 0x8005;
        table [0x337D] = 0x8006;
        table [0x337C] = 0x8007;

        weight = 0x8008;
        foreach (JISCharacter jc in jisJapanese) {
                if (jc.JIS < 0x8800)
                        continue;
                char c = (char) jc.CP;

                if (c < '\u4E00') {
                        Console.Error.WriteLine (omittedWarning, category, (int) c, weight);
                        continue;
                }

                table [(int) c - offset] = (ushort) weight;
                weight++;
                if (weight % 256 == 0)
                        weight += 2;

                // SPECIAL CASES: for these characters the related
                // (decomposing) characters are not given weights here.
                switch (c) {
                case '\u662D': // U+337C
                case '\u5927': // U+337D
                case '\u5E73': // U+337B
                case '\u660E': // U+337E
                case '\u9686': // U+F9DC
                        continue;
                }

                // Give the next weights to characters that decompose to c:
                // either c is the first decomposition value, or the
                // decomposition has three values and c is the middle one.
                // FIXME: there are still remaining
                // characters after U+FA0C.
                // (The upper bound was once char.MaxValue.)
                for (int k = 0; k < '\uFA0D'; k++) {
                        if (decompIndex [k] == 0 || IsIgnorable (k))
                                continue;
                        bool decomposesToC =
                                decompValues [decompIndex [k]] == c ||
                                (decompLength [k] == 3 &&
                                decompValues [decompIndex [k] + 1] == c);
                        if (!decomposesToC)
                                continue;
                        table [k - offset] = (ushort) weight;
                        weight++;
                        if (weight % 256 == 0)
                                weight += 2;
                }
        }

        // Korean
        // Korean weight is somewhat complex. It first shifts
        // Hangul category from 52-x to 80-x (they are computed
        // anyway). CJK ideographs are placed at secondary
        // weight, like XX YY 01 zz 01, where XX and YY are the
        // corresponding "reset" value and zz is 41,43,45...
        //
        // Unlike chs, cht and ja, the Korean value is a combined
        // ushort computed from the reset syllable (see below).
        //
        category = "ko";
        table = cjkKO;
        offset = 0;//char.MaxValue - table.Length;
        ldml.Load (koXML);
        foreach (XmlElement reset in ldml.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                // The node right after <reset> lists the characters to
                // be placed after the reset target.
                XmlElement following = (XmlElement) reset.NextSibling;
                // compute "category" and "level 1" for the
                // target "reset" Hangul syllable
                char syllable = reset.InnerText [0];
                int ordinal = ((int) syllable - 0xAC00) + 1;
                ushort primary = (ushort)
                        ((ordinal / 254) * 256 + (ordinal % 254) + 2);
                // Place the characters after the target: same primary
                // value, second-level values 0x41, 0x43, 0x45, ...
                members = following.InnerText;
                weight = 0x41;
                foreach (char c in members) {
                        table [(int) c - offset] = primary;
                        cjkKOlv2 [(int) c - offset] = (byte) weight;
                        weight += 2;
                }
        }
}
1530
1531                 #endregion
1532
1533                 #region Generation
1534
// Populates ignorableFlags for every UTF-16 code unit whose Unicode
// category is not OtherNotAssigned. Bits set per entry:
//      1 - IsIgnorable returned true
//      2 - IsIgnorableSymbol returned true
//      4 - IsIgnorableNonSpacing returned true
void FillIgnorables ()
{
        for (int cp = 0; cp <= char.MaxValue; cp++) {
                bool assigned = Char.GetUnicodeCategory ((char) cp) !=
                        UnicodeCategory.OtherNotAssigned;
                if (assigned) {
                        if (IsIgnorable (cp))
                                ignorableFlags [cp] |= 1;
                        if (IsIgnorableSymbol (cp))
                                ignorableFlags [cp] |= 2;
                        if (IsIgnorableNonSpacing (cp))
                                ignorableFlags [cp] |= 4;
                }
        }
}
1549
// Applies hard-coded adjustments to the parsed decomposition tables
// (decompType / decompIndex / decompLength / decompValues) and to the
// diacritical table for the parenthesized/circled Korean ranges.
void ModifyUnidata ()
{
        // Modify some decomposition equivalence:
        // drop the decomposition data of U+FE31 and U+FE32.
        foreach (int cp in new int [] {0xFE31, 0xFE32}) {
                decompType [cp] = 0;
                decompIndex [cp] = 0;
                decompLength [cp] = 0;
        }

        // Korean parens numbers
        for (int cp = 0x3200; cp <= 0x321C; cp++) {
                diacritical [cp] = 0xA;
        }
        for (int cp = 0x3260; cp <= 0x327B; cp++) {
                diacritical [cp] = 0xC;
        }

        // LAMESPEC: these remappings should not be done.
        // Windows has incorrect CJK compat mappings.
        decompValues [decompIndex [0x32A9]] = 0x91AB;

        decompLength [0x323B] = 1;
        decompValues [decompIndex [0x323B]] = 0x5B78;
        decompValues [decompIndex [0x32AB]] = 0x5B78;

        decompValues [decompIndex [0x32A2]] = 0x5BEB;

        decompLength [0x3238] = 1;
        decompValues [decompIndex [0x3238]] = 0x52DE;
        decompValues [decompIndex [0x3298]] = 0x52DE;

        // LAMESPEC: custom remapping (not bugs, but non-standard-compliant behavior)
        // U+FA0C takes over the decomposition slot of U+F929 and maps
        // to a single value; U+F929 is then left without decomposition.
        decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
        decompLength [0xFA0C] = 1;
        decompValues [decompIndex [0xFA0C]] = 0x5140;
        decompLength [0xF929] = 0;
        decompIndex [0xF929] = 0;

        decompValues [decompIndex [0xF92C]] = 0x90DE;
}
1585
// Post-processes parsed data:
//  1. numberSecondaryWeightBounds is read as consecutive (start, end)
//     pairs; every code point in [start, end) for which Char.IsNumber
//     is true gets a diacritical weight. The weight is 0x38 for the
//     first pair and grows by one per pair.
//  2. Entries of sortableCharNames (DictionaryEntry: code point -> name)
//     are renamed or removed for a fixed set of code points.
void ModifyParsedValues ()
{
        // number, secondary weights
        int [] bounds = numberSecondaryWeightBounds;
        byte weight = 0x38;
        for (int pair = 0; pair < bounds.Length; pair += 2) {
                for (int cp = bounds [pair]; cp < bounds [pair + 1]; cp++) {
                        if (Char.IsNumber ((char) cp))
                                diacritical [cp] = weight;
                }
                weight++;
        }

        // Update name part of named characters
        int index = 0;
        while (index < sortableCharNames.Count) {
                DictionaryEntry entry =
                        (DictionaryEntry) sortableCharNames [index];
                int cp = (int) entry.Key;
                string newName = null;
                bool drop = false;
                switch (cp) {
                case 0x2101: newName = "A_1"; break;
                case 0x33C3: newName = "A_2"; break;
                case 0x2105: newName = "C_1"; break;
                case 0x2106: newName = "C_2"; break;
                case 0x211E: newName = "R1"; break;
                case 0x211F: newName = "R2"; break;
                // Remove some of them!
                case 0x2103:
                case 0x2109:
                case 0x2116:
                case 0x2117:
                case 0x2118:
                case 0x2125:
                case 0x2127:
                case 0x2129:
                case 0x212E:
                case 0x2132:
                        drop = true;
                        break;
                }

                if (drop) {
                        // The next entry slides into this slot, so
                        // the index must not advance.
                        sortableCharNames.RemoveAt (index);
                        continue;
                }
                if (newName != null)
                        sortableCharNames [index] =
                                new DictionaryEntry (cp, newName);
                index++;
        }
}
1629
1630                 void GenerateCore ()
1631                 {
1632                         UnicodeCategory uc;
1633
1634                         #region Specially ignored // 01
1635                         // This will raise "Defined" flag up.
1636                         foreach (char c in specialIgnore)
1637                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1638                         #endregion
1639
1640
1641                         #region Variable weights
1642                         // Controls : 06 03 - 06 3D
1643                         fillIndex [6] = 3;
1644                         for (int i = 0; i < 65536; i++) {
1645                                 if (IsIgnorable (i))
1646                                         continue;
1647                                 char c = (char) i;
1648                                 uc = Char.GetUnicodeCategory (c);
1649                                 // NEL is whitespace but not ignored here.
1650                                 if (uc == UnicodeCategory.Control &&
1651                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1652                                         AddCharMap (c, 6, 1);
1653                         }
1654
1655                         // Apostrophe 06 80
1656                         fillIndex [6] = 0x80;
1657                         AddCharMapGroup ('\'', 6, 1, 0);
1658                         AddCharMap ('\uFE63', 6, 1);
1659
1660                         // Hyphen/Dash : 06 81 - 06 90
1661                         for (int i = 0; i < char.MaxValue; i++) {
1662                                 if (!IsIgnorable (i) &&
1663                                         Char.GetUnicodeCategory ((char) i) ==
1664                                         UnicodeCategory.DashPunctuation) {
1665                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1666                                         if (i == 0x2011) {
1667                                                 // SPECIAL: add 2027 and 2043
1668                                                 // Maybe they are regarded the
1669                                                 // same hyphens in "central"
1670                                                 // position.
1671                                                 AddCharMap ('\u2027', 6, 1);
1672                                                 AddCharMap ('\u2043', 6, 1);
1673                                         }
1674                                 }
1675                         }
1676
1677                         // Arabic variable weight chars 06 A0 -
1678                         fillIndex [6] = 0xA0;
1679                         // vowels
1680                         for (int i = 0x64B; i <= 0x650; i++)
1681                                 AddArabicCharMap ((char) i);
1682                         // sukun
1683                         AddCharMapGroup ('\u0652', 6, 1, 0);
1684                         // shadda
1685                         AddCharMapGroup ('\u0651', 6, 1, 0);
1686                         #endregion
1687
1688
1689                         #region Nonspacing marks // 01
1690                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1691
1692                         // Combining diacritical marks: 01 DC -
1693
1694                         fillIndex [0x1] = 0x41;
1695                         for (int i = 0x030E; i <= 0x0326; i++)
1696                                 if (!IsIgnorable (i))
1697                                         AddCharMap ((char) i, 0x1, 1);
1698                         for (int i = 0x0329; i <= 0x0334; i++)
1699                                 if (!IsIgnorable (i))
1700                                         AddCharMap ((char) i, 0x1, 1);
1701                         for (int i = 0x0339; i <= 0x0341; i++)
1702                                 if (!IsIgnorable (i))
1703                                         AddCharMap ((char) i, 0x1, 1);
1704                         fillIndex [0x1] = 0x72;
1705                         for (int i = 0x0346; i <= 0x0348; i++)
1706                                 if (!IsIgnorable (i))
1707                                         AddCharMap ((char) i, 0x1, 1);
1708                         for (int i = 0x02BE; i <= 0x02BF; i++)
1709                                 if (!IsIgnorable (i))
1710                                         AddCharMap ((char) i, 0x1, 1);
1711                         for (int i = 0x02C1; i <= 0x02C5; i++)
1712                                 if (!IsIgnorable (i))
1713                                         AddCharMap ((char) i, 0x1, 1);
1714                         for (int i = 0x02CE; i <= 0x02CF; i++)
1715                                 if (!IsIgnorable (i))
1716                                         AddCharMap ((char) i, 0x1, 1);
1717                         for (int i = 0x02D1; i <= 0x02D3; i++)
1718                                 if (!IsIgnorable (i))
1719                                         AddCharMap ((char) i, 0x1, 1);
1720                         AddCharMap ('\u02DE', 0x1, 1);
1721                         for (int i = 0x02E4; i <= 0x02E9; i++)
1722                                 if (!IsIgnorable (i))
1723                                         AddCharMap ((char) i, 0x1, 1);
1724
1725                         // FIXME: needs more love here (it should eliminate
1726                         // all the hacky code above).
1727                         for (int i = 0x0300; i < 0x0370; i++)
1728                                 if (!IsIgnorable (i) && diacritical [i] != 0
1729                                         /* especiall here*/ && !map [i].Defined)
1730                                         map [i] = new CharMapEntry (
1731                                                 0x1, 0x1, diacritical [i]);
1732
1733                         fillIndex [0x1] = 0x94;
1734                         // syriac dotted nonspacing marks
1735                         AddCharMap ('\u0732', 0x1, 1);
1736                         AddCharMap ('\u0735', 0x1, 1);
1737                         AddCharMap ('\u0738', 0x1, 1);
1738                         AddCharMap ('\u0739', 0x1, 1);
1739                         AddCharMap ('\u073C', 0x1, 1);
1740                         fillIndex [0x1] = 0x9F;
1741                         for (int i = 0x0730; i <= 0x07B0; i++)
1742                                 if (!IsIgnorable (i) && !map [i].Defined)
1743                                         AddCharMap ((char) i, 0x1, 1);
1744
1745                         fillIndex [0x1] = 0x0C;
1746                         for (int i = 0x0EC8; i <= 0x0ECD; i++)
1747                                 if (!IsIgnorable (i))
1748                                         AddCharMap ((char) i, 0x1, 1);
1749
1750                         // LAMESPEC: It should not stop at '\u20E1'. There are
1751                         // a few more characters (that however results in
1752                         // overflow of level 2 unless we start before 0xDD).
1753                         fillIndex [0x1] = 0xDD;
1754                         for (int i = 0x20d0; i <= 0x20e1; i++)
1755                                 AddCharMap ((char) i, 0x1, 1);
1756
1757                         // They are not part of Nonspacing marks, but have
1758                         // only diacritical weight.
1759                         for (int i = 0x3099; i <= 0x309C; i++)
1760                                 map [i] = new CharMapEntry (1, 1, 1);
1761                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
1762                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
1763                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1764                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1765                         for (int i = 0x30FC; i <= 0x30FE; i++)
1766                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1767
1768                         #endregion
1769
1770
1771                         #region Whitespaces // 07 03 -
1772                         fillIndex [0x7] = 0x2;
1773                         AddCharMap (' ', 0x7, 2);
1774                         AddCharMap ('\u00A0', 0x7, 1);
1775                         for (int i = 9; i <= 0xD; i++)
1776                                 AddCharMap ((char) i, 0x7, 1);
1777                         for (int i = 0x2000; i <= 0x200B; i++)
1778                                 AddCharMap ((char) i, 0x7, 1);
1779
1780                         fillIndex [0x7] = 0x17;
1781                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1782                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1783
1784                         // Characters which are used to represent layout control.
1785                         // LAMESPEC: Windows developers seem to have thought
1786                         // that those characters are a kind of whitespace,
1787                         // while they aren't.
1788                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1789                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1790                         #endregion
1791
1792                         // category 09 - continued symbols from 08
1793                         fillIndex [0x9] = 2;
1794                         // misc tech mark
1795                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1796                                 AddCharMap ((char) cp, 0x9, 1, 0);
1797
1798                         // arrows
1799                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1800                         foreach (DictionaryEntry de in arrowValues) {
1801                                 int idx = (int) de.Value;
1802                                 int cp = (int) de.Key;
1803                                 if (map [cp].Defined)
1804                                         continue;
1805                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1806                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1807                                 arrowLv2 [idx]++;
1808                         }
1809                         // boxes
1810                         byte [] boxLv2 = new byte [128];
1811                         for (int i = 0; i < boxLv2.Length; i++)
1812                                 boxLv2 [i] = 3;
1813                         foreach (DictionaryEntry de in boxValues) {
1814                                 int cp = (int) de.Key;
1815                                 int off = (int) de.Value;
1816                                 if (map [cp].Defined)
1817                                         continue;
1818                                 if (off < 0) {
1819                                         fillIndex [0x9] = (byte) (0xE5 + off);
1820                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1821                                 }
1822                                 else {
1823                                         fillIndex [0x9] = (byte) (0xE5 + off);
1824                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1825                                 }
1826                         }
1827                         // Some special characters (slanted)
1828                         fillIndex [0x9] = 0xF4;
1829                         AddCharMap ('\u2571', 0x9, 3);
1830                         AddCharMap ('\u2572', 0x9, 3);
1831                         AddCharMap ('\u2573', 0x9, 3);
1832
1833                         // FIXME: implement 0A
1834                         #region Symbols
1835                         fillIndex [0xA] = 2;
1836                         // byte currency symbols
1837                         for (int cp = 0; cp < 0x100; cp++) {
1838                                 uc = Char.GetUnicodeCategory ((char) cp);
1839                                 if (!IsIgnorable (cp) &&
1840                                         uc == UnicodeCategory.CurrencySymbol &&
1841                                         cp != '$' ||
1842                                         cp == 0xAC)
1843                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1844                         }
1845                         // byte other symbols
1846                         for (int cp = 0; cp < 0x100; cp++) {
1847                                 if (cp == 0xA6)
1848                                         continue; // SPECIAL: skip FIXME: why?
1849                                 uc = Char.GetUnicodeCategory ((char) cp);
1850                                 if (!IsIgnorable (cp) &&
1851                                         uc == UnicodeCategory.OtherSymbol ||
1852                                         cp == '\u00B5' || cp == '\u00B7')
1853                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1854                         }
1855
1856                         fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
1857                         for (int cp = 0x2020; cp <= 0x2031; cp++)
1858                                 if (Char.IsPunctuation ((char) cp))
1859                                         AddCharMap ((char) cp, 0xA, 1, 0);
1860                         // SPECIAL CASES: why?
1861                         AddCharMap ('\u203B', 0xA, 1, 0);
1862                         AddCharMap ('\u2040', 0xA, 1, 0);
1863                         AddCharMap ('\u2041', 0xA, 1, 0);
1864                         AddCharMap ('\u2042', 0xA, 1, 0);
1865
1866                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1867                                 AddCharMap ((char) cp, 0xA, 1, 0);
1868                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1869                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1870                                 AddCharMap ((char) cp, 0xA, 1, 0);
1871                         // Dingbats
1872                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1873                                 if (Char.IsSymbol ((char) cp))
1874                                         AddCharMap ((char) cp, 0xA, 1, 0);
1875                         // OCR
1876                         for (int i = 0x2440; i < 0x2460; i++)
1877                                 AddCharMap ((char) i, 0xA, 1, 0);
1878
1879                         #endregion
1880
1881                         #region Numbers // 0C 02 - 0C E1
1882                         fillIndex [0xC] = 2;
1883
1884                         // 9F8 : Bengali "one less than the denominator"
1885                         AddCharMap ('\u09F8', 0xC, 1);
1886
1887                         ArrayList numbers = new ArrayList ();
1888                         for (int i = 0; i < 65536; i++)
1889                                 if (!IsIgnorable (i) &&
1890                                         Char.IsNumber ((char) i) &&
1891                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1892                                         numbers.Add (i);
1893
1894                         ArrayList numberValues = new ArrayList ();
1895                         foreach (int i in numbers)
1896                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1897                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1898
1899 //foreach (DictionaryEntry de in numberValues)
1900 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1901
1902                         decimal prevValue = -1;
1903                         foreach (DictionaryEntry de in numberValues) {
1904                                 int cp = (int) de.Key;
1905                                 decimal currValue = (decimal) de.Value;
1906                                 bool addnew = false;
1907                                 if (prevValue < currValue &&
1908                                         prevValue - (int) prevValue == 0 &&
1909                                         prevValue >= 1) {
1910
1911                                         addnew = true;
1912                                         // Process Hangzhou and Roman numbers
1913
1914                                         // There are some SPECIAL cases.
1915                                         if (currValue != 4) // no increment for 4
1916                                                 fillIndex [0xC]++;
1917
1918                                         int xcp;
1919                                         if (currValue <= 10) {
1920                                                 xcp = (int) prevValue + 0x2170 - 1;
1921                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1922                                                 xcp = (int) prevValue + 0x2160 - 1;
1923                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1924                                                 fillIndex [0xC] += 2;
1925                                                 xcp = (int) prevValue + 0x3021 - 1;
1926                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1927                                                 fillIndex [0xC]++;
1928                                         }
1929                                         else if (currValue == 11)
1930                                                 fillIndex [0xC]++;
1931                                 }
1932                                 if (prevValue < currValue)
1933                                         prevValue = currValue;
1934                                 if (map [cp].Defined)
1935                                         continue;
1936                                 // Hangzhou and Roman numbers are added later
1937                                 // (the code is above)
1938                                 else if (0x3021 <= cp && cp < 0x302A
1939                                         || 0x2160 <= cp && cp < 0x216A
1940                                         || 0x2170 <= cp && cp < 0x217A)
1941                                         continue;
1942
1943                                 if (cp ==  0x215B) // FIXME: why?
1944                                         fillIndex [0xC] += 2;
1945                                 else if (cp == 0x3021) // FIXME: why?
1946                                         fillIndex [0xC]++;
1947                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1948                                 if (addnew || cp <= '9') {
1949                                         int mod = (int) currValue - 1;
1950                                         int xcp;
1951                                         if (1 <= currValue && currValue <= 10) {
1952                                                 xcp = mod + 0x2776;
1953                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1954                                                 xcp = mod + 0x2780;
1955                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1956                                                 xcp = mod + 0x278A;
1957                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1958                                         }
1959                                         if (1 <= currValue && currValue <= 20) {
1960                                                 xcp = mod + 0x2460;
1961                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1962                                                 xcp = mod + 0x2474;
1963                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1964                                                 xcp = mod + 0x2488;
1965                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1966                                         }
1967                                 }
1968
1969                                 if (cp != 0x09E7 && cp != 0x09EA)
1970                                         fillIndex [0xC]++;
1971
1972                                 // Add special cases that are not regarded as
1973                                 // numbers in UnicodeCategory speak.
1974                                 if (cp == '5') {
1975                                         // TONE FIVE
1976                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1977                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1978                                 }
1979                                 else if (cp == '6') // FIXME: why?
1980                                         fillIndex [0xC]++;
1981                         }
1982
1983                         // 221E: infinity
1984                         fillIndex [0xC] = 0xFF;
1985                         AddCharMap ('\u221E', 0xC, 1);
1986                         #endregion
1987
1988                         #region Letters and NonSpacing Marks (general)
1989
1990                         // ASCII Latin alphabets
1991                         for (int i = 0; i < alphabets.Length; i++)
1992                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1993
1994
1995                         // non-ASCII Latin alphabets
1996                         // FIXME: there are no such characters that are placed
1997                         // *after* "alphabets" array items. This is nothing
1998                         // more than a hack that creates dummy weights for
1999                         // primary characters.
2000                         for (int i = 0x0080; i < 0x0300; i++) {
2001                                 if (!Char.IsLetter ((char) i))
2002                                         continue;
2003                                 // Latin letters which have an NFKD mapping are
2004                                 // not added as independent primary characters.
2005                                 if (decompIndex [i] != 0)
2006                                         continue;
2007                                 // SPECIAL CASES:
2008                                 // 1.some alphabets have primarily
2009                                 //   equivalent ASCII alphabets.
2010                                 // 2.some have independent primary weights,
2011                                 //   but inside a-to-z range.
2012                                 // 3.there are some expanded characters that
2013                                 //   are not part of Unicode Standard NFKD.
2014                                 // 4. some characters are letters according to IsLetter
2015                                 //   but not in sortkeys (maybe a Unicode version
2016                                 //   difference caused it).
2017                                 switch (i) {
2018                                 // 1. skipping them does not make sense
2019 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2020 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2021 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2022 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2023 //                              case 0x19B: case 0x19C:
2024                                 // 2. skipping them does not make sense
2025 //                              case 0x14A: // Ng
2026 //                              case 0x14B: // ng
2027                                 // 3.
2028                                 case 0xC6: // AE
2029                                 case 0xE6: // ae
2030                                 case 0xDE: // Icelandic Thorn
2031                                 case 0xFE: // Icelandic Thorn
2032                                 case 0xDF: // German ss
2033                                 case 0xFF: // German ss
2034                                 // 4.
2035                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2036                                 // not classified yet
2037 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2038 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2039 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2040 //                              case 0x1DD:
2041                                         continue;
2042                                 }
2043                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2044                         }
2045
2046                         // Greek and Coptic
2047                         fillIndex [0xF] = 02;
2048                         for (int i = 0x0380; i < 0x0390; i++)
2049                                 if (Char.IsLetter ((char) i))
2050                                         AddLetterMap ((char) i, 0xF, 1);
2051                         fillIndex [0xF] = 02;
2052                         for (int i = 0x0391; i < 0x03CF; i++)
2053                                 if (Char.IsLetter ((char) i))
2054                                         AddLetterMap ((char) i, 0xF, 1);
2055                         fillIndex [0xF] = 0x40;
2056                         for (int i = 0x03D0; i < 0x0400; i++)
2057                                 if (Char.IsLetter ((char) i))
2058                                         AddLetterMap ((char) i, 0xF, 1);
2059
2060                         // Cyrillic.
2061                         // Cyrillic letters are sorted like Latin letters, i.e.
2062                         // with culture-specific letters placed within the
2063                         // standard Cyrillic sequence.
2064                         //
2065                         // We can't use UCA here; it has different sorting.
2066                         char [] orderedCyrillic = new char [] {
2067                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2068                                 '\u0452', // DJE for Serbocroatian
2069                                 '\u0435',
2070                                 '\u0454', // IE for Ukrainian
2071                                 '\u0436', '\u0437',
2072                                 '\u0455', // DZE
2073                                 '\u0438',
2074                                 '\u0456', // Byelorussian-Ukrainian I
2075                                 '\u0457', // YI
2076                                 '\u0439',
2077                                 '\u0458', // JE
2078                                 '\u043A', '\u043B',
2079                                 '\u0459', // LJE
2080                                 '\u043C', '\u043D',
2081                                 '\u045A', // NJE
2082                                 '\u043E',
2083                                 // 4E9 goes here.
2084                                 '\u043F', '\u0440', '\u0441', '\u0442',
2085                                 '\u045B', // TSHE for Serbocroatian
2086                                 '\u0443',
2087                                 '\u045E', // Short U for Byelorussian
2088                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2089                                 '\u0444', '\u0445', '\u0446', '\u0447',
2090                                 '\u045F', // DZHE
2091                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2092                                 '\u044D', '\u044E', '\u044F'};
2093
2094                         // For some characters, here is a map to basic Cyrillic
2095                         // letters. See UnicodeData.txt character names for
2096                         // the sources. Here I simply declare an equivalence array.
2097                         // Its entries are mapped from U+0490 (and U+0491) onward,
2098                         // skipping small letters.
2099                         char [] cymap_src = new char [] {
2100                                 '\u0433', '\u0433', '\u0433', '\u0436',
2101                                 '\u0437', '\u043A', '\u043A', '\u043A',
2102                                 '\u043A', '\u043D', '\u043D', '\u043F',
2103                                 '\u0445', '\u0441', '\u0442', '\u0443',
2104                                 '\u0443', '\u0445', '\u0446', '\u0447',
2105                                 '\u0447', '\u0432', '\u0435', '\u0435',
2106                                 '\u0406', '\u0436', '\u043A', '\u043D',
2107                                 '\u0447', '\u0435'};
2108
2109                         fillIndex [0x10] = 0x8D;
2110                         for (int i = 0x0460; i < 0x0481; i++) {
2111                                 if (Char.IsLetter ((char) i)) {
2112                                         if (i == 0x0476)
2113                                                 // U+476/477 have the same
2114                                                 // primary weight as U+474/475.
2115                                                 fillIndex [0x10] -= 3;
2116                                         AddLetterMap ((char) i, 0x10, 3);
2117                                 }
2118                         }
2119
2120                         fillIndex [0x10] = 0x6;
2121                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2122                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2123                                 if (!IsIgnorable ((int) c) &&
2124                                         Char.IsLetter (c) &&
2125                                         !map [c].Defined) {
2126                                         AddLetterMap (c, 0x10, 0);
2127                                         fillIndex [0x10] += 3;
2128                                 }
2129                         }
2130
2131                         for (int i = 0; i < cymap_src.Length; i++) {
2132                                 char c = cymap_src [i];
2133                                 fillIndex [0x10] = map [c].Level1;
2134                                 AddLetterMap ((char) (0x0490 + i * 2),
2135                                         0x10, 0);
2136                         }
2137
2138                         // Armenian
2139                         fillIndex [0x11] = 0x3;
2140                         for (int i = 0x0531; i < 0x0586; i++)
2141                                 if (Char.IsLetter ((char) i))
2142                                         AddLetterMap ((char) i, 0x11, 1);
2143
2144                         // Hebrew
2145                         // -Letters
2146                         fillIndex [0x12] = 0x2;
2147                         for (int i = 0x05D0; i < 0x05FF; i++)
2148                                 if (Char.IsLetter ((char) i))
2149                                         AddLetterMap ((char) i, 0x12, 1);
2150                         // -Accents
2151                         fillIndex [0x1] = 0x3;
2152                         for (int i = 0x0591; i <= 0x05C2; i++) {
2153                                 if (i == 0x05A3 || i == 0x05BB)
2154                                         fillIndex [0x1]++;
2155                                 if (i != 0x05BE)
2156                                         AddCharMap ((char) i, 0x1, 1);
2157                         }
2158
2159                         // Arabic
2160                         fillIndex [0x1] = 0x8E;
2161                         fillIndex [0x13] = 0x3;
2162                         for (int i = 0x0621; i <= 0x064A; i++) {
2163                                 // Abjad
2164                                 if (Char.GetUnicodeCategory ((char) i)
2165                                         != UnicodeCategory.OtherLetter) {
2166                                         // FIXME: arabic nonspacing marks are
2167                                         // in different order.
2168                                         AddCharMap ((char) i, 0x1, 1);
2169                                         continue;
2170                                 }
2171 //                              map [i] = new CharMapEntry (0x13,
2172 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2173                                 fillIndex [0x13] =
2174                                         (byte) arabicLetterPrimaryValues [i];
2175                                 AddLetterMap ((char) i, 0x13, 0);
2176                         }
2177                         fillIndex [0x13] = 0x84;
2178                         for (int i = 0x0674; i < 0x06D6; i++)
2179                                 if (Char.IsLetter ((char) i))
2180                                         AddLetterMap ((char) i, 0x13, 1);
2181
2182                         // Devanagari
2183                         // FIXME: it does seem straight codepoint mapping.
2184                         fillIndex [0x14] = 04;
2185                         for (int i = 0x0901; i < 0x0905; i++)
2186                                 if (!IsIgnorable (i))
2187                                         AddLetterMap ((char) i, 0x14, 2);
2188                         fillIndex [0x14] = 0xB;
2189                         for (int i = 0x0905; i < 0x093A; i++) {
2190                                 if (i == 0x0928)
2191                                         AddCharMap ('\u0929', 0x14, 0, 8);
2192                                 if (i == 0x0930)
2193                                         AddCharMap ('\u0931', 0x14, 0, 8);
2194                                 if (i == 0x0933)
2195                                         AddCharMap ('\u0934', 0x14, 0, 8);
2196                                 if (Char.IsLetter ((char) i))
2197                                         AddLetterMap ((char) i, 0x14, 4);
2198                                 if (i == 0x090B)
2199                                         AddCharMap ('\u0960', 0x14, 4);
2200                                 if (i == 0x090C)
2201                                         AddCharMap ('\u0961', 0x14, 4);
2202                         }
2203                         fillIndex [0x14] = 0xDA;
2204                         for (int i = 0x093E; i < 0x0945; i++)
2205                                 if (!IsIgnorable (i))
2206                                         AddLetterMap ((char) i, 0x14, 2);
2207                         fillIndex [0x14] = 0xEC;
2208                         for (int i = 0x0945; i < 0x094F; i++)
2209                                 if (!IsIgnorable (i))
2210                                         AddLetterMap ((char) i, 0x14, 2);
2211
2212                         // Bengali
2213                         // -Letters
2214                         fillIndex [0x15] = 02;
2215                         for (int i = 0x0980; i < 0x9FF; i++) {
2216                                 if (IsIgnorable (i))
2217                                         continue;
2218                                 if (i == 0x09E0)
2219                                         fillIndex [0x15] = 0x3B;
2220                                 switch (Char.GetUnicodeCategory ((char) i)) {
2221                                 case UnicodeCategory.NonSpacingMark:
2222                                 case UnicodeCategory.DecimalDigitNumber:
2223                                 case UnicodeCategory.OtherNumber:
2224                                         continue;
2225                                 }
2226                                 AddLetterMap ((char) i, 0x15, 1);
2227                         }
2228                         // -Signs
2229                         fillIndex [0x1] = 0x3;
2230                         for (int i = 0x0981; i < 0x0A00; i++)
2231                                 if (Char.GetUnicodeCategory ((char) i) ==
2232                                         UnicodeCategory.NonSpacingMark)
2233                                         AddCharMap ((char) i, 0x1, 1);
2234
2235                         // Gurmukhi. orderedGurmukhi is from UCA
2236                         // FIXME: it does not look equivalent to UCA.
2237                         fillIndex [0x16] = 04;
2238                         fillIndex [0x1] = 3;
2239                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2240                                 char c = orderedGurmukhi [i];
2241                                 if (IsIgnorable ((int) c))
2242                                         continue;
2243                                 if (IsIgnorableNonSpacing (c)) {
2244                                         AddLetterMap (c, 0x1, 1);
2245                                         continue;
2246                                 }
2247                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2248                                         '\u0A66' <= c && c <= '\u0A71')
2249                                         continue;
2250                                 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2251                                 byte shift = 4;
2252                                 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2253                                         shift = 0;
2254                                 AddLetterMap (c, 0x16, shift);
2255                         }
2256
2257                         // Gujarati. orderedGujarati is from UCA
2258                         fillIndex [0x17] = 0x4;
2259                         // nonspacing marks
2260                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2261                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2262                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2263                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2264                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2265                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2266                         // letters go first.
2267                         for (int i = 0; i < orderedGujarati.Length; i++) {
2268                                 // SPECIAL CASE
2269                                 char c = orderedGujarati [i];
2270                                 if (Char.IsLetter (c)) {
2271                                         // SPECIAL CASES
2272                                         if (c == '\u0AB3' || c == '\u0A32')
2273                                                 continue;
2274                                         if (c == '\u0A33') {
2275                                                 AddCharMap ('\u0A32', 0x17, 0);
2276                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2277                                                 continue;
2278                                         }
2279                                         if (c == '\u0A8B')
2280                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2281                                         AddCharMap (c, 0x17, 4);
2282
2283                                         if (c == '\u0AB9')
2284                                                 AddCharMap ('\u0AB3', 0x17, 6);
2285                                 }
2286                         }
2287                         // non-letters
2288                         byte gujaratiShift = 4;
2289                         fillIndex [0x17] = 0xC0;
2290                         for (int i = 0; i < orderedGujarati.Length; i++) {
2291                                 char c = orderedGujarati [i];
2292                                 if (fillIndex [0x17] == 0xCC)
2293                                         gujaratiShift = 3;
2294                                 if (!Char.IsLetter (c)) {
2295                                         // SPECIAL CASES
2296                                         if (c == '\u0A82')
2297                                                 AddCharMap ('\u0A81', 0x17, 2);
2298                                         if (c == '\u0AC2')
2299                                                 fillIndex [0x17]++;
2300                                         AddLetterMap (c, 0x17, gujaratiShift);
2301                                 }
2302                         }
2303
2304                         // Oriya
2305                         fillIndex [0x1] = 03;
2306                         fillIndex [0x18] = 02;
2307                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2308                                 switch (Char.GetUnicodeCategory ((char) i)) {
2309                                 case UnicodeCategory.NonSpacingMark:
2310                                 case UnicodeCategory.DecimalDigitNumber:
2311                                         AddLetterMap ((char) i, 0x1, 1);
2312                                         continue;
2313                                 }
2314                                 AddLetterMap ((char) i, 0x18, 1);
2315                         }
2316
2317                         // Tamil
2318                         fillIndex [0x19] = 2;
2319                         AddCharMap ('\u0BD7', 0x19, 0);
2320                         fillIndex [0x19] = 0xA;
2321                         // vowels
2322                         for (int i = 0x0B82; i <= 0x0B94; i++)
2323                                 if (!IsIgnorable ((char) i))
2324                                         AddCharMap ((char) i, 0x19, 2);
2325                         // special vowel
2326                         fillIndex [0x19] = 0x28;
2327                         // The array for Tamil consonants is a constant.
2328                         // Windows has almost the same sequence as TAM from
2329                         // tamilnet, but it differs slightly in Grantha.
2330                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2331                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2332                         // combining marks
2333                         fillIndex [0x19] = 0x82;
2334                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2335                                 if (Char.GetUnicodeCategory ((char) i) ==
2336                                         UnicodeCategory.SpacingCombiningMark
2337                                         || i == 0x0BC0)
2338                                         AddLetterMap ((char) i, 0x19, 2);
2339
2340                         // Telugu
2341                         fillIndex [0x1A] = 0x4;
2342                         for (int i = 0x0C00; i < 0x0C62; i++) {
2343                                 if (i == 0x0C55 || i == 0x0C56)
2344                                         continue; // skip
2345                                 AddCharMap ((char) i, 0x1A, 3);
2346                                 char supp = (i == 0x0C0B) ? '\u0C60':
2347                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2348                                 if (supp == char.MinValue)
2349                                         continue;
2350                                 AddCharMap (supp, 0x1A, 3);
2351                         }
2352
2353                         // Kannada
2354                         fillIndex [0x1B] = 4;
2355                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2356                                 if (i == 0x0CD5 || i == 0x0CD6)
2357                                         continue; // ignore
2358                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2359                                         continue; // shift after 0xCB9
2360                                 AddCharMap ((char) i, 0x1B, 3);
2361                                 if (i == 0x0CB9) {
2362                                         // SPECIAL CASES: but why?
2363                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2364                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2365                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2366                                 }
2367                                 if (i == 0x0CB2)
2368                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2369                         }
2370
2371                         // Malayalam
2372                         fillIndex [0x1C] = 2;
2373                         for (int i = 0x0D02; i < 0x0D61; i++)
2374                                 // FIXME: I avoided MSCompatUnicodeTable usage
2375                                 // here (it results in recursion). So check if
2376                                 // using NonSpacingMark makes sense or not.
2377                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2378 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2379                                         AddCharMap ((char) i, 0x1C, 1);
2380
2381                         // Thai ... note that it breaks 0x1E wall after E2B!
2382                         // Also, all Thai characters have level 2 value 3.
2383                         fillIndex [0x1E] = 2;
2384                         for (int i = 0xE40; i <= 0xE44; i++)
2385                                 AddCharMap ((char) i, 0x1E, 1, 3);
2386                         for (int i = 0xE01; i < 0xE2B; i++)
2387                                 AddCharMap ((char) i, 0x1E, 6, 3);
2388                         fillIndex [0x1F] = 5;
2389                         for (int i = 0xE2B; i < 0xE30; i++)
2390                                 AddCharMap ((char) i, 0x1F, 6, 3);
2391                         fillIndex [0x1F] = 0x1E;
2392                         for (int i = 0xE30; i < 0xE3B; i++)
2393                                 AddCharMap ((char) i, 0x1F, 1, 3);
2394                         // some Thai characters remain.
2395                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2396                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2397                         foreach (char c in specialThai)
2398                                 AddCharMap (c, 0x1F, 1);
2399
2400                         // Lao
2401                         fillIndex [0x1F] = 2;
2402                         for (int i = 0xE80; i < 0xEDF; i++)
2403                                 if (Char.IsLetter ((char) i))
2404                                         AddCharMap ((char) i, 0x1F, 1);
2405
2406                         // Georgian. orderedGeorgian is from UCA DUCET.
2407                         fillIndex [0x21] = 5;
2408                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2409                                 char c = orderedGeorgian [i];
2410                                 if (map [(int) c].Defined)
2411                                         continue;
2412                                 AddCharMap (c, 0x21, 0);
2413                                 if (c < '\u10F6')
2414                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2415                                 fillIndex [0x21] += 5;
2416                         }
2417
2418                         // Japanese Kana.
2419                         fillIndex [0x22] = 2;
2420                         int kanaOffset = 0x3041;
2421                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2422
2423                         for (int gyo = 0; gyo < 9; gyo++) {
2424                                 for (int dan = 0; dan < 5; dan++) {
2425                                         if (gyo == 7 && dan % 2 == 1) {
2426                                                 // 'ya'-gyo
2427                                                 fillIndex [0x22]++;
2428                                                 kanaOffset -= 2; // There is no space for yi and ye.
2429                                                 continue;
2430                                         }
2431                                         int cp = kanaOffset + dan * kanaLines [gyo];
2432                                         // small lines (a-gyo, ya-gyo)
2433                                         if (gyo == 0 || gyo == 7) {
2434                                                 AddKanaMap (cp, 1); // small
2435                                                 AddKanaMap (cp + 1, 1);
2436                                         }
2437                                         else
2438                                                 AddKanaMap (cp, kanaLines [gyo]);
2439                                         fillIndex [0x22]++;
2440
2441                                         if (cp == 0x30AB) {
2442                                                 // add small 'ka' (before normal one)
2443                                                 AddKanaMap (0x30F5, 1);
2444                                                 kanaOffset++;
2445                                         }
2446                                         if (cp == 0x30B1) {
2447                                                 // add small 'ke' (before normal one)
2448                                                 AddKanaMap (0x30F6, 1);
2449                                                 kanaOffset++;
2450                                         }
2451                                         if (cp == 0x3061) {
2452                                                 // add small 'Tsu' (before normal one)
2453                                                 AddKanaMap (0x3063, 1);
2454                                                 kanaOffset++;
2455                                         }
2456                                 }
2457                                 fillIndex [0x22] += 3;
2458                                 kanaOffset += 5 * kanaLines [gyo];
2459                         }
2460
2461                         // Wa-gyo is rather irregular, so its entries are added manually.
2462                         AddLetterMap ((char) 0x308E, 0x22, 0);
2463                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2464                         AddLetterMap ((char) 0x308F, 0x22, 0);
2465                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2466                         fillIndex [0x22]++;
2467                         AddLetterMap ((char) 0x3090, 0x22, 0);
2468                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2469                         fillIndex [0x22] += 2;
2470                         // no "Wu" in Japanese.
2471                         AddLetterMap ((char) 0x3091, 0x22, 0);
2472                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2473                         fillIndex [0x22]++;
2474                         AddLetterMap ((char) 0x3092, 0x22, 0);
2475                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2476                         // Nn
2477                         fillIndex [0x22] = 0x80;
2478                         AddLetterMap ((char) 0x3093, 0x22, 0);
2479                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2480
2481                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2482                                 map [0x30A6].Level1, 3);// voiced hiragana U
2483                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2484                                 map [0x30A6].Level1, 3);// voiced katakana U
2485
2486                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2487                                 map [0x30AB].Level1, 0);// small katakana Ka
2488                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2489                                 map [0x30B1].Level1, 0);// small katakana Ke
2490                         // voiced Wa lines
2491                         for (int i = 0x30F7; i < 0x30FB; i++)
2492                                 map [i] = new CharMapEntry (map [i - 8].Category,
2493                                         map [i - 8].Level1,
2494                                         3);
2495
2496                         // JIS Japanese square chars.
2497                         fillIndex [0x22] = 0x97;
2498                         jisJapanese.Sort (JISComparer.Instance);
2499                         foreach (JISCharacter j in jisJapanese)
2500                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2501                                         AddCharMap ((char) j.CP, 0x22, 1);
2502                         // non-JIS Japanese square chars.
2503                         nonJisJapanese.Sort (NonJISComparer.Instance);
2504                         foreach (NonJISCharacter j in nonJisJapanese)
2505                                 AddCharMap ((char) j.CP, 0x22, 1);
2506
2507                         // Bopomofo
2508                         fillIndex [0x23] = 0x02;
2509                         for (int i = 0x3105; i <= 0x312C; i++)
2510                                 AddCharMap ((char) i, 0x23, 1);
2511
2512                         // Estrangela: ancient Syriac
2513                         fillIndex [0x24] = 0x0B;
2514                         // FIXME: is 0x71E really alternative form?
2515                         ArrayList syriacAlternatives = new ArrayList (
2516                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2517                         for (int i = 0x0710; i <= 0x072C; i++) {
2518                                 if (i == 0x0711) // NonSpacingMark
2519                                         continue;
2520                                 if (syriacAlternatives.Contains (i))
2521                                         continue;
2522                                 AddCharMap ((char) i, 0x24, 4);
2523                                 // FIXME: why?
2524                                 if (i == 0x721)
2525                                         fillIndex [0x24]++;
2526                         }
2527                         foreach (int cp in syriacAlternatives)
2528                                 map [cp] = new CharMapEntry (0x24,
2529                                         (byte) (map [cp - 1].Level1 + 2),
2530                                         0);
2531                         // FIXME: Syriac NonSpacingMark should go here.
2532
2533                         // Thaana
2534                         // FIXME: it turned out that it does not look like UCA
2535                         fillIndex [0x24] = 0x6E;
2536                         for (int i = 0; i < orderedThaana.Length; i++) {
2537                                 char c = orderedThaana [i];
2538                                 if (IsIgnorableNonSpacing ((int) c))
2539                                         continue;
2540                                 AddCharMap (c, 0x24, 2);
2541                                 if (c == '\u0782') // SPECIAL CASE: why?
2542                                         fillIndex [0x24] += 2;
2543                         }
2544                         #endregion
2545
2546                         // FIXME: Add more culture-specific letters (that are
2547                         // not supported in Windows collation) here.
2548
2549                         // Surrogate ... they are computed.
2550
2551                         #region Hangul
2552                         // Hangul.
2553                         //
2554                         // Unlike UCA Windows Hangul sequence mixes Jongseong
2555                         // with Choseong sequence as well as Jungseong,
2556                         // adjusted to have the same primary weight for the
2557                         // same base character. So it is impossible to compute
2558                         // those sort keys.
2559                         //
2560                         // Here I introduce an ordered sequence of mixed
2561                         // 'commands' and 'characters' that is similar to
2562                         // LDML text:
2563                         //      - ',' increases primary weight.
2564                         //      - [A B] means a range, increasing index
2565                         //      - {A B} means a range, without increasing index
2566                         //      - '=' is no operation (it means the characters
2567                         //        of both sides have the same weight).
2568                         //      - '>' inserts a Hangul Syllable block that
2569                         //        contains 0x24C (0x15 * 0x1C) characters.
2570                         //      - '<' decreases the index
2571                         //      - '0'-'9' means skip count
2572                         //      - whitespaces are ignored
2573                         //
2574
2575                         string hangulSequence =
2576                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2577                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2578                         + "<{\u1113 \u1116}, \u3165,"
2579                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2580                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2581                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2582                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2583                                 + "[\u11D1 \u11D2], \u11B2,"
2584                                 + "[\u11D3 \u11D5], \u11B3,"
2585                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2586                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2587                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2588                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2589                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2590                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2591                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2592                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2593                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2594                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2595                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2596                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2597                                 + "\u11F1,, \u11F2,,,"
2598                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2599                         + "<\u114D, \u110D,,  >"
2600                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2601                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2602                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2603                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2604                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2605                                 + "[\u11F5 \u11F8]"
2606                         ;
2607
2608                         byte hangulCat = 0x52;
2609                         fillIndex [hangulCat] = 0x2;
2610
2611                         int syllableBlock = 0;
2612                         for (int n = 0; n < hangulSequence.Length; n++) {
2613                                 char c = hangulSequence [n];
2614                                 int start, end;
2615                                 if (Char.IsWhiteSpace (c))
2616                                         continue;
2617                                 switch (c) {
2618                                 case '=':
2619                                         break; // NOP
2620                                 case ',':
2621                                         IncrementSequentialIndex (ref hangulCat);
2622                                         break;
2623                                 case '<':
2624                                         if (fillIndex [hangulCat] == 2)
2625                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2626                                         fillIndex [hangulCat]--;
2627                                         break;
2628                                 case '>':
2629                                         IncrementSequentialIndex (ref hangulCat);
2630                                         for (int l = 0; l < 0x15; l++)
2631                                                 for (int v = 0; v < 0x1C; v++) {
2632                                                         AddCharMap (
2633                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2634                                                         IncrementSequentialIndex (ref hangulCat);
2635                                                 }
2636                                         syllableBlock++;
2637                                         break;
2638                                 case '[':
2639                                         start = hangulSequence [n + 1];
2640                                         end = hangulSequence [n + 3];
2641                                         for (int i = start; i <= end; i++) {
2642                                                 AddCharMap ((char) i, hangulCat, 0);
2643                                                 if (end > i)
2644                                                         IncrementSequentialIndex (ref hangulCat);
2645                                         }
2646                                         n += 4; // consumes 5 characters for this operation
2647                                         break;
2648                                 case '{':
2649                                         start = hangulSequence [n + 1];
2650                                         end = hangulSequence [n + 3];
2651                                         for (int i = start; i <= end; i++)
2652                                                 AddCharMap ((char) i, hangulCat, 0);
2653                                         n += 4; // consumes 5 characters for this operation
2654                                         break;
2655                                 default:
2656                                         AddCharMap (c, hangulCat, 0);
2657                                         break;
2658                                 }
2659                         }
2660
2661                         // Some Jamo NFKD.
2662                         for (int i = 0x3200; i < 0x3300; i++) {
2663                                 if (IsIgnorable (i) || map [i].Defined)
2664                                         continue;
2665                                 int ch = 0;
2666                                 // w/ bracket
2667                                 if (decompLength [i] == 4 &&
2668                                         decompValues [decompIndex [i]] == '(')
2669                                         ch = decompIndex [i] + 1;
2670                                 // circled
2671                                 else if (decompLength [i] == 2 &&
2672                                         decompValues [decompIndex [i] + 1] == '\u1161')
2673                                         ch = decompIndex [i];
2674                                 else if (decompLength [i] == 1)
2675                                         ch = decompIndex [i];
2676                                 else
2677                                         continue;
2678                                 ch = decompValues [ch];
2679                                 if (ch < 0x1100 || 0x1200 < ch &&
2680                                         ch < 0xAC00 || 0xD800 < ch)
2681                                         continue;
2682
2683                                 // SPECIAL CASE ?
2684                                 int offset = i < 0x3260 ? 1 : 0;
2685                                 if (0x326E <= i && i <= 0x3273)
2686                                         offset = 1;
2687
2688                                 map [i] = new CharMapEntry (map [ch].Category,
2689                                         (byte) (map [ch].Level1 + offset),
2690                                         map [ch].Level2);
2691 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2692                         }
2693
2694
2695                         #endregion
2696
2697                         // Letterlike characters and CJK compatibility square
2698                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2699                         int [] counts = new int ['Z' - 'A' + 1];
2700                         char [] namedChars = new char [sortableCharNames.Count];
2701                         int nCharNames = 0;
2702                         foreach (DictionaryEntry de in sortableCharNames) {
2703                                 counts [((string) de.Value) [0] - 'A']++;
2704                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2705                         }
2706                         nCharNames = 0; // reset
2707                         for (int a = 0; a < counts.Length; a++) {
2708                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2709                                 for (int i = 0; i < counts [a]; i++)
2710 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2711                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2712                         }
2713
2714                         // CJK unified ideograph.
2715                         byte cjkCat = 0x9E;
2716                         fillIndex [cjkCat] = 0x2;
2717                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2718                                 if (!IsIgnorable (cp))
2719                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2720                         // CJK Extensions goes here.
2721                         // LAMESPEC: With this Windows style CJK layout, it is
2722                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2723                         // 0x9FBB can never be added w/o breaking compat.
2724                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2725                                 if (!IsIgnorable (cp))
2726                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2727
2728                         // PrivateUse ... computed.
2729                         // remaining Surrogate ... computed.
2730
2731                         #region Special "biggest" area (FF FF)
2732                         fillIndex [0xFF] = 0xFF;
2733                         char [] specialBiggest = new char [] {
2734                                 '\u3005', '\u3031', '\u3032', '\u309D',
2735                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2736                                 '\uFE7C', '\uFE7D', '\uFF70'};
2737                         foreach (char c in specialBiggest)
2738                                 AddCharMap (c, 0xFF, 0);
2739                         #endregion
2740
2741                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2742                         // non-alphanumeric ASCII except for: + - < = > '
2743                         for (int i = 0x21; i < 0x7F; i++) {
2744                                 if (Char.IsLetterOrDigit ((char) i)
2745                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2746                                         continue; // they are not added here.
2747                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2748                                 // Insert 3001 after ',' and 3002 after '.'
2749                                 if (i == 0x2C)
2750                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2751                                 else if (i == 0x2E)
2752                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2753                                 else if (i == 0x3A)
2754                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2755                         }
2756                         #endregion
2757
2758                         #region 07 - Punctuations and something else
2759                         for (int i = 0xA0; i < char.MaxValue; i++) {
2760                                 if (IsIgnorable (i))
2761                                         continue;
2762
2763                                 // FIXME: actually those reset should not be
2764                                 // done but here I put for easy goal.
2765                                 if (i == 0x0700)
2766                                         fillIndex [0x7] = 0xE2;
2767                                 if (i == 0x2016)
2768                                         fillIndex [0x7] = 0x77;
2769
2770                                 // SPECIAL CASES:
2771                                 switch (i) {
2772                                 case 0xAB: // 08
2773                                 case 0xB7: // 0A
2774                                 case 0xBB: // 08
2775                                 case 0x2329: // 09
2776                                 case 0x232A: // 09
2777                                         continue;
2778                                 }
2779
2780                                 switch (Char.GetUnicodeCategory ((char) i)) {
2781                                 case UnicodeCategory.OtherPunctuation:
2782                                 case UnicodeCategory.ClosePunctuation:
2783                                 case UnicodeCategory.OpenPunctuation:
2784                                 case UnicodeCategory.InitialQuotePunctuation:
2785                                 case UnicodeCategory.FinalQuotePunctuation:
2786                                 case UnicodeCategory.ModifierSymbol:
2787                                         // SPECIAL CASES: // 0xA
2788                                         if (0x2020 <= i && i <= 0x2031)
2789                                                 continue;
2790                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2791                                         break;
2792                                 default:
2793                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2794                                                 goto case UnicodeCategory.OtherPunctuation;
2795                                         break;
2796                                 }
2797                         }
2798                         // Control pictures
2799                         // FIXME: it should not need to reset level 1, but
2800                         // it's for easy goal.
2801                         fillIndex [0x7] = 0xB6;
2802                         for (int i = 0x2400; i <= 0x2421; i++)
2803                                 AddCharMap ((char) i, 0x7, 1, 0);
2804                         #endregion
2805
2806                         // FIXME: for 07 xx we need more love.
2807
2808                         // Characters w/ diacritical marks (NFKD)
2809                         for (int i = 0; i <= char.MaxValue; i++) {
2810                                 if (map [i].Defined || IsIgnorable (i))
2811                                         continue;
2812                                 if (decompIndex [i] == 0)
2813                                         continue;
2814
2815                                 int start = decompIndex [i];
2816                                 int primaryChar = decompValues [start];
2817                                 int secondary = 0;
2818                                 bool skip = false;
2819                                 int length = decompLength [i];
2820                                 // special processing for parenthesized ones.
2821                                 if (length == 3 &&
2822                                         decompValues [start] == '(' &&
2823                                         decompValues [start + 2] == ')') {
2824                                         primaryChar = decompValues [start + 1];
2825                                         length = 1;
2826                                 }
2827
2828                                 if (map [primaryChar].Level1 == 0)
2829                                         continue;
2830
2831                                 for (int l = 1; l < length; l++) {
2832                                         int c = decompValues [start + l];
2833                                         if (map [c].Level1 != 0)
2834                                                 skip = true;
2835                                         secondary += diacritical [c];
2836                                 }
2837                                 if (skip)
2838                                         continue;
2839                                 map [i] = new CharMapEntry (
2840                                         map [primaryChar].Category,
2841                                         map [primaryChar].Level1,
2842                                         (byte) secondary);
2843
2844                         }
2845
2846                         // category 08 - symbols
2847                         fillIndex [0x8] = 2;
2848                         // Here Windows mapping is not straightforward. It is
2849                         // not based on computation but seems manual sorting.
2850                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
2851                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2852                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2853                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2854                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2855                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2856                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2857                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2858                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2859                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2860                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2861                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2862                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2863
2864                         for (int cp = 0; cp < 0x2300; cp++) {
2865                                 if (cp == 0xAC) // SPECIAL CASE: skip
2866                                         continue;
2867                                 if (cp == 0x200) {
2868                                         cp = 0x2200; // skip to 2200
2869                                         fillIndex [0x8] = 0x21;
2870                                 }
2871                                 if (cp == 0x2295)
2872                                         fillIndex [0x8] = 0x3;
2873                                 if (cp == 0x22B2)
2874                                         fillIndex [0x8] = 0xB9;
2875                                 if (!map [cp].Defined &&
2876 //                                      Char.GetUnicodeCategory ((char) cp) ==
2877 //                                      UnicodeCategory.MathSymbol)
2878                                         Char.IsSymbol ((char) cp))
2879                                         AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
2880                                 // SPECIAL CASES: no idea why Windows sorts as such
2881                                 switch (cp) {
2882                                 case 0x3E:
2883                                         AddCharMap ('\u227B', 0x8, 1, 0);
2884                                         AddCharMap ('\u22B1', 0x8, 1, 0);
2885                                         break;
2886                                 case 0xB1:
2887                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2888                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
2889                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2890                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
2891                                         break;
2892                                 case 0xF7:
2893                                         AddCharMap ('\u01C0', 0x8, 1, 0);
2894                                         AddCharMap ('\u01C1', 0x8, 1, 0);
2895                                         AddCharMap ('\u01C2', 0x8, 1, 0);
2896                                         break;
2897                                 }
2898                         }
2899
2900                         #region Level2 adjustment
2901                         // Arabic Hamzah
2902                         diacritical [0x624] = 0x5;
2903                         diacritical [0x626] = 0x7;
2904                         diacritical [0x622] = 0x9;
2905                         diacritical [0x623] = 0xA;
2906                         diacritical [0x625] = 0xB;
2907                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2908                         diacritical [0x64A] = 0x7; // Yaa'
2909
2910                         for (int i = 0; i < char.MaxValue; i++) {
2911                                 byte mod = 0;
2912                                 byte cat = map [i].Category;
2913                                 switch (cat) {
2914                                 case 0xE: // Latin diacritics
2915                                 case 0x22: // Japanese: circled characters
2916                                         mod = diacritical [i];
2917                                         break;
2918                                 case 0x13: // Arabic
2919                                         if (diacritical [i] == 0 && i >= 0xFE8D)
2920                                                 mod = 0x8; // default for arabic
2921                                         break;
2922                                 }
2923                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2924                                         mod = diacritical [i];
2925                                 if (mod > 0)
2926                                         map [i] = new CharMapEntry (
2927                                                 cat, map [i].Level1, mod);
2928                         }
2929                         #endregion
2930
2931                         // FIXME: this is hack but those NonSpacingMark
2932                         // characters and still undefined are likely to
2933                         // be nonspacing.
2934                         for (int i = 0; i < char.MaxValue; i++)
2935                                 if (!map [i].Defined &&
2936                                         !IsIgnorable (i) &&
2937                                         Char.GetUnicodeCategory ((char) i) ==
2938                                         UnicodeCategory.NonSpacingMark)
2939                                         AddCharMap ((char) i, 1, 1);
2940
2941                         // FIXME: this is hack but those Symbol characters
2942                         // are likely to fall into 0xA category.
2943                         for (int i = 0; i < char.MaxValue; i++)
2944                                 if (!map [i].Defined &&
2945                                         !IsIgnorable (i) &&
2946                                         Char.IsSymbol ((char) i))
2947                                         AddCharMap ((char) i, 0xA, 1);
2948                 }
2949
2950                 private void IncrementSequentialIndex (ref byte hangulCat)
2951                 {
                             // Moves to the next sequential weight: adds one to
                             // fillIndex [hangulCat] and, when the result reads 0,
                             // advances hangulCat itself (it is passed by reference)
                             // and starts the new category at 0x2.
                             // The parameter is not Hangul-specific in this file:
                             // AddCharMapCJK passes its CJK category through here.
                             // NOTE(review): the "== 0" test presumably relies on
                             // fillIndex being a byte array so 0xFF + 1 wraps to 0;
                             // confirm against the field declaration.
2952                         fillIndex [hangulCat]++;
2953                         if (fillIndex [hangulCat] == 0) { // overflowed: the index wrapped to 0
2954                                 hangulCat++;
2955                                 fillIndex [hangulCat] = 0x2;
2956                         }
2957                 }
2958
2959                 // Reset fillIndex to a fixed value and call AddLetterMap().
                     //
                     // Sets fillIndex [category] to alphaWeight, registers c via
                     // AddLetterMap, then registers every code point listed under
                     // latinMap [c] (when that entry is an ArrayList) the same way.
                     // Every AddLetterMap call passes updateCount 0, so this
                     // method does not ask for fillIndex [category] to be advanced.
2960                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2961                 {
2962                         fillIndex [category] = alphaWeight;
2963                         AddLetterMap (c, category, 0);
2964
                             // No related characters recorded for c (or the entry is
                             // not an ArrayList): nothing more to add.
2965                         ArrayList al = latinMap [c] as ArrayList;
2966                         if (al == null)
2967                                 return;
2968
                             // The list elements are unboxed as int code points.
2969                         foreach (int cp in al)
2970                                 AddLetterMap ((char) cp, category, 0);
2971                 }
2972
2973                 private void AddKanaMap (int i, byte voices)
2974                 {
                             // Registers `voices` consecutive code points starting at
                             // i, each together with the character 0x60 code points
                             // above it, in category 0x22. updateCount is always 0, so
                             // this method does not ask for fillIndex [0x22] to move.
2975                         for (byte b = 0; b < voices; b++) {
2976                                 char c = (char) (i + b);
                                     // level2 value: 0 for the first character of the run,
                                     // b + 2 for each following one.
2977                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
2978                                 // Hiragana
2979                                 AddLetterMapCore (c, 0x22, 0, arg);
2980                                 // Katakana
2981                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
2982                         }
2983                 }
2984
2985                 private void AddLetterMap (char c, byte category, byte updateCount)
2986                 {
                             // Convenience wrapper: AddLetterMapCore with level2 = 0.
2987                         AddLetterMapCore (c, category, updateCount, 0);
2988                 }
2989
2990                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2991                 {
                             // Registers a letter and its related forms in `category`.
                             // Order matters because each step reads the current
                             // fillIndex [category]:
                             //   1. the character ToSmallForm (c) returns, if different
                             //   2. the invariant-culture lower case of c, if different
                             //      and not yet defined (recursively, updateCount 0)
                             //   3. c itself (skipped when ignorable or already defined)
                             // fillIndex [category] is advanced by updateCount at the
                             // end only when step 3 actually registered c.
                             // level2 is passed through unchanged to every call.
2992                         char c2;
2993                         // <small> updates index
                             // (ToSmallForm yields c's decomposition target when c has
                             // a DecompositionSmall decomposition, otherwise c.)
2994                         c2 = ToSmallForm (c);
2995                         if (c2 != c)
2996                                 AddCharMapGroup (c2, category, updateCount, level2);
2997                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2998                         if (c2 != c && !map [(int) c2].Defined)
2999                                 AddLetterMapCore (c2, category, 0, level2);
3000                         bool doUpdate = true;
3001                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3002                                 doUpdate = false;
3003                         else
3004                                 AddCharMapGroup (c, category, 0, level2);
3005                         if (doUpdate)
3006                                 fillIndex [category] += updateCount;
3007                 }
3008
3009                 private bool AddCharMap (char c, byte category, byte increment)
3010                 {
                             // Convenience overload: same as the four-argument
                             // AddCharMap with alt = 0. Returns its result.
3011                         return AddCharMap (c, category, increment, 0);
3012                 }
3013
3014                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3015                 {
                             // Writes the map entry for c unless c is ignorable or
                             // already has one. Returns true when an entry was written,
                             // false when nothing was done (fillIndex is then untouched).
3016                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3017                                 return false; // do nothing
                             // Normally the running fillIndex [category] is the second
                             // CharMapEntry argument and alt the third. For category 1
                             // the two are swapped: alt goes second and the running
                             // index third. (Elsewhere in this file the second and third
                             // constructor arguments are filled from Level1 and Level2.)
3018                         map [(int) c] = new CharMapEntry (category,
3019                                 category == 1 ? alt : fillIndex [category],
3020                                 category == 1 ? fillIndex [category] : alt);
                             // Advance the running index only after a successful write.
3021                         fillIndex [category] += increment;
3022                         return true;
3023                 }
3024
3025                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
3026                 {
                             // Registers, in this order and each with `updateCount` as
                             // the AddCharMap increment and level2/alt = 0:
                             //   - ToSmallFormTail (c), when it differs from c
                             //   - c itself
                             //   - the whole group of ToFullWidthTail (c), recursively,
                             //     when it differs from c
                             // The *Tail helpers return the LAST value of c's
                             // decomposition (see ToDecomposed with tail = true).
3027                         char c2 = ToSmallFormTail (c);
3028                         if (c2 != c)
3029                                 AddCharMap (c2, category, updateCount, 0);
3030                         // itself
3031                         AddCharMap (c, category, updateCount, 0);
3032                         // <full>
3033                         c2 = ToFullWidthTail (c);
3034                         if (c2 != c)
3035                                 AddCharMapGroupTail (c2, category, updateCount);
3036                 }
3037
3038                 //
3039                 // Adds characters to the table in the order below
3040                 // (+ increases weight):
3041                 //      (<small> +)
3042                 //      itself
3043                 //      <fraction>
3044                 //      <full> | <super> | <sub>
3045                 //      <circle> | <wide> (| <narrow>)
3046                 //      +
3047                 //      (vertical +)
3048                 //
3049                 // level2 is fixed (does not increase).
                     //
                     // sameWeightItems lists the decomposition types whose related
                     // characters are registered at the same fillIndex value as the
                     // character itself; AddCharMapGroup looks them up in this order.
3050                 int [] sameWeightItems = new int [] {
3051                         DecompositionFraction,
3052                         DecompositionFull,
3053                         DecompositionSuper,
3054                         DecompositionSub,
3055                         DecompositionCircle,
3056                         DecompositionWide,
3057                         DecompositionNarrow,
3058                         };
3059                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3060                 {
                             // c           - character whose group is registered
                             // category    - category written to every entry
                             // updateCount - amount added to fillIndex [category] at
                             //               each "+" step in the order above
                             // level2      - passed as `alt` to AddCharMap for c, the
                             //               same-weight variants and the vertical form
                             // Does nothing at all when c already has a map entry.
3061                         if (map [(int) c].Defined)
3062                                 return;
3063
                             // char.MinValue doubles as "no such variant".
3064                         char small = char.MinValue;
3065                         char vertical = char.MinValue;
                             // nfkdMap [c] is a Hashtable indexed here by decomposition
                             // type (as byte); its values are unboxed as int code points.
3066                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3067                         if (nfkd != null) {
3068                                 object smv = nfkd [(byte) DecompositionSmall];
3069                                 if (smv != null)
3070                                         small = (char) ((int) smv);
3071                                 object vv = nfkd [(byte) DecompositionVertical];
3072                                 if (vv != null)
3073                                         vertical = (char) ((int) vv);
3074                         }
3075
3076                         // <small> updates index
                             // Note: the three-argument overload is used, so the small
                             // form is registered with alt 0 rather than `level2`.
3077                         if (small != char.MinValue)
3078                                 AddCharMap (small, category, updateCount);
3079
3080                         // itself
3081                         AddCharMap (c, category, 0, level2);
3082
                             // Same-weight variants: increment 0 keeps them on the
                             // fillIndex value used for c.
3083                         if (nfkd != null) {
3084                                 foreach (int weight in sameWeightItems) {
3085                                         object wv = nfkd [(byte) weight];
3086                                         if (wv != null)
3087                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
3088                                 }
3089                         }
3090
3091                         // update index here.
                             // This increment is unconditional: it happens even when
                             // the AddCharMap calls above returned false.
3092                         fillIndex [category] += updateCount;
3093
                             // The vertical form is registered after the index moved
                             // and moves it again when it is actually added.
3094                         if (vertical != char.MinValue)
3095                                 AddCharMap (vertical, category, updateCount, level2);
3096                 }
3097
3098                 private void AddCharMapCJK (char c, ref byte category)
3099                 {
                             // Registers c at the current fillIndex [category] and then
                             // steps to the next sequential index, which may move
                             // `category` itself (see IncrementSequentialIndex).
                             // The AddCharMap result is ignored, so the index advances
                             // even when c was ignorable or already defined.
3100                         AddCharMap (c, category, 0, 0);
3101                         IncrementSequentialIndex (ref category);
3102
3103                         // Special. I wonder why but Windows skips 9E F9.
                             // (Implemented by stepping once more when the next free
                             // slot would be category 0x9E, index 0xF9.)
3104                         if (category == 0x9E && fillIndex [category] == 0xF9)
3105                                 IncrementSequentialIndex (ref category);
3106                 }
3107
3108                 private void AddCharMapGroupCJK (char c, ref byte category)
3109                 {
                             // Registers the ideograph c, then (for four specific
                             // ideographs) some hard-coded 32xx characters, then every
                             // character recorded in nfkdMap [c] for decomposition types
                             // 0..0x12, each on its own sequential index.
                             // `category` is by reference because AddCharMapCJK may
                             // advance it.
3110                         AddCharMapCJK (c, ref category);
3111
3112                         // LAMESPEC: see below.
                             // These six characters are placed explicitly here and are
                             // therefore skipped by the switch in the loop further down.
3113                         if (c == '\u5B78') {
3114                                 AddCharMapCJK ('\u32AB', ref category);
3115                                 AddCharMapCJK ('\u323B', ref category);
3116                         }
3117                         if (c == '\u52DE') {
3118                                 AddCharMapCJK ('\u3298', ref category);
3119                                 AddCharMapCJK ('\u3238', ref category);
3120                         }
3121                         if (c == '\u5BEB')
3122                                 AddCharMapCJK ('\u32A2', ref category);
3123                         if (c == '\u91AB')
3124                                 // Especially this mapping order totally does
3125                                 // not make sense to me.
3126                                 AddCharMapCJK ('\u32A9', ref category);
3127
3128                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3129                         if (nfkd == null)
3130                                 return;
                             // The loop variable is a byte so that the boxed key matches
                             // the (byte) keys used for nfkd lookups elsewhere in this
                             // file. NOTE(review): the name "weight" actually holds a
                             // decomposition type value.
3131                         for (byte weight = 0; weight <= 0x12; weight++) {
3132                                 object wv = nfkd [weight];
3133                                 if (wv == null)
3134                                         continue;
3135                                 int w = (int) wv;
3136
3137                                 // Special: they are ignored in this area.
3138                                 // FIXME: check if it is sane
3139                                 if (0xF900 <= w && w <= 0xFAD9)
3140                                         continue;
3141                                 // LAMESPEC: on Windows some of CJK characters
3142                                 // in 3200-32B0 are incorrectly mapped. They
3143                                 // mix Chinese and Japanese Kanji when
3144                                 // ordering those characters.
3145                                 switch (w) {
3146                                 case 0x32A2: case 0x3298: case 0x3238:
3147                                 case 0x32A9: case 0x323B: case 0x32AB:
3148                                         continue;
3149                                 }
3150
3151                                 AddCharMapCJK ((char) w, ref category);
3152                         }
3153                 }
3154
3155                 // For now it is only for 0x7 category.
                     //
                     // Variant of AddCharMapGroup with these visible differences:
                     //   - there is no early return when c is already defined
                     //   - c itself is added with `updateCount` as its increment
                     //   - related characters are found by scanning all code points
                     //     for a one-character DecompositionCompat decomposition
                     //     equal to c, instead of using sameWeightItems
                     //   - U+2024 (small) and U+FE33 / U+FE34 (vertical) are
                     //     never added from here
3156                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3157                 {
                             // char.MinValue doubles as "no such variant".
3158                         char small = char.MinValue;
3159                         char vertical = char.MinValue;
3160                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3161                         if (nfkd != null) {
3162                                 object smv = nfkd [(byte) DecompositionSmall];
3163                                 if (smv != null)
3164                                         small = (char) ((int) smv);
3165                                 object vv = nfkd [(byte) DecompositionVertical];
3166                                 if (vv != null)
3167                                         vertical = (char) ((int) vv);
3168                         }
3169
3170                         // <small> updates index
                             // (three-argument overload: the small form gets alt 0,
                             // not `level2`)
3171                         if (small != char.MinValue)
3172                                 // SPECIAL CASE excluded (FIXME: why?)
3173                                 if (small != '\u2024')
3174                                         AddCharMap (small, category, updateCount);
3175
3176                         // itself
3177                         AddCharMap (c, category, updateCount, level2);
3178
3179                         // nfkdMap cannot hold two or more characters whose
3180                         // NFKD is the same character, so iterate over all here.
                             // The loop bound is exclusive, so U+FFFF is not examined.
3181                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3182                                 if (decompLength [c2] == 1 &&
3183                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3184                                         switch (decompType [c2]) {
3185                                         case DecompositionCompat:
3186                                                 AddCharMap ((char) c2, category, updateCount, level2);
3187                                                 break;
3188                                         }
3189                                 }
3190                         }
3191
3192                         if (vertical != char.MinValue)
3193                                 // SPECIAL CASE excluded (FIXME: why?)
3194                                 if (vertical != '\uFE33' && vertical != '\uFE34')
3195                                         AddCharMap (vertical, category, updateCount, level2);
3196                 }
3197
3198                 private void AddArabicCharMap (char c)
3199                 {
                             // Registers c, plus every character whose decomposition
                             // ENDS with c, in category 6 on one shared fillIndex value
                             // (all AddCharMap calls use increment 0), then advances
                             // fillIndex [6] by one.
3200                         byte category = 6;
3201                         byte updateCount = 1;
3202                         byte level2 = 0;
3203
3204                         // itself
3205                         AddCharMap (c, category, 0, level2);
3206
3207                         // nfkdMap cannot hold two or more characters whose
3208                         // NFKD is the same character, so iterate over all here.
                             // The loop bound is exclusive, so U+FFFF is not examined.
3209                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3210                                 if (decompLength [c2] == 0)
3211                                         continue;
                                     // Index of the last value of c2's decomposition.
3212                                 int idx = decompIndex [c2] + decompLength [c2] - 1;
3213                                 if ((int) (decompValues [idx]) == (int) c)
3214                                         AddCharMap ((char) c2, category,
3215                                                 0, level2);
3216                         }
                             // Unconditional: the index moves even if nothing was added.
3217                         fillIndex [category] += updateCount;
3218                 }
3219
3220                 char ToFullWidth (char c)
3221                 {
                             // Returns the first value of c's decomposition when c's
                             // decomposition type is DecompositionFull; otherwise c.
                             // NOTE(review): despite the name, this maps a character
                             // that carries that decomposition to its target, not the
                             // other way round (see ToDecomposed).
3222                         return ToDecomposed (c, DecompositionFull, false);
3223                 }
3224
3225                 char ToFullWidthTail (char c)
3226                 {
                             // Returns the last value of c's decomposition when c's
                             // decomposition type is DecompositionFull; otherwise c.
3227                         return ToDecomposed (c, DecompositionFull, true);
3228                 }
3229
3230                 char ToSmallForm (char c)
3231                 {
                             // Returns the first value of c's decomposition when c's
                             // decomposition type is DecompositionSmall; otherwise c.
                             // NOTE(review): despite the name, this maps a character
                             // that carries that decomposition to its target, not the
                             // other way round (see ToDecomposed).
3232                         return ToDecomposed (c, DecompositionSmall, false);
3233                 }
3234
3235                 char ToSmallFormTail (char c)
3236                 {
                             // Returns the last value of c's decomposition when c's
                             // decomposition type is DecompositionSmall; otherwise c.
3237                         return ToDecomposed (c, DecompositionSmall, true);
3238                 }
3239
3240                 char ToDecomposed (char c, byte d, bool tail)
3241                 {
                             // Looks up one value of c's decomposition.
                             //   c    - character to look up
                             //   d    - required decomposition type; when decompType [c]
                             //          differs, c is returned unchanged
                             //   tail - false: return the first decomposition value,
                             //          true: return the last one
3242                         if (decompType [(int) c] != d)
3243                                 return c;
                             // decompIndex [c] is the start of c's run in decompValues,
                             // decompLength [c] the number of values in that run.
3244                         int idx = decompIndex [(int) c];
3245                         if (tail)
3246                                 idx += decompLength [(int) c] - 1;
3247                         return (char) decompValues [idx];
3248                 }
3249
3250                 bool ExistsJIS (int cp)
3251                 {
                             // True when some entry of jisJapanese has CP equal to cp.
                             // Linear scan over the whole collection on every call.
3252                         foreach (JISCharacter j in jisJapanese)
3253                                 if (j.CP == cp)
3254                                         return true;
3255                         return false;
3256                 }
3257
3258                 #endregion
3259
3260                 #region Level 3 properties (Case/Width)
3261
3262                 private byte ComputeLevel3Weight (char c)
3263                 {
                             // Level 3 weight of c: the raw value from
                             // ComputeLevel3WeightRaw plus 2, except that a raw 0 is
                             // returned as 0.
3264                         byte b = ComputeLevel3WeightRaw (c);
3265                         return b > 0 ? (byte) (b + 2) : b;
3266                 }
3267
// Computes the raw level 3 (case/width) weight of c, i.e. the value before
// ComputeLevel3Weight adds 2 to non-zero results.
//
// Fixed weights for specific ranges and characters are tried first, in
// order. When none of them applies, the weight is assembled as a bit mask
// from the decomposition type table (decompType), the small-capital table
// (isSmallCapital) and the uppercase table (isUppercase).
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
        int cp = (int) c;

        // CJK compat
        if (cp >= 0x3192 && cp <= 0x319F)
                return 0;

        // Japanese reading marks
        if (cp == 0x3001 || cp == 0x3002)
                return 2;

        // Korean
        if (cp >= 0x11A8 && cp <= 0x11F9)
                return 2;
        if (cp >= 0xFFA0 && cp <= 0xFFDC)
                return 4;
        if (cp >= 0x3130 && cp <= 0x3164)
                return 5;
        if (cp >= 0x3165 && cp <= 0x318E)
                return 4;

        // Georgian Capital letters
        if (cp >= 0x10A0 && cp <= 0x10C5)
                return 0x10;

        // numbers
        if (cp >= 0x2776 && cp <= 0x277F)
                return 4;
        if (cp >= 0x2780 && cp <= 0x2789)
                return 8;
        // U+2776..U+2789 have already returned above, so only
        // U+278A..U+2793 can still get this weight.
        if (cp >= 0x278A && cp <= 0x2793)
                return 0xC;
        if (cp >= 0x2160 && cp <= 0x216F)
                return 0x10;
        if (cp == 0x2181 || cp == 0x2182)
                return 0x18;

        // U+2135..U+2138 (ALEF/BET/GIMEL/DALET symbols)
        if (cp >= 0x2135 && cp <= 0x2138)
                return 4;

        // Arabic presentation forms: 2(Isolated)/8(Final)/0x18(Medial).
        // Any other decomposition type falls through to the generic rules.
        if (cp >= 0xFE80 && cp < 0xFF00) {
                if (decompType [cp] == DecompositionIsolated)
                        return 2;
                if (decompType [cp] == DecompositionFinal)
                        return 8;
                if (decompType [cp] == DecompositionMedial)
                        return 0x18;
        }

        byte weight = 0;
        switch (cp) {
        // actually I dunno the reason why they have weights.
        case 0x01BC:
                return 0x10;
        case 0x06A9:
                return 0x20;
        case 0x06AA:
                return 0x28;
        // These only seed the mask; the generic flags below are still
        // OR-ed on top of it.
        case 0x03C2:
        case 0x2104:
        case 0x212B:
                weight = 8;
                break;
        case 0xFE42:
                weight = 0xC;
                break;
        }

        // misc: <wide>, <sub> and <super> contribute their own
        // decomposition type value to the mask.
        if (decompType [cp] == DecompositionWide
                || decompType [cp] == DecompositionSub
                || decompType [cp] == DecompositionSuper)
                weight |= decompType [cp];

        if (isSmallCapital [cp]) // grep "SMALL CAPITAL"
                weight |= 8;
        if (isUppercase [cp]) // DerivedCoreProperties
                weight |= 0x10;

        return weight;
}
3351
3352                 #endregion
3353
3354                 #region IsIgnorable
3355 /*
3356                 static bool IsIgnorable (int i)
3357                 {
3358                         if (unicodeAge [i] >= 3.1)
3359                                 return true;
3360                         switch (char.GetUnicodeCategory ((char) i)) {
3361                         case UnicodeCategory.OtherNotAssigned:
3362                         case UnicodeCategory.Format:
3363                                 return true;
3364                         }
3365                         return false;
3366                 }
3367 */
3368
3369                 // FIXME: In the future use DerivedAge.txt to examine character
3370                 // versions and set those ones that have higher version than
3371                 // 1.0 as ignorable.
// Inclusive [first, last] code point pairs that IsIgnorable treats as
// ignorable as a whole. The per-character exceptions inside IsIgnorable
// are consulted before this table.
static readonly int [] ignorableRangeBounds = new int [] {
        0x0D82, 0x0DF4, // The whole Sinhala characters.
        0x0F00, 0x0FD1, // The whole Tibetan characters.
        0x1000, 0x1059, // The whole Myanmar characters.
        // The whole Ethiopic, Cherokee, Canadian Syllabics, Ogham,
        // Runic, Tagalog, Hanunoo, Philippine, Buhid, Tagbanwa,
        // Khmer and Mongolian characters.
        0x1200, 0x1DFF,
        0x1F00, 0x1FFF, // Greek extension characters.
        0x2800, 0x28FF, // The whole Braille characters.
        0x2E80, 0x2EF3, // CJK radical characters.
        0x2F00, 0x2FD5, // Kangxi radical characters.
        0x2FF0, 0x2FFB, // Ideographic description characters.
        0x31A0, 0x31B7, // Bopomofo letter and final
        0x25F0, 0x25F7, // White square with quadrant characters.
        // Ideographic telegraph symbols.
        0x32C0, 0x32CB,
        0x3358, 0x3370,
        0x33E0, 0x33FF,
        // The whole YI characters.
        0xA000, 0xA48C,
        0xA490, 0xA4C6,
        0xFB13, 0xFB17, // Armenian small ligatures
        0xFB1D, 0xFE2F, // hebrew, arabic, variation selector.
        0xFEF5, 0xFEFC, // Arabic ligatures.
        // FIXME: why are they excluded?
        0x01F6, 0x01F9,
        0x0218, 0x0233,
        0x02A9, 0x02AD,
        0x02EA, 0x02EE,
        0x0349, 0x036F,
        0x0488, 0x048F,
        0x04D0, 0x04FF,
        0x0500, 0x050F, // actually it matters only for 2.0
        0x06D6, 0x06ED,
        0x06FA, 0x06FE,
        0x2048, 0x204D,
        0x20E4, 0x20EA,
        0x213C, 0x214B,
        0x21EB, 0x21FF,
        0x22F2, 0x22FF,
        0x237B, 0x239A,
        0x239B, 0x23CF,
        0x24EB, 0x24FF,
        0x2596, 0x259F,
        0x25F8, 0x25FF,
        0x2672, 0x2689,
        0x2768, 0x2775,
        0x27D0, 0x27FF,
        0x2900, 0x2AFF,
        0x3033, 0x303F,
        0x31F0, 0x31FF,
        0x3250, 0x325F,
        0x32B1, 0x32BF,
        0x3371, 0x337B,
        0xFA30, 0xFA6A
};

// Tells whether code point i is treated as completely ignorable.
//
// Decision order:
//   1. explicit per-character lists (forced ignorable / forced kept),
//   2. the ignorableRangeBounds table,
//   3. the Unicode category reported by Char.GetUnicodeCategory: only
//      Format and OtherNotAssigned are ignorable.
static bool IsIgnorable (int i)
{
        switch (i) {
        case 0x0000:
        // I guess, those characters are added between
        // Unicode 1.0 (LCMapString) and Unicode 3.1
        // (UnicodeCategory), so they used to be
        // something like OtherNotAssigned as of Unicode 1.1.
        case 0x02DF:
        case 0x0387:
        case 0x03D7: case 0x03D8: case 0x03D9:
        case 0x03F3: case 0x03F4: case 0x03F5: case 0x03F6:
        case 0x0400: case 0x040D:
        case 0x0450: case 0x045D:
        case 0x0587: case 0x058A:
        case 0x05C4:
        case 0x0640:
        case 0x0653: case 0x0654: case 0x0655:
        case 0x066D:
        case 0x0B56:
        case 0x1E9B:
        case 0x202F:
        case 0x20AD: case 0x20AE: case 0x20AF:
        case 0x20E2: case 0x20E3:
        case 0x2139: case 0x213A:
        case 0x2183:
        case 0x2425: case 0x2426:
        case 0x2619:
        case 0x2670: case 0x2671:
        case 0x3007:
        case 0x3190: case 0x3191:
        case 0xFFFC: case 0xFFFD:
                return true;

        // Exceptions to the rules that follow: the ranges in the table
        // are too broad (or the category rule would catch them), yet
        // these characters must not be ignored.
        case 0x04D8: case 0x04D9:
        case 0x04E8: case 0x04E9:
        case 0x070F:
        case 0x3036: case 0x303F:
        case 0x337B:
        case 0xFB1E:
                return false;
        }

        // The table stores pairs, so step two slots at a time.
        for (int n = 0; n < ignorableRangeBounds.Length; n += 2) {
                if (ignorableRangeBounds [n] <= i
                        && i <= ignorableRangeBounds [n + 1])
                        return true;
        }

        // Everything else (including PrivateUse and Surrogate) is kept
        // unless it is ignorable by nature.
        UnicodeCategory category = Char.GetUnicodeCategory ((char) i);
        return category == UnicodeCategory.Format
                || category == UnicodeCategory.OtherNotAssigned;
}
3496
3497                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3498
3499                 /*
3500                 public static void Main ()
3501                 {
3502                         for (int i = 0; i <= char.MaxValue; i++)
3503                                 Dump (i, IsIgnorable (i));
3504                 }
3505
3506                 static void Dump (int i, bool ignore)
3507                 {
3508                         switch (Char.GetUnicodeCategory ((char) i)) {
3509                         case UnicodeCategory.PrivateUse:
3510                         case UnicodeCategory.Surrogate:
3511                                 return; // check nothing
3512                         }
3513
3514                         string s1 = "";
3515                         string s2 = new string ((char) i, 10);
3516                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3517                         if ((ret == 0) == ignore)
3518                                 return;
3519                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3520                 }
3521                 */
3522                 #endregion // IsIgnorable
3523
3524                 #region IsIgnorableSymbol
// Tells whether code point i is to be dropped when symbols are ignored.
//
// Anything IsIgnorable already rejects is ignorable here as well. After
// that, explicit per-character lists override the general rules, which
// are driven by the Unicode category and the Char.IsXxx classifiers.
static bool IsIgnorableSymbol (int i)
{
        if (IsIgnorable (i))
                return true;

        switch (i) {
        // *Letter
        case 0x00B5:
        case 0x01C0: case 0x01C1: case 0x01C2: case 0x01C3:
        case 0x01F6: case 0x01F7: case 0x01F8: case 0x01F9:
        case 0x02D0: case 0x02EE:
        case 0x037A:
        case 0x03D7: case 0x03F3:
        case 0x0400: case 0x040D:
        case 0x0450: case 0x045D:
        case 0x048C: case 0x048D: case 0x048E: case 0x048F:
        case 0x0587:
        case 0x0640:
        case 0x06E5: case 0x06E6:
        case 0x06FA: case 0x06FB: case 0x06FC:
        case 0x093D: case 0x0950:
        case 0x1E9B:
        case 0x2139:
        case 0x3006:
        case 0x3033: case 0x3034: case 0x3035:
        case 0xFE7E: case 0xFE7F:
        // OtherNumber
        case 0x16EE: case 0x16EF: case 0x16F0:
        // LetterNumber
        case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
        case 0x3007: // IDEOGRAPHIC NUMBER ZERO
        case 0x3038: // HANGZHOU NUMERAL TEN
        case 0x3039: // HANGZHOU NUMERAL TWENTY
        case 0x303A: // HANGZHOU NUMERAL THIRTY
        // OtherSymbol
        case 0x2117:
        case 0x327F:
                return true;

        // ModifierSymbol
        case 0x02B9: case 0x02BA:
        case 0x02C2: case 0x02C3: case 0x02C4: case 0x02C5:
        case 0x02C8:
        case 0x02CC: case 0x02CD: case 0x02CE: case 0x02CF:
        case 0x02D2: case 0x02D3: case 0x02D4: case 0x02D5:
        case 0x02D6: case 0x02D7:
        case 0x02DE:
        case 0x02E5: case 0x02E6: case 0x02E7: case 0x02E8:
        case 0x02E9:
        case 0x309B: case 0x309C:
        // OtherPunctuation
        case 0x055A: // Armenian Apos
        case 0x05C0: // Hebrew Punct
        case 0x0E4F: // Thai FONGMAN
        case 0x0E5A: // Thai ANGKHANKHU
        case 0x0E5B: // Thai KHOMUT
        // CurrencySymbol
        case 0x09F2: // Bengali Rupee Mark
        case 0x09F3: // Bengali Rupee Sign
        // MathSymbol
        case 0x221E: // INF.
        // OtherSymbol
        case 0x0482:
        case 0x09FA:
        case 0x0B70:
                return false;
        }

        // *Letter
        if (0xFE70 <= i && i < 0xFE7C) // ARABIC LIGATURES B
                return true;
#if NET_2_0
        if (0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
                || 0xFA30 <= i && i < 0xFA70) // CJK COMPAT
                return true;
#endif

        char ch = (char) i;
        UnicodeCategory category = Char.GetUnicodeCategory (ch);

        if (category == UnicodeCategory.Surrogate)
                return false; // inconsistent

        if (category == UnicodeCategory.SpacingCombiningMark
                || category == UnicodeCategory.EnclosingMark
                || category == UnicodeCategory.NonSpacingMark
                || category == UnicodeCategory.PrivateUse)
                // Of these, only the Arabic non-spacing marks
                // U+064B..U+0652 are ignorable.
                return 0x064B <= i && i <= 0x0652;

        if (category == UnicodeCategory.Format
                || category == UnicodeCategory.OtherNotAssigned)
                return true;

        // OtherSymbols that are nevertheless kept. Inside these ranges
        // the answer is "not ignorable" whether or not the character is
        // a letter or digit, so the two tests are folded together.
        bool inKeptSymbolRange =
                // latin in a circle
                0x249A <= i && i <= 0x24E9
                || 0x2100 <= i && i <= 0x2132
                // Japanese
                || 0x3196 <= i && i <= 0x31A0
                // Korean
                || 0x3200 <= i && i <= 0x321C
                // Chinese/Japanese
                || 0x322A <= i && i <= 0x3243
                // CJK
                || 0x3260 <= i && i <= 0x32B0
                || 0x32D0 <= i && i <= 0x3357
                || 0x337B <= i && i <= 0x33DD;

        // This "Digit" rule is mystery.
        // It filters some symbols out.
        if (inKeptSymbolRange || Char.IsLetterOrDigit (ch))
                return false;
        if (Char.IsNumber (ch))
                return false;

        // FIXME: should check more
        return Char.IsControl (ch)
                || Char.IsSeparator (ch)
                || Char.IsPunctuation (ch)
                || Char.IsSymbol (ch);
}
3653
3654                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3655 /*
3656                 public static void Main ()
3657                 {
3658                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3659                         for (int i = 0; i <= char.MaxValue; i++) {
3660                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3661                                 if (uc == UnicodeCategory.Surrogate)
3662                                         continue;
3663
3664                                 bool ret = IsIgnorableSymbol (i);
3665
3666                                 string s1 = "TEST ";
3667                                 string s2 = "TEST " + (char) i;
3668
3669                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3670
3671                                 if (ret != (result == 0))
3672                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3673                                                 ret ? "should not ignore" :
3674                                                         "should ignore",
3675                                                 i,(char) i, uc);
3676                         }
3677                 }
3678 */
3679                 #endregion
3680
3681                 #region NonSpacing
// Tells whether code point i is to be dropped when non-spacing marks are
// ignored.
//
// Anything IsIgnorable already rejects is ignorable here as well. Explicit
// characters and ranges are then forced one way or the other, and whatever
// is left is ignorable exactly when its Unicode category is NonSpacingMark.
static bool IsIgnorableNonSpacing (int i)
{
        if (IsIgnorable (i))
                return true;

        switch (i) {
        // Forced ignorable.
        case 0x02C8: case 0x02DE:
        case 0x0559: case 0x055A:
        case 0x05C0:
        case 0x0ABD:
        case 0x0CD5: case 0x0CD6:
        case 0x309B: case 0x309C:
        case 0xFF9E: case 0xFF9F:
                return true;

        // Forced not ignorable.
        case 0x02D0:
        case 0x0670:
        case 0x0901: case 0x0902: case 0x094D:
        case 0x0962: case 0x0963:
        case 0x0A41: case 0x0A42: case 0x0A47: case 0x0A48:
        case 0x0A4B: case 0x0A4C:
        case 0x0A81: case 0x0A82:
        case 0x0B82: case 0x0BC0:
        case 0x0CBF: case 0x0CC6: case 0x0CCC: case 0x0CCD:
        case 0x0E4E:
                return false;
        }

        bool forcedIgnorable =
                (0x02B9 <= i && i <= 0x02C5)
                || (0x02CC <= i && i <= 0x02D7)
                || (0x02E4 <= i && i <= 0x02EF)
                || (0x20DD <= i && i <= 0x20E0);
        if (forcedIgnorable)
                return true;

        bool forcedKept =
                (0x064B <= i && i <= 0x0652)
                || (0x0941 <= i && i <= 0x0948)
                || (0x0AC1 <= i && i <= 0x0ACD)
                || (0x0C3E <= i && i <= 0x0C4F)
                || (0x0E31 <= i && i <= 0x0E3F);
        if (forcedKept)
                return false;

        UnicodeCategory category = Char.GetUnicodeCategory ((char) i);
        return category == UnicodeCategory.NonSpacingMark;
}
3719
3720                 // We can reuse IsIgnorableSymbol testcode
3721                 // for IsIgnorableNonSpacing.
3722                 #endregion
3723         }
3724
// Category / level 1 / level 2 byte triple, plus a flag telling whether
// the entry was explicitly constructed. A default-initialized entry has
// Defined == false; the three-argument constructor sets it to true.
struct CharMapEntry
{
        public byte Category;
        public byte Level1;
        public byte Level2; // It is always single byte.
        public bool Defined;

        // Builds a defined entry from the three given bytes.
        public CharMapEntry (byte category, byte level1, byte level2)
        {
                this.Defined = true;
                this.Category = category;
                this.Level1 = level1;
                this.Level2 = level2;
        }
}
3740
// Immutable pair of two integer code values: CP and JIS.
class JISCharacter
{
        public readonly int CP;
        public readonly int JIS;

        // Stores cp in CP and cpJIS in JIS.
        public JISCharacter (int cp, int cpJIS)
        {
                this.CP = cp;
                this.JIS = cpJIS;
        }
}
3752
// Orders JISCharacter instances by their JIS value, ascending.
class JISComparer : IComparer
{
        public static readonly JISComparer Instance =
                new JISComparer ();

        // Returns a negative value, zero or a positive value when o1.JIS
        // is less than, equal to or greater than o2.JIS. Both arguments
        // must be JISCharacter instances (InvalidCastException otherwise).
        public int Compare (object o1, object o2)
        {
                JISCharacter left = (JISCharacter) o1;
                JISCharacter right = (JISCharacter) o2;
                // CompareTo instead of subtraction: "left.JIS - right.JIS"
                // wraps around for operands far apart and would then
                // report the wrong sign.
                return left.JIS.CompareTo (right.JIS);
        }
}
3765
// Immutable pair of an integer code value (CP) and a name string.
class NonJISCharacter
{
        public readonly int CP;
        public readonly string Name;

        // Stores cp in CP and name in Name.
        public NonJISCharacter (int cp, string name)
        {
                this.CP = cp;
                this.Name = name;
        }
}
3777
// Orders NonJISCharacter instances by an ordinal comparison of their Name.
class NonJISComparer : IComparer
{
        public static readonly NonJISComparer Instance =
                new NonJISComparer ();

        // Both arguments must be NonJISCharacter instances; the result is
        // whatever String.CompareOrdinal reports for the two names.
        public int Compare (object o1, object o2)
        {
                NonJISCharacter left = (NonJISCharacter) o1;
                NonJISCharacter right = (NonJISCharacter) o2;
                return String.CompareOrdinal (left.Name, right.Name);
        }
}
3790
// Orders DictionaryEntry values whose Value is a boxed decimal and whose
// Key is a boxed int: primarily by Value, and by Key when the values tie.
class DecimalDictionaryValueComparer : IComparer
{
        public static readonly DecimalDictionaryValueComparer Instance
                = new DecimalDictionaryValueComparer ();

        // Singleton: use Instance.
        private DecimalDictionaryValueComparer ()
        {
        }

        // Returns a negative value, zero or a positive value following
        // the usual IComparer convention. Both arguments must be
        // DictionaryEntry values (InvalidCastException otherwise).
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // FIXME: in case of 0, compare decomposition categories
                int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
                if (ret != 0)
                        return ret;
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                // CompareTo instead of "i1 - i2": the subtraction wraps
                // around for keys far apart and then yields the wrong sign.
                return i1.CompareTo (i2);
        }
}
3813
// Orders DictionaryEntry values whose Value is a string and whose Key is
// a boxed int: primarily by Value, and by Key when the values tie.
class StringDictionaryValueComparer : IComparer
{
        public static readonly StringDictionaryValueComparer Instance
                = new StringDictionaryValueComparer ();

        // Singleton: use Instance.
        private StringDictionaryValueComparer ()
        {
        }

        // Returns a negative value, zero or a positive value following
        // the usual IComparer convention. Both arguments must be
        // DictionaryEntry values (InvalidCastException otherwise).
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // NOTE(review): String.Compare (string, string) is
                // culture-sensitive; left as is because changing it would
                // alter the generated ordering -- confirm it is intended.
                int ret = String.Compare ((string) e1.Value, (string) e2.Value);
                if (ret != 0)
                        return ret;
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                // CompareTo instead of "i1 - i2": the subtraction wraps
                // around for keys far apart and then yields the wrong sign.
                return i1.CompareTo (i2);
        }
}
3835
// Orders boxed chars by the sort keys CollationElementTable reports for
// them. Keys are compared position by position (Primary, then Secondary,
// Thirtiary and Quarternary); if all shared positions tie, the character
// with fewer sort keys sorts first.
class UCAComparer : IComparer
{
        public static readonly UCAComparer Instance
                = new UCAComparer ();

        // Singleton: use Instance.
        private UCAComparer ()
        {
        }

        // Both arguments must be boxed char values.
        public int Compare (object o1, object o2)
        {
                char c1 = (char) o1;
                char c2 = (char) o2;

                int count1 = CollationElementTable.GetSortKeyCount (c1);
                int count2 = CollationElementTable.GetSortKeyCount (c2);
                int shared = Math.Min (count1, count2);

                for (int pos = 0; pos < shared; pos++) {
                        SortKeyValue k1 = CollationElementTable.GetSortKey (c1, pos);
                        SortKeyValue k2 = CollationElementTable.GetSortKey (c2, pos);

                        // The first level that differs decides.
                        int diff = k1.Primary - k2.Primary;
                        if (diff == 0)
                                diff = k1.Secondary - k2.Secondary;
                        if (diff == 0)
                                diff = k1.Thirtiary - k2.Thirtiary;
                        if (diff == 0)
                                diff = k1.Quarternary - k2.Quarternary;
                        if (diff != 0)
                                return diff;
                }
                return count1 - count2;
        }
}
3873
3874         class Tailoring
3875         {
3876                 int lcid;
3877                 int alias;
3878                 bool frenchSort;
3879                 ArrayList items = new ArrayList ();
3880
3881                 public Tailoring (int lcid)
3882                         : this (lcid, 0)
3883                 {
3884                 }
3885
3886                 public Tailoring (int lcid, int alias)
3887                 {
3888                         this.lcid = lcid;
3889                         this.alias = alias;
3890                 }
3891
3892                 public int LCID {
3893                         get { return lcid; }
3894                 }
3895
3896                 public int Alias {
3897                         get { return alias; }
3898                 }
3899
3900                 public bool FrenchSort {
3901                         get { return frenchSort; }
3902                         set { frenchSort = value; }
3903                 }
3904
3905                 public void AddDiacriticalMap (byte target, byte replace)
3906                 {
3907                         items.Add (new DiacriticalMap (target, replace));
3908                 }
3909
3910                 public void AddSortKeyMap (string source, byte [] sortkey)
3911                 {
3912                         items.Add (new SortKeyMap (source, sortkey));
3913                 }
3914
3915                 public void AddReplacementMap (string source, string replace)
3916                 {
3917                         items.Add (new ReplacementMap (source, replace));
3918                 }
3919
3920                 public char [] ItemToCharArray ()
3921                 {
3922                         ArrayList al = new ArrayList ();
3923                         foreach (ITailoringMap m in items)
3924                                 al.AddRange (m.ToCharArray ());
3925                         return al.ToArray (typeof (char)) as char [];
3926                 }
3927
3928                 interface ITailoringMap
3929                 {
3930                         char [] ToCharArray ();
3931                 }
3932
3933                 class DiacriticalMap : ITailoringMap
3934                 {
3935                         public readonly byte Target;
3936                         public readonly byte Replace;
3937
3938                         public DiacriticalMap (byte target, byte replace)
3939                         {
3940                                 Target = target;
3941                                 Replace = replace;
3942                         }
3943
3944                         public char [] ToCharArray ()
3945                         {
3946                                 char [] ret = new char [3];
3947                                 ret [0] = (char) 02; // kind:DiacriticalMap
3948                                 ret [1] = (char) Target;
3949                                 ret [2] = (char) Replace;
3950                                 return ret;
3951                         }
3952                 }
3953
3954                 class SortKeyMap : ITailoringMap
3955                 {
3956                         public readonly string Source;
3957                         public readonly byte [] SortKey;
3958
3959                         public SortKeyMap (string source, byte [] sortkey)
3960                         {
3961                                 Source = source;
3962                                 SortKey = sortkey;
3963                         }
3964
3965                         public char [] ToCharArray ()
3966                         {
3967                                 char [] ret = new char [Source.Length + 7];
3968                                 ret [0] = (char) 01; // kind:SortKeyMap
3969                                 for (int i = 0; i < Source.Length; i++)
3970                                         ret [i + 1] = Source [i];
3971                                 // null terminate
3972                                 for (int i = 0; i < 4; i++)
3973                                         ret [i + Source.Length + 2] = (char) SortKey [i];
3974                                 return ret;
3975                         }
3976                 }
3977
3978                 class ReplacementMap : ITailoringMap
3979                 {
3980                         public readonly string Source;
3981                         public readonly string Replace;
3982
3983                         public ReplacementMap (string source, string replace)
3984                         {
3985                                 Source = source;
3986                                 Replace = replace;
3987                         }
3988
3989                         public char [] ToCharArray ()
3990                         {
3991                                 char [] ret = new char [Source.Length + Replace.Length + 3];
3992                                 ret [0] = (char) 03; // kind:ReplaceMap
3993                                 int pos = 1;
3994                                 for (int i = 0; i < Source.Length; i++)
3995                                         ret [pos++] = Source [i];
3996                                 // null terminate
3997                                 pos++;
3998                                 for (int i = 0; i < Replace.Length; i++)
3999                                         ret [pos++] = Replace [i];
4000                                 // null terminate
4001                                 return ret;
4002                         }
4003                 }
4004         }
4005 }