mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those that are computed and those that are laid out
   4 // as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, for composite characters it should prepare a different index table.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27 #define Binary
  28
  29 using System;
  30 using System.IO;
  31 using System.Collections;
  32 using System.Globalization;
  33 using System.Text;
  34 using System.Xml;
  35
  36 namespace Mono.Globalization.Unicode
  37 {
  38         internal class MSCompatSortKeyTableGenerator
  39         {
  40                 public static void Main (string [] args)
  41                 {
  42                         new MSCompatSortKeyTableGenerator ().Run (args);
  43                 }
  44
                     // Numeric codes for decomposition kinds. Serialize compares the entries
                     // of decompType against these values (Narrow, Wide, Super and Sub select
                     // the single-character width-compatibility mapping).
                     // NOTE(review): presumably they correspond to the decomposition tags of
                     // UnicodeData.txt - confirm against the parser.
                     // Note that Full (0xE) is declared before Narrow (0xD); the values are
                     // otherwise in ascending order.
  45                 const int DecompositionWide = 1; // fixed
  46                 const int DecompositionSub = 2; // fixed
  47                 const int DecompositionSmall = 3;
  48                 const int DecompositionIsolated = 4;
  49                 const int DecompositionInitial = 5;
  50                 const int DecompositionFinal = 6;
  51                 const int DecompositionMedial = 7;
  52                 const int DecompositionNoBreak = 8;
  53                 const int DecompositionVertical = 9;
  54                 const int DecompositionFraction = 0xA;
  55                 const int DecompositionFont = 0xB;
  56                 const int DecompositionSuper = 0xC; // fixed
  57                 const int DecompositionFull = 0xE;
  58                 const int DecompositionNarrow = 0xD;
  59                 const int DecompositionCircle = 0xF;
  60                 const int DecompositionSquare = 0x10;
  61                 const int DecompositionCompat = 0x11;
  62                 const int DecompositionCanonical = 0x12;
  63
                     // Destination of the generated C# source text; standard output by default.
  64                 TextWriter Result = Console.Out;
  65
  66                 byte [] fillIndex = new byte [256]; // by category
                     // One entry per UTF-16 code unit. Serialize reads Category, Level1 and
                     // Level2 from each entry.
  67                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  68
                     // Characters that receive special "ignore" treatment.
                     // NOTE(review): the exact treatment is implemented outside this part of
                     // the file - confirm where this list is consumed.
  69                 char [] specialIgnore = new char [] {
  70                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  71                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  72                         };
  73
  74                 // FIXME: need more love (as always)
                     // alphabets and alphaWeights both hold 29 entries (A-Z plus three
                     // extra letters).
                     // NOTE(review): presumably alphaWeights [i] is the weight assigned to
                     // alphabets [i] - confirm at the point of use.
  75                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  76                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  77                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  78                         '\u0292', '\u01BE', '\u0298'};
  79                 byte [] alphaWeights = new byte [] {
  80                         2, 9, 0xA, 0x1A, 0x21,
  81                         0x23, 0x25, 0x2C, 0x32, 0x35,
  82                         0x36, 0x48, 0x51, 0x70, 0x7C,
  83                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  84                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  85                         0xA9, 0xAA, 0xB3, 0xB4};
  86
                     // Per-code-unit flags, indexed by character value.
  87                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  88                 bool [] isUppercase = new bool [char.MaxValue + 1];
  89
                     // Decomposition data, indexed by character value. decompType holds one of
                     // the Decomposition* codes; decompIndex is an offset into decompValues
                     // (Serialize reads decompValues [decompIndex [i]]). decompValues has no
                     // initializer here and must be assigned before Serialize runs.
  90                 byte [] decompType = new byte [char.MaxValue + 1];
  91                 int [] decompIndex = new int [char.MaxValue + 1];
  92                 int [] decompLength = new int [char.MaxValue + 1];
  93                 int [] decompValues;
  94                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  95
                     // One byte per code unit.
                     // NOTE(review): presumably the diacritic weight of each character - confirm.
  96                 byte [] diacritical = new byte [char.MaxValue + 1];
  97
                     // Character-name fragments that identify a diacritic (or a circled /
                     // parenthesized form). Several fragments end with ';'.
                     // NOTE(review): presumably these are matched against the name field of
                     // UnicodeData.txt lines, where ';' is the field separator - confirm.
                     // This array has 87 entries, the same number as diacriticWeights below;
                     // keep the two in step when editing either one.
  98                 string [] diacritics = new string [] {
  99                         // LATIN, CYRILLIC etc.
 100                         "UPTURN", "DOUBLE-STRUCK",
 101                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
 102                         "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
 103                         "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
 104                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 105                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
 106                         "WITH OGONEK;", "WITH CEDILLA;",
 107                         //
 108                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 109                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
 110                         "STROKE OVERLAY",
 111                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 112                         " DIAERESIS AND GRAVE;",
 113                         " BREVE AND ACUTE;",
 114                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 115                         " MACRON AND ACUTE;",
 116                         " MACRON AND GRAVE;",
 117                         //
 118                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 119                         " RING ABOVE AND ACUTE",
 120                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 121                         " CIRCUMFLEX AND TILDE",
 122                         " TILDE AND DIAERESIS",
 123                         " STROKE AND ACUTE",
 124                         " BREVE AND TILDE",
 125                         " CEDILLA AND BREVE",
 126                         " OGONEK AND MACRON",
 127                         //
 128                         "WITH OVERLINE",
 129                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 130                         " DOUBLE GRAVE",
 131                         " INVERTED BREVE",
 132                         "ROMAN NUMERAL",
 133                         " PRECEDED BY APOSTROPHE",
 134                         "WITH HORN;",
 135                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 136                         " PALATAL HOOK",
 137                         " DOT BELOW;",
 138                         " RETROFLEX;", "DIAERESIS BELOW",
 139                         " RING BELOW",
 140                         //
 141                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 142                         " BREVE BELOW;", " HORN AND GRAVE",
 143                         " TILDE BELOW",
 144                         " TOPBAR",
 145                         " DOT BELOW AND DOT ABOVE",
 146                         " RIGHT HALF RING", " HORN AND TILDE",
 147                         " CIRCUMFLEX AND DOT BELOW",
 148                         " BREVE AND DOT BELOW",
 149                         " DOT BELOW AND MACRON",
 150                         " TONE TWO",
 151                         " HORN AND HOOK ABOVE",
 152                         " HORN AND DOT",
 153                         // CIRCLED, PARENTHESIZED and so on
 154                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
 155                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
 156                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 157                         };
                     // Weight values accompanying the fragments above (87 entries, grouped
                     // with the same "//" separators as the diacritics array).
                     // NOTE(review): presumably diacriticWeights [i] belongs to diacritics [i] -
                     // confirm at the point of use.
 158                 byte [] diacriticWeights = new byte [] {
 159                         // LATIN.
 160                         3, 3, 5, 5,
 161                         0xF, 0xE, 0x12,
 162                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 163                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
 164                         //
 165                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 166                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 167                         //
 168                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 169                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 170                         //
 171                         0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
 172                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 173                         //
 174                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
 175                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 176                         0x87, 0x95, 0xAA,
 177                         // CIRCLED, PARENTHESIZED and so on.
 178                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
 179                         0xF3, 0xF3, 0xF3
 180                         };
 181
                     // Code point boundaries listed in ascending order (28 values).
                     // NOTE(review): presumably consecutive pairs delimit the digit ranges of
                     // individual scripts for secondary-weight assignment - confirm at the
                     // point of use.
 182                 int [] numberSecondaryWeightBounds = new int [] {
 183                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 184                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 185                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 186                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 187                         0xE50, 0xE60, 0xED0, 0xEE0
 188                         };
 189
                     // Script-specific character orderings. These four have no initializer
                     // here and are assigned elsewhere in the class.
 190                 char [] orderedGurmukhi;
 191                 char [] orderedGujarati;
 192                 char [] orderedGeorgian;
 193                 char [] orderedThaana;
 194
                     // Fixed ordering of 22 Tamil consonants (static and read-only, unlike
                     // the instance arrays above).
 195                 static readonly char [] orderedTamilConsonants = new char [] {
 196                         // based on traditional Tamil consonants, except for
 197                         // Grantha (where Microsoft breaks traditionalism).
 198                         // http://www.angelfire.com/empire/thamizh/padanGaL
 199                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
 200                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
 201                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
 202                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
 203                         '\u0BB7', '\u0BB9'};
 204
                     // Working collections filled while parsing. The "cp -> x" notes below
                     // describe the intended key -> value mapping of each collection
                     // (cp = code point).
 205                 // cp -> character name (only for some characters)
 206                 ArrayList sortableCharNames = new ArrayList ();
 207
 208                 // cp -> arrow value (int)
 209                 ArrayList arrowValues = new ArrayList ();
 210
 211                 // cp -> box value (int)
 212                 ArrayList boxValues = new ArrayList ();
 213
 214                 // cp -> level1 value
 215                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 216
 217                 // letterName -> cp
 218                 Hashtable arabicNameMap = new Hashtable ();
 219
 220                 // cp -> Hashtable [decompType] -> cp
 221                 Hashtable nfkdMap = new Hashtable ();
 222
 223                 // Latin letter -> ArrayList [int]
 224                 Hashtable latinMap = new Hashtable ();
 225
 226                 ArrayList jisJapanese = new ArrayList ();
 227                 ArrayList nonJisJapanese = new ArrayList ();
 228
                     // CJK weight tables, one slot per UTF-16 code unit. Serialize compresses
                     // them and dumps each one through SerializeCJK.
                     // NOTE(review): the trailing "- 0x4E00" / "- 0x3100" fragments look like
                     // leftovers from an earlier offset-based allocation - confirm before
                     // relying on them.
 229                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 230                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 231                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 232                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 233                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 234
                     // One flag byte per code unit; compressed and emitted first by Serialize.
 235                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 236
                     // Per-code-unit version number. This is a static field, unlike the
                     // instance tables around it.
                     // NOTE(review): presumably the Unicode version that introduced the
                     // character, loaded from DerivedAge.txt - confirm in ParseDerivedAge.
 237                 static double [] unicodeAge = new double [char.MaxValue + 1];
 238
                     // Tailoring objects; SerializeTailorings enumerates this list as Tailoring.
 239                 ArrayList tailorings = new ArrayList ();
 240
 241                 void Run (string [] args)
 242                 {
 243                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 244                         ParseSources (dirname);
 245                         Console.Error.WriteLine ("parse done.");
 246
 247                         ModifyParsedValues ();
 248                         GenerateCore ();
 249                         Console.Error.WriteLine ("generation done.");
 250                         Serialize ();
 251                         Console.Error.WriteLine ("serialization done.");
 252 /*
 253 StreamWriter sw = new StreamWriter ("agelog.txt");
 254 for (int i = 0; i < char.MaxValue; i++) {
 255 bool shouldBe = false;
 256 switch (Char.GetUnicodeCategory ((char) i)) {
 257 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 258         shouldBe = true; break;
 259 }
 260 if (unicodeAge [i] >= 3.1)
 261         shouldBe = true;
 262 //if (IsIgnorable (i) != shouldBe)
 263 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 264 }
 265 sw.Close ();
 266 */
 267                 }
 268
 269                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 270                 {
 271                         return (byte []) CodePointIndexer.CompressArray  (
 272                                 source, typeof (byte), i);
 273                 }
 274
 275                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 276                 {
 277                         return (ushort []) CodePointIndexer.CompressArray  (
 278                                 source, typeof (ushort), i);
 279                 }
 280
 281                 void Serialize ()
 282                 {
 283                         // Tailorings
 284                         SerializeTailorings ();
 285
 286                         byte [] categories = new byte [map.Length];
 287                         byte [] level1 = new byte [map.Length];
 288                         byte [] level2 = new byte [map.Length];
 289                         byte [] level3 = new byte [map.Length];
 290                         ushort [] widthCompat = new ushort [map.Length];
 291                         for (int i = 0; i < map.Length; i++) {
 292                                 categories [i] = map [i].Category;
 293                                 level1 [i] = map [i].Level1;
 294                                 level2 [i] = map [i].Level2;
 295                                 level3 [i] = ComputeLevel3Weight ((char) i);
 296                                 switch (decompType [i]) {
 297                                 case DecompositionNarrow:
 298                                 case DecompositionWide:
 299                                 case DecompositionSuper:
 300                                 case DecompositionSub:
 301                                         // they are always 1 char
 302                                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
 303                                         break;
 304                                 }
 305                         }
 306
 307                         // compress
 308                         ignorableFlags = CompressArray (ignorableFlags,
 309                                 MSCompatUnicodeTableUtil.Ignorable);
 310                         categories = CompressArray (categories,
 311                                 MSCompatUnicodeTableUtil.Category);
 312                         level1 = CompressArray (level1,
 313                                 MSCompatUnicodeTableUtil.Level1);
 314                         level2 = CompressArray (level2,
 315                                 MSCompatUnicodeTableUtil.Level2);
 316                         level3 = CompressArray (level3,
 317                                 MSCompatUnicodeTableUtil.Level3);
 318                         widthCompat = (ushort []) CodePointIndexer.CompressArray (
 319                                 widthCompat, typeof (ushort),
 320                                 MSCompatUnicodeTableUtil.WidthCompat);
 321                         cjkCHS = CompressArray (cjkCHS,
 322                                 MSCompatUnicodeTableUtil.CjkCHS);
 323                         cjkCHT = CompressArray (cjkCHT,
 324                                 MSCompatUnicodeTableUtil.Cjk);
 325                         cjkJA = CompressArray (cjkJA,
 326                                 MSCompatUnicodeTableUtil.Cjk);
 327                         cjkKO = CompressArray (cjkKO,
 328                                 MSCompatUnicodeTableUtil.Cjk);
 329                         cjkKOlv2 = CompressArray (cjkKOlv2,
 330                                 MSCompatUnicodeTableUtil.Cjk);
 331
 332                         // Ignorables
 333                         Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
 334 #if Binary
 335                         MemoryStream ms = new MemoryStream ();
 336                         BinaryWriter binary = new BinaryWriter (ms);
 337                         binary.Write (ignorableFlags.Length);
 338 #endif
 339                         for (int i = 0; i < ignorableFlags.Length; i++) {
 340                                 byte value = ignorableFlags [i];
 341                                 if (value < 10)
 342                                         Result.Write ("{0},", value);
 343                                 else
 344                                         Result.Write ("0x{0:X02},", value);
 345 #if Binary
 346                                 binary.Write (value);
 347 #endif
 348                                 if ((i & 0xF) == 0xF)
 349                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 350                         }
 351                         Result.WriteLine ("};");
 352                         Result.WriteLine ();
 353
 354                         // Primary category
 355                         Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
 356 #if Binary
 357                         binary.Write (categories.Length);
 358 #endif
 359                         for (int i = 0; i < categories.Length; i++) {
 360                                 byte value = categories [i];
 361                                 if (value < 10)
 362                                         Result.Write ("{0},", value);
 363                                 else
 364                                         Result.Write ("0x{0:X02},", value);
 365 #if Binary
 366                                 binary.Write (value);
 367 #endif
 368                                 if ((i & 0xF) == 0xF)
 369                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 370                         }
 371                         Result.WriteLine ("};");
 372                         Result.WriteLine ();
 373
 374                         // Primary weight value
 375                         Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
 376 #if Binary
 377                         binary.Write (level1.Length);
 378 #endif
 379                         for (int i = 0; i < level1.Length; i++) {
 380                                 byte value = level1 [i];
 381                                 if (value < 10)
 382                                         Result.Write ("{0},", value);
 383                                 else
 384                                         Result.Write ("0x{0:X02},", value);
 385 #if Binary
 386                                 binary.Write (value);
 387 #endif
 388                                 if ((i & 0xF) == 0xF)
 389                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 390                         }
 391                         Result.WriteLine ("};");
 392                         Result.WriteLine ();
 393
 394                         // Secondary weight
 395                         Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
 396 #if Binary
 397                         binary.Write (level2.Length);
 398 #endif
 399                         for (int i = 0; i < level2.Length; i++) {
 400                                 byte value = level2 [i];
 401                                 if (value < 10)
 402                                         Result.Write ("{0},", value);
 403                                 else
 404                                         Result.Write ("0x{0:X02},", value);
 405 #if Binary
 406                                 binary.Write (value);
 407 #endif
 408                                 if ((i & 0xF) == 0xF)
 409                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 410                         }
 411                         Result.WriteLine ("};");
 412                         Result.WriteLine ();
 413
 414                         // Thirtiary weight
 415                         Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
 416 #if Binary
 417                         binary.Write (level3.Length);
 418 #endif
 419                         for (int i = 0; i < level3.Length; i++) {
 420                                 byte value = level3 [i];
 421                                 if (value < 10)
 422                                         Result.Write ("{0},", value);
 423                                 else
 424                                         Result.Write ("0x{0:X02},", value);
 425 #if Binary
 426                                 binary.Write (value);
 427 #endif
 428                                 if ((i & 0xF) == 0xF)
 429                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 430                         }
 431                         Result.WriteLine ("};");
 432                         Result.WriteLine ();
 433
 434                         // Width insensitivity mappings
 435                         // (for now it is more lightweight than dumping the
 436                         // entire NFKD table).
 437                         Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
 438 #if Binary
 439                         binary.Write (widthCompat.Length);
 440 #endif
 441                         for (int i = 0; i < widthCompat.Length; i++) {
 442                                 ushort value = widthCompat [i];
 443                                 if (value < 10)
 444                                         Result.Write ("{0},", value);
 445                                 else
 446                                         Result.Write ("0x{0:X02},", value);
 447 #if Binary
 448                                 binary.Write (value);
 449 #endif
 450                                 if ((i & 0xF) == 0xF)
 451                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 452                         }
 453                         Result.WriteLine ("};");
 454                         Result.WriteLine ();
 455 #if Binary
 456                         using (FileStream fs = File.Create ("../collation.core.bin")) {
 457                                 byte [] array = ms.ToArray ();
 458                                 fs.Write (array, 0, array.Length);
 459                         }
 460 #endif
 461
 462                         // CJK
 463                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 464                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 465                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 466                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 467                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 468                 }
 469
 470                 void SerializeCJK (string name, ushort [] cjk, int max)
 471                 {
 472                         int offset = 0;//char.MaxValue - cjk.Length;
 473                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 474 #if Binary
 475                         MemoryStream ms = new MemoryStream ();
 476                         BinaryWriter binary = new BinaryWriter (ms);
 477                         binary.Write (cjk.Length);
 478 #endif
 479                         for (int i = 0; i < cjk.Length; i++) {
 480                                 if (i + offset == max)
 481                                         break;
 482                                 ushort value = cjk [i];
 483                                 if (value < 10)
 484                                         Result.Write ("{0},", value);
 485                                 else
 486                                         Result.Write ("0x{0:X04},", value);
 487 #if Binary
 488                                 binary.Write (value);
 489 #endif
 490                                 if ((i & 0xF) == 0xF)
 491                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 492                         }
 493                         Result.WriteLine ("};");
 494                         Result.WriteLine ();
 495 #if Binary
 496                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 497                                 byte [] array = ms.ToArray ();
 498                                 fs.Write (array, 0, array.Length);
 499                         }
 500 #endif
 501                 }
 502
 503                 void SerializeCJK (string name, byte [] cjk, int max)
 504                 {
 505                         int offset = 0;//char.MaxValue - cjk.Length;
 506                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 507 #if Binary
 508                         MemoryStream ms = new MemoryStream ();
 509                         BinaryWriter binary = new BinaryWriter (ms);
 510 #endif
 511                         for (int i = 0; i < cjk.Length; i++) {
 512                                 if (i + offset == max)
 513                                         break;
 514                                 byte value = cjk [i];
 515                                 if (value < 10)
 516                                         Result.Write ("{0},", value);
 517                                 else
 518                                         Result.Write ("0x{0:X02},", value);
 519 #if Binary
 520                                 binary.Write (value);
 521 #endif
 522                                 if ((i & 0xF) == 0xF)
 523                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 524                         }
 525                         Result.WriteLine ("};");
 526                         Result.WriteLine ();
 527 #if Binary
 528                         using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
 529                                 byte [] array = ms.ToArray ();
 530                                 fs.Write (array, 0, array.Length);
 531                         }
 532 #endif
 533                 }
 534
 535                 void SerializeTailorings ()
 536                 {
 537                         Hashtable indexes = new Hashtable ();
 538                         Hashtable counts = new Hashtable ();
 539                         Result.WriteLine ("static char [] tailorings = new char [] {");
 540                         int count = 0;
 541 #if Binary
 542                         MemoryStream ms = new MemoryStream ();
 543                         BinaryWriter binary = new BinaryWriter (ms);
 544 #endif
 545                         foreach (Tailoring t in tailorings) {
 546                                 if (t.Alias != 0)
 547                                         continue;
 548                                 Result.Write ("/*{0}*/", t.LCID);
 549                                 indexes.Add (t.LCID, count);
 550                                 char [] values = t.ItemToCharArray ();
 551                                 counts.Add (t.LCID, values.Length);
 552                                 foreach (char c in values) {
 553                                         Result.Write ("'\\x{0:X}', ", (int) c);
 554                                         if (++count % 16 == 0)
 555                                                 Result.WriteLine (" // {0:X04}", count - 16);
 556 #if Binary
 557                                         binary.Write ((ushort) c);
 558 #endif
 559                                 }
 560                         }
 561                         Result.WriteLine ("};");
 562
 563                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 564 #if Binary
 565                         byte [] rawdata = ms.ToArray ();
 566                         ms = new MemoryStream ();
 567                         binary = new BinaryWriter (ms);
 568                         binary.Write (tailorings.Count);
 569 #endif
 570                         foreach (Tailoring t in tailorings) {
 571                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 572                                 if (!indexes.ContainsKey (target)) {
 573                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
 574                                         continue;
 575                                 }
 576                                 int idx = (int) indexes [target];
 577                                 int cnt = (int) counts [target];
 578                                 bool french = t.FrenchSort;
 579                                 if (t.Alias != 0)
 580                                         foreach (Tailoring t2 in tailorings)
 581                                                 if (t2.LCID == t.LCID)
 582                                                         french = t2.FrenchSort;
 583                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 584 #if Binary
 585                                 binary.Write (t.LCID);
 586                                 binary.Write (idx);
 587                                 binary.Write (cnt);
 588                                 binary.Write (french);
 589 #endif
 590                         }
 591                         Result.WriteLine ("};");
 592 #if Binary
 593                         binary.Write ((byte) 0xFF);
 594                         binary.Write ((byte) 0xFF);
 595                         binary.Write (rawdata.Length / 2);
 596                         binary.Write (rawdata, 0, rawdata.Length);
 597
 598
 599                         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
 600                                 byte [] array = ms.ToArray ();
 601                                 fs.Write (array, 0, array.Length);
 602                         }
 603 #endif
 604                 }
 605
 606                 #region Parse
 607
 608                 void ParseSources (string dirname)
 609                 {
 610                         string unidata =
 611                                 dirname + "/UnicodeData.txt";
 612                         string derivedCoreProps =
 613                                 dirname + "/DerivedCoreProperties.txt";
 614                         string scripts =
 615                                 dirname + "/Scripts.txt";
 616                         string cp932 =
 617                                 dirname + "/CP932.TXT";
 618                         string derivedAge =
 619                                 dirname + "/DerivedAge.txt";
 620                         string chXML = dirname + "/common/collation/zh.xml";
 621                         string jaXML = dirname + "/common/collation/ja.xml";
 622                         string koXML = dirname + "/common/collation/ko.xml";
 623
 624                         ParseDerivedAge (derivedAge);
 625
 626                         FillIgnorables ();
 627
 628                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 629                         ParseUnidata (unidata);
 630                         ModifyUnidata ();
 631                         ParseDerivedCoreProperties (derivedCoreProps);
 632                         ParseScripts (scripts);
 633                         ParseCJK (chXML, jaXML, koXML);
 634
 635                         ParseTailorings ("mono-tailoring-source.txt");
 636                 }
 637
 638                 void ParseTailorings (string filename)
 639                 {
 640                         Tailoring t = null;
 641                         int line = 0;
 642                         using (StreamReader sr = new StreamReader (filename)) {
 643                                 try {
 644                                         while (sr.Peek () >= 0) {
 645                                                 line++;
 646                                                 ProcessTailoringLine (ref t,
 647                                                         sr.ReadLine ().Trim ());
 648                                         }
 649                                 } catch (Exception) {
 650                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 651                                         throw;
 652                                 }
 653                         }
 654                 }
 655
 656                 // For now this is enough.
 657                 string ParseTailoringSourceValue (string s)
 658                 {
 659                         StringBuilder sb = new StringBuilder ();
 660                         for (int i = 0; i < s.Length; i++) {
 661                                 if (s.StartsWith ("\\u")) {
 662                                         sb.Append ((char) int.Parse (
 663                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 664                                                 1);
 665                                         i += 5;
 666                                 }
 667                         else
 668                                 sb.Append (s [i]);
 669                         }
 670                         return sb.ToString ();
 671                 }
 672
 673                 void ProcessTailoringLine (ref Tailoring t, string s)
 674                 {
 675                         int idx = s.IndexOf ('#');
 676                         if (idx > 0)
 677                                 s = s.Substring (0, idx).Trim ();
 678                         if (s.Length == 0 || s [0] == '#')
 679                                 return;
 680                         if (s [0] == '@') {
 681                                 idx = s.IndexOf ('=');
 682                                 if (idx > 0)
 683                                         t = new Tailoring (
 684                                                 int.Parse (s.Substring (1, idx - 1)),
 685                                                 int.Parse (s.Substring (idx + 1)));
 686                                 else
 687                                         t = new Tailoring (int.Parse (s.Substring (1)));
 688                                 tailorings.Add (t);
 689                                 return;
 690                         }
 691                         if (s.StartsWith ("*FrenchSort")) {
 692                                 t.FrenchSort = true;
 693                                 return;
 694                         }
 695                         string d = "*Diacritical";
 696                         if (s.StartsWith (d)) {
 697                                 idx = s.IndexOf ("->");
 698                                 t.AddDiacriticalMap (
 699                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 700                                                 NumberStyles.HexNumber),
 701                                         byte.Parse (s.Substring (idx + 2).Trim (),
 702                                                 NumberStyles.HexNumber));
 703                                 return;
 704                         }
 705                         idx = s.IndexOf (':');
 706                         if (idx > 0) {
 707                                 string source = s.Substring (0, idx).Trim ();
 708                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 709                                 byte [] b = new byte [4];
 710                                 for (int i = 0; i < 4; i++) {
 711                                         if (l [i] == "*")
 712                                                 b [i] = 0;
 713                                         else
 714                                                 b [i] = byte.Parse (l [i],
 715                                                         NumberStyles.HexNumber);
 716                                 }
 717                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 718                                         b);
 719                         }
 720                         idx = s.IndexOf ('=');
 721                         if (idx > 0)
 722                                 t.AddReplacementMap (
 723                                         ParseTailoringSourceValue (
 724                                                 s.Substring (0, idx).Trim ()),
 725                                         ParseTailoringSourceValue (
 726                                                 s.Substring (idx + 1).Trim ()));
 727                 }
 728
 729                 void ParseDerivedAge (string filename)
 730                 {
 731                         using (StreamReader file =
 732                                 new StreamReader (filename)) {
 733                                 while (file.Peek () >= 0) {
 734                                         string s = file.ReadLine ();
 735                                         int idx = s.IndexOf ('#');
 736                                         if (idx >= 0)
 737                                                 s = s.Substring (0, idx);
 738                                         idx = s.IndexOf (';');
 739                                         if (idx < 0)
 740                                                 continue;
 741
 742                                         string cpspec = s.Substring (0, idx);
 743                                         idx = cpspec.IndexOf ("..");
 744                                         NumberStyles nf = NumberStyles.HexNumber |
 745                                                 NumberStyles.AllowTrailingWhite;
 746                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 747                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 748                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 749
 750                                         // FIXME: use index
 751                                         if (cp > char.MaxValue)
 752                                                 continue;
 753
 754                                         double v = double.Parse (value);
 755                                         for (int i = cp; i <= cpEnd; i++)
 756                                                 unicodeAge [i] = v;
 757                                 }
 758                         }
 759                         unicodeAge [0] = double.MaxValue; // never be supported
 760                 }
 761
 762                 void ParseUnidata (string filename)
 763                 {
 764                         ArrayList decompValues = new ArrayList ();
 765                         using (StreamReader unidata =
 766                                 new StreamReader (filename)) {
 767                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 768                                         try {
 769                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 770                                         } catch (Exception) {
 771                                                 Console.Error.WriteLine ("**** At line " + line);
 772                                                 throw;
 773                                         }
 774                                 }
 775                         }
 776                         this.decompValues = (int [])
 777                                 decompValues.ToArray (typeof (int));
 778                 }
 779
 780                 char previousLatinTarget = char.MinValue; // State kept between ProcessUnidataLine() calls: the letter the last LATIN line picked from a character name; used as the target when a later LATIN line finds none.
 781                 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; // One counter per 'A'..'Z' slot; ProcessUnidataLine() increments a slot and adds 0x7C to derive diacritical [cp].
 782
 783                 void ProcessUnidataLine (string s, ArrayList decompValues)
 784                 {
 785                         int idx = s.IndexOf ('#');
 786                         if (idx >= 0)
 787                                 s = s.Substring (0, idx);
 788                         idx = s.IndexOf (';');
 789                         if (idx < 0)
 790                                 return;
 791                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 792                         string [] values = s.Substring (idx + 1).Split (';');
 793
 794                         // FIXME: use index
 795                         if (cp > char.MaxValue)
 796                                 return;
 797                         if (IsIgnorable (cp))
 798                                 return;
 799
 800                         string name = values [0];
 801
 802                         // SPECIAL CASE: rename some characters for diacritical
 803                         // remapping. FIXME: why are they different?
 804                         // FIXME: it's still not working.
 805                         if (cp == 0x018B || cp == 0x018C)
 806                                 name = name.Replace ("TOPBAR", "STROKE");
 807
 808                         // isSmallCapital
 809                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 810                                 isSmallCapital [cp] = true;
 811
 812                         // latin mapping by character name
 813                         if (s.IndexOf ("LATIN") >= 0) {
 814                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 815                                 int offset = lidx + 15;
 816                                 if (lidx < 0) {
 817                                         lidx = s.IndexOf ("LETTER TURNED ");
 818                                         offset = lidx + 14;
 819                                 }
 820                                 if (lidx < 0) {
 821                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 822                                         offset = lidx + 15;
 823                                 }
 824                                 if (lidx < 0) {
 825                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 826                                         offset = lidx + 14;
 827                                 }
 828                                 if (lidx < 0) {
 829                                         lidx = s.IndexOf ("LETTER ");
 830                                         offset = lidx + 7;
 831                                 }
 832                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 833                                 char n = s [offset + 1];
 834                                 char target = char.MinValue;
 835                                 if ('A' <= c && c <= 'Z' &&
 836                                         (n == ' ') || n == ';') {
 837                                         target = c;
 838                                         // FIXME: After 'Z', I cannot reset this state.
 839                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 840                                 }
 841
 842                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 843                                         target = 'A';
 844                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 845                                         target = 'B';
 846                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 847                                         target = 'C';
 848                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 849                                         target = 'E';
 850                                 else if (s.Substring (offset).StartsWith ("ENG"))
 851                                         target = 'N';
 852                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 853                                         target = 'O';
 854                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 855                                         target = 'R';
 856                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 857                                         target = 'S';
 858                                 else if (s.Substring (offset).StartsWith ("ESH"))
 859                                         target = 'S';
 860
 861                                 if (target == char.MinValue)
 862                                         target = previousLatinTarget;
 863
 864                                 if (target != char.MinValue) {
 865                                         ArrayList entry = (ArrayList) latinMap [target];
 866                                         if (entry == null) {
 867                                                 entry = new ArrayList ();
 868                                                 latinMap [target] = entry;
 869                                         }
 870                                         entry.Add (cp);
 871                                         // FIXME: This secondary weight is hack.
 872                                         // They are here because they must not
 873                                         // be identical to the corresponding
 874                                         // ASCII latins.
 875                                         if (c != target && diacritical [cp] == 0) {
 876                                                 diacriticalOffset [c - 'A']++;
 877                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
 878                                         }
 879                                 }
 880                         }
 881
 882                         // Arrow names
 883                         if (0x2000 <= cp && cp < 0x3000) {
 884                                 int value = 0;
 885                                 // SPECIAL CASES. FIXME: why?
 886                                 switch (cp) {
 887                                 case 0x21C5: value = -1; break; // E2
 888                                 case 0x261D: value = 1; break;
 889                                 case 0x27A6: value = 3; break;
 890                                 case 0x21B0: value = 7; break;
 891                                 case 0x21B1: value = 3; break;
 892                                 case 0x21B2: value = 7; break;
 893                                 case 0x21B4: value = 5; break;
 894                                 case 0x21B5: value = 7; break;
 895                                 case 0x21B9: value = -1; break; // E1
 896                                 case 0x21CF: value = 7; break;
 897                                 case 0x21D0: value = 3; break;
 898                                 }
 899                                 string [] arrowTargets = new string [] {
 900                                         "",
 901                                         "UPWARDS",
 902                                         "NORTH EAST",
 903                                         "RIGHTWARDS",
 904                                         "SOUTH EAST",
 905                                         "DOWNWARDS",
 906                                         "SOUTH WEST",
 907                                         "LEFTWARDS",
 908                                         "NORTH WEST",
 909                                         };
 910                                 if (value == 0)
 911                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 912                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 913                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 914                                                         s.IndexOf (" OVER") < 0
 915                                                 )
 916                                                         value = i;
 917                                 if (value > 0)
 918                                         arrowValues.Add (new DictionaryEntry (
 919                                                 cp, value));
 920                         }
 921
 922                         // Box names
 923                         if (0x2500 <= cp && cp < 0x2600) {
 924                                 int value = 0;
 925                                 // flags:
 926                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 927                                 // [h,rl] [r] [l]
 928                                 // [v,ud] [u] [d]
 929                                 // [dr] [dl] [ur] [ul]
 930                                 // [vr,udr] [vl,vdl]
 931                                 // [hd,rld] [hu,rlu]
 932                                 // [hv,udrl,rlv,udh]
 933                                 ArrayList flags = new ArrayList (new int [] {
 934                                         32, 8 + 4, 8, 4,
 935                                         16, 1 + 2, 1, 2,
 936                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 937                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 938                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 939                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 940                                         });
 941                                 byte [] offsets = new byte [] {
 942                                         0, 0, 1, 2,
 943                                         3, 3, 4, 5,
 944                                         6, 7, 8, 9,
 945                                         10, 10, 11, 11,
 946                                         12, 12, 13, 13,
 947                                         14, 14, 14, 14};
 948                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
 949                                         int flag = 0;
 950                                         if (s.IndexOf (" UP") >= 0)
 951                                                 flag |= 1;
 952                                         if (s.IndexOf (" DOWN") >= 0)
 953                                                 flag |= 2;
 954                                         if (s.IndexOf (" RIGHT") >= 0)
 955                                                 flag |= 4;
 956                                         if (s.IndexOf (" LEFT") >= 0)
 957                                                 flag |= 8;
 958                                         if (s.IndexOf (" VERTICAL") >= 0)
 959                                                 flag |= 16;
 960                                         if (s.IndexOf (" HORIZONTAL") >= 0)
 961                                                 flag |= 32;
 962
 963                                         int fidx = flags.IndexOf (flag);
 964                                         value = fidx < 0 ? fidx : offsets [fidx];
 965                                 } else if (s.IndexOf ("BLOCK") >= 0) {
 966                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
 967                                                 value = 0x12;
 968                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
 969                                                 value = 0x13;
 970                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
 971                                                 value = 0x14;
 972                                         else if (s.IndexOf ("HALF") >= 0)
 973                                                 value = 0x15;
 974                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
 975                                                 value = 0x16;
 976                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
 977                                                 value = 0x17;
 978                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
 979                                                 value = 0x18;
 980                                         else
 981                                                 value = 0x19;
 982                                 }
 983                                 else if (s.IndexOf ("SHADE") >= 0)
 984                                         value = 0x19;
 985                                 else if (s.IndexOf ("SQUARE") >= 0)
 986                                         value = 0xBC - 0xE5;
 987                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
 988                                         value = 0xBE - 0xE5;
 989                                 else if (s.IndexOf ("RECTANGLE") >= 0)
 990                                         value = 0xBD - 0xE5;
 991                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
 992                                         value = 0xBF - 0xE5;
 993                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
 994                                         if (s.IndexOf ("UP-POINTING") >= 0)
 995                                                 value = 0xC0 - 0xE5;
 996                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
 997                                                 value = 0xC1 - 0xE5;
 998                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
 999                                                 value = 0xC2 - 0xE5;
1000                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1001                                                 value = 0xC3 - 0xE5;
1002                                 }
1003                                 else if (s.IndexOf ("POINTER") >= 0) {
1004                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1005                                                 value = 0xC4 - 0xE5;
1006                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1007                                                 value = 0xC5 - 0xE5;
1008                                 }
1009                                 else if (s.IndexOf ("DIAMOND") >= 0)
1010                                         value = 0xC6 - 0xE5;
1011                                 else if (s.IndexOf ("FISHEYE") >= 0)
1012                                         value = 0xC7 - 0xE5;
1013                                 else if (s.IndexOf ("LOZENGE") >= 0)
1014                                         value = 0xC8 - 0xE5;
1015                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1016                                         value = 0xC9 - 0xE5;
1017                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1018                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1019                                                 value = 0xCA - 0xE5;
1020                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1021                                                 value = 0xCB - 0xE5;
1022                                         else
1023                                                 value = 0xC9 - 0xE5;
1024                                 }
1025                                 if (0x25DA <= cp && cp <= 0x25E5)
1026                                         value = 0xCD + cp - 0x25DA - 0xE5;
1027
1028                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1029                                 switch (cp) {
1030                                 case 0x2571: value = 0xF; break;
1031                                 case 0x2572: value = 0x10; break;
1032                                 case 0x2573: value = 0x11; break;
1033                                 }
1034                                 if (value != 0)
1035                                         boxValues.Add (new DictionaryEntry (
1036                                                 cp, value));
1037                         }
1038
1039                         // For some characters store the name and sort later
1040                         // to determine sorting.
1041                         if (0x2100 <= cp && cp <= 0x213F &&
1042                                 Char.IsSymbol ((char) cp))
1043                                 sortableCharNames.Add (
1044                                         new DictionaryEntry (cp, name));
1045                         else if (0x3380 <= cp && cp <= 0x33DD)
1046                                 sortableCharNames.Add (new DictionaryEntry (
1047                                         cp, name.Substring (7)));
1048
1049                         if (Char.GetUnicodeCategory ((char) cp) ==
1050                                 UnicodeCategory.MathSymbol) {
1051                                 if (name.StartsWith ("CIRCLED "))
1052                                         diacritical [cp] = 0xEE;
1053                                 if (name.StartsWith ("SQUARED "))
1054                                         diacritical [cp] = 0xEF;
1055                         }
1056
1057                         // diacritical weights by character name
1058 if (diacritics.Length != diacriticWeights.Length)
1059 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1060                         for (int d = 0; d < diacritics.Length; d++) {
1061                                 if (s.IndexOf (diacritics [d]) > 0) {
1062                                         diacritical [cp] += diacriticWeights [d];
1063                                         if (s.IndexOf ("COMBINING") >= 0)
1064                                                 diacritical [cp] -= (byte) 2;
1065                                         continue;
1066                                 }
1067                                 // also process "COMBINING blah" here
1068                                 // For now it is limited to cp < 0x0370
1069 //                              if (cp < 0x0300 || cp >= 0x0370)
1070 //                                      continue;
1071                                 string tmp = diacritics [d].TrimEnd (';');
1072                                 if (tmp.IndexOf ("WITH ") == 0)
1073                                         tmp = tmp.Substring (4);
1074                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1075                                 if (name == tmp)
1076                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1077 //if (name == tmp)
1078 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1079                         }
1080                         // Two-step grep required for it.
1081                         if (s.IndexOf ("FULL STOP") > 0 &&
1082                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1083                                 diacritical [cp] |= 0xF4;
1084
1085                         // Arabic letter name
1086                         if (0x0621 <= cp && cp <= 0x064A &&
1087                                 Char.GetUnicodeCategory ((char) cp)
1088                                 == UnicodeCategory.OtherLetter) {
1089                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1090                                 switch (cp) {
1091                                 case 0x0621:
1092                                 case 0x0624:
1093                                 case 0x0626:
1094                                         // hamza, waw, yeh ... special cases.
1095                                         value = 0x07;
1096                                         break;
1097                                 case 0x0649:
1098                                 case 0x064A:
1099                                         value = 0x77; // special cases.
1100                                         break;
1101                                 default:
1102                                         // Get primary letter name i.e.
1103                                         // XXX part of ARABIC LETTER XXX yyy
1104                                         // e.g. that of "TEH MARBUTA" is "TEH".
1105                                         string letterName =
1106                                                 (cp == 0x0640) ?
1107                                                 // 0x0640 is special: it does
1108                                                 // not start with ARABIC LETTER
1109                                                 name :
1110                                                 name.Substring (14);
1111                                         int tmpIdx = letterName.IndexOf (' ');
1112                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1113 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1114                                         if (arabicNameMap.ContainsKey (letterName))
1115                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1116                                         else
1117                                                 arabicNameMap [letterName] = cp;
1118                                         break;
1119                                 }
1120                                 arabicLetterPrimaryValues [cp] = value;
1121                         }
1122
1123                         // Japanese square letter
1124                         if (0x3300 <= cp && cp <= 0x3357)
1125                                 if (!ExistsJIS (cp))
1126                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1127
1128                         // normalizationType
1129                         string decomp = values [4];
1130                         idx = decomp.IndexOf ('<');
1131                         if (idx >= 0) {
1132                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1133                                 case "full":
1134                                         decompType [cp] = DecompositionFull;
1135                                         break;
1136                                 case "sub":
1137                                         decompType [cp] = DecompositionSub;
1138                                         break;
1139                                 case "super":
1140                                         decompType [cp] = DecompositionSuper;
1141                                         break;
1142                                 case "small":
1143                                         decompType [cp] = DecompositionSmall;
1144                                         break;
1145                                 case "isolated":
1146                                         decompType [cp] = DecompositionIsolated;
1147                                         break;
1148                                 case "initial":
1149                                         decompType [cp] = DecompositionInitial;
1150                                         break;
1151                                 case "final":
1152                                         decompType [cp] = DecompositionFinal;
1153                                         break;
1154                                 case "medial":
1155                                         decompType [cp] = DecompositionMedial;
1156                                         break;
1157                                 case "noBreak":
1158                                         decompType [cp] = DecompositionNoBreak;
1159                                         break;
1160                                 case "compat":
1161                                         decompType [cp] = DecompositionCompat;
1162                                         break;
1163                                 case "fraction":
1164                                         decompType [cp] = DecompositionFraction;
1165                                         break;
1166                                 case "font":
1167                                         decompType [cp] = DecompositionFont;
1168                                         break;
1169                                 case "circle":
1170                                         decompType [cp] = DecompositionCircle;
1171                                         break;
1172                                 case "square":
1173                                         decompType [cp] = DecompositionSquare;
1174                                         break;
1175                                 case "wide":
1176                                         decompType [cp] = DecompositionWide;
1177                                         break;
1178                                 case "narrow":
1179                                         decompType [cp] = DecompositionNarrow;
1180                                         break;
1181                                 case "vertical":
1182                                         decompType [cp] = DecompositionVertical;
1183                                         break;
1184                                 default:
1185                                         throw new Exception ("Support NFKD type : " + decomp);
1186                                 }
1187                         }
1188                         else
1189                                 decompType [cp] = DecompositionCanonical;
1190                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1191                         if (decomp.Length > 0) {
1192
1193                                 string [] velems = decomp.Split (' ');
1194                                 int didx = decompValues.Count;
1195                                 decompIndex [cp] = didx;
1196                                 foreach (string v in velems)
1197                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1198                                 decompLength [cp] = velems.Length;
1199
1200                                 // [decmpType] -> this_cp
1201                                 int targetCP = (int) decompValues [didx];
1202                                 // for "(x)" it specially maps to 'x' .
1203                                 // FIXME: check if it is sane
1204                                 if (velems.Length == 3 &&
1205                                         (int) decompValues [didx] == '(' &&
1206                                         (int) decompValues [didx + 2] == ')')
1207                                         targetCP = (int) decompValues [didx + 1];
1208                                 // special: 0x215F "1/"
1209                                 else if (cp == 0x215F)
1210                                         targetCP = '1';
1211                                 else if (velems.Length > 1 &&
1212                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1213                                         // skip them, except for CJK ideograph compat
1214                                         targetCP = 0;
1215
1216                                 if (targetCP != 0) {
1217                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1218                                         if (entry == null) {
1219                                                 entry = new Hashtable ();
1220                                                 nfkdMap [targetCP] = entry;
1221                                         }
1222                                         entry [(byte) decompType [cp]] = cp;
1223                                 }
1224                         }
1225                         // numeric values
1226                         if (values [5].Length > 0)
1227                                 decimalValue [cp] = decimal.Parse (values [5]);
1228                         else if (values [6].Length > 0)
1229                                 decimalValue [cp] = decimal.Parse (values [6]);
1230                         else if (values [7].Length > 0) {
1231                                 string decstr = values [7];
1232                                 idx = decstr.IndexOf ('/');
1233                                 if (cp == 0x215F) // special. "1/"
1234                                         decimalValue [cp] = 0x1;
1235                                 else if (idx > 0)
1236                                         // m/n
1237                                         decimalValue [cp] =
1238                                                 decimal.Parse (decstr.Substring (0, idx))
1239                                                 / decimal.Parse (decstr.Substring (idx + 1));
1240                                 else if (decstr [0] == '(' &&
1241                                         decstr [decstr.Length - 1] == ')')
1242                                         // (n)
1243                                         decimalValue [cp] =
1244                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1245                                 else if (decstr [decstr.Length - 1] == '.')
1246                                         // n.
1247                                         decimalValue [cp] =
1248                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1249                                 else
1250                                         decimalValue [cp] = decimal.Parse (decstr);
1251                         }
1252                 }
1253
1254                 void ParseDerivedCoreProperties (string filename)
1255                 {
1256                         // IsUppercase
1257                         using (StreamReader file =
1258                                 new StreamReader (filename)) {
1259                                 for (int line = 1; file.Peek () >= 0; line++) {
1260                                         try {
1261                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1262                                         } catch (Exception) {
1263                                                 Console.Error.WriteLine ("**** At line " + line);
1264                                                 throw;
1265                                         }
1266                                 }
1267                         }
1268                 }
1269
1270                 void ProcessDerivedCorePropLine (string s)
1271                 {
1272                         int idx = s.IndexOf ('#');
1273                         if (idx >= 0)
1274                                 s = s.Substring (0, idx);
1275                         idx = s.IndexOf (';');
1276                         if (idx < 0)
1277                                 return;
1278                         string cpspec = s.Substring (0, idx);
1279                         idx = cpspec.IndexOf ("..");
1280                         NumberStyles nf = NumberStyles.HexNumber |
1281                                 NumberStyles.AllowTrailingWhite;
1282                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1283                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1284                         string value = s.Substring (cpspec.Length + 1).Trim ();
1285
1286                         // FIXME: use index
1287                         if (cp > char.MaxValue)
1288                                 return;
1289
1290                         switch (value) {
1291                         case "Uppercase":
1292                                 for (int x = cp; x <= cpEnd; x++)
1293                                         isUppercase [x] = true;
1294                                 break;
1295                         }
1296                 }
1297
1298                 void ParseScripts (string filename)
1299                 {
1300                         ArrayList gurmukhi = new ArrayList ();
1301                         ArrayList gujarati = new ArrayList ();
1302                         ArrayList georgian = new ArrayList ();
1303                         ArrayList thaana = new ArrayList ();
1304
1305                         using (StreamReader file =
1306                                 new StreamReader (filename)) {
1307                                 while (file.Peek () >= 0) {
1308                                         string s = file.ReadLine ();
1309                                         int idx = s.IndexOf ('#');
1310                                         if (idx >= 0)
1311                                                 s = s.Substring (0, idx);
1312                                         idx = s.IndexOf (';');
1313                                         if (idx < 0)
1314                                                 continue;
1315
1316                                         string cpspec = s.Substring (0, idx);
1317                                         idx = cpspec.IndexOf ("..");
1318                                         NumberStyles nf = NumberStyles.HexNumber |
1319                                                 NumberStyles.AllowTrailingWhite;
1320                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1321                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1322                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1323
1324                                         // FIXME: use index
1325                                         if (cp > char.MaxValue)
1326                                                 continue;
1327
1328                                         switch (value) {
1329                                         case "Gurmukhi":
1330                                                 for (int x = cp; x <= cpEnd; x++)
1331                                                         if (!IsIgnorable (x))
1332                                                                 gurmukhi.Add ((char) x);
1333                                                 break;
1334                                         case "Gujarati":
1335                                                 for (int x = cp; x <= cpEnd; x++)
1336                                                         if (!IsIgnorable (x))
1337                                                                 gujarati.Add ((char) x);
1338                                                 break;
1339                                         case "Georgian":
1340                                                 for (int x = cp; x <= cpEnd; x++)
1341                                                         if (!IsIgnorable (x))
1342                                                                 georgian.Add ((char) x);
1343                                                 break;
1344                                         case "Thaana":
1345                                                 for (int x = cp; x <= cpEnd; x++)
1346                                                         if (!IsIgnorable (x))
1347                                                                 thaana.Add ((char) x);
1348                                                 break;
1349                                         }
1350                                 }
1351                         }
1352                         gurmukhi.Sort (UCAComparer.Instance);
1353                         gujarati.Sort (UCAComparer.Instance);
1354                         georgian.Sort (UCAComparer.Instance);
1355                         thaana.Sort (UCAComparer.Instance);
1356                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1357                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1358                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1359                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1360                 }
1361
1362                 void ParseJISOrder (string filename)
1363                 {
1364                         int line = 1;
1365                         try {
1366                                 using (StreamReader file =
1367                                         new StreamReader (filename)) {
1368                                         for (;file.Peek () >= 0; line++)
1369                                                 ProcessJISOrderLine (file.ReadLine ());
1370                                 }
1371                         } catch (Exception) {
1372                                 Console.Error.WriteLine ("---- line {0}", line);
1373                                 throw;
1374                         }
1375                 }
1376
1377                 char [] ws = new char [] {'\t', ' '};
1378
1379                 void ProcessJISOrderLine (string s)
1380                 {
1381                         int idx = s.IndexOf ('#');
1382                         if (idx >= 0)
1383                                 s = s.Substring (0, idx).Trim ();
1384                         if (s.Length == 0)
1385                                 return;
1386                         idx = s.IndexOfAny (ws);
1387                         if (idx < 0)
1388                                 return;
1389                         // They start with "0x" so cut them out.
1390                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1391                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1392                         jisJapanese.Add (new JISCharacter (cp, jis));
1393                 }
1394
1395                 void ParseCJK (string zhXML, string jaXML, string koXML)
1396                 {
1397                         XmlDocument doc = new XmlDocument ();
1398                         doc.XmlResolver = null;
1399                         int v;
1400                         string s;
1401                         string category;
1402                         int offset;
1403                         ushort [] arr;
1404
1405                         // Chinese Simplified
1406                         category = "chs";
1407                         arr = cjkCHS;
1408                         offset = 0;//char.MaxValue - arr.Length;
1409                         doc.Load (zhXML);
1410                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1411                         v = 0x8008;
1412                         foreach (char c in s) {
1413                                 if (c < '\u3100')
1414                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1415                                 else {
1416                                         arr [(int) c - offset] = (ushort) v++;
1417                                         if (v % 256 == 0)
1418                                                 v += 2;
1419                                 }
1420                         }
1421
1422                         // Chinese Traditional
1423                         category = "cht";
1424                         arr = cjkCHT;
1425                         offset = 0;//char.MaxValue - arr.Length;
1426                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1427                         v = 0x8002;
1428                         foreach (char c in s) {
1429                                 if (c < '\u4E00')
1430                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1431                                 else {
1432                                         arr [(int) c - offset] = (ushort) v++;
1433                                         if (v % 256 == 0)
1434                                                 v += 2;
1435                                 }
1436                         }
1437
1438                         // Japanese
1439                         category = "ja";
1440                         arr = cjkJA;
1441                         offset = 0;//char.MaxValue - arr.Length;
1442
1443                         // SPECIAL CASES
1444                         arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1445                         arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1446                         arr [0x337E] = 0x8005;
1447                         arr [0x337D] = 0x8006;
1448                         arr [0x337C] = 0x8007;
1449
1450                         v = 0x8008;
1451                         foreach (JISCharacter jc in jisJapanese) {
1452                                 if (jc.JIS < 0x8800)
1453                                         continue;
1454                                 char c = (char) jc.CP;
1455
1456                                 if (c < '\u4E00')
1457                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1458                                 else {
1459                                         arr [(int) c - offset] = (ushort) v++;
1460                                         if (v % 256 == 0)
1461                                                 v += 2;
1462
1463                                         // SPECIAL CASES:
1464                                         if (c == '\u662D') // U+337C
1465                                                 continue;
1466                                         if (c == '\u5927') // U+337D
1467                                                 continue;
1468                                         if (c == '\u5E73') // U+337B
1469                                                 continue;
1470                                         if (c == '\u660E') // U+337E
1471                                                 continue;
1472                                         if (c == '\u9686') // U+F9DC
1473                                                 continue;
1474
1475                                         // FIXME: there are still remaining
1476                                         // characters after U+FA0C.
1477 //                                      for (int k = 0; k < char.MaxValue; k++) {
1478                                         for (int k = 0; k < '\uFA0D'; k++) {
1479                                                 if (decompIndex [k] == 0 || IsIgnorable (k))
1480                                                         continue;
1481                                                 if (decompValues [decompIndex [k]] == c /*&&
1482                                                         decompLength [k] == 1*/ ||
1483                                                         decompLength [k] == 3 &&
1484                                                         decompValues [decompIndex [k] + 1] == c) {
1485                                                         arr [k - offset] = (ushort) v++;
1486                                                         if (v % 256 == 0)
1487                                                                 v += 2;
1488                                                 }
1489                                         }
1490                                 }
1491                         }
1492
1493                         // Korean
1494                         // Korean weight is somewhat complex. It first shifts
1495                         // Hangul category from 52-x to 80-x (they are anyways
1496                         // computed). CJK ideographs are placed at secondary
1497                         // weight, like XX YY 01 zz 01, where XX and YY are
1498                         // corresponding "reset" value and zz is 41,43,45...
1499                         //
1500                         // Unlike chs,cht and ja, Korean value is a combined
1501                         // ushort which is computed as category
1502                         //
1503                         category = "ko";
1504                         arr = cjkKO;
1505                         offset = 0;//char.MaxValue - arr.Length;
1506                         doc.Load (koXML);
1507                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1508                                 XmlElement sc = (XmlElement) reset.NextSibling;
1509                                 // compute "category" and "level 1" for the
1510                                 // target "reset" Hangle syllable
1511                                 char rc = reset.InnerText [0];
1512                                 int ri = ((int) rc - 0xAC00) + 1;
1513                                 ushort p = (ushort)
1514                                         ((ri / 254) * 256 + (ri % 254) + 2);
1515                                 // Place the characters after the target.
1516                                 s = sc.InnerText;
1517                                 v = 0x41;
1518                                 foreach (char c in s) {
1519                                         arr [(int) c - offset] = p;
1520                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1521                                         v += 2;
1522                                 }
1523                         }
1524                 }
1525
1526                 #endregion
1527
1528                 #region Generation
1529
1530                 void FillIgnorables ()
1531                 {
1532                         for (int i = 0; i <= char.MaxValue; i++) {
1533                                 if (Char.GetUnicodeCategory ((char) i) ==
1534                                         UnicodeCategory.OtherNotAssigned)
1535                                         continue;
1536                                 if (IsIgnorable (i))
1537                                         ignorableFlags [i] |= 1;
1538                                 if (IsIgnorableSymbol (i))
1539                                         ignorableFlags [i] |= 2;
1540                                 if (IsIgnorableNonSpacing (i))
1541                                         ignorableFlags [i] |= 4;
1542                         }
1543                 }
1544
1545                 void ModifyUnidata ()
1546                 {
1547                         // Modify some decomposition equivalence
1548                         decompType [0xFE31] = 0;
1549                         decompIndex [0xFE31] = 0;
1550                         decompLength [0xFE31] = 0;
1551                         decompType [0xFE32] = 0;
1552                         decompIndex [0xFE32] = 0;
1553                         decompLength [0xFE32] = 0;
1554
1555                         // Korean parens numbers
1556                         for (int i = 0x3200; i <= 0x321C; i++)
1557                                 diacritical [i] = 0xA;
1558                         for (int i = 0x3260; i <= 0x327B; i++)
1559                                 diacritical [i] = 0xC;
1560
1561                         // LAMESPEC: these remapping should not be done.
1562                         // Windows have incorrect CJK compat mappings.
1563                         decompValues [decompIndex [0x32A9]] = 0x91AB;
1564                         decompLength [0x323B] = 1;
1565                         decompValues [decompIndex [0x323B]] = 0x5B78;
1566                         decompValues [decompIndex [0x32AB]] = 0x5B78;
1567                         decompValues [decompIndex [0x32A2]] = 0x5BEB;
1568                         decompLength [0x3238] = 1;
1569                         decompValues [decompIndex [0x3238]] = 0x52DE;
1570                         decompValues [decompIndex [0x3298]] = 0x52DE;
1571
1572                         // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
1573                         decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1574                         decompValues [decompIndex [0xFA0C]] = 0x5140;
1575                         decompLength [0xFA0C] = 1;
1576                         decompIndex [0xF929] = decompLength [0xF929] = 0;
1577
1578                         decompValues [decompIndex [0xF92C]] = 0x90DE;
1579                 }
1580
1581                 void ModifyParsedValues ()
1582                 {
1583                         // number, secondary weights
1584                         byte weight = 0x38;
1585                         int [] numarr = numberSecondaryWeightBounds;
1586                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1587                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1588                                         if (Char.IsNumber ((char) cp))
1589                                                 diacritical [cp] = weight;
1590
1591                         // Update name part of named characters
1592                         for (int i = 0; i < sortableCharNames.Count; i++) {
1593                                 DictionaryEntry de =
1594                                         (DictionaryEntry) sortableCharNames [i];
1595                                 int cp = (int) de.Key;
1596                                 string renamed = null;
1597                                 switch (cp) {
1598                                 case 0x2101: renamed = "A_1"; break;
1599                                 case 0x33C3: renamed = "A_2"; break;
1600                                 case 0x2105: renamed = "C_1"; break;
1601                                 case 0x2106: renamed = "C_2"; break;
1602                                 case 0x211E: renamed = "R1"; break;
1603                                 case 0x211F: renamed = "R2"; break;
1604                                 // Remove some of them!
1605                                 case 0x2103:
1606                                 case 0x2109:
1607                                 case 0x2116:
1608                                 case 0x2117:
1609                                 case 0x2118:
1610                                 case 0x2125:
1611                                 case 0x2127:
1612                                 case 0x2129:
1613                                 case 0x212E:
1614                                 case 0x2132:
1615                                         sortableCharNames.RemoveAt (i);
1616                                         i--;
1617                                         continue;
1618                                 }
1619                                 if (renamed != null)
1620                                         sortableCharNames [i] =
1621                                                 new DictionaryEntry (cp, renamed);
1622                         }
1623                 }
1624
1625                 void GenerateCore ()
1626                 {
1627                         UnicodeCategory uc;
1628
1629                         #region Specially ignored // 01
1630                         // This will raise "Defined" flag up.
1631                         foreach (char c in specialIgnore)
1632                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1633                         #endregion
1634
1635
1636                         #region Variable weights
1637                         // Controls : 06 03 - 06 3D
1638                         fillIndex [6] = 3;
1639                         for (int i = 0; i < 65536; i++) {
1640                                 if (IsIgnorable (i))
1641                                         continue;
1642                                 char c = (char) i;
1643                                 uc = Char.GetUnicodeCategory (c);
1644                                 // NEL is whitespace but not ignored here.
1645                                 if (uc == UnicodeCategory.Control &&
1646                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1647                                         AddCharMap (c, 6, 1);
1648                         }
1649
1650                         // Apostrophe 06 80
1651                         fillIndex [6] = 0x80;
1652                         AddCharMapGroup ('\'', 6, 1, 0);
1653                         AddCharMap ('\uFE63', 6, 1);
1654
1655                         // Hyphen/Dash : 06 81 - 06 90
1656                         for (int i = 0; i < char.MaxValue; i++) {
1657                                 if (!IsIgnorable (i) &&
1658                                         Char.GetUnicodeCategory ((char) i) ==
1659                                         UnicodeCategory.DashPunctuation) {
1660                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1661                                         if (i == 0x2011) {
1662                                                 // SPECIAL: add 2027 and 2043
1663                                                 // Maybe they are regarded the
1664                                                 // same hyphens in "central"
1665                                                 // position.
1666                                                 AddCharMap ('\u2027', 6, 1);
1667                                                 AddCharMap ('\u2043', 6, 1);
1668                                         }
1669                                 }
1670                         }
1671
1672                         // Arabic variable weight chars 06 A0 -
1673                         fillIndex [6] = 0xA0;
1674                         // vowels
1675                         for (int i = 0x64B; i <= 0x650; i++)
1676                                 AddArabicCharMap ((char) i);
1677                         // sukun
1678                         AddCharMapGroup ('\u0652', 6, 1, 0);
1679                         // shadda
1680                         AddCharMapGroup ('\u0651', 6, 1, 0);
1681                         #endregion
1682
1683
1684                         #region Nonspacing marks // 01
1685                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1686
1687                         // Combining diacritical marks: 01 DC -
1688
1689                         fillIndex [0x1] = 0x41;
1690                         for (int i = 0x030E; i <= 0x0326; i++)
1691                                 if (!IsIgnorable (i))
1692                                         AddCharMap ((char) i, 0x1, 1);
1693                         for (int i = 0x0329; i <= 0x0334; i++)
1694                                 if (!IsIgnorable (i))
1695                                         AddCharMap ((char) i, 0x1, 1);
1696                         for (int i = 0x0339; i <= 0x0341; i++)
1697                                 if (!IsIgnorable (i))
1698                                         AddCharMap ((char) i, 0x1, 1);
1699                         fillIndex [0x1] = 0x72;
1700                         for (int i = 0x0346; i <= 0x0348; i++)
1701                                 if (!IsIgnorable (i))
1702                                         AddCharMap ((char) i, 0x1, 1);
1703                         for (int i = 0x02BE; i <= 0x02BF; i++)
1704                                 if (!IsIgnorable (i))
1705                                         AddCharMap ((char) i, 0x1, 1);
1706                         for (int i = 0x02C1; i <= 0x02C5; i++)
1707                                 if (!IsIgnorable (i))
1708                                         AddCharMap ((char) i, 0x1, 1);
1709                         for (int i = 0x02CE; i <= 0x02CF; i++)
1710                                 if (!IsIgnorable (i))
1711                                         AddCharMap ((char) i, 0x1, 1);
1712                         for (int i = 0x02D1; i <= 0x02D3; i++)
1713                                 if (!IsIgnorable (i))
1714                                         AddCharMap ((char) i, 0x1, 1);
1715                         AddCharMap ('\u02DE', 0x1, 1);
1716                         for (int i = 0x02E4; i <= 0x02E9; i++)
1717                                 if (!IsIgnorable (i))
1718                                         AddCharMap ((char) i, 0x1, 1);
1719
1720                         // FIXME: needs more love here (it should eliminate
1721                         // all the hacky code above).
1722                         for (int i = 0x0300; i < 0x0370; i++)
1723                                 if (!IsIgnorable (i) && diacritical [i] != 0
1724                                         /* especiall here*/ && !map [i].Defined)
1725                                         map [i] = new CharMapEntry (
1726                                                 0x1, 0x1, diacritical [i]);
1727
1728                         fillIndex [0x1] = 0x94;
1729                         // syriac dotted nonspacing marks
1730                         AddCharMap ('\u0732', 0x1, 1);
1731                         AddCharMap ('\u0735', 0x1, 1);
1732                         AddCharMap ('\u0738', 0x1, 1);
1733                         AddCharMap ('\u0739', 0x1, 1);
1734                         AddCharMap ('\u073C', 0x1, 1);
1735                         fillIndex [0x1] = 0x9F;
1736                         for (int i = 0x0730; i <= 0x07B0; i++)
1737                                 if (!IsIgnorable (i) && !map [i].Defined)
1738                                         AddCharMap ((char) i, 0x1, 1);
1739
1740                         fillIndex [0x1] = 0x0C;
1741                         for (int i = 0x0EC8; i <= 0x0ECD; i++)
1742                                 if (!IsIgnorable (i))
1743                                         AddCharMap ((char) i, 0x1, 1);
1744
1745                         // LAMESPEC: It should not stop at '\u20E1'. There are
1746                         // a few more characters (that however result in
1747                         // overflow of level 2 unless we start before 0xDD).
1748                         fillIndex [0x1] = 0xDD;
1749                         for (int i = 0x20d0; i <= 0x20e1; i++)
1750                                 AddCharMap ((char) i, 0x1, 1);
1751
1752                         // They are not part of Nonspacing marks, but have
1753                         // only diacritical weight.
1754                         for (int i = 0x3099; i <= 0x309C; i++)
1755                                 map [i] = new CharMapEntry (1, 1, 1);
1756                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1757                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1758                         for (int i = 0x30FC; i <= 0x30FE; i++)
1759                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1760
1761                         #endregion
1762
1763
1764                         #region Whitespaces // 07 03 -
1765                         fillIndex [0x7] = 0x2;
1766                         AddCharMap (' ', 0x7, 2);
1767                         AddCharMap ('\u00A0', 0x7, 1);
1768                         for (int i = 9; i <= 0xD; i++)
1769                                 AddCharMap ((char) i, 0x7, 1);
1770                         for (int i = 0x2000; i <= 0x200B; i++)
1771                                 AddCharMap ((char) i, 0x7, 1);
1772
1773                         fillIndex [0x7] = 0x17;
1774                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1775                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1776
1777                         // Characters which are used to represent layout control.
1778                         // LAMESPEC: Windows developers seem to have thought
1779                         // that those characters are kind of whitespaces,
1780                         // while they aren't.
1781                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1782                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1783                         #endregion
1784
1785                         // category 09 - continued symbols from 08
1786                         fillIndex [0x9] = 2;
1787                         // misc tech mark
1788                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1789                                 AddCharMap ((char) cp, 0x9, 1, 0);
1790
1791                         // arrows
1792                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1793                         foreach (DictionaryEntry de in arrowValues) {
1794                                 int idx = (int) de.Value;
1795                                 int cp = (int) de.Key;
1796                                 if (map [cp].Defined)
1797                                         continue;
1798                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1799                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1800                                 arrowLv2 [idx]++;
1801                         }
1802                         // boxes
1803                         byte [] boxLv2 = new byte [128];
1804                         for (int i = 0; i < boxLv2.Length; i++)
1805                                 boxLv2 [i] = 3;
1806                         foreach (DictionaryEntry de in boxValues) {
1807                                 int cp = (int) de.Key;
1808                                 int off = (int) de.Value;
1809                                 if (map [cp].Defined)
1810                                         continue;
1811                                 if (off < 0) {
1812                                         fillIndex [0x9] = (byte) (0xE5 + off);
1813                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1814                                 }
1815                                 else {
1816                                         fillIndex [0x9] = (byte) (0xE5 + off);
1817                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1818                                 }
1819                         }
1820                         // Some special characters (slanted)
1821                         fillIndex [0x9] = 0xF4;
1822                         AddCharMap ('\u2571', 0x9, 3);
1823                         AddCharMap ('\u2572', 0x9, 3);
1824                         AddCharMap ('\u2573', 0x9, 3);
1825
1826                         // FIXME: implement 0A
1827                         #region Symbols
1828                         fillIndex [0xA] = 2;
1829                         // byte currency symbols
1830                         for (int cp = 0; cp < 0x100; cp++) {
1831                                 uc = Char.GetUnicodeCategory ((char) cp);
1832                                 if (!IsIgnorable (cp) &&
1833                                         uc == UnicodeCategory.CurrencySymbol &&
1834                                         cp != '$' ||
1835                                         cp == 0xAC)
1836                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1837                         }
1838                         // byte other symbols
1839                         for (int cp = 0; cp < 0x100; cp++) {
1840                                 if (cp == 0xA6)
1841                                         continue; // SPECIAL: skip FIXME: why?
1842                                 uc = Char.GetUnicodeCategory ((char) cp);
1843                                 if (!IsIgnorable (cp) &&
1844                                         uc == UnicodeCategory.OtherSymbol ||
1845                                         cp == '\u00B5' || cp == '\u00B7')
1846                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1847                         }
1848
1849                         fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
1850                         for (int cp = 0x2020; cp <= 0x2031; cp++)
1851                                 if (Char.IsPunctuation ((char) cp))
1852                                         AddCharMap ((char) cp, 0xA, 1, 0);
1853                         // SPECIAL CASES: why?
1854                         AddCharMap ('\u203B', 0xA, 1, 0);
1855                         AddCharMap ('\u2040', 0xA, 1, 0);
1856                         AddCharMap ('\u2041', 0xA, 1, 0);
1857                         AddCharMap ('\u2042', 0xA, 1, 0);
1858
1859                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1860                                 AddCharMap ((char) cp, 0xA, 1, 0);
1861                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1862                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1863                                 AddCharMap ((char) cp, 0xA, 1, 0);
1864                         // Dingbats
1865                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1866                                 if (Char.IsSymbol ((char) cp))
1867                                         AddCharMap ((char) cp, 0xA, 1, 0);
1868                         // OCR
1869                         for (int i = 0x2440; i < 0x2460; i++)
1870                                 AddCharMap ((char) i, 0xA, 1, 0);
1871
1872                         #endregion
1873
1874                         #region Numbers // 0C 02 - 0C E1
1875                         fillIndex [0xC] = 2;
1876
1877                         // 9F8 : Bengali "one less than the denominator"
1878                         AddCharMap ('\u09F8', 0xC, 1);
1879
1880                         ArrayList numbers = new ArrayList ();
1881                         for (int i = 0; i < 65536; i++)
1882                                 if (!IsIgnorable (i) &&
1883                                         Char.IsNumber ((char) i) &&
1884                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1885                                         numbers.Add (i);
1886
1887                         ArrayList numberValues = new ArrayList ();
1888                         foreach (int i in numbers)
1889                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1890                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1891
1892 //foreach (DictionaryEntry de in numberValues)
1893 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1894
1895                         decimal prevValue = -1;
1896                         foreach (DictionaryEntry de in numberValues) {
1897                                 int cp = (int) de.Key;
1898                                 decimal currValue = (decimal) de.Value;
1899                                 bool addnew = false;
1900                                 if (prevValue < currValue &&
1901                                         prevValue - (int) prevValue == 0 &&
1902                                         prevValue >= 1) {
1903
1904                                         addnew = true;
1905                                         // Process Hangzhou and Roman numbers
1906
1907                                         // There are some SPECIAL cases.
1908                                         if (currValue != 4) // no increment for 4
1909                                                 fillIndex [0xC]++;
1910
1911                                         int xcp;
1912                                         if (currValue <= 10) {
1913                                                 xcp = (int) prevValue + 0x2170 - 1;
1914                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1915                                                 xcp = (int) prevValue + 0x2160 - 1;
1916                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1917                                                 fillIndex [0xC] += 2;
1918                                                 xcp = (int) prevValue + 0x3021 - 1;
1919                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1920                                                 fillIndex [0xC]++;
1921                                         }
1922                                         else if (currValue == 11)
1923                                                 fillIndex [0xC]++;
1924                                 }
1925                                 if (prevValue < currValue)
1926                                         prevValue = currValue;
1927                                 if (map [cp].Defined)
1928                                         continue;
1929                                 // Hangzhou and Roman numbers are added later
1930                                 // (code is above)
1931                                 else if (0x3021 <= cp && cp < 0x302A
1932                                         || 0x2160 <= cp && cp < 0x216A
1933                                         || 0x2170 <= cp && cp < 0x217A)
1934                                         continue;
1935
1936                                 if (cp ==  0x215B) // FIXME: why?
1937                                         fillIndex [0xC] += 2;
1938                                 else if (cp == 0x3021) // FIXME: why?
1939                                         fillIndex [0xC]++;
1940                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1941                                 if (addnew || cp <= '9') {
1942                                         int mod = (int) currValue - 1;
1943                                         int xcp;
1944                                         if (1 <= currValue && currValue <= 10) {
1945                                                 xcp = mod + 0x2776;
1946                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1947                                                 xcp = mod + 0x2780;
1948                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1949                                                 xcp = mod + 0x278A;
1950                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1951                                         }
1952                                         if (1 <= currValue && currValue <= 20) {
1953                                                 xcp = mod + 0x2460;
1954                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1955                                                 xcp = mod + 0x2474;
1956                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1957                                                 xcp = mod + 0x2488;
1958                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1959                                         }
1960                                 }
1961
1962                                 if (cp != 0x09E7 && cp != 0x09EA)
1963                                         fillIndex [0xC]++;
1964
1965                                 // Add special cases that are not regarded as
1966                                 // numbers in UnicodeCategory speak.
1967                                 if (cp == '5') {
1968                                         // TONE FIVE
1969                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1970                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1971                                 }
1972                                 else if (cp == '6') // FIXME: why?
1973                                         fillIndex [0xC]++;
1974                         }
1975
1976                         // 221E: infinity
1977                         fillIndex [0xC] = 0xFF;
1978                         AddCharMap ('\u221E', 0xC, 1);
1979                         #endregion
1980
1981                         #region Letters and NonSpacing Marks (general)
1982
1983                         // ASCII Latin alphabets
1984                         for (int i = 0; i < alphabets.Length; i++)
1985                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1986
1987
1988                         // non-ASCII Latin alphabets
1989                         // FIXME: there are no such characters that are placed
1990                         // *after* "alphabets" array items. This is nothing
1991                         // more than a hack that creates dummy weight for
1992                         // primary characters.
1993                         for (int i = 0x0080; i < 0x0300; i++) {
1994                                 if (!Char.IsLetter ((char) i))
1995                                         continue;
1996                                 // Latin letters which have an NFKD mapping are
1997                                 // not added as independent primary characters.
1998                                 if (decompIndex [i] != 0)
1999                                         continue;
2000                                 // SPECIAL CASES:
2001                                 // 1.some alphabets have primarily
2002                                 //   equivalent ASCII alphabets.
2003                                 // 2.some have independent primary weights,
2004                                 //   but inside a-to-z range.
2005                                 // 3.there are some expanded characters that
2006                                 //   are not part of Unicode Standard NFKD.
2007                                 // 4. some characters are letters according to IsLetter
2008                                 //   but not in sortkeys (maybe a Unicode version
2009                                 //   difference caused it).
2010                                 switch (i) {
2011                                 // 1. skipping them does not make sense
2012 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2013 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2014 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2015 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2016 //                              case 0x19B: case 0x19C:
2017                                 // 2. skipping them does not make sense
2018 //                              case 0x14A: // Ng
2019 //                              case 0x14B: // ng
2020                                 // 3.
2021                                 case 0xC6: // AE
2022                                 case 0xE6: // ae
2023                                 case 0xDE: // Icelandic Thorn
2024                                 case 0xFE: // Icelandic Thorn
2025                                 case 0xDF: // German ss
2026                                 case 0xFF: // German ss
2027                                 // 4.
2028                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2029                                 // not classified yet
2030 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2031 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2032 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2033 //                              case 0x1DD:
2034                                         continue;
2035                                 }
2036                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2037                         }
2038
2039                         // Greek and Coptic
2040                         fillIndex [0xF] = 02;
2041                         for (int i = 0x0380; i < 0x0390; i++)
2042                                 if (Char.IsLetter ((char) i))
2043                                         AddLetterMap ((char) i, 0xF, 1);
2044                         fillIndex [0xF] = 02;
2045                         for (int i = 0x0391; i < 0x03CF; i++)
2046                                 if (Char.IsLetter ((char) i))
2047                                         AddLetterMap ((char) i, 0xF, 1);
2048                         fillIndex [0xF] = 0x40;
2049                         for (int i = 0x03D0; i < 0x0400; i++)
2050                                 if (Char.IsLetter ((char) i))
2051                                         AddLetterMap ((char) i, 0xF, 1);
2052
2053                         // Cyrillic.
2054                         // Cyrillic letters are sorted like Latin letters i.e.
2055                         // containing culture-specific letters between the
2056                         // standard Cyrillic sequence.
2057                         //
2058                         // We can't use UCA here; it has different sorting.
2059                         char [] orderedCyrillic = new char [] {
2060                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2061                                 '\u0452', // DJE for Serbocroatian
2062                                 '\u0435',
2063                                 '\u0454', // IE for Ukrainian
2064                                 '\u0436', '\u0437',
2065                                 '\u0455', // DZE
2066                                 '\u0438',
2067                                 '\u0456', // Byelorussian-Ukrainian I
2068                                 '\u0457', // YI
2069                                 '\u0439',
2070                                 '\u0458', // JE
2071                                 '\u043A', '\u043B',
2072                                 '\u0459', // LJE
2073                                 '\u043C', '\u043D',
2074                                 '\u045A', // NJE
2075                                 '\u043E',
2076                                 // 4E9 goes here.
2077                                 '\u043F', '\u0440', '\u0441', '\u0442',
2078                                 '\u045B', // TSHE for Serbocroatian
2079                                 '\u0443',
2080                                 '\u045E', // Short U for Byelorussian
2081                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2082                                 '\u0444', '\u0445', '\u0446', '\u0447',
2083                                 '\u045F', // DZHE
2084                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2085                                 '\u044D', '\u044E', '\u044F'};
2086
2087                         // For some characters here is a map to basic cyrillic
2088                         // letters. See UnicodeData.txt character names for
2089                         // the sources. Here I simply declare an equiv. array.
2090                         // The entries correspond to every second code point
2091                         // from U+0490 (U+0490, U+0492, ...), skipping small letters.
2092                         char [] cymap_src = new char [] {
2093                                 '\u0433', '\u0433', '\u0433', '\u0436',
2094                                 '\u0437', '\u043A', '\u043A', '\u043A',
2095                                 '\u043A', '\u043D', '\u043D', '\u043F',
2096                                 '\u0445', '\u0441', '\u0442', '\u0443',
2097                                 '\u0443', '\u0445', '\u0446', '\u0447',
2098                                 '\u0447', '\u0432', '\u0435', '\u0435',
2099                                 '\u0406', '\u0436', '\u043A', '\u043D',
2100                                 '\u0447', '\u0435'};
2101
2102                         fillIndex [0x10] = 0x8D;
2103                         for (int i = 0x0460; i < 0x0481; i++) {
2104                                 if (Char.IsLetter ((char) i)) {
2105                                         if (i == 0x0476)
2106                                                 // U+476/477 have the same
2107                                                 // primary weight as U+474/475.
2108                                                 fillIndex [0x10] -= 3;
2109                                         AddLetterMap ((char) i, 0x10, 3);
2110                                 }
2111                         }
2112
2113                         fillIndex [0x10] = 0x6;
2114                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2115                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2116                                 if (!IsIgnorable ((int) c) &&
2117                                         Char.IsLetter (c) &&
2118                                         !map [c].Defined) {
2119                                         AddLetterMap (c, 0x10, 0);
2120                                         fillIndex [0x10] += 3;
2121                                 }
2122                         }
2123
2124                         for (int i = 0; i < cymap_src.Length; i++) {
2125                                 char c = cymap_src [i];
2126                                 fillIndex [0x10] = map [c].Level1;
2127                                 AddLetterMap ((char) (0x0490 + i * 2),
2128                                         0x10, 0);
2129                         }
2130
2131                         // Armenian
2132                         fillIndex [0x11] = 0x3;
2133                         for (int i = 0x0531; i < 0x0586; i++)
2134                                 if (Char.IsLetter ((char) i))
2135                                         AddLetterMap ((char) i, 0x11, 1);
2136
2137                         // Hebrew
2138                         // -Letters
2139                         fillIndex [0x12] = 0x2;
2140                         for (int i = 0x05D0; i < 0x05FF; i++)
2141                                 if (Char.IsLetter ((char) i))
2142                                         AddLetterMap ((char) i, 0x12, 1);
2143                         // -Accents
2144                         fillIndex [0x1] = 0x3;
2145                         for (int i = 0x0591; i <= 0x05C2; i++) {
2146                                 if (i == 0x05A3 || i == 0x05BB)
2147                                         fillIndex [0x1]++;
2148                                 if (i != 0x05BE)
2149                                         AddCharMap ((char) i, 0x1, 1);
2150                         }
2151
2152                         // Arabic
2153                         fillIndex [0x1] = 0x8E;
2154                         fillIndex [0x13] = 0x3;
2155                         for (int i = 0x0621; i <= 0x064A; i++) {
2156                                 // Abjad
2157                                 if (Char.GetUnicodeCategory ((char) i)
2158                                         != UnicodeCategory.OtherLetter) {
2159                                         // FIXME: arabic nonspacing marks are
2160                                         // in different order.
2161                                         AddCharMap ((char) i, 0x1, 1);
2162                                         continue;
2163                                 }
2164 //                              map [i] = new CharMapEntry (0x13,
2165 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2166                                 fillIndex [0x13] =
2167                                         (byte) arabicLetterPrimaryValues [i];
2168                                 AddLetterMap ((char) i, 0x13, 0);
2169                         }
2170                         fillIndex [0x13] = 0x84;
2171                         for (int i = 0x0674; i < 0x06D6; i++)
2172                                 if (Char.IsLetter ((char) i))
2173                                         AddLetterMap ((char) i, 0x13, 1);
2174
2175                         // Devanagari
2176                         // FIXME: it does seem straight codepoint mapping.
2177                         fillIndex [0x14] = 04;
2178                         for (int i = 0x0901; i < 0x0905; i++)
2179                                 if (!IsIgnorable (i))
2180                                         AddLetterMap ((char) i, 0x14, 2);
2181                         fillIndex [0x14] = 0xB;
2182                         for (int i = 0x0905; i < 0x093A; i++) {
2183                                 if (i == 0x0928)
2184                                         AddCharMap ('\u0929', 0x14, 0, 8);
2185                                 if (i == 0x0930)
2186                                         AddCharMap ('\u0931', 0x14, 0, 8);
2187                                 if (i == 0x0933)
2188                                         AddCharMap ('\u0934', 0x14, 0, 8);
2189                                 if (Char.IsLetter ((char) i))
2190                                         AddLetterMap ((char) i, 0x14, 4);
2191                                 if (i == 0x090B)
2192                                         AddCharMap ('\u0960', 0x14, 4);
2193                                 if (i == 0x090C)
2194                                         AddCharMap ('\u0961', 0x14, 4);
2195                         }
2196                         fillIndex [0x14] = 0xDA;
2197                         for (int i = 0x093E; i < 0x0945; i++)
2198                                 if (!IsIgnorable (i))
2199                                         AddLetterMap ((char) i, 0x14, 2);
2200                         fillIndex [0x14] = 0xEC;
2201                         for (int i = 0x0945; i < 0x094F; i++)
2202                                 if (!IsIgnorable (i))
2203                                         AddLetterMap ((char) i, 0x14, 2);
2204
2205                         // Bengali
2206                         // -Letters
2207                         fillIndex [0x15] = 02;
2208                         for (int i = 0x0980; i < 0x9FF; i++) {
2209                                 if (IsIgnorable (i))
2210                                         continue;
2211                                 if (i == 0x09E0)
2212                                         fillIndex [0x15] = 0x3B;
2213                                 switch (Char.GetUnicodeCategory ((char) i)) {
2214                                 case UnicodeCategory.NonSpacingMark:
2215                                 case UnicodeCategory.DecimalDigitNumber:
2216                                 case UnicodeCategory.OtherNumber:
2217                                         continue;
2218                                 }
2219                                 AddLetterMap ((char) i, 0x15, 1);
2220                         }
2221                         // -Signs
2222                         fillIndex [0x1] = 0x3;
2223                         for (int i = 0x0981; i < 0x0A00; i++)
2224                                 if (Char.GetUnicodeCategory ((char) i) ==
2225                                         UnicodeCategory.NonSpacingMark)
2226                                         AddCharMap ((char) i, 0x1, 1);
2227
2228                         // Gurmukhi. orderedGurmukhi is from UCA
2229                         // FIXME: it does not look equivalent to UCA.
2230                         fillIndex [0x16] = 04;
2231                         fillIndex [0x1] = 3;
2232                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2233                                 char c = orderedGurmukhi [i];
2234                                 if (IsIgnorable ((int) c))
2235                                         continue;
2236                                 if (IsIgnorableNonSpacing (c)) {
2237                                         AddLetterMap (c, 0x1, 1);
2238                                         continue;
2239                                 }
2240                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2241                                         '\u0A66' <= c && c <= '\u0A71')
2242                                         continue;
2243                                 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2244                                 byte shift = 4;
2245                                 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2246                                         shift = 0;
2247                                 AddLetterMap (c, 0x16, shift);
2248                         }
2249
2250                         // Gujarati. orderedGujarati is from UCA
2251                         fillIndex [0x17] = 0x4;
2252                         // nonspacing marks
2253                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2254                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2255                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2256                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2257                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2258                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2259                         // letters go first.
2260                         for (int i = 0; i < orderedGujarati.Length; i++) {
2261                                 // SPECIAL CASE
2262                                 char c = orderedGujarati [i];
2263                                 if (Char.IsLetter (c)) {
2264                                         // SPECIAL CASES
2265                                         if (c == '\u0AB3' || c == '\u0A32')
2266                                                 continue;
2267                                         if (c == '\u0A33') {
2268                                                 AddCharMap ('\u0A32', 0x17, 0);
2269                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2270                                                 continue;
2271                                         }
2272                                         if (c == '\u0A8B')
2273                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2274                                         AddCharMap (c, 0x17, 4);
2275
2276                                         if (c == '\u0AB9')
2277                                                 AddCharMap ('\u0AB3', 0x17, 6);
2278                                 }
2279                         }
2280                         // non-letters
2281                         byte gujaratiShift = 4;
2282                         fillIndex [0x17] = 0xC0;
2283                         for (int i = 0; i < orderedGujarati.Length; i++) {
2284                                 char c = orderedGujarati [i];
2285                                 if (fillIndex [0x17] == 0xCC)
2286                                         gujaratiShift = 3;
2287                                 if (!Char.IsLetter (c)) {
2288                                         // SPECIAL CASES
2289                                         if (c == '\u0A82')
2290                                                 AddCharMap ('\u0A81', 0x17, 2);
2291                                         if (c == '\u0AC2')
2292                                                 fillIndex [0x17]++;
2293                                         AddLetterMap (c, 0x17, gujaratiShift);
2294                                 }
2295                         }
2296
2297                         // Oriya
2298                         fillIndex [0x1] = 03;
2299                         fillIndex [0x18] = 02;
2300                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2301                                 switch (Char.GetUnicodeCategory ((char) i)) {
2302                                 case UnicodeCategory.NonSpacingMark:
2303                                 case UnicodeCategory.DecimalDigitNumber:
2304                                         AddLetterMap ((char) i, 0x1, 1);
2305                                         continue;
2306                                 }
2307                                 AddLetterMap ((char) i, 0x18, 1);
2308                         }
2309
2310                         // Tamil
2311                         fillIndex [0x19] = 2;
2312                         AddCharMap ('\u0BD7', 0x19, 0);
2313                         fillIndex [0x19] = 0xA;
2314                         // vowels
2315                         for (int i = 0x0B82; i <= 0x0B94; i++)
2316                                 if (!IsIgnorable ((char) i))
2317                                         AddCharMap ((char) i, 0x19, 2);
2318                         // special vowel
2319                         fillIndex [0x19] = 0x28;
2320                         // The array for Tamil consonants is a constant.
2321                         // Windows has almost the same sequence as TAM from
2322                         // tamilnet, but it differs slightly in Grantha.
2323                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2324                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2325                         // combining marks
2326                         fillIndex [0x19] = 0x82;
2327                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2328                                 if (Char.GetUnicodeCategory ((char) i) ==
2329                                         UnicodeCategory.SpacingCombiningMark
2330                                         || i == 0x0BC0)
2331                                         AddLetterMap ((char) i, 0x19, 2);
2332
2333                         // Telugu
2334                         fillIndex [0x1A] = 0x4;
2335                         for (int i = 0x0C00; i < 0x0C62; i++) {
2336                                 if (i == 0x0C55 || i == 0x0C56)
2337                                         continue; // skip
2338                                 AddCharMap ((char) i, 0x1A, 3);
2339                                 char supp = (i == 0x0C0B) ? '\u0C60':
2340                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2341                                 if (supp == char.MinValue)
2342                                         continue;
2343                                 AddCharMap (supp, 0x1A, 3);
2344                         }
2345
2346                         // Kannada
2347                         fillIndex [0x1B] = 4;
2348                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2349                                 if (i == 0x0CD5 || i == 0x0CD6)
2350                                         continue; // ignore
2351                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2352                                         continue; // shift after 0xCB9
2353                                 AddCharMap ((char) i, 0x1B, 3);
2354                                 if (i == 0x0CB9) {
2355                                         // SPECIAL CASES: but why?
2356                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2357                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2358                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2359                                 }
2360                                 if (i == 0x0CB2)
2361                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2362                         }
2363
2364                         // Malayalam
2365                         fillIndex [0x1C] = 2;
2366                         for (int i = 0x0D02; i < 0x0D61; i++)
2367                                 // FIXME: I avoided MSCompatUnicodeTable usage
2368                                 // here (it results in recursion). So check if
2369                                 // using NonSpacingMark makes sense or not.
2370                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2371 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2372                                         AddCharMap ((char) i, 0x1C, 1);
2373
2374                         // Thai ... note that it breaks 0x1E wall after E2B!
2375                         // Also, all Thai characters have level 2 value 3.
2376                         fillIndex [0x1E] = 2;
2377                         for (int i = 0xE40; i <= 0xE44; i++)
2378                                 AddCharMap ((char) i, 0x1E, 1, 3);
2379                         for (int i = 0xE01; i < 0xE2B; i++)
2380                                 AddCharMap ((char) i, 0x1E, 6, 3);
2381                         fillIndex [0x1F] = 5;
2382                         for (int i = 0xE2B; i < 0xE30; i++)
2383                                 AddCharMap ((char) i, 0x1F, 6, 3);
2384                         fillIndex [0x1F] = 0x1E;
2385                         for (int i = 0xE30; i < 0xE3B; i++)
2386                                 AddCharMap ((char) i, 0x1F, 1, 3);
2387                         // some Thai characters remain.
2388                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2389                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2390                         foreach (char c in specialThai)
2391                                 AddCharMap (c, 0x1F, 1);
2392
2393                         // Lao
2394                         fillIndex [0x1F] = 2;
2395                         for (int i = 0xE80; i < 0xEDF; i++)
2396                                 if (Char.IsLetter ((char) i))
2397                                         AddCharMap ((char) i, 0x1F, 1);
2398
2399                         // Georgian. orderedGeorgian is from UCA DUCET.
2400                         fillIndex [0x21] = 5;
2401                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2402                                 char c = orderedGeorgian [i];
2403                                 if (map [(int) c].Defined)
2404                                         continue;
2405                                 AddCharMap (c, 0x21, 0);
2406                                 if (c < '\u10F6')
2407                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2408                                 fillIndex [0x21] += 5;
2409                         }
2410
2411                         // Japanese Kana.
2412                         fillIndex [0x22] = 2;
2413                         int kanaOffset = 0x3041;
2414                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2415
2416                         for (int gyo = 0; gyo < 9; gyo++) {
2417                                 for (int dan = 0; dan < 5; dan++) {
2418                                         if (gyo == 7 && dan % 2 == 1) {
2419                                                 // 'ya'-gyo
2420                                                 fillIndex [0x22]++;
2421                                                 kanaOffset -= 2; // There is no space for yi and ye.
2422                                                 continue;
2423                                         }
2424                                         int cp = kanaOffset + dan * kanaLines [gyo];
2425                                         // small lines (a-gyo, ya-gyo)
2426                                         if (gyo == 0 || gyo == 7) {
2427                                                 AddKanaMap (cp, 1); // small
2428                                                 AddKanaMap (cp + 1, 1);
2429                                         }
2430                                         else
2431                                                 AddKanaMap (cp, kanaLines [gyo]);
2432                                         fillIndex [0x22]++;
2433
2434                                         if (cp == 0x30AB) {
2435                                                 // add small 'ka' (before normal one)
2436                                                 AddKanaMap (0x30F5, 1);
2437                                                 kanaOffset++;
2438                                         }
2439                                         if (cp == 0x30B1) {
2440                                                 // add small 'ke' (before normal one)
2441                                                 AddKanaMap (0x30F6, 1);
2442                                                 kanaOffset++;
2443                                         }
2444                                         if (cp == 0x3061) {
2445                                                 // add small 'Tsu' (before normal one)
2446                                                 AddKanaMap (0x3063, 1);
2447                                                 kanaOffset++;
2448                                         }
2449                                 }
2450                                 fillIndex [0x22] += 3;
2451                                 kanaOffset += 5 * kanaLines [gyo];
2452                         }
2453
2454                         // Wa-gyo is almost special, so I just manually add.
2455                         AddLetterMap ((char) 0x308E, 0x22, 0);
2456                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2457                         AddLetterMap ((char) 0x308F, 0x22, 0);
2458                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2459                         fillIndex [0x22]++;
2460                         AddLetterMap ((char) 0x3090, 0x22, 0);
2461                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2462                         fillIndex [0x22] += 2;
2463                         // no "Wu" in Japanese.
2464                         AddLetterMap ((char) 0x3091, 0x22, 0);
2465                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2466                         fillIndex [0x22]++;
2467                         AddLetterMap ((char) 0x3092, 0x22, 0);
2468                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2469                         // Nn
2470                         fillIndex [0x22] = 0x80;
2471                         AddLetterMap ((char) 0x3093, 0x22, 0);
2472                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2473
2474                         // JIS Japanese square chars.
2475                         fillIndex [0x22] = 0x97;
2476                         jisJapanese.Sort (JISComparer.Instance);
2477                         foreach (JISCharacter j in jisJapanese)
2478                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2479                                         AddCharMap ((char) j.CP, 0x22, 1);
2480                         // non-JIS Japanese square chars.
2481                         nonJisJapanese.Sort (NonJISComparer.Instance);
2482                         foreach (NonJISCharacter j in nonJisJapanese)
2483                                 AddCharMap ((char) j.CP, 0x22, 1);
2484
2485                         // Bopomofo
2486                         fillIndex [0x23] = 0x02;
2487                         for (int i = 0x3105; i <= 0x312C; i++)
2488                                 AddCharMap ((char) i, 0x23, 1);
2489
2490                         // Estrangela: ancient Syriac
2491                         fillIndex [0x24] = 0x0B;
2492                         // FIXME: is 0x71E really alternative form?
2493                         ArrayList syriacAlternatives = new ArrayList (
2494                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2495                         for (int i = 0x0710; i <= 0x072C; i++) {
2496                                 if (i == 0x0711) // NonSpacingMark
2497                                         continue;
2498                                 if (syriacAlternatives.Contains (i))
2499                                         continue;
2500                                 AddCharMap ((char) i, 0x24, 4);
2501                                 // FIXME: why?
2502                                 if (i == 0x721)
2503                                         fillIndex [0x24]++;
2504                         }
2505                         foreach (int cp in syriacAlternatives)
2506                                 map [cp] = new CharMapEntry (0x24,
2507                                         (byte) (map [cp - 1].Level1 + 2),
2508                                         0);
2509                         // FIXME: Syriac NonSpacingMark should go here.
2510
2511                         // Thaana
2512                         // FIXME: it turned out that it does not look like UCA
2513                         fillIndex [0x24] = 0x6E;
2514                         for (int i = 0; i < orderedThaana.Length; i++) {
2515                                 char c = orderedThaana [i];
2516                                 if (IsIgnorableNonSpacing ((int) c))
2517                                         continue;
2518                                 AddCharMap (c, 0x24, 2);
2519                                 if (c == '\u0782') // SPECIAL CASE: why?
2520                                         fillIndex [0x24] += 2;
2521                         }
2522                         #endregion
2523
2524                         // FIXME: Add more culture-specific letters (that are
2525                         // not supported in Windows collation) here.
2526
2527                         // Surrogate ... they are computed.
2528
2529                         #region Hangul
2530                         // Hangul.
2531                         //
2532                         // Unlike UCA Windows Hangul sequence mixes Jongseong
2533                         // with Choseong sequence as well as Jungseong,
2534                         // adjusted to have the same primary weight for the
2535                         // same base character. So it is impossible to compute
2536                         // those sort keys.
2537                         //
2538                         // Here I introduce an ordered sequence of mixed
2539                         // 'commands' and 'characters' that is similar to
2540                         // LDML text:
2541                         //      - ',' increases primary weight.
2542                         //      - [A B] means a range, increasing index
2543                         //      - {A B} means a range, without increasing index
2544                         //      - '=' is no operation (it means the characters
2545                         //        of both sides have the same weight).
2546                         //      - '>' inserts a Hangul Syllable block that
2547                         //        contains 0x24C (= 0x15 * 0x1C) characters.
2548                         //      - '<' decreases the index
2549                         //      - '0'-'9' means skip count (not handled by the loop below)
2550                         //      - whitespaces are ignored
2551                         //
2552
2553                         string hangulSequence =
2554                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2555                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2556                         + "<{\u1113 \u1116}, \u3165,"
2557                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2558                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2559                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2560                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2561                                 + "[\u11D1 \u11D2], \u11B2,"
2562                                 + "[\u11D3 \u11D5], \u11B3,"
2563                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2564                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2565                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2566                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2567                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2568                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2569                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2570                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2571                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2572                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2573                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2574                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2575                                 + "\u11F1,, \u11F2,,,"
2576                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2577                         + "<\u114D, \u110D,,  >"
2578                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2579                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2580                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2581                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2582                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2583                                 + "[\u11F5 \u11F8]"
2584                         ;
2585
2586                         byte hangulCat = 0x52;
2587                         fillIndex [hangulCat] = 0x2;
2588
2589                         int syllableBlock = 0;
2590                         for (int n = 0; n < hangulSequence.Length; n++) {
2591                                 char c = hangulSequence [n];
2592                                 int start, end;
2593                                 if (Char.IsWhiteSpace (c))
2594                                         continue;
2595                                 switch (c) {
2596                                 case '=':
2597                                         break; // NOP
2598                                 case ',':
2599                                         IncrementSequentialIndex (ref hangulCat);
2600                                         break;
2601                                 case '<':
2602                                         if (fillIndex [hangulCat] == 2)
2603                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2604                                         fillIndex [hangulCat]--;
2605                                         break;
2606                                 case '>':
2607                                         IncrementSequentialIndex (ref hangulCat);
2608                                         for (int l = 0; l < 0x15; l++)
2609                                                 for (int v = 0; v < 0x1C; v++) {
2610                                                         AddCharMap (
2611                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2612                                                         IncrementSequentialIndex (ref hangulCat);
2613                                                 }
2614                                         syllableBlock++;
2615                                         break;
2616                                 case '[':
2617                                         start = hangulSequence [n + 1];
2618                                         end = hangulSequence [n + 3];
2619                                         for (int i = start; i <= end; i++) {
2620                                                 AddCharMap ((char) i, hangulCat, 0);
2621                                                 if (end > i)
2622                                                         IncrementSequentialIndex (ref hangulCat);
2623                                         }
2624                                         n += 4; // consumes 5 characters for this operation
2625                                         break;
2626                                 case '{':
2627                                         start = hangulSequence [n + 1];
2628                                         end = hangulSequence [n + 3];
2629                                         for (int i = start; i <= end; i++)
2630                                                 AddCharMap ((char) i, hangulCat, 0);
2631                                         n += 4; // consumes 5 characters for this operation
2632                                         break;
2633                                 default:
2634                                         AddCharMap (c, hangulCat, 0);
2635                                         break;
2636                                 }
2637                         }
2638
2639                         // Some Jamo NFKD.
2640                         for (int i = 0x3200; i < 0x3300; i++) {
2641                                 if (IsIgnorable (i) || map [i].Defined)
2642                                         continue;
2643                                 int ch = 0;
2644                                 // w/ bracket
2645                                 if (decompLength [i] == 4 &&
2646                                         decompValues [decompIndex [i]] == '(')
2647                                         ch = decompIndex [i] + 1;
2648                                 // circled
2649                                 else if (decompLength [i] == 2 &&
2650                                         decompValues [decompIndex [i] + 1] == '\u1161')
2651                                         ch = decompIndex [i];
2652                                 else if (decompLength [i] == 1)
2653                                         ch = decompIndex [i];
2654                                 else
2655                                         continue;
2656                                 ch = decompValues [ch];
2657                                 if (ch < 0x1100 || 0x1200 < ch &&
2658                                         ch < 0xAC00 || 0xD800 < ch)
2659                                         continue;
2660
2661                                 // SPECIAL CASE ?
2662                                 int offset = i < 0x3260 ? 1 : 0;
2663                                 if (0x326E <= i && i <= 0x3273)
2664                                         offset = 1;
2665
2666                                 map [i] = new CharMapEntry (map [ch].Category,
2667                                         (byte) (map [ch].Level1 + offset),
2668                                         map [ch].Level2);
2669 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2670                         }
2671
2672
2673                         #endregion
2674
2675                         // Letterlike characters and CJK compatibility square
2676                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2677                         int [] counts = new int ['Z' - 'A' + 1];
2678                         char [] namedChars = new char [sortableCharNames.Count];
2679                         int nCharNames = 0;
2680                         foreach (DictionaryEntry de in sortableCharNames) {
2681                                 counts [((string) de.Value) [0] - 'A']++;
2682                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2683                         }
2684                         nCharNames = 0; // reset
2685                         for (int a = 0; a < counts.Length; a++) {
2686                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2687                                 for (int i = 0; i < counts [a]; i++)
2688 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2689                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2690                         }
2691
2692                         // CJK unified ideograph.
2693                         byte cjkCat = 0x9E;
2694                         fillIndex [cjkCat] = 0x2;
2695                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2696                                 if (!IsIgnorable (cp))
2697                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2698                         // CJK Extensions go here.
2699                         // LAMESPEC: With this Windows style CJK layout, it is
2700                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2701                         // 0x9FBB can never be added w/o breaking compat.
2702                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2703                                 if (!IsIgnorable (cp))
2704                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2705
2706                         // PrivateUse ... computed.
2707                         // remaining Surrogate ... computed.
2708
2709                         #region Special "biggest" area (FF FF)
2710                         fillIndex [0xFF] = 0xFF;
2711                         char [] specialBiggest = new char [] {
2712                                 '\u3005', '\u3031', '\u3032', '\u309D',
2713                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2714                                 '\uFE7C', '\uFE7D', '\uFF70'};
2715                         foreach (char c in specialBiggest)
2716                                 AddCharMap (c, 0xFF, 0);
2717                         #endregion
2718
2719                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2720                         // non-alphanumeric ASCII except for: + - < = > '
2721                         for (int i = 0x21; i < 0x7F; i++) {
2722                                 if (Char.IsLetterOrDigit ((char) i)
2723                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2724                                         continue; // they are not added here.
2725                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2726                                 // Insert 3001 after ',' and 3002 after '.'
2727                                 if (i == 0x2C)
2728                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2729                                 else if (i == 0x2E)
2730                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2731                                 else if (i == 0x3A)
2732                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2733                         }
2734                         #endregion
2735
2736                         #region 07 - Punctuations and something else
2737                         for (int i = 0xA0; i < char.MaxValue; i++) {
2738                                 if (IsIgnorable (i))
2739                                         continue;
2740
2741                                 // FIXME: actually those reset should not be
2742                                 // done but here I put for easy goal.
2743                                 if (i == 0x0700)
2744                                         fillIndex [0x7] = 0xE2;
2745                                 if (i == 0x2016)
2746                                         fillIndex [0x7] = 0x77;
2747
2748                                 // SPECIAL CASES:
2749                                 switch (i) {
2750                                 case 0xAB: // 08
2751                                 case 0xB7: // 0A
2752                                 case 0xBB: // 08
2753                                 case 0x2329: // 09
2754                                 case 0x232A: // 09
2755                                         continue;
2756                                 }
2757
2758                                 switch (Char.GetUnicodeCategory ((char) i)) {
2759                                 case UnicodeCategory.OtherPunctuation:
2760                                 case UnicodeCategory.ClosePunctuation:
2761                                 case UnicodeCategory.OpenPunctuation:
2762                                 case UnicodeCategory.InitialQuotePunctuation:
2763                                 case UnicodeCategory.FinalQuotePunctuation:
2764                                 case UnicodeCategory.ModifierSymbol:
2765                                         // SPECIAL CASES: // 0xA
2766                                         if (0x2020 <= i && i <= 0x2031)
2767                                                 continue;
2768                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2769                                         break;
2770                                 default:
2771                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2772                                                 goto case UnicodeCategory.OtherPunctuation;
2773                                         break;
2774                                 }
2775                         }
2776                         // Control pictures
2777                         // FIXME: it should not need to reset level 1, but
2778                         // it's for easy goal.
2779                         fillIndex [0x7] = 0xB6;
2780                         for (int i = 0x2400; i <= 0x2421; i++)
2781                                 AddCharMap ((char) i, 0x7, 1, 0);
2782                         #endregion
2783
2784                         // FIXME: for 07 xx we need more love.
2785
2786                         // Characters w/ diacritical marks (NFKD)
2787                         for (int i = 0; i <= char.MaxValue; i++) {
2788                                 if (map [i].Defined || IsIgnorable (i))
2789                                         continue;
2790                                 if (decompIndex [i] == 0)
2791                                         continue;
2792
2793                                 int start = decompIndex [i];
2794                                 int primaryChar = decompValues [start];
2795                                 int secondary = 0;
2796                                 bool skip = false;
2797                                 int length = decompLength [i];
2798                                 // special processing for parenthesized ones.
2799                                 if (length == 3 &&
2800                                         decompValues [start] == '(' &&
2801                                         decompValues [start + 2] == ')') {
2802                                         primaryChar = decompValues [start + 1];
2803                                         length = 1;
2804                                 }
2805
2806                                 if (map [primaryChar].Level1 == 0)
2807                                         continue;
2808
2809                                 for (int l = 1; l < length; l++) {
2810                                         int c = decompValues [start + l];
2811                                         if (map [c].Level1 != 0)
2812                                                 skip = true;
2813                                         secondary += diacritical [c];
2814                                 }
2815                                 if (skip)
2816                                         continue;
2817                                 map [i] = new CharMapEntry (
2818                                         map [primaryChar].Category,
2819                                         map [primaryChar].Level1,
2820                                         (byte) secondary);
2821
2822                         }
2823
2824                         // category 08 - symbols
2825                         fillIndex [0x8] = 2;
2826                         // Here Windows mapping is not straightforward. It is
2827                         // not based on computation but seems manual sorting.
2828                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
2829                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2830                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2831                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2832                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2833                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2834                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2835                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2836                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2837                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2838                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2839                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2840                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2841
2842                         for (int cp = 0; cp < 0x2300; cp++) {
2843                                 if (cp == 0xAC) // SPECIAL CASE: skip
2844                                         continue;
2845                                 if (cp == 0x200) {
2846                                         cp = 0x2200; // skip to 2200
2847                                         fillIndex [0x8] = 0x21;
2848                                 }
2849                                 if (cp == 0x2295)
2850                                         fillIndex [0x8] = 0x3;
2851                                 if (cp == 0x22B2)
2852                                         fillIndex [0x8] = 0xB9;
2853                                 if (!map [cp].Defined &&
2854 //                                      Char.GetUnicodeCategory ((char) cp) ==
2855 //                                      UnicodeCategory.MathSymbol)
2856                                         Char.IsSymbol ((char) cp))
2857                                         AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
2858                                 // SPECIAL CASES: no idea why Windows sorts as such
2859                                 switch (cp) {
2860                                 case 0x3E:
2861                                         AddCharMap ('\u227B', 0x8, 1, 0);
2862                                         AddCharMap ('\u22B1', 0x8, 1, 0);
2863                                         break;
2864                                 case 0xB1:
2865                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2866                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
2867                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2868                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
2869                                         break;
2870                                 case 0xF7:
2871                                         AddCharMap ('\u01C0', 0x8, 1, 0);
2872                                         AddCharMap ('\u01C1', 0x8, 1, 0);
2873                                         AddCharMap ('\u01C2', 0x8, 1, 0);
2874                                         break;
2875                                 }
2876                         }
2877
2878                         #region Level2 adjustment
2879                         // Arabic Hamzah
2880                         diacritical [0x624] = 0x5;
2881                         diacritical [0x626] = 0x7;
2882                         diacritical [0x622] = 0x9;
2883                         diacritical [0x623] = 0xA;
2884                         diacritical [0x625] = 0xB;
2885                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2886                         diacritical [0x64A] = 0x7; // Yaa'
2887
2888                         for (int i = 0; i < char.MaxValue; i++) {
2889                                 byte mod = 0;
2890                                 byte cat = map [i].Category;
2891                                 switch (cat) {
2892                                 case 0xE: // Latin diacritics
2893                                 case 0x22: // Japanese: circled characters
2894                                         mod = diacritical [i];
2895                                         break;
2896                                 case 0x13: // Arabic
2897                                         if (diacritical [i] == 0 && i >= 0xFE8D)
2898                                                 mod = 0x8; // default for arabic
2899                                         break;
2900                                 }
2901                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2902                                         mod = diacritical [i];
2903                                 if (mod > 0)
2904                                         map [i] = new CharMapEntry (
2905                                                 cat, map [i].Level1, mod);
2906                         }
2907                         #endregion
2908
2909                         // FIXME: this is hack but those NonSpacingMark
2910                         // characters and still undefined are likely to
2911                         // be nonspacing.
2912                         for (int i = 0; i < char.MaxValue; i++)
2913                                 if (!map [i].Defined &&
2914                                         !IsIgnorable (i) &&
2915                                         Char.GetUnicodeCategory ((char) i) ==
2916                                         UnicodeCategory.NonSpacingMark)
2917                                         AddCharMap ((char) i, 1, 1);
2918
2919                         // FIXME: this is hack but those Symbol characters
2920                         // are likely to fall into 0xA category.
2921                         for (int i = 0; i < char.MaxValue; i++)
2922                                 if (!map [i].Defined &&
2923                                         !IsIgnorable (i) &&
2924                                         Char.IsSymbol ((char) i))
2925                                         AddCharMap ((char) i, 0xA, 1);
2926                 }
2927
2928                 private void IncrementSequentialIndex (ref byte hangulCat)
2929                 {
                             // Advances the fill index of the category held in hangulCat.
                             // Once that index wraps around to 0 the category is used up:
                             // the caller's variable moves on to the next category, whose
                             // fill index restarts at 0x2.
2930                         fillIndex [hangulCat]++;
2931                         if (fillIndex [hangulCat] != 0)
2932                                 return; // not overflown, stay in this category
2933                         hangulCat++;
2934                         fillIndex [hangulCat] = 0x2;
2935                 }
2936
2937                 // Pins fillIndex [category] to alphaWeight, then maps c and every code
                     // point listed for c in latinMap through AddLetterMap().
2938                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2939                 {
2940                         fillIndex [category] = alphaWeight;
2941                         AddLetterMap (c, category, 0);
2942
2943                         ArrayList variants = latinMap [c] as ArrayList;
2944                         if (variants != null) {
2945                                 // updateCount stays 0 for each listed code point
2946                                 foreach (int variant in variants)
2947                                         AddLetterMap ((char) variant, category, 0);
2948                         }
2949                 }
2950
2951                 private void AddKanaMap (int i, byte voices)
2952                 {
                             // Maps `voices` consecutive code points starting at i, and for each
                             // one also the code point 0x60 above it, into category 0x22 with
                             // updateCount 0.
2953                         for (int voice = 0; voice < voices; voice++) {
2954                                 // the first form gets level 2 value 0, later ones voice + 2
2955                                 byte level2 = voice == 0 ? (byte) 0 : (byte) (voice + 2);
2956                                 char hiragana = (char) (i + voice);
2957                                 char katakana = (char) (hiragana + 0x60);
2958                                 AddLetterMapCore (hiragana, 0x22, 0, level2);
2959                                 AddLetterMapCore (katakana, 0x22, 0, level2);
2960                         }
2961                 }
2962
2963                 private void AddLetterMap (char c, byte category, byte updateCount)
2964                 {
                             // Same as AddLetterMapCore () with the level 2 value fixed to 0.
2965                         AddLetterMapCore (c, category, updateCount, 0);
2966                 }
2967
2968                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2969                 {
                             // Maps the letter c into category with the given level 2 value,
                             // together with two related characters, in this order:
                             //  1. when c has a <small> decomposition, the character returned by
                             //     ToSmallForm () is added as a group using updateCount;
                             //  2. when the invariant-culture lowercase of c differs from c and
                             //     is not mapped yet, it is processed recursively with
                             //     updateCount 0;
                             //  3. c itself is added as a group (updateCount 0) unless it is
                             //     ignorable or already mapped, and only in that case is
                             //     fillIndex [category] advanced by updateCount afterwards.
2970                         char c2;
2971                         // <small> updates index
2972                         c2 = ToSmallForm (c);
2973                         if (c2 != c)
2974                                 AddCharMapGroup (c2, category, updateCount, level2);
2975                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2976                         if (c2 != c && !map [(int) c2].Defined)
2977                                 AddLetterMapCore (c2, category, 0, level2);
                             // doUpdate records whether c itself was handed to AddCharMapGroup ().
2978                         bool doUpdate = true;
2979                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2980                                 doUpdate = false;
2981                         else
2982                                 AddCharMapGroup (c, category, 0, level2);
2983                         if (doUpdate)
2984                                 fillIndex [category] += updateCount;
2985                 }
2986
2987                 private bool AddCharMap (char c, byte category, byte increment)
2988                 {
                             // Overload that passes 0 as the `alt` value of the 4-argument
                             // AddCharMap () and returns its result.
2989                         return AddCharMap (c, category, increment, 0);
2990                 }
2991
2992                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2993                 {
                             // Writes a map entry for c unless c is ignorable or already mapped,
                             // and returns whether an entry was written. The category's current
                             // fill index and alt become the second and third constructor
                             // arguments; for category 1 the two are swapped. The fill index
                             // then advances by increment.
2994                         int cp = (int) c;
2995                         if (IsIgnorable (cp) || map [cp].Defined)
2996                                 return false; // do nothing
2997                         byte current = fillIndex [category];
2998                         map [cp] = category == 1 ? new CharMapEntry (category, alt, current) : new CharMapEntry (category, current, alt);
2999                         fillIndex [category] += increment;
3000                         return true;
3001                 }
3002
3003                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
3004                 {
                             // Maps c plus the characters reached through the LAST value of its
                             // decomposition (the *Tail helpers), in this order: the tail of a
                             // <small> decomposition of c (if c has one), c itself, and then,
                             // recursively, the tail of a <full> decomposition of c.
                             // Every AddCharMap () call here uses updateCount as its increment
                             // and 0 as its alt value.
3005                         char c2 = ToSmallFormTail (c);
3006                         if (c2 != c)
3007                                 AddCharMap (c2, category, updateCount, 0);
3008                         // itself
3009                         AddCharMap (c, category, updateCount, 0);
3010                         // <full>
3011                         c2 = ToFullWidthTail (c);
3012                         if (c2 != c)
3013                                 AddCharMapGroupTail (c2, category, updateCount);
3014                 }
3015
3016                 //
3017                 // AddCharMapGroup () adds characters to the table in the order below
3018                 // (+ increases weight):
3019                 //      (<small> +)
3020                 //      itself
3021                 //      <fraction>
3022                 //      <full> | <super> | <sub>
3023                 //      <circle> | <wide> (| <narrow>)
3024                 //      +
3025                 //      (vertical +)
3026                 //
3027                 // level2 is fixed (does not increase).
                     //
                     // sameWeightItems lists, in mapping order, the decomposition kinds
                     // that AddCharMapGroup () maps without advancing the fill index,
                     // i.e. at the same weight as the character itself.
3028                 int [] sameWeightItems = new int [] {
3029                         DecompositionFraction,
3030                         DecompositionFull,
3031                         DecompositionSuper,
3032                         DecompositionSub,
3033                         DecompositionCircle,
3034                         DecompositionWide,
3035                         DecompositionNarrow,
3036                         };
3037                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3038                 {
                             // Maps c and the related characters recorded for it in nfkdMap.
                             // Does nothing at all when c is already mapped.
                             // fillIndex [category] is advanced by updateCount after c and its
                             // same-weight forms, regardless of whether those AddCharMap () calls
                             // actually wrote an entry.
3039                         if (map [(int) c].Defined)
3040                                 return;
3041
                             // char.MinValue doubles as "no such form" for small / vertical.
3042                         char small = char.MinValue;
3043                         char vertical = char.MinValue;
3044                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3045                         if (nfkd != null) {
                                     // The table is keyed by decomposition kind (boxed byte) and
                                     // holds code points as boxed ints.
3046                                 object smv = nfkd [(byte) DecompositionSmall];
3047                                 if (smv != null)
3048                                         small = (char) ((int) smv);
3049                                 object vv = nfkd [(byte) DecompositionVertical];
3050                                 if (vv != null)
3051                                         vertical = (char) ((int) vv);
3052                         }
3053
3054                         // <small> updates index
                             // (the 3-argument overload is used, so its alt value is 0, not level2)
3055                         if (small != char.MinValue)
3056                                 AddCharMap (small, category, updateCount);
3057
3058                         // itself
3059                         AddCharMap (c, category, 0, level2);
3060
                             // forms listed in sameWeightItems: added with increment 0
3061                         if (nfkd != null) {
3062                                 foreach (int weight in sameWeightItems) {
3063                                         object wv = nfkd [(byte) weight];
3064                                         if (wv != null)
3065                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
3066                                 }
3067                         }
3068
3069                         // update index here.
3070                         fillIndex [category] += updateCount;
3071
                             // <vertical> is added after the index update and advances it again
3072                         if (vertical != char.MinValue)
3073                                 AddCharMap (vertical, category, updateCount, level2);
3074                 }
3075
3076                 private void AddCharMapCJK (char c, ref byte category)
3077                 {
                             // Maps one character at the current sequential position of category
                             // and then steps to the next position; stepping may move category
                             // itself forward (see IncrementSequentialIndex).
3078                         AddCharMap (c, category, 0, 0);
3079                         IncrementSequentialIndex (ref category);
3080
                             // Special. Windows skips 9E F9 (reason unknown), so step once more
                             // when that is the position just reached.
3081                         bool onSkippedSlot = category == 0x9E && fillIndex [category] == 0xF9;
3082                         if (onSkippedSlot)
3083                                 IncrementSequentialIndex (ref category);
3084                 }
3085
3086                 private void AddCharMapGroupCJK (char c, ref byte category)
3087                 {
                             // Maps the ideograph c through AddCharMapCJK () and then the
                             // characters related to it: first a few hard-coded U+32xx
                             // characters tied to specific ideographs, then the code points
                             // recorded for c in nfkdMap under keys 0 .. 0x12.
                             // category is passed by reference because AddCharMapCJK () may
                             // move it forward.
3088                         AddCharMapCJK (c, ref category);
3089
3090                         // LAMESPEC: see below.
3091                         if (c == '\u5B78') {
3092                                 AddCharMapCJK ('\u32AB', ref category);
3093                                 AddCharMapCJK ('\u323B', ref category);
3094                         }
3095                         if (c == '\u52DE') {
3096                                 AddCharMapCJK ('\u3298', ref category);
3097                                 AddCharMapCJK ('\u3238', ref category);
3098                         }
3099                         if (c == '\u5BEB')
3100                                 AddCharMapCJK ('\u32A2', ref category);
3101                         if (c == '\u91AB')
3102                                 // Especially this mapping order totally does
3103                                 // not make sense to me.
3104                                 AddCharMapCJK ('\u32A9', ref category);
3105
3106                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3107                         if (nfkd == null)
3108                                 return;
                             // Walk the table keys in ascending numeric order.
3109                         for (byte weight = 0; weight <= 0x12; weight++) {
3110                                 object wv = nfkd [weight];
3111                                 if (wv == null)
3112                                         continue;
3113                                 int w = (int) wv;
3114
3115                                 // Special: they are ignored in this area.
3116                                 // FIXME: check if it is sane
3117                                 if (0xF900 <= w && w <= 0xFAD9)
3118                                         continue;
3119                                 // LAMESPEC: on Windows some of CJK characters
3120                                 // in 3200-32B0 are incorrectly mapped. They
3121                                 // mix Chinese and Japanese Kanji when
3122                                 // ordering those characters.
                                     // These six are the ones already placed by the
                                     // hard-coded cases above, so they are skipped here.
3123                                 switch (w) {
3124                                 case 0x32A2: case 0x3298: case 0x3238:
3125                                 case 0x32A9: case 0x323B: case 0x32AB:
3126                                         continue;
3127                                 }
3128
3129                                 AddCharMapCJK ((char) w, ref category);
3130                         }
3131                 }
3132
3133                 // For now it is only for 0x7 category.
                     // Variant of AddCharMapGroup () with these visible differences:
                     // there is no early return when c is already mapped, every
                     // AddCharMap () call uses updateCount as its increment, and the
                     // same-weight forms are found by scanning all decompositions
                     // instead of reading nfkdMap.
3134                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3135                 {
                             // char.MinValue doubles as "no such form" for small / vertical.
3136                         char small = char.MinValue;
3137                         char vertical = char.MinValue;
3138                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3139                         if (nfkd != null) {
3140                                 object smv = nfkd [(byte) DecompositionSmall];
3141                                 if (smv != null)
3142                                         small = (char) ((int) smv);
3143                                 object vv = nfkd [(byte) DecompositionVertical];
3144                                 if (vv != null)
3145                                         vertical = (char) ((int) vv);
3146                         }
3147
3148                         // <small> updates index
3149                         if (small != char.MinValue)
3150                                 // SPECIAL CASE excluded (FIXME: why?)
3151                                 if (small != '\u2024')
3152                                         AddCharMap (small, category, updateCount);
3153
3154                         // itself
3155                         AddCharMap (c, category, updateCount, level2);
3156
3157                         // Since nfkdMap is problematic to have two or more
3158                         // NFKD to an identical character, here I iterate all.
                             // A code point qualifies when its decomposition is exactly
                             // one value equal to c and its type is DecompositionCompat.
3159                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3160                                 if (decompLength [c2] == 1 &&
3161                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
3162                                         switch (decompType [c2]) {
3163                                         case DecompositionCompat:
3164                                                 AddCharMap ((char) c2, category, updateCount, level2);
3165                                                 break;
3166                                         }
3167                                 }
3168                         }
3169
3170                         if (vertical != char.MinValue)
3171                                 // SPECIAL CASE excluded (FIXME: why?)
3172                                 if (vertical != '\uFE33' && vertical != '\uFE34')
3173                                         AddCharMap (vertical, category, updateCount, level2);
3174                 }
3175
3176                 private void AddArabicCharMap (char c)
3177                 {
                             // Maps c into category 6 and gives the same fill index to every code
                             // point whose decomposition ends with c; the fill index is advanced
                             // by 1 only after all of them have been added.
3178                         const byte category = 6;
3179
3180                         // itself
3181                         AddCharMap (c, category, 0, 0);
3182
3183                         // nfkdMap is problematic when two or more characters have an
3184                         // NFKD to the identical character, so every code point is scanned.
3185                         for (int cp = 0; cp < char.MaxValue; cp++) {
3186                                 int length = decompLength [cp];
3187                                 if (length == 0)
3188                                         continue; // nothing to compare against
3189
3190                                 int last = decompIndex [cp] + length - 1;
3191                                 if ((int) (decompValues [last]) != (int) c)
3192                                         continue;
3193                                 AddCharMap ((char) cp, category, 0, 0);
3194                         }
3195                         fillIndex [category] += 1;
3196                 }
3197
3198                 char ToFullWidth (char c)
3199                 {
                             // When the decomposition type of c is DecompositionFull, returns the
                             // FIRST value of that decomposition; otherwise returns c unchanged
                             // (see ToDecomposed).
3200                         return ToDecomposed (c, DecompositionFull, false);
3201                 }
3202
3203                 char ToFullWidthTail (char c)
3204                 {
                             // When the decomposition type of c is DecompositionFull, returns the
                             // LAST value of that decomposition; otherwise returns c unchanged
                             // (see ToDecomposed).
3205                         return ToDecomposed (c, DecompositionFull, true);
3206                 }
3207
3208                 char ToSmallForm (char c)
3209                 {
                             // When the decomposition type of c is DecompositionSmall, returns the
                             // FIRST value of that decomposition; otherwise returns c unchanged
                             // (see ToDecomposed).
3210                         return ToDecomposed (c, DecompositionSmall, false);
3211                 }
3212
3213                 char ToSmallFormTail (char c)
3214                 {
                             // When the decomposition type of c is DecompositionSmall, returns the
                             // LAST value of that decomposition; otherwise returns c unchanged
                             // (see ToDecomposed).
3215                         return ToDecomposed (c, DecompositionSmall, true);
3216                 }
3217
3218                 char ToDecomposed (char c, byte d, bool tail)
3219                 {
                             // Returns one value of c's decomposition when its decomposition type
                             // equals d: the last value if tail is set, the first one otherwise.
                             // Any other character is returned unchanged.
3220                         int cp = (int) c;
3221                         if (decompType [cp] != d)
3222                                 return c;
3223
3224                         int offset = tail ? decompLength [cp] - 1 : 0;
3225                         return (char) decompValues [decompIndex [cp] + offset];
3226                 }
3227
3228                 bool ExistsJIS (int cp)
3229                 {
                             // Linear search of jisJapanese for an entry whose CP equals cp.
3230                         bool found = false;
3231                         foreach (JISCharacter entry in jisJapanese)
3232                                 if (entry.CP == cp) { found = true; break; }
3233                         return found;
3234                 }
3235
3236                 #endregion
3237
3238                 #region Level 3 properties (Case/Width)
3239
3240                 private byte ComputeLevel3Weight (char c)
3241                 {
                             // A raw weight of 0 is returned as is; any other raw weight is
                             // shifted up by 2.
3242                         byte raw = ComputeLevel3WeightRaw (c);
3243                         return raw == 0 ? raw : (byte) (raw + 2);
3244                 }
3245
3246                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3247                 {
                             // Returns the raw level 3 weight of c (0 when nothing applies).
                             // The range and single-character checks in the first part return a
                             // hard-coded value immediately, so their order matters. Any other
                             // character gets a bit set built from a few special cases, its
                             // decomposition type and the isSmallCapital / isUppercase tables.
3248                         // CJK compat
3249                         if ('\u3192' <= c && c <= '\u319F')
3250                                 return 0;
3251                         // Japanese reading marks
3252                         if (c == '\u3001' || c == '\u3002')
3253                                 return 2;
3254                         // Korean
3255                         if ('\u11A8' <= c && c <= '\u11F9')
3256                                 return 2;
3257                         if ('\uFFA0' <= c && c <= '\uFFDC')
3258                                 return 4;
3259                         if ('\u3130' <= c && c <= '\u3164')
3260                                 return 5;
3261                         if ('\u3165' <= c && c <= '\u318E')
3262                                 return 4;
3263                         // Georgian Capital letters
3264                         if ('\u10A0' <= c && c <= '\u10C5')
3265                                 return 0x10;
3266                         // numbers
                             // The three Dingbats series are adjacent and do not overlap. The
                             // third range used to be written from U+2776, but everything below
                             // U+278A was already returned by the two checks above it.
3267                         if ('\u2776' <= c && c <= '\u277F')
3268                                 return 4;
3269                         if ('\u2780' <= c && c <= '\u2789')
3270                                 return 8;
3271                         if ('\u278A' <= c && c <= '\u2793')
3272                                 return 0xC;
3273                         if ('\u2160' <= c && c <= '\u216F')
3274                                 return 0x10;
3275                         if ('\u2181' <= c && c <= '\u2182')
3276                                 return 0x18;
3277                         // Hebrew letter symbols of the Letterlike Symbols block
3278                         if ('\u2135' <= c && c <= '\u2138')
3279                                 return 4;
                             // Arabic
3280                         if ('\uFE80' <= c && c < '\uFF00') {
3281                                 // 2(Isolated)/8(Final)/0x18(Medial)
                                     // any other decomposition type falls through to the
                                     // checks below
3282                                 switch (decompType [(int) c]) {
3283                                 case DecompositionIsolated:
3284                                         return 2;
3285                                 case DecompositionFinal:
3286                                         return 8;
3287                                 case DecompositionMedial:
3288                                         return 0x18;
3289                                 }
3290                         }
3291
3292                         // actually I dunno the reason why they have weights.
3293                         switch (c) {
3294                         case '\u01BC':
3295                                 return 0x10;
3296                         case '\u06A9':
3297                                 return 0x20;
3298                         case '\u06AA':
3299                                 return 0x28;
3300                         }
3301
                             // From here on the result is accumulated as a bit set.
3302                         byte ret = 0;
3303                         switch (c) {
3304                         case '\u03C2':
3305                         case '\u2104':
3306                         case '\u212B':
3307                                 ret |= 8;
3308                                 break;
3309                         case '\uFE42':
3310                                 ret |= 0xC;
3311                                 break;
3312                         }
3313
3314                         // misc
                             // for these three kinds the numeric value of the decomposition
                             // type itself is OR-ed into the result
3315                         switch (decompType [(int) c]) {
3316                         case DecompositionWide: // <wide>
3317                         case DecompositionSub: // <sub>
3318                         case DecompositionSuper: // <super>
3319                                 ret |= decompType [(int) c];
3320                                 break;
3321                         }
3322                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3323                                 ret |= 8;
3324                         if (isUppercase [(int) c]) // DerivedCoreProperties
3325                                 ret |= 0x10;
3326
3327                         return ret;
3328                 }
3329
3330                 #endregion
3331
3332                 #region IsIgnorable
3333 /*
3334                 static bool IsIgnorable (int i)
3335                 {
3336                         if (unicodeAge [i] >= 3.1)
3337                                 return true;
3338                         switch (char.GetUnicodeCategory ((char) i)) {
3339                         case UnicodeCategory.OtherNotAssigned:
3340                         case UnicodeCategory.Format:
3341                                 return true;
3342                         }
3343                         return false;
3344                 }
3345 */
3346
3347                 // FIXME: In the future use DerivedAge.txt to examine character
3348                 // versions and set those ones that have higher version than
3349                 // 1.0 as ignorable.
3350                 static bool IsIgnorable (int i)
3351                 {
3352                         switch (i) {
3353                         case 0:
3354                         // I guess, those characters are added between
3355                         // Unicode 1.0 (LCMapString) and Unicode 3.1
3356                         // (UnicodeCategory), so they used to be
3357                         // something like OtherNotAssigned as of Unicode 1.1.
3358                         case 0x2df: case 0x387:
3359                         case 0x3d7: case 0x3d8: case 0x3d9:
3360                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3361                         case 0x400: case 0x40d: case 0x450: case 0x45d:
3362                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
3363                         case 0x653: case 0x654: case 0x655: case 0x66d:
3364                         case 0xb56:
3365                         case 0x1e9b: case 0x202f: case 0x20ad:
3366                         case 0x20ae: case 0x20af:
3367                         case 0x20e2: case 0x20e3:
3368                         case 0x2139: case 0x213a: case 0x2183:
3369                         case 0x2425: case 0x2426: case 0x2619:
3370                         case 0x2670: case 0x2671: case 0x3007:
3371                         case 0x3190: case 0x3191:
3372                         case 0xfffc: case 0xfffd:
3373                                 return true;
3374                         // exceptional characters filtered by the
3375                         // following conditions. Originally those exceptional
3376                         // ranges are incorrect (they should not be ignored)
3377                         // and most of those characters are unfortunately in
3378                         // those ranges.
3379                         case 0x4d8: case 0x4d9:
3380                         case 0x4e8: case 0x4e9:
3381                         case 0x70F:
3382                         case 0x3036: case 0x303f:
3383                         case 0x337b: case 0xfb1e:
3384                                 return false;
3385                         }
3386
3387                         if (
3388                                 // The whole Sinhala characters.
3389                                 0x0D82 <= i && i <= 0x0DF4
3390                                 // The whole Tibetan characters.
3391                                 || 0x0F00 <= i && i <= 0x0FD1
3392                                 // The whole Myanmar characters.
3393                                 || 0x1000 <= i && i <= 0x1059
3394                                 // The whole Etiopic, Cherokee,
3395                                 // Canadian Syllablic, Ogham, Runic,
3396                                 // Tagalog, Hanunoo, Philippine,
3397                                 // Buhid, Tagbanwa, Khmer and Mongorian
3398                                 // characters.
3399                                 || 0x1200 <= i && i <= 0x1DFF
3400                                 // Greek extension characters.
3401                                 || 0x1F00 <= i && i <= 0x1FFF
3402                                 // The whole Braille characters.
3403                                 || 0x2800 <= i && i <= 0x28FF
3404                                 // CJK radical characters.
3405                                 || 0x2E80 <= i && i <= 0x2EF3
3406                                 // Kangxi radical characters.
3407                                 || 0x2F00 <= i && i <= 0x2FD5
3408                                 // Ideographic description characters.
3409                                 || 0x2FF0 <= i && i <= 0x2FFB
3410                                 // Bopomofo letter and final
3411                                 || 0x31A0 <= i && i <= 0x31B7
3412                                 // White square with quadrant characters.
3413                                 || 0x25F0 <= i && i <= 0x25F7
3414                                 // Ideographic telegraph symbols.
3415                                 || 0x32C0 <= i && i <= 0x32CB
3416                                 || 0x3358 <= i && i <= 0x3370
3417                                 || 0x33E0 <= i && i <= 0x33FF
3418                                 // The whole YI characters.
3419                                 || 0xA000 <= i && i <= 0xA48C
3420                                 || 0xA490 <= i && i <= 0xA4C6
3421                                 // American small ligatures
3422                                 || 0xFB13 <= i && i <= 0xFB17
3423                                 // hebrew, arabic, variation selector.
3424                                 || 0xFB1D <= i && i <= 0xFE2F
3425                                 // Arabic ligatures.
3426                                 || 0xFEF5 <= i && i <= 0xFEFC
3427                                 // FIXME: why are they excluded?
3428                                 || 0x01F6 <= i && i <= 0x01F9
3429                                 || 0x0218 <= i && i <= 0x0233
3430                                 || 0x02A9 <= i && i <= 0x02AD
3431                                 || 0x02EA <= i && i <= 0x02EE
3432                                 || 0x0349 <= i && i <= 0x036F
3433                                 || 0x0488 <= i && i <= 0x048F
3434                                 || 0x04D0 <= i && i <= 0x04FF
3435                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3436                                 || 0x06D6 <= i && i <= 0x06ED
3437                                 || 0x06FA <= i && i <= 0x06FE
3438                                 || 0x2048 <= i && i <= 0x204D
3439                                 || 0x20e4 <= i && i <= 0x20ea
3440                                 || 0x213C <= i && i <= 0x214B
3441                                 || 0x21EB <= i && i <= 0x21FF
3442                                 || 0x22F2 <= i && i <= 0x22FF
3443                                 || 0x237B <= i && i <= 0x239A
3444                                 || 0x239B <= i && i <= 0x23CF
3445                                 || 0x24EB <= i && i <= 0x24FF
3446                                 || 0x2596 <= i && i <= 0x259F
3447                                 || 0x25F8 <= i && i <= 0x25FF
3448                                 || 0x2672 <= i && i <= 0x2689
3449                                 || 0x2768 <= i && i <= 0x2775
3450                                 || 0x27d0 <= i && i <= 0x27ff
3451                                 || 0x2900 <= i && i <= 0x2aff
3452                                 || 0x3033 <= i && i <= 0x303F
3453                                 || 0x31F0 <= i && i <= 0x31FF
3454                                 || 0x3250 <= i && i <= 0x325F
3455                                 || 0x32B1 <= i && i <= 0x32BF
3456                                 || 0x3371 <= i && i <= 0x337B
3457                                 || 0xFA30 <= i && i <= 0xFA6A
3458                         )
3459                                 return true;
3460
3461                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3462                         switch (uc) {
3463                         case UnicodeCategory.PrivateUse:
3464                         case UnicodeCategory.Surrogate:
3465                                 return false;
3466                         // ignored by nature
3467                         case UnicodeCategory.Format:
3468                         case UnicodeCategory.OtherNotAssigned:
3469                                 return true;
3470                         default:
3471                                 return false;
3472                         }
3473                 }
3474
3475                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3476
3477                 /*
3478                 public static void Main ()
3479                 {
3480                         for (int i = 0; i <= char.MaxValue; i++)
3481                                 Dump (i, IsIgnorable (i));
3482                 }
3483
3484                 static void Dump (int i, bool ignore)
3485                 {
3486                         switch (Char.GetUnicodeCategory ((char) i)) {
3487                         case UnicodeCategory.PrivateUse:
3488                         case UnicodeCategory.Surrogate:
3489                                 return; // check nothing
3490                         }
3491
3492                         string s1 = "";
3493                         string s2 = new string ((char) i, 10);
3494                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3495                         if ((ret == 0) == ignore)
3496                                 return;
3497                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3498                 }
3499                 */
3500                 #endregion // IsIgnorable
3501
3502                 #region IsIgnorableSymbol
3503                 static bool IsIgnorableSymbol (int i)
3504                 {
3505                         if (IsIgnorable (i))
3506                                 return true;
3507
3508                         switch (i) {
3509                         // *Letter
3510                         case 0x00b5: case 0x01C0: case 0x01C1:
3511                         case 0x01C2: case 0x01C3: case 0x01F6:
3512                         case 0x01F7: case 0x01F8: case 0x01F9:
3513                         case 0x02D0: case 0x02EE: case 0x037A:
3514                         case 0x03D7: case 0x03F3:
3515                         case 0x0400: case 0x040d:
3516                         case 0x0450: case 0x045d:
3517                         case 0x048C: case 0x048D:
3518                         case 0x048E: case 0x048F:
3519                         case 0x0587: case 0x0640: case 0x06E5:
3520                         case 0x06E6: case 0x06FA: case 0x06FB:
3521                         case 0x06FC: case 0x093D: case 0x0950:
3522                         case 0x1E9B: case 0x2139: case 0x3006:
3523                         case 0x3033: case 0x3034: case 0x3035:
3524                         case 0xFE7E: case 0xFE7F:
3525                         // OtherNumber
3526                         case 0x16EE: case 0x16EF: case 0x16F0:
3527                         // LetterNumber
3528                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3529                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3530                         case 0x3038: // HANGZHOU NUMERAL TEN
3531                         case 0x3039: // HANGZHOU NUMERAL TWENTY
3532                         case 0x303a: // HANGZHOU NUMERAL THIRTY
3533                         // OtherSymbol
3534                         case 0x2117:
3535                         case 0x327F:
3536                                 return true;
3537                         // ModifierSymbol
3538                         case 0x02B9: case 0x02BA: case 0x02C2:
3539                         case 0x02C3: case 0x02C4: case 0x02C5:
3540                         case 0x02C8: case 0x02CC: case 0x02CD:
3541                         case 0x02CE: case 0x02CF: case 0x02D2:
3542                         case 0x02D3: case 0x02D4: case 0x02D5:
3543                         case 0x02D6: case 0x02D7: case 0x02DE:
3544                         case 0x02E5: case 0x02E6: case 0x02E7:
3545                         case 0x02E8: case 0x02E9:
3546                         case 0x309B: case 0x309C:
3547                         // OtherPunctuation
3548                         case 0x055A: // American Apos
3549                         case 0x05C0: // Hebrew Punct
3550                         case 0x0E4F: // Thai FONGMAN
3551                         case 0x0E5A: // Thai ANGKHANKHU
3552                         case 0x0E5B: // Thai KHOMUT
3553                         // CurencySymbol
3554                         case 0x09F2: // Bengali Rupee Mark
3555                         case 0x09F3: // Bengali Rupee Sign
3556                         // MathSymbol
3557                         case 0x221e: // INF.
3558                         // OtherSymbol
3559                         case 0x0482:
3560                         case 0x09FA:
3561                         case 0x0B70:
3562                                 return false;
3563                         }
3564
3565                         // *Letter
3566                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3567 #if NET_2_0
3568                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3569                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3570 #endif
3571                         )
3572                                 return true;
3573
3574                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3575                         switch (uc) {
3576                         case UnicodeCategory.Surrogate:
3577                                 return false; // inconsistent
3578
3579                         case UnicodeCategory.SpacingCombiningMark:
3580                         case UnicodeCategory.EnclosingMark:
3581                         case UnicodeCategory.NonSpacingMark:
3582                         case UnicodeCategory.PrivateUse:
3583                                 // NonSpacingMark
3584                                 if (0x064B <= i && i <= 0x0652) // Arabic
3585                                         return true;
3586                                 return false;
3587
3588                         case UnicodeCategory.Format:
3589                         case UnicodeCategory.OtherNotAssigned:
3590                                 return true;
3591
3592                         default:
3593                                 bool use = false;
3594                                 // OtherSymbols
3595                                 if (
3596                                         // latin in a circle
3597                                         0x249A <= i && i <= 0x24E9
3598                                         || 0x2100 <= i && i <= 0x2132
3599                                         // Japanese
3600                                         || 0x3196 <= i && i <= 0x31A0
3601                                         // Korean
3602                                         || 0x3200 <= i && i <= 0x321C
3603                                         // Chinese/Japanese
3604                                         || 0x322A <= i && i <= 0x3243
3605                                         // CJK
3606                                         || 0x3260 <= i && i <= 0x32B0
3607                                         || 0x32D0 <= i && i <= 0x3357
3608                                         || 0x337B <= i && i <= 0x33DD
3609                                 )
3610                                         use = !Char.IsLetterOrDigit ((char) i);
3611                                 if (use)
3612                                         return false;
3613
3614                                 // This "Digit" rule is mystery.
3615                                 // It filters some symbols out.
3616                                 if (Char.IsLetterOrDigit ((char) i))
3617                                         return false;
3618                                 if (Char.IsNumber ((char) i))
3619                                         return false;
3620                                 if (Char.IsControl ((char) i)
3621                                         || Char.IsSeparator ((char) i)
3622                                         || Char.IsPunctuation ((char) i))
3623                                         return true;
3624                                 if (Char.IsSymbol ((char) i))
3625                                         return true;
3626
3627                                 // FIXME: should check more
3628                                 return false;
3629                         }
3630                 }
3631
3632                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3633 /*
3634                 public static void Main ()
3635                 {
3636                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3637                         for (int i = 0; i <= char.MaxValue; i++) {
3638                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3639                                 if (uc == UnicodeCategory.Surrogate)
3640                                         continue;
3641
3642                                 bool ret = IsIgnorableSymbol (i);
3643
3644                                 string s1 = "TEST ";
3645                                 string s2 = "TEST " + (char) i;
3646
3647                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3648
3649                                 if (ret != (result == 0))
3650                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3651                                                 ret ? "should not ignore" :
3652                                                         "should ignore",
3653                                                 i,(char) i, uc);
3654                         }
3655                 }
3656 */
3657                 #endregion
3658
3659                 #region NonSpacing
3660                 static bool IsIgnorableNonSpacing (int i)
3661                 {
3662                         if (IsIgnorable (i))
3663                                 return true;
3664
3665                         switch (i) {
3666                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3667                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3668                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3669                                 return true;
3670                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3671                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3672                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3673                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3674                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3675                         case 0x0CCD: case 0x0E4E:
3676                                 return false;
3677                         }
3678
3679                         if (0x02b9 <= i && i <= 0x02c5
3680                                 || 0x02cc <= i && i <= 0x02d7
3681                                 || 0x02e4 <= i && i <= 0x02ef
3682                                 || 0x20DD <= i && i <= 0x20E0
3683                         )
3684                                 return true;
3685
3686                         if (0x064B <= i && i <= 0x00652
3687                                 || 0x0941 <= i && i <= 0x0948
3688                                 || 0x0AC1 <= i && i <= 0x0ACD
3689                                 || 0x0C3E <= i && i <= 0x0C4F
3690                                 || 0x0E31 <= i && i <= 0x0E3F
3691                         )
3692                                 return false;
3693
3694                         return Char.GetUnicodeCategory ((char) i) ==
3695                                 UnicodeCategory.NonSpacingMark;
3696                 }
3697
3698                 // We can reuse IsIgnorableSymbol testcode
3699                 // for IsIgnorableNonSpacing.
3700                 #endregion
3701         }
3702
3703         struct CharMapEntry
3704         {
3705                 public byte Category;
3706                 public byte Level1;
3707                 public byte Level2; // It is always single byte.
3708                 public bool Defined;
3709
3710                 public CharMapEntry (byte category, byte level1, byte level2)
3711                 {
3712                         Category = category;
3713                         Level1 = level1;
3714                         Level2 = level2;
3715                         Defined = true;
3716                 }
3717         }
3718
3719         class JISCharacter
3720         {
3721                 public readonly int CP;
3722                 public readonly int JIS;
3723
3724                 public JISCharacter (int cp, int cpJIS)
3725                 {
3726                         CP = cp;
3727                         JIS = cpJIS;
3728                 }
3729         }
3730
3731         class JISComparer : IComparer
3732         {
3733                 public static readonly JISComparer Instance =
3734                         new JISComparer ();
3735
3736                 public int Compare (object o1, object o2)
3737                 {
3738                         JISCharacter j1 = (JISCharacter) o1;
3739                         JISCharacter j2 = (JISCharacter) o2;
3740                         return j1.JIS - j2.JIS;
3741                 }
3742         }
3743
3744         class NonJISCharacter
3745         {
3746                 public readonly int CP;
3747                 public readonly string Name;
3748
3749                 public NonJISCharacter (int cp, string name)
3750                 {
3751                         CP = cp;
3752                         Name = name;
3753                 }
3754         }
3755
3756         class NonJISComparer : IComparer
3757         {
3758                 public static readonly NonJISComparer Instance =
3759                         new NonJISComparer ();
3760
3761                 public int Compare (object o1, object o2)
3762                 {
3763                         NonJISCharacter j1 = (NonJISCharacter) o1;
3764                         NonJISCharacter j2 = (NonJISCharacter) o2;
3765                         return string.CompareOrdinal (j1.Name, j2.Name);
3766                 }
3767         }
3768
3769         class DecimalDictionaryValueComparer : IComparer
3770         {
3771                 public static readonly DecimalDictionaryValueComparer Instance
3772                         = new DecimalDictionaryValueComparer ();
3773
3774                 private DecimalDictionaryValueComparer ()
3775                 {
3776                 }
3777
3778                 public int Compare (object o1, object o2)
3779                 {
3780                         DictionaryEntry e1 = (DictionaryEntry) o1;
3781                         DictionaryEntry e2 = (DictionaryEntry) o2;
3782                         // FIXME: in case of 0, compare decomposition categories
3783                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3784                         if (ret != 0)
3785                                 return ret;
3786                         int i1 = (int) e1.Key;
3787                         int i2 = (int) e2.Key;
3788                         return i1 - i2;
3789                 }
3790         }
3791
3792         class StringDictionaryValueComparer : IComparer
3793         {
3794                 public static readonly StringDictionaryValueComparer Instance
3795                         = new StringDictionaryValueComparer ();
3796
3797                 private StringDictionaryValueComparer ()
3798                 {
3799                 }
3800
3801                 public int Compare (object o1, object o2)
3802                 {
3803                         DictionaryEntry e1 = (DictionaryEntry) o1;
3804                         DictionaryEntry e2 = (DictionaryEntry) o2;
3805                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3806                         if (ret != 0)
3807                                 return ret;
3808                         int i1 = (int) e1.Key;
3809                         int i2 = (int) e2.Key;
3810                         return i1 - i2;
3811                 }
3812         }
3813
3814         class UCAComparer : IComparer
3815         {
3816                 public static readonly UCAComparer Instance
3817                         = new UCAComparer ();
3818
3819                 private UCAComparer ()
3820                 {
3821                 }
3822
3823                 public int Compare (object o1, object o2)
3824                 {
3825                         char i1 = (char) o1;
3826                         char i2 = (char) o2;
3827
3828                         int l1 = CollationElementTable.GetSortKeyCount (i1);
3829                         int l2 = CollationElementTable.GetSortKeyCount (i2);
3830                         int l = l1 > l2 ? l2 : l1;
3831
3832                         for (int i = 0; i < l; i++) {
3833                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3834                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3835                                 int v = k1.Primary - k2.Primary;
3836                                 if (v != 0)
3837                                         return v;
3838                                 v = k1.Secondary - k2.Secondary;
3839                                 if (v != 0)
3840                                         return v;
3841                                 v = k1.Thirtiary - k2.Thirtiary;
3842                                 if (v != 0)
3843                                         return v;
3844                                 v = k1.Quarternary - k2.Quarternary;
3845                                 if (v != 0)
3846                                         return v;
3847                         }
3848                         return l1 - l2;
3849                 }
3850         }
3851
3852         class Tailoring
3853         {
3854                 int lcid;
3855                 int alias;
3856                 bool frenchSort;
3857                 ArrayList items = new ArrayList ();
3858
3859                 public Tailoring (int lcid)
3860                         : this (lcid, 0)
3861                 {
3862                 }
3863
3864                 public Tailoring (int lcid, int alias)
3865                 {
3866                         this.lcid = lcid;
3867                         this.alias = alias;
3868                 }
3869
3870                 public int LCID {
3871                         get { return lcid; }
3872                 }
3873
3874                 public int Alias {
3875                         get { return alias; }
3876                 }
3877
3878                 public bool FrenchSort {
3879                         get { return frenchSort; }
3880                         set { frenchSort = value; }
3881                 }
3882
3883                 public void AddDiacriticalMap (byte target, byte replace)
3884                 {
3885                         items.Add (new DiacriticalMap (target, replace));
3886                 }
3887
3888                 public void AddSortKeyMap (string source, byte [] sortkey)
3889                 {
3890                         items.Add (new SortKeyMap (source, sortkey));
3891                 }
3892
3893                 public void AddReplacementMap (string source, string replace)
3894                 {
3895                         items.Add (new ReplacementMap (source, replace));
3896                 }
3897
3898                 public char [] ItemToCharArray ()
3899                 {
3900                         ArrayList al = new ArrayList ();
3901                         foreach (ITailoringMap m in items)
3902                                 al.AddRange (m.ToCharArray ());
3903                         return al.ToArray (typeof (char)) as char [];
3904                 }
3905
3906                 interface ITailoringMap
3907                 {
3908                         char [] ToCharArray ();
3909                 }
3910
3911                 class DiacriticalMap : ITailoringMap
3912                 {
3913                         public readonly byte Target;
3914                         public readonly byte Replace;
3915
3916                         public DiacriticalMap (byte target, byte replace)
3917                         {
3918                                 Target = target;
3919                                 Replace = replace;
3920                         }
3921
3922                         public char [] ToCharArray ()
3923                         {
3924                                 char [] ret = new char [3];
3925                                 ret [0] = (char) 02; // kind:DiacriticalMap
3926                                 ret [1] = (char) Target;
3927                                 ret [2] = (char) Replace;
3928                                 return ret;
3929                         }
3930                 }
3931
3932                 class SortKeyMap : ITailoringMap
3933                 {
3934                         public readonly string Source;
3935                         public readonly byte [] SortKey;
3936
3937                         public SortKeyMap (string source, byte [] sortkey)
3938                         {
3939                                 Source = source;
3940                                 SortKey = sortkey;
3941                         }
3942
3943                         public char [] ToCharArray ()
3944                         {
3945                                 char [] ret = new char [Source.Length + 7];
3946                                 ret [0] = (char) 01; // kind:SortKeyMap
3947                                 for (int i = 0; i < Source.Length; i++)
3948                                         ret [i + 1] = Source [i];
3949                                 // null terminate
3950                                 for (int i = 0; i < 4; i++)
3951                                         ret [i + Source.Length + 2] = (char) SortKey [i];
3952                                 return ret;
3953                         }
3954                 }
3955
3956                 class ReplacementMap : ITailoringMap
3957                 {
3958                         public readonly string Source;
3959                         public readonly string Replace;
3960
3961                         public ReplacementMap (string source, string replace)
3962                         {
3963                                 Source = source;
3964                                 Replace = replace;
3965                         }
3966
3967                         public char [] ToCharArray ()
3968                         {
3969                                 char [] ret = new char [Source.Length + Replace.Length + 3];
3970                                 ret [0] = (char) 03; // kind:ReplaceMap
3971                                 int pos = 1;
3972                                 for (int i = 0; i < Source.Length; i++)
3973                                         ret [pos++] = Source [i];
3974                                 // null terminate
3975                                 pos++;
3976                                 for (int i = 0; i < Replace.Length; i++)
3977                                         ret [pos++] = Replace [i];
3978                                 // null terminate
3979                                 return ret;
3980                         }
3981                 }
3982         }
3983 }