mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those that are computed and those that
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, for composite characters it should prepare a different index table.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27 #define Binary
  28
  29 using System;
  30 using System.IO;
  31 using System.Collections;
  32 using System.Globalization;
  33 using System.Text;
  34 using System.Xml;
  35
  36 namespace Mono.Globalization.Unicode
  37 {
  38         internal class MSCompatSortKeyTableGenerator
  39         {
  // Program entry point: creates a generator instance and hands it the
  // command-line arguments unchanged.
  public static void Main (string [] args)
  {
          MSCompatSortKeyTableGenerator generator =
                  new MSCompatSortKeyTableGenerator ();
          generator.Run (args);
  }
  44
  // Numeric tags for the decomposition kinds. decompType holds one of these
  // per character as a byte, and Serialize switches on the Narrow, Wide,
  // Super and Sub tags when it fills the widthCompat table.
  // NOTE(review): the "fixed" markers presumably mean those values must not
  // be renumbered - confirm against the code that consumes the tables.
  45                 const int DecompositionWide = 1; // fixed
  46                 const int DecompositionSub = 2; // fixed
  47                 const int DecompositionSmall = 3;
  48                 const int DecompositionIsolated = 4;
  49                 const int DecompositionInitial = 5;
  50                 const int DecompositionFinal = 6;
  51                 const int DecompositionMedial = 7;
  52                 const int DecompositionNoBreak = 8;
  53                 const int DecompositionVertical = 9;
  54                 const int DecompositionFraction = 0xA;
  55                 const int DecompositionFont = 0xB;
  56                 const int DecompositionSuper = 0xC; // fixed
  57                 const int DecompositionFull = 0xE; // declared ahead of Narrow although its value is larger
  58                 const int DecompositionNarrow = 0xD;
  59                 const int DecompositionCircle = 0xF;
  60                 const int DecompositionSquare = 0x10;
  61                 const int DecompositionCompat = 0x11;
  62                 const int DecompositionCanonical = 0x12;
  63
  // Generator state. Most arrays below have char.MaxValue + 1 slots, i.e.
  // one slot per UTF-16 code unit.
  64                 TextWriter Result = Console.Out; // the generated C# table source text is written here
  65
  66                 byte [] fillIndex = new byte [256]; // by category
  67                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1]; // Serialize reads Category, Level1 and Level2 from each entry
  68
  // NOTE(review): the consumer of specialIgnore is outside this excerpt;
  // presumably these characters get special ignorable handling - confirm.
  69                 char [] specialIgnore = new char [] {
  70                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  71                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  72                         };
  73
  // alphabets and alphaWeights both hold 29 entries.
  // NOTE(review): presumably alphaWeights [i] is the weight assigned to
  // alphabets [i] - confirm against the consumer (not in this excerpt).
  74                 // FIXME: need more love (as always)
  75                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  76                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  77                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  78                         '\u0292', '\u01BE', '\u0298'};
  79                 byte [] alphaWeights = new byte [] {
  80                         2, 9, 0xA, 0x1A, 0x21,
  81                         0x23, 0x25, 0x2C, 0x32, 0x35,
  82                         0x36, 0x48, 0x51, 0x70, 0x7C,
  83                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  84                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  85                         0xA9, 0xAA, 0xB3, 0xB4};
  86
  // Per-code-unit boolean flags.
  87                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  88                 bool [] isUppercase = new bool [char.MaxValue + 1];
  89
  // Decomposition data. Serialize switches on decompType [i] and, for the
  // width-related kinds, reads decompValues [decompIndex [i]].
  90                 byte [] decompType = new byte [char.MaxValue + 1];
  91                 int [] decompIndex = new int [char.MaxValue + 1];
  92                 int [] decompLength = new int [char.MaxValue + 1];
  93                 int [] decompValues; // no initializer here
  94                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  95
  96                 byte [] diacritical = new byte [char.MaxValue + 1];
  97
  // diacritics and diacriticWeights both list 87 entries, grouped by the
  // same bare "//" separators.
  // NOTE(review): presumably diacriticWeights [i] is the weight used for a
  // character whose name contains diacritics [i]; the matching code is
  // outside this excerpt - confirm.
  98                 string [] diacritics = new string [] {
  99                         // LATIN, CYRILLIC etc.
 100                         "UPTURN", "DOUBLE-STRUCK",
 101                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
 102                         "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
 103                         "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
 104                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 105                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
 106                         "WITH OGONEK;", "WITH CEDILLA;",
 107                         //
 108                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 109                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
 110                         "STROKE OVERLAY",
 111                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 112                         " DIAERESIS AND GRAVE;",
 113                         " BREVE AND ACUTE;",
 114                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 115                         " MACRON AND ACUTE;",
 116                         " MACRON AND GRAVE;",
 117                         //
 118                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 119                         " RING ABOVE AND ACUTE",
 120                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 121                         " CIRCUMFLEX AND TILDE",
 122                         " TILDE AND DIAERESIS",
 123                         " STROKE AND ACUTE",
 124                         " BREVE AND TILDE",
 125                         " CEDILLA AND BREVE",
 126                         " OGONEK AND MACRON",
 127                         //
 128                         "WITH OVERLINE",
 129                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 130                         " DOUBLE GRAVE",
 131                         " INVERTED BREVE",
 132                         "ROMAN NUMERAL",
 133                         " PRECEDED BY APOSTROPHE",
 134                         "WITH HORN;",
 135                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 136                         " PALATAL HOOK",
 137                         " DOT BELOW;",
 138                         " RETROFLEX;", "DIAERESIS BELOW",
 139                         " RING BELOW",
 140                         //
 141                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 142                         " BREVE BELOW;", " HORN AND GRAVE",
 143                         " TILDE BELOW",
 144                         " TOPBAR",
 145                         " DOT BELOW AND DOT ABOVE",
 146                         " RIGHT HALF RING", " HORN AND TILDE",
 147                         " CIRCUMFLEX AND DOT BELOW",
 148                         " BREVE AND DOT BELOW",
 149                         " DOT BELOW AND MACRON",
 150                         " TONE TWO",
 151                         " HORN AND HOOK ABOVE",
 152                         " HORN AND DOT",
 153                         // CIRCLED, PARENTHESIZED and so on
 154                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
 155                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
 156                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 157                         };
 158                 byte [] diacriticWeights = new byte [] {
 159                         // LATIN.
 160                         3, 3, 5, 5,
 161                         0xF, 0xE, 0x12,
 162                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 163                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
 164                         //
 165                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 166                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 167                         //
 168                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 169                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 170                         //
 171                         0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
 172                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 173                         //
 174                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
 175                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 176                         0x87, 0x95, 0xAA,
 177                         // CIRCLED, PARENTHESIZED and so on.
 178                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
 179                         0xF3, 0xF3, 0xF3
 180                         };
 181
 // NOTE(review): 28 ascending values that read like consecutive
 // (start, end) code point pairs; the consumer is outside this excerpt,
 // so confirm how the bounds are interpreted.
 182                 int [] numberSecondaryWeightBounds = new int [] {
 183                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 184                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 185                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 186                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 187                         0xE50, 0xE60, 0xED0, 0xEE0
 188                         };
 189
 // Per-script orderings. They have no initializer here; any assignment
 // happens outside this excerpt.
 190                 char [] orderedGurmukhi;
 191                 char [] orderedGujarati;
 192                 char [] orderedGeorgian;
 193                 char [] orderedThaana;
 194
 // Fixed ordering of 22 Tamil consonants (static and read-only).
 195                 static readonly char [] orderedTamilConsonants = new char [] {
 196                         // based on traditional Tamil consonants, except for
 197                         // Grantha (where Microsoft breaks traditionalism).
 198                         // http://www.angelfire.com/empire/thamizh/padanGaL
 199                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
 200                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
 201                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
 202                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
 203                         '\u0BB7', '\u0BB9'};
 204
 // Lookup collections built while parsing. The "x -> y" notes describe the
 // intended key/value content of each collection ("cp" = code point).
 205                 // cp -> character name (only for some characters)
 206                 ArrayList sortableCharNames = new ArrayList ();
 207
 208                 // cp -> arrow value (int)
 209                 ArrayList arrowValues = new ArrayList ();
 210
 211                 // cp -> box value (int)
 212                 ArrayList boxValues = new ArrayList ();
 213
 214                 // cp -> level1 value
 215                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 216
 217                 // letterName -> cp
 218                 Hashtable arabicNameMap = new Hashtable ();
 219
 220                 // cp -> Hashtable [decompType] -> cp
 221                 Hashtable nfkdMap = new Hashtable ();
 222
 223                 // Latin letter -> ArrayList [int]
 224                 Hashtable latinMap = new Hashtable ();
 225
 226                 ArrayList jisJapanese = new ArrayList ();
 227                 ArrayList nonJisJapanese = new ArrayList ();
 228
 // CJK tables, one slot per UTF-16 code unit. Serialize compresses each of
 // them and passes the result to SerializeCJK. The trailing "- 0x...]"
 // fragments are leftovers of a commented-out, smaller sizing.
 229                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 230                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 231                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 232                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 233                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 234
 235                 byte [] ignorableFlags = new byte [char.MaxValue + 1]; // Serialize replaces this with its compressed form before writing it
 236
 237                 static double [] unicodeAge = new double [char.MaxValue + 1]; // in this excerpt only the commented-out dump in Run reads it
 238
 239                 ArrayList tailorings = new ArrayList (); // Tailoring objects; enumerated by SerializeTailorings
 240
 // Drives one complete generation: parse the input files, adjust the
 // parsed values, build the tables and serialize them. A progress message
 // goes to standard error after each stage. args [0], when present, names
 // the directory holding the input files; otherwise "downloaded" is used.
 void Run (string [] args)
 {
         string dirname;
         if (args.Length == 0)
                 dirname = "downloaded";
         else
                 dirname = args [0];

         ParseSources (dirname);
         Console.Error.WriteLine ("parse done.");

         ModifyParsedValues ();
         GenerateCore ();
         Console.Error.WriteLine ("generation done.");

         Serialize ();
         Console.Error.WriteLine ("serialization done.");

         // Disabled debugging aid: dumps, per code unit, the Unicode age and
         // the ignorable flags next to what the category/age suggests.
/*
StreamWriter sw = new StreamWriter ("agelog.txt");
for (int i = 0; i < char.MaxValue; i++) {
bool shouldBe = false;
switch (Char.GetUnicodeCategory ((char) i)) {
case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
        shouldBe = true; break;
}
if (unicodeAge [i] >= 3.1)
        shouldBe = true;
//if (IsIgnorable (i) != shouldBe)
sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
}
sw.Close ();
*/
 }
 268
 // Compresses a byte table with the given indexer by delegating to
 // CodePointIndexer.CompressArray and casting the result back to byte [].
 byte [] CompressArray (byte [] source, CodePointIndexer i)
 {
         object compressed = CodePointIndexer.CompressArray (
                 source, typeof (byte), i);
         return (byte []) compressed;
 }
 274
 // Compresses a ushort table with the given indexer by delegating to
 // CodePointIndexer.CompressArray and casting the result back to ushort [].
 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 {
         object compressed = CodePointIndexer.CompressArray (
                 source, typeof (ushort), i);
         return (ushort []) compressed;
 }
 280
 // Writes out everything the generator produced:
 //   1. the tailoring tables (SerializeTailorings),
 //   2. the core per-character tables - ignorable flags, category, level 1-3
 //      weights and the width-compatibility mapping - as C# array source on
 //      Result and, when Binary is defined, as "../collation.core.bin",
 //   3. the CJK tables (SerializeCJK).
 // Every core table is compressed through its CodePointIndexer before it
 // is written. In the binary file each table is stored as an Int32 element
 // count followed by the raw elements.
 void Serialize ()
 {
         // Tailorings
         SerializeTailorings ();

         // Flatten the per-character map into one array per table.
         byte [] categories = new byte [map.Length];
         byte [] level1 = new byte [map.Length];
         byte [] level2 = new byte [map.Length];
         byte [] level3 = new byte [map.Length];
         ushort [] widthCompat = new ushort [map.Length];
         for (int i = 0; i < map.Length; i++) {
                 categories [i] = map [i].Category;
                 level1 [i] = map [i].Level1;
                 level2 [i] = map [i].Level2;
                 level3 [i] = ComputeLevel3Weight ((char) i);
                 switch (decompType [i]) {
                 case DecompositionNarrow:
                 case DecompositionWide:
                 case DecompositionSuper:
                 case DecompositionSub:
                         // they are always 1 char
                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
                         break;
                 }
         }

         // compress
         ignorableFlags = CompressArray (ignorableFlags,
                 MSCompatUnicodeTableUtil.Ignorable);
         categories = CompressArray (categories,
                 MSCompatUnicodeTableUtil.Category);
         level1 = CompressArray (level1,
                 MSCompatUnicodeTableUtil.Level1);
         level2 = CompressArray (level2,
                 MSCompatUnicodeTableUtil.Level2);
         level3 = CompressArray (level3,
                 MSCompatUnicodeTableUtil.Level3);
         widthCompat = CompressArray (widthCompat,
                 MSCompatUnicodeTableUtil.WidthCompat);
         cjkCHS = CompressArray (cjkCHS,
                 MSCompatUnicodeTableUtil.CjkCHS);
         cjkCHT = CompressArray (cjkCHT,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkJA = CompressArray (cjkJA,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkKO = CompressArray (cjkKO,
                 MSCompatUnicodeTableUtil.Cjk);
         cjkKOlv2 = CompressArray (cjkKOlv2,
                 MSCompatUnicodeTableUtil.Cjk);

         // A null writer means "text output only"; it stays null when
         // Binary is not defined.
         BinaryWriter binary = null;
#if Binary
         MemoryStream ms = new MemoryStream ();
         binary = new BinaryWriter (ms);
#endif

         // Ignorables
         EmitCoreByteTable ("ignorableFlags", ignorableFlags, binary);
         // Primary category
         EmitCoreByteTable ("categories", categories, binary);
         // Primary weight value
         EmitCoreByteTable ("level1", level1, binary);
         // Secondary weight
         EmitCoreByteTable ("level2", level2, binary);
         // Tertiary weight
         EmitCoreByteTable ("level3", level3, binary);
         // Width insensitivity mappings
         // (for now it is more lightweight than dumping the
         // entire NFKD table).
         EmitCoreUInt16Table ("widthCompat", widthCompat, binary);

#if Binary
         binary.Flush ();
         using (FileStream fs = File.Create ("../collation.core.bin")) {
                 byte [] array = ms.ToArray ();
                 fs.Write (array, 0, array.Length);
         }
#endif

         // CJK
         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 }

 // Emits one core byte table as a C# array initializer named `name` on
 // Result. Values below 10 are printed in decimal, all others as 0xNN, and
 // after every 16th element a comment with the index of the first element
 // of that row is appended. When `binary` is not null the element count
 // (Int32) and then every element are written to it as well.
 void EmitCoreByteTable (string name, byte [] values, BinaryWriter binary)
 {
         Result.WriteLine ("internal static readonly byte [] {0} = new byte [] {{", name);
         if (binary != null)
                 binary.Write (values.Length);
         for (int i = 0; i < values.Length; i++) {
                 byte value = values [i];
                 if (value < 10)
                         Result.Write ("{0},", value);
                 else
                         Result.Write ("0x{0:X02},", value);
                 if (binary != null)
                         binary.Write (value);
                 if ((i & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", i - 0xF);
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 }

 // ushort counterpart of EmitCoreByteTable with the same text and binary
 // layout. The text form keeps the two-digit minimum hex width ("X02").
 void EmitCoreUInt16Table (string name, ushort [] values, BinaryWriter binary)
 {
         Result.WriteLine ("internal static readonly ushort [] {0} = new ushort [] {{", name);
         if (binary != null)
                 binary.Write (values.Length);
         for (int i = 0; i < values.Length; i++) {
                 ushort value = values [i];
                 if (value < 10)
                         Result.Write ("{0},", value);
                 else
                         Result.Write ("0x{0:X02},", value);
                 if (binary != null)
                         binary.Write (value);
                 if ((i & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", i - 0xF);
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
 }
 469
 // Emits a ushort CJK table as a C# array initializer named `name` on
 // Result and, when Binary is defined, as "../collation.<name>.bin".
 // Output stops at the end of the array or just before index `max`,
 // whichever comes first. The binary file starts with cjk.Length as an
 // Int32 (the full array length, even if `max` cut the output short),
 // followed by the emitted values.
 void SerializeCJK (string name, ushort [] cjk, int max)
 {
         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
#if Binary
         MemoryStream buffer = new MemoryStream ();
         BinaryWriter writer = new BinaryWriter (buffer);
         writer.Write (cjk.Length);
#endif
         int index = 0;
         while (index < cjk.Length && index != max) {
                 ushort weight = cjk [index];
                 // single decimal digit for small values, 4-digit hex otherwise
                 string format = weight < 10 ? "{0}," : "0x{0:X04},";
                 Result.Write (format, weight);
#if Binary
                 writer.Write (weight);
#endif
                 // close each row of 16 with the index of its first element
                 if ((index & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", index - 0xF);
                 index++;
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
#if Binary
         string path = String.Format ("../collation.{0}.bin", name);
         using (FileStream output = File.Create (path)) {
                 byte [] payload = buffer.ToArray ();
                 output.Write (payload, 0, payload.Length);
         }
#endif
 }
 502
 // Emits a byte CJK table as a C# array initializer named `name` on Result
 // and, when Binary is defined, as "../collation.<name>.bin". Output stops
 // at the end of the array or just before index `max`, whichever comes
 // first. Unlike the ushort overload, the binary file carries no length
 // header - only the emitted values.
 void SerializeCJK (string name, byte [] cjk, int max)
 {
         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
#if Binary
         MemoryStream buffer = new MemoryStream ();
         BinaryWriter writer = new BinaryWriter (buffer);
#endif
         int index = 0;
         while (index < cjk.Length && index != max) {
                 byte weight = cjk [index];
                 // single decimal digit for small values, 2-digit hex otherwise
                 string format = weight < 10 ? "{0}," : "0x{0:X02},";
                 Result.Write (format, weight);
#if Binary
                 writer.Write (weight);
#endif
                 // close each row of 16 with the index of its first element
                 if ((index & 0xF) == 0xF)
                         Result.WriteLine ("// {0:X04}", index - 0xF);
                 index++;
         }
         Result.WriteLine ("};");
         Result.WriteLine ();
#if Binary
         string path = String.Format ("../collation.{0}.bin", name);
         using (FileStream output = File.Create (path)) {
                 byte [] payload = buffer.ToArray ();
                 output.Write (payload, 0, payload.Length);
         }
#endif
 }
 534
 // Serializes the tailoring data as C# source on Result and, when Binary
 // is defined, as "../collation.tailoring.bin".
 //
 // Text output: a "tailorings" char array holding the items of every
 // non-alias tailoring back to back, then a "tailoringInfos" array with
 // one (LCID, start index, length, french-sort flag) entry per tailoring.
 // An alias entry reuses the index and length of the tailoring it points
 // to.
 //
 // Binary layout: Int32 tailoring count; per tailoring LCID, index, count
 // and the french flag; two 0xFF marker bytes; Int32 number of chars; the
 // chars themselves, each written as a ushort.
 //
 // Throws Exception when an entry refers to an LCID that has no tailoring
 // definition of its own.
 void SerializeTailorings ()
 {
         Hashtable indexes = new Hashtable ();
         Hashtable counts = new Hashtable ();
         Result.WriteLine ("static char [] tailorings = new char [] {");
         int count = 0;
#if Binary
         MemoryStream ms = new MemoryStream ();
         BinaryWriter binary = new BinaryWriter (ms);
#endif
         // Pass 1: dump the items of every real (non-alias) tailoring and
         // remember where each one starts and how long it is.
         foreach (Tailoring t in tailorings) {
                 if (t.Alias != 0)
                         continue;
                 Result.Write ("/*{0}*/", t.LCID);
                 indexes.Add (t.LCID, count);
                 char [] values = t.ItemToCharArray ();
                 counts.Add (t.LCID, values.Length);
                 foreach (char c in values) {
                         Result.Write ("'\\x{0:X}', ", (int) c);
                         if (++count % 16 == 0)
                                 Result.WriteLine (" // {0:X04}", count - 16);
#if Binary
                         binary.Write ((ushort) c);
#endif
                 }
         }
         Result.WriteLine ("};");

         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
#if Binary
         // Keep the raw chars aside; the info table has to precede them
         // in the file.
         byte [] rawdata = ms.ToArray ();
         ms = new MemoryStream ();
         binary = new BinaryWriter (ms);
         binary.Write (tailorings.Count);
#endif
         // Pass 2: one info record per tailoring, aliases included.
         foreach (Tailoring t in tailorings) {
                 int target = t.Alias != 0 ? t.Alias : t.LCID;
                 // The throw aborts the whole run; the statement that used
                 // to follow it could never execute and has been removed.
                 if (!indexes.ContainsKey (target))
                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
                 int idx = (int) indexes [target];
                 int cnt = (int) counts [target];
                 bool french = t.FrenchSort;
                 // NOTE(review): this looks up entries with t's own LCID,
                 // so unless LCIDs are duplicated it re-reads t.FrenchSort.
                 // It may have been meant to compare against t.Alias;
                 // left as is because changing it would alter the
                 // generated data - confirm intent.
                 if (t.Alias != 0)
                         foreach (Tailoring t2 in tailorings)
                                 if (t2.LCID == t.LCID)
                                         french = t2.FrenchSort;
                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
#if Binary
                 binary.Write (t.LCID);
                 binary.Write (idx);
                 binary.Write (cnt);
                 binary.Write (french);
#endif
         }
         Result.WriteLine ("};");
#if Binary
         // Marker bytes, then the char count (two bytes per char) and the
         // raw chars collected in pass 1.
         binary.Write ((byte) 0xFF);
         binary.Write ((byte) 0xFF);
         binary.Write (rawdata.Length / 2);
         binary.Write (rawdata, 0, rawdata.Length);

         using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
                 byte [] array = ms.ToArray ();
                 fs.Write (array, 0, array.Length);
         }
#endif
 }
 605
 606                 #region Parse
 607
 // Runs every parsing step over the input files found under `dirname`,
 // in a fixed order, and finally reads the tailoring source, which is
 // looked up relative to the current directory rather than `dirname`.
 void ParseSources (string dirname)
 {
         ParseDerivedAge (dirname + "/DerivedAge.txt");

         FillIgnorables ();

         // The JIS order must be loaded in prior to ParseUnidata().
         ParseJISOrder (dirname + "/CP932.TXT");
         ParseUnidata (dirname + "/UnicodeData.txt");
         ModifyUnidata ();
         ParseDerivedCoreProperties (
                 dirname + "/DerivedCoreProperties.txt");
         ParseScripts (dirname + "/Scripts.txt");

         // CLDR collation data: Chinese, Japanese, Korean.
         ParseCJK (
                 dirname + "/common/collation/zh.xml",
                 dirname + "/common/collation/ja.xml",
                 dirname + "/common/collation/ko.xml");

         ParseTailorings ("mono-tailoring-source.txt");
 }
 637
 638                 void ParseTailorings (string filename)
 639                 {
 640                         Tailoring t = null;
 641                         int line = 0;
 642                         using (StreamReader sr = new StreamReader (filename)) {
 643                                 try {
 644                                         while (sr.Peek () >= 0) {
 645                                                 line++;
 646                                                 ProcessTailoringLine (ref t,
 647                                                         sr.ReadLine ().Trim ());
 648                                         }
 649                                 } catch (Exception) {
 650                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 651                                         throw;
 652                                 }
 653                         }
 654                 }
 655
 656                 // For now this is enough.
 657                 string ParseTailoringSourceValue (string s)
 658                 {
 659                         StringBuilder sb = new StringBuilder ();
 660                         for (int i = 0; i < s.Length; i++) {
 661                                 if (s.StartsWith ("\\u")) {
 662                                         sb.Append ((char) int.Parse (
 663                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 664                                                 1);
 665                                         i += 5;
 666                                 }
 667                         else
 668                                 sb.Append (s [i]);
 669                         }
 670                         return sb.ToString ();
 671                 }
 672
 673                 void ProcessTailoringLine (ref Tailoring t, string s)
 674                 {
 675                         int idx = s.IndexOf ('#');
 676                         if (idx > 0)
 677                                 s = s.Substring (0, idx).Trim ();
 678                         if (s.Length == 0 || s [0] == '#')
 679                                 return;
 680                         if (s [0] == '@') {
 681                                 idx = s.IndexOf ('=');
 682                                 if (idx > 0)
 683                                         t = new Tailoring (
 684                                                 int.Parse (s.Substring (1, idx - 1)),
 685                                                 int.Parse (s.Substring (idx + 1)));
 686                                 else
 687                                         t = new Tailoring (int.Parse (s.Substring (1)));
 688                                 tailorings.Add (t);
 689                                 return;
 690                         }
 691                         if (s.StartsWith ("*FrenchSort")) {
 692                                 t.FrenchSort = true;
 693                                 return;
 694                         }
 695                         string d = "*Diacritical";
 696                         if (s.StartsWith (d)) {
 697                                 idx = s.IndexOf ("->");
 698                                 t.AddDiacriticalMap (
 699                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 700                                                 NumberStyles.HexNumber),
 701                                         byte.Parse (s.Substring (idx + 2).Trim (),
 702                                                 NumberStyles.HexNumber));
 703                                 return;
 704                         }
 705                         idx = s.IndexOf (':');
 706                         if (idx > 0) {
 707                                 string source = s.Substring (0, idx).Trim ();
 708                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 709                                 byte [] b = new byte [4];
 710                                 for (int i = 0; i < 4; i++) {
 711                                         if (l [i] == "*")
 712                                                 b [i] = 0;
 713                                         else
 714                                                 b [i] = byte.Parse (l [i],
 715                                                         NumberStyles.HexNumber);
 716                                 }
 717                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 718                                         b);
 719                         }
 720                         idx = s.IndexOf ('=');
 721                         if (idx > 0)
 722                                 t.AddReplacementMap (
 723                                         ParseTailoringSourceValue (
 724                                                 s.Substring (0, idx).Trim ()),
 725                                         ParseTailoringSourceValue (
 726                                                 s.Substring (idx + 1).Trim ()));
 727                 }
 728
 729                 void ParseDerivedAge (string filename)
 730                 {
 731                         using (StreamReader file =
 732                                 new StreamReader (filename)) {
 733                                 while (file.Peek () >= 0) {
 734                                         string s = file.ReadLine ();
 735                                         int idx = s.IndexOf ('#');
 736                                         if (idx >= 0)
 737                                                 s = s.Substring (0, idx);
 738                                         idx = s.IndexOf (';');
 739                                         if (idx < 0)
 740                                                 continue;
 741
 742                                         string cpspec = s.Substring (0, idx);
 743                                         idx = cpspec.IndexOf ("..");
 744                                         NumberStyles nf = NumberStyles.HexNumber |
 745                                                 NumberStyles.AllowTrailingWhite;
 746                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 747                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 748                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 749
 750                                         // FIXME: use index
 751                                         if (cp > char.MaxValue)
 752                                                 continue;
 753
 754                                         double v = double.Parse (value);
 755                                         for (int i = cp; i <= cpEnd; i++)
 756                                                 unicodeAge [i] = v;
 757                                 }
 758                         }
 759                         unicodeAge [0] = double.MaxValue; // never be supported
 760                 }
 761
 762                 void ParseUnidata (string filename)
 763                 {
 764                         ArrayList decompValues = new ArrayList ();
 765                         using (StreamReader unidata =
 766                                 new StreamReader (filename)) {
 767                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 768                                         try {
 769                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 770                                         } catch (Exception) {
 771                                                 Console.Error.WriteLine ("**** At line " + line);
 772                                                 throw;
 773                                         }
 774                                 }
 775                         }
 776                         this.decompValues = (int [])
 777                                 decompValues.ToArray (typeof (int));
 778                 }
 779
 780                 char previousLatinTarget = char.MinValue; // Latin letter last resolved from a name by ProcessUnidataLine(); used there as the fallback target, reset to char.MinValue after 'Z'
 781                 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; // one counter per letter 'A'..'Z' (index = letter - 'A'); ProcessUnidataLine() increments a slot and adds 0x7C to it to form a diacritical weight
 782
 783                 void ProcessUnidataLine (string s, ArrayList decompValues)
 784                 {
 785                         int idx = s.IndexOf ('#');
 786                         if (idx >= 0)
 787                                 s = s.Substring (0, idx);
 788                         idx = s.IndexOf (';');
 789                         if (idx < 0)
 790                                 return;
 791                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 792                         string [] values = s.Substring (idx + 1).Split (';');
 793
 794                         // FIXME: use index
 795                         if (cp > char.MaxValue)
 796                                 return;
 797                         if (IsIgnorable (cp))
 798                                 return;
 799
 800                         string name = values [0];
 801
 802                         // SPECIAL CASE: rename some characters for diacritical
 803                         // remapping. FIXME: why are they different?
 804                         // FIXME: it's still not working.
 805                         if (cp == 0x018B || cp == 0x018C)
 806                                 name = name.Replace ("TOPBAR", "STROKE");
 807
 808                         // isSmallCapital
 809                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 810                                 isSmallCapital [cp] = true;
 811
 812                         // latin mapping by character name
 813                         if (s.IndexOf ("LATIN") >= 0) {
 814                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 815                                 int offset = lidx + 15;
 816                                 if (lidx < 0) {
 817                                         lidx = s.IndexOf ("LETTER TURNED ");
 818                                         offset = lidx + 14;
 819                                 }
 820                                 if (lidx < 0) {
 821                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 822                                         offset = lidx + 15;
 823                                 }
 824                                 if (lidx < 0) {
 825                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 826                                         offset = lidx + 14;
 827                                 }
 828                                 if (lidx < 0) {
 829                                         lidx = s.IndexOf ("LETTER ");
 830                                         offset = lidx + 7;
 831                                 }
 832                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 833                                 char n = s [offset + 1];
 834                                 char target = char.MinValue;
 835                                 if ('A' <= c && c <= 'Z' &&
 836                                         (n == ' ') || n == ';') {
 837                                         target = c;
 838                                         // FIXME: After 'Z', I cannot reset this state.
 839                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 840                                 }
 841
 842                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 843                                         target = 'A';
 844                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 845                                         target = 'B';
 846                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 847                                         target = 'C';
 848                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 849                                         target = 'E';
 850                                 else if (s.Substring (offset).StartsWith ("ENG"))
 851                                         target = 'N';
 852                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 853                                         target = 'O';
 854                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 855                                         target = 'R';
 856                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 857                                         target = 'S';
 858                                 else if (s.Substring (offset).StartsWith ("ESH"))
 859                                         target = 'S';
 860
 861                                 if (target == char.MinValue)
 862                                         target = previousLatinTarget;
 863
 864                                 if (target != char.MinValue) {
 865                                         ArrayList entry = (ArrayList) latinMap [target];
 866                                         if (entry == null) {
 867                                                 entry = new ArrayList ();
 868                                                 latinMap [target] = entry;
 869                                         }
 870                                         entry.Add (cp);
 871                                         // FIXME: This secondary weight is hack.
 872                                         // They are here because they must not
 873                                         // be identical to the corresponding
 874                                         // ASCII latins.
 875                                         if (c != target && diacritical [cp] == 0) {
 876                                                 diacriticalOffset [c - 'A']++;
 877                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
 878                                         }
 879                                 }
 880                         }
 881
 882                         // Arrow names
 883                         if (0x2000 <= cp && cp < 0x3000) {
 884                                 int value = 0;
 885                                 // SPECIAL CASES. FIXME: why?
 886                                 switch (cp) {
 887                                 case 0x21C5: value = -1; break; // E2
 888                                 case 0x261D: value = 1; break;
 889                                 case 0x27A6: value = 3; break;
 890                                 case 0x21B0: value = 7; break;
 891                                 case 0x21B1: value = 3; break;
 892                                 case 0x21B2: value = 7; break;
 893                                 case 0x21B4: value = 5; break;
 894                                 case 0x21B5: value = 7; break;
 895                                 case 0x21B9: value = -1; break; // E1
 896                                 case 0x21CF: value = 7; break;
 897                                 case 0x21D0: value = 3; break;
 898                                 }
 899                                 string [] arrowTargets = new string [] {
 900                                         "",
 901                                         "UPWARDS",
 902                                         "NORTH EAST",
 903                                         "RIGHTWARDS",
 904                                         "SOUTH EAST",
 905                                         "DOWNWARDS",
 906                                         "SOUTH WEST",
 907                                         "LEFTWARDS",
 908                                         "NORTH WEST",
 909                                         };
 910                                 if (value == 0)
 911                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 912                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 913                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 914                                                         s.IndexOf (" OVER") < 0
 915                                                 )
 916                                                         value = i;
 917                                 if (value > 0)
 918                                         arrowValues.Add (new DictionaryEntry (
 919                                                 cp, value));
 920                         }
 921
 922                         // Box names
 923                         if (0x2500 <= cp && cp < 0x2600) {
 924                                 int value = 0;
 925                                 // flags:
 926                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 927                                 // [h,rl] [r] [l]
 928                                 // [v,ud] [u] [d]
 929                                 // [dr] [dl] [ur] [ul]
 930                                 // [vr,udr] [vl,vdl]
 931                                 // [hd,rld] [hu,rlu]
 932                                 // [hv,udrl,rlv,udh]
 933                                 ArrayList flags = new ArrayList (new int [] {
 934                                         32, 8 + 4, 8, 4,
 935                                         16, 1 + 2, 1, 2,
 936                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 937                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 938                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 939                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 940                                         });
 941                                 byte [] offsets = new byte [] {
 942                                         0, 0, 1, 2,
 943                                         3, 3, 4, 5,
 944                                         6, 7, 8, 9,
 945                                         10, 10, 11, 11,
 946                                         12, 12, 13, 13,
 947                                         14, 14, 14, 14};
 948                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
 949                                         int flag = 0;
 950                                         if (s.IndexOf (" UP") >= 0)
 951                                                 flag |= 1;
 952                                         if (s.IndexOf (" DOWN") >= 0)
 953                                                 flag |= 2;
 954                                         if (s.IndexOf (" RIGHT") >= 0)
 955                                                 flag |= 4;
 956                                         if (s.IndexOf (" LEFT") >= 0)
 957                                                 flag |= 8;
 958                                         if (s.IndexOf (" VERTICAL") >= 0)
 959                                                 flag |= 16;
 960                                         if (s.IndexOf (" HORIZONTAL") >= 0)
 961                                                 flag |= 32;
 962
 963                                         int fidx = flags.IndexOf (flag);
 964                                         value = fidx < 0 ? fidx : offsets [fidx];
 965                                 } else if (s.IndexOf ("BLOCK") >= 0) {
 966                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
 967                                                 value = 0x12;
 968                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
 969                                                 value = 0x13;
 970                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
 971                                                 value = 0x14;
 972                                         else if (s.IndexOf ("HALF") >= 0)
 973                                                 value = 0x15;
 974                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
 975                                                 value = 0x16;
 976                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
 977                                                 value = 0x17;
 978                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
 979                                                 value = 0x18;
 980                                         else
 981                                                 value = 0x19;
 982                                 }
 983                                 else if (s.IndexOf ("SHADE") >= 0)
 984                                         value = 0x19;
 985                                 else if (s.IndexOf ("SQUARE") >= 0)
 986                                         value = 0xBC - 0xE5;
 987                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
 988                                         value = 0xBE - 0xE5;
 989                                 else if (s.IndexOf ("RECTANGLE") >= 0)
 990                                         value = 0xBD - 0xE5;
 991                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
 992                                         value = 0xBF - 0xE5;
 993                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
 994                                         if (s.IndexOf ("UP-POINTING") >= 0)
 995                                                 value = 0xC0 - 0xE5;
 996                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
 997                                                 value = 0xC1 - 0xE5;
 998                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
 999                                                 value = 0xC2 - 0xE5;
1000                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1001                                                 value = 0xC3 - 0xE5;
1002                                 }
1003                                 else if (s.IndexOf ("POINTER") >= 0) {
1004                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1005                                                 value = 0xC4 - 0xE5;
1006                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1007                                                 value = 0xC5 - 0xE5;
1008                                 }
1009                                 else if (s.IndexOf ("DIAMOND") >= 0)
1010                                         value = 0xC6 - 0xE5;
1011                                 else if (s.IndexOf ("FISHEYE") >= 0)
1012                                         value = 0xC7 - 0xE5;
1013                                 else if (s.IndexOf ("LOZENGE") >= 0)
1014                                         value = 0xC8 - 0xE5;
1015                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1016                                         value = 0xC9 - 0xE5;
1017                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1018                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1019                                                 value = 0xCA - 0xE5;
1020                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1021                                                 value = 0xCB - 0xE5;
1022                                         else
1023                                                 value = 0xC9 - 0xE5;
1024                                 }
1025                                 if (0x25DA <= cp && cp <= 0x25E5)
1026                                         value = 0xCD + cp - 0x25DA - 0xE5;
1027
1028                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1029                                 switch (cp) {
1030                                 case 0x2571: value = 0xF; break;
1031                                 case 0x2572: value = 0x10; break;
1032                                 case 0x2573: value = 0x11; break;
1033                                 }
1034                                 if (value != 0)
1035                                         boxValues.Add (new DictionaryEntry (
1036                                                 cp, value));
1037                         }
1038
1039                         // For some characters store the name and sort later
1040                         // to determine sorting.
1041                         if (0x2100 <= cp && cp <= 0x213F &&
1042                                 Char.IsSymbol ((char) cp))
1043                                 sortableCharNames.Add (
1044                                         new DictionaryEntry (cp, name));
1045                         else if (0x3380 <= cp && cp <= 0x33DD)
1046                                 sortableCharNames.Add (new DictionaryEntry (
1047                                         cp, name.Substring (7)));
1048
1049                         if (Char.GetUnicodeCategory ((char) cp) ==
1050                                 UnicodeCategory.MathSymbol) {
1051                                 if (name.StartsWith ("CIRCLED "))
1052                                         diacritical [cp] = 0xEE;
1053                                 if (name.StartsWith ("SQUARED "))
1054                                         diacritical [cp] = 0xEF;
1055                         }
1056
1057                         // diacritical weights by character name
1058 if (diacritics.Length != diacriticWeights.Length)
1059 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1060                         for (int d = 0; d < diacritics.Length; d++) {
1061                                 if (s.IndexOf (diacritics [d]) > 0) {
1062                                         diacritical [cp] += diacriticWeights [d];
1063                                         if (s.IndexOf ("COMBINING") >= 0)
1064                                                 diacritical [cp] -= (byte) 2;
1065                                         continue;
1066                                 }
1067                                 // also process "COMBINING blah" here
1068                                 // For now it is limited to cp < 0x0370
1069 //                              if (cp < 0x0300 || cp >= 0x0370)
1070 //                                      continue;
1071                                 string tmp = diacritics [d].TrimEnd (';');
1072                                 if (tmp.IndexOf ("WITH ") == 0)
1073                                         tmp = tmp.Substring (4);
1074                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1075                                 if (name == tmp)
1076                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1077 //if (name == tmp)
1078 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1079                         }
1080                         // Two-step grep required for it.
1081                         if (s.IndexOf ("FULL STOP") > 0 &&
1082                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1083                                 diacritical [cp] |= 0xF4;
1084
1085                         // Arabic letter name
1086                         if (0x0621 <= cp && cp <= 0x064A &&
1087                                 Char.GetUnicodeCategory ((char) cp)
1088                                 == UnicodeCategory.OtherLetter) {
1089                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1090                                 switch (cp) {
1091                                 case 0x0621:
1092                                 case 0x0624:
1093                                 case 0x0626:
1094                                         // hamza, waw, yeh ... special cases.
1095                                         value = 0x07;
1096                                         break;
1097                                 case 0x0649:
1098                                 case 0x064A:
1099                                         value = 0x77; // special cases.
1100                                         break;
1101                                 default:
1102                                         // Get primary letter name i.e.
1103                                         // XXX part of ARABIC LETTER XXX yyy
1104                                         // e.g. that of "TEH MARBUTA" is "TEH".
1105                                         string letterName =
1106                                                 (cp == 0x0640) ?
1107                                                 // 0x0640 is special: it does
1108                                                 // not start with ARABIC LETTER
1109                                                 name :
1110                                                 name.Substring (14);
1111                                         int tmpIdx = letterName.IndexOf (' ');
1112                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1113 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1114                                         if (arabicNameMap.ContainsKey (letterName))
1115                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1116                                         else
1117                                                 arabicNameMap [letterName] = cp;
1118                                         break;
1119                                 }
1120                                 arabicLetterPrimaryValues [cp] = value;
1121                         }
1122
1123                         // Japanese square letter
1124                         if (0x3300 <= cp && cp <= 0x3357)
1125                                 if (!ExistsJIS (cp))
1126                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1127
1128                         // normalizationType
1129                         string decomp = values [4];
1130                         idx = decomp.IndexOf ('<');
1131                         if (idx >= 0) {
1132                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1133                                 case "full":
1134                                         decompType [cp] = DecompositionFull;
1135                                         break;
1136                                 case "sub":
1137                                         decompType [cp] = DecompositionSub;
1138                                         break;
1139                                 case "super":
1140                                         decompType [cp] = DecompositionSuper;
1141                                         break;
1142                                 case "small":
1143                                         decompType [cp] = DecompositionSmall;
1144                                         break;
1145                                 case "isolated":
1146                                         decompType [cp] = DecompositionIsolated;
1147                                         break;
1148                                 case "initial":
1149                                         decompType [cp] = DecompositionInitial;
1150                                         break;
1151                                 case "final":
1152                                         decompType [cp] = DecompositionFinal;
1153                                         break;
1154                                 case "medial":
1155                                         decompType [cp] = DecompositionMedial;
1156                                         break;
1157                                 case "noBreak":
1158                                         decompType [cp] = DecompositionNoBreak;
1159                                         break;
1160                                 case "compat":
1161                                         decompType [cp] = DecompositionCompat;
1162                                         break;
1163                                 case "fraction":
1164                                         decompType [cp] = DecompositionFraction;
1165                                         break;
1166                                 case "font":
1167                                         decompType [cp] = DecompositionFont;
1168                                         break;
1169                                 case "circle":
1170                                         decompType [cp] = DecompositionCircle;
1171                                         break;
1172                                 case "square":
1173                                         decompType [cp] = DecompositionSquare;
1174                                         break;
1175                                 case "wide":
1176                                         decompType [cp] = DecompositionWide;
1177                                         break;
1178                                 case "narrow":
1179                                         decompType [cp] = DecompositionNarrow;
1180                                         break;
1181                                 case "vertical":
1182                                         decompType [cp] = DecompositionVertical;
1183                                         break;
1184                                 default:
1185                                         throw new Exception ("Support NFKD type : " + decomp);
1186                                 }
1187                         }
1188                         else
1189                                 decompType [cp] = DecompositionCanonical;
1190                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1191                         if (decomp.Length > 0) {
1192
1193                                 string [] velems = decomp.Split (' ');
1194                                 int didx = decompValues.Count;
1195                                 decompIndex [cp] = didx;
1196                                 foreach (string v in velems)
1197                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1198                                 decompLength [cp] = velems.Length;
1199
1200                                 // [decmpType] -> this_cp
1201                                 int targetCP = (int) decompValues [didx];
1202                                 // for "(x)" it specially maps to 'x' .
1203                                 // FIXME: check if it is sane
1204                                 if (velems.Length == 3 &&
1205                                         (int) decompValues [didx] == '(' &&
1206                                         (int) decompValues [didx + 2] == ')')
1207                                         targetCP = (int) decompValues [didx + 1];
1208                                 // special: 0x215F "1/"
1209                                 else if (cp == 0x215F)
1210                                         targetCP = '1';
1211                                 else if (velems.Length > 1 &&
1212                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1213                                         // skip them, except for CJK ideograph compat
1214                                         targetCP = 0;
1215
1216                                 if (targetCP != 0) {
1217                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1218                                         if (entry == null) {
1219                                                 entry = new Hashtable ();
1220                                                 nfkdMap [targetCP] = entry;
1221                                         }
1222                                         entry [(byte) decompType [cp]] = cp;
1223                                 }
1224                         }
1225                         // numeric values
1226                         if (values [5].Length > 0)
1227                                 decimalValue [cp] = decimal.Parse (values [5]);
1228                         else if (values [6].Length > 0)
1229                                 decimalValue [cp] = decimal.Parse (values [6]);
1230                         else if (values [7].Length > 0) {
1231                                 string decstr = values [7];
1232                                 idx = decstr.IndexOf ('/');
1233                                 if (cp == 0x215F) // special. "1/"
1234                                         decimalValue [cp] = 0x1;
1235                                 else if (idx > 0)
1236                                         // m/n
1237                                         decimalValue [cp] =
1238                                                 decimal.Parse (decstr.Substring (0, idx))
1239                                                 / decimal.Parse (decstr.Substring (idx + 1));
1240                                 else if (decstr [0] == '(' &&
1241                                         decstr [decstr.Length - 1] == ')')
1242                                         // (n)
1243                                         decimalValue [cp] =
1244                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1245                                 else if (decstr [decstr.Length - 1] == '.')
1246                                         // n.
1247                                         decimalValue [cp] =
1248                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1249                                 else
1250                                         decimalValue [cp] = decimal.Parse (decstr);
1251                         }
1252                 }
1253
// Reads the derived-core-properties data file and feeds it, one line at
// a time, to ProcessDerivedCorePropLine () (which only records the
// "Uppercase" property).
//
//      filename - path of the data file to read.
//
// If reading or processing a line throws, the 1-based number of that line
// is written to stderr and the exception is rethrown unchanged.
void ParseDerivedCoreProperties (string filename)
{
        using (StreamReader file = new StreamReader (filename)) {
                int line = 1;
                try {
                        string text;
                        // A null from ReadLine () is the end-of-input
                        // signal. Peek () is not used for that purpose
                        // because it may report -1 before the data is
                        // really exhausted (e.g. after a short read).
                        while ((text = file.ReadLine ()) != null) {
                                ProcessDerivedCorePropLine (text);
                                line++;
                        }
                } catch (Exception) {
                        Console.Error.WriteLine ("**** At line " + line);
                        throw;
                }
        }
}
1269
// Handles a single line of the derived-core-properties data.
//
// The part of the line starting at '#' is dropped. What remains is
// expected to look like
//
//      XXXX ; Property        or        XXXX..YYYY ; Property
//
// where XXXX / YYYY are hexadecimal code points. Lines without ';' are
// skipped. Only "Uppercase" is recorded, by setting isUppercase [cp] for
// every code point of the (inclusive) range.
//
// Throws whatever int.Parse () throws when the code point part is not
// valid hexadecimal.
void ProcessDerivedCorePropLine (string s)
{
        int comment = s.IndexOf ('#');
        if (comment >= 0)
                s = s.Substring (0, comment);
        int semicolon = s.IndexOf (';');
        if (semicolon < 0)
                return;

        string cpspec = s.Substring (0, semicolon);
        string value = s.Substring (semicolon + 1).Trim ();

        int range = cpspec.IndexOf ("..");
        NumberStyles nf = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;
        int cp = int.Parse (range < 0 ? cpspec : cpspec.Substring (0, range), nf);
        int cpEnd = range < 0 ? cp : int.Parse (cpspec.Substring (range + 2), nf);

        // FIXME: use index
        if (cp > char.MaxValue)
                return;
        // A range can start inside the BMP and end beyond it. Only the
        // BMP part is kept, the same way ranges starting beyond the BMP
        // are dropped above.
        // NOTE(review): assumes isUppercase has one slot per UTF-16 code
        // unit - confirm against its declaration.
        if (cpEnd > char.MaxValue)
                cpEnd = char.MaxValue;

        if (value == "Uppercase") {
                for (int x = cp; x <= cpEnd; x++)
                        isUppercase [x] = true;
        }
}
1297
// Reads a script data file and collects the characters of the Gurmukhi,
// Gujarati, Georgian and Thaana scripts.
//
//      filename - path of the data file. Each data line is expected to be
//                 "XXXX ; Script" or "XXXX..YYYY ; Script" (hexadecimal
//                 code points); text from '#' on is ignored and lines
//                 without ';' are skipped.
//
// Characters for which IsIgnorable () is true are left out. Each set is
// sorted with UCAComparer.Instance and stored into orderedGurmukhi,
// orderedGujarati, orderedGeorgian and orderedThaana respectively.
void ParseScripts (string filename)
{
        ArrayList gurmukhi = new ArrayList ();
        ArrayList gujarati = new ArrayList ();
        ArrayList georgian = new ArrayList ();
        ArrayList thaana = new ArrayList ();

        using (StreamReader file = new StreamReader (filename)) {
                string s;
                // A null from ReadLine () marks the end of input; Peek ()
                // may report -1 before the data is really exhausted.
                while ((s = file.ReadLine ()) != null) {
                        int comment = s.IndexOf ('#');
                        if (comment >= 0)
                                s = s.Substring (0, comment);
                        int semicolon = s.IndexOf (';');
                        if (semicolon < 0)
                                continue;

                        string cpspec = s.Substring (0, semicolon);
                        string value = s.Substring (semicolon + 1).Trim ();

                        int range = cpspec.IndexOf ("..");
                        NumberStyles nf = NumberStyles.HexNumber |
                                NumberStyles.AllowTrailingWhite;
                        int cp = int.Parse (range < 0 ? cpspec : cpspec.Substring (0, range), nf);
                        int cpEnd = range < 0 ? cp : int.Parse (cpspec.Substring (range + 2), nf);

                        // FIXME: use index
                        if (cp > char.MaxValue)
                                continue;
                        // Keep only the BMP part of a range that ends
                        // beyond it; otherwise the (char) cast below
                        // would silently wrap around.
                        if (cpEnd > char.MaxValue)
                                cpEnd = char.MaxValue;

                        // Pick the list that collects this script; any
                        // other script is of no interest here.
                        ArrayList target;
                        switch (value) {
                        case "Gurmukhi":
                                target = gurmukhi;
                                break;
                        case "Gujarati":
                                target = gujarati;
                                break;
                        case "Georgian":
                                target = georgian;
                                break;
                        case "Thaana":
                                target = thaana;
                                break;
                        default:
                                continue;
                        }

                        for (int x = cp; x <= cpEnd; x++)
                                if (!IsIgnorable (x))
                                        target.Add ((char) x);
                }
        }

        gurmukhi.Sort (UCAComparer.Instance);
        gujarati.Sort (UCAComparer.Instance);
        georgian.Sort (UCAComparer.Instance);
        thaana.Sort (UCAComparer.Instance);
        orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
        orderedGujarati = (char []) gujarati.ToArray (typeof (char));
        orderedGeorgian = (char []) georgian.ToArray (typeof (char));
        orderedThaana = (char []) thaana.ToArray (typeof (char));
}
1361
// Reads the JIS order file and passes every line to
// ProcessJISOrderLine ().
//
//      filename - path of the file to read.
//
// On any failure (including a failure to open the file) the current
// 1-based line number is written to stderr and the exception is rethrown
// unchanged.
void ParseJISOrder (string filename)
{
        int line = 1;
        try {
                using (StreamReader file = new StreamReader (filename)) {
                        string text;
                        // A null from ReadLine () marks the end of input;
                        // Peek () may report -1 before the data is really
                        // exhausted.
                        while ((text = file.ReadLine ()) != null) {
                                ProcessJISOrderLine (text);
                                line++;
                        }
                }
        } catch (Exception) {
                Console.Error.WriteLine ("---- line {0}", line);
                throw;
        }
}
1376
// Characters that separate the two columns of a JIS order line.
char [] ws = new char [] {'\t', ' '};

// Handles a single line of the JIS order file.
//
// Text from '#' on is ignored. A data line is expected to hold two
// "0x"-prefixed hexadecimal numbers separated by tabs/spaces: the JIS
// code first, the Unicode code point second. The pair is appended to
// jisJapanese as a JISCharacter. Blank lines and lines with a single
// column are skipped.
//
// Throws whatever int.Parse () / Substring () throw on malformed numbers.
void ProcessJISOrderLine (string s)
{
        int comment = s.IndexOf ('#');
        if (comment >= 0)
                s = s.Substring (0, comment);
        // Trim regardless of whether there was a comment. Otherwise a
        // whitespace-only line, or a line with leading whitespace, finds
        // its separator at index 0 and Substring (2, -2) below throws.
        s = s.Trim ();
        if (s.Length == 0)
                return;

        int sep = s.IndexOfAny (ws);
        if (sep < 0)
                return;

        // They start with "0x" so cut them out.
        int jis = int.Parse (s.Substring (2, sep - 2), NumberStyles.HexNumber);
        int cp = int.Parse (s.Substring (sep).Trim ().Substring (2), NumberStyles.HexNumber);
        jisJapanese.Add (new JISCharacter (cp, jis));
}
1394
// Fills the CJK weight tables from LDML collation documents.
//
//      zhXML - document holding the 'pinyin' rules (-> cjkCHS) and the
//              'stroke' rules (-> cjkCHT)
//      jaXML - document holding the rules for cjkJA
//      koXML - document holding the reset rules for cjkKO / cjkKOlv2
//
// Every table is indexed directly by the UTF-16 code unit of the
// character.
void ParseCJK (string zhXML, string jaXML, string koXML)
{
        XmlDocument doc = new XmlDocument ();
        doc.XmlResolver = null;

        // Chinese Simplified: pinyin order, weights start at 0x8008.
        doc.Load (zhXML);
        FillCjkWeightsInOrder ("chs", cjkCHS,
                doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText,
                '\u3100', 0x8008);

        // Chinese Traditional: stroke order taken from the same
        // document, weights start at 0x8002.
        FillCjkWeightsInOrder ("cht", cjkCHT,
                doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText,
                '\u4E00', 0x8002);

        // Japanese
        doc.Load (jaXML);
        string jaOrder = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
        ushort [] ja = cjkJA;

        // SPECIAL CASES
        ja [0x4EDD] = 0x8002; // Chinese repetition mark?
        ja [0x337B] = 0x8004; // Those 4 characters are Gengou
        ja [0x337E] = 0x8005;
        ja [0x337D] = 0x8006;
        ja [0x337C] = 0x8007;

        int weight = 0x8008;
        foreach (char c in jaOrder) {
                if (c < '\u4E00') {
                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", "ja", (int) c, weight);
                        continue;
                }
                weight = StoreCjkWeight (ja, c, weight);

                // SPECIAL CASES: no compatibility characters are
                // placed after these ideographs.
                switch (c) {
                case '\u662D': // U+337C
                case '\u5927': // U+337D
                case '\u5E73': // U+337B
                case '\u660E': // U+337E
                case '\u9686': // U+F9DC
                        continue;
                }

                // Place right after c every character whose
                // decomposition begins with c, or whose three-element
                // decomposition has c in the middle.
                // FIXME: there are still remaining characters after
                // U+FA0C (the scan stops there, not at char.MaxValue).
                for (int k = 0; k < 0xFA0C; k++) {
                        if (decompIndex [k] == 0)
                                continue;
                        bool hit = decompValues [decompIndex [k]] == c;
                        if (!hit && decompLength [k] == 3)
                                hit = decompValues [decompIndex [k] + 1] == c;
                        if (hit)
                                weight = StoreCjkWeight (ja, k, weight);
                }
        }

        // Korean
        // Korean weight is somewhat complex. It first shifts
        // Hangul category from 52-x to 80-x (they are anyways
        // computed). CJK ideographs are placed at secondary
        // weight, like XX YY 01 zz 01, where XX and YY are
        // corresponding "reset" value and zz is 41,43,45...
        //
        // Unlike chs, cht and ja, the Korean value is a combined
        // ushort (category and level 1) computed from the "reset"
        // syllable.
        doc.Load (koXML);
        foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                XmlElement following = (XmlElement) reset.NextSibling;
                // compute "category" and "level 1" for the target
                // "reset" Hangul syllable
                char syllable = reset.InnerText [0];
                int ordinal = syllable - 0xAC00 + 1;
                ushort primary = (ushort)
                        (ordinal / 254 * 256 + ordinal % 254 + 2);
                // Place the characters after the target; their
                // secondary weights are 0x41, 0x43, 0x45...
                string members = following.InnerText;
                for (int n = 0; n < members.Length; n++) {
                        char c = members [n];
                        cjkKO [c] = primary;
                        cjkKOlv2 [c] = (byte) (0x41 + 2 * n);
                }
        }
}

// Walks chars in order and gives each character a consecutive weight in
// table, beginning with weight. Characters below firstAccepted get no
// weight; a warning naming category is written to stderr instead.
void FillCjkWeightsInOrder (string category, ushort [] table, string chars, char firstAccepted, int weight)
{
        foreach (char c in chars) {
                if (c < firstAccepted)
                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, weight);
                else
                        weight = StoreCjkWeight (table, c, weight);
        }
}

// Writes weight to table [index] and returns the weight to hand out
// next. When the successor is a multiple of 256 it is moved on by two,
// so low bytes 00 and 01 are never handed out.
static int StoreCjkWeight (ushort [] table, int index, int weight)
{
        table [index] = (ushort) weight;
        weight++;
        if (weight % 256 == 0)
                weight += 2;
        return weight;
}
1523
1524                 #endregion
1525
1526                 #region Generation
1527
// Fills ignorableFlags for every UTF-16 code unit. The bits set are
//
//      1 - IsIgnorable () is true
//      2 - IsIgnorableSymbol () is true
//      4 - IsIgnorableNonSpacing () is true
//
// Code units whose Unicode category is OtherNotAssigned are left
// untouched.
void FillIgnorables ()
{
        for (int cp = char.MinValue; cp <= char.MaxValue; cp++) {
                UnicodeCategory category =
                        Char.GetUnicodeCategory ((char) cp);
                if (category != UnicodeCategory.OtherNotAssigned) {
                        if (IsIgnorable (cp))
                                ignorableFlags [cp] |= 1;
                        if (IsIgnorableSymbol (cp))
                                ignorableFlags [cp] |= 2;
                        if (IsIgnorableNonSpacing (cp))
                                ignorableFlags [cp] |= 4;
                }
        }
}
1542
// Patches the parsed decomposition and diacritical tables with a set of
// hard-coded exceptions.
void ModifyUnidata ()
{
        // Modify some decomposition equivalence: U+FE31 and U+FE32
        // lose their decomposition type, index and length.
        for (int cp = 0xFE31; cp <= 0xFE32; cp++) {
                decompType [cp] = 0;
                decompIndex [cp] = 0;
                decompLength [cp] = 0;
        }

        // Korean parens numbers
        for (int cp = 0x3200; cp < 0x321D; cp++)
                diacritical [cp] = 0xA;
        for (int cp = 0x3260; cp < 0x327C; cp++)
                diacritical [cp] = 0xC;

        // LAMESPEC: these remapping should not be done.
        // Windows have incorrect CJK compat mappings.
        // U+323B and U+3238 are cut down to a single element first...
        decompLength [0x323B] = 1;
        decompLength [0x3238] = 1;
        // ...then the first decomposition element of each character
        // is overwritten.
        decompValues [decompIndex [0x32A9]] = 0x91AB;
        decompValues [decompIndex [0x323B]] = 0x5B78;
        decompValues [decompIndex [0x32AB]] = 0x5B78;
        decompValues [decompIndex [0x32A2]] = 0x5BEB;
        decompValues [decompIndex [0x3238]] = 0x52DE;
        decompValues [decompIndex [0x3298]] = 0x52DE;

        // LAMESPEC: custom remapping (which is not bugs but not fine,
        // non-standard compliant things)
        // U+FA0C borrows the U+F929 room (being empty) and maps to
        // U+5140. This has to happen before U+F929 is reset below.
        decompLength [0xFA0C] = 1;
        decompIndex [0xFA0C] = decompIndex [0xF929];
        decompValues [decompIndex [0xFA0C]] = 0x5140;

        // U+F929 and U+F92C end up with no decomposition at all.
        foreach (int cp in new int [] {0xF929, 0xF92C}) {
                decompLength [cp] = 0;
                decompIndex [cp] = 0;
        }
}
1578
// Post-processes parsed data:
//
//  - gives the numbers inside each range of numberSecondaryWeightBounds
//    a per-range diacritical weight (0x38 for the first range, 0x39 for
//    the second, and so on);
//  - renames or removes a fixed set of entries of sortableCharNames.
void ModifyParsedValues ()
{
        // number, secondary weights
        // numberSecondaryWeightBounds holds (start, end) pairs; end is
        // exclusive.
        int [] bounds = numberSecondaryWeightBounds;
        byte weight = 0x38;
        int pair = 0;
        while (pair < bounds.Length) {
                for (int cp = bounds [pair]; cp < bounds [pair + 1]; cp++) {
                        if (Char.IsNumber ((char) cp))
                                diacritical [cp] = weight;
                }
                pair += 2;
                weight++;
        }

        // Update name part of named characters
        int i = 0;
        while (i < sortableCharNames.Count) {
                int cp = (int) ((DictionaryEntry) sortableCharNames [i]).Key;
                string renamed = null;
                bool remove = false;
                switch (cp) {
                case 0x2101:
                        renamed = "A_1";
                        break;
                case 0x33C3:
                        renamed = "A_2";
                        break;
                case 0x2105:
                        renamed = "C_1";
                        break;
                case 0x2106:
                        renamed = "C_2";
                        break;
                case 0x211E:
                        renamed = "R1";
                        break;
                case 0x211F:
                        renamed = "R2";
                        break;
                // Remove some of them!
                case 0x2103:
                case 0x2109:
                case 0x2116:
                case 0x2117:
                case 0x2118:
                case 0x2125:
                case 0x2127:
                case 0x2129:
                case 0x212E:
                case 0x2132:
                        remove = true;
                        break;
                }

                if (remove) {
                        // The following entry slides into slot i, so
                        // the index must not advance.
                        sortableCharNames.RemoveAt (i);
                        continue;
                }
                if (renamed != null)
                        sortableCharNames [i] =
                                new DictionaryEntry (cp, renamed);
                i++;
        }
}
1622
1623                 void GenerateCore ()
1624                 {
1625                         UnicodeCategory uc;
1626
1627                         #region Specially ignored // 01
1628                         // This will raise "Defined" flag up.
1629                         foreach (char c in specialIgnore)
1630                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1631                         #endregion
1632
1633
1634                         #region Variable weights
1635                         // Controls : 06 03 - 06 3D
1636                         fillIndex [6] = 3;
1637                         for (int i = 0; i < 65536; i++) {
1638                                 if (IsIgnorable (i))
1639                                         continue;
1640                                 char c = (char) i;
1641                                 uc = Char.GetUnicodeCategory (c);
1642                                 // NEL is whitespace but not ignored here.
1643                                 if (uc == UnicodeCategory.Control &&
1644                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1645                                         AddCharMap (c, 6, 1);
1646                         }
1647
1648                         // Apostrophe 06 80
1649                         fillIndex [6] = 0x80;
1650                         AddCharMapGroup ('\'', 6, 1, 0);
1651                         AddCharMap ('\uFE63', 6, 1);
1652
1653                         // Hyphen/Dash : 06 81 - 06 90
1654                         for (int i = 0; i < char.MaxValue; i++) {
1655                                 if (!IsIgnorable (i) &&
1656                                         Char.GetUnicodeCategory ((char) i) ==
1657                                         UnicodeCategory.DashPunctuation) {
1658                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1659                                         if (i == 0x2011) {
1660                                                 // SPECIAL: add 2027 and 2043
1661                                                 // Maybe they are regarded the
1662                                                 // same hyphens in "central"
1663                                                 // position.
1664                                                 AddCharMap ('\u2027', 6, 1);
1665                                                 AddCharMap ('\u2043', 6, 1);
1666                                         }
1667                                 }
1668                         }
1669
1670                         // Arabic variable weight chars 06 A0 -
1671                         fillIndex [6] = 0xA0;
1672                         // vowels
1673                         for (int i = 0x64B; i <= 0x650; i++)
1674                                 AddArabicCharMap ((char) i);
1675                         // sukun
1676                         AddCharMapGroup ('\u0652', 6, 1, 0);
1677                         // shadda
1678                         AddCharMapGroup ('\u0651', 6, 1, 0);
1679                         #endregion
1680
1681
1682                         #region Nonspacing marks // 01
1683                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1684
1685                         // Combining diacritical marks: 01 DC -
1686
1687                         fillIndex [0x1] = 0x41;
1688                         for (int i = 0x030E; i <= 0x0326; i++)
1689                                 if (!IsIgnorable (i))
1690                                         AddCharMap ((char) i, 0x1, 1);
1691                         for (int i = 0x0329; i <= 0x0334; i++)
1692                                 if (!IsIgnorable (i))
1693                                         AddCharMap ((char) i, 0x1, 1);
1694                         for (int i = 0x0339; i <= 0x0341; i++)
1695                                 if (!IsIgnorable (i))
1696                                         AddCharMap ((char) i, 0x1, 1);
1697                         fillIndex [0x1] = 0x72;
1698                         for (int i = 0x0346; i <= 0x0348; i++)
1699                                 if (!IsIgnorable (i))
1700                                         AddCharMap ((char) i, 0x1, 1);
1701                         for (int i = 0x02BE; i <= 0x02BF; i++)
1702                                 if (!IsIgnorable (i))
1703                                         AddCharMap ((char) i, 0x1, 1);
1704                         for (int i = 0x02C1; i <= 0x02C5; i++)
1705                                 if (!IsIgnorable (i))
1706                                         AddCharMap ((char) i, 0x1, 1);
1707                         for (int i = 0x02CE; i <= 0x02CF; i++)
1708                                 if (!IsIgnorable (i))
1709                                         AddCharMap ((char) i, 0x1, 1);
1710                         for (int i = 0x02D1; i <= 0x02D3; i++)
1711                                 if (!IsIgnorable (i))
1712                                         AddCharMap ((char) i, 0x1, 1);
1713                         AddCharMap ('\u02DE', 0x1, 1);
1714                         for (int i = 0x02E4; i <= 0x02E9; i++)
1715                                 if (!IsIgnorable (i))
1716                                         AddCharMap ((char) i, 0x1, 1);
1717
1718                         // FIXME: needs more love here (it should eliminate
1719                         // all the hacky code above).
1720                         for (int i = 0x0300; i < 0x0370; i++)
1721                                 if (!IsIgnorable (i) && diacritical [i] != 0
1722                                         /* especiall here*/ && !map [i].Defined)
1723                                         map [i] = new CharMapEntry (
1724                                                 0x1, 0x1, diacritical [i]);
1725
1726                         fillIndex [0x1] = 0x94;
1727                         // syriac dotted nonspacing marks
1728                         AddCharMap ('\u0732', 0x1, 1);
1729                         AddCharMap ('\u0735', 0x1, 1);
1730                         AddCharMap ('\u0738', 0x1, 1);
1731                         AddCharMap ('\u0739', 0x1, 1);
1732                         AddCharMap ('\u073C', 0x1, 1);
1733                         fillIndex [0x1] = 0x9F;
1734                         for (int i = 0x0730; i <= 0x07B0; i++)
1735                                 if (!IsIgnorable (i) && !map [i].Defined)
1736                                         AddCharMap ((char) i, 0x1, 1);
1737
1738                         fillIndex [0x1] = 0x0C;
1739                         for (int i = 0x0EC8; i <= 0x0ECD; i++)
1740                                 if (!IsIgnorable (i))
1741                                         AddCharMap ((char) i, 0x1, 1);
1742
1743                         // LAMESPEC: It should not stop at '\u20E1'. There are
1744                         // a few more characters (that however results in
1745                         // overflow of level 2 unless we start before 0xDD).
1746                         fillIndex [0x1] = 0xDC;
1747                         for (int i = 0x20d0; i <= 0x20e1; i++)
1748                                 AddCharMap ((char) i, 0x1, 1);
1749
1750                         // They are not part of Nonspacing marks, but have
1751                         // only diacritical weight.
1752                         for (int i = 0x3099; i <= 0x309C; i++)
1753                                 map [i] = new CharMapEntry (1, 1, 1);
1754                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1755                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1756                         for (int i = 0x30FC; i <= 0x30FE; i++)
1757                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1758
1759                         #endregion
1760
1761
1762                         #region Whitespaces // 07 03 -
1763                         fillIndex [0x7] = 0x2;
1764                         AddCharMap (' ', 0x7, 2);
1765                         AddCharMap ('\u00A0', 0x7, 1);
1766                         for (int i = 9; i <= 0xD; i++)
1767                                 AddCharMap ((char) i, 0x7, 1);
1768                         for (int i = 0x2000; i <= 0x200B; i++)
1769                                 AddCharMap ((char) i, 0x7, 1);
1770
1771                         fillIndex [0x7] = 0x17;
1772                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1773                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1774
1775                         // Characters which used to represent layout control.
1776                         // LAMESPEC: Windows developers seem to have thought
1777                         // that those characters are kind of whitespaces,
1778                         // while they aren't.
1779                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1780                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1781                         #endregion
1782
1783                         // category 09 - continued symbols from 08
1784                         fillIndex [0x9] = 2;
1785                         // misc tech mark
1786                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1787                                 AddCharMap ((char) cp, 0x9, 1, 0);
1788
1789                         // arrows
1790                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1791                         foreach (DictionaryEntry de in arrowValues) {
1792                                 int idx = (int) de.Value;
1793                                 int cp = (int) de.Key;
1794                                 if (map [cp].Defined)
1795                                         continue;
1796                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1797                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1798                                 arrowLv2 [idx]++;
1799                         }
1800                         // boxes
1801                         byte [] boxLv2 = new byte [128];
1802                         for (int i = 0; i < boxLv2.Length; i++)
1803                                 boxLv2 [i] = 3;
1804                         foreach (DictionaryEntry de in boxValues) {
1805                                 int cp = (int) de.Key;
1806                                 int off = (int) de.Value;
1807                                 if (map [cp].Defined)
1808                                         continue;
1809                                 if (off < 0) {
1810                                         fillIndex [0x9] = (byte) (0xE5 + off);
1811                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1812                                 }
1813                                 else {
1814                                         fillIndex [0x9] = (byte) (0xE5 + off);
1815                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1816                                 }
1817                         }
1818                         // Some special characters (slanted)
1819                         fillIndex [0x9] = 0xF4;
1820                         AddCharMap ('\u2571', 0x9, 3);
1821                         AddCharMap ('\u2572', 0x9, 3);
1822                         AddCharMap ('\u2573', 0x9, 3);
1823
1824                         // FIXME: implement 0A
1825                         #region Symbols
1826                         fillIndex [0xA] = 2;
1827                         // byte currency symbols
1828                         for (int cp = 0; cp < 0x100; cp++) {
1829                                 uc = Char.GetUnicodeCategory ((char) cp);
1830                                 if (!IsIgnorable (cp) &&
1831                                         uc == UnicodeCategory.CurrencySymbol &&
1832                                         cp != '$' ||
1833                                         cp == 0xAC)
1834                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1835                         }
1836                         // byte other symbols
1837                         for (int cp = 0; cp < 0x100; cp++) {
1838                                 if (cp == 0xA6)
1839                                         continue; // SPECIAL: skip FIXME: why?
1840                                 uc = Char.GetUnicodeCategory ((char) cp);
1841                                 if (!IsIgnorable (cp) &&
1842                                         uc == UnicodeCategory.OtherSymbol ||
1843                                         cp == '\u00B5' || cp == '\u00B7')
1844                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1845                         }
1846
1847                         fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
1848                         for (int cp = 0x2020; cp <= 0x2031; cp++)
1849                                 if (Char.IsPunctuation ((char) cp))
1850                                         AddCharMap ((char) cp, 0xA, 1, 0);
1851                         // SPECIAL CASES: why?
1852                         AddCharMap ('\u203B', 0xA, 1, 0);
1853                         AddCharMap ('\u2040', 0xA, 1, 0);
1854                         AddCharMap ('\u2041', 0xA, 1, 0);
1855                         AddCharMap ('\u2042', 0xA, 1, 0);
1856
1857                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1858                                 AddCharMap ((char) cp, 0xA, 1, 0);
1859                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1860                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1861                                 AddCharMap ((char) cp, 0xA, 1, 0);
1862                         // Dingbats
1863                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1864                                 if (Char.IsSymbol ((char) cp))
1865                                         AddCharMap ((char) cp, 0xA, 1, 0);
1866                         // OCR
1867                         for (int i = 0x2440; i < 0x2460; i++)
1868                                 AddCharMap ((char) i, 0xA, 1, 0);
1869
1870                         #endregion
1871
1872                         #region Numbers // 0C 02 - 0C E1
1873                         fillIndex [0xC] = 2;
1874
1875                         // 9F8 : Bengali "one less than the denominator"
1876                         AddCharMap ('\u09F8', 0xC, 1);
1877
1878                         ArrayList numbers = new ArrayList ();
1879                         for (int i = 0; i < 65536; i++)
1880                                 if (!IsIgnorable (i) &&
1881                                         Char.IsNumber ((char) i) &&
1882                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1883                                         numbers.Add (i);
1884
1885                         ArrayList numberValues = new ArrayList ();
1886                         foreach (int i in numbers)
1887                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1888                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1889
1890 //foreach (DictionaryEntry de in numberValues)
1891 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1892
1893                         decimal prevValue = -1;
1894                         foreach (DictionaryEntry de in numberValues) {
1895                                 int cp = (int) de.Key;
1896                                 decimal currValue = (decimal) de.Value;
1897                                 bool addnew = false;
1898                                 if (prevValue < currValue &&
1899                                         prevValue - (int) prevValue == 0 &&
1900                                         prevValue >= 1) {
1901
1902                                         addnew = true;
1903                                         // Process Hangzhou and Roman numbers
1904
1905                                         // There are some SPECIAL cases.
1906                                         if (currValue != 4) // no increment for 4
1907                                                 fillIndex [0xC]++;
1908
1909                                         int xcp;
1910                                         if (currValue <= 10) {
1911                                                 xcp = (int) prevValue + 0x2170 - 1;
1912                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1913                                                 xcp = (int) prevValue + 0x2160 - 1;
1914                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1915                                                 fillIndex [0xC] += 2;
1916                                                 xcp = (int) prevValue + 0x3021 - 1;
1917                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1918                                                 fillIndex [0xC]++;
1919                                         }
1920                                         else if (currValue == 11)
1921                                                 fillIndex [0xC]++;
1922                                 }
1923                                 if (prevValue < currValue)
1924                                         prevValue = currValue;
1925                                 if (map [cp].Defined)
1926                                         continue;
1927                                 // Hangzhou and Roman numbers are added later
1928                                 // (code is above)
1929                                 else if (0x3021 <= cp && cp < 0x302A
1930                                         || 0x2160 <= cp && cp < 0x216A
1931                                         || 0x2170 <= cp && cp < 0x217A)
1932                                         continue;
1933
1934                                 if (cp ==  0x215B) // FIXME: why?
1935                                         fillIndex [0xC] += 2;
1936                                 else if (cp == 0x3021) // FIXME: why?
1937                                         fillIndex [0xC]++;
1938                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1939                                 if (addnew || cp <= '9') {
1940                                         int mod = (int) currValue - 1;
1941                                         int xcp;
1942                                         if (1 <= currValue && currValue <= 10) {
1943                                                 xcp = mod + 0x2776;
1944                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1945                                                 xcp = mod + 0x2780;
1946                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1947                                                 xcp = mod + 0x278A;
1948                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1949                                         }
1950                                         if (1 <= currValue && currValue <= 20) {
1951                                                 xcp = mod + 0x2460;
1952                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1953                                                 xcp = mod + 0x2474;
1954                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1955                                                 xcp = mod + 0x2488;
1956                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1957                                         }
1958                                 }
1959
1960                                 if (cp != 0x09E7 && cp != 0x09EA)
1961                                         fillIndex [0xC]++;
1962
1963                                 // Add special cases that are not regarded as
1964                                 // numbers in UnicodeCategory speak.
1965                                 if (cp == '5') {
1966                                         // TONE FIVE
1967                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1968                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1969                                 }
1970                                 else if (cp == '6') // FIXME: why?
1971                                         fillIndex [0xC]++;
1972                         }
1973
1974                         // 221E: infinity
1975                         fillIndex [0xC] = 0xFF;
1976                         AddCharMap ('\u221E', 0xC, 1);
1977                         #endregion
1978
1979                         #region Letters and NonSpacing Marks (general)
1980
1981                         // ASCII Latin alphabets
1982                         for (int i = 0; i < alphabets.Length; i++)
1983                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1984
1985
1986                         // non-ASCII Latin alphabets
1987                         // FIXME: there are no such characters that are placed
1988                         // *after* the "alphabets" array items. This is nothing
1989                         // more than a hack that creates dummy weights for
1990                         // primary characters.
1991                         for (int i = 0x0080; i < 0x0300; i++) {
1992                                 if (!Char.IsLetter ((char) i))
1993                                         continue;
1994                                 // Latin letters that have an NFKD mapping are
1995                                 // not added as independent primary characters.
1996                                 if (decompIndex [i] != 0)
1997                                         continue;
1998                                 // SPECIAL CASES:
1999                                 // 1.some alphabets have primarily
2000                                 //   equivalent ASCII alphabets.
2001                                 // 2.some have independent primary weights,
2002                                 //   but inside a-to-z range.
2003                                 // 3.there are some expanded characters that
2004                                 //   are not part of Unicode Standard NFKD.
2005                                 // 4. some characters are letters per IsLetter
2006                                 //   but not in sortkeys (maybe a Unicode version
2007                                 //   difference caused it).
2008                                 switch (i) {
2009                                 // 1. skipping them does not make sense
2010 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2011 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2012 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2013 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2014 //                              case 0x19B: case 0x19C:
2015                                 // 2. skipping them does not make sense
2016 //                              case 0x14A: // Ng
2017 //                              case 0x14B: // ng
2018                                 // 3.
2019                                 case 0xC6: // AE
2020                                 case 0xE6: // ae
2021                                 case 0xDE: // Icelandic Thorn
2022                                 case 0xFE: // Icelandic Thorn
2023                                 case 0xDF: // German ss
2024                                 case 0xFF: // German ss
2025                                 // 4.
2026                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2027                                 // not classified yet
2028 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2029 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2030 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2031 //                              case 0x1DD:
2032                                         continue;
2033                                 }
2034                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2035                         }
2036
2037                         // Greek and Coptic
2038                         fillIndex [0xF] = 02;
2039                         for (int i = 0x0380; i < 0x0390; i++)
2040                                 if (Char.IsLetter ((char) i))
2041                                         AddLetterMap ((char) i, 0xF, 1);
2042                         fillIndex [0xF] = 02;
2043                         for (int i = 0x0391; i < 0x03CF; i++)
2044                                 if (Char.IsLetter ((char) i))
2045                                         AddLetterMap ((char) i, 0xF, 1);
2046                         fillIndex [0xF] = 0x40;
2047                         for (int i = 0x03D0; i < 0x0400; i++)
2048                                 if (Char.IsLetter ((char) i))
2049                                         AddLetterMap ((char) i, 0xF, 1);
2050
2051                         // Cyrillic.
2052                         // Cyrillic letters are sorted like Latin letters i.e.
2053                         // containing culture-specific letters between the
2054                         // standard Cyrillic sequence.
2055                         //
2056                         // We can't use UCA here; it has different sorting.
2057                         char [] orderedCyrillic = new char [] {
2058                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2059                                 '\u0452', // DJE for Serbocroatian
2060                                 '\u0435',
2061                                 '\u0454', // IE for Ukrainian
2062                                 '\u0436', '\u0437',
2063                                 '\u0455', // DZE
2064                                 '\u0438',
2065                                 '\u0456', // Byelorussian-Ukrainian I
2066                                 '\u0457', // YI
2067                                 '\u0439',
2068                                 '\u0458', // JE
2069                                 '\u043A', '\u043B',
2070                                 '\u0459', // LJE
2071                                 '\u043C', '\u043D',
2072                                 '\u045A', // NJE
2073                                 '\u043E',
2074                                 // 4E9 goes here.
2075                                 '\u043F', '\u0440', '\u0441', '\u0442',
2076                                 '\u045B', // TSHE for Serbocroatian
2077                                 '\u0443',
2078                                 '\u045E', // Short U for Byelorussian
2079                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2080                                 '\u0444', '\u0445', '\u0446', '\u0447',
2081                                 '\u045F', // DZHE
2082                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2083                                 '\u044D', '\u044E', '\u044F'};
2084
2085                         // For some characters, here is a map to basic Cyrillic
2086                         // letters. See UnicodeData.txt character names for
2087                         // the sources. Here I simply declare an equivalence array.
2088                         // The entries are mapped from U+0490 (and U+0491) onward,
2089                         // skipping small letters.
2090                         char [] cymap_src = new char [] {
2091                                 '\u0433', '\u0433', '\u0433', '\u0436',
2092                                 '\u0437', '\u043A', '\u043A', '\u043A',
2093                                 '\u043A', '\u043D', '\u043D', '\u043F',
2094                                 '\u0445', '\u0441', '\u0442', '\u0443',
2095                                 '\u0443', '\u0445', '\u0446', '\u0447',
2096                                 '\u0447', '\u0432', '\u0435', '\u0435',
2097                                 '\u0406', '\u0436', '\u043A', '\u043D',
2098                                 '\u0447', '\u0435'};
2099
2100                         fillIndex [0x10] = 0x8D;
2101                         for (int i = 0x0460; i < 0x0481; i++) {
2102                                 if (Char.IsLetter ((char) i)) {
2103                                         if (i == 0x0476)
2104                                                 // U+476/477 have the same
2105                                                 // primary weight as U+474/475.
2106                                                 fillIndex [0x10] -= 3;
2107                                         AddLetterMap ((char) i, 0x10, 3);
2108                                 }
2109                         }
2110
2111                         fillIndex [0x10] = 0x6;
2112                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2113                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2114                                 if (!IsIgnorable ((int) c) &&
2115                                         Char.IsLetter (c) &&
2116                                         !map [c].Defined) {
2117                                         AddLetterMap (c, 0x10, 0);
2118                                         fillIndex [0x10] += 3;
2119                                 }
2120                         }
2121
2122                         for (int i = 0; i < cymap_src.Length; i++) {
2123                                 char c = cymap_src [i];
2124                                 fillIndex [0x10] = map [c].Level1;
2125                                 AddLetterMap ((char) (0x0490 + i * 2),
2126                                         0x10, 0);
2127                         }
2128
2129                         // Armenian
2130                         fillIndex [0x11] = 0x3;
2131                         for (int i = 0x0531; i < 0x0586; i++)
2132                                 if (Char.IsLetter ((char) i))
2133                                         AddLetterMap ((char) i, 0x11, 1);
2134
2135                         // Hebrew
2136                         // -Letters
2137                         fillIndex [0x12] = 0x2;
2138                         for (int i = 0x05D0; i < 0x05FF; i++)
2139                                 if (Char.IsLetter ((char) i))
2140                                         AddLetterMap ((char) i, 0x12, 1);
2141                         // -Accents
2142                         fillIndex [0x1] = 0x3;
2143                         for (int i = 0x0591; i <= 0x05C2; i++) {
2144                                 if (i == 0x05A3 || i == 0x05BB)
2145                                         fillIndex [0x1]++;
2146                                 if (i != 0x05BE)
2147                                         AddCharMap ((char) i, 0x1, 1);
2148                         }
2149
2150                         // Arabic
2151                         fillIndex [0x1] = 0x8E;
2152                         fillIndex [0x13] = 0x3;
2153                         for (int i = 0x0621; i <= 0x064A; i++) {
2154                                 // Abjad
2155                                 if (Char.GetUnicodeCategory ((char) i)
2156                                         != UnicodeCategory.OtherLetter) {
2157                                         // FIXME: arabic nonspacing marks are
2158                                         // in different order.
2159                                         AddCharMap ((char) i, 0x1, 1);
2160                                         continue;
2161                                 }
2162 //                              map [i] = new CharMapEntry (0x13,
2163 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2164                                 fillIndex [0x13] =
2165                                         (byte) arabicLetterPrimaryValues [i];
2166                                 AddLetterMap ((char) i, 0x13, 0);
2167                         }
2168                         fillIndex [0x13] = 0x84;
2169                         for (int i = 0x0674; i < 0x06D6; i++)
2170                                 if (Char.IsLetter ((char) i))
2171                                         AddLetterMap ((char) i, 0x13, 1);
2172
2173                         // Devanagari
2174                         // FIXME: it does seem straight codepoint mapping.
2175                         fillIndex [0x14] = 04;
2176                         for (int i = 0x0901; i < 0x0905; i++)
2177                                 if (!IsIgnorable (i))
2178                                         AddLetterMap ((char) i, 0x14, 2);
2179                         fillIndex [0x14] = 0xB;
2180                         for (int i = 0x0905; i < 0x093A; i++) {
2181                                 if (i == 0x0928)
2182                                         AddCharMap ('\u0929', 0x14, 0, 8);
2183                                 if (i == 0x0930)
2184                                         AddCharMap ('\u0931', 0x14, 0, 8);
2185                                 if (i == 0x0933)
2186                                         AddCharMap ('\u0934', 0x14, 0, 8);
2187                                 if (Char.IsLetter ((char) i))
2188                                         AddLetterMap ((char) i, 0x14, 4);
2189                                 if (i == 0x090B)
2190                                         AddCharMap ('\u0960', 0x14, 4);
2191                                 if (i == 0x090C)
2192                                         AddCharMap ('\u0961', 0x14, 4);
2193                         }
2194                         fillIndex [0x14] = 0xDA;
2195                         for (int i = 0x093E; i < 0x0945; i++)
2196                                 if (!IsIgnorable (i))
2197                                         AddLetterMap ((char) i, 0x14, 2);
2198                         fillIndex [0x14] = 0xEC;
2199                         for (int i = 0x0945; i < 0x094F; i++)
2200                                 if (!IsIgnorable (i))
2201                                         AddLetterMap ((char) i, 0x14, 2);
2202
2203                         // Bengali
2204                         // -Letters
2205                         fillIndex [0x15] = 02;
2206                         for (int i = 0x0980; i < 0x9FF; i++) {
2207                                 if (IsIgnorable (i))
2208                                         continue;
2209                                 if (i == 0x09E0)
2210                                         fillIndex [0x15] = 0x3B;
2211                                 switch (Char.GetUnicodeCategory ((char) i)) {
2212                                 case UnicodeCategory.NonSpacingMark:
2213                                 case UnicodeCategory.DecimalDigitNumber:
2214                                 case UnicodeCategory.OtherNumber:
2215                                         continue;
2216                                 }
2217                                 AddLetterMap ((char) i, 0x15, 1);
2218                         }
2219                         // -Signs
2220                         fillIndex [0x1] = 0x3;
2221                         for (int i = 0x0981; i < 0x0A00; i++)
2222                                 if (Char.GetUnicodeCategory ((char) i) ==
2223                                         UnicodeCategory.NonSpacingMark)
2224                                         AddCharMap ((char) i, 0x1, 1);
2225
2226                         // Gurmukhi. orderedGurmukhi is from UCA
2227                         // FIXME: it does not look equivalent to UCA.
2228                         fillIndex [0x16] = 04;
2229                         fillIndex [0x1] = 3;
2230                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2231                                 char c = orderedGurmukhi [i];
2232                                 if (IsIgnorable ((int) c))
2233                                         continue;
2234                                 if (IsIgnorableNonSpacing (c)) {
2235                                         AddLetterMap (c, 0x1, 1);
2236                                         continue;
2237                                 }
2238                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2239                                         '\u0A66' <= c && c <= '\u0A71')
2240                                         continue;
2241                                 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2242                                 byte shift = 4;
2243                                 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2244                                         shift = 0;
2245                                 AddLetterMap (c, 0x16, shift);
2246                         }
2247
2248                         // Gujarati. orderedGujarati is from UCA
2249                         fillIndex [0x17] = 0x4;
2250                         // nonspacing marks
2251                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2252                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2253                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2254                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2255                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2256                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2257                         // letters go first.
2258                         for (int i = 0; i < orderedGujarati.Length; i++) {
2259                                 // SPECIAL CASE
2260                                 char c = orderedGujarati [i];
2261                                 if (Char.IsLetter (c)) {
2262                                         // SPECIAL CASES
2263                                         if (c == '\u0AB3' || c == '\u0A32')
2264                                                 continue;
2265                                         if (c == '\u0A33') {
2266                                                 AddCharMap ('\u0A32', 0x17, 0);
2267                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2268                                                 continue;
2269                                         }
2270                                         if (c == '\u0A8B')
2271                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2272                                         AddCharMap (c, 0x17, 4);
2273
2274                                         if (c == '\u0AB9')
2275                                                 AddCharMap ('\u0AB3', 0x17, 6);
2276                                 }
2277                         }
2278                         // non-letters
2279                         byte gujaratiShift = 4;
2280                         fillIndex [0x17] = 0xC0;
2281                         for (int i = 0; i < orderedGujarati.Length; i++) {
2282                                 char c = orderedGujarati [i];
2283                                 if (fillIndex [0x17] == 0xCC)
2284                                         gujaratiShift = 3;
2285                                 if (!Char.IsLetter (c)) {
2286                                         // SPECIAL CASES
2287                                         if (c == '\u0A82')
2288                                                 AddCharMap ('\u0A81', 0x17, 2);
2289                                         if (c == '\u0AC2')
2290                                                 fillIndex [0x17]++;
2291                                         AddLetterMap (c, 0x17, gujaratiShift);
2292                                 }
2293                         }
2294
2295                         // Oriya
2296                         fillIndex [0x1] = 03;
2297                         fillIndex [0x18] = 02;
2298                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2299                                 switch (Char.GetUnicodeCategory ((char) i)) {
2300                                 case UnicodeCategory.NonSpacingMark:
2301                                 case UnicodeCategory.DecimalDigitNumber:
2302                                         AddLetterMap ((char) i, 0x1, 1);
2303                                         continue;
2304                                 }
2305                                 AddLetterMap ((char) i, 0x18, 1);
2306                         }
2307
2308                         // Tamil
2309                         fillIndex [0x19] = 2;
2310                         AddCharMap ('\u0BD7', 0x19, 0);
2311                         fillIndex [0x19] = 0xA;
2312                         // vowels
2313                         for (int i = 0x0B82; i <= 0x0B94; i++)
2314                                 if (!IsIgnorable ((char) i))
2315                                         AddCharMap ((char) i, 0x19, 2);
2316                         // special vowel
2317                         fillIndex [0x19] = 0x28;
2318                         // The array for Tamil consonants is a constant.
2319                         // Windows has a sequence almost identical to TAM from
2320                         // tamilnet, but slightly different for Grantha.
2321                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2322                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2323                         // combining marks
2324                         fillIndex [0x19] = 0x82;
2325                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2326                                 if (Char.GetUnicodeCategory ((char) i) ==
2327                                         UnicodeCategory.SpacingCombiningMark
2328                                         || i == 0x0BC0)
2329                                         AddLetterMap ((char) i, 0x19, 2);
2330
2331                         // Telugu
2332                         fillIndex [0x1A] = 0x4;
2333                         for (int i = 0x0C00; i < 0x0C62; i++) {
2334                                 if (i == 0x0C55 || i == 0x0C56)
2335                                         continue; // skip
2336                                 AddCharMap ((char) i, 0x1A, 3);
2337                                 char supp = (i == 0x0C0B) ? '\u0C60':
2338                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2339                                 if (supp == char.MinValue)
2340                                         continue;
2341                                 AddCharMap (supp, 0x1A, 3);
2342                         }
2343
2344                         // Kannada
2345                         fillIndex [0x1B] = 4;
2346                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2347                                 if (i == 0x0CD5 || i == 0x0CD6)
2348                                         continue; // ignore
2349                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2350                                         continue; // shift after 0xCB9
2351                                 AddCharMap ((char) i, 0x1B, 3);
2352                                 if (i == 0x0CB9) {
2353                                         // SPECIAL CASES: but why?
2354                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2355                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2356                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2357                                 }
2358                                 if (i == 0x0CB2)
2359                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2360                         }
2361
2362                         // Malayalam
2363                         fillIndex [0x1C] = 2;
2364                         for (int i = 0x0D02; i < 0x0D61; i++)
2365                                 // FIXME: I avoided MSCompatUnicodeTable usage
2366                                 // here (it results in recursion). So check if
2367                                 // using NonSpacingMark makes sense or not.
2368                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2369 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2370                                         AddCharMap ((char) i, 0x1C, 1);
2371
2372                         // Thai ... note that it breaks 0x1E wall after E2B!
2373                         // Also, all Thai characters have level 2 value 3.
2374                         fillIndex [0x1E] = 2;
2375                         for (int i = 0xE40; i <= 0xE44; i++)
2376                                 AddCharMap ((char) i, 0x1E, 1, 3);
2377                         for (int i = 0xE01; i < 0xE2B; i++)
2378                                 AddCharMap ((char) i, 0x1E, 6, 3);
2379                         fillIndex [0x1F] = 5;
2380                         for (int i = 0xE2B; i < 0xE30; i++)
2381                                 AddCharMap ((char) i, 0x1F, 6, 3);
2382                         fillIndex [0x1F] = 0x1E;
2383                         for (int i = 0xE30; i < 0xE3B; i++)
2384                                 AddCharMap ((char) i, 0x1F, 1, 3);
2385                         // Some Thai characters remain.
2386                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2387                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2388                         foreach (char c in specialThai)
2389                                 AddCharMap (c, 0x1F, 1);
2390
2391                         // Lao
2392                         fillIndex [0x1F] = 2;
2393                         for (int i = 0xE80; i < 0xEDF; i++)
2394                                 if (Char.IsLetter ((char) i))
2395                                         AddCharMap ((char) i, 0x1F, 1);
2396
2397                         // Georgian. orderedGeorgian is from UCA DUCET.
2398                         fillIndex [0x21] = 5;
2399                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2400                                 char c = orderedGeorgian [i];
2401                                 if (map [(int) c].Defined)
2402                                         continue;
2403                                 AddCharMap (c, 0x21, 0);
2404                                 if (c < '\u10F6')
2405                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2406                                 fillIndex [0x21] += 5;
2407                         }
2408
2409                         // Japanese Kana.
2410                         fillIndex [0x22] = 2;
2411                         int kanaOffset = 0x3041;
2412                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2413
2414                         for (int gyo = 0; gyo < 9; gyo++) {
2415                                 for (int dan = 0; dan < 5; dan++) {
2416                                         if (gyo == 7 && dan % 2 == 1) {
2417                                                 // 'ya'-gyo
2418                                                 fillIndex [0x22]++;
2419                                                 kanaOffset -= 2; // There is no space for yi and ye.
2420                                                 continue;
2421                                         }
2422                                         int cp = kanaOffset + dan * kanaLines [gyo];
2423                                         // small lines (a-gyo, ya-gyo)
2424                                         if (gyo == 0 || gyo == 7) {
2425                                                 AddKanaMap (cp, 1); // small
2426                                                 AddKanaMap (cp + 1, 1);
2427                                         }
2428                                         else
2429                                                 AddKanaMap (cp, kanaLines [gyo]);
2430                                         fillIndex [0x22]++;
2431
2432                                         if (cp == 0x30AB) {
2433                                                 // add small 'ka' (before normal one)
2434                                                 AddKanaMap (0x30F5, 1);
2435                                                 kanaOffset++;
2436                                         }
2437                                         if (cp == 0x30B1) {
2438                                                 // add small 'ke' (before normal one)
2439                                                 AddKanaMap (0x30F6, 1);
2440                                                 kanaOffset++;
2441                                         }
2442                                         if (cp == 0x3061) {
2443                                                 // add small 'Tsu' (before normal one)
2444                                                 AddKanaMap (0x3063, 1);
2445                                                 kanaOffset++;
2446                                         }
2447                                 }
2448                                 fillIndex [0x22] += 3;
2449                                 kanaOffset += 5 * kanaLines [gyo];
2450                         }
2451
2452                         // Wa-gyo is mostly irregular, so its entries are added manually.
2453                         AddLetterMap ((char) 0x308E, 0x22, 0);
2454                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2455                         AddLetterMap ((char) 0x308F, 0x22, 0);
2456                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2457                         fillIndex [0x22]++;
2458                         AddLetterMap ((char) 0x3090, 0x22, 0);
2459                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2460                         fillIndex [0x22] += 2;
2461                         // no "Wu" in Japanese.
2462                         AddLetterMap ((char) 0x3091, 0x22, 0);
2463                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2464                         fillIndex [0x22]++;
2465                         AddLetterMap ((char) 0x3092, 0x22, 0);
2466                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2467                         // Nn
2468                         fillIndex [0x22] = 0x80;
2469                         AddLetterMap ((char) 0x3093, 0x22, 0);
2470                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2471
2472                         // JIS Japanese square chars.
2473                         fillIndex [0x22] = 0x97;
2474                         jisJapanese.Sort (JISComparer.Instance);
2475                         foreach (JISCharacter j in jisJapanese)
2476                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2477                                         AddCharMap ((char) j.CP, 0x22, 1);
2478                         // non-JIS Japanese square chars.
2479                         nonJisJapanese.Sort (NonJISComparer.Instance);
2480                         foreach (NonJISCharacter j in nonJisJapanese)
2481                                 AddCharMap ((char) j.CP, 0x22, 1);
2482
2483                         // Bopomofo
2484                         fillIndex [0x23] = 0x02;
2485                         for (int i = 0x3105; i <= 0x312C; i++)
2486                                 AddCharMap ((char) i, 0x23, 1);
2487
2488                         // Estrangela: ancient Syriac
2489                         fillIndex [0x24] = 0x0B;
2490                         // FIXME: is 0x71E really alternative form?
2491                         ArrayList syriacAlternatives = new ArrayList (
2492                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2493                         for (int i = 0x0710; i <= 0x072C; i++) {
2494                                 if (i == 0x0711) // NonSpacingMark
2495                                         continue;
2496                                 if (syriacAlternatives.Contains (i))
2497                                         continue;
2498                                 AddCharMap ((char) i, 0x24, 4);
2499                                 // FIXME: why?
2500                                 if (i == 0x721)
2501                                         fillIndex [0x24]++;
2502                         }
2503                         foreach (int cp in syriacAlternatives)
2504                                 map [cp] = new CharMapEntry (0x24,
2505                                         (byte) (map [cp - 1].Level1 + 2),
2506                                         0);
2507                         // FIXME: Syriac NonSpacingMark should go here.
2508
2509                         // Thaana
2510                         // FIXME: it turned out that it does not look like UCA
2511                         fillIndex [0x24] = 0x6E;
2512                         for (int i = 0; i < orderedThaana.Length; i++) {
2513                                 char c = orderedThaana [i];
2514                                 if (IsIgnorableNonSpacing ((int) c))
2515                                         continue;
2516                                 AddCharMap (c, 0x24, 2);
2517                                 if (c == '\u0782') // SPECIAL CASE: why?
2518                                         fillIndex [0x24] += 2;
2519                         }
2520                         #endregion
2521
2522                         // FIXME: Add more culture-specific letters (that are
2523                         // not supported in Windows collation) here.
2524
2525                         // Surrogate ... they are computed.
2526
2527                         #region Hangul
2528                         // Hangul.
2529                         //
2530                         // Unlike UCA, the Windows Hangul sequence mixes Jongseong
2531                         // with the Choseong sequence as well as Jungseong,
2532                         // adjusted to have the same primary weight for the
2533                         // same base character. So it is impossible to compute
2534                         // those sort keys.
2535                         //
2536                         // Here I introduce an ordered sequence of mixed
2537                         // 'commands' and 'characters' that is similar to
2538                         // LDML text:
2539                         //      - ',' increases primary weight.
2540                         //      - [A B] means a range, increasing index
2541                         //      - {A B} means a range, without increasing index
2542                         //      - '=' is no operation (it means the characters
2543                         //        of both sides have the same weight).
2544                         //      - '>' inserts a Hangul Syllable block that
2545                         //        contains 0x251 characters.
2546                         //      - '<' decreases the index
2547                         //      - '0'-'9' means skip count
2548                         //      - whitespaces are ignored
2549                         //
2550
2551                         string hangulSequence =
2552                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
2553                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2554                         + "<{\u1113 \u1116}, \u3165,"
2555                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2556                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2557                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2558                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2559                                 + "[\u11D1 \u11D2], \u11B2,"
2560                                 + "[\u11D3 \u11D5], \u11B3,"
2561                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2562                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2563                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2564                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2565                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2566                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2567                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2568                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2569                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2570                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2571                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2572                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2573                                 + "\u11F1,, \u11F2,,,"
2574                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2575                         + "<\u114D, \u110D,,  >"
2576                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2577                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2578                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2579                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2580                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2581                                 + "[\u11F5 \u11F8]"
2582                         ;
2583
2584                         byte hangulCat = 0x52;
2585                         fillIndex [hangulCat] = 0x2;
2586
2587                         int syllableBlock = 0;
2588                         for (int n = 0; n < hangulSequence.Length; n++) {
2589                                 char c = hangulSequence [n];
2590                                 int start, end;
2591                                 if (Char.IsWhiteSpace (c))
2592                                         continue;
2593                                 switch (c) {
2594                                 case '=':
2595                                         break; // NOP
2596                                 case ',':
2597                                         IncrementSequentialIndex (ref hangulCat);
2598                                         break;
2599                                 case '<':
2600                                         if (fillIndex [hangulCat] == 2)
2601                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2602                                         fillIndex [hangulCat]--;
2603                                         break;
2604                                 case '>':
2605                                         IncrementSequentialIndex (ref hangulCat);
2606                                         for (int l = 0; l < 0x15; l++)
2607                                                 for (int v = 0; v < 0x1C; v++) {
2608                                                         AddCharMap (
2609                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2610                                                         IncrementSequentialIndex (ref hangulCat);
2611                                                 }
2612                                         syllableBlock++;
2613                                         break;
2614                                 case '[':
2615                                         start = hangulSequence [n + 1];
2616                                         end = hangulSequence [n + 3];
2617                                         for (int i = start; i <= end; i++) {
2618                                                 AddCharMap ((char) i, hangulCat, 0);
2619                                                 if (end > i)
2620                                                         IncrementSequentialIndex (ref hangulCat);
2621                                         }
2622                                         n += 4; // consumes 5 characters for this operation
2623                                         break;
2624                                 case '{':
2625                                         start = hangulSequence [n + 1];
2626                                         end = hangulSequence [n + 3];
2627                                         for (int i = start; i <= end; i++)
2628                                                 AddCharMap ((char) i, hangulCat, 0);
2629                                         n += 4; // consumes 5 characters for this operation
2630                                         break;
2631                                 default:
2632                                         AddCharMap (c, hangulCat, 0);
2633                                         break;
2634                                 }
2635                         }
2636
2637                         // Some Jamo NFKD.
2638                         for (int i = 0x3200; i < 0x3300; i++) {
2639                                 if (IsIgnorable (i) || map [i].Defined)
2640                                         continue;
2641                                 int ch = 0;
2642                                 // w/ bracket
2643                                 if (decompLength [i] == 4 &&
2644                                         decompValues [decompIndex [i]] == '(')
2645                                         ch = decompIndex [i] + 1;
2646                                 // circled
2647                                 else if (decompLength [i] == 2 &&
2648                                         decompValues [decompIndex [i] + 1] == '\u1161')
2649                                         ch = decompIndex [i];
2650                                 else if (decompLength [i] == 1)
2651                                         ch = decompIndex [i];
2652                                 else
2653                                         continue;
2654                                 ch = decompValues [ch];
2655                                 if (ch < 0x1100 || 0x1200 < ch &&
2656                                         ch < 0xAC00 || 0xD800 < ch)
2657                                         continue;
2658
2659                                 // SPECIAL CASE ?
2660                                 int offset = i < 0x3260 ? 1 : 0;
2661                                 if (0x326E <= i && i <= 0x3273)
2662                                         offset = 1;
2663
2664                                 map [i] = new CharMapEntry (map [ch].Category,
2665                                         (byte) (map [ch].Level1 + offset),
2666                                         map [ch].Level2);
2667 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2668                         }
2669
2670
2671                         #endregion
2672
2673                         // Letterlike characters and CJK compatibility square
2674                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2675                         int [] counts = new int ['Z' - 'A' + 1];
2676                         char [] namedChars = new char [sortableCharNames.Count];
2677                         int nCharNames = 0;
2678                         foreach (DictionaryEntry de in sortableCharNames) {
2679                                 counts [((string) de.Value) [0] - 'A']++;
2680                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2681                         }
2682                         nCharNames = 0; // reset
2683                         for (int a = 0; a < counts.Length; a++) {
2684                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2685                                 for (int i = 0; i < counts [a]; i++)
2686 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2687                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2688                         }
2689
2690                         // CJK unified ideograph.
2691                         byte cjkCat = 0x9E;
2692                         fillIndex [cjkCat] = 0x2;
2693                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2694                                 if (!IsIgnorable (cp))
2695                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2696                         // CJK extensions go here.
2697                         // LAMESPEC: With this Windows-style CJK layout, it is
2698                         // impossible to add more CJK ideographs, i.e. 0x9FA6-
2699                         // 0x9FBB can never be added w/o breaking compat.
2700                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2701                                 if (!IsIgnorable (cp))
2702                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2703
2704                         // PrivateUse ... computed.
2705                         // remaining Surrogate ... computed.
2706
2707                         #region Special "biggest" area (FF FF)
2708                         fillIndex [0xFF] = 0xFF;
2709                         char [] specialBiggest = new char [] {
2710                                 '\u3005', '\u3031', '\u3032', '\u309D',
2711                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2712                                 '\uFE7C', '\uFE7D', '\uFF70'};
2713                         foreach (char c in specialBiggest)
2714                                 AddCharMap (c, 0xFF, 0);
2715                         #endregion
2716
2717                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2718                         // non-alphanumeric ASCII except for: + - < = > '
2719                         for (int i = 0x21; i < 0x7F; i++) {
2720                                 if (Char.IsLetterOrDigit ((char) i)
2721                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2722                                         continue; // they are not added here.
2723                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2724                                 // Insert 3001 after ',' and 3002 after '.'
2725                                 if (i == 0x2C)
2726                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2727                                 else if (i == 0x2E)
2728                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2729                                 else if (i == 0x3A)
2730                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2731                         }
2732                         #endregion
2733
2734                         #region 07 - Punctuations and something else
2735                         for (int i = 0xA0; i < char.MaxValue; i++) {
2736                                 if (IsIgnorable (i))
2737                                         continue;
2738
2739                                 // FIXME: actually those reset should not be
2740                                 // done but here I put for easy goal.
2741                                 if (i == 0x0700)
2742                                         fillIndex [0x7] = 0xE2;
2743                                 if (i == 0x2016)
2744                                         fillIndex [0x7] = 0x77;
2745
2746                                 // SPECIAL CASES:
2747                                 switch (i) {
2748                                 case 0xAB: // 08
2749                                 case 0xB7: // 0A
2750                                 case 0xBB: // 08
2751                                 case 0x2329: // 09
2752                                 case 0x232A: // 09
2753                                         continue;
2754                                 }
2755
2756                                 switch (Char.GetUnicodeCategory ((char) i)) {
2757                                 case UnicodeCategory.OtherPunctuation:
2758                                 case UnicodeCategory.ClosePunctuation:
2759                                 case UnicodeCategory.OpenPunctuation:
2760                                 case UnicodeCategory.InitialQuotePunctuation:
2761                                 case UnicodeCategory.FinalQuotePunctuation:
2762                                 case UnicodeCategory.ModifierSymbol:
2763                                         // SPECIAL CASES: // 0xA
2764                                         if (0x2020 <= i && i <= 0x2031)
2765                                                 continue;
2766                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2767                                         break;
2768                                 default:
2769                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2770                                                 goto case UnicodeCategory.OtherPunctuation;
2771                                         break;
2772                                 }
2773                         }
2774                         // Control pictures
2775                         // FIXME: it should not need to reset level 1, but
2776                         // it's for easy goal.
2777                         fillIndex [0x7] = 0xB6;
2778                         for (int i = 0x2400; i <= 0x2421; i++)
2779                                 AddCharMap ((char) i, 0x7, 1, 0);
2780                         #endregion
2781
2782                         // FIXME: for 07 xx we need more love.
2783
2784                         // Characters w/ diacritical marks (NFKD)
2785                         for (int i = 0; i <= char.MaxValue; i++) {
2786                                 if (map [i].Defined || IsIgnorable (i))
2787                                         continue;
2788                                 if (decompIndex [i] == 0)
2789                                         continue;
2790
2791                                 int start = decompIndex [i];
2792                                 int primaryChar = decompValues [start];
2793                                 int secondary = 0;
2794                                 bool skip = false;
2795                                 int length = decompLength [i];
2796                                 // special processing for parenthesized ones.
2797                                 if (length == 3 &&
2798                                         decompValues [start] == '(' &&
2799                                         decompValues [start + 2] == ')') {
2800                                         primaryChar = decompValues [start + 1];
2801                                         length = 1;
2802                                 }
2803
2804                                 if (map [primaryChar].Level1 == 0)
2805                                         continue;
2806
2807                                 for (int l = 1; l < length; l++) {
2808                                         int c = decompValues [start + l];
2809                                         if (map [c].Level1 != 0)
2810                                                 skip = true;
2811                                         secondary += diacritical [c];
2812                                 }
2813                                 if (skip)
2814                                         continue;
2815                                 map [i] = new CharMapEntry (
2816                                         map [primaryChar].Category,
2817                                         map [primaryChar].Level1,
2818                                         (byte) secondary);
2819
2820                         }
2821
2822                         // category 08 - symbols
2823                         fillIndex [0x8] = 2;
2824                         // Here Windows mapping is not straightforward. It is
2825                         // not based on computation but seems manual sorting.
2826                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
2827                         AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2828                         AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2829                         AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2830                         AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2831                         AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2832                         AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2833                         AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2834                         AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2835                         AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2836                         AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2837                         AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2838                         AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2839
2840                         for (int cp = 0; cp < 0x2300; cp++) {
2841                                 if (cp == 0xAC) // SPECIAL CASE: skip
2842                                         continue;
2843                                 if (cp == 0x200) {
2844                                         cp = 0x2200; // skip to 2200
2845                                         fillIndex [0x8] = 0x21;
2846                                 }
2847                                 if (cp == 0x2295)
2848                                         fillIndex [0x8] = 0x3;
2849                                 if (cp == 0x22B2)
2850                                         fillIndex [0x8] = 0xB9;
2851                                 if (!map [cp].Defined &&
2852 //                                      Char.GetUnicodeCategory ((char) cp) ==
2853 //                                      UnicodeCategory.MathSymbol)
2854                                         Char.IsSymbol ((char) cp))
2855                                         AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
2856                                 // SPECIAL CASES: no idea why Windows sorts as such
2857                                 switch (cp) {
2858                                 case 0x3E:
2859                                         AddCharMap ('\u227B', 0x8, 1, 0);
2860                                         AddCharMap ('\u22B1', 0x8, 1, 0);
2861                                         break;
2862                                 case 0xB1:
2863                                         AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2864                                         AddCharMapGroup ('\u226A', 0x8, 1, 0);
2865                                         AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2866                                         AddCharMapGroup ('\u226B', 0x8, 1, 0);
2867                                         break;
2868                                 case 0xF7:
2869                                         AddCharMap ('\u01C0', 0x8, 1, 0);
2870                                         AddCharMap ('\u01C1', 0x8, 1, 0);
2871                                         AddCharMap ('\u01C2', 0x8, 1, 0);
2872                                         break;
2873                                 }
2874                         }
2875
2876                         #region Level2 adjustment
2877                         // Arabic Hamzah
2878                         diacritical [0x624] = 0x5;
2879                         diacritical [0x626] = 0x7;
2880                         diacritical [0x622] = 0x9;
2881                         diacritical [0x623] = 0xA;
2882                         diacritical [0x625] = 0xB;
2883                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2884                         diacritical [0x64A] = 0x7; // Yaa'
2885
2886                         for (int i = 0; i < char.MaxValue; i++) {
2887                                 byte mod = 0;
2888                                 byte cat = map [i].Category;
2889                                 switch (cat) {
2890                                 case 0xE: // Latin diacritics
2891                                 case 0x22: // Japanese: circled characters
2892                                         mod = diacritical [i];
2893                                         break;
2894                                 case 0x13: // Arabic
2895                                         if (diacritical [i] == 0 && i >= 0xFE8D)
2896                                                 mod = 0x8; // default for arabic
2897                                         break;
2898                                 }
2899                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2900                                         mod = diacritical [i];
2901                                 if (mod > 0)
2902                                         map [i] = new CharMapEntry (
2903                                                 cat, map [i].Level1, mod);
2904                         }
2905                         #endregion
2906
2907                         // FIXME: this is hack but those NonSpacingMark
2908                         // characters and still undefined are likely to
2909                         // be nonspacing.
2910                         for (int i = 0; i < char.MaxValue; i++)
2911                                 if (!map [i].Defined &&
2912                                         !IsIgnorable (i) &&
2913                                         Char.GetUnicodeCategory ((char) i) ==
2914                                         UnicodeCategory.NonSpacingMark)
2915                                         AddCharMap ((char) i, 1, 1);
2916
2917                         // FIXME: this is hack but those Symbol characters
2918                         // are likely to fall into 0xA category.
2919                         for (int i = 0; i < char.MaxValue; i++)
2920                                 if (!map [i].Defined &&
2921                                         !IsIgnorable (i) &&
2922                                         Char.IsSymbol ((char) i))
2923                                         AddCharMap ((char) i, 0xA, 1);
2924                 }
2925
2926                 // Steps fillIndex [hangulCat]; when it overflows back to 0, moves on to the next category at 0x2.
2927                 private void IncrementSequentialIndex (ref byte hangulCat)
2928                 {
2929                         if (++fillIndex [hangulCat] != 0)
2930                                 return; // no overflow
2931                         hangulCat++;
2932                         fillIndex [hangulCat] = 0x2;
2933                 }
2934
2935                 // Pins fillIndex [category] to alphaWeight, then feeds c and each code
2936                 // point latinMap lists for c to AddLetterMap () with updateCount 0.
2937                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2938                 {
2939                         fillIndex [category] = alphaWeight;
2940                         AddLetterMap (c, category, 0);
2941
2942                         ArrayList related = latinMap [c] as ArrayList;
2943                         if (related != null) {
2944                                 foreach (int codePoint in related)
2945                                         AddLetterMap ((char) codePoint, category, 0);
2946                         }
2947                 }
2948
2949                 // Maps `voices` consecutive characters from i (Hiragana) and their +0x60 counterparts (Katakana) into category 0x22; offset n > 0 gets level2 n + 2.
2950                 private void AddKanaMap (int i, byte voices)
2951                 {
2952                         for (int offset = 0; offset < voices; offset++) {
2953                                 byte level2 = offset == 0 ? (byte) 0 : (byte) (offset + 2);
2954                                 char hiragana = (char) (i + offset);
2955                                 char katakana = (char) (hiragana + 0x60);
2956                                 AddLetterMapCore (hiragana, 0x22, 0, level2);
2957                                 AddLetterMapCore (katakana, 0x22, 0, level2);
2958                         }
2959                 }
2960
2961                 private void AddLetterMap (char c, byte category, byte updateCount) // AddLetterMapCore () with level2 = 0
2962                 {
2963                         AddLetterMapCore (c, category, updateCount, 0);
2964                 }
2965
2966                 // Maps c into category. The ToSmallForm () and invariant lower-case
2967                 // results, when they differ from c, are handled before c itself.
2968                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2969                 {
2970                         // this one is mapped with updateCount
2971                         char smallForm = ToSmallForm (c);
2972                         if (smallForm != c)
2973                                 AddCharMapGroup (smallForm, category, updateCount, level2);
2974                         // a lower-case form that is still unmapped recurses with updateCount 0
2975                         char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
2976                         if (lower != c && !map [(int) lower].Defined)
2977                                 AddLetterMapCore (lower, category, 0, level2);
2978
2979                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2980                                 return; // c is skipped: no index update for it
2981                         AddCharMapGroup (c, category, 0, level2);
2982                         fillIndex [category] += updateCount;
2983                 }
2984
2985                 private bool AddCharMap (char c, byte category, byte increment) // overload that passes 0 as alt
2986                 {
2987                         return AddCharMap (c, category, increment, 0);
2988                 }
2989
2990                 // Maps c at fillIndex [category] and returns true; for category 1 the index is the 3rd constructor argument and alt the 2nd.
2991                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2992                 {
2993                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2994                                 return false; // nothing is mapped, index untouched
2995                         map [(int) c] = category == 1 ? new CharMapEntry (category, alt, fillIndex [category]) :
2996                                 new CharMapEntry (category, fillIndex [category], alt);
2997                         fillIndex [category] += increment;
2998                         return true;
2999                 }
3000
3001                 // Maps the ToSmallFormTail () result (when it differs from c), then c, then repeats for the ToFullWidthTail () result.
3002                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
3003                 {
3004                         char smallTail = ToSmallFormTail (c);
3005                         if (smallTail != c)
3006                                 AddCharMap (smallTail, category, updateCount, 0);
3007                         AddCharMap (c, category, updateCount, 0);
3008                         char fullTail = ToFullWidthTail (c);
3009                         if (fullTail == c)
3010                                 return; // nothing further to map
3011                         AddCharMapGroupTail (fullTail, category, updateCount);
3012                 }
3013
3014                 // AddCharMapGroup () below
3015                 // adds characters to the table in the order below
3016                 // (+ means fillIndex [category] is advanced by updateCount):
3017                 //      (<small> +)
3018                 //      itself
3019                 //      <fraction>
3020                 //      <full> | <super> | <sub>
3021                 //      <circle> | <wide> (| <narrow>)
3022                 //      +
3023                 //      (vertical +)
3024                 // sameWeightItems lists the forms that are mapped right after "itself".
3025                 // level2 is fixed (does not increase).
3026                 int [] sameWeightItems = new int [] {
3027                         DecompositionFraction,
3028                         DecompositionFull,
3029                         DecompositionSuper,
3030                         DecompositionSub,
3031                         DecompositionCircle,
3032                         DecompositionWide,
3033                         DecompositionNarrow,
3034                         };
3035                 // Maps c together with the variants nfkdMap records for it, in the
3036                 // order described by the comment above sameWeightItems.
3037                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3038                 {
3039                         int cp = (int) c;
3040                         if (map [cp].Defined)
3041                                 return;
3042
3043                         // char.MinValue stands for "no such variant"
3044                         char smallForm = char.MinValue, verticalForm = char.MinValue;
3045                         Hashtable variants = (Hashtable) nfkdMap [cp];
3046                         bool hasVariants = variants != null;
3047                         if (hasVariants) {
3048                                 object boxed = variants [(byte) DecompositionSmall];
3049                                 if (boxed != null)
3050                                         smallForm = (char) ((int) boxed);
3051                                 boxed = variants [(byte) DecompositionVertical];
3052                                 if (boxed != null)
3053                                         verticalForm = (char) ((int) boxed);
3054                         }
3055
3056                         // the <small> variant is mapped with updateCount as its increment
3057                         if (smallForm != char.MinValue)
3058                                 AddCharMap (smallForm, category, updateCount);
3059
3060                         // c and the same-weight variants are mapped with increment 0
3061                         AddCharMap (c, category, 0, level2);
3062                         for (int n = 0; hasVariants && n < sameWeightItems.Length; n++) {
3063                                 object sameWeight = variants [(byte) sameWeightItems [n]];
3064                                 if (sameWeight != null)
3065                                         AddCharMap ((char) ((int) sameWeight), category, 0, level2);
3066                         }
3067                         fillIndex [category] += updateCount;
3068
3069                         // the vertical variant is mapped after the index update above
3070                         if (verticalForm != char.MinValue)
3071                                 AddCharMap (verticalForm, category, updateCount, level2);
3072                 }
3073
3074                 // Maps c at the current CJK slot, then always steps to the next slot.
3075                 private void AddCharMapCJK (char c, ref byte category)
3076                 {
3077                         AddCharMap (c, category, 0, 0);
3078                         IncrementSequentialIndex (ref category);
3079                         // Special: Windows skips 9E F9 (reason unknown), so step over it.
3080                         if (fillIndex [category] == 0xF9 && category == 0x9E)
3081                                 IncrementSequentialIndex (ref category);
3082                 }
3083
3084                 // Maps c into the sequential CJK area, followed by the characters
3085                 // special-cased to come right behind it and the entries nfkdMap has for c.
3086                 private void AddCharMapGroupCJK (char c, ref byte category)
3087                 {
3088                         AddCharMapCJK (c, ref category);
3089
3090                         // LAMESPEC: on Windows some of the CJK characters in 3200-32B0
3091                         // are incorrectly mapped. They mix Chinese and Japanese Kanji
3092                         // when ordering those characters, so the affected ones are
3093                         // placed by hand right behind these four ideographs.
3094                         switch (c) {
3095                         case '\u5B78':
3096                                 AddCharMapCJK ('\u32AB', ref category);
3097                                 AddCharMapCJK ('\u323B', ref category);
3098                                 break;
3099                         case '\u52DE':
3100                                 AddCharMapCJK ('\u3298', ref category);
3101                                 AddCharMapCJK ('\u3238', ref category);
3102                                 break;
3103                         case '\u5BEB':
3104                                 AddCharMapCJK ('\u32A2', ref category);
3105                                 break;
3106                         case '\u91AB':
3107                                 // Especially this mapping order does not make sense.
3108                                 AddCharMapCJK ('\u32A9', ref category);
3109                                 break;
3110                         }
3111
3112                         Hashtable variants = (Hashtable) nfkdMap [(int) c];
3113                         for (int kind = 0; variants != null && kind <= 0x12; kind++) {
3114                                 object boxed = variants [(byte) kind];
3115                                 if (boxed == null)
3116                                         continue;
3117                                 int variant = (int) boxed;
3118                                 // handled by the special cases above, never added from the table
3119                                 switch (variant) {
3120                                 case 0x32A2: case 0x3298: case 0x3238:
3121                                 case 0x32A9: case 0x323B: case 0x32AB:
3122                                         continue;
3123                                 }
3124                                 // Special: F900-FAD9 are ignored in this area.
3125                                 // FIXME: check if it is sane
3126                                 if (variant < 0xF900 || variant > 0xFAD9)
3127                                         AddCharMapCJK ((char) variant, ref category);
3128                         }
3129                 }
3130
3131                 // Variant of AddCharMapGroup () in which every AddCharMap () call uses
3132                 // updateCount as increment. For now it is only for 0x7 category.
3133                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3134                 {
3135                         // char.MinValue stands for "no such variant"
3136                         char smallForm = char.MinValue, verticalForm = char.MinValue;
3137                         Hashtable variants = (Hashtable) nfkdMap [(int) c];
3138                         if (variants != null) {
3139                                 object boxed = variants [(byte) DecompositionSmall];
3140                                 if (boxed != null)
3141                                         smallForm = (char) ((int) boxed);
3142                                 boxed = variants [(byte) DecompositionVertical];
3143                                 if (boxed != null)
3144                                         verticalForm = (char) ((int) boxed);
3145                         }
3146
3147                         // <small> variant first; U+2024 is a SPECIAL CASE that is
3148                         // excluded here (FIXME: why?)
3149                         if (smallForm != char.MinValue && smallForm != '\u2024')
3150                                 AddCharMap (smallForm, category, updateCount);
3151
3152                         // then c itself
3153                         AddCharMap (c, category, updateCount, level2);
3154
3155                         // Since nfkdMap is problematic to have two or more NFKD to
3156                         // an identical character, scan every code point for a
3157                         // one-element DecompositionCompat decomposition equal to c.
3158                         for (int cp = 0; cp < char.MaxValue; cp++) {
3159                                 if (decompLength [cp] != 1)
3160                                         continue;
3161                                 if ((int) (decompValues [decompIndex [cp]]) != (int) c)
3162                                         continue;
3163                                 if (decompType [cp] == DecompositionCompat)
3164                                         AddCharMap ((char) cp, category, updateCount, level2);
3165                         }
3166
3167                         // vertical variant last; U+FE33 and U+FE34 are SPECIAL CASES
3168                         // that are excluded (FIXME: why?)
3169                         bool excluded = verticalForm == '\uFE33' || verticalForm == '\uFE34';
3170                         if (verticalForm != char.MinValue && !excluded)
3171                                 AddCharMap (verticalForm, category, updateCount, level2);
3172                 }
3173
3174                 // Maps c, and every character whose decomposition ends in c, at the
3175                 // current index of category 6, then advances that index by one.
3176                 private void AddArabicCharMap (char c)
3177                 {
3178                         const byte category = 6;
3179
3180                         AddCharMap (c, category, 0, 0);
3181
3182                         // Since nfkdMap is problematic to have two or more NFKD to
3183                         // an identical character, scan every code point instead.
3184                         for (int cp = 0; cp < char.MaxValue; cp++) {
3185                                 int length = decompLength [cp];
3186                                 if (length == 0)
3187                                         continue;
3188                                 // compare against the last element of the decomposition
3189                                 int last = decompIndex [cp] + length - 1;
3190                                 if ((int) (decompValues [last]) == (int) c)
3191                                         AddCharMap ((char) cp, category, 0, 0);
3192                         }
3193                         fillIndex [category] += 1;
3194                 }
3195
3196                 char ToFullWidth (char c) // first decomposition element when c's decomposition type is DecompositionFull; otherwise c
3197                 {
3198                         return ToDecomposed (c, DecompositionFull, false);
3199                 }
3200
3201                 char ToFullWidthTail (char c) // last decomposition element when c's decomposition type is DecompositionFull; otherwise c
3202                 {
3203                         return ToDecomposed (c, DecompositionFull, true);
3204                 }
3205
3206                 char ToSmallForm (char c) // first decomposition element when c's decomposition type is DecompositionSmall; otherwise c
3207                 {
3208                         return ToDecomposed (c, DecompositionSmall, false);
3209                 }
3210
3211                 char ToSmallFormTail (char c) // last decomposition element when c's decomposition type is DecompositionSmall; otherwise c
3212                 {
3213                         return ToDecomposed (c, DecompositionSmall, true);
3214                 }
3215
3216                 // Returns the first (or, with tail, the last) decomposition element of c when its decomposition type is d; c otherwise.
3217                 char ToDecomposed (char c, byte d, bool tail)
3218                 {
3219                         int cp = (int) c;
3220                         if (decompType [cp] != d)
3221                                 return c;
3222                         int offset = tail ? decompLength [cp] - 1 : 0;
3223                         return (char) decompValues [decompIndex [cp] + offset];
3224                 }
3225
3226                 // Tells whether jisJapanese holds an entry whose CP equals cp.
3227                 bool ExistsJIS (int cp)
3228                 {
3229                         foreach (JISCharacter entry in jisJapanese)
3230                                 if (cp == entry.CP) return true;
3231                         return false;
3232                 }
3233
3234                 #endregion
3235
3236                 #region Level 3 properties (Case/Width)
3237
3238                 private byte ComputeLevel3Weight (char c) // raw weight, increased by 2 unless it is 0
3239                 {
3240                         byte raw = ComputeLevel3WeightRaw (c);
3241                         return raw == 0 ? raw : (byte) (raw + 2);
3242                 }
3243
3244                 // Raw level 3 weight of c; ComputeLevel3Weight () adds 2 to non-zero
3245                 // results to get the sortkey value. The checks run top to bottom and
3246                 // the first fixed range or character that matches decides the result.
3247                 private byte ComputeLevel3WeightRaw (char c)
3248                 {
3249                         // CJK compat
3250                         if (c >= '\u3192' && c <= '\u319F')
3251                                 return 0;
3252                         // Japanese reading marks
3253                         if (c == '\u3001' || c == '\u3002')
3254                                 return 2;
3255
3256                         // Korean
3257                         if (c >= '\u11A8' && c <= '\u11F9')
3258                                 return 2;
3259                         if (c >= '\uFFA0' && c <= '\uFFDC')
3260                                 return 4;
3261                         if (c >= '\u3130' && c <= '\u3164')
3262                                 return 5;
3263                         if (c >= '\u3165' && c <= '\u318E')
3264                                 return 4;
3265
3266                         // Georgian Capital letters
3267                         if (c >= '\u10A0' && c <= '\u10C5')
3268                                 return 0x10;
3269
3270                         // numbers
3271                         if (c >= '\u2776' && c <= '\u277F')
3272                                 return 4;
3273                         if (c >= '\u2780' && c <= '\u2789')
3274                                 return 8;
3275                         if (c >= '\u278A' && c <= '\u2793')
3276                                 return 0xC;
3277                         if (c >= '\u2160' && c <= '\u216F')
3278                                 return 0x10;
3279                         if (c == '\u2181' || c == '\u2182')
3280                                 return 0x18;
3281
3282                         // Arabic
3283                         if (c >= '\u2135' && c <= '\u2138')
3284                                 return 4;
3285                         if (c >= '\uFE80' && c < '\uFF00') {
3286                                 // 2 (Isolated) / 8 (Final) / 0x18 (Medial); any other
3287                                 // decomposition type continues with the checks below.
3288                                 byte form = decompType [(int) c];
3289                                 if (form == DecompositionIsolated)
3290                                         return 2;
3291                                 if (form == DecompositionFinal)
3292                                         return 8;
3293                                 if (form == DecompositionMedial)
3294                                         return 0x18;
3295                         }
3296
3297                         byte weight = 0;
3298                         switch (c) {
3299                         // the reason why the first three have weights is unknown
3300                         case '\u01BC':
3301                                 return 0x10;
3302                         case '\u06A9':
3303                                 return 0x20;
3304                         case '\u06AA':
3305                                 return 0x28;
3306                         case '\u03C2':
3307                         case '\u2104':
3308                         case '\u212B':
3309                                 weight = 8;
3310                                 break;
3311                         case '\uFE42':
3312                                 weight = 0xC;
3313                                 break;
3314                         }
3315
3316                         // misc: <wide>, <sub> and <super> OR in their decomposition type value
3317                         byte kind = decompType [(int) c];
3318                         if (kind == DecompositionWide || kind == DecompositionSub || kind == DecompositionSuper)
3319                                 weight |= kind;
3320                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3321                                 weight |= 8;
3322                         if (isUppercase [(int) c]) // DerivedCoreProperties
3323                                 weight |= 0x10;
3324
3325                         return weight;
3326                 }
3327
3328                 #endregion
3329
3330                 #region IsIgnorable
3331 /*
3332                 static bool IsIgnorable (int i)
3333                 {
3334                         if (unicodeAge [i] >= 3.1)
3335                                 return true;
3336                         switch (char.GetUnicodeCategory ((char) i)) {
3337                         case UnicodeCategory.OtherNotAssigned:
3338                         case UnicodeCategory.Format:
3339                                 return true;
3340                         }
3341                         return false;
3342                 }
3343 */
3344
3345                 // FIXME: In the future use DerivedAge.txt to examine character
3346                 // versions and set those ones that have higher version than
3347                 // 1.0 as ignorable.
3348                 static bool IsIgnorable (int i)
3349                 {
3350                         switch (i) {
3351                         case 0:
3352                         // I guess, those characters are added between
3353                         // Unicode 1.0 (LCMapString) and Unicode 3.1
3354                         // (UnicodeCategory), so they used to be
3355                         // something like OtherNotAssigned as of Unicode 1.1.
3356                         case 0x2df: case 0x387:
3357                         case 0x3d7: case 0x3d8: case 0x3d9:
3358                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3359                         case 0x400: case 0x40d: case 0x450: case 0x45d:
3360                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
3361                         case 0x653: case 0x654: case 0x655: case 0x66d:
3362                         case 0xb56:
3363                         case 0x1e9b: case 0x202f: case 0x20ad:
3364                         case 0x20ae: case 0x20af:
3365                         case 0x20e2: case 0x20e3:
3366                         case 0x2139: case 0x213a: case 0x2183:
3367                         case 0x2425: case 0x2426: case 0x2619:
3368                         case 0x2670: case 0x2671: case 0x3007:
3369                         case 0x3190: case 0x3191:
3370                         case 0xfffc: case 0xfffd:
3371                                 return true;
3372                         // exceptional characters filtered by the
3373                         // following conditions. Originally those exceptional
3374                         // ranges are incorrect (they should not be ignored)
3375                         // and most of those characters are unfortunately in
3376                         // those ranges.
3377                         case 0x4d8: case 0x4d9:
3378                         case 0x4e8: case 0x4e9:
3379                         case 0x70F:
3380                         case 0x3036: case 0x303f:
3381                         case 0x337b: case 0xfb1e:
3382                                 return false;
3383                         }
3384
3385                         if (
3386                                 // The whole Sinhala characters.
3387                                 0x0D82 <= i && i <= 0x0DF4
3388                                 // The whole Tibetan characters.
3389                                 || 0x0F00 <= i && i <= 0x0FD1
3390                                 // The whole Myanmar characters.
3391                                 || 0x1000 <= i && i <= 0x1059
3392                                 // The whole Etiopic, Cherokee,
3393                                 // Canadian Syllablic, Ogham, Runic,
3394                                 // Tagalog, Hanunoo, Philippine,
3395                                 // Buhid, Tagbanwa, Khmer and Mongorian
3396                                 // characters.
3397                                 || 0x1200 <= i && i <= 0x1DFF
3398                                 // Greek extension characters.
3399                                 || 0x1F00 <= i && i <= 0x1FFF
3400                                 // The whole Braille characters.
3401                                 || 0x2800 <= i && i <= 0x28FF
3402                                 // CJK radical characters.
3403                                 || 0x2E80 <= i && i <= 0x2EF3
3404                                 // Kangxi radical characters.
3405                                 || 0x2F00 <= i && i <= 0x2FD5
3406                                 // Ideographic description characters.
3407                                 || 0x2FF0 <= i && i <= 0x2FFB
3408                                 // Bopomofo letter and final
3409                                 || 0x31A0 <= i && i <= 0x31B7
3410                                 // White square with quadrant characters.
3411                                 || 0x25F0 <= i && i <= 0x25F7
3412                                 // Ideographic telegraph symbols.
3413                                 || 0x32C0 <= i && i <= 0x32CB
3414                                 || 0x3358 <= i && i <= 0x3370
3415                                 || 0x33E0 <= i && i <= 0x33FF
3416                                 // The whole YI characters.
3417                                 || 0xA000 <= i && i <= 0xA48C
3418                                 || 0xA490 <= i && i <= 0xA4C6
3419                                 // American small ligatures
3420                                 || 0xFB13 <= i && i <= 0xFB17
3421                                 // hebrew, arabic, variation selector.
3422                                 || 0xFB1D <= i && i <= 0xFE2F
3423                                 // Arabic ligatures.
3424                                 || 0xFEF5 <= i && i <= 0xFEFC
3425                                 // FIXME: why are they excluded?
3426                                 || 0x01F6 <= i && i <= 0x01F9
3427                                 || 0x0218 <= i && i <= 0x0233
3428                                 || 0x02A9 <= i && i <= 0x02AD
3429                                 || 0x02EA <= i && i <= 0x02EE
3430                                 || 0x0349 <= i && i <= 0x036F
3431                                 || 0x0488 <= i && i <= 0x048F
3432                                 || 0x04D0 <= i && i <= 0x04FF
3433                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3434                                 || 0x06D6 <= i && i <= 0x06ED
3435                                 || 0x06FA <= i && i <= 0x06FE
3436                                 || 0x2048 <= i && i <= 0x204D
3437                                 || 0x20e4 <= i && i <= 0x20ea
3438                                 || 0x213C <= i && i <= 0x214B
3439                                 || 0x21EB <= i && i <= 0x21FF
3440                                 || 0x22F2 <= i && i <= 0x22FF
3441                                 || 0x237B <= i && i <= 0x239A
3442                                 || 0x239B <= i && i <= 0x23CF
3443                                 || 0x24EB <= i && i <= 0x24FF
3444                                 || 0x2596 <= i && i <= 0x259F
3445                                 || 0x25F8 <= i && i <= 0x25FF
3446                                 || 0x2672 <= i && i <= 0x2689
3447                                 || 0x2768 <= i && i <= 0x2775
3448                                 || 0x27d0 <= i && i <= 0x27ff
3449                                 || 0x2900 <= i && i <= 0x2aff
3450                                 || 0x3033 <= i && i <= 0x303F
3451                                 || 0x31F0 <= i && i <= 0x31FF
3452                                 || 0x3250 <= i && i <= 0x325F
3453                                 || 0x32B1 <= i && i <= 0x32BF
3454                                 || 0x3371 <= i && i <= 0x337B
3455                                 || 0xFA30 <= i && i <= 0xFA6A
3456                         )
3457                                 return true;
3458
3459                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3460                         switch (uc) {
3461                         case UnicodeCategory.PrivateUse:
3462                         case UnicodeCategory.Surrogate:
3463                                 return false;
3464                         // ignored by nature
3465                         case UnicodeCategory.Format:
3466                         case UnicodeCategory.OtherNotAssigned:
3467                                 return true;
3468                         default:
3469                                 return false;
3470                         }
3471                 }
3472
3473                 // To check IsIgnorable sanity, try the driver below under MS.NET.
3474
3475                 /*
3476                 public static void Main ()
3477                 {
3478                         for (int i = 0; i <= char.MaxValue; i++)
3479                                 Dump (i, IsIgnorable (i));
3480                 }
3481
3482                 static void Dump (int i, bool ignore)
3483                 {
3484                         switch (Char.GetUnicodeCategory ((char) i)) {
3485                         case UnicodeCategory.PrivateUse:
3486                         case UnicodeCategory.Surrogate:
3487                                 return; // check nothing
3488                         }
3489
3490                         string s1 = "";
3491                         string s2 = new string ((char) i, 10);
3492                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3493                         if ((ret == 0) == ignore)
3494                                 return;
3495                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3496                 }
3497                 */
3498                 #endregion // IsIgnorable
3499
3500                 #region IsIgnorableSymbol
3501                 static bool IsIgnorableSymbol (int i)
3502                 {
3503                         if (IsIgnorable (i))
3504                                 return true;
3505
3506                         switch (i) {
3507                         // *Letter
3508                         case 0x00b5: case 0x01C0: case 0x01C1:
3509                         case 0x01C2: case 0x01C3: case 0x01F6:
3510                         case 0x01F7: case 0x01F8: case 0x01F9:
3511                         case 0x02D0: case 0x02EE: case 0x037A:
3512                         case 0x03D7: case 0x03F3:
3513                         case 0x0400: case 0x040d:
3514                         case 0x0450: case 0x045d:
3515                         case 0x048C: case 0x048D:
3516                         case 0x048E: case 0x048F:
3517                         case 0x0587: case 0x0640: case 0x06E5:
3518                         case 0x06E6: case 0x06FA: case 0x06FB:
3519                         case 0x06FC: case 0x093D: case 0x0950:
3520                         case 0x1E9B: case 0x2139: case 0x3006:
3521                         case 0x3033: case 0x3034: case 0x3035:
3522                         case 0xFE7E: case 0xFE7F:
3523                         // OtherNumber
3524                         case 0x16EE: case 0x16EF: case 0x16F0:
3525                         // LetterNumber
3526                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3527                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3528                         case 0x3038: // HANGZHOU NUMERAL TEN
3529                         case 0x3039: // HANGZHOU NUMERAL TWENTY
3530                         case 0x303a: // HANGZHOU NUMERAL THIRTY
3531                         // OtherSymbol
3532                         case 0x2117:
3533                         case 0x327F:
3534                                 return true;
3535                         // ModifierSymbol
3536                         case 0x02B9: case 0x02BA: case 0x02C2:
3537                         case 0x02C3: case 0x02C4: case 0x02C5:
3538                         case 0x02C8: case 0x02CC: case 0x02CD:
3539                         case 0x02CE: case 0x02CF: case 0x02D2:
3540                         case 0x02D3: case 0x02D4: case 0x02D5:
3541                         case 0x02D6: case 0x02D7: case 0x02DE:
3542                         case 0x02E5: case 0x02E6: case 0x02E7:
3543                         case 0x02E8: case 0x02E9:
3544                         case 0x309B: case 0x309C:
3545                         // OtherPunctuation
3546                         case 0x055A: // American Apos
3547                         case 0x05C0: // Hebrew Punct
3548                         case 0x0E4F: // Thai FONGMAN
3549                         case 0x0E5A: // Thai ANGKHANKHU
3550                         case 0x0E5B: // Thai KHOMUT
3551                         // CurencySymbol
3552                         case 0x09F2: // Bengali Rupee Mark
3553                         case 0x09F3: // Bengali Rupee Sign
3554                         // MathSymbol
3555                         case 0x221e: // INF.
3556                         // OtherSymbol
3557                         case 0x0482:
3558                         case 0x09FA:
3559                         case 0x0B70:
3560                                 return false;
3561                         }
3562
3563                         // *Letter
3564                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3565 #if NET_2_0
3566                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3567                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3568 #endif
3569                         )
3570                                 return true;
3571
3572                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3573                         switch (uc) {
3574                         case UnicodeCategory.Surrogate:
3575                                 return false; // inconsistent
3576
3577                         case UnicodeCategory.SpacingCombiningMark:
3578                         case UnicodeCategory.EnclosingMark:
3579                         case UnicodeCategory.NonSpacingMark:
3580                         case UnicodeCategory.PrivateUse:
3581                                 // NonSpacingMark
3582                                 if (0x064B <= i && i <= 0x0652) // Arabic
3583                                         return true;
3584                                 return false;
3585
3586                         case UnicodeCategory.Format:
3587                         case UnicodeCategory.OtherNotAssigned:
3588                                 return true;
3589
3590                         default:
3591                                 bool use = false;
3592                                 // OtherSymbols
3593                                 if (
3594                                         // latin in a circle
3595                                         0x249A <= i && i <= 0x24E9
3596                                         || 0x2100 <= i && i <= 0x2132
3597                                         // Japanese
3598                                         || 0x3196 <= i && i <= 0x31A0
3599                                         // Korean
3600                                         || 0x3200 <= i && i <= 0x321C
3601                                         // Chinese/Japanese
3602                                         || 0x322A <= i && i <= 0x3243
3603                                         // CJK
3604                                         || 0x3260 <= i && i <= 0x32B0
3605                                         || 0x32D0 <= i && i <= 0x3357
3606                                         || 0x337B <= i && i <= 0x33DD
3607                                 )
3608                                         use = !Char.IsLetterOrDigit ((char) i);
3609                                 if (use)
3610                                         return false;
3611
3612                                 // This "Digit" rule is mystery.
3613                                 // It filters some symbols out.
3614                                 if (Char.IsLetterOrDigit ((char) i))
3615                                         return false;
3616                                 if (Char.IsNumber ((char) i))
3617                                         return false;
3618                                 if (Char.IsControl ((char) i)
3619                                         || Char.IsSeparator ((char) i)
3620                                         || Char.IsPunctuation ((char) i))
3621                                         return true;
3622                                 if (Char.IsSymbol ((char) i))
3623                                         return true;
3624
3625                                 // FIXME: should check more
3626                                 return false;
3627                         }
3628                 }
3629
3630                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3631 /*
3632                 public static void Main ()
3633                 {
3634                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3635                         for (int i = 0; i <= char.MaxValue; i++) {
3636                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3637                                 if (uc == UnicodeCategory.Surrogate)
3638                                         continue;
3639
3640                                 bool ret = IsIgnorableSymbol (i);
3641
3642                                 string s1 = "TEST ";
3643                                 string s2 = "TEST " + (char) i;
3644
3645                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3646
3647                                 if (ret != (result == 0))
3648                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3649                                                 ret ? "should not ignore" :
3650                                                         "should ignore",
3651                                                 i,(char) i, uc);
3652                         }
3653                 }
3654 */
3655                 #endregion
3656
3657                 #region NonSpacing
3658                 static bool IsIgnorableNonSpacing (int i)
3659                 {
3660                         if (IsIgnorable (i))
3661                                 return true;
3662
3663                         switch (i) {
3664                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3665                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3666                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3667                                 return true;
3668                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3669                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3670                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3671                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3672                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3673                         case 0x0CCD: case 0x0E4E:
3674                                 return false;
3675                         }
3676
3677                         if (0x02b9 <= i && i <= 0x02c5
3678                                 || 0x02cc <= i && i <= 0x02d7
3679                                 || 0x02e4 <= i && i <= 0x02ef
3680                                 || 0x20DD <= i && i <= 0x20E0
3681                         )
3682                                 return true;
3683
3684                         if (0x064B <= i && i <= 0x00652
3685                                 || 0x0941 <= i && i <= 0x0948
3686                                 || 0x0AC1 <= i && i <= 0x0ACD
3687                                 || 0x0C3E <= i && i <= 0x0C4F
3688                                 || 0x0E31 <= i && i <= 0x0E3F
3689                         )
3690                                 return false;
3691
3692                         return Char.GetUnicodeCategory ((char) i) ==
3693                                 UnicodeCategory.NonSpacingMark;
3694                 }
3695
3696                 // We can reuse IsIgnorableSymbol testcode
3697                 // for IsIgnorableNonSpacing.
3698                 #endregion
3699         }
3700
3701         struct CharMapEntry
3702         {
3703                 public byte Category;
3704                 public byte Level1;
3705                 public byte Level2; // It is always single byte.
3706                 public bool Defined;
3707
3708                 public CharMapEntry (byte category, byte level1, byte level2)
3709                 {
3710                         Category = category;
3711                         Level1 = level1;
3712                         Level2 = level2;
3713                         Defined = true;
3714                 }
3715         }
3716
3717         class JISCharacter
3718         {
3719                 public readonly int CP;
3720                 public readonly int JIS;
3721
3722                 public JISCharacter (int cp, int cpJIS)
3723                 {
3724                         CP = cp;
3725                         JIS = cpJIS;
3726                 }
3727         }
3728
3729         class JISComparer : IComparer
3730         {
3731                 public static readonly JISComparer Instance =
3732                         new JISComparer ();
3733
3734                 public int Compare (object o1, object o2)
3735                 {
3736                         JISCharacter j1 = (JISCharacter) o1;
3737                         JISCharacter j2 = (JISCharacter) o2;
3738                         return j1.JIS - j2.JIS;
3739                 }
3740         }
3741
3742         class NonJISCharacter
3743         {
3744                 public readonly int CP;
3745                 public readonly string Name;
3746
3747                 public NonJISCharacter (int cp, string name)
3748                 {
3749                         CP = cp;
3750                         Name = name;
3751                 }
3752         }
3753
3754         class NonJISComparer : IComparer
3755         {
3756                 public static readonly NonJISComparer Instance =
3757                         new NonJISComparer ();
3758
3759                 public int Compare (object o1, object o2)
3760                 {
3761                         NonJISCharacter j1 = (NonJISCharacter) o1;
3762                         NonJISCharacter j2 = (NonJISCharacter) o2;
3763                         return string.CompareOrdinal (j1.Name, j2.Name);
3764                 }
3765         }
3766
3767         class DecimalDictionaryValueComparer : IComparer
3768         {
3769                 public static readonly DecimalDictionaryValueComparer Instance
3770                         = new DecimalDictionaryValueComparer ();
3771
3772                 private DecimalDictionaryValueComparer ()
3773                 {
3774                 }
3775
3776                 public int Compare (object o1, object o2)
3777                 {
3778                         DictionaryEntry e1 = (DictionaryEntry) o1;
3779                         DictionaryEntry e2 = (DictionaryEntry) o2;
3780                         // FIXME: in case of 0, compare decomposition categories
3781                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3782                         if (ret != 0)
3783                                 return ret;
3784                         int i1 = (int) e1.Key;
3785                         int i2 = (int) e2.Key;
3786                         return i1 - i2;
3787                 }
3788         }
3789
3790         class StringDictionaryValueComparer : IComparer
3791         {
3792                 public static readonly StringDictionaryValueComparer Instance
3793                         = new StringDictionaryValueComparer ();
3794
3795                 private StringDictionaryValueComparer ()
3796                 {
3797                 }
3798
3799                 public int Compare (object o1, object o2)
3800                 {
3801                         DictionaryEntry e1 = (DictionaryEntry) o1;
3802                         DictionaryEntry e2 = (DictionaryEntry) o2;
3803                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3804                         if (ret != 0)
3805                                 return ret;
3806                         int i1 = (int) e1.Key;
3807                         int i2 = (int) e2.Key;
3808                         return i1 - i2;
3809                 }
3810         }
3811
3812         class UCAComparer : IComparer
3813         {
3814                 public static readonly UCAComparer Instance
3815                         = new UCAComparer ();
3816
3817                 private UCAComparer ()
3818                 {
3819                 }
3820
3821                 public int Compare (object o1, object o2)
3822                 {
3823                         char i1 = (char) o1;
3824                         char i2 = (char) o2;
3825
3826                         int l1 = CollationElementTable.GetSortKeyCount (i1);
3827                         int l2 = CollationElementTable.GetSortKeyCount (i2);
3828                         int l = l1 > l2 ? l2 : l1;
3829
3830                         for (int i = 0; i < l; i++) {
3831                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3832                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3833                                 int v = k1.Primary - k2.Primary;
3834                                 if (v != 0)
3835                                         return v;
3836                                 v = k1.Secondary - k2.Secondary;
3837                                 if (v != 0)
3838                                         return v;
3839                                 v = k1.Thirtiary - k2.Thirtiary;
3840                                 if (v != 0)
3841                                         return v;
3842                                 v = k1.Quarternary - k2.Quarternary;
3843                                 if (v != 0)
3844                                         return v;
3845                         }
3846                         return l1 - l2;
3847                 }
3848         }
3849
3850         class Tailoring
3851         {
3852                 int lcid;
3853                 int alias;
3854                 bool frenchSort;
3855                 ArrayList items = new ArrayList ();
3856
3857                 public Tailoring (int lcid)
3858                         : this (lcid, 0)
3859                 {
3860                 }
3861
3862                 public Tailoring (int lcid, int alias)
3863                 {
3864                         this.lcid = lcid;
3865                         this.alias = alias;
3866                 }
3867
3868                 public int LCID {
3869                         get { return lcid; }
3870                 }
3871
3872                 public int Alias {
3873                         get { return alias; }
3874                 }
3875
3876                 public bool FrenchSort {
3877                         get { return frenchSort; }
3878                         set { frenchSort = value; }
3879                 }
3880
3881                 public void AddDiacriticalMap (byte target, byte replace)
3882                 {
3883                         items.Add (new DiacriticalMap (target, replace));
3884                 }
3885
3886                 public void AddSortKeyMap (string source, byte [] sortkey)
3887                 {
3888                         items.Add (new SortKeyMap (source, sortkey));
3889                 }
3890
3891                 public void AddReplacementMap (string source, string replace)
3892                 {
3893                         items.Add (new ReplacementMap (source, replace));
3894                 }
3895
3896                 public char [] ItemToCharArray ()
3897                 {
3898                         ArrayList al = new ArrayList ();
3899                         foreach (ITailoringMap m in items)
3900                                 al.AddRange (m.ToCharArray ());
3901                         return al.ToArray (typeof (char)) as char [];
3902                 }
3903
3904                 interface ITailoringMap
3905                 {
3906                         char [] ToCharArray ();
3907                 }
3908
3909                 class DiacriticalMap : ITailoringMap
3910                 {
3911                         public readonly byte Target;
3912                         public readonly byte Replace;
3913
3914                         public DiacriticalMap (byte target, byte replace)
3915                         {
3916                                 Target = target;
3917                                 Replace = replace;
3918                         }
3919
3920                         public char [] ToCharArray ()
3921                         {
3922                                 char [] ret = new char [3];
3923                                 ret [0] = (char) 02; // kind:DiacriticalMap
3924                                 ret [1] = (char) Target;
3925                                 ret [2] = (char) Replace;
3926                                 return ret;
3927                         }
3928                 }
3929
3930                 class SortKeyMap : ITailoringMap
3931                 {
3932                         public readonly string Source;
3933                         public readonly byte [] SortKey;
3934
3935                         public SortKeyMap (string source, byte [] sortkey)
3936                         {
3937                                 Source = source;
3938                                 SortKey = sortkey;
3939                         }
3940
3941                         public char [] ToCharArray ()
3942                         {
3943                                 char [] ret = new char [Source.Length + 7];
3944                                 ret [0] = (char) 01; // kind:SortKeyMap
3945                                 for (int i = 0; i < Source.Length; i++)
3946                                         ret [i + 1] = Source [i];
3947                                 // null terminate
3948                                 for (int i = 0; i < 4; i++)
3949                                         ret [i + Source.Length + 2] = (char) SortKey [i];
3950                                 return ret;
3951                         }
3952                 }
3953
3954                 class ReplacementMap : ITailoringMap
3955                 {
3956                         public readonly string Source;
3957                         public readonly string Replace;
3958
3959                         public ReplacementMap (string source, string replace)
3960                         {
3961                                 Source = source;
3962                                 Replace = replace;
3963                         }
3964
3965                         public char [] ToCharArray ()
3966                         {
3967                                 char [] ret = new char [Source.Length + Replace.Length + 3];
3968                                 ret [0] = (char) 03; // kind:ReplaceMap
3969                                 int pos = 1;
3970                                 for (int i = 0; i < Source.Length; i++)
3971                                         ret [pos++] = Source [i];
3972                                 // null terminate
3973                                 pos++;
3974                                 for (int i = 0; i < Replace.Length; i++)
3975                                         ret [pos++] = Replace [i];
3976                                 // null terminate
3977                                 return ret;
3978                         }
3979                 }
3980         }
3981 }