mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
// There are two kinds of sort keys: those that are computed, and those that
// are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
// Also, a different index table should be prepared for composite characters.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27
  28 using System;
  29 using System.IO;
  30 using System.Collections;
  31 using System.Globalization;
  32 using System.Text;
  33 using System.Xml;
  34
  35 namespace Mono.Globalization.Unicode
  36 {
  37         internal class MSCompatSortKeyTableGenerator
  38         {
  39                 public static void Main (string [] args)
  40                 {
  41                         new MSCompatSortKeyTableGenerator ().Run (args);
  42                 }
  43
                // Decomposition type codes. The code for each char is kept in
                // the decompType byte array; Serialize () checks it to fill the
                // widthCompat table for the Narrow, Wide, Super and Sub types.
                // All values are explicit, so the declaration order (e.g.
                // Full = 0xE listed before Narrow = 0xD) carries no meaning.
                const int DecompositionWide = 1; // fixed
                const int DecompositionSub = 2; // fixed
                const int DecompositionSmall = 3;
                const int DecompositionIsolated = 4;
                const int DecompositionInitial = 5;
                const int DecompositionFinal = 6;
                const int DecompositionMedial = 7;
                const int DecompositionNoBreak = 8;
                const int DecompositionVertical = 9;
                const int DecompositionFraction = 0xA;
                const int DecompositionFont = 0xB;
                const int DecompositionSuper = 0xC; // fixed
                const int DecompositionFull = 0xE;
                const int DecompositionNarrow = 0xD;
                const int DecompositionCircle = 0xF;
                const int DecompositionSquare = 0x10;
                const int DecompositionCompat = 0x11;
                const int DecompositionCanonical = 0x12;
  62
  63                 TextWriter Result = Console.Out;
  64
  65                 byte [] fillIndex = new byte [256]; // by category
  66                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  67
  68                 char [] specialIgnore = new char [] {
  69                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  70                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  71                         };
  72
  73                 // FIXME: need more love (as always)
  74                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  75                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  76                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  77                         '\u0292', '\u01BE', '\u0298'};
  78                 byte [] alphaWeights = new byte [] {
  79                         2, 9, 0xA, 0x1A, 0x21,
  80                         0x23, 0x25, 0x2C, 0x32, 0x35,
  81                         0x36, 0x48, 0x51, 0x70, 0x7C,
  82                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  83                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  84                         0xA9, 0xAA, 0xB3, 0xB4};
  85
  86                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  87                 bool [] isUppercase = new bool [char.MaxValue + 1];
  88
  89                 byte [] decompType = new byte [char.MaxValue + 1];
  90                 int [] decompIndex = new int [char.MaxValue + 1];
  91                 int [] decompLength = new int [char.MaxValue + 1];
  92                 int [] decompValues;
  93                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  94
  95                 byte [] diacritical = new byte [char.MaxValue + 1];
  96
  97                 string [] diacritics = new string [] {
  98                         // LATIN
  99                         "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
 100                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 101                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
 102                         " OGONEK;", " CEDILLA;",
 103                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 104                         " STROKE;", " CIRCUMFLEX AND ACUTE;",
 105                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 106                         " DIAERESIS AND GRAVE;",
 107                         " BREVE AND ACUTE;",
 108                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 109                         " MACRON AND ACUTE;",
 110                         " MACRON AND GRAVE;",
 111                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 112                         " RING ABOVE AND ACUTE",
 113                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 114                         " CIRCUMFLEX AND TILDE",
 115                         " TILDE AND DIAERESIS",
 116                         " STROKE AND ACUTE",
 117                         " BREVE AND TILDE",
 118                         " CEDILLA AND BREVE",
 119                         " OGONEK AND MACRON",
 120                         " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 121                         " DOUBLE GRAVE;",
 122                         " INVERTED BREVE",
 123                         " PRECEDED BY APOSTROPHE",
 124                         " HORN;",
 125                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 126                         " PALATAL HOOK",
 127                         " DOT BELOW;",
 128                         " RETROFLEX;", "DIAERESIS BELOW",
 129                         " RING BELOW",
 130                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 131                         " BREVE BELOW;", " HORN AND GRAVE",
 132                         " TILDE BELOW",
 133                         " DOT BELOW AND DOT ABOVE",
 134                         " RIGHT HALF RING", " HORN AND TILDE",
 135                         " CIRCUMFLEX AND DOT BELOW",
 136                         " BREVE AND DOT BELOW",
 137                         " DOT BELOW AND MACRON",
 138                         " HORN AND HOOK ABOVE",
 139                         " HORN AND DOT",
 140                         // CIRCLED, PARENTHESIZED and so on
 141                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
 142                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 143                         };
 144                 byte [] diacriticWeights = new byte [] {
 145                         // LATIN.
 146                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 147                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
 148                         0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 149                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 150                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 151                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 152                         0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
 153                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 154                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
 155                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 156                         0x95, 0xAA,
 157                         // CIRCLED, PARENTHESIZED and so on.
 158                         0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
 159                         };
 160
 161                 int [] numberSecondaryWeightBounds = new int [] {
 162                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 163                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 164                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 165                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 166                         0xE50, 0xE60, 0xED0, 0xEE0
 167                         };
 168
 169                 char [] orderedCyrillic;
 170                 char [] orderedGurmukhi;
 171                 char [] orderedGujarati;
 172                 char [] orderedGeorgian;
 173                 char [] orderedThaana;
 174
 175                 static readonly char [] orderedTamilConsonants = new char [] {
 176                         // based on traditional Tamil consonants, except for
 177                         // Grantha (where Microsoft breaks traditionalism).
 178                         // http://www.angelfire.com/empire/thamizh/padanGaL
 179                         '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
 180                         '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
 181                         '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
 182                         '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
 183                         '\u0BB9'};
 184
 185                 // cp -> character name (only for some characters)
 186                 ArrayList sortableCharNames = new ArrayList ();
 187
 188                 // cp -> arrow value (int)
 189                 ArrayList arrowValues = new ArrayList ();
 190
 191                 // cp -> box value (int)
 192                 ArrayList boxValues = new ArrayList ();
 193
 194                 // cp -> level1 value
 195                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 196
 197                 // letterName -> cp
 198                 Hashtable arabicNameMap = new Hashtable ();
 199
 200                 // cp -> Hashtable [decompType] -> cp
 201                 Hashtable nfkdMap = new Hashtable ();
 202
 203                 // Latin letter -> ArrayList [int]
 204                 Hashtable latinMap = new Hashtable ();
 205
 206                 ArrayList jisJapanese = new ArrayList ();
 207                 ArrayList nonJisJapanese = new ArrayList ();
 208
 209                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 210                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 211                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 212                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 213                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 214
 215                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 216
 217                 static double [] unicodeAge = new double [char.MaxValue + 1];
 218
 219                 ArrayList tailorings = new ArrayList ();
 220
 221                 void Run (string [] args)
 222                 {
 223                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 224                         ParseSources (dirname);
 225                         Console.Error.WriteLine ("parse done.");
 226
 227                         ModifyParsedValues ();
 228                         GenerateCore ();
 229                         Console.Error.WriteLine ("generation done.");
 230                         Serialize ();
 231                         Console.Error.WriteLine ("serialization done.");
 232 /*
 233 StreamWriter sw = new StreamWriter ("agelog.txt");
 234 for (int i = 0; i < char.MaxValue; i++) {
 235 bool shouldBe = false;
 236 switch (Char.GetUnicodeCategory ((char) i)) {
 237 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 238         shouldBe = true; break;
 239 }
 240 if (unicodeAge [i] >= 3.1)
 241         shouldBe = true;
 242 //if (IsIgnorable (i) != shouldBe)
 243 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 244 }
 245 sw.Close ();
 246 */
 247                 }
 248
 249                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 250                 {
 251                         return (byte []) CodePointIndexer.CompressArray  (
 252                                 source, typeof (byte), i);
 253                 }
 254
 255                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 256                 {
 257                         return (ushort []) CodePointIndexer.CompressArray  (
 258                                 source, typeof (ushort), i);
 259                 }
 260
 261                 void Serialize ()
 262                 {
 263                         // Tailorings
 264                         SerializeTailorings ();
 265
 266                         byte [] categories = new byte [map.Length];
 267                         byte [] level1 = new byte [map.Length];
 268                         byte [] level2 = new byte [map.Length];
 269                         byte [] level3 = new byte [map.Length];
 270                         int [] widthCompat = new int [map.Length];
 271                         for (int i = 0; i < map.Length; i++) {
 272                                 categories [i] = map [i].Category;
 273                                 level1 [i] = map [i].Level1;
 274                                 level2 [i] = map [i].Level2;
 275                                 level3 [i] = ComputeLevel3Weight ((char) i);
 276                                 switch (decompType [i]) {
 277                                 case DecompositionNarrow:
 278                                 case DecompositionWide:
 279                                 case DecompositionSuper:
 280                                 case DecompositionSub:
 281                                         // they are always 1 char
 282                                         widthCompat [i] = decompValues [decompIndex [i]];
 283                                         break;
 284                                 }
 285                         }
 286
 287                         // compress
 288                         ignorableFlags = CompressArray (ignorableFlags,
 289                                 MSCompatUnicodeTableUtil.Ignorable);
 290                         categories = CompressArray (categories,
 291                                 MSCompatUnicodeTableUtil.Category);
 292                         level1 = CompressArray (level1,
 293                                 MSCompatUnicodeTableUtil.Level1);
 294                         level2 = CompressArray (level2,
 295                                 MSCompatUnicodeTableUtil.Level2);
 296                         level3 = CompressArray (level3,
 297                                 MSCompatUnicodeTableUtil.Level3);
 298                         widthCompat = (int []) CodePointIndexer.CompressArray (
 299                                 widthCompat, typeof (int),
 300                                 MSCompatUnicodeTableUtil.WidthCompat);
 301                         cjkCHS = CompressArray (cjkCHS,
 302                                 MSCompatUnicodeTableUtil.CjkCHS);
 303                         cjkCHT = CompressArray (cjkCHT,
 304                                 MSCompatUnicodeTableUtil.Cjk);
 305                         cjkJA = CompressArray (cjkJA,
 306                                 MSCompatUnicodeTableUtil.Cjk);
 307                         cjkKO = CompressArray (cjkKO,
 308                                 MSCompatUnicodeTableUtil.Cjk);
 309                         cjkKOlv2 = CompressArray (cjkKOlv2,
 310                                 MSCompatUnicodeTableUtil.Cjk);
 311
 312                         // Ignorables
 313                         Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
 314                         for (int i = 0; i < ignorableFlags.Length; i++) {
 315                                 byte value = ignorableFlags [i];
 316                                 if (value < 10)
 317                                         Result.Write ("{0},", value);
 318                                 else
 319                                         Result.Write ("0x{0:X02},", value);
 320                                 if ((i & 0xF) == 0xF)
 321                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 322                         }
 323                         Result.WriteLine ("};");
 324                         Result.WriteLine ();
 325
 326                         // Primary category
 327                         Result.WriteLine ("static byte [] categories = new byte [] {");
 328                         for (int i = 0; i < categories.Length; i++) {
 329                                 byte value = categories [i];
 330                                 if (value < 10)
 331                                         Result.Write ("{0},", value);
 332                                 else
 333                                         Result.Write ("0x{0:X02},", value);
 334                                 if ((i & 0xF) == 0xF)
 335                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 336                         }
 337                         Result.WriteLine ("};");
 338                         Result.WriteLine ();
 339
 340                         // Primary weight value
 341                         Result.WriteLine ("static byte [] level1 = new byte [] {");
 342                         for (int i = 0; i < level1.Length; i++) {
 343                                 byte value = level1 [i];
 344                                 if (value < 10)
 345                                         Result.Write ("{0},", value);
 346                                 else
 347                                         Result.Write ("0x{0:X02},", value);
 348                                 if ((i & 0xF) == 0xF)
 349                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 350                         }
 351                         Result.WriteLine ("};");
 352                         Result.WriteLine ();
 353
 354                         // Secondary weight
 355                         Result.WriteLine ("static byte [] level2 = new byte [] {");
 356                         for (int i = 0; i < level2.Length; i++) {
 357                                 int value = level2 [i];
 358                                 if (value < 10)
 359                                         Result.Write ("{0},", value);
 360                                 else
 361                                         Result.Write ("0x{0:X02},", value);
 362                                 if ((i & 0xF) == 0xF)
 363                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 364                         }
 365                         Result.WriteLine ("};");
 366                         Result.WriteLine ();
 367
 368                         // Thirtiary weight
 369                         Result.WriteLine ("static byte [] level3 = new byte [] {");
 370                         for (int i = 0; i < level3.Length; i++) {
 371                                 byte value = level3 [i];
 372                                 if (value < 10)
 373                                         Result.Write ("{0},", value);
 374                                 else
 375                                         Result.Write ("0x{0:X02},", value);
 376                                 if ((i & 0xF) == 0xF)
 377                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 378                         }
 379                         Result.WriteLine ("};");
 380                         Result.WriteLine ();
 381
 382                         // Width insensitivity mappings
 383                         // (for now it is more lightweight than dumping the
 384                         // entire NFKD table).
 385                         Result.WriteLine ("static int [] widthCompat = new int [] {");
 386                         for (int i = 0; i < widthCompat.Length; i++) {
 387                                 int value = widthCompat [i];
 388                                 if (value < 10)
 389                                         Result.Write ("{0},", value);
 390                                 else
 391                                         Result.Write ("0x{0:X02},", value);
 392                                 if ((i & 0xF) == 0xF)
 393                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 394                         }
 395                         Result.WriteLine ("};");
 396                         Result.WriteLine ();
 397
 398                         // CJK
 399                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 400                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 401                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 402                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 403                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 404                 }
 405
 406                 void SerializeCJK (string name, ushort [] cjk, int max)
 407                 {
 408                         int offset = 0;//char.MaxValue - cjk.Length;
 409                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 410                         for (int i = 0; i < cjk.Length; i++) {
 411                                 if (i + offset == max)
 412                                         break;
 413                                 ushort value = cjk [i];
 414                                 if (value < 10)
 415                                         Result.Write ("{0},", value);
 416                                 else
 417                                         Result.Write ("0x{0:X04},", value);
 418                                 if ((i & 0xF) == 0xF)
 419                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 420                         }
 421                         Result.WriteLine ("};");
 422                         Result.WriteLine ();
 423                 }
 424
 425                 void SerializeCJK (string name, byte [] cjk, int max)
 426                 {
 427                         int offset = 0;//char.MaxValue - cjk.Length;
 428                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 429                         for (int i = 0; i < cjk.Length; i++) {
 430                                 if (i + offset == max)
 431                                         break;
 432                                 byte value = cjk [i];
 433                                 if (value < 10)
 434                                         Result.Write ("{0},", value);
 435                                 else
 436                                         Result.Write ("0x{0:X02},", value);
 437                                 if ((i & 0xF) == 0xF)
 438                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 439                         }
 440                         Result.WriteLine ("};");
 441                         Result.WriteLine ();
 442                 }
 443
 444                 void SerializeTailorings ()
 445                 {
 446                         Hashtable indexes = new Hashtable ();
 447                         Hashtable counts = new Hashtable ();
 448                         Result.WriteLine ("static char [] tailorings = new char [] {");
 449                         int count = 0;
 450                         foreach (Tailoring t in tailorings) {
 451                                 if (t.Alias != 0)
 452                                         continue;
 453                                 Result.Write ("/*{0}*/", t.LCID);
 454                                 indexes.Add (t.LCID, count);
 455                                 char [] values = t.ItemToCharArray ();
 456                                 counts.Add (t.LCID, values.Length);
 457                                 foreach (char c in values) {
 458                                         Result.Write ("'\\x{0:X}', ", (int) c);
 459                                         if (++count % 16 == 0)
 460                                                 Result.WriteLine (" // {0:X04}", count - 16);
 461                                 }
 462                         }
 463                         Result.WriteLine ("};");
 464
 465                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 466                         foreach (Tailoring t in tailorings) {
 467                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 468                                 if (!indexes.ContainsKey (target)) {
 469                                         Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
 470                                         continue;
 471                                 }
 472                                 int idx = (int) indexes [target];
 473                                 int cnt = (int) counts [target];
 474                                 bool french = t.FrenchSort;
 475                                 if (t.Alias != 0)
 476                                         foreach (Tailoring t2 in tailorings)
 477                                                 if (t2.LCID == t.LCID)
 478                                                         french = t2.FrenchSort;
 479                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 480                         }
 481                         Result.WriteLine ("};");
 482                 }
 483
 484                 #region Parse
 485
 486                 void ParseSources (string dirname)
 487                 {
 488                         string unidata =
 489                                 dirname + "/UnicodeData.txt";
 490                         string derivedCoreProps =
 491                                 dirname + "/DerivedCoreProperties.txt";
 492                         string scripts =
 493                                 dirname + "/Scripts.txt";
 494                         string cp932 =
 495                                 dirname + "/CP932.TXT";
 496                         string derivedAge =
 497                                 dirname + "/DerivedAge.txt";
 498                         string chXML = dirname + "/common/collation/zh.xml";
 499                         string jaXML = dirname + "/common/collation/ja.xml";
 500                         string koXML = dirname + "/common/collation/ko.xml";
 501
 502                         ParseDerivedAge (derivedAge);
 503
 504                         FillIgnorables ();
 505
 506                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 507                         ParseUnidata (unidata);
 508                         ParseDerivedCoreProperties (derivedCoreProps);
 509                         ParseScripts (scripts);
 510                         ParseCJK (chXML, jaXML, koXML);
 511
 512                         ParseTailorings ("mono-tailoring-source.txt");
 513                 }
 514
 515                 void ParseTailorings (string filename)
 516                 {
 517                         Tailoring t = null;
 518                         int line = 0;
 519                         using (StreamReader sr = new StreamReader (filename)) {
 520                                 try {
 521                                         while (sr.Peek () >= 0) {
 522                                                 line++;
 523                                                 ProcessTailoringLine (ref t,
 524                                                         sr.ReadLine ().Trim ());
 525                                         }
 526                                 } catch (Exception) {
 527                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 528                                         throw;
 529                                 }
 530                         }
 531                 }
 532
 533                 // For now this is enough.
 534                 string ParseTailoringSourceValue (string s)
 535                 {
 536                         StringBuilder sb = new StringBuilder ();
 537                         for (int i = 0; i < s.Length; i++) {
 538                                 if (s.StartsWith ("\\u")) {
 539                                         sb.Append ((char) int.Parse (
 540                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 541                                                 1);
 542                                         i += 5;
 543                                 }
 544                         else
 545                                 sb.Append (s [i]);
 546                         }
 547                         return sb.ToString ();
 548                 }
 549
 550                 void ProcessTailoringLine (ref Tailoring t, string s)
 551                 {
 552                         int idx = s.IndexOf ('#');
 553                         if (idx > 0)
 554                                 s = s.Substring (0, idx).Trim ();
 555                         if (s.Length == 0 || s [0] == '#')
 556                                 return;
 557                         if (s [0] == '@') {
 558                                 idx = s.IndexOf ('=');
 559                                 if (idx > 0)
 560                                         t = new Tailoring (
 561                                                 int.Parse (s.Substring (1, idx - 1)),
 562                                                 int.Parse (s.Substring (idx + 1)));
 563                                 else
 564                                         t = new Tailoring (int.Parse (s.Substring (1)));
 565                                 tailorings.Add (t);
 566                                 return;
 567                         }
 568                         if (s.StartsWith ("*FrenchSort")) {
 569                                 t.FrenchSort = true;
 570                                 return;
 571                         }
 572                         string d = "*Diacritical";
 573                         if (s.StartsWith (d)) {
 574                                 idx = s.IndexOf ("->");
 575                                 t.AddDiacriticalMap (
 576                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 577                                                 NumberStyles.HexNumber),
 578                                         byte.Parse (s.Substring (idx + 2).Trim (),
 579                                                 NumberStyles.HexNumber));
 580                                 return;
 581                         }
 582                         idx = s.IndexOf (':');
 583                         if (idx > 0) {
 584                                 string source = s.Substring (0, idx).Trim ();
 585                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 586                                 byte [] b = new byte [5];
 587                                 for (int i = 0; i < 5; i++) {
 588                                         if (l [i] == "*")
 589                                                 b [i] = 0;
 590                                         else
 591                                                 b [i] = byte.Parse (l [i],
 592                                                         NumberStyles.HexNumber);
 593                                 }
 594                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 595                                         b);
 596                         }
 597                         idx = s.IndexOf ('=');
 598                         if (idx > 0)
 599                                 t.AddReplacementMap (
 600                                         ParseTailoringSourceValue (
 601                                                 s.Substring (0, idx).Trim ()),
 602                                         ParseTailoringSourceValue (
 603                                                 s.Substring (idx + 1).Trim ()));
 604                 }
 605
 606                 void ParseDerivedAge (string filename)
 607                 {
 608                         using (StreamReader file =
 609                                 new StreamReader (filename)) {
 610                                 while (file.Peek () >= 0) {
 611                                         string s = file.ReadLine ();
 612                                         int idx = s.IndexOf ('#');
 613                                         if (idx >= 0)
 614                                                 s = s.Substring (0, idx);
 615                                         idx = s.IndexOf (';');
 616                                         if (idx < 0)
 617                                                 continue;
 618
 619                                         string cpspec = s.Substring (0, idx);
 620                                         idx = cpspec.IndexOf ("..");
 621                                         NumberStyles nf = NumberStyles.HexNumber |
 622                                                 NumberStyles.AllowTrailingWhite;
 623                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 624                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 625                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 626
 627                                         // FIXME: use index
 628                                         if (cp > char.MaxValue)
 629                                                 continue;
 630
 631                                         for (int i = cp; i <= cpEnd; i++)
 632                                                 unicodeAge [i] = double.Parse (value);
 633                                 }
 634                         }
 635                         unicodeAge [0] = double.MaxValue; // never be supported
 636                 }
 637
 638                 void ParseUnidata (string filename)
 639                 {
 640                         ArrayList decompValues = new ArrayList ();
 641                         using (StreamReader unidata =
 642                                 new StreamReader (filename)) {
 643                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 644                                         try {
 645                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 646                                         } catch (Exception) {
 647                                                 Console.Error.WriteLine ("**** At line " + line);
 648                                                 throw;
 649                                         }
 650                                 }
 651                         }
 652                         this.decompValues = (int [])
 653                                 decompValues.ToArray (typeof (int));
 654                 }
 655
 656                 void ProcessUnidataLine (string s, ArrayList decompValues)
 657                 {
 658                         int idx = s.IndexOf ('#');
 659                         if (idx >= 0)
 660                                 s = s.Substring (0, idx);
 661                         idx = s.IndexOf (';');
 662                         if (idx < 0)
 663                                 return;
 664                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 665                         string [] values = s.Substring (idx + 1).Split (';');
 666
 667                         // FIXME: use index
 668                         if (cp > char.MaxValue)
 669                                 return;
 670                         if (IsIgnorable (cp))
 671                                 return;
 672
 673                         string name = values [0];
 674
 675                         // isSmallCapital
 676                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 677                                 isSmallCapital [cp] = true;
 678
 679                         // latin mapping by character name
 680                         if (s.IndexOf ("LATIN") > 0) {
 681                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 682                                 int offset = lidx + 15;
 683                                 if (lidx < 0) {
 684                                         lidx = s.IndexOf ("LETTER TURNED ");
 685                                         offset = lidx + 14;
 686                                 }
 687                                 if (lidx < 0) {
 688                                         lidx = s.IndexOf ("LETTER ");
 689                                         offset = lidx + 7;
 690                                 }
 691                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 692                                 if ('A' <= c && c <= 'Z' &&
 693                                         (s.Length == offset + 1 || s [offset + 1] == ' ')) {
 694                                         ArrayList entry = (ArrayList) latinMap [c];
 695                                         if (entry == null) {
 696                                                 entry = new ArrayList ();
 697                                                 latinMap [c] = entry;
 698                                         }
 699                                         entry.Add (cp);
 700                                 }
 701                         }
 702
 703                         // Arrow names
 704                         if (0x2000 <= cp && cp < 0x3000) {
 705                                 int value = 0;
 706                                 // SPECIAL CASES. FIXME: why?
 707                                 switch (cp) {
 708                                 case 0x21C5: value = -1; break; // E2
 709                                 case 0x261D: value = 1; break;
 710                                 case 0x27A6: value = 3; break;
 711                                 case 0x21B0: value = 7; break;
 712                                 case 0x21B1: value = 3; break;
 713                                 case 0x21B2: value = 7; break;
 714                                 case 0x21B4: value = 5; break;
 715                                 case 0x21B5: value = 7; break;
 716                                 case 0x21B9: value = -1; break; // E1
 717                                 case 0x21CF: value = 7; break;
 718                                 case 0x21D0: value = 3; break;
 719                                 }
 720                                 string [] arrowTargets = new string [] {
 721                                         "",
 722                                         "UPWARDS",
 723                                         "NORTH EAST",
 724                                         "RIGHTWARDS",
 725                                         "SOUTH EAST",
 726                                         "DOWNWARDS",
 727                                         "SOUTH WEST",
 728                                         "LEFTWARDS",
 729                                         "NORTH WEST",
 730                                         };
 731                                 if (value == 0)
 732                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 733                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 734                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 735                                                         s.IndexOf (" OVER") < 0
 736                                                 )
 737                                                         value = i;
 738                                 if (value > 0)
 739                                         arrowValues.Add (new DictionaryEntry (
 740                                                 cp, value));
 741                         }
 742
 743                         // Box names
 744                         if (0x2500 <= cp && cp < 0x25B0) {
 745                                 int value = 0;
 746                                 // flags:
 747                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 748                                 // [h,rl] [r] [l]
 749                                 // [v,ud] [u] [d]
 750                                 // [dr] [dl] [ur] [ul]
 751                                 // [vr,udr] [vl,vdl]
 752                                 // [hd,rld] [hu,rlu]
 753                                 // [hv,udrl,rlv,udh]
 754                                 ArrayList flags = new ArrayList (new int [] {
 755                                         32, 8 + 4, 8, 4,
 756                                         16, 1 + 2, 1, 2,
 757                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 758                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 759                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 760                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 761                                         });
 762                                 byte [] offsets = new byte [] {
 763                                         0, 0, 1, 2,
 764                                         3, 3, 4, 5,
 765                                         6, 7, 8, 9,
 766                                         10, 10, 11, 11,
 767                                         12, 12, 13, 13,
 768                                         14, 14, 14, 14};
 769                                 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
 770                                         int flag = 0;
 771                                         if (s.IndexOf (" UP") > 0)
 772                                                 flag |= 1;
 773                                         if (s.IndexOf (" DOWN") > 0)
 774                                                 flag |= 2;
 775                                         if (s.IndexOf (" RIGHT") > 0)
 776                                                 flag |= 4;
 777                                         if (s.IndexOf (" LEFT") > 0)
 778                                                 flag |= 8;
 779                                         if (s.IndexOf (" VERTICAL") > 0)
 780                                                 flag |= 16;
 781                                         if (s.IndexOf (" HORIZONTAL") > 0)
 782                                                 flag |= 32;
 783
 784                                         int fidx = flags.IndexOf (flag);
 785                                         value = fidx < 0 ? fidx : offsets [fidx];
 786                                 } else if (s.IndexOf ("BLOCK") > 0) {
 787                                         if (s.IndexOf ("ONE EIGHTH") > 0)
 788                                                 value = 0x12;
 789                                         else if (s.IndexOf ("ONE QUARTER") > 0)
 790                                                 value = 0x13;
 791                                         else if (s.IndexOf ("THREE EIGHTHS") > 0)
 792                                                 value = 0x14;
 793                                         else if (s.IndexOf ("HALF") > 0)
 794                                                 value = 0x15;
 795                                         else if (s.IndexOf ("FIVE EIGHTHS") > 0)
 796                                                 value = 0x16;
 797                                         else if (s.IndexOf ("THREE QUARTERS") > 0)
 798                                                 value = 0x17;
 799                                         else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
 800                                                 value = 0x18;
 801                                         else
 802                                                 value = 0x19;
 803                                 }
 804                                 if (value >= 0)
 805                                         boxValues.Add (new DictionaryEntry (
 806                                                 cp, value));
 807                         }
 808
 809                         // For some characters store the name and sort later
 810                         // to determine sorting.
 811                         if (0x2100 <= cp && cp <= 0x213F &&
 812                                 Char.IsSymbol ((char) cp))
 813                                 sortableCharNames.Add (
 814                                         new DictionaryEntry (cp, values [0]));
 815                         else if (0x3380 <= cp && cp <= 0x33DD)
 816                                 sortableCharNames.Add (new DictionaryEntry (
 817                                         cp, values [0].Substring (7)));
 818
 819                         // diacritical weights by character name
 820                         for (int d = 0; d < diacritics.Length; d++)
 821                                 if (s.IndexOf (diacritics [d]) > 0)
 822                                         diacritical [cp] |= diacriticWeights [d];
 823                         // Two-step grep required for it.
 824                         if (s.IndexOf ("FULL STOP") > 0 &&
 825                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
 826                                 diacritical [cp] |= 0xF4;
 827
 828                         // Arabic letter name
 829                         if (0x0621 <= cp && cp <= 0x064A &&
 830                                 Char.GetUnicodeCategory ((char) cp)
 831                                 == UnicodeCategory.OtherLetter) {
 832                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
 833                                 switch (cp) {
 834                                 case 0x0621:
 835                                 case 0x0624:
 836                                 case 0x0626:
 837                                         // hamza, waw, yeh ... special cases.
 838                                         value = 0x07;
 839                                         break;
 840                                 case 0x0649:
 841                                 case 0x064A:
 842                                         value = 0x77; // special cases.
 843                                         break;
 844                                 default:
 845                                         // Get primary letter name i.e.
 846                                         // XXX part of ARABIC LETTER XXX yyy
 847                                         // e.g. that of "TEH MARBUTA" is "TEH".
 848                                         string letterName =
 849                                                 (cp == 0x0640) ?
 850                                                 // 0x0640 is special: it does
 851                                                 // not start with ARABIC LETTER
 852                                                 values [0] :
 853                                                 values [0].Substring (14);
 854                                         int tmpIdx = letterName.IndexOf (' ');
 855                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
 856 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
 857                                         if (arabicNameMap.ContainsKey (letterName))
 858                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
 859                                         else
 860                                                 arabicNameMap [letterName] = cp;
 861                                         break;
 862                                 }
 863                                 arabicLetterPrimaryValues [cp] = value;
 864                         }
 865
 866                         // Japanese square letter
 867                         if (0x3300 <= cp && cp <= 0x3357)
 868                                 if (!ExistsJIS (cp))
 869                                         nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
 870
 871                         // normalizationType
 872                         string decomp = values [4];
 873                         idx = decomp.IndexOf ('<');
 874                         if (idx >= 0) {
 875                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
 876                                 case "full":
 877                                         decompType [cp] = DecompositionFull;
 878                                         break;
 879                                 case "sub":
 880                                         decompType [cp] = DecompositionSub;
 881                                         break;
 882                                 case "super":
 883                                         decompType [cp] = DecompositionSuper;
 884                                         break;
 885                                 case "small":
 886                                         decompType [cp] = DecompositionSmall;
 887                                         break;
 888                                 case "isolated":
 889                                         decompType [cp] = DecompositionIsolated;
 890                                         break;
 891                                 case "initial":
 892                                         decompType [cp] = DecompositionInitial;
 893                                         break;
 894                                 case "final":
 895                                         decompType [cp] = DecompositionFinal;
 896                                         break;
 897                                 case "medial":
 898                                         decompType [cp] = DecompositionMedial;
 899                                         break;
 900                                 case "noBreak":
 901                                         decompType [cp] = DecompositionNoBreak;
 902                                         break;
 903                                 case "compat":
 904                                         decompType [cp] = DecompositionCompat;
 905                                         break;
 906                                 case "fraction":
 907                                         decompType [cp] = DecompositionFraction;
 908                                         break;
 909                                 case "font":
 910                                         decompType [cp] = DecompositionFont;
 911                                         break;
 912                                 case "circle":
 913                                         decompType [cp] = DecompositionCircle;
 914                                         break;
 915                                 case "square":
 916                                         decompType [cp] = DecompositionSquare;
 917                                         break;
 918                                 case "wide":
 919                                         decompType [cp] = DecompositionWide;
 920                                         break;
 921                                 case "narrow":
 922                                         decompType [cp] = DecompositionNarrow;
 923                                         break;
 924                                 case "vertical":
 925                                         decompType [cp] = DecompositionVertical;
 926                                         break;
 927                                 default:
 928                                         throw new Exception ("Support NFKD type : " + decomp);
 929                                 }
 930                         }
 931                         else
 932                                 decompType [cp] = DecompositionCanonical;
 933                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
 934                         if (decomp.Length > 0) {
 935
 936                                 string [] velems = decomp.Split (' ');
 937                                 int didx = decompValues.Count;
 938                                 decompIndex [cp] = didx;
 939                                 foreach (string v in velems)
 940                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
 941                                 decompLength [cp] = velems.Length;
 942
 943                                 // [decmpType] -> this_cp
 944                                 int targetCP = (int) decompValues [didx];
 945                                 // for "(x)" it specially maps to 'x' .
 946                                 // FIXME: check if it is sane
 947                                 if (velems.Length == 3 &&
 948                                         (int) decompValues [didx] == '(' &&
 949                                         (int) decompValues [didx + 2] == ')')
 950                                         targetCP = (int) decompValues [didx + 1];
 951                                 // special: 0x215F "1/"
 952                                 else if (cp == 0x215F)
 953                                         targetCP = '1';
 954                                 else if (velems.Length > 1 &&
 955                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
 956                                         // skip them, except for CJK ideograph compat
 957                                         targetCP = 0;
 958
 959                                 if (targetCP != 0) {
 960                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
 961                                         if (entry == null) {
 962                                                 entry = new Hashtable ();
 963                                                 nfkdMap [targetCP] = entry;
 964                                         }
 965                                         entry [(byte) decompType [cp]] = cp;
 966                                 }
 967                         }
 968                         // numeric values
 969                         if (values [5].Length > 0)
 970                                 decimalValue [cp] = decimal.Parse (values [5]);
 971                         else if (values [6].Length > 0)
 972                                 decimalValue [cp] = decimal.Parse (values [6]);
 973                         else if (values [7].Length > 0) {
 974                                 string decstr = values [7];
 975                                 idx = decstr.IndexOf ('/');
 976                                 if (cp == 0x215F) // special. "1/"
 977                                         decimalValue [cp] = 0x1;
 978                                 else if (idx > 0)
 979                                         // m/n
 980                                         decimalValue [cp] =
 981                                                 decimal.Parse (decstr.Substring (0, idx))
 982                                                 / decimal.Parse (decstr.Substring (idx + 1));
 983                                 else if (decstr [0] == '(' &&
 984                                         decstr [decstr.Length - 1] == ')')
 985                                         // (n)
 986                                         decimalValue [cp] =
 987                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
 988                                 else if (decstr [decstr.Length - 1] == '.')
 989                                         // n.
 990                                         decimalValue [cp] =
 991                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
 992                                 else
 993                                         decimalValue [cp] = decimal.Parse (decstr);
 994                         }
 995                 }
 996
 997                 void ParseDerivedCoreProperties (string filename)
 998                 {
 999                         // IsUppercase
1000                         using (StreamReader file =
1001                                 new StreamReader (filename)) {
1002                                 for (int line = 1; file.Peek () >= 0; line++) {
1003                                         try {
1004                                                 ProcessDerivedCorePropLine (file.ReadLine ());
1005                                         } catch (Exception) {
1006                                                 Console.Error.WriteLine ("**** At line " + line);
1007                                                 throw;
1008                                         }
1009                                 }
1010                         }
1011                 }
1012
1013                 void ProcessDerivedCorePropLine (string s)
1014                 {
1015                         int idx = s.IndexOf ('#');
1016                         if (idx >= 0)
1017                                 s = s.Substring (0, idx);
1018                         idx = s.IndexOf (';');
1019                         if (idx < 0)
1020                                 return;
1021                         string cpspec = s.Substring (0, idx);
1022                         idx = cpspec.IndexOf ("..");
1023                         NumberStyles nf = NumberStyles.HexNumber |
1024                                 NumberStyles.AllowTrailingWhite;
1025                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1026                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1027                         string value = s.Substring (cpspec.Length + 1).Trim ();
1028
1029                         // FIXME: use index
1030                         if (cp > char.MaxValue)
1031                                 return;
1032
1033                         switch (value) {
1034                         case "Uppercase":
1035                                 for (int x = cp; x <= cpEnd; x++)
1036                                         isUppercase [x] = true;
1037                                 break;
1038                         }
1039                 }
1040
1041                 void ParseScripts (string filename)
1042                 {
1043                         ArrayList cyrillic = new ArrayList ();
1044                         ArrayList gurmukhi = new ArrayList ();
1045                         ArrayList gujarati = new ArrayList ();
1046                         ArrayList georgian = new ArrayList ();
1047                         ArrayList thaana = new ArrayList ();
1048
1049                         using (StreamReader file =
1050                                 new StreamReader (filename)) {
1051                                 while (file.Peek () >= 0) {
1052                                         string s = file.ReadLine ();
1053                                         int idx = s.IndexOf ('#');
1054                                         if (idx >= 0)
1055                                                 s = s.Substring (0, idx);
1056                                         idx = s.IndexOf (';');
1057                                         if (idx < 0)
1058                                                 continue;
1059
1060                                         string cpspec = s.Substring (0, idx);
1061                                         idx = cpspec.IndexOf ("..");
1062                                         NumberStyles nf = NumberStyles.HexNumber |
1063                                                 NumberStyles.AllowTrailingWhite;
1064                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1065                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1066                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1067
1068                                         // FIXME: use index
1069                                         if (cp > char.MaxValue)
1070                                                 continue;
1071
1072                                         switch (value) {
1073                                         case "Cyrillic":
1074                                                 for (int x = cp; x <= cpEnd; x++)
1075                                                         if (!IsIgnorable (x))
1076                                                                 cyrillic.Add ((char) x);
1077                                                 break;
1078                                         case "Gurmukhi":
1079                                                 for (int x = cp; x <= cpEnd; x++)
1080                                                         if (!IsIgnorable (x))
1081                                                                 gurmukhi.Add ((char) x);
1082                                                 break;
1083                                         case "Gujarati":
1084                                                 for (int x = cp; x <= cpEnd; x++)
1085                                                         if (!IsIgnorable (x))
1086                                                                 gujarati.Add ((char) x);
1087                                                 break;
1088                                         case "Georgian":
1089                                                 for (int x = cp; x <= cpEnd; x++)
1090                                                         if (!IsIgnorable (x))
1091                                                                 georgian.Add ((char) x);
1092                                                 break;
1093                                         case "Thaana":
1094                                                 for (int x = cp; x <= cpEnd; x++)
1095                                                         if (!IsIgnorable (x))
1096                                                                 thaana.Add ((char) x);
1097                                                 break;
1098                                         }
1099                                 }
1100                         }
1101                         cyrillic.Sort (UCAComparer.Instance);
1102                         gurmukhi.Sort (UCAComparer.Instance);
1103                         gujarati.Sort (UCAComparer.Instance);
1104                         georgian.Sort (UCAComparer.Instance);
1105                         thaana.Sort (UCAComparer.Instance);
1106                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1107                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1108                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1109                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1110                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1111                 }
1112
// Loads the JIS ordering table from the given file into jisJapanese.
//
// Each data line is expected to hold two hexadecimal fields separated by
// spaces or tabs: first the JIS code, then the Unicode code point. Both
// fields normally start with "0x". Everything from '#' to the end of a
// line is a comment; empty lines and lines with a single field are
// skipped.
//
// Fields that are not valid hexadecimal numbers still make int.Parse
// throw, so a broken data file is reported instead of being ignored.
void ParseJISOrder (string filename)
{
        char [] separators = new char [] {' ', '\t'};
        using (StreamReader file = new StreamReader (filename)) {
                string line;
                // ReadLine returns null once the end of the stream is reached.
                while ((line = file.ReadLine ()) != null) {
                        int idx = line.IndexOf ('#');
                        if (idx >= 0)
                                line = line.Substring (0, idx);
                        // Trim unconditionally: lines without a comment may
                        // be indented or padded as well.
                        line = line.Trim ();
                        if (line.Length == 0)
                                continue;
                        idx = line.IndexOfAny (separators);
                        if (idx < 0)
                                continue;
                        // The first field is exactly the text before the
                        // separator. (The former Substring (2, idx) passed
                        // the end index as a length and thus swallowed the
                        // separator and the start of the second field.)
                        int jis = ParseJISHexField (line.Substring (0, idx));
                        // The second field is whatever follows the
                        // separator run, however long that run is.
                        int cp = ParseJISHexField (line.Substring (idx).Trim ());
                        jisJapanese.Add (new JISCharacter (cp, jis));
                }
        }
}

// Parses one hexadecimal field of the JIS table, dropping a leading
// "0x" / "0X" prefix when there is one.
static int ParseJISHexField (string field)
{
        if (field.Length > 2 && field [0] == '0' &&
                (field [1] == 'x' || field [1] == 'X'))
                field = field.Substring (2);
        return int.Parse (field, NumberStyles.HexNumber);
}
1134
// Fills the culture-specific CJK weight tables from LDML collation
// documents.
//
//   zhXML - document holding both the "pinyin" collation (used for
//           cjkCHS) and the "stroke" collation (used for cjkCHT)
//   jaXML - document whose collation rules fill cjkJA
//   koXML - document whose <reset> rules fill cjkKO and cjkKOlv2
//
// Throws InvalidOperationException when an expected node is missing
// from a document. Exceptions raised by XmlDocument.Load are not caught.
void ParseCJK (string zhXML, string jaXML, string koXML)
{
        XmlDocument doc = new XmlDocument ();
        doc.XmlResolver = null;

        // Chinese Simplified
        doc.Load (zhXML);
        FillCJKWeights ("chs", cjkCHS,
                GetCJKRuleText (doc, "/ldml/collations/collation[@type='pinyin']/rules/pc", zhXML),
                0x8008, '\u3100');

        // Chinese Traditional (read from the same document)
        FillCJKWeights ("cht", cjkCHT,
                GetCJKRuleText (doc, "/ldml/collations/collation[@type='stroke']/rules/pc", zhXML),
                0x8002, '\u4E00');

        // Japanese
        doc.Load (jaXML);
        FillCJKWeights ("ja", cjkJA,
                GetCJKRuleText (doc, "/ldml/collations/collation/rules/pc", jaXML),
                0x8008, '\u4E00');

        // Korean
        // Korean weight is somewhat complex. It first shifts
        // Hangul category from 52-x to 80-x (they are anyways
        // computed). CJK ideographs are placed at secondary
        // weight, like XX YY 01 zz 01, where XX and YY are
        // corresponding "reset" value and zz is 41,43,45...
        //
        // Unlike chs, cht and ja, Korean value is a combined
        // ushort which is computed as category
        //
        doc.Load (koXML);
        foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                // The characters to place are the text of the element
                // that follows <reset>; step over comments and other
                // non-element nodes instead of failing on the cast.
                XmlNode follower = reset.NextSibling;
                while (follower != null && !(follower is XmlElement))
                        follower = follower.NextSibling;
                string target = reset.InnerText;
                if (follower == null || target.Length == 0)
                        throw new InvalidOperationException (String.Format (
                                "{0}: found a <reset> element that is empty or is not followed by another element.",
                                koXML));

                // compute "category" and "level 1" for the
                // target "reset" Hangul syllable
                int ri = ((int) target [0] - 0xAC00) + 1;
                ushort p = (ushort)
                        ((ri / 254) * 256 + (ri % 254) + 2);

                // Place the characters after the target.
                // NOTE(review): the (byte) cast wraps silently (unless
                // the file is compiled with overflow checking) once
                // more than 96 characters follow a single reset,
                // since 0x41 + 2 * 95 == 0xFF -- confirm the data
                // never gets that long.
                int v = 0x41;
                foreach (char c in follower.InnerText) {
                        cjkKO [(int) c] = p;
                        cjkKOlv2 [(int) c] = (byte) v;
                        v += 2;
                }
        }
}

// Returns the inner text of the single node that xpath selects in doc.
// SelectSingleNode returns null when nothing matches, which is turned
// into an error naming the source file and the query.
static string GetCJKRuleText (XmlDocument doc, string xpath, string source)
{
        XmlNode node = doc.SelectSingleNode (xpath);
        if (node == null)
                throw new InvalidOperationException (String.Format (
                        "{0}: no node matches '{1}'.", source, xpath));
        return node.InnerText;
}

// Assigns increasing weights to the characters of "ordered", in order,
// storing them in "table" indexed by the character code.
//
//   category    - culture tag, only used in the warning message
//   firstWeight - weight given to the first accepted character
//   lowerBound  - characters below this one are not stored; a warning
//                 is written to stderr and the weight does not advance
//
// Whenever the next weight becomes a multiple of 256 it is advanced by
// two more (presumably to keep the low byte away from 0 and 1 -- verify
// against the consumer of these tables).
static void FillCJKWeights (string category, ushort [] table, string ordered, int firstWeight, char lowerBound)
{
        int weight = firstWeight;
        foreach (char c in ordered) {
                if (c < lowerBound) {
                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, weight);
                        continue;
                }
                table [(int) c] = (ushort) weight++;
                if (weight % 256 == 0)
                        weight += 2;
        }
}
1227
1228                 #endregion
1229
1230                 #region Generation
1231
// Records in ignorableFlags, for every UTF-16 code unit that is not in
// the OtherNotAssigned category, which "ignorable" predicates hold:
//   bit 1 - IsIgnorable
//   bit 2 - IsIgnorableSymbol
//   bit 4 - IsIgnorableNonSpacing
// Entries for unassigned code units are left as they are.
void FillIgnorables ()
{
        for (int cp = char.MinValue; cp <= char.MaxValue; ++cp) {
                UnicodeCategory category = Char.GetUnicodeCategory ((char) cp);
                if (category != UnicodeCategory.OtherNotAssigned) {
                        if (IsIgnorable (cp)) {
                                ignorableFlags [cp] |= 1;
                        }
                        if (IsIgnorableSymbol (cp)) {
                                ignorableFlags [cp] |= 2;
                        }
                        if (IsIgnorableNonSpacing (cp)) {
                                ignorableFlags [cp] |= 4;
                        }
                }
        }
}
1246
// Applies hand-made corrections to the tables that were filled while
// parsing: secondary weights of numbers, two decomposition entries,
// the Korean parenthesized numbers, and the names of some named
// characters (a few renamed, a few dropped).
void ModifyParsedValues ()
{
        // number, secondary weights:
        // numberSecondaryWeightBounds is read as consecutive
        // (start, end) pairs, end exclusive; each pair gets the next
        // weight, starting from 0x38.
        int [] bounds = numberSecondaryWeightBounds;
        byte secondary = 0x38;
        for (int pair = 0; pair < bounds.Length; pair += 2) {
                for (int cp = bounds [pair]; cp < bounds [pair + 1]; cp++) {
                        if (Char.IsNumber ((char) cp))
                                diacritical [cp] = secondary;
                }
                secondary++;
        }

        // Modify some decomposition equivalence: drop the parsed
        // decomposition data of U+FE31 and U+FE32.
        foreach (int cp in new int [] {0xFE31, 0xFE32}) {
                decompType [cp] = 0;
                decompIndex [cp] = 0;
                decompLength [cp] = 0;
        }

        // Korean parens numbers
        int korean = 0x3200;
        while (korean <= 0x321C)
                diacritical [korean++] = 0xA;
        korean = 0x3260;
        while (korean <= 0x327B)
                diacritical [korean++] = 0xC;

        // Update name part of named characters
        int index = 0;
        while (index < sortableCharNames.Count) {
                int codePoint = (int) ((DictionaryEntry) sortableCharNames [index]).Key;
                string newName = null;
                switch (codePoint) {
                case 0x2101:
                        newName = "A_1";
                        break;
                case 0x33C3:
                        newName = "A_2";
                        break;
                case 0x2105:
                        newName = "C_1";
                        break;
                case 0x2106:
                        newName = "C_2";
                        break;
                case 0x211E:
                        newName = "R1";
                        break;
                case 0x211F:
                        newName = "R2";
                        break;
                // Remove some of them!
                case 0x2103:
                case 0x2109:
                case 0x2116:
                case 0x2117:
                case 0x2118:
                case 0x2125:
                case 0x2127:
                case 0x2129:
                case 0x212E:
                case 0x2132:
                        sortableCharNames.RemoveAt (index);
                        // The next entry has moved into this slot,
                        // so the index must not advance.
                        continue;
                }
                if (newName != null)
                        sortableCharNames [index] = new DictionaryEntry (codePoint, newName);
                index++;
        }
}
1304
1305                 void GenerateCore ()
1306                 {
1307                         UnicodeCategory uc;
1308
1309                         #region Specially ignored // 01
1310                         // This will raise "Defined" flag up.
1311                         foreach (char c in specialIgnore)
1312                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1313                         #endregion
1314
1315
1316                         #region Variable weights
1317                         // Controls : 06 03 - 06 3D
1318                         fillIndex [6] = 3;
1319                         for (int i = 0; i < 65536; i++) {
1320                                 if (IsIgnorable (i))
1321                                         continue;
1322                                 char c = (char) i;
1323                                 uc = Char.GetUnicodeCategory (c);
1324                                 // NEL is whitespace but not ignored here.
1325                                 if (uc == UnicodeCategory.Control &&
1326                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1327                                         AddCharMap (c, 6, 1);
1328                         }
1329
1330                         // Apostrophe 06 80
1331                         fillIndex [6] = 0x80;
1332                         AddCharMapGroup ('\'', 6, 1, 0);
1333                         AddCharMap ('\uFE63', 6, 1);
1334
1335                         // Hyphen/Dash : 06 81 - 06 90
1336                         for (int i = 0; i < char.MaxValue; i++) {
1337                                 if (!IsIgnorable (i) &&
1338                                         Char.GetUnicodeCategory ((char) i) ==
1339                                         UnicodeCategory.DashPunctuation) {
1340                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1341                                         if (i == 0x2011) {
1342                                                 // SPECIAL: add 2027 and 2043
1343                                                 // Maybe they are regarded the
1344                                                 // same hyphens in "central"
1345                                                 // position.
1346                                                 AddCharMap ('\u2027', 6, 1);
1347                                                 AddCharMap ('\u2043', 6, 1);
1348                                         }
1349                                 }
1350                         }
1351
1352                         // Arabic variable weight chars 06 A0 -
1353                         fillIndex [6] = 0xA0;
1354                         // vowels
1355                         for (int i = 0x64B; i <= 0x650; i++)
1356                                 AddArabicCharMap ((char) i);
1357                         // sukun
1358                         AddCharMapGroup ('\u0652', 6, 1, 0);
1359                         // shadda
1360                         AddCharMapGroup ('\u0651', 6, 1, 0);
1361                         #endregion
1362
1363
1364                         #region Nonspacing marks // 01
1365                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1366
1367                         // Combining diacritical marks: 01 DC -
1368
1369                         fillIndex [0x1] = 0x41;
1370                         for (int i = 0x030E; i <= 0x0326; i++)
1371                                 if (!IsIgnorable (i))
1372                                         AddCharMap ((char) i, 0x1, 1);
1373                         for (int i = 0x0329; i <= 0x0334; i++)
1374                                 if (!IsIgnorable (i))
1375                                         AddCharMap ((char) i, 0x1, 1);
1376                         for (int i = 0x0339; i <= 0x0341; i++)
1377                                 if (!IsIgnorable (i))
1378                                         AddCharMap ((char) i, 0x1, 1);
1379                         fillIndex [0x1] = 0x72;
1380                         for (int i = 0x0346; i <= 0x0348; i++)
1381                                 if (!IsIgnorable (i))
1382                                         AddCharMap ((char) i, 0x1, 1);
1383                         for (int i = 0x02BE; i <= 0x02BF; i++)
1384                                 if (!IsIgnorable (i))
1385                                         AddCharMap ((char) i, 0x1, 1);
1386                         for (int i = 0x02C1; i <= 0x02C5; i++)
1387                                 if (!IsIgnorable (i))
1388                                         AddCharMap ((char) i, 0x1, 1);
1389                         for (int i = 0x02CE; i <= 0x02CF; i++)
1390                                 if (!IsIgnorable (i))
1391                                         AddCharMap ((char) i, 0x1, 1);
1392                         for (int i = 0x02D1; i <= 0x02D3; i++)
1393                                 if (!IsIgnorable (i))
1394                                         AddCharMap ((char) i, 0x1, 1);
1395                         AddCharMap ('\u02DE', 0x1, 1);
1396                         for (int i = 0x02E4; i <= 0x02E9; i++)
1397                                 if (!IsIgnorable (i))
1398                                         AddCharMap ((char) i, 0x1, 1);
1399
1400                         // LAMESPEC: It should not stop at '\u20E1'. There are
1401                         // a few more characters (that however results in
1402                         // overflow of level 2 unless we start before 0xDD).
1403                         fillIndex [0x1] = 0xDC;
1404                         for (int i = 0x20d0; i <= 0x20e1; i++)
1405                                 AddCharMap ((char) i, 0x1, 1);
1406                         #endregion
1407
1408
1409                         #region Whitespaces // 07 03 -
1410                         fillIndex [0x7] = 0x2;
1411                         AddCharMap (' ', 0x7, 2);
1412                         AddCharMap ('\u00A0', 0x7, 1);
1413                         for (int i = 9; i <= 0xD; i++)
1414                                 AddCharMap ((char) i, 0x7, 1);
1415                         for (int i = 0x2000; i <= 0x200B; i++)
1416                                 AddCharMap ((char) i, 0x7, 1);
1417
1418                         fillIndex [0x7] = 0x17;
1419                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1420                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1421
1422                         // Characters which used to represent layout control.
1423                         // LAMESPEC: Windows developers seem to have thought
1424                         // that those characters are kind of whitespaces,
1425                         // while they aren't.
1426                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1427                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1428                         #endregion
1429
1430                         // FIXME: 09 should be more complete.
1431                         fillIndex [0x9] = 2;
1432                         // misc tech mark
1433                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1434                                 AddCharMap ((char) cp, 0x9, 1, 0);
1435
1436                         // arrows
1437                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1438                         foreach (DictionaryEntry de in arrowValues) {
1439                                 int idx = (int) de.Value;
1440                                 int cp = (int) de.Key;
1441                                 if (map [cp].Defined)
1442                                         continue;
1443                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1444                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1445                                 arrowLv2 [idx]++;
1446                         }
1447                         // boxes
1448                         byte [] boxLv2 = new byte [128];
1449                         for (int i = 0; i < boxLv2.Length; i++)
1450                                 boxLv2 [i] = 3;
1451                         foreach (DictionaryEntry de in boxValues) {
1452                                 int cp = (int) de.Key;
1453                                 int idx = (int) de.Value;
1454                                 if (map [cp].Defined)
1455                                         continue;
1456                                 fillIndex [0x9] = (byte) (0xE5 + idx);
1457                                 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1458                                 boxLv2 [idx]++;
1459                         }
1460                         // Some special characters (slanted)
1461                         fillIndex [0x9] = 0xF4;
1462                         AddCharMap ('\u2571', 0x9, 3);
1463                         AddCharMap ('\u2572', 0x9, 3);
1464                         AddCharMap ('\u2573', 0x9, 3);
1465
1466                         // FIXME: implement 0A
1467                         #region Symbols
1468                         fillIndex [0xA] = 2;
1469                         // byte currency symbols
1470                         for (int cp = 0; cp < 0x100; cp++) {
1471                                 uc = Char.GetUnicodeCategory ((char) cp);
1472                                 if (!IsIgnorable (cp) &&
1473                                         uc == UnicodeCategory.CurrencySymbol &&
1474                                         cp != '$')
1475                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1476                         }
1477                         // byte other symbols
1478                         for (int cp = 0; cp < 0x100; cp++) {
1479                                 if (cp == 0xA6)
1480                                         continue; // SPECIAL: skip FIXME: why?
1481                                 uc = Char.GetUnicodeCategory ((char) cp);
1482                                 if (!IsIgnorable (cp) &&
1483                                         uc == UnicodeCategory.OtherSymbol)
1484                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1485                         }
1486
1487                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1488                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1489                                 AddCharMap ((char) cp, 0xA, 1, 0);
1490                         // Dingbats
1491                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1492                                 if (Char.IsSymbol ((char) cp))
1493                                         AddCharMap ((char) cp, 0xA, 1, 0);
1494                         // OCR
1495                         for (int i = 0x2440; i < 0x2460; i++)
1496                                 AddCharMap ((char) i, 0xA, 1, 0);
1497
1498                         #endregion
1499
1500                         #region Numbers // 0C 02 - 0C E1
1501                         fillIndex [0xC] = 2;
1502
1503                         // 9F8 : Bengali "one less than the denominator"
1504                         AddCharMap ('\u09F8', 0xC, 1);
1505
1506                         ArrayList numbers = new ArrayList ();
1507                         for (int i = 0; i < 65536; i++)
1508                                 if (!IsIgnorable (i) &&
1509                                         Char.IsNumber ((char) i) &&
1510                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1511                                         numbers.Add (i);
1512
1513                         ArrayList numberValues = new ArrayList ();
1514                         foreach (int i in numbers)
1515                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1516                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1517
1518 //foreach (DictionaryEntry de in numberValues)
1519 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1520
1521                         decimal prevValue = -1;
1522                         foreach (DictionaryEntry de in numberValues) {
1523                                 int cp = (int) de.Key;
1524                                 decimal currValue = (decimal) de.Value;
1525                                 bool addnew = false;
1526                                 if (prevValue < currValue &&
1527                                         prevValue - (int) prevValue == 0 &&
1528                                         prevValue >= 1) {
1529
1530                                         addnew = true;
1531                                         // Process Hangzhou and Roman numbers
1532
1533                                         // There are some SPECIAL cases.
1534                                         if (currValue != 4) // no increment for 4
1535                                                 fillIndex [0xC]++;
1536
1537                                         int xcp;
1538                                         xcp = (int) prevValue + 0x2170 - 1;
1539                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1540                                         xcp = (int) prevValue + 0x2160 - 1;
1541                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1542                                         fillIndex [0xC] += 2;
1543                                         xcp = (int) prevValue + 0x3021 - 1;
1544                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1545                                         fillIndex [0xC]++;
1546                                 }
1547                                 if (prevValue < currValue)
1548                                         prevValue = currValue;
1549                                 if (map [cp].Defined)
1550                                         continue;
1551                                 // HangZhou and Roman are add later
1552                                 // (code is above)
1553                                 else if (0x3021 <= cp && cp < 0x302A
1554                                         || 0x2160 <= cp && cp < 0x216A
1555                                         || 0x2170 <= cp && cp < 0x217A)
1556                                         continue;
1557
1558                                 if (cp ==  0x215B) // FIXME: why?
1559                                         fillIndex [0xC] += 2;
1560                                 else if (cp == 0x3021) // FIXME: why?
1561                                         fillIndex [0xC]++;
1562                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1563
1564                                 if (addnew || cp <= '9') {
1565                                         int xcp;
1566                                         if (1 <= currValue && currValue <= 10) {
1567                                                 xcp = cp - 0x31 + 0x2776;
1568                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1569                                                 xcp = cp - 0x31 + 0x2780;
1570                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1571                                                 xcp = cp - 0x31 + 0x278A;
1572                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1573                                         }
1574                                         if (1 <= currValue && currValue <= 20) {
1575                                                 xcp = cp - 0x31 + 0x2460;
1576                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1577                                                 xcp = cp - 0x31 + 0x2474;
1578                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1579                                                 xcp = cp - 0x31 + 0x2488;
1580                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1581                                         }
1582                                 }
1583
1584                                 if (cp != 0x09E7 && cp != 0x09EA)
1585                                         fillIndex [0xC]++;
1586
1587                                 // Add special cases that are not regarded as
1588                                 // numbers in UnicodeCategory speak.
1589                                 if (cp == '5') {
1590                                         // TONE FIVE
1591                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1592                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1593                                 }
1594                                 else if (cp == '6') // FIXME: why?
1595                                         fillIndex [0xC]++;
1596                         }
1597
1598                         // 221E: infinity
1599                         fillIndex [0xC] = 0xFF;
1600                         AddCharMap ('\u221E', 0xC, 1);
1601                         #endregion
1602
1603                         #region Letters and NonSpacing Marks (general)
1604
1605                         // ASCII Latin alphabets
1606                         for (int i = 0; i < alphabets.Length; i++)
1607                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1608
1609
1610                         // non-ASCII Latin alphabets
1611                         // FIXME: there is no such characters that are placed
1612                         // *after* "alphabets" array items. This is nothing
1613                         // more than a hack that creates dummy weight for
1614                         // primary characters.
1615                         for (int i = 0x0080; i < 0x0300; i++) {
1616                                 if (!Char.IsLetter ((char) i))
1617                                         continue;
1618                                 // For those Latin Letters which has NFKD are
1619                                 // not added as independent primary character.
1620                                 if (decompIndex [i] != 0)
1621                                         continue;
1622                                 // SPECIAL CASES:
1623                                 // 1.some alphabets have primarily
1624                                 //   equivalent ASCII alphabets.
1625                                 // 2.some have independent primary weights,
1626                                 //   but inside a-to-z range.
1627                                 // 3.there are some expanded characters that
1628                                 //   are not part of Unicode Standard NFKD.
1629                                 switch (i) {
1630                                 // 1. skipping them does not make sense
1631 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
1632 //                              case 0x184: case 0x185: case 0x186: case 0x189:
1633 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
1634 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
1635 //                              case 0x19B: case 0x19C:
1636                                 // 2. skipping them does not make sense
1637 //                              case 0x14A: // Ng
1638 //                              case 0x14B: // ng
1639                                 // 3.
1640                                 case 0xC6: // AE
1641                                 case 0xE6: // ae
1642                                 case 0xDE: // Icelandic Thorn
1643                                 case 0xFE: // Icelandic Thorn
1644                                 case 0xDF: // German ss
1645                                 case 0xFF: // German ss
1646                                 // not classified yet
1647 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1648 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1649 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1650 //                              case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1651 //                              case 0x1DD:
1652                                         continue;
1653                                 }
1654                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
1655                         }
1656
1657                         // Greek and Coptic
1658                         fillIndex [0xF] = 02;
1659                         for (int i = 0x0380; i < 0x0390; i++)
1660                                 if (Char.IsLetter ((char) i))
1661                                         AddLetterMap ((char) i, 0xF, 1);
1662                         fillIndex [0xF] = 02;
1663                         for (int i = 0x0391; i < 0x03CF; i++)
1664                                 if (Char.IsLetter ((char) i))
1665                                         AddLetterMap ((char) i, 0xF, 1);
1666                         fillIndex [0xF] = 0x40;
1667                         for (int i = 0x03D0; i < 0x0400; i++)
1668                                 if (Char.IsLetter ((char) i))
1669                                         AddLetterMap ((char) i, 0xF, 1);
1670
1671                         // Cyrillic - UCA order w/ some modification
1672                         fillIndex [0x10] = 0x3;
1673                         // table which is mostly from UCA DUCET.
1674                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1675                                 char c = orderedCyrillic [i];
1676                                 if (Char.IsLetter (c))
1677                                         AddLetterMap (c, 0x10, 3);
1678                         }
1679                         for (int i = 0x0460; i < 0x0481; i++) {
1680                                 if (Char.IsLetter ((char) i))
1681                                         AddLetterMap ((char) i, 0x10, 3);
1682                         }
1683
1684                         // Armenian
1685                         fillIndex [0x11] = 0x3;
1686                         for (int i = 0x0531; i < 0x0586; i++)
1687                                 if (Char.IsLetter ((char) i))
1688                                         AddLetterMap ((char) i, 0x11, 1);
1689
1690                         // Hebrew
1691                         // -Letters
1692                         fillIndex [0x12] = 0x3;
1693                         for (int i = 0x05D0; i < 0x05FF; i++)
1694                                 if (Char.IsLetter ((char) i))
1695                                         AddLetterMap ((char) i, 0x12, 1);
1696                         // -Accents
1697                         fillIndex [0x1] = 0x3;
1698                         for (int i = 0x0591; i <= 0x05C2; i++)
1699                                 if (i != 0x05BE)
1700                                         AddCharMap ((char) i, 0x1, 1);
1701
1702                         // Arabic
1703                         fillIndex [0x1] = 0x8E;
1704                         fillIndex [0x13] = 0x3;
1705                         for (int i = 0x0621; i <= 0x064A; i++) {
1706                                 // Abjad
1707                                 if (Char.GetUnicodeCategory ((char) i)
1708                                         != UnicodeCategory.OtherLetter) {
1709                                         // FIXME: arabic nonspacing marks are
1710                                         // in different order.
1711                                         AddCharMap ((char) i, 0x1, 1);
1712                                         continue;
1713                                 }
1714 //                              map [i] = new CharMapEntry (0x13,
1715 //                                      (byte) arabicLetterPrimaryValues [i], 1);
1716                                 fillIndex [0x13] =
1717                                         (byte) arabicLetterPrimaryValues [i];
1718                                 AddLetterMap ((char) i, 0x13, 0);
1719                         }
1720                         fillIndex [0x13] = 0x84;
1721                         for (int i = 0x0674; i < 0x06D6; i++)
1722                                 if (Char.IsLetter ((char) i))
1723                                         AddLetterMap ((char) i, 0x13, 1);
1724
1725                         // Devanagari
1726                         // FIXME: it does seem straight codepoint mapping.
1727                         fillIndex [0x14] = 04;
1728                         for (int i = 0x0901; i < 0x0905; i++)
1729                                 if (!IsIgnorable (i))
1730                                         AddLetterMap ((char) i, 0x14, 2);
1731                         fillIndex [0x14] = 0xB;
1732                         for (int i = 0x0905; i < 0x093A; i++)
1733                                 if (Char.IsLetter ((char) i))
1734                                         AddLetterMap ((char) i, 0x14, 4);
1735                         for (int i = 0x093E; i < 0x094F; i++)
1736                                 if (!IsIgnorable (i))
1737                                         AddLetterMap ((char) i, 0x14, 2);
1738
1739                         // Bengali
1740                         // -Letters
1741                         fillIndex [0x15] = 02;
1742                         for (int i = 0x0980; i < 0x9FF; i++) {
1743                                 if (IsIgnorable (i))
1744                                         continue;
1745                                 if (i == 0x09E0)
1746                                         fillIndex [0x15] = 0x3B;
1747                                 switch (Char.GetUnicodeCategory ((char) i)) {
1748                                 case UnicodeCategory.NonSpacingMark:
1749                                 case UnicodeCategory.DecimalDigitNumber:
1750                                 case UnicodeCategory.OtherNumber:
1751                                         continue;
1752                                 }
1753                                 AddLetterMap ((char) i, 0x15, 1);
1754                         }
1755                         // -Signs
1756                         fillIndex [0x1] = 0x3;
1757                         for (int i = 0x0981; i < 0x0A00; i++)
1758                                 if (Char.GetUnicodeCategory ((char) i) ==
1759                                         UnicodeCategory.NonSpacingMark)
1760                                         AddCharMap ((char) i, 0x1, 1);
1761
1762                         // Gurmukhi. orderedGurmukhi is from UCA
1763                         // FIXME: it does not look equivalent to UCA.
1764                         fillIndex [0x1] = 03;
1765                         fillIndex [0x16] = 02;
1766                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
1767                                 char c = orderedGurmukhi [i];
1768                                 if (IsIgnorable ((int) c))
1769                                         continue;
1770                                 if (!Char.IsLetter (c)) {
1771                                         AddLetterMap (c, 0x1, 1);
1772                                         continue;
1773                                 }
1774                                 if (c == '\u0A3C' || c == '\u0A4D' ||
1775                                         '\u0A66' <= c && c <= '\u0A71')
1776                                         continue;
1777                                 AddLetterMap (c, 0x16, 4);
1778                         }
1779
1780                         // Gujarati. orderedGujarati is from UCA
1781                         fillIndex [0x17] = 02;
1782                         for (int i = 0; i < orderedGujarati.Length; i++)
1783                                 AddLetterMap (orderedGujarati [i], 0x17, 4);
1784
1785                         // Oriya
1786                         fillIndex [0x18] = 02;
1787                         for (int i = 0x0B00; i < 0x0B7F; i++) {
1788                                 switch (Char.GetUnicodeCategory ((char) i)) {
1789                                 case UnicodeCategory.NonSpacingMark:
1790                                 case UnicodeCategory.DecimalDigitNumber:
1791                                         continue;
1792                                 }
1793                                 AddLetterMap ((char) i, 0x18, 1);
1794                         }
1795
1796                         // Tamil
1797                         fillIndex [0x19] = 2;
1798                         AddCharMap ('\u0BD7', 0x19, 0);
1799                         fillIndex [0x19] = 0xA;
1800                         // vowels. The loop start used to be 0x0BD7 (> 0x0B94), so it never ran.
1801                         for (int i = 0x0B82; i < 0x0B94; i++) // NOTE(review): start assumed; confirm against Windows keys
1802                                 if (Char.IsLetter ((char) i))
1803                                         AddCharMap ((char) i, 0x19, 2);
1804                         // special vowel: 0x0B94 is excluded from the loop above and added here
1805                         fillIndex [0x19] = 0x24;
1806                         AddCharMap ('\u0B94', 0x19, 0);
1807                         fillIndex [0x19] = 0x26;
1808                         // The array for Tamil consonants is a constant.
1809                         // Windows has a sequence almost the same as TAM from
1810                         // tamilnet, but a bit different in Grantha.
1811                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
1812                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1813                         // combining marks: spacing combining marks in 0BBE..0BCC, plus 0x0BC0
1814                         fillIndex [0x19] = 0x82;
1815                         for (int i = 0x0BBE; i < 0x0BCD; i++)
1816                                 if (Char.GetUnicodeCategory ((char) i) ==
1817                                         UnicodeCategory.SpacingCombiningMark
1818                                         || i == 0x0BC0)
1819                                         AddLetterMap ((char) i, 0x19, 2);
1820
1821                         // Telugu
1822                         fillIndex [0x1A] = 0x4;
1823                         for (int i = 0x0C00; i < 0x0C62; i++) {
1824                                 if (i == 0x0C55 || i == 0x0C56)
1825                                         continue; // skip
1826                                 AddCharMap ((char) i, 0x1A, 3);
1827                                 char supp = (i == 0x0C0B) ? '\u0C60':
1828                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
1829                                 if (supp == char.MinValue)
1830                                         continue;
1831                                 AddCharMap (supp, 0x1A, 3);
1832                         }
1833
1834                         // Kannada
1835                         fillIndex [0x1B] = 4;
1836                         for (int i = 0x0C80; i < 0x0CE5; i++) {
1837                                 if (i == 0x0CD5 || i == 0x0CD6)
1838                                         continue; // ignore
1839                                 AddCharMap ((char) i, 0x1B, 3);
1840                         }
1841
1842                         // Malayalam
1843                         fillIndex [0x1C] = 2;
1844                         for (int i = 0x0D02; i < 0x0D61; i++)
1845                                 // FIXME: I avoided MSCompatUnicodeTable usage
1846                                 // here (it results in recursion). So check if
1847                                 // using NonSpacingMark makes sense or not.
1848                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1849 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1850                                         AddCharMap ((char) i, 0x1C, 1);
1851
1852                         // Thai ... note that it breaks 0x1E wall after E2B!
1853                         // Also, all Thai characters have level 2 value 3.
1854                         fillIndex [0x1E] = 2;
1855                         for (int i = 0xE44; i < 0xE48; i++)
1856                                 AddCharMap ((char) i, 0x1E, 1, 3);
1857                         for (int i = 0xE01; i < 0xE2B; i++)
1858                                 AddCharMap ((char) i, 0x1E, 6, 0);
1859                         fillIndex [0x1F] = 5;
1860                         for (int i = 0xE2B; i < 0xE30; i++)
1861                                 AddCharMap ((char) i, 0x1F, 6, 0);
1862                         for (int i = 0xE30; i < 0xE3B; i++)
1863                                 AddCharMap ((char) i, 0x1F, 1, 3);
1864                         // some Thai characters remain.
1865                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
1866                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1867                         foreach (char c in specialThai)
1868                                 AddCharMap (c, 0x1F, 1);
1869
1870                         // Lao
1871                         fillIndex [0x1F] = 2;
1872                         for (int i = 0xE80; i < 0xEDF; i++)
1873                                 if (Char.IsLetter ((char) i))
1874                                         AddCharMap ((char) i, 0x1F, 1);
1875
1876                         // Georgian. orderedGeorgian is from UCA DUCET.
1877                         fillIndex [0x21] = 5;
1878                         for (int i = 0; i < orderedGeorgian.Length; i++)
1879                                 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1880
1881                         // Japanese Kana.
1882                         fillIndex [0x22] = 2;
1883                         int kanaOffset = 0x3041;
1884                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1885
1886                         for (int gyo = 0; gyo < 9; gyo++) {
1887                                 for (int dan = 0; dan < 5; dan++) {
1888                                         if (gyo == 7 && dan % 2 == 1) {
1889                                                 // 'ya'-gyo
1890                                                 fillIndex [0x22]++;
1891                                                 kanaOffset -= 2; // There is no space for yi and ye.
1892                                                 continue;
1893                                         }
1894                                         int cp = kanaOffset + dan * kanaLines [gyo];
1895                                         // small lines (a-gyo, ya-gyo)
1896                                         if (gyo == 0 || gyo == 7) {
1897                                                 AddKanaMap (cp, 1); // small
1898                                                 AddKanaMap (cp + 1, 1);
1899                                         }
1900                                         else
1901                                                 AddKanaMap (cp, kanaLines [gyo]);
1902                                         fillIndex [0x22]++;
1903
1904                                         if (cp == 0x3061) {
1905                                                 // add small 'Tsu' (before normal one)
1906                                                 AddKanaMap (0x3063, 1);
1907                                                 kanaOffset++;
1908                                         }
1909                                 }
1910                                 fillIndex [0x22] += 3;
1911                                 kanaOffset += 5 * kanaLines [gyo];
1912                         }
1913
1914                         // Wa-gyo is almost special, so I just manually add.
1915                         AddLetterMap ((char) 0x308E, 0x22, 0);
1916                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1917                         AddLetterMap ((char) 0x308F, 0x22, 0);
1918                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1919                         fillIndex [0x22]++;
1920                         AddLetterMap ((char) 0x3090, 0x22, 0);
1921                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1922                         fillIndex [0x22] += 2;
1923                         // no "Wu" in Japanese.
1924                         AddLetterMap ((char) 0x3091, 0x22, 0);
1925                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1926                         fillIndex [0x22]++;
1927                         AddLetterMap ((char) 0x3092, 0x22, 0);
1928                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1929                         // Nn
1930                         fillIndex [0x22] = 0x80;
1931                         AddLetterMap ((char) 0x3093, 0x22, 0);
1932                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1933
1934                         // JIS Japanese square chars.
1935                         fillIndex [0x22] = 0x97;
1936                         jisJapanese.Sort (JISComparer.Instance);
1937                         foreach (JISCharacter j in jisJapanese)
1938                                 AddCharMap ((char) j.CP, 0x22, 1);
1939                         // non-JIS Japanese square chars.
1940                         nonJisJapanese.Sort (NonJISComparer.Instance);
1941                         foreach (NonJISCharacter j in nonJisJapanese)
1942                                 AddCharMap ((char) j.CP, 0x22, 1);
1943
1944                         // Bopomofo
1945                         fillIndex [0x23] = 0x02;
1946                         for (int i = 0x3105; i <= 0x312C; i++)
1947                                 AddCharMap ((char) i, 0x23, 1);
1948
1949                         // Estrangela: ancient Syriac
1950                         fillIndex [0x24] = 0x0B;
1951                         // FIXME: is 0x71E really alternative form?
1952                         ArrayList syriacAlternatives = new ArrayList (
1953                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1954                         for (int i = 0x0710; i <= 0x072C; i++) {
1955                                 if (i == 0x0711) // NonSpacingMark
1956                                         continue;
1957                                 if (syriacAlternatives.Contains (i))
1958                                         continue;
1959                                 AddCharMap ((char) i, 0x24, 4);
1960                                 // FIXME: why?
1961                                 if (i == 0x721)
1962                                         fillIndex [0x24]++;
1963                         }
1964                         foreach (int cp in syriacAlternatives)
1965                                 map [cp] = new CharMapEntry (0x24,
1966                                         (byte) (map [cp - 1].Level1 + 2),
1967                                         0);
1968
1969                         // Thaana
1970                         // FIXME: it turned out that it does not look like UCA
1971                         fillIndex [0x24] = 0x6E;
1972                         for (int i = 0; i < orderedThaana.Length; i++) {
1973                                 if (IsIgnorableNonSpacing (orderedThaana [i])) // test the character, not the loop index
1974                                         continue;
1975                                 AddCharMap (orderedThaana [i], 0x24, 2);
1976                         }
1977                         #endregion
1978
1979                         // FIXME: Add more culture-specific letters (that are
1980                         // not supported in Windows collation) here.
1981
1982                         // Surrogate ... they are computed.
1983
1984                         #region Hangul
1985                         // Hangul.
1986                         //
1987                         // Unlike UCA Windows Hangul sequence mixes Jongseong
1988                         // with Choseong sequence as well as Jungseong,
1989                         // adjusted to have the same primary weight for the
1990                         // same base character. So it is impossible to compute
1991                         // those sort keys.
1992                         //
1993                         // Here I introduce an ordered sequence of mixed
1994                         // 'commands' and 'characters' that is similar to
1995                         // LDML text:
1996                         //      - ',' increases primary weight.
1997                         //      - [A B] means a range, increasing index
1998                         //      - {A B} means a range, without increasing index
1999                         //      - '=' is no operation (it means the characters
2000                         //        of both sides have the same weight).
2001                         //      - '>' inserts a Hangul Syllable block that
2002                         //        contains 0x251 characters.
2003                         //      - '<' decreases the index
2004                         //      - '0'-'9' means skip count
2005                         //      - whitespaces are ignored
2006                         //
2007
2008                         string hangulSequence = // command/character script consumed by the loop that follows
2009                           "\u1100=\u11A8 > \u1101=\u11A9 >"
2010                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2011                         + "<{\u1113 \u1116}, \u3165,"
2012                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2013                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
2014                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
2015                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2016                                 + "[\u11D1 \u11D2], \u11B2,"
2017                                 + "[\u11D3 \u11D5], \u11B3,"
2018                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2019                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2020                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2021                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2022                         + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
2023                                 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
2024                         + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
2025                                 + "\u11EA,, \u110A=\u11BB,,, >"
2026                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2027                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2028                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2029                                 + "\u11F1,, \u11F2,,,"
2030                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2031                         + "<\u114D, \u110D,,  >"
2032                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
2033                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2034                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2035                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2036                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2037                                 + "[\u11F5 \u11F8]"
2038                         ;
2039
2040                         byte hangulCat = 0x52;
2041                         fillIndex [hangulCat] = 0x2;
2042
2043                         int syllableBlock = 0;
2044                         for (int n = 0; n < hangulSequence.Length; n++) {
2045                                 char c = hangulSequence [n];
2046                                 int start, end;
2047                                 if (Char.IsWhiteSpace (c))
2048                                         continue;
2049                                 switch (c) {
2050                                 case '=':
2051                                         break; // NOP
2052                                 case ',':
2053                                         IncrementSequentialIndex (ref hangulCat);
2054                                         break;
2055                                 case '<':
2056                                         if (fillIndex [hangulCat] == 2)
2057                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2058                                         fillIndex [hangulCat]--;
2059                                         break;
2060                                 case '>':
2061                                         IncrementSequentialIndex (ref hangulCat);
2062                                         for (int l = 0; l < 0x15; l++)
2063                                                 for (int v = 0; v < 0x1C; v++) {
2064                                                         AddCharMap (
2065                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2066                                                         IncrementSequentialIndex (ref hangulCat);
2067                                                 }
2068                                         syllableBlock++;
2069                                         break;
2070                                 case '[':
2071                                         start = hangulSequence [n + 1];
2072                                         end = hangulSequence [n + 3];
2073                                         for (int i = start; i <= end; i++) {
2074                                                 AddCharMap ((char) i, hangulCat, 0);
2075                                                 if (end > i)
2076                                                         IncrementSequentialIndex (ref hangulCat);
2077                                         }
2078                                         n += 4; // consumes 5 characters for this operation
2079                                         break;
2080                                 case '{':
2081                                         start = hangulSequence [n + 1];
2082                                         end = hangulSequence [n + 3];
2083                                         for (int i = start; i <= end; i++)
2084                                                 AddCharMap ((char) i, hangulCat, 0);
2085                                         n += 4; // consumes 5 characters for this operation
2086                                         break;
2087                                 default:
2088                                         AddCharMap (c, hangulCat, 0);
2089                                         break;
2090                                 }
2091                         }
2092
2093                         #endregion
2094
2095                         // Letterlike characters and CJK compatibility square
2096                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2097                         int [] counts = new int ['Z' - 'A' + 1];
2098                         char [] namedChars = new char [sortableCharNames.Count];
2099                         int nCharNames = 0;
2100                         foreach (DictionaryEntry de in sortableCharNames) {
2101                                 counts [((string) de.Value) [0] - 'A']++;
2102                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2103                         }
2104                         nCharNames = 0; // reset
2105                         for (int a = 0; a < counts.Length; a++) {
2106                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2107                                 for (int i = 0; i < counts [a]; i++)
2108 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2109                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2110                         }
2111
2112                         // CJK unified ideograph.
2113                         byte cjkCat = 0x9E;
2114                         fillIndex [cjkCat] = 0x2;
2115                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2116                                 if (!IsIgnorable (cp))
2117                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2118                         // CJK Extensions goes here.
2119                         // LAMESPEC: With this Windows style CJK layout, it is
2120                         // impossible to add more CJK ideograph i.e. 0x9FA6-
2121                         // 0x9FBB can never be added w/o breaking compat.
2122                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2123                                 if (!IsIgnorable (cp))
2124                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2125
2126                         // PrivateUse ... computed.
2127                         // remaining Surrogate ... computed.
2128
2129                         #region Special "biggest" area (FF FF)
2130                         fillIndex [0xFF] = 0xFF;
2131                         char [] specialBiggest = new char [] {
2132                                 '\u3005', '\u3031', '\u3032', '\u309D',
2133                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2134                                 '\uFE7C', '\uFE7D', '\uFF70'};
2135                         foreach (char c in specialBiggest)
2136                                 AddCharMap (c, 0xFF, 0);
2137                         #endregion
2138
2139                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2140                         // non-alphanumeric ASCII except for: + - < = > '
2141                         for (int i = 0x21; i < 0x7F; i++) {
2142                                 if (Char.IsLetterOrDigit ((char) i)
2143                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2144                                         continue; // they are not added here.
2145                                 AddCharMapGroup2 ((char) i, 0x7, 1, 0); // runs for every character not skipped above
2146                                 // Insert 3001 after ',', 3002 after '.' and FE30 after ':'
2147                                 if (i == 0x2C)
2148                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2149                                 else if (i == 0x2E) {
2150                                         fillIndex [0x7]--;
2151                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2152                                 }
2153                                 else if (i == 0x3A)
2154                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2155                         }
2156                         #endregion
2157
2158                         #region 07 - Punctuations and something else
2159                         for (int i = 0xA0; i < char.MaxValue; i++) {
2160                                 if (IsIgnorable (i))
2161                                         continue;
2162
2163                                 // SPECIAL CASES:
2164                                 switch (i) {
2165                                 case 0xAB: // 08
2166                                 case 0xB7: // 0A
2167                                 case 0x2329: // 09
2168                                 case 0x232A: // 09
2169                                         continue;
2170                                 }
2171
2172                                 switch (Char.GetUnicodeCategory ((char) i)) {
2173                                 case UnicodeCategory.OtherPunctuation:
2174                                 case UnicodeCategory.ClosePunctuation:
2175                                 case UnicodeCategory.OpenPunctuation:
2176                                 case UnicodeCategory.InitialQuotePunctuation:
2177                                 case UnicodeCategory.FinalQuotePunctuation:
2178                                 case UnicodeCategory.ModifierSymbol:
2179                                         // SPECIAL CASES: // 0xA
2180                                         if (0x2020 <= i && i <= 0x2042)
2181                                                 continue;
2182                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2183                                         break;
2184                                 default:
2185                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2186                                                 goto case UnicodeCategory.OtherPunctuation;
2187                                         break;
2188                                 }
2189                         }
2190                         // Control pictures
2191                         for (int i = 0x2400; i <= 0x2421; i++)
2192                                 AddCharMap ((char) i, 0x7, 1, 0);
2193                         #endregion
2194
2195                         // FIXME: for 07 xx we need more love.
2196
2197                         // FIXME: 08 should be more complete.
2198                         fillIndex [0x8] = 2;
2199                         for (int cp = 0; cp < char.MaxValue; cp++)
2200                                 if (!map [cp].Defined &&
2201                                         Char.GetUnicodeCategory ((char) cp) ==
2202                                         UnicodeCategory.MathSymbol)
2203                                         AddCharMapGroup ((char) cp, 0x8, 1, 0);
2204
2205                         // Characters w/ diacritical marks (NFKD)
2206                         for (int i = 0; i <= char.MaxValue; i++) {
2207                                 if (map [i].Defined || IsIgnorable (i))
2208                                         continue;
2209                                 if (decompIndex [i] == 0)
2210                                         continue;
2211
2212                                 int start = decompIndex [i];
2213                                 int primaryChar = decompValues [start];
2214                                 int secondary = 0;
2215                                 bool skip = false;
2216                                 int length = decompLength [i];
2217                                 // special processing for parenthesized ones.
2218                                 if (length == 3 &&
2219                                         decompValues [start] == '(' &&
2220                                         decompValues [start + 2] == ')') {
2221                                         primaryChar = decompValues [start + 1];
2222                                         length = 1;
2223                                 }
2224
2225                                 if (map [primaryChar].Level1 == 0)
2226                                         continue;
2227
2228                                 for (int l = 1; l < length; l++) {
2229                                         int c = decompValues [start + l];
2230                                         if (map [c].Level1 != 0)
2231                                                 skip = true;
2232                                         secondary += diacritical [c];
2233                                 }
2234                                 if (skip)
2235                                         continue;
2236                                 map [i] = new CharMapEntry (
2237                                         map [primaryChar].Category,
2238                                         map [primaryChar].Level1,
2239                                         (byte) secondary);
2240
2241                         }
2242
2243                         #region Level2 adjustment
2244                         // Arabic Hamzah
2245                         diacritical [0x624] = 0x5;
2246                         diacritical [0x626] = 0x7;
2247                         diacritical [0x622] = 0x9;
2248                         diacritical [0x623] = 0xA;
2249                         diacritical [0x625] = 0xB;
2250                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2251                         diacritical [0x64A] = 0x7; // Yaa'
2252
2253
2254                         for (int i = 0; i < char.MaxValue; i++) {
2255                                 byte mod = 0;
2256                                 byte cat = map [i].Category;
2257                                 switch (cat) {
2258                                 case 0xE: // Latin diacritics
2259                                 case 0x22: // Japanese: circled characters
2260                                         mod = diacritical [i];
2261                                         break;
2262                                 case 0x13: // Arabic
2263                                         if (diacritical [i] == 0)
2264                                                 mod = 0x8; // default for arabic
2265                                         break;
2266                                 }
2267                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2268                                         mod = diacritical [i];
2269                                 if (mod > 0)
2270                                         map [i] = new CharMapEntry (
2271                                                 cat, map [i].Level1, mod);
2272                         }
2273                         #endregion
2274
2275                         // FIXME: this is hack but those which are
2276                         // NonSpacingMark characters and still undefined
2277                         // are likely to be nonspacing.
2278                         for (int i = 0; i < char.MaxValue; i++)
2279                                 if (!map [i].Defined &&
2280                                         !IsIgnorable (i) &&
2281                                         Char.GetUnicodeCategory ((char) i) ==
2282                                         UnicodeCategory.NonSpacingMark)
2283                                         AddCharMap ((char) i, 1, 1);
2284                 }
2285
2286                 private void IncrementSequentialIndex (ref byte hangulCat)
2287                 {
2288                         fillIndex [hangulCat]++;
2289                         if (fillIndex [hangulCat] == 0) { // overflown
2290                                 hangulCat++;
2291                                 fillIndex [hangulCat] = 0x2;
2292                         }
2293                 }
2294
2295                 // Reset fillIndex to fixed value and call AddLetterMap().
2296                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2297                 {
2298                         fillIndex [category] = alphaWeight;
2299                         AddLetterMap (c, category, 0);
2300
2301                         ArrayList al = latinMap [c] as ArrayList;
2302                         if (al == null)
2303                                 return;
2304
2305                         foreach (int cp in al)
2306                                 AddLetterMap ((char) cp, category, 0);
2307                 }
2308
2309                 private void AddKanaMap (int i, byte voices)
2310                 {
2311                         for (byte b = 0; b < voices; b++) {
2312                                 char c = (char) (i + b);
2313                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
2314                                 // Hiragana
2315                                 AddLetterMapCore (c, 0x22, 0, arg);
2316                                 // Katakana
2317                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
2318                         }
2319                 }
2320
2321                 private void AddLetterMap (char c, byte category, byte updateCount)
2322                 {
2323                         AddLetterMapCore (c, category, updateCount, 0);
2324                 }
2325
2326                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2327                 {
2328                         char c2;
2329                         // <small> updates index
2330                         c2 = ToSmallForm (c);
2331                         if (c2 != c)
2332                                 AddCharMapGroup (c2, category, updateCount, level2);
2333                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2334                         if (c2 != c && !map [(int) c2].Defined)
2335                                 AddLetterMapCore (c2, category, 0, level2);
2336                         bool doUpdate = true;
2337                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2338                                 doUpdate = false;
2339                         else
2340                                 AddCharMapGroup (c, category, 0, level2);
2341                         if (doUpdate)
2342                                 fillIndex [category] += updateCount;
2343                 }
2344
2345                 private bool AddCharMap (char c, byte category, byte increment)
2346                 {
2347                         return AddCharMap (c, category, increment, 0);
2348                 }
2349
2350                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2351                 {
2352                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2353                                 return false; // do nothing
2354                         map [(int) c] = new CharMapEntry (category,
2355                                 category == 1 ? alt : fillIndex [category],
2356                                 category == 1 ? fillIndex [category] : alt);
2357                         fillIndex [category] += increment;
2358                         return true;
2359                 }
2360
2361                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2362                 {
2363                         char c2 = ToSmallFormTail (c);
2364                         if (c2 != c)
2365                                 AddCharMap (c2, category, updateCount, 0);
2366                         // itself
2367                         AddCharMap (c, category, updateCount, 0);
2368                         // <full>
2369                         c2 = ToFullWidthTail (c);
2370                         if (c2 != c)
2371                                 AddCharMapGroupTail (c2, category, updateCount);
2372                 }
2373
2374                 //
2375                 // Adds characters to table in the order below
2376                 // (+ increases weight):
2377                 //      (<small> +)
2378                 //      itself
2379                 //      <fraction>
2380                 //      <full> | <super> | <sub>
2381                 //      <circle> | <wide> (| <narrow>)
2382                 //      +
2383                 //      (vertical +)
2384                 //
2385                 // level2 is fixed (does not increase).
2386                 int [] sameWeightItems = new int [] {
2387                         DecompositionFraction,
2388                         DecompositionFull,
2389                         DecompositionSuper,
2390                         DecompositionSub,
2391                         DecompositionCircle,
2392                         DecompositionWide,
2393                         DecompositionNarrow,
2394                         };
2395                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2396                 {
2397                         if (map [(int) c].Defined)
2398                                 return;
2399
2400                         char small = char.MinValue;
2401                         char vertical = char.MinValue;
2402                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2403                         if (nfkd != null) {
2404                                 object smv = nfkd [(byte) DecompositionSmall];
2405                                 if (smv != null)
2406                                         small = (char) ((int) smv);
2407                                 object vv = nfkd [(byte) DecompositionVertical];
2408                                 if (vv != null)
2409                                         vertical = (char) ((int) vv);
2410                         }
2411
2412                         // <small> updates index
2413                         if (small != char.MinValue)
2414                                 AddCharMap (small, category, updateCount);
2415
2416                         // itself
2417                         AddCharMap (c, category, 0, level2);
2418
2419                         if (nfkd != null) {
2420                                 foreach (int weight in sameWeightItems) {
2421                                         object wv = nfkd [(byte) weight];
2422                                         if (wv != null)
2423                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
2424                                 }
2425                         }
2426
2427                         // update index here.
2428                         fillIndex [category] += updateCount;
2429
2430                         if (vertical != char.MinValue)
2431                                 AddCharMap (vertical, category, updateCount, level2);
2432                 }
2433
2434                 private void AddCharMapCJK (char c, ref byte category)
2435                 {
2436                         AddCharMap (c, category, 0, 0);
2437                         IncrementSequentialIndex (ref category);
2438
2439                         // Special. I wonder why but Windows skips 9E F9.
2440                         if (category == 0x9E && fillIndex [category] == 0xF9)
2441                                 IncrementSequentialIndex (ref category);
2442                 }
2443
2444                 private void AddCharMapGroupCJK (char c, ref byte category)
2445                 {
2446                         AddCharMapCJK (c, ref category);
2447
2448                         // LAMESPEC: see below.
2449                         if (c == '\u52DE') {
2450                                 AddCharMapCJK ('\u3298', ref category);
2451                                 AddCharMapCJK ('\u3238', ref category);
2452                         }
2453                         if (c == '\u5BEB')
2454                                 AddCharMapCJK ('\u32A2', ref category);
2455                         if (c == '\u91AB')
2456                                 // Especially this mapping order totally does
2457                                 // not make sense to me.
2458                                 AddCharMapCJK ('\u32A9', ref category);
2459
2460                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2461                         if (nfkd == null)
2462                                 return;
2463                         for (byte weight = 0; weight <= 0x12; weight++) {
2464                                 object wv = nfkd [weight];
2465                                 if (wv == null)
2466                                         continue;
2467                                 int w = (int) wv;
2468
2469                                 // Special: they are ignored in this area.
2470                                 // FIXME: check if it is sane
2471                                 if (0xF900 <= w && w <= 0xFAD9)
2472                                         continue;
2473                                 // LAMESPEC: on Windows some of CJK characters
2474                                 // in 3200-32B0 are incorrectly mapped. They
2475                                 // mix Chinise and Japanese Kanji when
2476                                 // ordering those characters.
2477                                 switch (w) {
2478                                 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2479                                         continue;
2480                                 }
2481
2482                                 AddCharMapCJK ((char) w, ref category);
2483                         }
2484                 }
2485
2486                 // For now it is only for 0x7 category.
2487                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2488                 {
2489                         char small = char.MinValue;
2490                         char vertical = char.MinValue;
2491                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2492                         if (nfkd != null) {
2493                                 object smv = nfkd [(byte) DecompositionSmall];
2494                                 if (smv != null)
2495                                         small = (char) ((int) smv);
2496                                 object vv = nfkd [(byte) DecompositionVertical];
2497                                 if (vv != null)
2498                                         vertical = (char) ((int) vv);
2499                         }
2500
2501                         // <small> updates index
2502                         if (small != char.MinValue)
2503                                 // SPECIAL CASE excluded (FIXME: why?)
2504                                 if (small != '\u2024')
2505                                         AddCharMap (small, category, updateCount);
2506
2507                         // itself
2508                         AddCharMap (c, category, updateCount, level2);
2509
2510                         // Since nfkdMap is problematic to have two or more
2511                         // NFKD to an identical character, here I iterate all.
2512                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2513                                 if (decompLength [c2] == 1 &&
2514                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
2515                                         switch (decompType [c2]) {
2516                                         case DecompositionCompat:
2517                                                 AddCharMap ((char) c2, category, updateCount, level2);
2518                                                 break;
2519                                         }
2520                                 }
2521                         }
2522
2523                         if (vertical != char.MinValue)
2524                                 // SPECIAL CASE excluded (FIXME: why?)
2525                                 if (vertical != '\uFE33' && vertical != '\uFE34')
2526                                         AddCharMap (vertical, category, updateCount, level2);
2527                 }
2528
2529                 private void AddArabicCharMap (char c)
2530                 {
2531                         byte category = 6;
2532                         byte updateCount = 1;
2533                         byte level2 = 0;
2534
2535                         // itself
2536                         AddCharMap (c, category, 0, level2);
2537
2538                         // Since nfkdMap is problematic to have two or more
2539                         // NFKD to an identical character, here I iterate all.
2540                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2541                                 if (decompLength [c2] == 0)
2542                                         continue;
2543                                 int idx = decompIndex [c2] + decompLength [c2] - 1;
2544                                 if ((int) (decompValues [idx]) == (int) c)
2545                                         AddCharMap ((char) c2, category,
2546                                                 0, level2);
2547                         }
2548                         fillIndex [category] += updateCount;
2549                 }
2550
2551                 char ToFullWidth (char c)
2552                 {
2553                         return ToDecomposed (c, DecompositionFull, false);
2554                 }
2555
2556                 char ToFullWidthTail (char c)
2557                 {
2558                         return ToDecomposed (c, DecompositionFull, true);
2559                 }
2560
2561                 char ToSmallForm (char c)
2562                 {
2563                         return ToDecomposed (c, DecompositionSmall, false);
2564                 }
2565
2566                 char ToSmallFormTail (char c)
2567                 {
2568                         return ToDecomposed (c, DecompositionSmall, true);
2569                 }
2570
2571                 char ToDecomposed (char c, byte d, bool tail)
2572                 {
2573                         if (decompType [(int) c] != d)
2574                                 return c;
2575                         int idx = decompIndex [(int) c];
2576                         if (tail)
2577                                 idx += decompLength [(int) c] - 1;
2578                         return (char) decompValues [idx];
2579                 }
2580
2581                 bool ExistsJIS (int cp)
2582                 {
2583                         foreach (JISCharacter j in jisJapanese)
2584                                 if (j.CP == cp)
2585                                         return true;
2586                         return false;
2587                 }
2588
2589                 #endregion
2590
2591                 #region Level 3 properties (Case/Width)
2592
2593                 private byte ComputeLevel3Weight (char c)
2594                 {
2595                         byte b = ComputeLevel3WeightRaw (c);
2596                         return b > 0 ? (byte) (b + 2) : b;
2597                 }
2598
2599                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2600                 {
2601                         // Korean
2602                         if ('\u11A8' <= c && c <= '\u11F9')
2603                                 return 2;
2604                         if ('\uFFA0' <= c && c <= '\uFFDC')
2605                                 return 4;
2606                         if ('\u3130' <= c && c <= '\u3164')
2607                                 return 5;
2608                         // numbers
2609                         if ('\u2776' <= c && c <= '\u277F')
2610                                 return 4;
2611                         if ('\u2780' <= c && c <= '\u2789')
2612                                 return 8;
2613                         if ('\u2776' <= c && c <= '\u2793')
2614                                 return 0xC;
2615                         if ('\u2160' <= c && c <= '\u216F')
2616                                 return 0x18;
2617                         if ('\u2181' <= c && c <= '\u2182')
2618                                 return 0x18;
2619                         // Arabic
2620                         if ('\u2135' <= c && c <= '\u2138')
2621                                 return 4;
2622                         if ('\uFE80' <= c && c < '\uFE8E') {
2623                                 // 2(Isolated)/8(Final)/0x18(Medial)
2624                                 switch (decompType [(int) c]) {
2625                                 case DecompositionIsolated:
2626                                         return 2;
2627                                 case DecompositionFinal:
2628                                         return 8;
2629                                 case DecompositionMedial:
2630                                         return 0x18;
2631                                 }
2632                         }
2633
2634                         // actually I dunno the reason why they have weights.
2635                         switch (c) {
2636                         case '\u01BC':
2637                                 return 0x10;
2638                         case '\u06A9':
2639                                 return 0x20;
2640                         case '\u06AA':
2641                                 return 0x28;
2642                         }
2643
2644                         byte ret = 0;
2645                         switch (c) {
2646                         case '\u03C2':
2647                         case '\u2104':
2648                         case '\u212B':
2649                                 ret |= 8;
2650                                 break;
2651                         case '\uFE42':
2652                                 ret |= 0xC;
2653                                 break;
2654                         }
2655
2656                         // misc
2657                         switch (decompType [(int) c]) {
2658                         case DecompositionWide: // <wide>
2659                         case DecompositionSub: // <sub>
2660                         case DecompositionSuper: // <super>
2661                                 ret |= decompType [(int) c];
2662                                 break;
2663                         }
2664                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2665                                 ret |= 8;
2666                         if (isUppercase [(int) c]) // DerivedCoreProperties
2667                                 ret |= 0x10;
2668
2669                         return ret;
2670                 }
2671
2672                 #endregion
2673
2674                 #region IsIgnorable
2675 /*
2676                 static bool IsIgnorable (int i)
2677                 {
2678                         if (unicodeAge [i] >= 3.1)
2679                                 return true;
2680                         switch (char.GetUnicodeCategory ((char) i)) {
2681                         case UnicodeCategory.OtherNotAssigned:
2682                         case UnicodeCategory.Format:
2683                                 return true;
2684                         }
2685                         return false;
2686                 }
2687 */
2688
2689                 // FIXME: In the future use DerivedAge.txt to examine character
2690                 // versions and set those ones that have higher version than
2691                 // 1.0 as ignorable.
2692                 static bool IsIgnorable (int i)
2693                 {
2694                         switch (i) {
2695                         case 0:
2696                         // I guess, those characters are added between
2697                         // Unicode 1.0 (LCMapString) and Unicode 3.1
2698                         // (UnicodeCategory), so they used to be
2699                         // something like OtherNotAssigned as of Unicode 1.1.
2700                         case 0x2df: case 0x387:
2701                         case 0x3d7: case 0x3d8: case 0x3d9:
2702                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2703                         case 0x400: case 0x40d: case 0x450: case 0x45d:
2704                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
2705                         case 0x653: case 0x654: case 0x655: case 0x66d:
2706                         case 0xb56:
2707                         case 0x1e9b: case 0x202f: case 0x20ad:
2708                         case 0x20ae: case 0x20af:
2709                         case 0x20e2: case 0x20e3:
2710                         case 0x2139: case 0x213a: case 0x2183:
2711                         case 0x2425: case 0x2426: case 0x2619:
2712                         case 0x2670: case 0x2671: case 0x3007:
2713                         case 0x3190: case 0x3191:
2714                         case 0xfffc: case 0xfffd:
2715                                 return true;
2716                         // exceptional characters filtered by the
2717                         // following conditions. Originally those exceptional
2718                         // ranges are incorrect (they should not be ignored)
2719                         // and most of those characters are unfortunately in
2720                         // those ranges.
2721                         case 0x4d8: case 0x4d9:
2722                         case 0x4e8: case 0x4e9:
2723                         case 0x3036: case 0x303f:
2724                         case 0x337b: case 0xfb1e:
2725                                 return false;
2726                         }
2727
2728                         if (
2729                                 // The whole Sinhala characters.
2730                                 0x0D82 <= i && i <= 0x0DF4
2731                                 // The whole Tibetan characters.
2732                                 || 0x0F00 <= i && i <= 0x0FD1
2733                                 // The whole Myanmar characters.
2734                                 || 0x1000 <= i && i <= 0x1059
2735                                 // The whole Ethiopic, Cherokee,
2736                                 // Canadian Syllabic, Ogham, Runic,
2737                                 // Tagalog, Hanunoo, Philippine,
2738                                 // Buhid, Tagbanwa, Khmer and Mongolian
2739                                 // characters.
2740                                 || 0x1200 <= i && i <= 0x1DFF
2741                                 // Greek extension characters.
2742                                 || 0x1F00 <= i && i <= 0x1FFF
2743                                 // The whole Braille characters.
2744                                 || 0x2800 <= i && i <= 0x28FF
2745                                 // CJK radical characters.
2746                                 || 0x2E80 <= i && i <= 0x2EF3
2747                                 // Kangxi radical characters.
2748                                 || 0x2F00 <= i && i <= 0x2FD5
2749                                 // Ideographic description characters.
2750                                 || 0x2FF0 <= i && i <= 0x2FFB
2751                                 // Bopomofo letter and final
2752                                 || 0x31A0 <= i && i <= 0x31B7
2753                                 // White square with quadrant characters.
2754                                 || 0x25F0 <= i && i <= 0x25F7
2755                                 // Ideographic telegraph symbols.
2756                                 || 0x32C0 <= i && i <= 0x32CB
2757                                 || 0x3358 <= i && i <= 0x3370
2758                                 || 0x33E0 <= i && i <= 0x33FF
2759                                 // The whole YI characters.
2760                                 || 0xA000 <= i && i <= 0xA48C
2761                                 || 0xA490 <= i && i <= 0xA4C6
2762                                 // Armenian small ligatures
2763                                 || 0xFB13 <= i && i <= 0xFB17
2764                                 // hebrew, arabic, variation selector.
2765                                 || 0xFB1D <= i && i <= 0xFE2F
2766                                 // Arabic ligatures.
2767                                 || 0xFEF5 <= i && i <= 0xFEFC
2768                                 // FIXME: why are they excluded?
2769                                 || 0x01F6 <= i && i <= 0x01F9
2770                                 || 0x0218 <= i && i <= 0x0233
2771                                 || 0x02A9 <= i && i <= 0x02AD
2772                                 || 0x02EA <= i && i <= 0x02EE
2773                                 || 0x0349 <= i && i <= 0x036F
2774                                 || 0x0488 <= i && i <= 0x048F
2775                                 || 0x04D0 <= i && i <= 0x04FF
2776                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2777                                 || 0x06D6 <= i && i <= 0x06ED
2778                                 || 0x06FA <= i && i <= 0x06FE
2779                                 || 0x2048 <= i && i <= 0x204D
2780                                 || 0x20e4 <= i && i <= 0x20ea
2781                                 || 0x213C <= i && i <= 0x214B
2782                                 || 0x21EB <= i && i <= 0x21FF
2783                                 || 0x22F2 <= i && i <= 0x22FF
2784                                 || 0x237B <= i && i <= 0x239A
2785                                 || 0x239B <= i && i <= 0x23CF
2786                                 || 0x24EB <= i && i <= 0x24FF
2787                                 || 0x2596 <= i && i <= 0x259F
2788                                 || 0x25F8 <= i && i <= 0x25FF
2789                                 || 0x2672 <= i && i <= 0x2689
2790                                 || 0x2768 <= i && i <= 0x2775
2791                                 || 0x27d0 <= i && i <= 0x27ff
2792                                 || 0x2900 <= i && i <= 0x2aff
2793                                 || 0x3033 <= i && i <= 0x303F
2794                                 || 0x31F0 <= i && i <= 0x31FF
2795                                 || 0x3250 <= i && i <= 0x325F
2796                                 || 0x32B1 <= i && i <= 0x32BF
2797                                 || 0x3371 <= i && i <= 0x337B
2798                                 || 0xFA30 <= i && i <= 0xFA6A
2799                         )
2800                                 return true;
2801
2802                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2803                         switch (uc) {
2804                         case UnicodeCategory.PrivateUse:
2805                         case UnicodeCategory.Surrogate:
2806                                 return false;
2807                         // ignored by nature
2808                         case UnicodeCategory.Format:
2809                         case UnicodeCategory.OtherNotAssigned:
2810                                 return true;
2811                         default:
2812                                 return false;
2813                         }
2814                 }
2815
2816                 // To check IsIgnorable sanity, try the driver below under MS.NET.
2817
2818                 /*
2819                 public static void Main ()
2820                 {
2821                         for (int i = 0; i <= char.MaxValue; i++)
2822                                 Dump (i, IsIgnorable (i));
2823                 }
2824
2825                 static void Dump (int i, bool ignore)
2826                 {
2827                         switch (Char.GetUnicodeCategory ((char) i)) {
2828                         case UnicodeCategory.PrivateUse:
2829                         case UnicodeCategory.Surrogate:
2830                                 return; // check nothing
2831                         }
2832
2833                         string s1 = "";
2834                         string s2 = new string ((char) i, 10);
2835                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2836                         if ((ret == 0) == ignore)
2837                                 return;
2838                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2839                 }
2840                 */
2841                 #endregion // IsIgnorable
2842
2843                 #region IsIgnorableSymbol
2844                 static bool IsIgnorableSymbol (int i)
2845                 {
2846                         if (IsIgnorable (i))
2847                                 return true;
2848
2849                         switch (i) {
2850                         // *Letter
2851                         case 0x00b5: case 0x01C0: case 0x01C1:
2852                         case 0x01C2: case 0x01C3: case 0x01F6:
2853                         case 0x01F7: case 0x01F8: case 0x01F9:
2854                         case 0x02D0: case 0x02EE: case 0x037A:
2855                         case 0x03D7: case 0x03F3:
2856                         case 0x0400: case 0x040d:
2857                         case 0x0450: case 0x045d:
2858                         case 0x048C: case 0x048D:
2859                         case 0x048E: case 0x048F:
2860                         case 0x0587: case 0x0640: case 0x06E5:
2861                         case 0x06E6: case 0x06FA: case 0x06FB:
2862                         case 0x06FC: case 0x093D: case 0x0950:
2863                         case 0x1E9B: case 0x2139: case 0x3006:
2864                         case 0x3033: case 0x3034: case 0x3035:
2865                         case 0xFE7E: case 0xFE7F:
2866                         // OtherNumber
2867                         case 0x16EE: case 0x16EF: case 0x16F0:
2868                         // LetterNumber
2869                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2870                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2871                         case 0x3038: // HANGZHOU NUMERAL TEN
2872                         case 0x3039: // HANGZHOU NUMERAL TWENTY
2873                         case 0x303a: // HANGZHOU NUMERAL THIRTY
2874                         // OtherSymbol
2875                         case 0x2117:
2876                         case 0x327F:
2877                                 return true;
2878                         // ModifierSymbol
2879                         case 0x02B9: case 0x02BA: case 0x02C2:
2880                         case 0x02C3: case 0x02C4: case 0x02C5:
2881                         case 0x02C8: case 0x02CC: case 0x02CD:
2882                         case 0x02CE: case 0x02CF: case 0x02D2:
2883                         case 0x02D3: case 0x02D4: case 0x02D5:
2884                         case 0x02D6: case 0x02D7: case 0x02DE:
2885                         case 0x02E5: case 0x02E6: case 0x02E7:
2886                         case 0x02E8: case 0x02E9:
2887                         case 0x309B: case 0x309C:
2888                         // OtherPunctuation
2889                         case 0x055A: // American Apos
2890                         case 0x05C0: // Hebrew Punct
2891                         case 0x0E4F: // Thai FONGMAN
2892                         case 0x0E5A: // Thai ANGKHANKHU
2893                         case 0x0E5B: // Thai KHOMUT
2894                         // CurencySymbol
2895                         case 0x09F2: // Bengali Rupee Mark
2896                         case 0x09F3: // Bengali Rupee Sign
2897                         // MathSymbol
2898                         case 0x221e: // INF.
2899                         // OtherSymbol
2900                         case 0x0482:
2901                         case 0x09FA:
2902                         case 0x0B70:
2903                                 return false;
2904                         }
2905
2906                         // *Letter
2907                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2908 #if NET_2_0
2909                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2910                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2911 #endif
2912                         )
2913                                 return true;
2914
2915                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2916                         switch (uc) {
2917                         case UnicodeCategory.Surrogate:
2918                                 return false; // inconsistent
2919
2920                         case UnicodeCategory.SpacingCombiningMark:
2921                         case UnicodeCategory.EnclosingMark:
2922                         case UnicodeCategory.NonSpacingMark:
2923                         case UnicodeCategory.PrivateUse:
2924                                 // NonSpacingMark
2925                                 if (0x064B <= i && i <= 0x0652) // Arabic
2926                                         return true;
2927                                 return false;
2928
2929                         case UnicodeCategory.Format:
2930                         case UnicodeCategory.OtherNotAssigned:
2931                                 return true;
2932
2933                         default:
2934                                 bool use = false;
2935                                 // OtherSymbols
2936                                 if (
2937                                         // latin in a circle
2938                                         0x249A <= i && i <= 0x24E9
2939                                         || 0x2100 <= i && i <= 0x2132
2940                                         // Japanese
2941                                         || 0x3196 <= i && i <= 0x31A0
2942                                         // Korean
2943                                         || 0x3200 <= i && i <= 0x321C
2944                                         // Chinese/Japanese
2945                                         || 0x322A <= i && i <= 0x3243
2946                                         // CJK
2947                                         || 0x3260 <= i && i <= 0x32B0
2948                                         || 0x32D0 <= i && i <= 0x3357
2949                                         || 0x337B <= i && i <= 0x33DD
2950                                 )
2951                                         use = !Char.IsLetterOrDigit ((char) i);
2952                                 if (use)
2953                                         return false;
2954
2955                                 // This "Digit" rule is mystery.
2956                                 // It filters some symbols out.
2957                                 if (Char.IsLetterOrDigit ((char) i))
2958                                         return false;
2959                                 if (Char.IsNumber ((char) i))
2960                                         return false;
2961                                 if (Char.IsControl ((char) i)
2962                                         || Char.IsSeparator ((char) i)
2963                                         || Char.IsPunctuation ((char) i))
2964                                         return true;
2965                                 if (Char.IsSymbol ((char) i))
2966                                         return true;
2967
2968                                 // FIXME: should check more
2969                                 return false;
2970                         }
2971                 }
2972
2973                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
2974 /*
2975                 public static void Main ()
2976                 {
2977                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2978                         for (int i = 0; i <= char.MaxValue; i++) {
2979                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2980                                 if (uc == UnicodeCategory.Surrogate)
2981                                         continue;
2982
2983                                 bool ret = IsIgnorableSymbol (i);
2984
2985                                 string s1 = "TEST ";
2986                                 string s2 = "TEST " + (char) i;
2987
2988                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2989
2990                                 if (ret != (result == 0))
2991                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2992                                                 ret ? "should not ignore" :
2993                                                         "should ignore",
2994                                                 i,(char) i, uc);
2995                         }
2996                 }
2997 */
2998                 #endregion
2999
3000                 #region NonSpacing
3001                 static bool IsIgnorableNonSpacing (int i)
3002                 {
3003                         if (IsIgnorable (i))
3004                                 return true;
3005
3006                         switch (i) {
3007                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3008                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3009                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3010                                 return true;
3011                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3012                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3013                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3014                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3015                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3016                         case 0x0CCD: case 0x0E4E:
3017                                 return false;
3018                         }
3019
3020                         if (0x02b9 <= i && i <= 0x02c5
3021                                 || 0x02cc <= i && i <= 0x02d7
3022                                 || 0x02e4 <= i && i <= 0x02ef
3023                                 || 0x20DD <= i && i <= 0x20E0
3024                         )
3025                                 return true;
3026
3027                         if (0x064B <= i && i <= 0x00652
3028                                 || 0x0941 <= i && i <= 0x0948
3029                                 || 0x0AC1 <= i && i <= 0x0ACD
3030                                 || 0x0C3E <= i && i <= 0x0C4F
3031                                 || 0x0E31 <= i && i <= 0x0E3F
3032                         )
3033                                 return false;
3034
3035                         return Char.GetUnicodeCategory ((char) i) ==
3036                                 UnicodeCategory.NonSpacingMark;
3037                 }
3038
3039                 // We can reuse IsIgnorableSymbol testcode
3040                 // for IsIgnorableNonSpacing.
3041                 #endregion
3042         }
3043
3044         struct CharMapEntry
3045         {
3046                 public byte Category;
3047                 public byte Level1;
3048                 public byte Level2; // It is always single byte.
3049                 public bool Defined;
3050
3051                 public CharMapEntry (byte category, byte level1, byte level2)
3052                 {
3053                         Category = category;
3054                         Level1 = level1;
3055                         Level2 = level2;
3056                         Defined = true;
3057                 }
3058         }
3059
3060         class JISCharacter
3061         {
3062                 public readonly int CP;
3063                 public readonly int JIS;
3064
3065                 public JISCharacter (int cp, int cpJIS)
3066                 {
3067                         CP = cp;
3068                         JIS = cpJIS;
3069                 }
3070         }
3071
3072         class JISComparer : IComparer
3073         {
3074                 public static readonly JISComparer Instance =
3075                         new JISComparer ();
3076
3077                 public int Compare (object o1, object o2)
3078                 {
3079                         JISCharacter j1 = (JISCharacter) o1;
3080                         JISCharacter j2 = (JISCharacter) o2;
3081                         return j2.JIS - j1.JIS;
3082                 }
3083         }
3084
3085         class NonJISCharacter
3086         {
3087                 public readonly int CP;
3088                 public readonly string Name;
3089
3090                 public NonJISCharacter (int cp, string name)
3091                 {
3092                         CP = cp;
3093                         Name = name;
3094                 }
3095         }
3096
3097         class NonJISComparer : IComparer
3098         {
3099                 public static readonly NonJISComparer Instance =
3100                         new NonJISComparer ();
3101
3102                 public int Compare (object o1, object o2)
3103                 {
3104                         NonJISCharacter j1 = (NonJISCharacter) o1;
3105                         NonJISCharacter j2 = (NonJISCharacter) o2;
3106                         return string.CompareOrdinal (j1.Name, j2.Name);
3107                 }
3108         }
3109
3110         class DecimalDictionaryValueComparer : IComparer
3111         {
3112                 public static readonly DecimalDictionaryValueComparer Instance
3113                         = new DecimalDictionaryValueComparer ();
3114
3115                 private DecimalDictionaryValueComparer ()
3116                 {
3117                 }
3118
3119                 public int Compare (object o1, object o2)
3120                 {
3121                         DictionaryEntry e1 = (DictionaryEntry) o1;
3122                         DictionaryEntry e2 = (DictionaryEntry) o2;
3123                         // FIXME: in case of 0, compare decomposition categories
3124                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3125                         if (ret != 0)
3126                                 return ret;
3127                         int i1 = (int) e1.Key;
3128                         int i2 = (int) e2.Key;
3129                         return i1 - i2;
3130                 }
3131         }
3132
3133         class StringDictionaryValueComparer : IComparer
3134         {
3135                 public static readonly StringDictionaryValueComparer Instance
3136                         = new StringDictionaryValueComparer ();
3137
3138                 private StringDictionaryValueComparer ()
3139                 {
3140                 }
3141
3142                 public int Compare (object o1, object o2)
3143                 {
3144                         DictionaryEntry e1 = (DictionaryEntry) o1;
3145                         DictionaryEntry e2 = (DictionaryEntry) o2;
3146                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3147                         if (ret != 0)
3148                                 return ret;
3149                         int i1 = (int) e1.Key;
3150                         int i2 = (int) e2.Key;
3151                         return i1 - i2;
3152                 }
3153         }
3154
3155         class UCAComparer : IComparer
3156         {
3157                 public static readonly UCAComparer Instance
3158                         = new UCAComparer ();
3159
3160                 private UCAComparer ()
3161                 {
3162                 }
3163
3164                 public int Compare (object o1, object o2)
3165                 {
3166                         char i1 = (char) o1;
3167                         char i2 = (char) o2;
3168
3169                         int l1 = CollationElementTable.GetSortKeyCount (i1);
3170                         int l2 = CollationElementTable.GetSortKeyCount (i2);
3171                         int l = l1 > l2 ? l2 : l1;
3172
3173                         for (int i = 0; i < l; i++) {
3174                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3175                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3176                                 int v = k1.Primary - k2.Primary;
3177                                 if (v != 0)
3178                                         return v;
3179                                 v = k1.Secondary - k2.Secondary;
3180                                 if (v != 0)
3181                                         return v;
3182                                 v = k1.Thirtiary - k2.Thirtiary;
3183                                 if (v != 0)
3184                                         return v;
3185                                 v = k1.Quarternary - k2.Quarternary;
3186                                 if (v != 0)
3187                                         return v;
3188                         }
3189                         return l1 - l2;
3190                 }
3191         }
3192
3193         class Tailoring
3194         {
3195                 int lcid;
3196                 int alias;
3197                 bool frenchSort;
3198                 ArrayList items = new ArrayList ();
3199
3200                 public Tailoring (int lcid)
3201                         : this (lcid, 0)
3202                 {
3203                 }
3204
3205                 public Tailoring (int lcid, int alias)
3206                 {
3207                         this.lcid = lcid;
3208                         this.alias = alias;
3209                 }
3210
3211                 public int LCID {
3212                         get { return lcid; }
3213                 }
3214
3215                 public int Alias {
3216                         get { return alias; }
3217                 }
3218
3219                 public bool FrenchSort {
3220                         get { return frenchSort; }
3221                         set { frenchSort = value; }
3222                 }
3223
3224                 public void AddDiacriticalMap (byte target, byte replace)
3225                 {
3226                         items.Add (new DiacriticalMap (target, replace));
3227                 }
3228
3229                 public void AddSortKeyMap (string source, byte [] sortkey)
3230                 {
3231                         items.Add (new SortKeyMap (source, sortkey));
3232                 }
3233
3234                 public void AddReplacementMap (string source, string replace)
3235                 {
3236                         items.Add (new ReplacementMap (source, replace));
3237                 }
3238
3239                 public char [] ItemToCharArray ()
3240                 {
3241                         ArrayList al = new ArrayList ();
3242                         foreach (ITailoringMap m in items)
3243                                 al.AddRange (m.ToCharArray ());
3244                         return al.ToArray (typeof (char)) as char [];
3245                 }
3246
3247                 interface ITailoringMap
3248                 {
3249                         char [] ToCharArray ();
3250                 }
3251
3252                 class DiacriticalMap : ITailoringMap
3253                 {
3254                         public readonly byte Target;
3255                         public readonly byte Replace;
3256
3257                         public DiacriticalMap (byte target, byte replace)
3258                         {
3259                                 Target = target;
3260                                 Replace = replace;
3261                         }
3262
3263                         public char [] ToCharArray ()
3264                         {
3265                                 char [] ret = new char [3];
3266                                 ret [0] = (char) 02; // kind:DiacriticalMap
3267                                 ret [1] = (char) Target;
3268                                 ret [2] = (char) Replace;
3269                                 return ret;
3270                         }
3271                 }
3272
3273                 class SortKeyMap : ITailoringMap
3274                 {
3275                         public readonly string Source;
3276                         public readonly byte [] SortKey;
3277
3278                         public SortKeyMap (string source, byte [] sortkey)
3279                         {
3280                                 Source = source;
3281                                 SortKey = sortkey;
3282                         }
3283
3284                         public char [] ToCharArray ()
3285                         {
3286                                 char [] ret = new char [Source.Length + 7];
3287                                 ret [0] = (char) 01; // kind:SortKeyMap
3288                                 for (int i = 0; i < Source.Length; i++)
3289                                         ret [i + 1] = Source [i];
3290                                 // null terminate
3291                                 for (int i = 0; i < 5; i++)
3292                                         ret [i + Source.Length + 2] = (char) SortKey [i];
3293                                 return ret;
3294                         }
3295                 }
3296
3297                 class ReplacementMap : ITailoringMap
3298                 {
3299                         public readonly string Source;
3300                         public readonly string Replace;
3301
3302                         public ReplacementMap (string source, string replace)
3303                         {
3304                                 Source = source;
3305                                 Replace = replace;
3306                         }
3307
3308                         public char [] ToCharArray ()
3309                         {
3310                                 char [] ret = new char [Source.Length + Replace.Length + 3];
3311                                 ret [0] = (char) 03; // kind:ReplaceMap
3312                                 int pos = 1;
3313                                 for (int i = 0; i < Source.Length; i++)
3314                                         ret [pos++] = Source [i];
3315                                 // null terminate
3316                                 pos++;
3317                                 for (int i = 0; i < Replace.Length; i++)
3318                                         ret [pos++] = Replace [i];
3319                                 // null terminate
3320                                 return ret;
3321                         }
3322                 }
3323         }
3324 }