mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those that are computed and those that
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, a different index table should be prepared for composite characters.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27
  28 using System;
  29 using System.IO;
  30 using System.Collections;
  31 using System.Globalization;
  32 using System.Text;
  33 using System.Xml;
  34
  35 namespace Mono.Globalization.Unicode
  36 {
  37         internal class MSCompatSortKeyTableGenerator
  38         {
  39                 public static void Main (string [] args)
  40                 {
  41                         new MSCompatSortKeyTableGenerator ().Run (args);
  42                 }
  43
  44                 const int DecompositionWide = 1; // fixed
  45                 const int DecompositionSub = 2; // fixed
  46                 const int DecompositionSmall = 3;
  47                 const int DecompositionIsolated = 4;
  48                 const int DecompositionInitial = 5;
  49                 const int DecompositionFinal = 6;
  50                 const int DecompositionMedial = 7;
  51                 const int DecompositionNoBreak = 8;
  52                 const int DecompositionVertical = 9;
  53                 const int DecompositionFraction = 0xA;
  54                 const int DecompositionFont = 0xB;
  55                 const int DecompositionSuper = 0xC; // fixed
  56                 const int DecompositionFull = 0xE;
  57                 const int DecompositionNarrow = 0xD;
  58                 const int DecompositionCircle = 0xF;
  59                 const int DecompositionSquare = 0x10;
  60                 const int DecompositionCompat = 0x11;
  61                 const int DecompositionCanonical = 0x12;
  62
  63                 TextWriter Result = Console.Out;
  64
  65                 byte [] fillIndex = new byte [256]; // by category
  66                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  67
  68                 char [] specialIgnore = new char [] {
  69                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  70                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  71                         };
  72
  73                 // FIXME: need more love (as always)
  74                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  75                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  76                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  77                         '\u0292', '\u01BE', '\u0298'};
  78                 byte [] alphaWeights = new byte [] {
  79                         2, 9, 0xA, 0x1A, 0x21,
  80                         0x23, 0x25, 0x2C, 0x32, 0x35,
  81                         0x36, 0x48, 0x51, 0x70, 0x7C,
  82                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  83                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  84                         0xA9, 0xAA, 0xB3, 0xB4};
  85
  86                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  87                 bool [] isUppercase = new bool [char.MaxValue + 1];
  88
  89                 byte [] decompType = new byte [char.MaxValue + 1];
  90                 int [] decompIndex = new int [char.MaxValue + 1];
  91                 int [] decompLength = new int [char.MaxValue + 1];
  92                 int [] decompValues;
  93                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  94
  95                 byte [] diacritical = new byte [char.MaxValue + 1];
  96
  97                 string [] diacritics = new string [] {
  98                         // LATIN
  99                         "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
 100                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 101                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
 102                         " OGONEK;", " CEDILLA;",
 103                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 104                         " STROKE;", " CIRCUMFLEX AND ACUTE;",
 105                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 106                         " DIAERESIS AND GRAVE;",
 107                         " BREVE AND ACUTE;",
 108                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 109                         " MACRON AND ACUTE;",
 110                         " MACRON AND GRAVE;",
 111                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 112                         " RING ABOVE AND ACUTE",
 113                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 114                         " CIRCUMFLEX AND TILDE",
 115                         " TILDE AND DIAERESIS",
 116                         " STROKE AND ACUTE",
 117                         " BREVE AND TILDE",
 118                         " CEDILLA AND BREVE",
 119                         " OGONEK AND MACRON",
 120                         " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 121                         " DOUBLE GRAVE;",
 122                         " INVERTED BREVE",
 123                         " PRECEDED BY APOSTROPHE",
 124                         " HORN;",
 125                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 126                         " PALATAL HOOK",
 127                         " DOT BELOW;",
 128                         " RETROFLEX;", "DIAERESIS BELOW",
 129                         " RING BELOW",
 130                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 131                         " BREVE BELOW;", " HORN AND GRAVE",
 132                         " TILDE BELOW",
 133                         " DOT BELOW AND DOT ABOVE",
 134                         " RIGHT HALF RING", " HORN AND TILDE",
 135                         " CIRCUMFLEX AND DOT BELOW",
 136                         " BREVE AND DOT BELOW",
 137                         " DOT BELOW AND MACRON",
 138                         " HORN AND HOOK ABOVE",
 139                         " HORN AND DOT",
 140                         // CIRCLED, PARENTHESIZED and so on
 141                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
 142                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 143                         };
 144                 byte [] diacriticWeights = new byte [] {
 145                         // LATIN.
 146                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 147                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
 148                         0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 149                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 150                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 151                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 152                         0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
 153                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 154                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
 155                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 156                         0x95, 0xAA,
 157                         // CIRCLED, PARENTHESIZED and so on.
 158                         0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
 159                         };
 160
 161                 int [] numberSecondaryWeightBounds = new int [] {
 162                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 163                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 164                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 165                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 166                         0xE50, 0xE60, 0xED0, 0xEE0
 167                         };
 168
 169                 char [] orderedCyrillic;
 170                 char [] orderedGurmukhi;
 171                 char [] orderedGujarati;
 172                 char [] orderedGeorgian;
 173                 char [] orderedThaana;
 174
 175                 static readonly char [] orderedTamilConsonants = new char [] {
 176                         // based on traditional Tamil consonants, except for
 177                         // Grantha (where Microsoft breaks traditionalism).
 178                         // http://www.angelfire.com/empire/thamizh/padanGaL
 179                         '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
 180                         '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
 181                         '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
 182                         '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
 183                         '\u0BB9'};
 184
 185                 // cp -> character name (only for some characters)
 186                 ArrayList sortableCharNames = new ArrayList ();
 187
 188                 // cp -> arrow value (int)
 189                 ArrayList arrowValues = new ArrayList ();
 190
 191                 // cp -> box value (int)
 192                 ArrayList boxValues = new ArrayList ();
 193
 194                 // cp -> level1 value
 195                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 196
 197                 // letterName -> cp
 198                 Hashtable arabicNameMap = new Hashtable ();
 199
 200                 // cp -> Hashtable [decompType] -> cp
 201                 Hashtable nfkdMap = new Hashtable ();
 202
 203                 // Latin letter -> ArrayList [int]
 204                 Hashtable latinMap = new Hashtable ();
 205
 206                 ArrayList jisJapanese = new ArrayList ();
 207                 ArrayList nonJisJapanese = new ArrayList ();
 208
 209                 ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00];
 210                 ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100];
 211                 ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00];
 212                 ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00];
 213                 byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00];
 214
 215                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 216
 217                 static double [] unicodeAge = new double [char.MaxValue + 1];
 218
 219                 ArrayList tailorings = new ArrayList ();
 220
 221                 void Run (string [] args)
 222                 {
 223                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 224                         ParseSources (dirname);
 225                         Console.Error.WriteLine ("parse done.");
 226
 227                         ModifyParsedValues ();
 228                         GenerateCore ();
 229                         Console.Error.WriteLine ("generation done.");
 230                         Serialize ();
 231                         Console.Error.WriteLine ("serialization done.");
 232 /*
 233 StreamWriter sw = new StreamWriter ("agelog.txt");
 234 for (int i = 0; i < char.MaxValue; i++) {
 235 bool shouldBe = false;
 236 switch (Char.GetUnicodeCategory ((char) i)) {
 237 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 238         shouldBe = true; break;
 239 }
 240 if (unicodeAge [i] >= 3.1)
 241         shouldBe = true;
 242 //if (IsIgnorable (i) != shouldBe)
 243 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 244 }
 245 sw.Close ();
 246 */
 247                 }
 248
 249                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 250                 {
 251                         return (byte []) CodePointIndexer.CompressArray  (
 252                                 source, typeof (byte), i);
 253                 }
 254
 255                 void Serialize ()
 256                 {
 257                         // Tailorings
 258                         SerializeTailorings ();
 259
 260                         byte [] categories = new byte [map.Length];
 261                         byte [] level1 = new byte [map.Length];
 262                         byte [] level2 = new byte [map.Length];
 263                         byte [] level3 = new byte [map.Length];
 264                         int [] widthCompat = new int [map.Length];
 265                         for (int i = 0; i < map.Length; i++) {
 266                                 categories [i] = map [i].Category;
 267                                 level1 [i] = map [i].Level1;
 268                                 level2 [i] = map [i].Level2;
 269                                 level3 [i] = ComputeLevel3Weight ((char) i);
 270                                 switch (decompType [i]) {
 271                                 case DecompositionNarrow:
 272                                 case DecompositionWide:
 273                                 case DecompositionSuper:
 274                                 case DecompositionSub:
 275                                         // they are always 1 char
 276                                         widthCompat [i] = decompValues [decompIndex [i]];
 277                                         break;
 278                                 }
 279                         }
 280
 281                         // compress
 282                         ignorableFlags = CompressArray (ignorableFlags,
 283                                 MSCompatUnicodeTableUtil.Ignorable);
 284                         categories = CompressArray (categories,
 285                                 MSCompatUnicodeTableUtil.Category);
 286                         level1 = CompressArray (level1,
 287                                 MSCompatUnicodeTableUtil.Level1);
 288                         level2 = CompressArray (level2,
 289                                 MSCompatUnicodeTableUtil.Level2);
 290                         level3 = CompressArray (level3,
 291                                 MSCompatUnicodeTableUtil.Level3);
 292                         widthCompat = (int []) CodePointIndexer.CompressArray (
 293                                 widthCompat, typeof (int),
 294                                 MSCompatUnicodeTableUtil.WidthCompat);
 295
 296                         // Ignorables
 297                         Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
 298                         for (int i = 0; i < ignorableFlags.Length; i++) {
 299                                 byte value = ignorableFlags [i];
 300                                 if (value < 10)
 301                                         Result.Write ("{0},", value);
 302                                 else
 303                                         Result.Write ("0x{0:X02},", value);
 304                                 if ((i & 0xF) == 0xF)
 305                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 306                         }
 307                         Result.WriteLine ("};");
 308                         Result.WriteLine ();
 309
 310                         // Primary category
 311                         Result.WriteLine ("static byte [] categories = new byte [] {");
 312                         for (int i = 0; i < categories.Length; i++) {
 313                                 byte value = categories [i];
 314                                 if (value < 10)
 315                                         Result.Write ("{0},", value);
 316                                 else
 317                                         Result.Write ("0x{0:X02},", value);
 318                                 if ((i & 0xF) == 0xF)
 319                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 320                         }
 321                         Result.WriteLine ("};");
 322                         Result.WriteLine ();
 323
 324                         // Primary weight value
 325                         Result.WriteLine ("static byte [] level1 = new byte [] {");
 326                         for (int i = 0; i < level1.Length; i++) {
 327                                 byte value = level1 [i];
 328                                 if (value < 10)
 329                                         Result.Write ("{0},", value);
 330                                 else
 331                                         Result.Write ("0x{0:X02},", value);
 332                                 if ((i & 0xF) == 0xF)
 333                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 334                         }
 335                         Result.WriteLine ("};");
 336                         Result.WriteLine ();
 337
 338                         // Secondary weight
 339                         Result.WriteLine ("static byte [] level2 = new byte [] {");
 340                         for (int i = 0; i < level2.Length; i++) {
 341                                 int value = level2 [i];
 342                                 if (value < 10)
 343                                         Result.Write ("{0},", value);
 344                                 else
 345                                         Result.Write ("0x{0:X02},", value);
 346                                 if ((i & 0xF) == 0xF)
 347                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 348                         }
 349                         Result.WriteLine ("};");
 350                         Result.WriteLine ();
 351
 352                         // Thirtiary weight
 353                         Result.WriteLine ("static byte [] level3 = new byte [] {");
 354                         for (int i = 0; i < level3.Length; i++) {
 355                                 byte value = level3 [i];
 356                                 if (value < 10)
 357                                         Result.Write ("{0},", value);
 358                                 else
 359                                         Result.Write ("0x{0:X02},", value);
 360                                 if ((i & 0xF) == 0xF)
 361                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 362                         }
 363                         Result.WriteLine ("};");
 364                         Result.WriteLine ();
 365
 366                         // Width insensitivity mappings
 367                         // (for now it is more lightweight than dumping the
 368                         // entire NFKD table).
 369                         Result.WriteLine ("static int [] widthCompat = new int [] {");
 370                         for (int i = 0; i < widthCompat.Length; i++) {
 371                                 int value = widthCompat [i];
 372                                 if (value < 10)
 373                                         Result.Write ("{0},", value);
 374                                 else
 375                                         Result.Write ("0x{0:X02},", value);
 376                                 if ((i & 0xF) == 0xF)
 377                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 378                         }
 379                         Result.WriteLine ("};");
 380                         Result.WriteLine ();
 381
 382                         // CJK
 383                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 384                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 385                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 386                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 387                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 388                 }
 389
 390                 void SerializeCJK (string name, ushort [] cjk, int max)
 391                 {
 392                         int offset = char.MaxValue - cjk.Length;
 393                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 394                         for (int i = 0; i < cjk.Length; i++) {
 395                                 if (i + offset == max)
 396                                         break;
 397                                 ushort value = cjk [i];
 398                                 if (value < 10)
 399                                         Result.Write ("{0},", value);
 400                                 else
 401                                         Result.Write ("0x{0:X04},", value);
 402                                 if ((i & 0xF) == 0xF)
 403                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 404                         }
 405                         Result.WriteLine ("};");
 406                         Result.WriteLine ();
 407                 }
 408
 409                 void SerializeCJK (string name, byte [] cjk, int max)
 410                 {
 411                         int offset = char.MaxValue - cjk.Length;
 412                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 413                         for (int i = 0; i < cjk.Length; i++) {
 414                                 if (i + offset == max)
 415                                         break;
 416                                 byte value = cjk [i];
 417                                 if (value < 10)
 418                                         Result.Write ("{0},", value);
 419                                 else
 420                                         Result.Write ("0x{0:X02},", value);
 421                                 if ((i & 0xF) == 0xF)
 422                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 423                         }
 424                         Result.WriteLine ("};");
 425                         Result.WriteLine ();
 426                 }
 427
 428                 void SerializeTailorings ()
 429                 {
 430                         Hashtable indexes = new Hashtable ();
 431                         Hashtable counts = new Hashtable ();
 432                         Result.WriteLine ("static char [] tailorings = new char [] {");
 433                         int count = 0;
 434                         foreach (Tailoring t in tailorings) {
 435                                 if (t.Alias != 0)
 436                                         continue;
 437                                 Result.Write ("/*{0}*/", t.LCID);
 438                                 indexes.Add (t.LCID, count);
 439                                 char [] values = t.ItemToCharArray ();
 440                                 counts.Add (t.LCID, values.Length);
 441                                 foreach (char c in values) {
 442                                         Result.Write ("'\\x{0:X}', ", (int) c);
 443                                         if (++count % 16 == 0)
 444                                                 Result.WriteLine (" // {0:X04}", count - 16);
 445                                 }
 446                         }
 447                         Result.WriteLine ("};");
 448
 449                         Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 450                         foreach (Tailoring t in tailorings) {
 451                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 452                                 if (!indexes.ContainsKey (target)) {
 453                                         Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
 454                                         continue;
 455                                 }
 456                                 int idx = (int) indexes [target];
 457                                 int cnt = (int) counts [target];
 458                                 bool french = t.FrenchSort;
 459                                 if (t.Alias != 0)
 460                                         foreach (Tailoring t2 in tailorings)
 461                                                 if (t2.LCID == t.LCID)
 462                                                         french = t2.FrenchSort;
 463                                 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 464                         }
 465                         Result.WriteLine ("};");
 466                 }
 467
 468                 #region Parse
 469
 470                 void ParseSources (string dirname)
 471                 {
 472                         string unidata =
 473                                 dirname + "/UnicodeData.txt";
 474                         string derivedCoreProps =
 475                                 dirname + "/DerivedCoreProperties.txt";
 476                         string scripts =
 477                                 dirname + "/Scripts.txt";
 478                         string cp932 =
 479                                 dirname + "/CP932.TXT";
 480                         string derivedAge =
 481                                 dirname + "/DerivedAge.txt";
 482                         string chXML = dirname + "/common/collation/zh.xml";
 483                         string jaXML = dirname + "/common/collation/ja.xml";
 484                         string koXML = dirname + "/common/collation/ko.xml";
 485
 486                         ParseDerivedAge (derivedAge);
 487
 488                         FillIgnorables ();
 489
 490                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 491                         ParseUnidata (unidata);
 492                         ParseDerivedCoreProperties (derivedCoreProps);
 493                         ParseScripts (scripts);
 494                         ParseCJK (chXML, jaXML, koXML);
 495
 496                         ParseTailorings ("mono-tailoring-source.txt");
 497                 }
 498
 499                 void ParseTailorings (string filename)
 500                 {
 501                         Tailoring t = null;
 502                         int line = 0;
 503                         using (StreamReader sr = new StreamReader (filename)) {
 504                                 try {
 505                                         while (sr.Peek () >= 0) {
 506                                                 line++;
 507                                                 ProcessTailoringLine (ref t,
 508                                                         sr.ReadLine ().Trim ());
 509                                         }
 510                                 } catch (Exception) {
 511                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 512                                         throw;
 513                                 }
 514                         }
 515                 }
 516
 517                 // For now this is enough.
 518                 string ParseTailoringSourceValue (string s)
 519                 {
 520                         StringBuilder sb = new StringBuilder ();
 521                         for (int i = 0; i < s.Length; i++) {
 522                                 if (s.StartsWith ("\\u")) {
 523                                         sb.Append ((char) int.Parse (
 524                                                 s.Substring (2, 4), NumberStyles.HexNumber),
 525                                                 1);
 526                                         i += 5;
 527                                 }
 528                         else
 529                                 sb.Append (s [i]);
 530                         }
 531                         return sb.ToString ();
 532                 }
 533
 534                 void ProcessTailoringLine (ref Tailoring t, string s)
 535                 {
 536                         int idx = s.IndexOf ('#');
 537                         if (idx > 0)
 538                                 s = s.Substring (0, idx).Trim ();
 539                         if (s.Length == 0 || s [0] == '#')
 540                                 return;
 541                         if (s [0] == '@') {
 542                                 idx = s.IndexOf ('=');
 543                                 if (idx > 0)
 544                                         t = new Tailoring (
 545                                                 int.Parse (s.Substring (1, idx - 1)),
 546                                                 int.Parse (s.Substring (idx + 1)));
 547                                 else
 548                                         t = new Tailoring (int.Parse (s.Substring (1)));
 549                                 tailorings.Add (t);
 550                                 return;
 551                         }
 552                         if (s.StartsWith ("*FrenchSort")) {
 553                                 t.FrenchSort = true;
 554                                 return;
 555                         }
 556                         string d = "*Diacritical";
 557                         if (s.StartsWith (d)) {
 558                                 idx = s.IndexOf ("->");
 559                                 t.AddDiacriticalMap (
 560                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 561                                                 NumberStyles.HexNumber),
 562                                         byte.Parse (s.Substring (idx + 2).Trim (),
 563                                                 NumberStyles.HexNumber));
 564                                 return;
 565                         }
 566                         idx = s.IndexOf (':');
 567                         if (idx > 0) {
 568                                 string source = s.Substring (0, idx).Trim ();
 569                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 570                                 byte [] b = new byte [5];
 571                                 for (int i = 0; i < 5; i++) {
 572                                         if (l [i] == "*")
 573                                                 b [i] = 0;
 574                                         else
 575                                                 b [i] = byte.Parse (l [i],
 576                                                         NumberStyles.HexNumber);
 577                                 }
 578                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 579                                         b);
 580                         }
 581                         idx = s.IndexOf ('=');
 582                         if (idx > 0)
 583                                 t.AddReplacementMap (
 584                                         ParseTailoringSourceValue (
 585                                                 s.Substring (0, idx).Trim ()),
 586                                         ParseTailoringSourceValue (
 587                                                 s.Substring (idx + 1).Trim ()));
 588                 }
 589
 590                 void ParseDerivedAge (string filename)
 591                 {
 592                         using (StreamReader file =
 593                                 new StreamReader (filename)) {
 594                                 while (file.Peek () >= 0) {
 595                                         string s = file.ReadLine ();
 596                                         int idx = s.IndexOf ('#');
 597                                         if (idx >= 0)
 598                                                 s = s.Substring (0, idx);
 599                                         idx = s.IndexOf (';');
 600                                         if (idx < 0)
 601                                                 continue;
 602
 603                                         string cpspec = s.Substring (0, idx);
 604                                         idx = cpspec.IndexOf ("..");
 605                                         NumberStyles nf = NumberStyles.HexNumber |
 606                                                 NumberStyles.AllowTrailingWhite;
 607                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 608                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 609                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 610
 611                                         // FIXME: use index
 612                                         if (cp > char.MaxValue)
 613                                                 continue;
 614
 615                                         for (int i = cp; i <= cpEnd; i++)
 616                                                 unicodeAge [i] = double.Parse (value);
 617                                 }
 618                         }
 619                         unicodeAge [0] = double.MaxValue; // never be supported
 620                 }
 621
 622                 void ParseUnidata (string filename)
 623                 {
 624                         ArrayList decompValues = new ArrayList ();
 625                         using (StreamReader unidata =
 626                                 new StreamReader (filename)) {
 627                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 628                                         try {
 629                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 630                                         } catch (Exception) {
 631                                                 Console.Error.WriteLine ("**** At line " + line);
 632                                                 throw;
 633                                         }
 634                                 }
 635                         }
 636                         this.decompValues = (int [])
 637                                 decompValues.ToArray (typeof (int));
 638                 }
 639
 640                 void ProcessUnidataLine (string s, ArrayList decompValues)
 641                 {
 642                         int idx = s.IndexOf ('#');
 643                         if (idx >= 0)
 644                                 s = s.Substring (0, idx);
 645                         idx = s.IndexOf (';');
 646                         if (idx < 0)
 647                                 return;
 648                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 649                         string [] values = s.Substring (idx + 1).Split (';');
 650
 651                         // FIXME: use index
 652                         if (cp > char.MaxValue)
 653                                 return;
 654                         if (IsIgnorable (cp))
 655                                 return;
 656
 657                         string name = values [0];
 658
 659                         // isSmallCapital
 660                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 661                                 isSmallCapital [cp] = true;
 662
 663                         // latin mapping by character name
 664                         if (s.IndexOf ("LATIN") > 0) {
 665                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 666                                 int offset = lidx + 15;
 667                                 if (lidx < 0) {
 668                                         lidx = s.IndexOf ("LETTER TURNED ");
 669                                         offset = lidx + 14;
 670                                 }
 671                                 if (lidx < 0) {
 672                                         lidx = s.IndexOf ("LETTER ");
 673                                         offset = lidx + 7;
 674                                 }
 675                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 676                                 if ('A' <= c && c <= 'Z' &&
 677                                         (s.Length == offset + 1 || s [offset + 1] == ' ')) {
 678                                         ArrayList entry = (ArrayList) latinMap [c];
 679                                         if (entry == null) {
 680                                                 entry = new ArrayList ();
 681                                                 latinMap [c] = entry;
 682                                         }
 683                                         entry.Add (cp);
 684                                 }
 685                         }
 686
 687                         // Arrow names
 688                         if (0x2000 <= cp && cp < 0x3000) {
 689                                 int value = 0;
 690                                 // SPECIAL CASES. FIXME: why?
 691                                 switch (cp) {
 692                                 case 0x21C5: value = -1; break; // E2
 693                                 case 0x261D: value = 1; break;
 694                                 case 0x27A6: value = 3; break;
 695                                 case 0x21B0: value = 7; break;
 696                                 case 0x21B1: value = 3; break;
 697                                 case 0x21B2: value = 7; break;
 698                                 case 0x21B4: value = 5; break;
 699                                 case 0x21B5: value = 7; break;
 700                                 case 0x21B9: value = -1; break; // E1
 701                                 case 0x21CF: value = 7; break;
 702                                 case 0x21D0: value = 3; break;
 703                                 }
 704                                 string [] arrowTargets = new string [] {
 705                                         "",
 706                                         "UPWARDS",
 707                                         "NORTH EAST",
 708                                         "RIGHTWARDS",
 709                                         "SOUTH EAST",
 710                                         "DOWNWARDS",
 711                                         "SOUTH WEST",
 712                                         "LEFTWARDS",
 713                                         "NORTH WEST",
 714                                         };
 715                                 if (value == 0)
 716                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 717                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 718                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 719                                                         s.IndexOf (" OVER") < 0
 720                                                 )
 721                                                         value = i;
 722                                 if (value > 0)
 723                                         arrowValues.Add (new DictionaryEntry (
 724                                                 cp, value));
 725                         }
 726
 727                         // Box names
 728                         if (0x2500 <= cp && cp < 0x25B0) {
 729                                 int value = 0;
 730                                 // flags:
 731                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 732                                 // [h,rl] [r] [l]
 733                                 // [v,ud] [u] [d]
 734                                 // [dr] [dl] [ur] [ul]
 735                                 // [vr,udr] [vl,vdl]
 736                                 // [hd,rld] [hu,rlu]
 737                                 // [hv,udrl,rlv,udh]
 738                                 ArrayList flags = new ArrayList (new int [] {
 739                                         32, 8 + 4, 8, 4,
 740                                         16, 1 + 2, 1, 2,
 741                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 742                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 743                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 744                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 745                                         });
 746                                 byte [] offsets = new byte [] {
 747                                         0, 0, 1, 2,
 748                                         3, 3, 4, 5,
 749                                         6, 7, 8, 9,
 750                                         10, 10, 11, 11,
 751                                         12, 12, 13, 13,
 752                                         14, 14, 14, 14};
 753                                 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
 754                                         int flag = 0;
 755                                         if (s.IndexOf (" UP") > 0)
 756                                                 flag |= 1;
 757                                         if (s.IndexOf (" DOWN") > 0)
 758                                                 flag |= 2;
 759                                         if (s.IndexOf (" RIGHT") > 0)
 760                                                 flag |= 4;
 761                                         if (s.IndexOf (" LEFT") > 0)
 762                                                 flag |= 8;
 763                                         if (s.IndexOf (" VERTICAL") > 0)
 764                                                 flag |= 16;
 765                                         if (s.IndexOf (" HORIZONTAL") > 0)
 766                                                 flag |= 32;
 767
 768                                         int fidx = flags.IndexOf (flag);
 769                                         value = fidx < 0 ? fidx : offsets [fidx];
 770                                 } else if (s.IndexOf ("BLOCK") > 0) {
 771                                         if (s.IndexOf ("ONE EIGHTH") > 0)
 772                                                 value = 0x12;
 773                                         else if (s.IndexOf ("ONE QUARTER") > 0)
 774                                                 value = 0x13;
 775                                         else if (s.IndexOf ("THREE EIGHTHS") > 0)
 776                                                 value = 0x14;
 777                                         else if (s.IndexOf ("HALF") > 0)
 778                                                 value = 0x15;
 779                                         else if (s.IndexOf ("FIVE EIGHTHS") > 0)
 780                                                 value = 0x16;
 781                                         else if (s.IndexOf ("THREE QUARTERS") > 0)
 782                                                 value = 0x17;
 783                                         else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
 784                                                 value = 0x18;
 785                                         else
 786                                                 value = 0x19;
 787                                 }
 788                                 if (value >= 0)
 789                                         boxValues.Add (new DictionaryEntry (
 790                                                 cp, value));
 791                         }
 792
 793                         // For some characters store the name and sort later
 794                         // to determine sorting.
 795                         if (0x2100 <= cp && cp <= 0x213F &&
 796                                 Char.IsSymbol ((char) cp))
 797                                 sortableCharNames.Add (
 798                                         new DictionaryEntry (cp, values [0]));
 799                         else if (0x3380 <= cp && cp <= 0x33DD)
 800                                 sortableCharNames.Add (new DictionaryEntry (
 801                                         cp, values [0].Substring (7)));
 802
 803                         // diacritical weights by character name
 804                         for (int d = 0; d < diacritics.Length; d++)
 805                                 if (s.IndexOf (diacritics [d]) > 0)
 806                                         diacritical [cp] |= diacriticWeights [d];
 807                         // Two-step grep required for it.
 808                         if (s.IndexOf ("FULL STOP") > 0 &&
 809                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
 810                                 diacritical [cp] |= 0xF4;
 811
 812                         // Arabic letter name
 813                         if (0x0621 <= cp && cp <= 0x064A &&
 814                                 Char.GetUnicodeCategory ((char) cp)
 815                                 == UnicodeCategory.OtherLetter) {
 816                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
 817                                 switch (cp) {
 818                                 case 0x0621:
 819                                 case 0x0624:
 820                                 case 0x0626:
 821                                         // hamza, waw, yeh ... special cases.
 822                                         value = 0x07;
 823                                         break;
 824                                 case 0x0649:
 825                                 case 0x064A:
 826                                         value = 0x77; // special cases.
 827                                         break;
 828                                 default:
 829                                         // Get primary letter name i.e.
 830                                         // XXX part of ARABIC LETTER XXX yyy
 831                                         // e.g. that of "TEH MARBUTA" is "TEH".
 832                                         string letterName =
 833                                                 (cp == 0x0640) ?
 834                                                 // 0x0640 is special: it does
 835                                                 // not start with ARABIC LETTER
 836                                                 values [0] :
 837                                                 values [0].Substring (14);
 838                                         int tmpIdx = letterName.IndexOf (' ');
 839                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
 840 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
 841                                         if (arabicNameMap.ContainsKey (letterName))
 842                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
 843                                         else
 844                                                 arabicNameMap [letterName] = cp;
 845                                         break;
 846                                 }
 847                                 arabicLetterPrimaryValues [cp] = value;
 848                         }
 849
 850                         // Japanese square letter
 851                         if (0x3300 <= cp && cp <= 0x3357)
 852                                 if (!ExistsJIS (cp))
 853                                         nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
 854
 855                         // normalizationType
 856                         string decomp = values [4];
 857                         idx = decomp.IndexOf ('<');
 858                         if (idx >= 0) {
 859                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
 860                                 case "full":
 861                                         decompType [cp] = DecompositionFull;
 862                                         break;
 863                                 case "sub":
 864                                         decompType [cp] = DecompositionSub;
 865                                         break;
 866                                 case "super":
 867                                         decompType [cp] = DecompositionSuper;
 868                                         break;
 869                                 case "small":
 870                                         decompType [cp] = DecompositionSmall;
 871                                         break;
 872                                 case "isolated":
 873                                         decompType [cp] = DecompositionIsolated;
 874                                         break;
 875                                 case "initial":
 876                                         decompType [cp] = DecompositionInitial;
 877                                         break;
 878                                 case "final":
 879                                         decompType [cp] = DecompositionFinal;
 880                                         break;
 881                                 case "medial":
 882                                         decompType [cp] = DecompositionMedial;
 883                                         break;
 884                                 case "noBreak":
 885                                         decompType [cp] = DecompositionNoBreak;
 886                                         break;
 887                                 case "compat":
 888                                         decompType [cp] = DecompositionCompat;
 889                                         break;
 890                                 case "fraction":
 891                                         decompType [cp] = DecompositionFraction;
 892                                         break;
 893                                 case "font":
 894                                         decompType [cp] = DecompositionFont;
 895                                         break;
 896                                 case "circle":
 897                                         decompType [cp] = DecompositionCircle;
 898                                         break;
 899                                 case "square":
 900                                         decompType [cp] = DecompositionSquare;
 901                                         break;
 902                                 case "wide":
 903                                         decompType [cp] = DecompositionWide;
 904                                         break;
 905                                 case "narrow":
 906                                         decompType [cp] = DecompositionNarrow;
 907                                         break;
 908                                 case "vertical":
 909                                         decompType [cp] = DecompositionVertical;
 910                                         break;
 911                                 default:
 912                                         throw new Exception ("Support NFKD type : " + decomp);
 913                                 }
 914                         }
 915                         else
 916                                 decompType [cp] = DecompositionCanonical;
 917                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
 918                         if (decomp.Length > 0) {
 919
 920                                 string [] velems = decomp.Split (' ');
 921                                 int didx = decompValues.Count;
 922                                 decompIndex [cp] = didx;
 923                                 foreach (string v in velems)
 924                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
 925                                 decompLength [cp] = velems.Length;
 926
 927                                 // [decmpType] -> this_cp
 928                                 int targetCP = (int) decompValues [didx];
 929                                 // for "(x)" it specially maps to 'x' .
 930                                 // FIXME: check if it is sane
 931                                 if (velems.Length == 3 &&
 932                                         (int) decompValues [didx] == '(' &&
 933                                         (int) decompValues [didx + 2] == ')')
 934                                         targetCP = (int) decompValues [didx + 1];
 935                                 // special: 0x215F "1/"
 936                                 else if (cp == 0x215F)
 937                                         targetCP = '1';
 938                                 else if (velems.Length > 1 &&
 939                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
 940                                         // skip them, except for CJK ideograph compat
 941                                         targetCP = 0;
 942
 943                                 if (targetCP != 0) {
 944                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
 945                                         if (entry == null) {
 946                                                 entry = new Hashtable ();
 947                                                 nfkdMap [targetCP] = entry;
 948                                         }
 949                                         entry [(byte) decompType [cp]] = cp;
 950                                 }
 951                         }
 952                         // numeric values
 953                         if (values [5].Length > 0)
 954                                 decimalValue [cp] = decimal.Parse (values [5]);
 955                         else if (values [6].Length > 0)
 956                                 decimalValue [cp] = decimal.Parse (values [6]);
 957                         else if (values [7].Length > 0) {
 958                                 string decstr = values [7];
 959                                 idx = decstr.IndexOf ('/');
 960                                 if (cp == 0x215F) // special. "1/"
 961                                         decimalValue [cp] = 0x1;
 962                                 else if (idx > 0)
 963                                         // m/n
 964                                         decimalValue [cp] =
 965                                                 decimal.Parse (decstr.Substring (0, idx))
 966                                                 / decimal.Parse (decstr.Substring (idx + 1));
 967                                 else if (decstr [0] == '(' &&
 968                                         decstr [decstr.Length - 1] == ')')
 969                                         // (n)
 970                                         decimalValue [cp] =
 971                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
 972                                 else if (decstr [decstr.Length - 1] == '.')
 973                                         // n.
 974                                         decimalValue [cp] =
 975                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
 976                                 else
 977                                         decimalValue [cp] = decimal.Parse (decstr);
 978                         }
 979                 }
 980
 981                 void ParseDerivedCoreProperties (string filename)
 982                 {
 983                         // IsUppercase
 984                         using (StreamReader file =
 985                                 new StreamReader (filename)) {
 986                                 for (int line = 1; file.Peek () >= 0; line++) {
 987                                         try {
 988                                                 ProcessDerivedCorePropLine (file.ReadLine ());
 989                                         } catch (Exception) {
 990                                                 Console.Error.WriteLine ("**** At line " + line);
 991                                                 throw;
 992                                         }
 993                                 }
 994                         }
 995                 }
 996
 997                 void ProcessDerivedCorePropLine (string s)
 998                 {
 999                         int idx = s.IndexOf ('#');
1000                         if (idx >= 0)
1001                                 s = s.Substring (0, idx);
1002                         idx = s.IndexOf (';');
1003                         if (idx < 0)
1004                                 return;
1005                         string cpspec = s.Substring (0, idx);
1006                         idx = cpspec.IndexOf ("..");
1007                         NumberStyles nf = NumberStyles.HexNumber |
1008                                 NumberStyles.AllowTrailingWhite;
1009                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1010                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1011                         string value = s.Substring (cpspec.Length + 1).Trim ();
1012
1013                         // FIXME: use index
1014                         if (cp > char.MaxValue)
1015                                 return;
1016
1017                         switch (value) {
1018                         case "Uppercase":
1019                                 for (int x = cp; x <= cpEnd; x++)
1020                                         isUppercase [x] = true;
1021                                 break;
1022                         }
1023                 }
1024
1025                 void ParseScripts (string filename)
1026                 {
1027                         ArrayList cyrillic = new ArrayList ();
1028                         ArrayList gurmukhi = new ArrayList ();
1029                         ArrayList gujarati = new ArrayList ();
1030                         ArrayList georgian = new ArrayList ();
1031                         ArrayList thaana = new ArrayList ();
1032
1033                         using (StreamReader file =
1034                                 new StreamReader (filename)) {
1035                                 while (file.Peek () >= 0) {
1036                                         string s = file.ReadLine ();
1037                                         int idx = s.IndexOf ('#');
1038                                         if (idx >= 0)
1039                                                 s = s.Substring (0, idx);
1040                                         idx = s.IndexOf (';');
1041                                         if (idx < 0)
1042                                                 continue;
1043
1044                                         string cpspec = s.Substring (0, idx);
1045                                         idx = cpspec.IndexOf ("..");
1046                                         NumberStyles nf = NumberStyles.HexNumber |
1047                                                 NumberStyles.AllowTrailingWhite;
1048                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1049                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1050                                         string value = s.Substring (cpspec.Length + 1).Trim ();
1051
1052                                         // FIXME: use index
1053                                         if (cp > char.MaxValue)
1054                                                 continue;
1055
1056                                         switch (value) {
1057                                         case "Cyrillic":
1058                                                 for (int x = cp; x <= cpEnd; x++)
1059                                                         if (!IsIgnorable (x))
1060                                                                 cyrillic.Add ((char) x);
1061                                                 break;
1062                                         case "Gurmukhi":
1063                                                 for (int x = cp; x <= cpEnd; x++)
1064                                                         if (!IsIgnorable (x))
1065                                                                 gurmukhi.Add ((char) x);
1066                                                 break;
1067                                         case "Gujarati":
1068                                                 for (int x = cp; x <= cpEnd; x++)
1069                                                         if (!IsIgnorable (x))
1070                                                                 gujarati.Add ((char) x);
1071                                                 break;
1072                                         case "Georgian":
1073                                                 for (int x = cp; x <= cpEnd; x++)
1074                                                         if (!IsIgnorable (x))
1075                                                                 georgian.Add ((char) x);
1076                                                 break;
1077                                         case "Thaana":
1078                                                 for (int x = cp; x <= cpEnd; x++)
1079                                                         if (!IsIgnorable (x))
1080                                                                 thaana.Add ((char) x);
1081                                                 break;
1082                                         }
1083                                 }
1084                         }
1085                         cyrillic.Sort (UCAComparer.Instance);
1086                         gurmukhi.Sort (UCAComparer.Instance);
1087                         gujarati.Sort (UCAComparer.Instance);
1088                         georgian.Sort (UCAComparer.Instance);
1089                         thaana.Sort (UCAComparer.Instance);
1090                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1091                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1092                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1093                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1094                         orderedThaana = (char []) thaana.ToArray (typeof (char));
1095                 }
1096
1097                 void ParseJISOrder (string filename)
1098                 {
                             // Reads the text file `filename` line by line and adds one
                             // JISCharacter (cp, jis) to jisJapanese for every data line.
                             //
                             // A data line consists of two "0x"-prefixed hexadecimal
                             // numbers separated by a space, e.g. "0x2121 0x3000". The
                             // first number is passed on as `jis`, the second as `cp`.
                             // Everything from '#' to the end of a line is a comment.
                             // Empty lines and lines without a space are skipped.
                             // int.Parse throws on fields that are not hexadecimal.
1099                         using (StreamReader file =
1100                                 new StreamReader (filename)) {
1101                                 while (file.Peek () >= 0) {
1102                                         string s = file.ReadLine ();
1103                                         int idx = s.IndexOf ('#');
1104                                         if (idx >= 0)
1105                                                 s = s.Substring (0, idx);
                                             // Trim even when there was no comment, so that
                                             // whitespace-only and indented lines are handled.
                                             s = s.Trim ();
1106                                         if (s.Length == 0)
1107                                                 continue;
1108                                         idx = s.IndexOf (' ');
1109                                         if (idx < 0)
1110                                                 continue;
1111                                         // They start with "0x" so cut them out.
                                             // The second argument of Substring is a length,
                                             // not an end index: the first field occupies
                                             // [2, idx), i.e. idx - 2 characters.
1112                                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
                                             // Trim first and then drop "0x", so that more
                                             // than one separating space is tolerated.
1113                                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1114                                         jisJapanese.Add (new JISCharacter (cp, jis));
1115                                 }
1116                         }
1117                 }
1118
1119                 void ParseCJK (string zhXML, string jaXML, string koXML)
1120                 {
                             // Fills the CJK weight tables from LDML collation files:
                             //   zhXML : 'pinyin' rules -> cjkCHS, 'stroke' rules -> cjkCHT
                             //   jaXML : rules -> cjkJA
                             //   koXML : rules -> cjkKO and cjkKOlv2
                             // Every table is indexed by (character - offset), where
                             // offset is char.MaxValue minus the length of the table.
                             XmlDocument ldml = new XmlDocument ();
                             ldml.XmlResolver = null;

                             // Chinese Simplified
                             ldml.Load (zhXML);
                             string pinyinOrder = ldml.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
                             AssignOrderedCJKWeights ("chs", cjkCHS, pinyinOrder, 0x8008, '\u3100');

                             // Chinese Traditional (taken from the same zhXML document)
                             string strokeOrder = ldml.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
                             AssignOrderedCJKWeights ("cht", cjkCHT, strokeOrder, 0x8002, '\u4E00');

                             // Japanese
                             ldml.Load (jaXML);
                             string japaneseOrder = ldml.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
                             AssignOrderedCJKWeights ("ja", cjkJA, japaneseOrder, 0x8008, '\u4E00');

                             // Korean
                             // Korean weight is somewhat complex. It first shifts
                             // Hangul category from 52-x to 80-x (they are computed
                             // anyway). CJK ideographs are placed at secondary
                             // weight, like XX YY 01 zz 01, where XX and YY are the
                             // corresponding "reset" value and zz is 41,43,45...
                             //
                             // Unlike chs, cht and ja, the Korean value is a combined
                             // ushort which is computed from the "reset" character.
                             int koOffset = char.MaxValue - cjkKO.Length;
                             ldml.Load (koXML);
                             foreach (XmlElement reset in ldml.SelectNodes ("/ldml/collations/collation/rules/reset")) {
                                     // The element following a "reset" lists the
                                     // characters to be placed after the reset target.
                                     XmlElement placed = (XmlElement) reset.NextSibling;
                                     // Compute the combined "category" and "level 1"
                                     // value for the target "reset" Hangul syllable.
                                     int syllable = ((int) reset.InnerText [0] - 0xAC00) + 1;
                                     ushort primary = (ushort)
                                             ((syllable / 254) * 256 + (syllable % 254) + 2);
                                     // All listed characters share that value; they
                                     // differ in cjkKOlv2, which runs 0x41, 0x43, ...
                                     int secondary = 0x41;
                                     foreach (char c in placed.InnerText) {
                                             int slot = (int) c - koOffset;
                                             cjkKO [slot] = primary;
                                             cjkKOlv2 [slot] = (byte) secondary;
                                             secondary += 2;
                                     }
                             }
1210                 }

                     // Walks `ordered` and stores consecutive weights, starting at
                     // `firstWeight`, into `table` for each character that is not
                     // below `lowest`. Characters below `lowest` get no weight; a
                     // warning naming `category` is written to stderr instead.
                     // Whenever the running weight becomes a multiple of 256 it is
                     // advanced by 2, so low bytes 00 and 01 are never assigned.
                     void AssignOrderedCJKWeights (string category, ushort [] table, string ordered, int firstWeight, char lowest)
                     {
                             int offset = char.MaxValue - table.Length;
                             int weight = firstWeight;
                             for (int i = 0; i < ordered.Length; i++) {
                                     char c = ordered [i];
                                     if (c < lowest) {
                                             Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, weight);
                                             continue;
                                     }
                                     table [(int) c - offset] = (ushort) weight;
                                     weight++;
                                     if (weight % 256 == 0)
                                             weight += 2;
                             }
                     }
1211
1212                 #endregion
1213
1214                 #region Generation
1215
1216                 void FillIgnorables ()
1217                 {
                             // For every char value whose Unicode category is not
                             // OtherNotAssigned, records in ignorableFlags which of
                             // the three predicates hold for it:
                             //   1 : IsIgnorable
                             //   2 : IsIgnorableSymbol
                             //   4 : IsIgnorableNonSpacing
                             // Entries of unassigned values are left untouched.
                             for (int cp = 0; cp <= char.MaxValue; cp++) {
                                     UnicodeCategory category =
                                             Char.GetUnicodeCategory ((char) cp);
                                     if (category != UnicodeCategory.OtherNotAssigned) {
                                             if (IsIgnorable (cp))
                                                     ignorableFlags [cp] |= 0x01;
                                             if (IsIgnorableSymbol (cp))
                                                     ignorableFlags [cp] |= 0x02;
                                             if (IsIgnorableNonSpacing (cp))
                                                     ignorableFlags [cp] |= 0x04;
                                     }
                             }
1229                 }
1230
1231                 void ModifyParsedValues ()
1232                 {
                             // Adjusts the parsed tables in three steps:
                             //   1. secondary weights (diacritical) for numbers,
                             //   2. fixed diacritical values for two Korean ranges,
                             //   3. renaming or removal of some sortableCharNames entries.

                             // 1. Numbers. numberSecondaryWeightBounds is read as
                             // (start, end) pairs; the first pair gets 0x38 and each
                             // following pair the next value. `end` is exclusive,
                             // and only Char.IsNumber characters are touched.
                             int [] bounds = numberSecondaryWeightBounds;
                             byte secondary = 0x38;
                             for (int pair = 0; pair < bounds.Length; pair += 2) {
                                     int start = bounds [pair];
                                     int end = bounds [pair + 1];
                                     for (int cp = start; cp < end; cp++) {
                                             if (Char.IsNumber ((char) cp))
                                                     diacritical [cp] = secondary;
                                     }
                                     secondary++;
                             }

                             // 2. Korean parens numbers
                             for (int cp = 0x3200; cp < 0x321D; cp++)
                                     diacritical [cp] = 0x0A;
                             for (int cp = 0x3260; cp < 0x327C; cp++)
                                     diacritical [cp] = 0x0C;

                             // 3. Update name part of named characters. The index
                             // only advances when the current entry was kept, since
                             // RemoveAt shifts the next entry into the same slot.
                             int index = 0;
                             while (index < sortableCharNames.Count) {
                                     DictionaryEntry entry =
                                             (DictionaryEntry) sortableCharNames [index];
                                     int cp = (int) entry.Key;
                                     string renamed = null;
                                     bool drop = false;
                                     switch (cp) {
                                     case 0x2101:
                                             renamed = "A_1";
                                             break;
                                     case 0x33C3:
                                             renamed = "A_2";
                                             break;
                                     case 0x2105:
                                             renamed = "C_1";
                                             break;
                                     case 0x2106:
                                             renamed = "C_2";
                                             break;
                                     case 0x211E:
                                             renamed = "R1";
                                             break;
                                     case 0x211F:
                                             renamed = "R2";
                                             break;
                                     // Remove some of them!
                                     case 0x2103: case 0x2109: case 0x2116:
                                     case 0x2117: case 0x2118: case 0x2125:
                                     case 0x2127: case 0x2129: case 0x212E:
                                     case 0x2132:
                                             drop = true;
                                             break;
                                     }
                                     if (drop) {
                                             sortableCharNames.RemoveAt (index);
                                             continue;
                                     }
                                     if (renamed != null)
                                             sortableCharNames [index] =
                                                     new DictionaryEntry (cp, renamed);
                                     index++;
                             }
1279                 }
1280
1281                 void GenerateCore ()
1282                 {
1283                         UnicodeCategory uc;
1284
1285                         #region Specially ignored // 01
1286                         // This will raise "Defined" flag up.
1287                         foreach (char c in specialIgnore)
1288                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1289                         #endregion
1290
1291
1292                         #region Variable weights
1293                         // Controls : 06 03 - 06 3D
1294                         fillIndex [6] = 3;
1295                         for (int i = 0; i < 65536; i++) {
1296                                 if (IsIgnorable (i))
1297                                         continue;
1298                                 char c = (char) i;
1299                                 uc = Char.GetUnicodeCategory (c);
1300                                 // NEL is whitespace but not ignored here.
1301                                 if (uc == UnicodeCategory.Control &&
1302                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1303                                         AddCharMap (c, 6, 1);
1304                         }
1305
1306                         // Apostrophe 06 80
1307                         fillIndex [6] = 0x80;
1308                         AddCharMapGroup ('\'', 6, 1, 0);
1309                         AddCharMap ('\uFE63', 6, 1);
1310
1311                         // Hyphen/Dash : 06 81 - 06 90
1312                         for (int i = 0; i < char.MaxValue; i++) {
1313                                 if (Char.GetUnicodeCategory ((char) i)
1314                                         == UnicodeCategory.DashPunctuation)
1315                                         AddCharMapGroupTail ((char) i, 6, 1);
1316                         }
1317
1318                         // Arabic variable weight chars 06 A0 -
1319                         fillIndex [6] = 0xA0;
1320                         // vowels
1321                         for (int i = 0x64B; i <= 0x650; i++)
1322                                 AddCharMapGroupTail ((char) i, 6, 1);
1323                         // sukun
1324                         AddCharMapGroup ('\u0652', 6, 1, 0);
1325                         // shadda
1326                         AddCharMapGroup ('\u0651', 6, 1, 0);
1327                         #endregion
1328
1329
1330                         #region Nonspacing marks // 01
1331                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1332
1333                         // Combining diacritical marks: 01 DC -
1334
1335                         fillIndex [0x1] = 0x41;
1336                         for (int i = 0x030E; i <= 0x0326; i++)
1337                                 if (!IsIgnorable (i))
1338                                         AddCharMap ((char) i, 0x1, 1);
1339                         for (int i = 0x0329; i <= 0x0334; i++)
1340                                 if (!IsIgnorable (i))
1341                                         AddCharMap ((char) i, 0x1, 1);
1342                         for (int i = 0x0339; i <= 0x0341; i++)
1343                                 if (!IsIgnorable (i))
1344                                         AddCharMap ((char) i, 0x1, 1);
1345                         fillIndex [0x1] = 0x72;
1346                         for (int i = 0x0346; i <= 0x0348; i++)
1347                                 if (!IsIgnorable (i))
1348                                         AddCharMap ((char) i, 0x1, 1);
1349                         for (int i = 0x02BE; i <= 0x02BF; i++)
1350                                 if (!IsIgnorable (i))
1351                                         AddCharMap ((char) i, 0x1, 1);
1352                         for (int i = 0x02C1; i <= 0x02C5; i++)
1353                                 if (!IsIgnorable (i))
1354                                         AddCharMap ((char) i, 0x1, 1);
1355                         for (int i = 0x02CE; i <= 0x02CF; i++)
1356                                 if (!IsIgnorable (i))
1357                                         AddCharMap ((char) i, 0x1, 1);
1358                         for (int i = 0x02D1; i <= 0x02D3; i++)
1359                                 if (!IsIgnorable (i))
1360                                         AddCharMap ((char) i, 0x1, 1);
1361                         AddCharMap ('\u02DE', 0x1, 1);
1362                         for (int i = 0x02E4; i <= 0x02E9; i++)
1363                                 if (!IsIgnorable (i))
1364                                         AddCharMap ((char) i, 0x1, 1);
1365
1366                         // LAMESPEC: It should not stop at '\u20E1'. There are
1367                         // a few more characters (that however results in
1368                         // overflow of level 2 unless we start before 0xDD).
1369                         fillIndex [0x1] = 0xDC;
1370                         for (int i = 0x20d0; i <= 0x20e1; i++)
1371                                 AddCharMap ((char) i, 0x1, 1);
1372                         #endregion
1373
1374
1375                         #region Whitespaces // 07 03 -
1376                         fillIndex [0x7] = 0x2;
1377                         AddCharMap (' ', 0x7, 2);
1378                         AddCharMap ('\u00A0', 0x7, 1);
1379                         for (int i = 9; i <= 0xD; i++)
1380                                 AddCharMap ((char) i, 0x7, 1);
1381                         for (int i = 0x2000; i <= 0x200B; i++)
1382                                 AddCharMap ((char) i, 0x7, 1);
1383
1384                         fillIndex [0x7] = 0x17;
1385                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1386                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1387
1388                         // Characters which used to represent layout control.
1389                         // LAMESPEC: Windows developers seem to have thought
1390                         // that those characters are kind of whitespaces,
1391                         // while they aren't.
1392                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1393                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1394                         #endregion
1395
1396                         // FIXME: 09 should be more complete.
1397                         fillIndex [0x9] = 2;
1398                         // misc tech mark
1399                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1400                                 AddCharMap ((char) cp, 0x9, 1, 0);
1401
1402                         // arrows
1403                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1404                         foreach (DictionaryEntry de in arrowValues) {
1405                                 int idx = (int) de.Value;
1406                                 int cp = (int) de.Key;
1407                                 if (map [cp].Defined)
1408                                         continue;
1409                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1410                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1411                                 arrowLv2 [idx]++;
1412                         }
1413                         // boxes
1414                         byte [] boxLv2 = new byte [128];
1415                         for (int i = 0; i < boxLv2.Length; i++)
1416                                 boxLv2 [i] = 3;
1417                         foreach (DictionaryEntry de in boxValues) {
1418                                 int cp = (int) de.Key;
1419                                 int idx = (int) de.Value;
1420                                 if (map [cp].Defined)
1421                                         continue;
1422                                 fillIndex [0x9] = (byte) (0xE5 + idx);
1423                                 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1424                                 boxLv2 [idx]++;
1425                         }
1426                         // Some special characters (slanted)
1427                         fillIndex [0x9] = 0xF4;
1428                         AddCharMap ('\u2571', 0x9, 3);
1429                         AddCharMap ('\u2572', 0x9, 3);
1430                         AddCharMap ('\u2573', 0x9, 3);
1431
1432                         // FIXME: implement 0A
1433                         #region Symbols
1434                         fillIndex [0xA] = 2;
1435                         // byte currency symbols
1436                         for (int cp = 0; cp < 0x100; cp++) {
1437                                 uc = Char.GetUnicodeCategory ((char) cp);
1438                                 if (!IsIgnorable (cp) &&
1439                                         uc == UnicodeCategory.CurrencySymbol &&
1440                                         cp != '$')
1441                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1442                         }
1443                         // byte other symbols
1444                         for (int cp = 0; cp < 0x100; cp++) {
1445                                 if (cp == 0xA6)
1446                                         continue; // SPECIAL: skip FIXME: why?
1447                                 uc = Char.GetUnicodeCategory ((char) cp);
1448                                 if (!IsIgnorable (cp) &&
1449                                         uc == UnicodeCategory.OtherSymbol)
1450                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1451                         }
1452
1453                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1454                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1455                                 AddCharMap ((char) cp, 0xA, 1, 0);
1456                         // Dingbats
1457                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1458                                 if (Char.IsSymbol ((char) cp))
1459                                         AddCharMap ((char) cp, 0xA, 1, 0);
1460                         // OCR
1461                         for (int i = 0x2440; i < 0x2460; i++)
1462                                 AddCharMap ((char) i, 0xA, 1, 0);
1463
1464                         #endregion
1465
1466                         #region Numbers // 0C 02 - 0C E1
1467                         fillIndex [0xC] = 2;
1468
1469                         // 9F8 : Bengali "one less than the denominator"
1470                         AddCharMap ('\u09F8', 0xC, 1);
1471
1472                         ArrayList numbers = new ArrayList ();
1473                         for (int i = 0; i < 65536; i++)
1474                                 if (!IsIgnorable (i) &&
1475                                         Char.IsNumber ((char) i) &&
1476                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1477                                         numbers.Add (i);
1478
1479                         ArrayList numberValues = new ArrayList ();
1480                         foreach (int i in numbers)
1481                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1482                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1483
1484 //foreach (DictionaryEntry de in numberValues)
1485 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1486
1487                         decimal prevValue = -1;
1488                         foreach (DictionaryEntry de in numberValues) {
1489                                 int cp = (int) de.Key;
1490                                 decimal currValue = (decimal) de.Value;
1491                                 bool addnew = false;
1492                                 if (prevValue < currValue &&
1493                                         prevValue - (int) prevValue == 0 &&
1494                                         prevValue >= 1) {
1495
1496                                         addnew = true;
1497                                         // Process Hangzhou and Roman numbers
1498
1499                                         // There are some SPECIAL cases.
1500                                         if (currValue != 4) // no increment for 4
1501                                                 fillIndex [0xC]++;
1502
1503                                         int xcp;
1504                                         xcp = (int) prevValue + 0x2170 - 1;
1505                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1506                                         xcp = (int) prevValue + 0x2160 - 1;
1507                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1508                                         fillIndex [0xC] += 2;
1509                                         xcp = (int) prevValue + 0x3021 - 1;
1510                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1511                                         fillIndex [0xC]++;
1512                                 }
1513                                 if (prevValue < currValue)
1514                                         prevValue = currValue;
1515                                 if (map [cp].Defined)
1516                                         continue;
1517                                 // HangZhou and Roman are added later
1518                                 // (code is above)
1519                                 else if (0x3021 <= cp && cp < 0x302A
1520                                         || 0x2160 <= cp && cp < 0x216A
1521                                         || 0x2170 <= cp && cp < 0x217A)
1522                                         continue;
1523
1524                                 if (cp ==  0x215B) // FIXME: why?
1525                                         fillIndex [0xC] += 2;
1526                                 else if (cp == 0x3021) // FIXME: why?
1527                                         fillIndex [0xC]++;
1528                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1529
1530                                 if (addnew || cp <= '9') {
1531                                         int xcp;
1532                                         if (1 <= currValue && currValue <= 10) {
1533                                                 xcp = cp - 0x31 + 0x2776;
1534                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1535                                                 xcp = cp - 0x31 + 0x2780;
1536                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1537                                                 xcp = cp - 0x31 + 0x278A;
1538                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1539                                         }
1540                                         if (1 <= currValue && currValue <= 20) {
1541                                                 xcp = cp - 0x31 + 0x2460;
1542                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1543                                                 xcp = cp - 0x31 + 0x2474;
1544                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1545                                                 xcp = cp - 0x31 + 0x2488;
1546                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1547                                         }
1548                                 }
1549
1550                                 if (cp != 0x09E7 && cp != 0x09EA)
1551                                         fillIndex [0xC]++;
1552
1553                                 // Add special cases that are not regarded as
1554                                 // numbers in UnicodeCategory speak.
1555                                 if (cp == '5') {
1556                                         // TONE FIVE
1557                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1558                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1559                                 }
1560                                 else if (cp == '6') // FIXME: why?
1561                                         fillIndex [0xC]++;
1562                         }
1563
1564                         // 221E: infinity
1565                         fillIndex [0xC] = 0xFF;
1566                         AddCharMap ('\u221E', 0xC, 1);
1567                         #endregion
1568
1569                         #region Letters and NonSpacing Marks (general)
1570
1571                         // ASCII Latin alphabets
1572                         for (int i = 0; i < alphabets.Length; i++)
1573                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1574
1575
1576                         // non-ASCII Latin alphabets
1577                         // FIXME: there are no such characters that are placed
1578                         // *after* "alphabets" array items. This is nothing
1579                         // more than a hack that creates dummy weight for
1580                         // primary characters.
1581                         for (int i = 0x0080; i < 0x0300; i++) {
1582                                 if (!Char.IsLetter ((char) i))
1583                                         continue;
1584                                 // Latin letters which have an NFKD mapping are
1585                                 // not added as independent primary characters.
1586                                 if (decompIndex [i] != 0)
1587                                         continue;
1588                                 // SPECIAL CASES:
1589                                 // 1.some alphabets have primarily
1590                                 //   equivalent ASCII alphabets.
1591                                 // 2.some have independent primary weights,
1592                                 //   but inside a-to-z range.
1593                                 // 3.there are some expanded characters that
1594                                 //   are not part of Unicode Standard NFKD.
1595                                 switch (i) {
1596                                 // 1. skipping them does not make sense
1597 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
1598 //                              case 0x184: case 0x185: case 0x186: case 0x189:
1599 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
1600 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
1601 //                              case 0x19B: case 0x19C:
1602                                 // 2. skipping them does not make sense
1603 //                              case 0x14A: // Ng
1604 //                              case 0x14B: // ng
1605                                 // 3.
1606                                 case 0xC6: // AE
1607                                 case 0xE6: // ae
1608                                 case 0xDE: // Icelandic Thorn
1609                                 case 0xFE: // Icelandic Thorn
1610                                 case 0xDF: // German ss
1611                                 case 0xFF: // German ss
1612                                 // not classified yet
1613 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1614 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1615 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1616 //                              case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1617 //                              case 0x1DD:
1618                                         continue;
1619                                 }
1620                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
1621                         }
1622
1623                         // Greek and Coptic
1624                         fillIndex [0xF] = 02;
1625                         for (int i = 0x0380; i < 0x0390; i++)
1626                                 if (Char.IsLetter ((char) i))
1627                                         AddLetterMap ((char) i, 0xF, 1);
1628                         fillIndex [0xF] = 02;
1629                         for (int i = 0x0391; i < 0x03CF; i++)
1630                                 if (Char.IsLetter ((char) i))
1631                                         AddLetterMap ((char) i, 0xF, 1);
1632                         fillIndex [0xF] = 0x40;
1633                         for (int i = 0x03D0; i < 0x0400; i++)
1634                                 if (Char.IsLetter ((char) i))
1635                                         AddLetterMap ((char) i, 0xF, 1);
1636
1637                         // Cyrillic - UCA order w/ some modification
1638                         fillIndex [0x10] = 0x3;
1639                         // table which is mostly from UCA DUCET.
1640                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1641                                 char c = orderedCyrillic [i];
1642                                 if (Char.IsLetter (c))
1643                                         AddLetterMap (c, 0x10, 3);
1644                         }
1645                         for (int i = 0x0460; i < 0x0481; i++) {
1646                                 if (Char.IsLetter ((char) i))
1647                                         AddLetterMap ((char) i, 0x10, 3);
1648                         }
1649
1650                         // Armenian
1651                         fillIndex [0x11] = 0x3;
1652                         for (int i = 0x0531; i < 0x0586; i++)
1653                                 if (Char.IsLetter ((char) i))
1654                                         AddLetterMap ((char) i, 0x11, 1);
1655
1656                         // Hebrew
1657                         // -Letters
1658                         fillIndex [0x12] = 0x3;
1659                         for (int i = 0x05D0; i < 0x05FF; i++)
1660                                 if (Char.IsLetter ((char) i))
1661                                         AddLetterMap ((char) i, 0x12, 1);
1662                         // -Accents
1663                         fillIndex [0x1] = 0x3;
1664                         for (int i = 0x0591; i <= 0x05C2; i++)
1665                                 if (i != 0x05BE)
1666                                         AddCharMap ((char) i, 0x1, 1);
1667
1668                         // Arabic
1669                         fillIndex [0x1] = 0x8E;
1670                         fillIndex [0x13] = 0x3;
1671                         for (int i = 0x0621; i <= 0x064A; i++) {
1672                                 // Abjad
1673                                 if (Char.GetUnicodeCategory ((char) i)
1674                                         != UnicodeCategory.OtherLetter) {
1675                                         // FIXME: arabic nonspacing marks are
1676                                         // in different order.
1677                                         AddCharMap ((char) i, 0x1, 1);
1678                                         continue;
1679                                 }
1680 //                              map [i] = new CharMapEntry (0x13,
1681 //                                      (byte) arabicLetterPrimaryValues [i], 1);
1682                                 fillIndex [0x13] =
1683                                         (byte) arabicLetterPrimaryValues [i];
1684                                 AddLetterMap ((char) i, 0x13, 0);
1685                         }
1686                         fillIndex [0x13] = 0x84;
1687                         for (int i = 0x0674; i < 0x06D6; i++)
1688                                 if (Char.IsLetter ((char) i))
1689                                         AddLetterMap ((char) i, 0x13, 1);
1690
1691                         // Devanagari
1692                         // FIXME: it does seem to be a straight codepoint mapping.
1693                         fillIndex [0x14] = 04;
1694                         for (int i = 0x0901; i < 0x0905; i++)
1695                                 if (!IsIgnorable (i))
1696                                         AddLetterMap ((char) i, 0x14, 2);
1697                         fillIndex [0x14] = 0xB;
1698                         for (int i = 0x0905; i < 0x093A; i++)
1699                                 if (Char.IsLetter ((char) i))
1700                                         AddLetterMap ((char) i, 0x14, 4);
1701                         for (int i = 0x093E; i < 0x094F; i++)
1702                                 if (!IsIgnorable (i))
1703                                         AddLetterMap ((char) i, 0x14, 2);
1704
1705                         // Bengali
1706                         // -Letters
1707                         fillIndex [0x15] = 02;
1708                         for (int i = 0x0980; i < 0x9FF; i++) {
1709                                 if (IsIgnorable (i))
1710                                         continue;
1711                                 if (i == 0x09E0)
1712                                         fillIndex [0x15] = 0x3B;
1713                                 switch (Char.GetUnicodeCategory ((char) i)) {
1714                                 case UnicodeCategory.NonSpacingMark:
1715                                 case UnicodeCategory.DecimalDigitNumber:
1716                                 case UnicodeCategory.OtherNumber:
1717                                         continue;
1718                                 }
1719                                 AddLetterMap ((char) i, 0x15, 1);
1720                         }
1721                         // -Signs
1722                         fillIndex [0x1] = 0x3;
1723                         for (int i = 0x0981; i < 0x0A00; i++)
1724                                 if (Char.GetUnicodeCategory ((char) i) ==
1725                                         UnicodeCategory.NonSpacingMark)
1726                                         AddCharMap ((char) i, 0x1, 1);
1727
1728                         // Gurmukhi. orderedGurmukhi is from UCA
1729                         // FIXME: it does not look equivalent to UCA.
1730                         fillIndex [0x1] = 03;
1731                         fillIndex [0x16] = 02;
1732                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
1733                                 char c = orderedGurmukhi [i];
1734                                 if (IsIgnorable ((int) c))
1735                                         continue;
1736                                 if (!Char.IsLetter (c)) {
1737                                         AddLetterMap (c, 0x1, 1);
1738                                         continue;
1739                                 }
1740                                 if (c == '\u0A3C' || c == '\u0A4D' ||
1741                                         '\u0A66' <= c && c <= '\u0A71')
1742                                         continue;
1743                                 AddLetterMap (c, 0x16, 4);
1744                         }
1745
1746                         // Gujarati. orderedGujarati is from UCA
1747                         fillIndex [0x17] = 02;
1748                         for (int i = 0; i < orderedGujarati.Length; i++)
1749                                 AddLetterMap (orderedGujarati [i], 0x17, 4);
1750
1751                         // Oriya
1752                         fillIndex [0x18] = 02;
1753                         for (int i = 0x0B00; i < 0x0B7F; i++) {
1754                                 switch (Char.GetUnicodeCategory ((char) i)) {
1755                                 case UnicodeCategory.NonSpacingMark:
1756                                 case UnicodeCategory.DecimalDigitNumber:
1757                                         continue;
1758                                 }
1759                                 AddLetterMap ((char) i, 0x18, 1);
1760                         }
1761
1762                         // Tamil
1763                         fillIndex [0x19] = 2;
1764                         AddCharMap ('\u0BD7', 0x19, 0);
1765                         fillIndex [0x19] = 0xA;
1766                         // vowels
1767                         for (int i = 0x0BD7; i < 0x0B94; i++)
1768                                 if (Char.IsLetter ((char) i))
1769                                         AddCharMap ((char) i, 0x19, 2);
1770                         // special vowel
1771                         fillIndex [0x19] = 0x24;
1772                         AddCharMap ('\u0B94', 0x19, 0);
1773                         fillIndex [0x19] = 0x26;
1774                         // The array for Tamil consonants is a constant.
1775                         // Windows has a sequence almost similar to TAM from
1776                         // tamilnet, but a bit different in Grantha.
1777                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
1778                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1779                         // combining marks
1780                         fillIndex [0x19] = 0x82;
1781                         for (int i = 0x0BBE; i < 0x0BCD; i++)
1782                                 if (Char.GetUnicodeCategory ((char) i) ==
1783                                         UnicodeCategory.SpacingCombiningMark
1784                                         || i == 0x0BC0)
1785                                         AddLetterMap ((char) i, 0x19, 2);
1786
1787                         // Telugu
1788                         fillIndex [0x1A] = 0x4;
1789                         for (int i = 0x0C00; i < 0x0C62; i++) {
1790                                 if (i == 0x0C55 || i == 0x0C56)
1791                                         continue; // skip
1792                                 AddCharMap ((char) i, 0x1A, 3);
1793                                 char supp = (i == 0x0C0B) ? '\u0C60':
1794                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
1795                                 if (supp == char.MinValue)
1796                                         continue;
1797                                 AddCharMap (supp, 0x1A, 3);
1798                         }
1799
1800                         // Kannada
1801                         fillIndex [0x1B] = 4;
1802                         for (int i = 0x0C80; i < 0x0CE5; i++) {
1803                                 if (i == 0x0CD5 || i == 0x0CD6)
1804                                         continue; // ignore
1805                                 AddCharMap ((char) i, 0x1B, 3);
1806                         }
1807
1808                         // Malayalam
1809                         fillIndex [0x1C] = 2;
1810                         for (int i = 0x0D02; i < 0x0D61; i++)
1811                                 // FIXME: I avoided MSCompatUnicodeTable usage
1812                                 // here (it results in recursion). So check if
1813                                 // using NonSpacingMark makes sense or not.
1814                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1815 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1816                                         AddCharMap ((char) i, 0x1C, 1);
1817
1818                         // Thai ... note that it breaks 0x1E wall after E2B!
1819                         // Also, all Thai characters have level 2 value 3.
1820                         fillIndex [0x1E] = 2;
1821                         for (int i = 0xE44; i < 0xE48; i++)
1822                                 AddCharMap ((char) i, 0x1E, 1, 3);
1823                         for (int i = 0xE01; i < 0xE2B; i++)
1824                                 AddCharMap ((char) i, 0x1E, 6, 0);
1825                         fillIndex [0x1F] = 5;
1826                         for (int i = 0xE2B; i < 0xE30; i++)
1827                                 AddCharMap ((char) i, 0x1F, 6, 0);
1828                         for (int i = 0xE30; i < 0xE3B; i++)
1829                                 AddCharMap ((char) i, 0x1F, 1, 3);
1830                         // some Thai characters remain.
1831                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
1832                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1833                         foreach (char c in specialThai)
1834                                 AddCharMap (c, 0x1F, 1);
1835
1836                         // Lao
1837                         fillIndex [0x1F] = 2;
1838                         for (int i = 0xE80; i < 0xEDF; i++)
1839                                 if (Char.IsLetter ((char) i))
1840                                         AddCharMap ((char) i, 0x1F, 1);
1841
1842                         // Georgian. orderedGeorgian is from UCA DUCET.
1843                         fillIndex [0x21] = 5;
1844                         for (int i = 0; i < orderedGeorgian.Length; i++)
1845                                 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1846
1847                         // Japanese Kana.
1848                         fillIndex [0x22] = 2;
1849                         int kanaOffset = 0x3041;
1850                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1851
1852                         for (int gyo = 0; gyo < 9; gyo++) {
1853                                 for (int dan = 0; dan < 5; dan++) {
1854                                         if (gyo == 7 && dan % 2 == 1) {
1855                                                 // 'ya'-gyo
1856                                                 fillIndex [0x22]++;
1857                                                 kanaOffset -= 2; // There is no space for yi and ye.
1858                                                 continue;
1859                                         }
1860                                         int cp = kanaOffset + dan * kanaLines [gyo];
1861                                         // small lines (a-gyo, ya-gyo)
1862                                         if (gyo == 0 || gyo == 7) {
1863                                                 AddKanaMap (cp, 1); // small
1864                                                 AddKanaMap (cp + 1, 1);
1865                                         }
1866                                         else
1867                                                 AddKanaMap (cp, kanaLines [gyo]);
1868                                         fillIndex [0x22]++;
1869
1870                                         if (cp == 0x3061) {
1871                                                 // add small 'Tsu' (before normal one)
1872                                                 AddKanaMap (0x3063, 1);
1873                                                 kanaOffset++;
1874                                         }
1875                                 }
1876                                 fillIndex [0x22] += 3;
1877                                 kanaOffset += 5 * kanaLines [gyo];
1878                         }
1879
1880                         // Wa-gyo is almost special, so I just manually add.
1881                         AddLetterMap ((char) 0x308E, 0x22, 0);
1882                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1883                         AddLetterMap ((char) 0x308F, 0x22, 0);
1884                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1885                         fillIndex [0x22]++;
1886                         AddLetterMap ((char) 0x3090, 0x22, 0);
1887                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1888                         fillIndex [0x22] += 2;
1889                         // no "Wu" in Japanese.
1890                         AddLetterMap ((char) 0x3091, 0x22, 0);
1891                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1892                         fillIndex [0x22]++;
1893                         AddLetterMap ((char) 0x3092, 0x22, 0);
1894                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1895                         // Nn
1896                         fillIndex [0x22] = 0x80;
1897                         AddLetterMap ((char) 0x3093, 0x22, 0);
1898                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1899
1900                         // JIS Japanese square chars.
1901                         fillIndex [0x22] = 0x97;
1902                         jisJapanese.Sort (JISComparer.Instance);
1903                         foreach (JISCharacter j in jisJapanese)
1904                                 AddCharMap ((char) j.CP, 0x22, 1);
1905                         // non-JIS Japanese square chars.
1906                         nonJisJapanese.Sort (NonJISComparer.Instance);
1907                         foreach (NonJISCharacter j in nonJisJapanese)
1908                                 AddCharMap ((char) j.CP, 0x22, 1);
1909
1910                         // Bopomofo
1911                         fillIndex [0x23] = 0x02;
1912                         for (int i = 0x3105; i <= 0x312C; i++)
1913                                 AddCharMap ((char) i, 0x23, 1);
1914
1915                         // Estrangela: ancient Syriac
1916                         fillIndex [0x24] = 0x0B;
1917                         // FIXME: is 0x71E really alternative form?
1918                         ArrayList syriacAlternatives = new ArrayList (
1919                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1920                         for (int i = 0x0710; i <= 0x072C; i++) {
1921                                 if (i == 0x0711) // NonSpacingMark
1922                                         continue;
1923                                 if (syriacAlternatives.Contains (i))
1924                                         continue;
1925                                 AddCharMap ((char) i, 0x24, 4);
1926                                 // FIXME: why?
1927                                 if (i == 0x721)
1928                                         fillIndex [0x24]++;
1929                         }
1930                         foreach (int cp in syriacAlternatives)
1931                                 map [cp] = new CharMapEntry (0x24,
1932                                         (byte) (map [cp - 1].Level1 + 2),
1933                                         0);
1934
1935                         // Thaana
1936                         // FIXME: it turned out that it does not look like UCA
1937                         fillIndex [0x24] = 0x6E;
1938                         for (int i = 0; i < orderedThaana.Length; i++) {
1939                                 if (IsIgnorableNonSpacing (i))
1940                                         continue;
1941                                 AddCharMap (orderedThaana [i], 0x24, 2);
1942                         }
1943                         #endregion
1944
1945                         // FIXME: Add more culture-specific letters (that are
1946                         // not supported in Windows collation) here.
1947
1948                         // Surrogate ... they are computed.
1949
1950                         #region Hangul
1951                         // Hangul.
1952                         //
1953                         // Unlike UCA, the Windows Hangul sequence mixes Jongseong
1954                         // with the Choseong sequence as well as Jungseong,
1955                         // adjusted to have the same primary weight for the
1956                         // same base character. So it is impossible to compute
1957                         // those sort keys.
1958                         //
1959                         // Here I introduce an ordered sequence of mixed
1960                         // 'commands' and 'characters' that is similar to
1961                         // LDML text:
1962                         //      - ',' increases primary weight.
1963                         //      - [A B] means a range, increasing index
1964                         //      - {A B} means a range, without increasing index
1965                         //      - '=' is no operation (it means the characters
1966                         //        of both sides have the same weight).
1967                         //      - '>' inserts a Hangul Syllable block that
1968                         //        contains 0x251 characters.
1969                         //      - '<' decreases the index
1970                         //      - '0'-'9' means skip count
1971                         //      - whitespaces are ignored
1972                         //
1973
1974                         string hangulSequence =
1975                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
1976                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
1977                         + "<{\u1113 \u1116}, \u3165,"
1978                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
1979                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
1980                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
1981                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
1982                                 + "[\u11D1 \u11D2], \u11B2,"
1983                                 + "[\u11D3 \u11D5], \u11B3,"
1984                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
1985                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
1986                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
1987                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
1988                         + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
1989                                 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
1990                         + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
1991                                 + "\u11EA,, \u110A=\u11BB,,, >"
1992                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
1993                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
1994                         + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
1995                                 + "\u11F1,, \u11F2,,,"
1996                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
1997                         + "<\u114D, \u110D,,  >"
1998                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
1999                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2000                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
2001                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2002                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2003                                 + "[\u11F5 \u11F8]"
2004                         ;
2005
2006                         byte hangulCat = 0x52;
2007                         fillIndex [hangulCat] = 0x2;
2008
2009                         int syllableBlock = 0;
2010                         for (int n = 0; n < hangulSequence.Length; n++) {
2011                                 char c = hangulSequence [n];
2012                                 int start, end;
2013                                 if (Char.IsWhiteSpace (c))
2014                                         continue;
2015                                 switch (c) {
2016                                 case '=':
2017                                         break; // NOP
2018                                 case ',':
2019                                         IncrementSequentialIndex (ref hangulCat);
2020                                         break;
2021                                 case '<':
2022                                         if (fillIndex [hangulCat] == 2)
2023                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2024                                         fillIndex [hangulCat]--;
2025                                         break;
2026                                 case '>':
2027                                         IncrementSequentialIndex (ref hangulCat);
2028                                         for (int l = 0; l < 0x15; l++)
2029                                                 for (int v = 0; v < 0x1C; v++) {
2030                                                         AddCharMap (
2031                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2032                                                         IncrementSequentialIndex (ref hangulCat);
2033                                                 }
2034                                         syllableBlock++;
2035                                         break;
2036                                 case '[':
2037                                         start = hangulSequence [n + 1];
2038                                         end = hangulSequence [n + 3];
2039                                         for (int i = start; i <= end; i++) {
2040                                                 AddCharMap ((char) i, hangulCat, 0);
2041                                                 if (end > i)
2042                                                         IncrementSequentialIndex (ref hangulCat);
2043                                         }
2044                                         n += 4; // consumes 5 characters for this operation
2045                                         break;
2046                                 case '{':
2047                                         start = hangulSequence [n + 1];
2048                                         end = hangulSequence [n + 3];
2049                                         for (int i = start; i <= end; i++)
2050                                                 AddCharMap ((char) i, hangulCat, 0);
2051                                         n += 4; // consumes 5 characters for this operation
2052                                         break;
2053                                 default:
2054                                         AddCharMap (c, hangulCat, 0);
2055                                         break;
2056                                 }
2057                         }
2058
2059                         #endregion
2060
2061                         // Letterlike characters and CJK compatibility square
2062                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2063                         int [] counts = new int ['Z' - 'A' + 1];
2064                         char [] namedChars = new char [sortableCharNames.Count];
2065                         int nCharNames = 0;
2066                         foreach (DictionaryEntry de in sortableCharNames) {
2067                                 counts [((string) de.Value) [0] - 'A']++;
2068                                 namedChars [nCharNames++] = (char) ((int) de.Key);
2069                         }
2070                         nCharNames = 0; // reset
2071                         for (int a = 0; a < counts.Length; a++) {
2072                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2073                                 for (int i = 0; i < counts [a]; i++)
2074 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2075                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
2076                         }
2077
2078                         // CJK unified ideograph.
2079                         byte cjkCat = 0x9E;
2080                         fillIndex [cjkCat] = 0x2;
2081                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2082                                 if (!IsIgnorable (cp))
2083                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2084                         // CJK Extensions go here.
2085                         // LAMESPEC: With this Windows style CJK layout, it is
2086                         // impossible to add more CJK ideographs, i.e. 0x9FA6-
2087                         // 0x9FBB can never be added w/o breaking compat.
2088                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2089                                 if (!IsIgnorable (cp))
2090                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
2091
2092                         // PrivateUse ... computed.
2093                         // remaining Surrogate ... computed.
2094
2095                         #region Special "biggest" area (FF FF)
2096                         fillIndex [0xFF] = 0xFF;
2097                         char [] specialBiggest = new char [] {
2098                                 '\u3005', '\u3031', '\u3032', '\u309D',
2099                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2100                                 '\uFE7C', '\uFE7D', '\uFF70'};
2101                         foreach (char c in specialBiggest)
2102                                 AddCharMap (c, 0xFF, 0);
2103                         #endregion
2104
2105                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2106                         // non-alphanumeric ASCII except for: + - < = > '
2107                         for (int i = 0x21; i < 0x7F; i++) {
2108                                 if (Char.IsLetterOrDigit ((char) i)
2109                                         || "+-<=>'".IndexOf ((char) i) >= 0)
2110                                         continue; // they are not added here.
2111                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2112                                 // Insert 3001 after ',' and 3002 after '.'
2113                                 if (i == 0x2C)
2114                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2115                                 else if (i == 0x2E) {
2116                                         fillIndex [0x7]--;
2117                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2118                                 }
2119                                 else if (i == 0x3A)
2120                                         AddCharMap ('\uFE30', 0x7, 1, 0);
2121                         }
2122                         #endregion
2123
2124                         #region 07 - Punctuations and something else
2125                         for (int i = 0xA0; i < char.MaxValue; i++) {
2126                                 if (IsIgnorable (i))
2127                                         continue;
2128
2129                                 // SPECIAL CASES:
2130                                 switch (i) {
2131                                 case 0xAB: // 08
2132                                 case 0xB7: // 0A
2133                                 case 0x2329: // 09
2134                                 case 0x232A: // 09
2135                                         continue;
2136                                 }
2137
2138                                 switch (Char.GetUnicodeCategory ((char) i)) {
2139                                 case UnicodeCategory.OtherPunctuation:
2140                                 case UnicodeCategory.ClosePunctuation:
2141                                 case UnicodeCategory.OpenPunctuation:
2142                                 case UnicodeCategory.InitialQuotePunctuation:
2143                                 case UnicodeCategory.FinalQuotePunctuation:
2144                                 case UnicodeCategory.ModifierSymbol:
2145                                         // SPECIAL CASES: // 0xA
2146                                         if (0x2020 <= i && i <= 0x2042)
2147                                                 continue;
2148                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
2149                                         break;
2150                                 default:
2151                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2152                                                 goto case UnicodeCategory.OtherPunctuation;
2153                                         break;
2154                                 }
2155                         }
2156                         // Control pictures
2157                         for (int i = 0x2400; i <= 0x2421; i++)
2158                                 AddCharMap ((char) i, 0x7, 1, 0);
2159                         #endregion
2160
2161                         // FIXME: for 07 xx we need more love.
2162
2163                         // FIXME: 08 should be more complete.
2164                         fillIndex [0x8] = 2;
2165                         for (int cp = 0; cp < char.MaxValue; cp++)
2166                                 if (!map [cp].Defined &&
2167                                         Char.GetUnicodeCategory ((char) cp) ==
2168                                         UnicodeCategory.MathSymbol)
2169                                         AddCharMapGroup ((char) cp, 0x8, 1, 0);
2170
2171                         // Characters w/ diacritical marks (NFKD)
2172                         for (int i = 0; i <= char.MaxValue; i++) {
2173                                 if (map [i].Defined || IsIgnorable (i))
2174                                         continue;
2175                                 if (decompIndex [i] == 0)
2176                                         continue;
2177
2178                                 int start = decompIndex [i];
2179                                 int primaryChar = decompValues [start];
2180                                 int secondary = 0;
2181                                 bool skip = false;
2182                                 int length = decompLength [i];
2183                                 // special processing for parenthesized ones.
2184                                 if (length == 3 &&
2185                                         decompValues [start] == '(' &&
2186                                         decompValues [start + 2] == ')') {
2187                                         primaryChar = decompValues [start + 1];
2188                                         length = 1;
2189                                 }
2190
2191                                 if (map [primaryChar].Level1 == 0)
2192                                         continue;
2193
2194                                 for (int l = 1; l < length; l++) {
2195                                         int c = decompValues [start + l];
2196                                         if (map [c].Level1 != 0)
2197                                                 skip = true;
2198                                         secondary += diacritical [c];
2199                                 }
2200                                 if (skip)
2201                                         continue;
2202                                 map [i] = new CharMapEntry (
2203                                         map [primaryChar].Category,
2204                                         map [primaryChar].Level1,
2205                                         (byte) secondary);
2206
2207                         }
2208
2209                         #region Level2 adjustment
2210                         // Arabic Hamzah
2211                         diacritical [0x624] = 0x5;
2212                         diacritical [0x626] = 0x7;
2213                         diacritical [0x622] = 0x9;
2214                         diacritical [0x623] = 0xA;
2215                         diacritical [0x625] = 0xB;
2216                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2217                         diacritical [0x64A] = 0x7; // Yaa'
2218
2219
2220                         for (int i = 0; i < char.MaxValue; i++) {
2221                                 byte mod = 0;
2222                                 byte cat = map [i].Category;
2223                                 switch (cat) {
2224                                 case 0xE: // Latin diacritics
2225                                 case 0x22: // Japanese: circled characters
2226                                         mod = diacritical [i];
2227                                         break;
2228                                 case 0x13: // Arabic
2229                                         if (diacritical [i] == 0)
2230                                                 mod = 0x8; // default for arabic
2231                                         break;
2232                                 }
2233                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2234                                         mod = diacritical [i];
2235                                 if (mod > 0)
2236                                         map [i] = new CharMapEntry (
2237                                                 cat, map [i].Level1, mod);
2238                         }
2239                         #endregion
2240
2241                         // FIXME: this is hack but those which are
2242                         // NonSpacingMark characters and still undefined
2243                         // are likely to be nonspacing.
2244                         for (int i = 0; i < char.MaxValue; i++)
2245                                 if (!map [i].Defined &&
2246                                         !IsIgnorable (i) &&
2247                                         Char.GetUnicodeCategory ((char) i) ==
2248                                         UnicodeCategory.NonSpacingMark)
2249                                         AddCharMap ((char) i, 1, 1);
2250                 }
2251
2252                 private void IncrementSequentialIndex (ref byte hangulCat)
2253                 {
2254                         fillIndex [hangulCat]++;
2255                         if (fillIndex [hangulCat] == 0) { // overflown
2256                                 hangulCat++;
2257                                 fillIndex [hangulCat] = 0x2;
2258                         }
2259                 }
2260
2261                 // Reset fillIndex to fixed value and call AddLetterMap().
2262                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2263                 {
2264                         fillIndex [category] = alphaWeight;
2265                         AddLetterMap (c, category, 0);
2266
2267                         ArrayList al = latinMap [c] as ArrayList;
2268                         if (al == null)
2269                                 return;
2270
2271                         foreach (int cp in al)
2272                                 AddLetterMap ((char) cp, category, 0);
2273                 }
2274
2275                 private void AddKanaMap (int i, byte voices)
2276                 {
2277                         for (byte b = 0; b < voices; b++) {
2278                                 char c = (char) (i + b);
2279                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
2280                                 // Hiragana
2281                                 AddLetterMapCore (c, 0x22, 0, arg);
2282                                 // Katakana
2283                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
2284                         }
2285                 }
2286
2287                 private void AddLetterMap (char c, byte category, byte updateCount)
2288                 {
2289                         AddLetterMapCore (c, category, updateCount, 0);
2290                 }
2291
2292                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2293                 {
2294                         char c2;
2295                         // <small> updates index
2296                         c2 = ToSmallForm (c);
2297                         if (c2 != c)
2298                                 AddCharMapGroup (c2, category, updateCount, level2);
2299                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2300                         if (c2 != c && !map [(int) c2].Defined)
2301                                 AddLetterMapCore (c2, category, 0, level2);
2302                         bool doUpdate = true;
2303                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2304                                 doUpdate = false;
2305                         else
2306                                 AddCharMapGroup (c, category, 0, level2);
2307                         if (doUpdate)
2308                                 fillIndex [category] += updateCount;
2309                 }
2310
2311                 private bool AddCharMap (char c, byte category, byte increment)
2312                 {
2313                         return AddCharMap (c, category, increment, 0);
2314                 }
2315
2316                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2317                 {
2318                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2319                                 return false; // do nothing
2320                         map [(int) c] = new CharMapEntry (category,
2321                                 category == 1 ? alt : fillIndex [category],
2322                                 category == 1 ? fillIndex [category] : alt);
2323                         fillIndex [category] += increment;
2324                         return true;
2325                 }
2326
2327                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2328                 {
2329                         char c2 = ToSmallFormTail (c);
2330                         if (c2 != c)
2331                                 AddCharMap (c2, category, updateCount, 0);
2332                         // itself
2333                         AddCharMap (c, category, updateCount, 0);
2334                         // <full>
2335                         c2 = ToFullWidthTail (c);
2336                         if (c2 != c)
2337                                 AddCharMapGroupTail (c2, category, updateCount);
2338                 }
2339
2340                 //
2341                 // Adds characters to table in the order below
2342                 // (+ increases weight):
2343                 //      (<small> +)
2344                 //      itself
2345                 //      <fraction>
2346                 //      <full> | <super> | <sub>
2347                 //      <circle> | <wide> (| <narrow>)
2348                 //      +
2349                 //      (vertical +)
2350                 //
2351                 // level2 is fixed (does not increase).
2352                 int [] sameWeightItems = new int [] {
2353                         DecompositionFraction,
2354                         DecompositionFull,
2355                         DecompositionSuper,
2356                         DecompositionSub,
2357                         DecompositionCircle,
2358                         DecompositionWide,
2359                         DecompositionNarrow,
2360                         };
2361                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2362                 {
2363                         if (map [(int) c].Defined)
2364                                 return;
2365
2366                         char small = char.MinValue;
2367                         char vertical = char.MinValue;
2368                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2369                         if (nfkd != null) {
2370                                 object smv = nfkd [(byte) DecompositionSmall];
2371                                 if (smv != null)
2372                                         small = (char) ((int) smv);
2373                                 object vv = nfkd [(byte) DecompositionVertical];
2374                                 if (vv != null)
2375                                         vertical = (char) ((int) vv);
2376                         }
2377
2378                         // <small> updates index
2379                         if (small != char.MinValue)
2380                                 AddCharMap (small, category, updateCount);
2381
2382                         // itself
2383                         AddCharMap (c, category, 0, level2);
2384
2385                         if (nfkd != null) {
2386                                 foreach (int weight in sameWeightItems) {
2387                                         object wv = nfkd [(byte) weight];
2388                                         if (wv != null)
2389                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
2390                                 }
2391                         }
2392
2393                         // update index here.
2394                         fillIndex [category] += updateCount;
2395
2396                         if (vertical != char.MinValue)
2397                                 AddCharMap (vertical, category, updateCount, level2);
2398                 }
2399
2400                 private void AddCharMapCJK (char c, ref byte category)
2401                 {
2402                         AddCharMap (c, category, 0, 0);
2403                         IncrementSequentialIndex (ref category);
2404
2405                         // Special. I wonder why but Windows skips 9E F9.
2406                         if (category == 0x9E && fillIndex [category] == 0xF9)
2407                                 IncrementSequentialIndex (ref category);
2408                 }
2409
2410                 private void AddCharMapGroupCJK (char c, ref byte category)
2411                 {
2412                         AddCharMapCJK (c, ref category);
2413
2414                         // LAMESPEC: see below.
2415                         if (c == '\u52DE') {
2416                                 AddCharMapCJK ('\u3298', ref category);
2417                                 AddCharMapCJK ('\u3238', ref category);
2418                         }
2419                         if (c == '\u5BEB')
2420                                 AddCharMapCJK ('\u32A2', ref category);
2421                         if (c == '\u91AB')
2422                                 // Especially this mapping order totally does
2423                                 // not make sense to me.
2424                                 AddCharMapCJK ('\u32A9', ref category);
2425
2426                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2427                         if (nfkd == null)
2428                                 return;
2429                         for (byte weight = 0; weight <= 0x12; weight++) {
2430                                 object wv = nfkd [weight];
2431                                 if (wv == null)
2432                                         continue;
2433                                 int w = (int) wv;
2434
2435                                 // Special: they are ignored in this area.
2436                                 // FIXME: check if it is sane
2437                                 if (0xF900 <= w && w <= 0xFAD9)
2438                                         continue;
2439                                 // LAMESPEC: on Windows some of CJK characters
2440                                 // in 3200-32B0 are incorrectly mapped. They
2441                                 // mix Chinise and Japanese Kanji when
2442                                 // ordering those characters.
2443                                 switch (w) {
2444                                 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2445                                         continue;
2446                                 }
2447
2448                                 AddCharMapCJK ((char) w, ref category);
2449                         }
2450                 }
2451
2452                 // For now it is only for 0x7 category.
2453                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2454                 {
2455                         char small = char.MinValue;
2456                         char vertical = char.MinValue;
2457                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2458                         if (nfkd != null) {
2459                                 object smv = nfkd [(byte) DecompositionSmall];
2460                                 if (smv != null)
2461                                         small = (char) ((int) smv);
2462                                 object vv = nfkd [(byte) DecompositionVertical];
2463                                 if (vv != null)
2464                                         vertical = (char) ((int) vv);
2465                         }
2466
2467                         // <small> updates index
2468                         if (small != char.MinValue)
2469                                 // SPECIAL CASE excluded (FIXME: why?)
2470                                 if (small != '\u2024')
2471                                         AddCharMap (small, category, updateCount);
2472
2473                         // itself
2474                         AddCharMap (c, category, updateCount, level2);
2475
2476                         // Since nfkdMap is problematic to have two or more
2477                         // NFKD to an identical character, here I iterate all.
2478                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2479                                 if (decompLength [c2] == 1 &&
2480                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
2481                                         switch (decompType [c2]) {
2482                                         case DecompositionCompat:
2483                                                 AddCharMap ((char) c2, category, updateCount, level2);
2484                                                 break;
2485                                         }
2486                                 }
2487                         }
2488
2489                         if (vertical != char.MinValue)
2490                                 // SPECIAL CASE excluded (FIXME: why?)
2491                                 if (vertical != '\uFE33' && vertical != '\uFE34')
2492                                         AddCharMap (vertical, category, updateCount, level2);
2493                 }
2494
2495                 char ToFullWidth (char c)
2496                 {
2497                         return ToDecomposed (c, DecompositionFull, false);
2498                 }
2499
2500                 char ToFullWidthTail (char c)
2501                 {
2502                         return ToDecomposed (c, DecompositionFull, true);
2503                 }
2504
2505                 char ToSmallForm (char c)
2506                 {
2507                         return ToDecomposed (c, DecompositionSmall, false);
2508                 }
2509
2510                 char ToSmallFormTail (char c)
2511                 {
2512                         return ToDecomposed (c, DecompositionSmall, true);
2513                 }
2514
2515                 char ToDecomposed (char c, byte d, bool tail)
2516                 {
2517                         if (decompType [(int) c] != d)
2518                                 return c;
2519                         int idx = decompIndex [(int) c];
2520                         if (tail)
2521                                 idx += decompLength [(int) c] - 1;
2522                         return (char) decompValues [idx];
2523                 }
2524
2525                 bool ExistsJIS (int cp)
2526                 {
2527                         foreach (JISCharacter j in jisJapanese)
2528                                 if (j.CP == cp)
2529                                         return true;
2530                         return false;
2531                 }
2532
2533                 #endregion
2534
2535                 #region Level 3 properties (Case/Width)
2536
2537                 private byte ComputeLevel3Weight (char c)
2538                 {
2539                         byte b = ComputeLevel3WeightRaw (c);
2540                         return b > 0 ? (byte) (b + 2) : b;
2541                 }
2542
2543                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2544                 {
2545                         // Korean
2546                         if ('\u11A8' <= c && c <= '\u11F9')
2547                                 return 2;
2548                         if ('\uFFA0' <= c && c <= '\uFFDC')
2549                                 return 4;
2550                         if ('\u3130' <= c && c <= '\u3164')
2551                                 return 5;
2552                         // numbers
2553                         if ('\u2776' <= c && c <= '\u277F')
2554                                 return 4;
2555                         if ('\u2780' <= c && c <= '\u2789')
2556                                 return 8;
2557                         if ('\u2776' <= c && c <= '\u2793')
2558                                 return 0xC;
2559                         if ('\u2160' <= c && c <= '\u216F')
2560                                 return 0x18;
2561                         if ('\u2181' <= c && c <= '\u2182')
2562                                 return 0x18;
2563                         // Arabic
2564                         if ('\u2135' <= c && c <= '\u2138')
2565                                 return 4;
2566                         if ('\uFE80' <= c && c < '\uFE8E') {
2567                                 // 2(Isolated)/8(Final)/0x18(Medial)
2568                                 switch (decompType [(int) c]) {
2569                                 case DecompositionIsolated:
2570                                         return 2;
2571                                 case DecompositionFinal:
2572                                         return 8;
2573                                 case DecompositionMedial:
2574                                         return 0x18;
2575                                 }
2576                         }
2577
2578                         // actually I dunno the reason why they have weights.
2579                         switch (c) {
2580                         case '\u01BC':
2581                                 return 0x10;
2582                         case '\u06A9':
2583                                 return 0x20;
2584                         case '\u06AA':
2585                                 return 0x28;
2586                         }
2587
2588                         byte ret = 0;
2589                         switch (c) {
2590                         case '\u03C2':
2591                         case '\u2104':
2592                         case '\u212B':
2593                                 ret |= 8;
2594                                 break;
2595                         case '\uFE42':
2596                                 ret |= 0xC;
2597                                 break;
2598                         }
2599
2600                         // misc
2601                         switch (decompType [(int) c]) {
2602                         case DecompositionWide: // <wide>
2603                         case DecompositionSub: // <sub>
2604                         case DecompositionSuper: // <super>
2605                                 ret |= decompType [(int) c];
2606                                 break;
2607                         }
2608                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2609                                 ret |= 8;
2610                         if (isUppercase [(int) c]) // DerivedCoreProperties
2611                                 ret |= 0x10;
2612
2613                         return ret;
2614                 }
2615
2616                 #endregion
2617
2618                 #region IsIgnorable
2619 /*
2620                 static bool IsIgnorable (int i)
2621                 {
2622                         if (unicodeAge [i] >= 3.1)
2623                                 return true;
2624                         switch (char.GetUnicodeCategory ((char) i)) {
2625                         case UnicodeCategory.OtherNotAssigned:
2626                         case UnicodeCategory.Format:
2627                                 return true;
2628                         }
2629                         return false;
2630                 }
2631 */
2632
2633                 // FIXME: In the future use DerivedAge.txt to examine character
2634                 // versions and set those ones that have higher version than
2635                 // 1.0 as ignorable.
2636                 static bool IsIgnorable (int i)
2637                 {
2638                         switch (i) {
2639                         case 0:
2640                         // I guess, those characters are added between
2641                         // Unicode 1.0 (LCMapString) and Unicode 3.1
2642                         // (UnicodeCategory), so they used to be
2643                         // something like OtherNotAssigned as of Unicode 1.1.
2644                         case 0x2df: case 0x387:
2645                         case 0x3d7: case 0x3d8: case 0x3d9:
2646                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2647                         case 0x400: case 0x40d: case 0x450: case 0x45d:
2648                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
2649                         case 0x653: case 0x654: case 0x655: case 0x66d:
2650                         case 0xb56:
2651                         case 0x1e9b: case 0x202f: case 0x20ad:
2652                         case 0x20ae: case 0x20af:
2653                         case 0x20e2: case 0x20e3:
2654                         case 0x2139: case 0x213a: case 0x2183:
2655                         case 0x2425: case 0x2426: case 0x2619:
2656                         case 0x2670: case 0x2671: case 0x3007:
2657                         case 0x3190: case 0x3191:
2658                         case 0xfffc: case 0xfffd:
2659                                 return true;
2660                         // exceptional characters filtered by the
2661                         // following conditions. Originally those exceptional
2662                         // ranges are incorrect (they should not be ignored)
2663                         // and most of those characters are unfortunately in
2664                         // those ranges.
2665                         case 0x4d8: case 0x4d9:
2666                         case 0x4e8: case 0x4e9:
2667                         case 0x3036: case 0x303f:
2668                         case 0x337b: case 0xfb1e:
2669                                 return false;
2670                         }
2671
2672                         if (
2673                                 // The whole Sinhala characters.
2674                                 0x0D82 <= i && i <= 0x0DF4
2675                                 // The whole Tibetan characters.
2676                                 || 0x0F00 <= i && i <= 0x0FD1
2677                                 // The whole Myanmar characters.
2678                                 || 0x1000 <= i && i <= 0x1059
2679                                 // The whole Etiopic, Cherokee,
2680                                 // Canadian Syllablic, Ogham, Runic,
2681                                 // Tagalog, Hanunoo, Philippine,
2682                                 // Buhid, Tagbanwa, Khmer and Mongorian
2683                                 // characters.
2684                                 || 0x1200 <= i && i <= 0x1DFF
2685                                 // Greek extension characters.
2686                                 || 0x1F00 <= i && i <= 0x1FFF
2687                                 // The whole Braille characters.
2688                                 || 0x2800 <= i && i <= 0x28FF
2689                                 // CJK radical characters.
2690                                 || 0x2E80 <= i && i <= 0x2EF3
2691                                 // Kangxi radical characters.
2692                                 || 0x2F00 <= i && i <= 0x2FD5
2693                                 // Ideographic description characters.
2694                                 || 0x2FF0 <= i && i <= 0x2FFB
2695                                 // Bopomofo letter and final
2696                                 || 0x31A0 <= i && i <= 0x31B7
2697                                 // White square with quadrant characters.
2698                                 || 0x25F0 <= i && i <= 0x25F7
2699                                 // Ideographic telegraph symbols.
2700                                 || 0x32C0 <= i && i <= 0x32CB
2701                                 || 0x3358 <= i && i <= 0x3370
2702                                 || 0x33E0 <= i && i <= 0x33FF
2703                                 // The whole YI characters.
2704                                 || 0xA000 <= i && i <= 0xA48C
2705                                 || 0xA490 <= i && i <= 0xA4C6
2706                                 // American small ligatures
2707                                 || 0xFB13 <= i && i <= 0xFB17
2708                                 // hebrew, arabic, variation selector.
2709                                 || 0xFB1D <= i && i <= 0xFE2F
2710                                 // Arabic ligatures.
2711                                 || 0xFEF5 <= i && i <= 0xFEFC
2712                                 // FIXME: why are they excluded?
2713                                 || 0x01F6 <= i && i <= 0x01F9
2714                                 || 0x0218 <= i && i <= 0x0233
2715                                 || 0x02A9 <= i && i <= 0x02AD
2716                                 || 0x02EA <= i && i <= 0x02EE
2717                                 || 0x0349 <= i && i <= 0x036F
2718                                 || 0x0488 <= i && i <= 0x048F
2719                                 || 0x04D0 <= i && i <= 0x04FF
2720                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2721                                 || 0x06D6 <= i && i <= 0x06ED
2722                                 || 0x06FA <= i && i <= 0x06FE
2723                                 || 0x2048 <= i && i <= 0x204D
2724                                 || 0x20e4 <= i && i <= 0x20ea
2725                                 || 0x213C <= i && i <= 0x214B
2726                                 || 0x21EB <= i && i <= 0x21FF
2727                                 || 0x22F2 <= i && i <= 0x22FF
2728                                 || 0x237B <= i && i <= 0x239A
2729                                 || 0x239B <= i && i <= 0x23CF
2730                                 || 0x24EB <= i && i <= 0x24FF
2731                                 || 0x2596 <= i && i <= 0x259F
2732                                 || 0x25F8 <= i && i <= 0x25FF
2733                                 || 0x2672 <= i && i <= 0x2689
2734                                 || 0x2768 <= i && i <= 0x2775
2735                                 || 0x27d0 <= i && i <= 0x27ff
2736                                 || 0x2900 <= i && i <= 0x2aff
2737                                 || 0x3033 <= i && i <= 0x303F
2738                                 || 0x31F0 <= i && i <= 0x31FF
2739                                 || 0x3250 <= i && i <= 0x325F
2740                                 || 0x32B1 <= i && i <= 0x32BF
2741                                 || 0x3371 <= i && i <= 0x337B
2742                                 || 0xFA30 <= i && i <= 0xFA6A
2743                         )
2744                                 return true;
2745
2746                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2747                         switch (uc) {
2748                         case UnicodeCategory.PrivateUse:
2749                         case UnicodeCategory.Surrogate:
2750                                 return false;
2751                         // ignored by nature
2752                         case UnicodeCategory.Format:
2753                         case UnicodeCategory.OtherNotAssigned:
2754                                 return true;
2755                         default:
2756                                 return false;
2757                         }
2758                 }
2759
2760                 // To check IsIgnorable sanity, try the driver below under MS.NET.
2761
2762                 /*
2763                 public static void Main ()
2764                 {
2765                         for (int i = 0; i <= char.MaxValue; i++)
2766                                 Dump (i, IsIgnorable (i));
2767                 }
2768
2769                 static void Dump (int i, bool ignore)
2770                 {
2771                         switch (Char.GetUnicodeCategory ((char) i)) {
2772                         case UnicodeCategory.PrivateUse:
2773                         case UnicodeCategory.Surrogate:
2774                                 return; // check nothing
2775                         }
2776
2777                         string s1 = "";
2778                         string s2 = new string ((char) i, 10);
2779                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2780                         if ((ret == 0) == ignore)
2781                                 return;
2782                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2783                 }
2784                 */
2785                 #endregion // IsIgnorable
2786
2787                 #region IsIgnorableSymbol
2788                 static bool IsIgnorableSymbol (int i)
2789                 {
2790                         if (IsIgnorable (i))
2791                                 return true;
2792
2793                         switch (i) {
2794                         // *Letter
2795                         case 0x00b5: case 0x01C0: case 0x01C1:
2796                         case 0x01C2: case 0x01C3: case 0x01F6:
2797                         case 0x01F7: case 0x01F8: case 0x01F9:
2798                         case 0x02D0: case 0x02EE: case 0x037A:
2799                         case 0x03D7: case 0x03F3:
2800                         case 0x0400: case 0x040d:
2801                         case 0x0450: case 0x045d:
2802                         case 0x048C: case 0x048D:
2803                         case 0x048E: case 0x048F:
2804                         case 0x0587: case 0x0640: case 0x06E5:
2805                         case 0x06E6: case 0x06FA: case 0x06FB:
2806                         case 0x06FC: case 0x093D: case 0x0950:
2807                         case 0x1E9B: case 0x2139: case 0x3006:
2808                         case 0x3033: case 0x3034: case 0x3035:
2809                         case 0xFE7E: case 0xFE7F:
2810                         // OtherNumber
2811                         case 0x16EE: case 0x16EF: case 0x16F0:
2812                         // LetterNumber
2813                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2814                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2815                         case 0x3038: // HANGZHOU NUMERAL TEN
2816                         case 0x3039: // HANGZHOU NUMERAL TWENTY
2817                         case 0x303a: // HANGZHOU NUMERAL THIRTY
2818                         // OtherSymbol
2819                         case 0x2117:
2820                         case 0x327F:
2821                                 return true;
2822                         // ModifierSymbol
2823                         case 0x02B9: case 0x02BA: case 0x02C2:
2824                         case 0x02C3: case 0x02C4: case 0x02C5:
2825                         case 0x02C8: case 0x02CC: case 0x02CD:
2826                         case 0x02CE: case 0x02CF: case 0x02D2:
2827                         case 0x02D3: case 0x02D4: case 0x02D5:
2828                         case 0x02D6: case 0x02D7: case 0x02DE:
2829                         case 0x02E5: case 0x02E6: case 0x02E7:
2830                         case 0x02E8: case 0x02E9:
2831                         case 0x309B: case 0x309C:
2832                         // OtherPunctuation
2833                         case 0x055A: // American Apos
2834                         case 0x05C0: // Hebrew Punct
2835                         case 0x0E4F: // Thai FONGMAN
2836                         case 0x0E5A: // Thai ANGKHANKHU
2837                         case 0x0E5B: // Thai KHOMUT
2838                         // CurencySymbol
2839                         case 0x09F2: // Bengali Rupee Mark
2840                         case 0x09F3: // Bengali Rupee Sign
2841                         // MathSymbol
2842                         case 0x221e: // INF.
2843                         // OtherSymbol
2844                         case 0x0482:
2845                         case 0x09FA:
2846                         case 0x0B70:
2847                                 return false;
2848                         }
2849
2850                         // *Letter
2851                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2852 #if NET_2_0
2853                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2854                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2855 #endif
2856                         )
2857                                 return true;
2858
2859                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2860                         switch (uc) {
2861                         case UnicodeCategory.Surrogate:
2862                                 return false; // inconsistent
2863
2864                         case UnicodeCategory.SpacingCombiningMark:
2865                         case UnicodeCategory.EnclosingMark:
2866                         case UnicodeCategory.NonSpacingMark:
2867                         case UnicodeCategory.PrivateUse:
2868                                 // NonSpacingMark
2869                                 if (0x064B <= i && i <= 0x0652) // Arabic
2870                                         return true;
2871                                 return false;
2872
2873                         case UnicodeCategory.Format:
2874                         case UnicodeCategory.OtherNotAssigned:
2875                                 return true;
2876
2877                         default:
2878                                 bool use = false;
2879                                 // OtherSymbols
2880                                 if (
2881                                         // latin in a circle
2882                                         0x249A <= i && i <= 0x24E9
2883                                         || 0x2100 <= i && i <= 0x2132
2884                                         // Japanese
2885                                         || 0x3196 <= i && i <= 0x31A0
2886                                         // Korean
2887                                         || 0x3200 <= i && i <= 0x321C
2888                                         // Chinese/Japanese
2889                                         || 0x322A <= i && i <= 0x3243
2890                                         // CJK
2891                                         || 0x3260 <= i && i <= 0x32B0
2892                                         || 0x32D0 <= i && i <= 0x3357
2893                                         || 0x337B <= i && i <= 0x33DD
2894                                 )
2895                                         use = !Char.IsLetterOrDigit ((char) i);
2896                                 if (use)
2897                                         return false;
2898
2899                                 // This "Digit" rule is mystery.
2900                                 // It filters some symbols out.
2901                                 if (Char.IsLetterOrDigit ((char) i))
2902                                         return false;
2903                                 if (Char.IsNumber ((char) i))
2904                                         return false;
2905                                 if (Char.IsControl ((char) i)
2906                                         || Char.IsSeparator ((char) i)
2907                                         || Char.IsPunctuation ((char) i))
2908                                         return true;
2909                                 if (Char.IsSymbol ((char) i))
2910                                         return true;
2911
2912                                 // FIXME: should check more
2913                                 return false;
2914                         }
2915                 }
2916
2917                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
2918 /*
2919                 public static void Main ()
2920                 {
2921                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2922                         for (int i = 0; i <= char.MaxValue; i++) {
2923                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2924                                 if (uc == UnicodeCategory.Surrogate)
2925                                         continue;
2926
2927                                 bool ret = IsIgnorableSymbol (i);
2928
2929                                 string s1 = "TEST ";
2930                                 string s2 = "TEST " + (char) i;
2931
2932                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2933
2934                                 if (ret != (result == 0))
2935                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2936                                                 ret ? "should not ignore" :
2937                                                         "should ignore",
2938                                                 i,(char) i, uc);
2939                         }
2940                 }
2941 */
2942                 #endregion
2943
2944                 #region NonSpacing
2945                 static bool IsIgnorableNonSpacing (int i)
2946                 {
2947                         if (IsIgnorable (i))
2948                                 return true;
2949
2950                         switch (i) {
2951                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
2952                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
2953                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
2954                                 return true;
2955                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
2956                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
2957                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
2958                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
2959                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
2960                         case 0x0CCD: case 0x0E4E:
2961                                 return false;
2962                         }
2963
2964                         if (0x02b9 <= i && i <= 0x02c5
2965                                 || 0x02cc <= i && i <= 0x02d7
2966                                 || 0x02e4 <= i && i <= 0x02ef
2967                                 || 0x20DD <= i && i <= 0x20E0
2968                         )
2969                                 return true;
2970
2971                         if (0x064B <= i && i <= 0x00652
2972                                 || 0x0941 <= i && i <= 0x0948
2973                                 || 0x0AC1 <= i && i <= 0x0ACD
2974                                 || 0x0C3E <= i && i <= 0x0C4F
2975                                 || 0x0E31 <= i && i <= 0x0E3F
2976                         )
2977                                 return false;
2978
2979                         return Char.GetUnicodeCategory ((char) i) ==
2980                                 UnicodeCategory.NonSpacingMark;
2981                 }
2982
2983                 // We can reuse IsIgnorableSymbol testcode
2984                 // for IsIgnorableNonSpacing.
2985                 #endregion
2986         }
2987
2988         struct CharMapEntry
2989         {
2990                 public byte Category;
2991                 public byte Level1;
2992                 public byte Level2; // It is always single byte.
2993                 public bool Defined;
2994
2995                 public CharMapEntry (byte category, byte level1, byte level2)
2996                 {
2997                         Category = category;
2998                         Level1 = level1;
2999                         Level2 = level2;
3000                         Defined = true;
3001                 }
3002         }
3003
3004         class JISCharacter
3005         {
3006                 public readonly int CP;
3007                 public readonly int JIS;
3008
3009                 public JISCharacter (int cp, int cpJIS)
3010                 {
3011                         CP = cp;
3012                         JIS = cpJIS;
3013                 }
3014         }
3015
3016         class JISComparer : IComparer
3017         {
3018                 public static readonly JISComparer Instance =
3019                         new JISComparer ();
3020
3021                 public int Compare (object o1, object o2)
3022                 {
3023                         JISCharacter j1 = (JISCharacter) o1;
3024                         JISCharacter j2 = (JISCharacter) o2;
3025                         return j2.JIS - j1.JIS;
3026                 }
3027         }
3028
3029         class NonJISCharacter
3030         {
3031                 public readonly int CP;
3032                 public readonly string Name;
3033
3034                 public NonJISCharacter (int cp, string name)
3035                 {
3036                         CP = cp;
3037                         Name = name;
3038                 }
3039         }
3040
3041         class NonJISComparer : IComparer
3042         {
3043                 public static readonly NonJISComparer Instance =
3044                         new NonJISComparer ();
3045
3046                 public int Compare (object o1, object o2)
3047                 {
3048                         NonJISCharacter j1 = (NonJISCharacter) o1;
3049                         NonJISCharacter j2 = (NonJISCharacter) o2;
3050                         return string.CompareOrdinal (j1.Name, j2.Name);
3051                 }
3052         }
3053
3054         class DecimalDictionaryValueComparer : IComparer
3055         {
3056                 public static readonly DecimalDictionaryValueComparer Instance
3057                         = new DecimalDictionaryValueComparer ();
3058
3059                 private DecimalDictionaryValueComparer ()
3060                 {
3061                 }
3062
3063                 public int Compare (object o1, object o2)
3064                 {
3065                         DictionaryEntry e1 = (DictionaryEntry) o1;
3066                         DictionaryEntry e2 = (DictionaryEntry) o2;
3067                         // FIXME: in case of 0, compare decomposition categories
3068                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3069                         if (ret != 0)
3070                                 return ret;
3071                         int i1 = (int) e1.Key;
3072                         int i2 = (int) e2.Key;
3073                         return i1 - i2;
3074                 }
3075         }
3076
3077         class StringDictionaryValueComparer : IComparer
3078         {
3079                 public static readonly StringDictionaryValueComparer Instance
3080                         = new StringDictionaryValueComparer ();
3081
3082                 private StringDictionaryValueComparer ()
3083                 {
3084                 }
3085
3086                 public int Compare (object o1, object o2)
3087                 {
3088                         DictionaryEntry e1 = (DictionaryEntry) o1;
3089                         DictionaryEntry e2 = (DictionaryEntry) o2;
3090                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3091                         if (ret != 0)
3092                                 return ret;
3093                         int i1 = (int) e1.Key;
3094                         int i2 = (int) e2.Key;
3095                         return i1 - i2;
3096                 }
3097         }
3098
3099         class UCAComparer : IComparer
3100         {
3101                 public static readonly UCAComparer Instance
3102                         = new UCAComparer ();
3103
3104                 private UCAComparer ()
3105                 {
3106                 }
3107
3108                 public int Compare (object o1, object o2)
3109                 {
3110                         char i1 = (char) o1;
3111                         char i2 = (char) o2;
3112
3113                         int l1 = CollationElementTable.GetSortKeyCount (i1);
3114                         int l2 = CollationElementTable.GetSortKeyCount (i2);
3115                         int l = l1 > l2 ? l2 : l1;
3116
3117                         for (int i = 0; i < l; i++) {
3118                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3119                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3120                                 int v = k1.Primary - k2.Primary;
3121                                 if (v != 0)
3122                                         return v;
3123                                 v = k1.Secondary - k2.Secondary;
3124                                 if (v != 0)
3125                                         return v;
3126                                 v = k1.Thirtiary - k2.Thirtiary;
3127                                 if (v != 0)
3128                                         return v;
3129                                 v = k1.Quarternary - k2.Quarternary;
3130                                 if (v != 0)
3131                                         return v;
3132                         }
3133                         return l1 - l2;
3134                 }
3135         }
3136
3137         class Tailoring
3138         {
3139                 int lcid;
3140                 int alias;
3141                 bool frenchSort;
3142                 ArrayList items = new ArrayList ();
3143
3144                 public Tailoring (int lcid)
3145                         : this (lcid, 0)
3146                 {
3147                 }
3148
3149                 public Tailoring (int lcid, int alias)
3150                 {
3151                         this.lcid = lcid;
3152                         this.alias = alias;
3153                 }
3154
3155                 public int LCID {
3156                         get { return lcid; }
3157                 }
3158
3159                 public int Alias {
3160                         get { return alias; }
3161                 }
3162
3163                 public bool FrenchSort {
3164                         get { return frenchSort; }
3165                         set { frenchSort = value; }
3166                 }
3167
3168                 public void AddDiacriticalMap (byte target, byte replace)
3169                 {
3170                         items.Add (new DiacriticalMap (target, replace));
3171                 }
3172
3173                 public void AddSortKeyMap (string source, byte [] sortkey)
3174                 {
3175                         items.Add (new SortKeyMap (source, sortkey));
3176                 }
3177
3178                 public void AddReplacementMap (string source, string replace)
3179                 {
3180                         items.Add (new ReplacementMap (source, replace));
3181                 }
3182
3183                 public char [] ItemToCharArray ()
3184                 {
3185                         ArrayList al = new ArrayList ();
3186                         foreach (ITailoringMap m in items)
3187                                 al.AddRange (m.ToCharArray ());
3188                         return al.ToArray (typeof (char)) as char [];
3189                 }
3190
3191                 interface ITailoringMap
3192                 {
3193                         char [] ToCharArray ();
3194                 }
3195
3196                 class DiacriticalMap : ITailoringMap
3197                 {
3198                         public readonly byte Target;
3199                         public readonly byte Replace;
3200
3201                         public DiacriticalMap (byte target, byte replace)
3202                         {
3203                                 Target = target;
3204                                 Replace = replace;
3205                         }
3206
3207                         public char [] ToCharArray ()
3208                         {
3209                                 char [] ret = new char [3];
3210                                 ret [0] = (char) 02; // kind:DiacriticalMap
3211                                 ret [1] = (char) Target;
3212                                 ret [2] = (char) Replace;
3213                                 return ret;
3214                         }
3215                 }
3216
3217                 class SortKeyMap : ITailoringMap
3218                 {
3219                         public readonly string Source;
3220                         public readonly byte [] SortKey;
3221
3222                         public SortKeyMap (string source, byte [] sortkey)
3223                         {
3224                                 Source = source;
3225                                 SortKey = sortkey;
3226                         }
3227
3228                         public char [] ToCharArray ()
3229                         {
3230                                 char [] ret = new char [Source.Length + 7];
3231                                 ret [0] = (char) 01; // kind:SortKeyMap
3232                                 for (int i = 0; i < Source.Length; i++)
3233                                         ret [i + 1] = Source [i];
3234                                 // null terminate
3235                                 for (int i = 0; i < 5; i++)
3236                                         ret [i + Source.Length + 2] = (char) SortKey [i];
3237                                 return ret;
3238                         }
3239                 }
3240
3241                 class ReplacementMap : ITailoringMap
3242                 {
3243                         public readonly string Source;
3244                         public readonly string Replace;
3245
3246                         public ReplacementMap (string source, string replace)
3247                         {
3248                                 Source = source;
3249                                 Replace = replace;
3250                         }
3251
3252                         public char [] ToCharArray ()
3253                         {
3254                                 char [] ret = new char [Source.Length + Replace.Length + 3];
3255                                 ret [0] = (char) 03; // kind:ReplaceMap
3256                                 int pos = 1;
3257                                 for (int i = 0; i < Source.Length; i++)
3258                                         ret [pos++] = Source [i];
3259                                 // null terminate
3260                                 pos++;
3261                                 for (int i = 0; i < Replace.Length; i++)
3262                                         ret [pos++] = Replace [i];
3263                                 // null terminate
3264                                 return ret;
3265                         }
3266                 }
3267         }
3268 }