mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those which are computed and those which
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, for composite characters it should prepare a different index table.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27
  28 using System;
  29 using System.IO;
  30 using System.Collections;
  31 using System.Globalization;
  32 using System.Xml;
  33
  34 namespace Mono.Globalization.Unicode
  35 {
  36         internal class MSCompatSortKeyTableGenerator
  37         {
  38                 public static void Main (string [] args)
  39                 {
  40                         new MSCompatSortKeyTableGenerator ().Run (args);
  41                 }
  42
   43                 const int DecompositionWide = 1; // fixed; one of the decompType [] codes that Serialize () dumps to widthCompat
   44                 const int DecompositionSub = 2; // fixed; also dumped to widthCompat
   45                 const int DecompositionSmall = 3;
   46                 const int DecompositionIsolated = 4;
   47                 const int DecompositionInitial = 5;
   48                 const int DecompositionFinal = 6;
   49                 const int DecompositionMedial = 7;
   50                 const int DecompositionNoBreak = 8;
   51                 const int DecompositionVertical = 9;
   52                 const int DecompositionFraction = 0xA;
   53                 const int DecompositionFont = 0xB;
   54                 const int DecompositionSuper = 0xC; // fixed; also dumped to widthCompat
   55                 const int DecompositionFull = 0xE; // NOTE: listed before Narrow although 0xE > 0xD; only the declaration order is unusual
   56                 const int DecompositionNarrow = 0xD; // also dumped to widthCompat
   57                 const int DecompositionCircle = 0xF;
   58                 const int DecompositionSquare = 0x10;
   59                 const int DecompositionCompat = 0x11;
   60                 const int DecompositionCanonical = 0x12;
  61
   62                 TextWriter Result = Console.Out; // every generated table is written here (initially stdout)
   63
   64                 byte [] fillIndex = new byte [256]; // by category
   65                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1]; // one entry per UTF-16 code unit; Serialize () dumps Category, Level1 and Level2
   66
   67                 char [] specialIgnore = new char [] {
   68                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
   69                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
   70                         };
   71
   72                 // FIXME: need more love (as always)
   73                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
   74                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
   75                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
   76                         '\u0292', '\u01BE', '\u0298'};
   77                 byte [] alphaWeights = new byte [] { // 29 values, same count as alphabets (presumably parallel arrays; confirm at the use site)
   78                         2, 9, 0xA, 0x1A, 0x21,
   79                         0x23, 0x25, 0x2C, 0x32, 0x35,
   80                         0x36, 0x48, 0x51, 0x70, 0x7C,
   81                         0x7E, 0x89, 0x8A, 0x91, 0x99,
   82                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
   83                         0xA9, 0xAA, 0xB3, 0xB4};
   84
   85                 bool [] isSmallCapital = new bool [char.MaxValue + 1]; // set by ProcessUnidataLine () when the UnicodeData line contains "SMALL CAPITAL"
   86                 bool [] isUppercase = new bool [char.MaxValue + 1];
   87
   88                 byte [] decompType = new byte [char.MaxValue + 1]; // Decomposition* code per char
   89                 int [] decompIndex = new int [char.MaxValue + 1]; // per char: index into decompValues (used that way by Serialize ())
   90                 int [] decompLength = new int [char.MaxValue + 1];
   91                 int [] decompValues; // assigned by ParseUnidata () from the values collected while reading UnicodeData.txt
   92                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
   93
   94                 byte [] diacritical = new byte [char.MaxValue + 1];
  95
   96                 string [] diacritics = new string [] { // 74 character-name fragments (67 LATIN + 7 CIRCLED/PARENTHESIZED)
   97                         // LATIN
   98                         "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
   99                         "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
  100                         " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
  101                         " OGONEK;", " CEDILLA;",
  102                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
  103                         " STROKE;", " CIRCUMFLEX AND ACUTE;",
  104                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
  105                         " DIAERESIS AND GRAVE;",
  106                         " BREVE AND ACUTE;",
  107                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
  108                         " MACRON AND ACUTE;",
  109                         " MACRON AND GRAVE;",
  110                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
  111                         " RING ABOVE AND ACUTE",
  112                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
  113                         " CIRCUMFLEX AND TILDE",
  114                         " TILDE AND DIAERESIS",
  115                         " STROKE AND ACUTE",
  116                         " BREVE AND TILDE",
  117                         " CEDILLA AND BREVE",
  118                         " OGONEK AND MACRON",
  119                         " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
  120                         " DOUBLE GRAVE;",
  121                         " INVERTED BREVE",
  122                         " PRECEDED BY APOSTROPHE",
  123                         " HORN;",
  124                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
  125                         " PALATAL HOOK",
  126                         " DOT BELOW;",
  127                         " RETROFLEX;", "DIAERESIS BELOW",
  128                         " RING BELOW",
  129                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
  130                         " BREVE BELOW;", " HORN AND GRAVE",
  131                         " TILDE BELOW",
  132                         " DOT BELOW AND DOT ABOVE",
  133                         " RIGHT HALF RING", " HORN AND TILDE",
  134                         " CIRCUMFLEX AND DOT BELOW",
  135                         " BREVE AND DOT BELOW",
  136                         " DOT BELOW AND MACRON",
  137                         " HORN AND HOOK ABOVE",
  138                         " HORN AND DOT",
  139                         // CIRCLED, PARENTHESIZED and so on
  140                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
  141                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
  142                         };
  143                 byte [] diacriticWeights = new byte [] { // 75 weights; NOTE(review): one more than diacritics has entries - confirm which side is wrong
  144                         // LATIN.
  145                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
  146                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
  147                         0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
  148                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
  149                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
  150                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
  151                         0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
  152                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
  153                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
  154                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
  155                         0x95, 0xAA,
  156                         // CIRCLED, PARENTHESIZED and so on.
  157                         0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3 // 4 + 4 weights here, but only 4 CIRCLED + 3 PARENTHESIZED names above
  158                         };
 159
  160                 int [] numberSecondaryWeightBounds = new int [] { // 28 code points; NOTE(review): looks like consecutive (start, end) range pairs - confirm at the use site
  161                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
  162                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
  163                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
  164                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
  165                         0xE50, 0xE60, 0xED0, 0xEE0
  166                         };
  167
  168                 char [] orderedCyrillic; // the five ordered* arrays below have no initializer here; presumably assigned later - confirm
  169                 char [] orderedGurmukhi;
  170                 char [] orderedGujarati;
  171                 char [] orderedGeorgian;
  172                 char [] orderedThaana;
  173
  174                 static readonly char [] orderedTamilConsonants = new char [] { // 21 consonants in the order described below
  175                         // based on traditional Tamil consonants, except for
  176                         // Grantha (where Microsoft breaks traditionalism).
  177                         // http://www.angelfire.com/empire/thamizh/padanGaL
  178                         '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
  179                         '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
  180                         '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
  181                         '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
  182                         '\u0BB9'};
 183
  184                 // cp -> character name (only for some characters)
  185                 ArrayList sortableCharNames = new ArrayList (); // ProcessUnidataLine () adds entries for symbols in U+2100..U+213F
  186
  187                 // cp -> arrow value (int)
  188                 ArrayList arrowValues = new ArrayList (); // DictionaryEntry (cp, direction 1..8), added by ProcessUnidataLine ()
  189
  190                 // cp -> box value (int)
  191                 ArrayList boxValues = new ArrayList (); // DictionaryEntry (cp, value), added by ProcessUnidataLine () for U+2500..U+25AF
  192
  193                 // cp -> level1 value
  194                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
  195
  196                 // letterName -> cp
  197                 Hashtable arabicNameMap = new Hashtable ();
  198
  199                 // cp -> Hashtable [decompType] -> cp
  200                 Hashtable nfkdMap = new Hashtable ();
  201
  202                 // Latin letter -> ArrayList [int]
  203                 Hashtable latinMap = new Hashtable (); // key is the base letter 'A'..'Z' taken from the character name
  204
  205                 ArrayList jisJapanese = new ArrayList ();
  206                 ArrayList nonJisJapanese = new ArrayList ();
  207
  208                 ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00]; // SerializeCJK () derives the base from the length and labels slot 0 as 4E00
  209                 ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100]; // same scheme, slot 0 is labelled 3100
  210                 ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00];
  211                 ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00];
  212                 byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00];
  213
  214                 byte [] ignorableFlags = new byte [char.MaxValue + 1]; // dumped as-is by Serialize ()
  215
  216                 double [] unicodeAge = new double [char.MaxValue + 1]; // version number per char, filled by ParseDerivedAge ()
 217
 218                 void Run (string [] args)
 219                 {
 220                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 221                         FillIgnorables ();
 222
 223                         ParseSources (dirname);
 224                         Console.Error.WriteLine ("parse done.");
 225
 226                         ModifyParsedValues ();
 227                         GenerateCore ();
 228                         Console.Error.WriteLine ("generation done.");
 229                         Serialize ();
 230                         Console.Error.WriteLine ("serialization done.");
 231                 }
 232
 233                 void Serialize ()
 234                 {
 235                         // Ignorables
 236                         Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
 237                         for (int i = 0; i <= char.MaxValue; i++) {
 238                                 byte value = ignorableFlags [i];
 239                                 if (value < 10)
 240                                         Result.Write ("{0},", value);
 241                                 else
 242                                         Result.Write ("0x{0:X02},", value);
 243                                 if ((i & 0xF) == 0xF)
 244                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 245                         }
 246                         Result.WriteLine ("};");
 247                         Result.WriteLine ();
 248
 249                         // Primary category
 250                         Result.WriteLine ("static byte [] categories = new byte [] {");
 251                         for (int i = 0; i < map.Length; i++) {
 252                                 byte value = map [i].Category;
 253                                 if (value < 10)
 254                                         Result.Write ("{0},", value);
 255                                 else
 256                                         Result.Write ("0x{0:X02},", value);
 257                                 if ((i & 0xF) == 0xF)
 258                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 259                         }
 260                         Result.WriteLine ("};");
 261                         Result.WriteLine ();
 262
 263                         // Primary weight value
 264                         Result.WriteLine ("static byte [] level1 = new byte [] {");
 265                         for (int i = 0; i < map.Length; i++) {
 266                                 byte value = map [i].Level1;
 267                                 if (value < 10)
 268                                         Result.Write ("{0},", value);
 269                                 else
 270                                         Result.Write ("0x{0:X02},", value);
 271                                 if ((i & 0xF) == 0xF)
 272                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 273                         }
 274                         Result.WriteLine ("};");
 275                         Result.WriteLine ();
 276
 277                         // Secondary weight
 278                         Result.WriteLine ("static byte [] level2 = new byte [] {");
 279                         for (int i = 0; i < map.Length; i++) {
 280                                 int value = map [i].Level2;
 281                                 if (value < 10)
 282                                         Result.Write ("{0},", value);
 283                                 else
 284                                         Result.Write ("0x{0:X02},", value);
 285                                 if ((i & 0xF) == 0xF)
 286                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 287                         }
 288                         Result.WriteLine ("};");
 289                         Result.WriteLine ();
 290
 291                         // Thirtiary weight
 292                         Result.WriteLine ("static byte [] level3 = new byte [] {");
 293                         for (int i = 0; i < map.Length; i++) {
 294                                 byte value = ComputeLevel3Weight ((char) i);
 295                                 if (value < 10)
 296                                         Result.Write ("{0},", value);
 297                                 else
 298                                         Result.Write ("0x{0:X02},", value);
 299                                 if ((i & 0xF) == 0xF)
 300                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 301                         }
 302                         Result.WriteLine ("};");
 303                         Result.WriteLine ();
 304
 305                         // Width insensitivity mappings
 306                         // (for now it is more lightweight than dumping the
 307                         // entire NFKD table).
 308                         Result.WriteLine ("static int [] widthCompat = new int [] {");
 309                         for (int i = 0; i < char.MaxValue; i++) {
 310                                 int value = 0;
 311                                 switch (decompType [i]) {
 312                                 case DecompositionNarrow:
 313                                 case DecompositionWide:
 314                                 case DecompositionSuper:
 315                                 case DecompositionSub:
 316                                         // they are always 1 char
 317                                         value = decompValues [decompIndex [i]];
 318                                         break;
 319                                 }
 320                                 if (value < 10)
 321                                         Result.Write ("{0},", value);
 322                                 else
 323                                         Result.Write ("0x{0:X04},", value);
 324                                 if ((i & 0xF) == 0xF)
 325                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 326                         }
 327                         Result.WriteLine ("};");
 328                         Result.WriteLine ();
 329
 330                         // CJK
 331                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 332                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 333                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 334                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 335                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 336                 }
 337
 338                 void SerializeCJK (string name, ushort [] cjk, int max)
 339                 {
 340                         int offset = char.MaxValue - cjk.Length;
 341                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 342                         for (int i = 0; i < cjk.Length; i++) {
 343                                 if (i + offset == max)
 344                                         break;
 345                                 ushort value = cjk [i];
 346                                 if (value < 10)
 347                                         Result.Write ("{0},", value);
 348                                 else
 349                                         Result.Write ("0x{0:X04},", value);
 350                                 if ((i & 0xF) == 0xF)
 351                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 352                         }
 353                         Result.WriteLine ("};");
 354                         Result.WriteLine ();
 355                 }
 356
 357                 void SerializeCJK (string name, byte [] cjk, int max)
 358                 {
 359                         int offset = char.MaxValue - cjk.Length;
 360                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 361                         for (int i = 0; i < cjk.Length; i++) {
 362                                 if (i + offset == max)
 363                                         break;
 364                                 byte value = cjk [i];
 365                                 if (value < 10)
 366                                         Result.Write ("{0},", value);
 367                                 else
 368                                         Result.Write ("0x{0:X02},", value);
 369                                 if ((i & 0xF) == 0xF)
 370                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 371                         }
 372                         Result.WriteLine ("};");
 373                         Result.WriteLine ();
 374                 }
 375
 376                 #region Parse
 377
 378                 void ParseSources (string dirname)
 379                 {
 380                         string unidata =
 381                                 dirname + "/UnicodeData.txt";
 382                         string derivedCoreProps =
 383                                 dirname + "/DerivedCoreProperties.txt";
 384                         string scripts =
 385                                 dirname + "/Scripts.txt";
 386                         string cp932 =
 387                                 dirname + "/CP932.TXT";
 388                         string derivedAge =
 389                                 dirname + "/DerivedAge.txt";
 390                         string chXML = dirname + "/common/collation/zh.xml";
 391                         string jaXML = dirname + "/common/collation/ja.xml";
 392                         string koXML = dirname + "/common/collation/ko.xml";
 393
 394                         ParseDerivedAge (derivedAge);
 395                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 396                         ParseUnidata (unidata);
 397                         ParseDerivedCoreProperties (derivedCoreProps);
 398                         ParseScripts (scripts);
 399                         ParseCJK (chXML, jaXML, koXML);
 400                 }
 401
 402                 void ParseDerivedAge (string filename)
 403                 {
 404                         using (StreamReader file =
 405                                 new StreamReader (filename)) {
 406                                 while (file.Peek () >= 0) {
 407                                         string s = file.ReadLine ();
 408                                         int idx = s.IndexOf ('#');
 409                                         if (idx >= 0)
 410                                                 s = s.Substring (0, idx);
 411                                         idx = s.IndexOf (';');
 412                                         if (idx < 0)
 413                                                 continue;
 414
 415                                         string cpspec = s.Substring (0, idx);
 416                                         idx = cpspec.IndexOf ("..");
 417                                         NumberStyles nf = NumberStyles.HexNumber |
 418                                                 NumberStyles.AllowTrailingWhite;
 419                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 420                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 421                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 422
 423                                         // FIXME: use index
 424                                         if (cp > char.MaxValue)
 425                                                 continue;
 426
 427                                         for (int i = cp; i <= cpEnd; i++)
 428                                                 unicodeAge [i] = double.Parse (value);
 429                                 }
 430                         }
 431                 }
 432
 433                 void ParseUnidata (string filename)
 434                 {
 435                         ArrayList decompValues = new ArrayList ();
 436                         using (StreamReader unidata =
 437                                 new StreamReader (filename)) {
 438                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 439                                         try {
 440                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 441                                         } catch (Exception) {
 442                                                 Console.Error.WriteLine ("**** At line " + line);
 443                                                 throw;
 444                                         }
 445                                 }
 446                         }
 447                         this.decompValues = (int [])
 448                                 decompValues.ToArray (typeof (int));
 449                 }
 450
 451                 void ProcessUnidataLine (string s, ArrayList decompValues)
 452                 {
 453                         int idx = s.IndexOf ('#');
 454                         if (idx >= 0)
 455                                 s = s.Substring (0, idx);
 456                         idx = s.IndexOf (';');
 457                         if (idx < 0)
 458                                 return;
 459                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 460                         string [] values = s.Substring (idx + 1).Split (';');
 461
 462                         // FIXME: use index
 463                         if (cp > char.MaxValue)
 464                                 return;
 465                         if (IsIgnorable (cp))
 466                                 return;
 467
 468                         string name = values [0];
 469
 470                         // isSmallCapital
 471                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 472                                 isSmallCapital [cp] = true;
 473
 474                         // latin mapping by character name
 475                         if (s.IndexOf ("LATIN") > 0) {
 476                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 477                                 int offset = lidx + 15;
 478                                 if (lidx < 0) {
 479                                         lidx = s.IndexOf ("LETTER TURNED ");
 480                                         offset = lidx + 14;
 481                                 }
 482                                 if (lidx < 0) {
 483                                         lidx = s.IndexOf ("LETTER ");
 484                                         offset = lidx + 7;
 485                                 }
 486                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 487                                 if ('A' <= c && c <= 'Z' &&
 488                                         (s.Length == offset + 1 || s [offset + 1] == ' ')) {
 489                                         ArrayList entry = (ArrayList) latinMap [c];
 490                                         if (entry == null) {
 491                                                 entry = new ArrayList ();
 492                                                 latinMap [c] = entry;
 493                                         }
 494                                         entry.Add (cp);
 495                                 }
 496                         }
 497
 498                         // Arrow names
 499                         if (0x2000 <= cp && cp < 0x3000) {
 500                                 int value = 0;
 501                                 // SPECIAL CASES. FIXME: why?
 502                                 switch (cp) {
 503                                 case 0x21C5: value = -1; break; // E2
 504                                 case 0x261D: value = 1; break;
 505                                 case 0x27A6: value = 3; break;
 506                                 case 0x21B0: value = 7; break;
 507                                 case 0x21B1: value = 3; break;
 508                                 case 0x21B2: value = 7; break;
 509                                 case 0x21B4: value = 5; break;
 510                                 case 0x21B5: value = 7; break;
 511                                 case 0x21B9: value = -1; break; // E1
 512                                 case 0x21CF: value = 7; break;
 513                                 case 0x21D0: value = 3; break;
 514                                 }
 515                                 string [] arrowTargets = new string [] {
 516                                         "",
 517                                         "UPWARDS",
 518                                         "NORTH EAST",
 519                                         "RIGHTWARDS",
 520                                         "SOUTH EAST",
 521                                         "DOWNWARDS",
 522                                         "SOUTH WEST",
 523                                         "LEFTWARDS",
 524                                         "NORTH WEST",
 525                                         };
 526                                 if (value == 0)
 527                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
 528                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
 529                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
 530                                                         s.IndexOf (" OVER") < 0
 531                                                 )
 532                                                         value = i;
 533                                 if (value > 0)
 534                                         arrowValues.Add (new DictionaryEntry (
 535                                                 cp, value));
 536                         }
 537
 538                         // Box names
 539                         if (0x2500 <= cp && cp < 0x25B0) {
 540                                 int value = 0;
 541                                 // flags:
 542                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
 543                                 // [h,rl] [r] [l]
 544                                 // [v,ud] [u] [d]
 545                                 // [dr] [dl] [ur] [ul]
 546                                 // [vr,udr] [vl,vdl]
 547                                 // [hd,rld] [hu,rlu]
 548                                 // [hv,udrl,rlv,udh]
 549                                 ArrayList flags = new ArrayList (new int [] {
 550                                         32, 8 + 4, 8, 4,
 551                                         16, 1 + 2, 1, 2,
 552                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
 553                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
 554                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
 555                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
 556                                         });
 557                                 byte [] offsets = new byte [] {
 558                                         0, 0, 1, 2,
 559                                         3, 3, 4, 5,
 560                                         6, 7, 8, 9,
 561                                         10, 10, 11, 11,
 562                                         12, 12, 13, 13,
 563                                         14, 14, 14, 14};
 564                                 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
 565                                         int flag = 0;
 566                                         if (s.IndexOf (" UP") > 0)
 567                                                 flag |= 1;
 568                                         if (s.IndexOf (" DOWN") > 0)
 569                                                 flag |= 2;
 570                                         if (s.IndexOf (" RIGHT") > 0)
 571                                                 flag |= 4;
 572                                         if (s.IndexOf (" LEFT") > 0)
 573                                                 flag |= 8;
 574                                         if (s.IndexOf (" VERTICAL") > 0)
 575                                                 flag |= 16;
 576                                         if (s.IndexOf (" HORIZONTAL") > 0)
 577                                                 flag |= 32;
 578
 579                                         int fidx = flags.IndexOf (flag);
 580                                         value = fidx < 0 ? fidx : offsets [fidx];
 581                                 } else if (s.IndexOf ("BLOCK") > 0) {
 582                                         if (s.IndexOf ("ONE EIGHTH") > 0)
 583                                                 value = 0x12;
 584                                         else if (s.IndexOf ("ONE QUARTER") > 0)
 585                                                 value = 0x13;
 586                                         else if (s.IndexOf ("THREE EIGHTHS") > 0)
 587                                                 value = 0x14;
 588                                         else if (s.IndexOf ("HALF") > 0)
 589                                                 value = 0x15;
 590                                         else if (s.IndexOf ("FIVE EIGHTHS") > 0)
 591                                                 value = 0x16;
 592                                         else if (s.IndexOf ("THREE QUARTERS") > 0)
 593                                                 value = 0x17;
 594                                         else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
 595                                                 value = 0x18;
 596                                         else
 597                                                 value = 0x19;
 598                                 }
 599                                 if (value >= 0)
 600                                         boxValues.Add (new DictionaryEntry (
 601                                                 cp, value));
 602                         }
 603
 604                         // For some characters store the name and sort later
 605                         // to determine sorting.
 606                         if (0x2100 <= cp && cp <= 0x213F &&
 607                                 Char.IsSymbol ((char) cp))
 608                                 sortableCharNames.Add (
 609                                         new DictionaryEntry (cp, values [0]));
 610                         else if (0x3380 <= cp && cp <= 0x33DD)
 611                                 sortableCharNames.Add (new DictionaryEntry (
 612                                         cp, values [0].Substring (7)));
 613
 614                         // diacritical weights by character name
 615                         for (int d = 0; d < diacritics.Length; d++)
 616                                 if (s.IndexOf (diacritics [d]) > 0)
 617                                         diacritical [cp] |= diacriticWeights [d];
 618                         // Two-step grep required for it.
 619                         if (s.IndexOf ("FULL STOP") > 0 &&
 620                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
 621                                 diacritical [cp] |= 0xF4;
 622
 623                         // Arabic letter name
 624                         if (0x0621 <= cp && cp <= 0x064A &&
 625                                 Char.GetUnicodeCategory ((char) cp)
 626                                 == UnicodeCategory.OtherLetter) {
 627                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
 628                                 switch (cp) {
 629                                 case 0x0621:
 630                                 case 0x0624:
 631                                 case 0x0626:
 632                                         // hamza, waw, yeh ... special cases.
 633                                         value = 0x07;
 634                                         break;
 635                                 case 0x0649:
 636                                 case 0x064A:
 637                                         value = 0x77; // special cases.
 638                                         break;
 639                                 default:
 640                                         // Get primary letter name i.e.
 641                                         // XXX part of ARABIC LETTER XXX yyy
 642                                         // e.g. that of "TEH MARBUTA" is "TEH".
 643                                         string letterName =
 644                                                 (cp == 0x0640) ?
 645                                                 // 0x0640 is special: it does
 646                                                 // not start with ARABIC LETTER
 647                                                 values [0] :
 648                                                 values [0].Substring (14);
 649                                         int tmpIdx = letterName.IndexOf (' ');
 650                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
 651 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
 652                                         if (arabicNameMap.ContainsKey (letterName))
 653                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
 654                                         else
 655                                                 arabicNameMap [letterName] = cp;
 656                                         break;
 657                                 }
 658                                 arabicLetterPrimaryValues [cp] = value;
 659                         }
 660
 661                         // Japanese square letter
 662                         if (0x3300 <= cp && cp <= 0x3357)
 663                                 if (!ExistsJIS (cp))
 664                                         nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
 665
 666                         // normalizationType
 667                         string decomp = values [4];
 668                         idx = decomp.IndexOf ('<');
 669                         if (idx >= 0) {
 670                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
 671                                 case "full":
 672                                         decompType [cp] = DecompositionFull;
 673                                         break;
 674                                 case "sub":
 675                                         decompType [cp] = DecompositionSub;
 676                                         break;
 677                                 case "super":
 678                                         decompType [cp] = DecompositionSuper;
 679                                         break;
 680                                 case "small":
 681                                         decompType [cp] = DecompositionSmall;
 682                                         break;
 683                                 case "isolated":
 684                                         decompType [cp] = DecompositionIsolated;
 685                                         break;
 686                                 case "initial":
 687                                         decompType [cp] = DecompositionInitial;
 688                                         break;
 689                                 case "final":
 690                                         decompType [cp] = DecompositionFinal;
 691                                         break;
 692                                 case "medial":
 693                                         decompType [cp] = DecompositionMedial;
 694                                         break;
 695                                 case "noBreak":
 696                                         decompType [cp] = DecompositionNoBreak;
 697                                         break;
 698                                 case "compat":
 699                                         decompType [cp] = DecompositionCompat;
 700                                         break;
 701                                 case "fraction":
 702                                         decompType [cp] = DecompositionFraction;
 703                                         break;
 704                                 case "font":
 705                                         decompType [cp] = DecompositionFont;
 706                                         break;
 707                                 case "circle":
 708                                         decompType [cp] = DecompositionCircle;
 709                                         break;
 710                                 case "square":
 711                                         decompType [cp] = DecompositionSquare;
 712                                         break;
 713                                 case "wide":
 714                                         decompType [cp] = DecompositionWide;
 715                                         break;
 716                                 case "narrow":
 717                                         decompType [cp] = DecompositionNarrow;
 718                                         break;
 719                                 case "vertical":
 720                                         decompType [cp] = DecompositionVertical;
 721                                         break;
 722                                 default:
 723                                         throw new Exception ("Support NFKD type : " + decomp);
 724                                 }
 725                         }
 726                         else
 727                                 decompType [cp] = DecompositionCanonical;
 728                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
 729                         if (decomp.Length > 0) {
 730
 731                                 string [] velems = decomp.Split (' ');
 732                                 int didx = decompValues.Count;
 733                                 decompIndex [cp] = didx;
 734                                 foreach (string v in velems)
 735                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
 736                                 decompLength [cp] = velems.Length;
 737
 738                                 // [decmpType] -> this_cp
 739                                 int targetCP = (int) decompValues [didx];
 740                                 // for "(x)" it specially maps to 'x' .
 741                                 // FIXME: check if it is sane
 742                                 if (velems.Length == 3 &&
 743                                         (int) decompValues [didx] == '(' &&
 744                                         (int) decompValues [didx + 2] == ')')
 745                                         targetCP = (int) decompValues [didx + 1];
 746                                 // special: 0x215F "1/"
 747                                 else if (cp == 0x215F)
 748                                         targetCP = '1';
 749                                 else if (velems.Length > 1 &&
 750                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
 751                                         // skip them, except for CJK ideograph compat
 752                                         targetCP = 0;
 753
 754                                 if (targetCP != 0) {
 755                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
 756                                         if (entry == null) {
 757                                                 entry = new Hashtable ();
 758                                                 nfkdMap [targetCP] = entry;
 759                                         }
 760                                         entry [(byte) decompType [cp]] = cp;
 761                                 }
 762                         }
 763                         // numeric values
 764                         if (values [5].Length > 0)
 765                                 decimalValue [cp] = decimal.Parse (values [5]);
 766                         else if (values [6].Length > 0)
 767                                 decimalValue [cp] = decimal.Parse (values [6]);
 768                         else if (values [7].Length > 0) {
 769                                 string decstr = values [7];
 770                                 idx = decstr.IndexOf ('/');
 771                                 if (cp == 0x215F) // special. "1/"
 772                                         decimalValue [cp] = 0x1;
 773                                 else if (idx > 0)
 774                                         // m/n
 775                                         decimalValue [cp] =
 776                                                 decimal.Parse (decstr.Substring (0, idx))
 777                                                 / decimal.Parse (decstr.Substring (idx + 1));
 778                                 else if (decstr [0] == '(' &&
 779                                         decstr [decstr.Length - 1] == ')')
 780                                         // (n)
 781                                         decimalValue [cp] =
 782                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
 783                                 else if (decstr [decstr.Length - 1] == '.')
 784                                         // n.
 785                                         decimalValue [cp] =
 786                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
 787                                 else
 788                                         decimalValue [cp] = decimal.Parse (decstr);
 789                         }
 790                 }
 791
 792                 void ParseDerivedCoreProperties (string filename)
 793                 {
 794                         // Feed every line to ProcessDerivedCorePropLine (); when one
 795                         // fails, print its 1-based number to stderr and rethrow.
 796                         using (StreamReader reader = new StreamReader (filename)) {
 797                                 for (int lineNo = 1; reader.Peek () >= 0; lineNo++) {
 798                                         try {
 799                                                 ProcessDerivedCorePropLine (reader.ReadLine ());
 800                                         } catch (Exception) {
 801                                                 Console.Error.WriteLine ("**** At line " + lineNo);
 802                                                 throw;
 803                                         }
 804                                 }
 805                         }
 806                 }
 807
 808                 void ProcessDerivedCorePropLine (string s)
 809                 {
 810                         // Line shape: "XXXX[..YYYY] ; Property # comment" (hex code points).
 811                         // Lines without ';' and ranges starting beyond U+FFFF are skipped;
 812                         // only the "Uppercase" property is recorded, in isUppercase [].
 813                         int hash = s.IndexOf ('#');
 814                         string body = hash < 0 ? s : s.Substring (0, hash);
 815                         int semicolon = body.IndexOf (';');
 816                         if (semicolon < 0)
 817                                 return;
 818                         string range = body.Substring (0, semicolon);
 819                         string property = body.Substring (semicolon + 1).Trim ();
 820                         NumberStyles hex = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite;
 821                         int dots = range.IndexOf ("..");
 822                         int first, last;
 823                         if (dots < 0)
 824                                 first = last = int.Parse (range, hex);
 825                         else {
 826                                 first = int.Parse (range.Substring (0, dots), hex);
 827                                 last = int.Parse (range.Substring (dots + 2), hex);
 828                         }
 829                         // FIXME: use index
 830                         if (first > char.MaxValue || property != "Uppercase")
 831                                 return;
 832                         for (int x = first; x <= last; x++)
 833                                 isUppercase [x] = true;
 834                 }
 835
 836                 void ParseScripts (string filename)
 837                 {
 838                         // Collects the non-ignorable characters of five scripts from a
 839                         // "XXXX[..YYYY] ; Script # comment" file (ranges starting beyond
 840                         // U+FFFF are skipped), sorts each group with UCAComparer and
 841                         // stores the results in the ordered* arrays.
 842                         ArrayList cyrillic = new ArrayList ();
 843                         ArrayList gurmukhi = new ArrayList ();
 844                         ArrayList gujarati = new ArrayList ();
 845                         ArrayList georgian = new ArrayList ();
 846                         ArrayList thaana = new ArrayList ();
 847                         NumberStyles hex = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite;
 848
 849                         using (StreamReader reader = new StreamReader (filename)) {
 850                                 while (reader.Peek () >= 0) {
 851                                         string text = reader.ReadLine ();
 852                                         int hash = text.IndexOf ('#');
 853                                         if (hash >= 0)
 854                                                 text = text.Substring (0, hash);
 855                                         int semicolon = text.IndexOf (';');
 856                                         if (semicolon < 0)
 857                                                 continue;
 858
 859                                         string range = text.Substring (0, semicolon);
 860                                         string script = text.Substring (semicolon + 1).Trim ();
 861                                         int dots = range.IndexOf ("..");
 862                                         int first = int.Parse (dots < 0 ? range : range.Substring (0, dots), hex);
 863                                         int last = dots < 0 ? first : int.Parse (range.Substring (dots + 2), hex);
 864
 865                                         // FIXME: use index
 866                                         if (first > char.MaxValue)
 867                                                 continue;
 868
 869                                         // Choose the list for this script; any other script
 870                                         // is not collected.
 871                                         ArrayList bucket;
 872                                         switch (script) {
 873                                         case "Cyrillic":
 874                                                 bucket = cyrillic;
 875                                                 break;
 876                                         case "Gurmukhi":
 877                                                 bucket = gurmukhi;
 878                                                 break;
 879                                         case "Gujarati":
 880                                                 bucket = gujarati;
 881                                                 break;
 882                                         case "Georgian":
 883                                                 bucket = georgian;
 884                                                 break;
 885                                         case "Thaana":
 886                                                 bucket = thaana;
 887                                                 break;
 888                                         default:
 889                                                 continue;
 890                                         }
 891                                         for (int x = first; x <= last; x++)
 892                                                 if (!IsIgnorable (x))
 893                                                         bucket.Add ((char) x);
 894                                 }
 895                         }
 896
 897                         ArrayList [] lists = new ArrayList [] {cyrillic, gurmukhi, gujarati, georgian, thaana};
 898                         foreach (ArrayList list in lists)
 899                                 list.Sort (UCAComparer.Instance);
 900
 901                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
 902                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
 903                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
 904                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
 905                         orderedThaana = (char []) thaana.ToArray (typeof (char));
 906                 }
 907
 908                 void ParseJISOrder (string filename)
 909                 {
 910                         // Data lines: two "0x"-prefixed hex fields separated by whitespace
 911                         // (stored as JIS value, then code point); '#' starts a comment.
 912                         char [] separators = new char [] {' ', '\t'};
 913                         using (StreamReader file = new StreamReader (filename)) {
 914                                 while (file.Peek () >= 0) {
 915                                         string s = file.ReadLine ();
 916                                         int idx = s.IndexOf ('#');
 917                                         s = (idx < 0 ? s : s.Substring (0, idx)).Trim ();
 918                                         idx = s.IndexOfAny (separators);
 919                                         if (idx < 0)
 920                                                 continue; // blank line or a single field
 921                                         // Substring (start, length): without its "0x" prefix the
 922                                         // first field is idx - 2 characters long, not idx.
 923                                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
 924                                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
 925                                         jisJapanese.Add (new JISCharacter (cp, jis));
 926                                 }
 927                         }
 928                 }
 929
 930                 void ParseCJK (string zhXML, string jaXML, string koXML)
 931                 {
 932                         // Fills cjkCHS, cjkCHT, cjkJA, cjkKO and cjkKOlv2 from the
 933                         // tailoring rules of the given LDML collation documents.
 934                         XmlDocument doc = new XmlDocument ();
 935                         doc.XmlResolver = null;
 936                         string s;
 937
 938                         // Chinese Simplified
 939                         doc.Load (zhXML);
 940                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
 941                         AssignSequentialCJKWeights ("chs", cjkCHS, s, '\u3100', 0x8008);
 942
 943                         // Chinese Traditional (stroke order, same document)
 944                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
 945                         AssignSequentialCJKWeights ("cht", cjkCHT, s, '\u4E00', 0x8002);
 946
 947                         // Japanese
 948                         doc.Load (jaXML);
 949                         s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
 950                         AssignSequentialCJKWeights ("ja", cjkJA, s, '\u4E00', 0x8008);
 951
 952                         // Korean
 953                         // Korean weight is somewhat complex. It first shifts
 954                         // Hangul category from 52-x to 80-x (they are anyways
 955                         // computed). CJK ideographs are placed at secondary
 956                         // weight, like XX YY 01 zz 01, where XX and YY are
 957                         // corresponding "reset" value and zz is 41,43,45...
 958                         //
 959                         // Unlike chs,cht and ja, Korean value is a combined
 960                         // ushort which is computed as category
 961                         //
 962                         int offset = char.MaxValue - cjkKO.Length;
 963                         doc.Load (koXML);
 964                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
 965                                 XmlElement sc = (XmlElement) reset.NextSibling;
 966                                 // compute "category" and "level 1" for the
 967                                 // target "reset" Hangul syllable
 968                                 char rc = reset.InnerText [0];
 969                                 int ri = ((int) rc - 0xAC00) + 1;
 970                                 ushort p = (ushort)
 971                                         ((ri / 254) * 256 + (ri % 254) + 2);
 972                                 // Place the characters after the target.
 973                                 int v = 0x41;
 974                                 foreach (char c in sc.InnerText) {
 975                                         cjkKO [(int) c - offset] = p;
 976                                         cjkKOlv2 [(int) c - offset] = (byte) v;
 977                                         v += 2;
 978                                 }
 979                         }
 980                 }
 981
 982                 // Gives the characters of chars, in order, consecutive weights
 983                 // starting at firstWeight and stores them in table, indexed by
 984                 // (code point - (char.MaxValue - table.Length)). Characters
 985                 // below lowerBound are not stored; a warning naming category
 986                 // is written to stderr and no weight is used up for them.
 987                 void AssignSequentialCJKWeights (string category, ushort [] table, string chars, char lowerBound, int firstWeight)
 988                 {
 989                         int offset = char.MaxValue - table.Length;
 990                         int v = firstWeight;
 991                         foreach (char c in chars) {
 992                                 if (c < lowerBound) {
 993                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
 994                                         continue;
 995                                 }
 996                                 table [(int) c - offset] = (ushort) v++;
 997                                 // Values whose low byte would be 00 or 01 are skipped.
 998                                 if (v % 256 == 0)
 999                                         v += 2;
1000                         }
1001                 }
1022
1023                 #endregion
1024
1025                 #region Generation
1026
1027                 void FillIgnorables ()
1028                 {
1029                         // Sets flag values 1, 2 and 4 in ignorableFlags [cp] when
1030                         // IsIgnorable, IsIgnorableSymbol and IsIgnorableNonSpacing
1031                         // respectively hold; unassigned code points are left untouched.
1032                         for (int cp = char.MinValue; cp <= char.MaxValue; cp++) {
1033                                 UnicodeCategory category = Char.GetUnicodeCategory ((char) cp);
1034                                 if (category != UnicodeCategory.OtherNotAssigned) {
1035                                         if (IsIgnorable (cp)) ignorableFlags [cp] |= 1;
1036                                         if (IsIgnorableSymbol (cp)) ignorableFlags [cp] |= 2;
1037                                         if (IsIgnorableNonSpacing (cp)) ignorableFlags [cp] |= 4;
1038                                 }
1039                         }
1040                 }
1041
1042                 void ModifyParsedValues ()
1043                 {
1044                         // number, secondary weights: numberSecondaryWeightBounds holds
1045                         // [start, end) pairs; each pair gets the next weight, counting
1046                         // up from 0x38, applied to the numeric characters inside it.
1047                         int [] bounds = numberSecondaryWeightBounds;
1048                         byte weight = 0x38;
1049                         for (int pair = 0; pair < bounds.Length; pair += 2) {
1050                                 int end = bounds [pair + 1];
1051                                 for (int n = bounds [pair]; n < end; n++)
1052                                         if (Char.IsNumber ((char) n))
1053                                                 diacritical [n] = weight;
1054                                 weight++;
1055                         }
1056
1057                         // Korean parens numbers
1058                         for (int k = 0x3200; k <= 0x321C; k++)
1059                                 diacritical [k] = 0xA;
1060                         for (int k = 0x3260; k <= 0x327B; k++)
1061                                 diacritical [k] = 0xC;
1062
1063                         // Update name part of named characters. The list is walked
1064                         // backwards so that removing an entry needs no index fix-up.
1065                         for (int i = sortableCharNames.Count - 1; i >= 0; i--) {
1066                                 int cp = (int) ((DictionaryEntry) sortableCharNames [i]).Key;
1067                                 string renamed = null;
1068                                 bool remove = false;
1069                                 switch (cp) {
1070                                 case 0x2101: renamed = "A_1"; break;
1071                                 case 0x33C3: renamed = "A_2"; break;
1072                                 case 0x2105: renamed = "C_1"; break;
1073                                 case 0x2106: renamed = "C_2"; break;
1074                                 case 0x211E: renamed = "R1"; break;
1075                                 case 0x211F: renamed = "R2"; break;
1076                                 // Remove some of them!
1077                                 case 0x2103: case 0x2109:
1078                                 case 0x2116: case 0x2117:
1079                                 case 0x2118: case 0x2125:
1080                                 case 0x2127: case 0x2129:
1081                                 case 0x212E: case 0x2132:
1082                                         remove = true;
1083                                         break;
1084                                 }
1085                                 if (remove)
1086                                         sortableCharNames.RemoveAt (i);
1087                                 else if (renamed != null)
1088                                         sortableCharNames [i] = new DictionaryEntry (cp, renamed);
1089                         }
1090                 }
1091
1092                 void GenerateCore ()
1093                 {
1094                         UnicodeCategory uc;
1095
1096                         #region Specially ignored // 01
1097                         // This will raise "Defined" flag up.
1098                         foreach (char c in specialIgnore)
1099                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1100                         #endregion
1101
1102
1103                         #region Variable weights
1104                         // Controls : 06 03 - 06 3D
1105                         fillIndex [6] = 3;
1106                         for (int i = 0; i < 65536; i++) {
1107                                 if (IsIgnorable (i))
1108                                         continue;
1109                                 char c = (char) i;
1110                                 uc = Char.GetUnicodeCategory (c);
1111                                 // NEL is whitespace but not ignored here.
1112                                 if (uc == UnicodeCategory.Control &&
1113                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1114                                         AddCharMap (c, 6, 1);
1115                         }
1116
1117                         // Apostrophe 06 80
1118                         fillIndex [6] = 0x80;
1119                         AddCharMapGroup ('\'', 6, 1, 0);
1120                         AddCharMap ('\uFE63', 6, 1);
1121
1122                         // Hyphen/Dash : 06 81 - 06 90
1123                         for (int i = 0; i < char.MaxValue; i++) {
1124                                 if (Char.GetUnicodeCategory ((char) i)
1125                                         == UnicodeCategory.DashPunctuation)
1126                                         AddCharMapGroupTail ((char) i, 6, 1);
1127                         }
1128
1129                         // Arabic variable weight chars 06 A0 -
1130                         fillIndex [6] = 0xA0;
1131                         // vowels
1132                         for (int i = 0x64B; i <= 0x650; i++)
1133                                 AddCharMapGroupTail ((char) i, 6, 1);
1134                         // sukun
1135                         AddCharMapGroup ('\u0652', 6, 1, 0);
1136                         // shadda
1137                         AddCharMapGroup ('\u0651', 6, 1, 0);
1138                         #endregion
1139
1140
1141                         #region Nonspacing marks // 01
1142                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1143
1144                         // Combining diacritical marks: 01 DC -
1145
1146                         fillIndex [0x1] = 0x41;
1147                         for (int i = 0x030E; i <= 0x0326; i++)
1148                                 if (!IsIgnorable (i))
1149                                         AddCharMap ((char) i, 0x1, 1);
1150                         for (int i = 0x0329; i <= 0x0334; i++)
1151                                 if (!IsIgnorable (i))
1152                                         AddCharMap ((char) i, 0x1, 1);
1153                         for (int i = 0x0339; i <= 0x0341; i++)
1154                                 if (!IsIgnorable (i))
1155                                         AddCharMap ((char) i, 0x1, 1);
1156                         fillIndex [0x1] = 0x72;
1157                         for (int i = 0x0346; i <= 0x0348; i++)
1158                                 if (!IsIgnorable (i))
1159                                         AddCharMap ((char) i, 0x1, 1);
1160                         for (int i = 0x02BE; i <= 0x02BF; i++)
1161                                 if (!IsIgnorable (i))
1162                                         AddCharMap ((char) i, 0x1, 1);
1163                         for (int i = 0x02C1; i <= 0x02C5; i++)
1164                                 if (!IsIgnorable (i))
1165                                         AddCharMap ((char) i, 0x1, 1);
1166                         for (int i = 0x02CE; i <= 0x02CF; i++)
1167                                 if (!IsIgnorable (i))
1168                                         AddCharMap ((char) i, 0x1, 1);
1169                         for (int i = 0x02D1; i <= 0x02D3; i++)
1170                                 if (!IsIgnorable (i))
1171                                         AddCharMap ((char) i, 0x1, 1);
1172                         AddCharMap ('\u02DE', 0x1, 1);
1173                         for (int i = 0x02E4; i <= 0x02E9; i++)
1174                                 if (!IsIgnorable (i))
1175                                         AddCharMap ((char) i, 0x1, 1);
1176
1177                         // LAMESPEC: It should not stop at '\u20E1'. There are
1178                         // a few more characters (that however results in
1179                         // overflow of level 2 unless we start before 0xDD).
1180                         fillIndex [0x1] = 0xDC;
1181                         for (int i = 0x20d0; i <= 0x20e1; i++)
1182                                 AddCharMap ((char) i, 0x1, 1);
1183                         #endregion
1184
1185
1186                         #region Whitespaces // 07 03 -
1187                         fillIndex [0x7] = 0x2;
1188                         AddCharMap (' ', 0x7, 2);
1189                         AddCharMap ('\u00A0', 0x7, 1);
1190                         for (int i = 9; i <= 0xD; i++)
1191                                 AddCharMap ((char) i, 0x7, 1);
1192                         for (int i = 0x2000; i <= 0x200B; i++)
1193                                 AddCharMap ((char) i, 0x7, 1);
1194
1195                         fillIndex [0x7] = 0x17;
1196                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1197                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1198
1199                         // Characters which are used to represent layout control.
1200                         // LAMESPEC: Windows developers seem to have thought
1201                         // that those characters are kind of whitespaces,
1202                         // while they aren't.
1203                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1204                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1205                         #endregion
1206
1207                         // FIXME: 09 should be more complete.
1208                         fillIndex [0x9] = 2;
1209                         // misc tech mark
1210                         for (int cp = 0x2300; cp <= 0x237A; cp++)
1211                                 AddCharMap ((char) cp, 0x9, 1, 0);
1212
1213                         // arrows
1214                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1215                         foreach (DictionaryEntry de in arrowValues) {
1216                                 int idx = (int) de.Value;
1217                                 int cp = (int) de.Key;
1218                                 if (map [cp].Defined)
1219                                         continue;
1220                                 fillIndex [0x9] = (byte) (0xD8 + idx);
1221                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1222                                 arrowLv2 [idx]++;
1223                         }
1224                         // boxes
1225                         byte [] boxLv2 = new byte [128];
1226                         for (int i = 0; i < boxLv2.Length; i++)
1227                                 boxLv2 [i] = 3;
1228                         foreach (DictionaryEntry de in boxValues) {
1229                                 int cp = (int) de.Key;
1230                                 int idx = (int) de.Value;
1231                                 if (map [cp].Defined)
1232                                         continue;
1233                                 fillIndex [0x9] = (byte) (0xE5 + idx);
1234                                 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1235                                 boxLv2 [idx]++;
1236                         }
1237                         // Some special characters (slanted)
1238                         fillIndex [0x9] = 0xF4;
1239                         AddCharMap ('\u2571', 0x9, 3);
1240                         AddCharMap ('\u2572', 0x9, 3);
1241                         AddCharMap ('\u2573', 0x9, 3);
1242
1243                         // FIXME: 08 should be more complete.
1244                         fillIndex [0x8] = 2;
1245                         for (int cp = 0; cp < char.MaxValue; cp++)
1246                                 if (!map [cp].Defined &&
1247                                         Char.GetUnicodeCategory ((char) cp) ==
1248                                         UnicodeCategory.MathSymbol)
1249                                         AddCharMapGroup ((char) cp, 0x8, 1, 0);
1250
1251                         // FIXME: implement 0A
1252                         #region Symbols
1253                         fillIndex [0xA] = 2;
1254                         // byte currency symbols
1255                         for (int cp = 0; cp < 0x100; cp++) {
1256                                 uc = Char.GetUnicodeCategory ((char) cp);
1257                                 if (!IsIgnorable (cp) &&
1258                                         uc == UnicodeCategory.CurrencySymbol &&
1259                                         cp != '$')
1260                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1261                         }
1262                         // byte other symbols
1263                         for (int cp = 0; cp < 0x100; cp++) {
1264                                 uc = Char.GetUnicodeCategory ((char) cp);
1265                                 if (!IsIgnorable (cp) &&
1266                                         uc == UnicodeCategory.OtherSymbol)
1267                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
1268                         }
1269
1270                         fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1271                         for (int cp = 0x2600; cp <= 0x2613; cp++)
1272                                 AddCharMap ((char) cp, 0xA, 1, 0);
1273                         for (int cp = 0x2620; cp <= 0x2770; cp++)
1274                                 if (Char.IsSymbol ((char) cp))
1275                                         AddCharMap ((char) cp, 0xA, 1, 0);
1276
1277                         #endregion
1278
1279                         #region Numbers // 0C 02 - 0C E1
1280                         fillIndex [0xC] = 2;
1281
1282                         // 9F8 : Bengali "one less than the denominator"
1283                         AddCharMap ('\u09F8', 0xC, 1);
1284
1285                         ArrayList numbers = new ArrayList ();
1286                         for (int i = 0; i < 65536; i++)
1287                                 if (!IsIgnorable (i) &&
1288                                         Char.IsNumber ((char) i) &&
1289                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1290                                         numbers.Add (i);
1291
1292                         ArrayList numberValues = new ArrayList ();
1293                         foreach (int i in numbers)
1294                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1295                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1296
1297 //foreach (DictionaryEntry de in numberValues)
1298 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1299
1300                         decimal prevValue = -1;
1301                         foreach (DictionaryEntry de in numberValues) {
1302                                 int cp = (int) de.Key;
1303                                 decimal currValue = (decimal) de.Value;
1304                                 bool addnew = false;
1305                                 if (prevValue < currValue &&
1306                                         prevValue - (int) prevValue == 0 &&
1307                                         prevValue >= 1) {
1308
1309                                         addnew = true;
1310                                         // Process Hangzhou and Roman numbers
1311
1312                                         // There are some SPECIAL cases.
1313                                         if (currValue != 4) // no increment for 4
1314                                                 fillIndex [0xC]++;
1315
1316                                         int xcp;
1317                                         xcp = (int) prevValue + 0x2170 - 1;
1318                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1319                                         xcp = (int) prevValue + 0x2160 - 1;
1320                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1321                                         fillIndex [0xC] += 2;
1322                                         xcp = (int) prevValue + 0x3021 - 1;
1323                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1324                                         fillIndex [0xC]++;
1325                                 }
1326                                 if (prevValue < currValue)
1327                                         prevValue = currValue;
1328                                 if (map [cp].Defined)
1329                                         continue;
1330                                 // Hangzhou and Roman numbers are added later
1331                                 // (code is above)
1332                                 else if (0x3021 <= cp && cp < 0x302A
1333                                         || 0x2160 <= cp && cp < 0x216A
1334                                         || 0x2170 <= cp && cp < 0x217A)
1335                                         continue;
1336
1337                                 if (cp ==  0x215B) // FIXME: why?
1338                                         fillIndex [0xC] += 2;
1339                                 else if (cp == 0x3021) // FIXME: why?
1340                                         fillIndex [0xC]++;
1341                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1342
1343                                 if (addnew || cp <= '9') {
1344                                         int xcp;
1345                                         if (1 <= currValue && currValue <= 10) {
1346                                                 xcp = cp - 0x31 + 0x2776;
1347                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1348                                                 xcp = cp - 0x31 + 0x2780;
1349                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1350                                                 xcp = cp - 0x31 + 0x278A;
1351                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1352                                         }
1353                                         if (1 <= currValue && currValue <= 20) {
1354                                                 xcp = cp - 0x31 + 0x2460;
1355                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1356                                                 xcp = cp - 0x31 + 0x2474;
1357                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1358                                                 xcp = cp - 0x31 + 0x2488;
1359                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1360                                         }
1361                                 }
1362
1363                                 if (cp != 0x09E7 && cp != 0x09EA)
1364                                         fillIndex [0xC]++;
1365
1366                                 // Add special cases that are not regarded as
1367                                 // numbers in UnicodeCategory speak.
1368                                 if (cp == '5') {
1369                                         // TONE FIVE
1370                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1371                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1372                                 }
1373                                 else if (cp == '6') // FIXME: why?
1374                                         fillIndex [0xC]++;
1375                         }
1376
1377                         // 221E: infinity
1378                         fillIndex [0xC] = 0xFF;
1379                         AddCharMap ('\u221E', 0xC, 1);
1380                         #endregion
1381
1382                         #region Letters and NonSpacing Marks (general)
1383
1384                         // ASCII Latin alphabets
1385                         for (int i = 0; i < alphabets.Length; i++)
1386                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1387
1388
1389                         // non-ASCII Latin alphabets
1390                         // FIXME: there are no such characters that are placed
1391                         // *after* "alphabets" array items. This is nothing
1392                         // more than a hack that creates dummy weight for
1393                         // primary characters.
1394                         for (int i = 0x0080; i < 0x0300; i++) {
1395                                 if (!Char.IsLetter ((char) i))
1396                                         continue;
1397                                 // Latin letters which have an NFKD mapping are
1398                                 // not added as independent primary characters.
1399                                 if (decompIndex [i] != 0)
1400                                         continue;
1401                                 // SPECIAL CASES:
1402                                 // 1.some alphabets have primarily
1403                                 //   equivalent ASCII alphabets.
1404                                 // 2.some have independent primary weights,
1405                                 //   but inside a-to-z range.
1406                                 // 3.there are some expanded characters that
1407                                 //   are not part of Unicode Standard NFKD.
1408                                 switch (i) {
1409                                 // 1. skipping them does not make sense
1410 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
1411 //                              case 0x184: case 0x185: case 0x186: case 0x189:
1412 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
1413 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
1414 //                              case 0x19B: case 0x19C:
1415                                 // 2. skipping them does not make sense
1416 //                              case 0x14A: // Ng
1417 //                              case 0x14B: // ng
1418                                 // 3.
1419                                 case 0xC6: // AE
1420                                 case 0xE6: // ae
1421                                 case 0xDE: // Icelandic Thorn
1422                                 case 0xFE: // Icelandic Thorn
1423                                 case 0xDF: // German ss
1424                                 case 0xFF: // German ss
1425                                 // not classified yet
1426 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1427 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1428 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1429 //                              case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1430 //                              case 0x1DD:
1431                                         continue;
1432                                 }
1433                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
1434                         }
1435
1436                         // Greek and Coptic
1437                         fillIndex [0xF] = 02;
1438                         for (int i = 0x0380; i < 0x0390; i++)
1439                                 if (Char.IsLetter ((char) i))
1440                                         AddLetterMap ((char) i, 0xF, 1);
1441                         fillIndex [0xF] = 02;
1442                         for (int i = 0x0391; i < 0x03CF; i++)
1443                                 if (Char.IsLetter ((char) i))
1444                                         AddLetterMap ((char) i, 0xF, 1);
1445                         fillIndex [0xF] = 0x40;
1446                         for (int i = 0x03D0; i < 0x0400; i++)
1447                                 if (Char.IsLetter ((char) i))
1448                                         AddLetterMap ((char) i, 0xF, 1);
1449
1450                         // Cyrillic - UCA order w/ some modification
1451                         fillIndex [0x10] = 0x3;
1452                         // table which is mostly from UCA DUCET.
1453                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1454                                 char c = orderedCyrillic [i];
1455                                 if (Char.IsLetter (c))
1456                                         AddLetterMap (c, 0x10, 3);
1457                         }
1458                         for (int i = 0x0460; i < 0x0481; i++) {
1459                                 if (Char.IsLetter ((char) i))
1460                                         AddLetterMap ((char) i, 0x10, 3);
1461                         }
1462
1463                         // Armenian
1464                         fillIndex [0x11] = 0x3;
1465                         for (int i = 0x0531; i < 0x0586; i++)
1466                                 if (Char.IsLetter ((char) i))
1467                                         AddLetterMap ((char) i, 0x11, 1);
1468
1469                         // Hebrew
1470                         // -Letters
1471                         fillIndex [0x12] = 0x3;
1472                         for (int i = 0x05D0; i < 0x05FF; i++)
1473                                 if (Char.IsLetter ((char) i))
1474                                         AddLetterMap ((char) i, 0x12, 1);
1475                         // -Accents
1476                         fillIndex [0x1] = 0x3;
1477                         for (int i = 0x0591; i <= 0x05C2; i++)
1478                                 if (i != 0x05BE)
1479                                         AddCharMap ((char) i, 0x1, 1);
1480
1481                         // Arabic
1482                         fillIndex [0x1] = 0x8E;
1483                         fillIndex [0x13] = 0x3;
1484                         for (int i = 0x0621; i <= 0x064A; i++) {
1485                                 // Abjad
1486                                 if (Char.GetUnicodeCategory ((char) i)
1487                                         != UnicodeCategory.OtherLetter) {
1488                                         // FIXME: arabic nonspacing marks are
1489                                         // in different order.
1490                                         AddCharMap ((char) i, 0x1, 1);
1491                                         continue;
1492                                 }
1493 //                              map [i] = new CharMapEntry (0x13,
1494 //                                      (byte) arabicLetterPrimaryValues [i], 1);
1495                                 fillIndex [0x13] =
1496                                         (byte) arabicLetterPrimaryValues [i];
1497                                 AddLetterMap ((char) i, 0x13, 0);
1498                         }
1499                         fillIndex [0x13] = 0x84;
1500                         for (int i = 0x0674; i < 0x06D6; i++)
1501                                 if (Char.IsLetter ((char) i))
1502                                         AddLetterMap ((char) i, 0x13, 1);
1503
1504                         // Devanagari
1505                         // FIXME: it does seem straight codepoint mapping.
1506                         fillIndex [0x14] = 04;
1507                         for (int i = 0x0901; i < 0x0905; i++)
1508                                 if (!IsIgnorable (i))
1509                                         AddLetterMap ((char) i, 0x14, 2);
1510                         fillIndex [0x14] = 0xB;
1511                         for (int i = 0x0905; i < 0x093A; i++)
1512                                 if (Char.IsLetter ((char) i))
1513                                         AddLetterMap ((char) i, 0x14, 4);
1514                         for (int i = 0x093E; i < 0x094F; i++)
1515                                 if (!IsIgnorable (i))
1516                                         AddLetterMap ((char) i, 0x14, 2);
1517
1518                         // Bengali
1519                         // -Letters
1520                         fillIndex [0x15] = 02;
1521                         for (int i = 0x0980; i < 0x9FF; i++) {
1522                                 if (IsIgnorable (i))
1523                                         continue;
1524                                 if (i == 0x09E0)
1525                                         fillIndex [0x15] = 0x3B;
1526                                 switch (Char.GetUnicodeCategory ((char) i)) {
1527                                 case UnicodeCategory.NonSpacingMark:
1528                                 case UnicodeCategory.DecimalDigitNumber:
1529                                 case UnicodeCategory.OtherNumber:
1530                                         continue;
1531                                 }
1532                                 AddLetterMap ((char) i, 0x15, 1);
1533                         }
1534                         // -Signs
1535                         fillIndex [0x1] = 0x3;
1536                         for (int i = 0x0981; i < 0x0A00; i++)
1537                                 if (Char.GetUnicodeCategory ((char) i) ==
1538                                         UnicodeCategory.NonSpacingMark)
1539                                         AddCharMap ((char) i, 0x1, 1);
1540
1541                         // Gurmukhi. orderedGurmukhi is from UCA
1542                         // FIXME: it does not look equivalent to UCA.
1543                         fillIndex [0x1] = 03;
1544                         fillIndex [0x16] = 02;
1545                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
1546                                 char c = orderedGurmukhi [i];
1547                                 if (IsIgnorable ((int) c))
1548                                         continue;
1549                                 if (!Char.IsLetter (c)) {
1550                                         AddLetterMap (c, 0x1, 1);
1551                                         continue;
1552                                 }
1553                                 if (c == '\u0A3C' || c == '\u0A4D' ||
1554                                         '\u0A66' <= c && c <= '\u0A71')
1555                                         continue;
1556                                 AddLetterMap (c, 0x16, 4);
1557                         }
1558
1559                         // Gujarati. orderedGujarati is from UCA
1560                         fillIndex [0x17] = 02;
1561                         for (int i = 0; i < orderedGujarati.Length; i++)
1562                                 AddLetterMap (orderedGujarati [i], 0x17, 4);
1563
1564                         // Oriya
1565                         fillIndex [0x18] = 02;
1566                         for (int i = 0x0B00; i < 0x0B7F; i++) {
1567                                 switch (Char.GetUnicodeCategory ((char) i)) {
1568                                 case UnicodeCategory.NonSpacingMark:
1569                                 case UnicodeCategory.DecimalDigitNumber:
1570                                         continue;
1571                                 }
1572                                 AddLetterMap ((char) i, 0x18, 1);
1573                         }
1574
1575                         // Tamil
1576                         fillIndex [0x19] = 2;
1577                         AddCharMap ('\u0BD7', 0x19, 0);
1578                         fillIndex [0x19] = 0xA;
1579                         // vowels (FIXME: this loop never runs; 0x0BD7 is above the 0x0B94 bound)
1580                         for (int i = 0x0BD7; i < 0x0B94; i++)
1581                                 if (Char.IsLetter ((char) i))
1582                                         AddCharMap ((char) i, 0x19, 2);
1583                         // special vowel
1584                         fillIndex [0x19] = 0x24;
1585                         AddCharMap ('\u0B94', 0x19, 0);
1586                         fillIndex [0x19] = 0x26;
1587                         // The array for Tamil consonants is a constant.
1588                         // Windows have almost similar sequence to TAM from
1589                         // tamilnet but a bit different in Grantha.
1590                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
1591                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1592                         // combining marks
1593                         fillIndex [0x19] = 0x82;
1594                         for (int i = 0x0BBE; i < 0x0BCD; i++)
1595                                 if (Char.GetUnicodeCategory ((char) i) ==
1596                                         UnicodeCategory.SpacingCombiningMark
1597                                         || i == 0x0BC0)
1598                                         AddLetterMap ((char) i, 0x19, 2);
1599
1600                         // Telugu
1601                         fillIndex [0x1A] = 0x4;
1602                         for (int i = 0x0C00; i < 0x0C62; i++) {
1603                                 if (i == 0x0C55 || i == 0x0C56)
1604                                         continue; // skip
1605                                 AddCharMap ((char) i, 0x1A, 3);
1606                                 char supp = (i == 0x0C0B) ? '\u0C60':
1607                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
1608                                 if (supp == char.MinValue)
1609                                         continue;
1610                                 AddCharMap (supp, 0x1A, 3);
1611                         }
1612
1613                         // Kannada
1614                         fillIndex [0x1B] = 4;
1615                         for (int i = 0x0C80; i < 0x0CE5; i++) {
1616                                 if (i == 0x0CD5 || i == 0x0CD6)
1617                                         continue; // ignore
1618                                 AddCharMap ((char) i, 0x1B, 3);
1619                         }
1620
1621                         // Malayalam
1622                         fillIndex [0x1C] = 2;
1623                         for (int i = 0x0D02; i < 0x0D61; i++)
1624                                 // FIXME: I avoided MSCompatUnicodeTable usage
1625                                 // here (it results in recursion). So check if
1626                                 // using NonSpacingMark makes sense or not.
1627                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1628 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1629                                         AddCharMap ((char) i, 0x1C, 1);
1630
1631                         // Thai ... note that it breaks 0x1E wall after E2B!
1632                         // Also, all Thai characters have level 2 value 3.
1633                         fillIndex [0x1E] = 2;
1634                         for (int i = 0xE44; i < 0xE48; i++)
1635                                 AddCharMap ((char) i, 0x1E, 1, 3);
1636                         for (int i = 0xE01; i < 0xE2B; i++)
1637                                 AddCharMap ((char) i, 0x1E, 6, 0);
1638                         fillIndex [0x1F] = 5;
1639                         for (int i = 0xE2B; i < 0xE30; i++)
1640                                 AddCharMap ((char) i, 0x1F, 6, 0);
1641                         for (int i = 0xE30; i < 0xE3B; i++)
1642                                 AddCharMap ((char) i, 0x1F, 1, 3);
1643                         // some Thai characters remains.
1644                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
1645                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1646                         foreach (char c in specialThai)
1647                                 AddCharMap (c, 0x1F, 1);
1648
1649                         // Lao
1650                         fillIndex [0x1F] = 2;
1651                         for (int i = 0xE80; i < 0xEDF; i++)
1652                                 if (Char.IsLetter ((char) i))
1653                                         AddCharMap ((char) i, 0x1F, 1);
1654
1655                         // Georgian. orderedGeorgian is from UCA DUCET.
1656                         fillIndex [0x21] = 5;
1657                         for (int i = 0; i < orderedGeorgian.Length; i++)
1658                                 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1659
1660                         // Japanese Kana.
1661                         fillIndex [0x22] = 2;
1662                         int kanaOffset = 0x3041;
1663                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1664
1665                         for (int gyo = 0; gyo < 9; gyo++) {
1666                                 for (int dan = 0; dan < 5; dan++) {
1667                                         if (gyo == 7 && dan % 2 == 1) {
1668                                                 // 'ya'-gyo
1669                                                 fillIndex [0x22]++;
1670                                                 kanaOffset -= 2; // There is no space for yi and ye.
1671                                                 continue;
1672                                         }
1673                                         int cp = kanaOffset + dan * kanaLines [gyo];
1674                                         // small lines (a-gyo, ya-gyo)
1675                                         if (gyo == 0 || gyo == 7) {
1676                                                 AddKanaMap (cp, 1); // small
1677                                                 AddKanaMap (cp + 1, 1);
1678                                         }
1679                                         else
1680                                                 AddKanaMap (cp, kanaLines [gyo]);
1681                                         fillIndex [0x22]++;
1682
1683                                         if (cp == 0x3061) {
1684                                                 // add small 'Tsu' (before normal one)
1685                                                 AddKanaMap (0x3063, 1);
1686                                                 kanaOffset++;
1687                                         }
1688                                 }
1689                                 fillIndex [0x22] += 3;
1690                                 kanaOffset += 5 * kanaLines [gyo];
1691                         }
1692
1693                         // Wa-gyo is almost special, so I just manually add.
1694                         AddLetterMap ((char) 0x308E, 0x22, 0);
1695                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1696                         AddLetterMap ((char) 0x308F, 0x22, 0);
1697                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1698                         fillIndex [0x22]++;
1699                         AddLetterMap ((char) 0x3090, 0x22, 0);
1700                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1701                         fillIndex [0x22] += 2;
1702                         // no "Wu" in Japanese.
1703                         AddLetterMap ((char) 0x3091, 0x22, 0);
1704                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1705                         fillIndex [0x22]++;
1706                         AddLetterMap ((char) 0x3092, 0x22, 0);
1707                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1708                         // Nn
1709                         fillIndex [0x22] = 0x80;
1710                         AddLetterMap ((char) 0x3093, 0x22, 0);
1711                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1712
1713                         // JIS Japanese square chars.
1714                         fillIndex [0x22] = 0x97;
1715                         jisJapanese.Sort (JISComparer.Instance);
1716                         foreach (JISCharacter j in jisJapanese)
1717                                 AddCharMap ((char) j.CP, 0x22, 1);
1718                         // non-JIS Japanese square chars.
1719                         nonJisJapanese.Sort (NonJISComparer.Instance);
1720                         foreach (NonJISCharacter j in nonJisJapanese)
1721                                 AddCharMap ((char) j.CP, 0x22, 1);
1722
1723                         // Bopomofo
1724                         fillIndex [0x23] = 0x02;
1725                         for (int i = 0x3105; i <= 0x312C; i++)
1726                                 AddCharMap ((char) i, 0x23, 1);
1727
1728                         // Estrangela: ancient Syriac
1729                         fillIndex [0x24] = 0x0B;
1730                         // FIXME: is 0x71E really alternative form?
1731                         ArrayList syriacAlternatives = new ArrayList (
1732                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1733                         for (int i = 0x0710; i <= 0x072C; i++) {
1734                                 if (i == 0x0711) // NonSpacingMark
1735                                         continue;
1736                                 if (syriacAlternatives.Contains (i))
1737                                         continue;
1738                                 AddCharMap ((char) i, 0x24, 4);
1739                                 // FIXME: why?
1740                                 if (i == 0x721)
1741                                         fillIndex [0x24]++;
1742                         }
1743                         foreach (int cp in syriacAlternatives)
1744                                 map [cp] = new CharMapEntry (0x24,
1745                                         (byte) (map [cp - 1].Level1 + 2),
1746                                         0);
1747
1748                         // Thaana
1749                         // FIXME: it turned out that it does not look like UCA
1750                         fillIndex [0x24] = 0x6E;
1751                         for (int i = 0; i < orderedThaana.Length; i++) {
1752                                 if (IsIgnorableNonSpacing (i))
1753                                         continue;
1754                                 AddCharMap (orderedThaana [i], 0x24, 2);
1755                         }
1756                         #endregion
1757
1758                         // FIXME: Add more culture-specific letters (that are
1759                         // not supported in Windows collation) here.
1760
1761                         // Surrogate ... they are computed.
1762
1763                         #region Hangul
1764                         // Hangul.
1765                         //
1766                         // Unlike UCA Windows Hangul sequence mixes Jongseong
1767                         // with Choseong sequence as well as Jungseong,
1768                         // adjusted to have the same primary weight for the
1769                         // same base character. So it is impossible to compute
1770                         // those sort keys.
1771                         //
1772                         // Here I introduce an ordered sequence of mixed
1773                         // 'commands' and 'characters' that is similar to
1774                         // LDML text:
1775                         //      - ',' increases primary weight.
1776                         //      - [A B] means a range, increasing index
1777                         //      - {A B} means a range, without increasing index
1778                         //      - '=' is no operation (it means the characters
1779                         //        of both sides have the same weight).
1780                         //      - '>' inserts a Hangul Syllable block that
1781                         //        contains 0x251 characters.
1782                         //      - '<' decreases the index
1783                         //      - '0'-'9' means skip count
1784                         //      - whitespaces are ignored
1785                         //
1786
1787                         string hangulSequence =
1788                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
1789                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
1790                         + "<{\u1113 \u1116}, \u3165,"
1791                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
1792                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
1793                         + "\u11CA, \u1104, \u11CB > \u1105 >"
1794                         + "\u11B0, [\u11CC \u11D0], \u11B1, [\u11D1 \u11D2],"
1795                                 + "\u11B2, [\u11D3 \u11D5], \u11B3,"
1796                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
1797                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
1798                         + "[\u11DA \u11E2], \u1107=\u11B8 >"
1799                         + "<{\u111E \u1120}, \u3172,, \u3173, "
1800                                 + "\u11E3, \u1108 >"
1801                         + "\u11B9,,,,,,,,, [\u11E4 \u11E6],, \u1109=\u11BA,,,"
1802                                 + "\u3214=\u3274 <>"
1803                         + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
1804                                 + "\u11EA,, \u110A=\u11BB,,, >"
1805                         + "{\u1134 \u1140}, \u317E,,,,,, \u11EB,"
1806                         + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
1807                         + "\u11EE, \u11EC, \u11ED,,,,, \u11F1,, \u11F2,,,"
1808                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
1809                         + "\u110D,,  >"
1810                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
1811                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
1812                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
1813                         + "\u11F3, \u11F4, \u1112=\u11C2 >"
1814                         + "\u11F9, [\u11F5 \u11F8]"
1815                         ;
1816
1817                         byte hangulCat = 0x52;
1818                         fillIndex [hangulCat] = 0x2;
1819
1820                         int syllableBlock = 0;
1821                         for (int n = 0; n < hangulSequence.Length; n++) {
1822                                 char c = hangulSequence [n];
1823                                 int start, end;
1824                                 if (Char.IsWhiteSpace (c))
1825                                         continue;
1826                                 switch (c) {
1827                                 case '=':
1828                                         break; // NOP
1829                                 case ',':
1830                                         IncrementSequentialIndex (ref hangulCat);
1831                                         break;
1832                                 case '<':
1833                                         if (fillIndex [hangulCat] == 2)
1834                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
1835                                         fillIndex [hangulCat]--;
1836                                         break;
1837                                 case '>':
1838                                         IncrementSequentialIndex (ref hangulCat);
1839                                         for (int l = 0; l < 0x15; l++)
1840                                                 for (int v = 0; v < 0x1C; v++) {
1841                                                         AddCharMap (
1842                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
1843                                                         IncrementSequentialIndex (ref hangulCat);
1844                                                 }
1845                                         syllableBlock++;
1846                                         break;
1847                                 case '[':
1848                                         start = hangulSequence [n + 1];
1849                                         end = hangulSequence [n + 3];
1850                                         for (int i = start; i <= end; i++) {
1851                                                 AddCharMap ((char) i, hangulCat, 0);
1852                                                 if (end > i)
1853                                                         IncrementSequentialIndex (ref hangulCat);
1854                                         }
1855                                         n += 4; // consumes 5 characters for this operation
1856                                         break;
1857                                 case '{':
1858                                         start = hangulSequence [n + 1];
1859                                         end = hangulSequence [n + 3];
1860                                         for (int i = start; i <= end; i++)
1861                                                 AddCharMap ((char) i, hangulCat, 0);
1862                                         n += 4; // consumes 5 characters for this operation
1863                                         break;
1864                                 default:
1865                                         AddCharMap (c, hangulCat, 0);
1866                                         break;
1867                                 }
1868                         }
1869
1870                         #endregion
1871
1872                         // Letterlike characters and CJK compatibility square
1873                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
1874                         int [] counts = new int ['Z' - 'A' + 1];
1875                         char [] namedChars = new char [sortableCharNames.Count];
1876                         int nCharNames = 0;
1877                         foreach (DictionaryEntry de in sortableCharNames) {
1878                                 counts [((string) de.Value) [0] - 'A']++;
1879                                 namedChars [nCharNames++] = (char) ((int) de.Key);
1880                         }
1881                         nCharNames = 0; // reset
1882                         for (int a = 0; a < counts.Length; a++) {
1883                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
1884                                 for (int i = 0; i < counts [a]; i++)
1885 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
1886                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
1887                         }
1888
1889                         // CJK unified ideograph.
1890                         byte cjkCat = 0x9E;
1891                         fillIndex [cjkCat] = 0x2;
1892                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
1893                                 if (!IsIgnorable (cp))
1894                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
1895                         // CJK Extensions goes here.
1896                         // LAMESPEC: With this Windows style CJK layout, it is
1897                         // impossible to add more CJK ideograph i.e. 0x9FA6-
1898                         // 0x9FBB can never be added w/o breaking compat.
1899                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
1900                                 if (!IsIgnorable (cp))
1901                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
1902
1903                         // PrivateUse ... computed.
1904                         // remaining Surrogate ... computed.
1905
1906                         #region Special "biggest" area (FF FF)
1907                         fillIndex [0xFF] = 0xFF;
1908                         char [] specialBiggest = new char [] {
1909                                 '\u3005', '\u3031', '\u3032', '\u309D',
1910                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1911                                 '\uFE7C', '\uFE7D', '\uFF70'};
1912                         foreach (char c in specialBiggest)
1913                                 AddCharMap (c, 0xFF, 0);
1914                         #endregion
1915
1916                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
1917                         // non-alphanumeric ASCII except for: + - < = > '
1918                         for (int i = 0x21; i < 0x7F; i++) {
1919                                 if (Char.IsLetterOrDigit ((char) i)
1920                                         || "+-<=>'".IndexOf ((char) i) >= 0)
1921                                         continue; // they are not added here.
1922                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
1923                                 // Insert 3001 after ',' and 3002 after '.'
1924                                 if (i == 0x2C)
1925                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
1926                                 else if (i == 0x2E) {
1927                                         fillIndex [0x7]--;
1928                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
1929                                 }
1930                                 else if (i == 0x3A)
1931                                         AddCharMap ('\uFE30', 0x7, 1, 0);
1932                         }
1933                         #endregion
1934
1935                         #region 07 - Punctuations and something else
1936                         for (int i = 0xA0; i < char.MaxValue; i++) {
1937                                 if (IsIgnorable (i))
1938                                         continue;
1939
1940                                 // SPECIAL CASES:
1941                                 switch (i) {
1942                                 case 0xAB: // 08
1943                                 case 0x2329: // 09
1944                                 case 0x232A: // 09
1945                                         continue;
1946                                 }
1947
1948                                 switch (Char.GetUnicodeCategory ((char) i)) {
1949                                 case UnicodeCategory.OtherPunctuation:
1950                                 case UnicodeCategory.ClosePunctuation:
1951                                 case UnicodeCategory.InitialQuotePunctuation:
1952                                 case UnicodeCategory.FinalQuotePunctuation:
1953                                 case UnicodeCategory.ModifierSymbol:
1954                                         // SPECIAL CASES: // 0xA
1955                                         if (0x2020 <= i && i <= 0x2042)
1956                                                 continue;
1957                                         AddCharMapGroup ((char) i, 0x7, 1, 0);
1958                                         break;
1959                                 default:
1960                                         if (i == 0xA6) // SPECIAL CASE. FIXME: why?
1961                                                 goto case UnicodeCategory.OtherPunctuation;
1962                                         break;
1963                                 }
1964                         }
1965                         #endregion
1966
1967                         // FIXME: for 07 xx we need more love.
1968
1969                         // Characters w/ diacritical marks (NFKD)
1970                         for (int i = 0; i <= char.MaxValue; i++) {
1971                                 if (map [i].Defined || IsIgnorable (i))
1972                                         continue;
1973                                 if (decompIndex [i] == 0)
1974                                         continue;
1975
1976                                 int start = decompIndex [i];
1977                                 int primaryChar = decompValues [start];
1978                                 int secondary = 0;
1979                                 bool skip = false;
1980                                 int length = decompLength [i];
1981                                 // special processing for parenthesized ones.
1982                                 if (length == 3 &&
1983                                         decompValues [start] == '(' &&
1984                                         decompValues [start + 2] == ')') {
1985                                         primaryChar = decompValues [start + 1];
1986                                         length = 1;
1987                                 }
1988
1989                                 if (map [primaryChar].Level1 == 0)
1990                                         continue;
1991
1992                                 for (int l = 1; l < length; l++) {
1993                                         int c = decompValues [start + l];
1994                                         if (map [c].Level1 != 0)
1995                                                 skip = true;
1996                                         secondary += diacritical [c];
1997                                 }
1998                                 if (skip)
1999                                         continue;
2000                                 map [i] = new CharMapEntry (
2001                                         map [primaryChar].Category,
2002                                         map [primaryChar].Level1,
2003                                         (byte) secondary);
2004
2005                         }
2006
2007                         #region Level2 adjustment
2008                         // Arabic Hamzah
2009                         diacritical [0x624] = 0x5;
2010                         diacritical [0x626] = 0x7;
2011                         diacritical [0x622] = 0x9;
2012                         diacritical [0x623] = 0xA;
2013                         diacritical [0x625] = 0xB;
2014                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
2015                         diacritical [0x64A] = 0x7; // Yaa'
2016
2017
2018                         for (int i = 0; i < char.MaxValue; i++) {
2019                                 byte mod = 0;
2020                                 byte cat = map [i].Category;
2021                                 switch (cat) {
2022                                 case 0xE: // Latin diacritics
2023                                 case 0x22: // Japanese: circled characters
2024                                         mod = diacritical [i];
2025                                         break;
2026                                 case 0x13: // Arabic
2027                                         if (diacritical [i] == 0)
2028                                                 mod = 0x8; // default for arabic
2029                                         break;
2030                                 }
2031                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
2032                                         mod = diacritical [i];
2033                                 if (mod > 0)
2034                                         map [i] = new CharMapEntry (
2035                                                 cat, map [i].Level1, mod);
2036                         }
2037                         #endregion
2038
2039                         // FIXME: this is hack but those which are
2040                         // NonSpacingMark characters and still undefined
2041                         // are likely to be nonspacing.
2042                         for (int i = 0; i < char.MaxValue; i++)
2043                                 if (!map [i].Defined &&
2044                                         !IsIgnorable (i) &&
2045                                         Char.GetUnicodeCategory ((char) i) ==
2046                                         UnicodeCategory.NonSpacingMark)
2047                                         AddCharMap ((char) i, 1, 1);
2048                 }
2049
2050                 private void IncrementSequentialIndex (ref byte hangulCat)
2051                 {
2052                         fillIndex [hangulCat]++;
2053                         if (fillIndex [hangulCat] == 0) { // overflown
2054                                 hangulCat++;
2055                                 fillIndex [hangulCat] = 0x2;
2056                         }
2057                 }
2058
2059                 // Reset fillIndex to fixed value and call AddLetterMap().
2060                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2061                 {
2062                         fillIndex [category] = alphaWeight;
2063                         AddLetterMap (c, category, 0);
2064
2065                         ArrayList al = latinMap [c] as ArrayList;
2066                         if (al == null)
2067                                 return;
2068
2069                         foreach (int cp in al)
2070                                 AddLetterMap ((char) cp, category, 0);
2071                 }
2072
2073                 private void AddKanaMap (int i, byte voices)
2074                 {
2075                         for (byte b = 0; b < voices; b++) {
2076                                 char c = (char) (i + b);
2077                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
2078                                 // Hiragana
2079                                 AddLetterMapCore (c, 0x22, 0, arg);
2080                                 // Katakana
2081                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
2082                         }
2083                 }
2084
2085                 private void AddLetterMap (char c, byte category, byte updateCount)
2086                 {
2087                         AddLetterMapCore (c, category, updateCount, 0);
2088                 }
2089
2090                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2091                 {
2092                         char c2;
2093                         // <small> updates index
2094                         c2 = ToSmallForm (c);
2095                         if (c2 != c)
2096                                 AddCharMapGroup (c2, category, updateCount, level2);
2097                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2098                         if (c2 != c && !map [(int) c2].Defined)
2099                                 AddLetterMapCore (c2, category, 0, level2);
2100                         bool doUpdate = true;
2101                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2102                                 doUpdate = false;
2103                         else
2104                                 AddCharMapGroup (c, category, 0, level2);
2105                         if (doUpdate)
2106                                 fillIndex [category] += updateCount;
2107                 }
2108
2109                 private bool AddCharMap (char c, byte category, byte increment)
2110                 {
2111                         return AddCharMap (c, category, increment, 0);
2112                 }
2113
2114                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2115                 {
2116                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
2117                                 return false; // do nothing
2118                         map [(int) c] = new CharMapEntry (category,
2119                                 category == 1 ? alt : fillIndex [category],
2120                                 category == 1 ? fillIndex [category] : alt);
2121                         fillIndex [category] += increment;
2122                         return true;
2123                 }
2124
2125                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2126                 {
2127                         char c2 = ToSmallFormTail (c);
2128                         if (c2 != c)
2129                                 AddCharMap (c2, category, updateCount, 0);
2130                         // itself
2131                         AddCharMap (c, category, updateCount, 0);
2132                         // <full>
2133                         c2 = ToFullWidthTail (c);
2134                         if (c2 != c)
2135                                 AddCharMapGroupTail (c2, category, updateCount);
2136                 }
2137
2138                 //
2139                 // Adds characters to table in the order below
2140                 // (+ increases weight):
2141                 //      (<small> +)
2142                 //      itself
2143                 //      <fraction>
2144                 //      <full> | <super> | <sub>
2145                 //      <circle> | <wide> (| <narrow>)
2146                 //      +
2147                 //      (vertical +)
2148                 //
2149                 // level2 is fixed (does not increase).
2150                 int [] sameWeightItems = new int [] {
2151                         DecompositionFraction,
2152                         DecompositionFull,
2153                         DecompositionSuper,
2154                         DecompositionSub,
2155                         DecompositionCircle,
2156                         DecompositionWide,
2157                         DecompositionNarrow,
2158                         };
2159                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2160                 {
2161                         if (map [(int) c].Defined)
2162                                 return;
2163
2164                         char small = char.MinValue;
2165                         char vertical = char.MinValue;
2166                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2167                         if (nfkd != null) {
2168                                 object smv = nfkd [(byte) DecompositionSmall];
2169                                 if (smv != null)
2170                                         small = (char) ((int) smv);
2171                                 object vv = nfkd [(byte) DecompositionVertical];
2172                                 if (vv != null)
2173                                         vertical = (char) ((int) vv);
2174                         }
2175
2176                         // <small> updates index
2177                         if (small != char.MinValue)
2178                                 AddCharMap (small, category, updateCount);
2179
2180                         // itself
2181                         AddCharMap (c, category, 0, level2);
2182
2183                         if (nfkd != null) {
2184                                 foreach (int weight in sameWeightItems) {
2185                                         object wv = nfkd [(byte) weight];
2186                                         if (wv != null)
2187                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
2188                                 }
2189                         }
2190
2191                         // update index here.
2192                         fillIndex [category] += updateCount;
2193
2194                         if (vertical != char.MinValue)
2195                                 AddCharMap (vertical, category, updateCount, level2);
2196                 }
2197
2198                 private void AddCharMapCJK (char c, ref byte category)
2199                 {
2200                         AddCharMap (c, category, 0, 0);
2201                         IncrementSequentialIndex (ref category);
2202
2203                         // Special. I wonder why but Windows skips 9E F9.
2204                         if (category == 0x9E && fillIndex [category] == 0xF9)
2205                                 IncrementSequentialIndex (ref category);
2206                 }
2207
2208                 private void AddCharMapGroupCJK (char c, ref byte category)
2209                 {
2210                         AddCharMapCJK (c, ref category);
2211
2212                         // LAMESPEC: see below.
2213                         if (c == '\u52DE') {
2214                                 AddCharMapCJK ('\u3298', ref category);
2215                                 AddCharMapCJK ('\u3238', ref category);
2216                         }
2217                         if (c == '\u5BEB')
2218                                 AddCharMapCJK ('\u32A2', ref category);
2219                         if (c == '\u91AB')
2220                                 // Especially this mapping order totally does
2221                                 // not make sense to me.
2222                                 AddCharMapCJK ('\u32A9', ref category);
2223
2224                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2225                         if (nfkd == null)
2226                                 return;
2227                         for (byte weight = 0; weight <= 0x12; weight++) {
2228                                 object wv = nfkd [weight];
2229                                 if (wv == null)
2230                                         continue;
2231                                 int w = (int) wv;
2232
2233                                 // Special: they are ignored in this area.
2234                                 // FIXME: check if it is sane
2235                                 if (0xF900 <= w && w <= 0xFAD9)
2236                                         continue;
2237                                 // LAMESPEC: on Windows some of CJK characters
2238                                 // in 3200-32B0 are incorrectly mapped. They
2239                                 // mix Chinise and Japanese Kanji when
2240                                 // ordering those characters.
2241                                 switch (w) {
2242                                 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2243                                         continue;
2244                                 }
2245
2246                                 AddCharMapCJK ((char) w, ref category);
2247                         }
2248                 }
2249
2250                 // For now it is only for 0x7 category.
2251                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2252                 {
2253                         char small = char.MinValue;
2254                         char vertical = char.MinValue;
2255                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2256                         if (nfkd != null) {
2257                                 object smv = nfkd [(byte) DecompositionSmall];
2258                                 if (smv != null)
2259                                         small = (char) ((int) smv);
2260                                 object vv = nfkd [(byte) DecompositionVertical];
2261                                 if (vv != null)
2262                                         vertical = (char) ((int) vv);
2263                         }
2264
2265                         // <small> updates index
2266                         if (small != char.MinValue)
2267                                 // SPECIAL CASE excluded (FIXME: why?)
2268                                 if (small != '\u2024')
2269                                         AddCharMap (small, category, updateCount);
2270
2271                         // itself
2272                         AddCharMap (c, category, updateCount, level2);
2273
2274                         // Since nfkdMap is problematic to have two or more
2275                         // NFKD to an identical character, here I iterate all.
2276                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
2277                                 if (decompLength [c2] == 1 &&
2278                                         (int) (decompValues [decompIndex [c2]]) == (int) c) {
2279                                         switch (decompType [c2]) {
2280                                         case DecompositionCompat:
2281                                                 AddCharMap ((char) c2, category, updateCount, level2);
2282                                                 break;
2283                                         }
2284                                 }
2285                         }
2286
2287                         if (vertical != char.MinValue)
2288                                 // SPECIAL CASE excluded (FIXME: why?)
2289                                 if (vertical != '\uFE33' && vertical != '\uFE34')
2290                                         AddCharMap (vertical, category, updateCount, level2);
2291                 }
2292
2293                 char ToFullWidth (char c)
2294                 {
2295                         return ToDecomposed (c, DecompositionFull, false);
2296                 }
2297
2298                 char ToFullWidthTail (char c)
2299                 {
2300                         return ToDecomposed (c, DecompositionFull, true);
2301                 }
2302
2303                 char ToSmallForm (char c)
2304                 {
2305                         return ToDecomposed (c, DecompositionSmall, false);
2306                 }
2307
2308                 char ToSmallFormTail (char c)
2309                 {
2310                         return ToDecomposed (c, DecompositionSmall, true);
2311                 }
2312
2313                 char ToDecomposed (char c, byte d, bool tail)
2314                 {
2315                         if (decompType [(int) c] != d)
2316                                 return c;
2317                         int idx = decompIndex [(int) c];
2318                         if (tail)
2319                                 idx += decompLength [(int) c] - 1;
2320                         return (char) decompValues [idx];
2321                 }
2322
2323                 bool ExistsJIS (int cp)
2324                 {
2325                         foreach (JISCharacter j in jisJapanese)
2326                                 if (j.CP == cp)
2327                                         return true;
2328                         return false;
2329                 }
2330
2331                 #endregion
2332
2333                 #region Level 3 properties (Case/Width)
2334
2335                 private byte ComputeLevel3Weight (char c)
2336                 {
2337                         byte b = ComputeLevel3WeightRaw (c);
2338                         return b > 0 ? (byte) (b + 2) : b;
2339                 }
2340
2341                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2342                 {
2343                         // Korean
2344                         if ('\u11A8' <= c && c <= '\u11F9')
2345                                 return 2;
2346                         if ('\uFFA0' <= c && c <= '\uFFDC')
2347                                 return 4;
2348                         if ('\u3130' <= c && c <= '\u3164')
2349                                 return 5;
2350                         // numbers
2351                         if ('\u2776' <= c && c <= '\u277F')
2352                                 return 4;
2353                         if ('\u2780' <= c && c <= '\u2789')
2354                                 return 8;
2355                         if ('\u2776' <= c && c <= '\u2793')
2356                                 return 0xC;
2357                         if ('\u2160' <= c && c <= '\u216F')
2358                                 return 0x18;
2359                         if ('\u2181' <= c && c <= '\u2182')
2360                                 return 0x18;
2361                         // Arabic
2362                         if ('\u2135' <= c && c <= '\u2138')
2363                                 return 4;
2364                         if ('\uFE80' <= c && c < '\uFE8E') {
2365                                 // 2(Isolated)/8(Final)/0x18(Medial)
2366                                 switch (decompType [(int) c]) {
2367                                 case DecompositionIsolated:
2368                                         return 2;
2369                                 case DecompositionFinal:
2370                                         return 8;
2371                                 case DecompositionMedial:
2372                                         return 0x18;
2373                                 }
2374                         }
2375
2376                         // actually I dunno the reason why they have weights.
2377                         switch (c) {
2378                         case '\u01BC':
2379                                 return 0x10;
2380                         case '\u06A9':
2381                                 return 0x20;
2382                         case '\u06AA':
2383                                 return 0x28;
2384                         }
2385
2386                         byte ret = 0;
2387                         switch (c) {
2388                         case '\u03C2':
2389                         case '\u2104':
2390                         case '\u212B':
2391                                 ret |= 8;
2392                                 break;
2393                         case '\uFE42':
2394                                 ret |= 0xC;
2395                                 break;
2396                         }
2397
2398                         // misc
2399                         switch (decompType [(int) c]) {
2400                         case DecompositionWide: // <wide>
2401                         case DecompositionSub: // <sub>
2402                         case DecompositionSuper: // <super>
2403                                 ret |= decompType [(int) c];
2404                                 break;
2405                         }
2406                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2407                                 ret |= 8;
2408                         if (isUppercase [(int) c]) // DerivedCoreProperties
2409                                 ret |= 0x10;
2410
2411                         return ret;
2412                 }
2413
2414                 #endregion
2415
2416                 #region IsIgnorable
2417                 // FIXME: In the future use DerivedAge.txt to examine character
2418                 // versions and set those ones that have higher version than
2419                 // 1.0 as ignorable.
2420                 static bool IsIgnorable (int i)
2421                 {
2422                         switch (i) {
2423                         case 0:
2424                         // I guess, those characters are added between
2425                         // Unicode 1.0 (LCMapString) and Unicode 3.1
2426                         // (UnicodeCategory), so they used to be
2427                         // something like OtherNotAssigned as of Unicode 1.1.
2428                         case 0x2df: case 0x387:
2429                         case 0x3d7: case 0x3d8: case 0x3d9:
2430                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2431                         case 0x400: case 0x40d: case 0x450: case 0x45d:
2432                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
2433                         case 0x653: case 0x654: case 0x655: case 0x66d:
2434                         case 0xb56:
2435                         case 0x1e9b: case 0x202f: case 0x20ad:
2436                         case 0x20ae: case 0x20af:
2437                         case 0x20e2: case 0x20e3:
2438                         case 0x2139: case 0x213a: case 0x2183:
2439                         case 0x2425: case 0x2426: case 0x2619:
2440                         case 0x2670: case 0x2671: case 0x3007:
2441                         case 0x3190: case 0x3191:
2442                         case 0xfffc: case 0xfffd:
2443                                 return true;
2444                         // exceptional characters filtered by the
2445                         // following conditions. Originally those exceptional
2446                         // ranges are incorrect (they should not be ignored)
2447                         // and most of those characters are unfortunately in
2448                         // those ranges.
2449                         case 0x4d8: case 0x4d9:
2450                         case 0x4e8: case 0x4e9:
2451                         case 0x3036: case 0x303f:
2452                         case 0x337b: case 0xfb1e:
2453                                 return false;
2454                         }
2455
2456                         if (
2457                                 // The whole Sinhala characters.
2458                                 0x0D82 <= i && i <= 0x0DF4
2459                                 // The whole Tibetan characters.
2460                                 || 0x0F00 <= i && i <= 0x0FD1
2461                                 // The whole Myanmar characters.
2462                                 || 0x1000 <= i && i <= 0x1059
2463                                 // The whole Etiopic, Cherokee,
2464                                 // Canadian Syllablic, Ogham, Runic,
2465                                 // Tagalog, Hanunoo, Philippine,
2466                                 // Buhid, Tagbanwa, Khmer and Mongorian
2467                                 // characters.
2468                                 || 0x1200 <= i && i <= 0x1DFF
2469                                 // Greek extension characters.
2470                                 || 0x1F00 <= i && i <= 0x1FFF
2471                                 // The whole Braille characters.
2472                                 || 0x2800 <= i && i <= 0x28FF
2473                                 // CJK radical characters.
2474                                 || 0x2E80 <= i && i <= 0x2EF3
2475                                 // Kangxi radical characters.
2476                                 || 0x2F00 <= i && i <= 0x2FD5
2477                                 // Ideographic description characters.
2478                                 || 0x2FF0 <= i && i <= 0x2FFB
2479                                 // Bopomofo letter and final
2480                                 || 0x31A0 <= i && i <= 0x31B7
2481                                 // White square with quadrant characters.
2482                                 || 0x25F0 <= i && i <= 0x25F7
2483                                 // Ideographic telegraph symbols.
2484                                 || 0x32C0 <= i && i <= 0x32CB
2485                                 || 0x3358 <= i && i <= 0x3370
2486                                 || 0x33E0 <= i && i <= 0x33FF
2487                                 // The whole YI characters.
2488                                 || 0xA000 <= i && i <= 0xA48C
2489                                 || 0xA490 <= i && i <= 0xA4C6
2490                                 // American small ligatures
2491                                 || 0xFB13 <= i && i <= 0xFB17
2492                                 // hebrew, arabic, variation selector.
2493                                 || 0xFB1D <= i && i <= 0xFE2F
2494                                 // Arabic ligatures.
2495                                 || 0xFEF5 <= i && i <= 0xFEFC
2496                                 // FIXME: why are they excluded?
2497                                 || 0x01F6 <= i && i <= 0x01F9
2498                                 || 0x0218 <= i && i <= 0x0233
2499                                 || 0x02A9 <= i && i <= 0x02AD
2500                                 || 0x02EA <= i && i <= 0x02EE
2501                                 || 0x0349 <= i && i <= 0x036F
2502                                 || 0x0488 <= i && i <= 0x048F
2503                                 || 0x04D0 <= i && i <= 0x04FF
2504                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2505                                 || 0x06D6 <= i && i <= 0x06ED
2506                                 || 0x06FA <= i && i <= 0x06FE
2507                                 || 0x2048 <= i && i <= 0x204D
2508                                 || 0x20e4 <= i && i <= 0x20ea
2509                                 || 0x213C <= i && i <= 0x214B
2510                                 || 0x21EB <= i && i <= 0x21FF
2511                                 || 0x22F2 <= i && i <= 0x22FF
2512                                 || 0x237B <= i && i <= 0x239A
2513                                 || 0x239B <= i && i <= 0x23CF
2514                                 || 0x24EB <= i && i <= 0x24FF
2515                                 || 0x2596 <= i && i <= 0x259F
2516                                 || 0x25F8 <= i && i <= 0x25FF
2517                                 || 0x2672 <= i && i <= 0x2689
2518                                 || 0x2768 <= i && i <= 0x2775
2519                                 || 0x27d0 <= i && i <= 0x27ff
2520                                 || 0x2900 <= i && i <= 0x2aff
2521                                 || 0x3033 <= i && i <= 0x303F
2522                                 || 0x31F0 <= i && i <= 0x31FF
2523                                 || 0x3250 <= i && i <= 0x325F
2524                                 || 0x32B1 <= i && i <= 0x32BF
2525                                 || 0x3371 <= i && i <= 0x337B
2526                                 || 0xFA30 <= i && i <= 0xFA6A
2527                         )
2528                                 return true;
2529
2530                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2531                         switch (uc) {
2532                         case UnicodeCategory.PrivateUse:
2533                         case UnicodeCategory.Surrogate:
2534                                 return false;
2535                         // ignored by nature
2536                         case UnicodeCategory.Format:
2537                         case UnicodeCategory.OtherNotAssigned:
2538                                 return true;
2539                         default:
2540                                 return false;
2541                         }
2542                 }
2543
2544                 // To check IsIgnorable sanity, try the driver below under MS.NET.
2545
2546                 /*
2547                 public static void Main ()
2548                 {
2549                         for (int i = 0; i <= char.MaxValue; i++)
2550                                 Dump (i, IsIgnorable (i));
2551                 }
2552
2553                 static void Dump (int i, bool ignore)
2554                 {
2555                         switch (Char.GetUnicodeCategory ((char) i)) {
2556                         case UnicodeCategory.PrivateUse:
2557                         case UnicodeCategory.Surrogate:
2558                                 return; // check nothing
2559                         }
2560
2561                         string s1 = "";
2562                         string s2 = new string ((char) i, 10);
2563                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2564                         if ((ret == 0) == ignore)
2565                                 return;
2566                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2567                 }
2568                 */
2569                 #endregion // IsIgnorable
2570
2571                 #region IsIgnorableSymbol
2572                 static bool IsIgnorableSymbol (int i)
2573                 {
2574                         if (IsIgnorable (i))
2575                                 return true;
2576
2577                         switch (i) {
2578                         // *Letter
2579                         case 0x00b5: case 0x01C0: case 0x01C1:
2580                         case 0x01C2: case 0x01C3: case 0x01F6:
2581                         case 0x01F7: case 0x01F8: case 0x01F9:
2582                         case 0x02D0: case 0x02EE: case 0x037A:
2583                         case 0x03D7: case 0x03F3:
2584                         case 0x0400: case 0x040d:
2585                         case 0x0450: case 0x045d:
2586                         case 0x048C: case 0x048D:
2587                         case 0x048E: case 0x048F:
2588                         case 0x0587: case 0x0640: case 0x06E5:
2589                         case 0x06E6: case 0x06FA: case 0x06FB:
2590                         case 0x06FC: case 0x093D: case 0x0950:
2591                         case 0x1E9B: case 0x2139: case 0x3006:
2592                         case 0x3033: case 0x3034: case 0x3035:
2593                         case 0xFE7E: case 0xFE7F:
2594                         // OtherNumber
2595                         case 0x16EE: case 0x16EF: case 0x16F0:
2596                         // LetterNumber
2597                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2598                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2599                         case 0x3038: // HANGZHOU NUMERAL TEN
2600                         case 0x3039: // HANGZHOU NUMERAL TWENTY
2601                         case 0x303a: // HANGZHOU NUMERAL THIRTY
2602                         // OtherSymbol
2603                         case 0x2117:
2604                         case 0x327F:
2605                                 return true;
2606                         // ModifierSymbol
2607                         case 0x02B9: case 0x02BA: case 0x02C2:
2608                         case 0x02C3: case 0x02C4: case 0x02C5:
2609                         case 0x02C8: case 0x02CC: case 0x02CD:
2610                         case 0x02CE: case 0x02CF: case 0x02D2:
2611                         case 0x02D3: case 0x02D4: case 0x02D5:
2612                         case 0x02D6: case 0x02D7: case 0x02DE:
2613                         case 0x02E5: case 0x02E6: case 0x02E7:
2614                         case 0x02E8: case 0x02E9:
2615                         case 0x309B: case 0x309C:
2616                         // OtherPunctuation
2617                         case 0x055A: // American Apos
2618                         case 0x05C0: // Hebrew Punct
2619                         case 0x0E4F: // Thai FONGMAN
2620                         case 0x0E5A: // Thai ANGKHANKHU
2621                         case 0x0E5B: // Thai KHOMUT
2622                         // CurencySymbol
2623                         case 0x09F2: // Bengali Rupee Mark
2624                         case 0x09F3: // Bengali Rupee Sign
2625                         // MathSymbol
2626                         case 0x221e: // INF.
2627                         // OtherSymbol
2628                         case 0x0482:
2629                         case 0x09FA:
2630                         case 0x0B70:
2631                                 return false;
2632                         }
2633
2634                         // *Letter
2635                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2636 #if NET_2_0
2637                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2638                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2639 #endif
2640                         )
2641                                 return true;
2642
2643                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2644                         switch (uc) {
2645                         case UnicodeCategory.Surrogate:
2646                                 return false; // inconsistent
2647
2648                         case UnicodeCategory.SpacingCombiningMark:
2649                         case UnicodeCategory.EnclosingMark:
2650                         case UnicodeCategory.NonSpacingMark:
2651                         case UnicodeCategory.PrivateUse:
2652                                 // NonSpacingMark
2653                                 if (0x064B <= i && i <= 0x0652) // Arabic
2654                                         return true;
2655                                 return false;
2656
2657                         case UnicodeCategory.Format:
2658                         case UnicodeCategory.OtherNotAssigned:
2659                                 return true;
2660
2661                         default:
2662                                 bool use = false;
2663                                 // OtherSymbols
2664                                 if (
2665                                         // latin in a circle
2666                                         0x249A <= i && i <= 0x24E9
2667                                         || 0x2100 <= i && i <= 0x2132
2668                                         // Japanese
2669                                         || 0x3196 <= i && i <= 0x31A0
2670                                         // Korean
2671                                         || 0x3200 <= i && i <= 0x321C
2672                                         // Chinese/Japanese
2673                                         || 0x322A <= i && i <= 0x3243
2674                                         // CJK
2675                                         || 0x3260 <= i && i <= 0x32B0
2676                                         || 0x32D0 <= i && i <= 0x3357
2677                                         || 0x337B <= i && i <= 0x33DD
2678                                 )
2679                                         use = !Char.IsLetterOrDigit ((char) i);
2680                                 if (use)
2681                                         return false;
2682
2683                                 // This "Digit" rule is mystery.
2684                                 // It filters some symbols out.
2685                                 if (Char.IsLetterOrDigit ((char) i))
2686                                         return false;
2687                                 if (Char.IsNumber ((char) i))
2688                                         return false;
2689                                 if (Char.IsControl ((char) i)
2690                                         || Char.IsSeparator ((char) i)
2691                                         || Char.IsPunctuation ((char) i))
2692                                         return true;
2693                                 if (Char.IsSymbol ((char) i))
2694                                         return true;
2695
2696                                 // FIXME: should check more
2697                                 return false;
2698                         }
2699                 }
2700
2701                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
2702 /*
2703                 public static void Main ()
2704                 {
2705                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2706                         for (int i = 0; i <= char.MaxValue; i++) {
2707                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2708                                 if (uc == UnicodeCategory.Surrogate)
2709                                         continue;
2710
2711                                 bool ret = IsIgnorableSymbol (i);
2712
2713                                 string s1 = "TEST ";
2714                                 string s2 = "TEST " + (char) i;
2715
2716                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2717
2718                                 if (ret != (result == 0))
2719                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2720                                                 ret ? "should not ignore" :
2721                                                         "should ignore",
2722                                                 i,(char) i, uc);
2723                         }
2724                 }
2725 */
2726                 #endregion
2727
2728                 #region NonSpacing
2729                 static bool IsIgnorableNonSpacing (int i)
2730                 {
2731                         if (IsIgnorable (i))
2732                                 return true;
2733
2734                         switch (i) {
2735                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
2736                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
2737                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
2738                                 return true;
2739                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
2740                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
2741                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
2742                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
2743                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
2744                         case 0x0CCD: case 0x0E4E:
2745                                 return false;
2746                         }
2747
2748                         if (0x02b9 <= i && i <= 0x02c5
2749                                 || 0x02cc <= i && i <= 0x02d7
2750                                 || 0x02e4 <= i && i <= 0x02ef
2751                                 || 0x20DD <= i && i <= 0x20E0
2752                         )
2753                                 return true;
2754
2755                         if (0x064B <= i && i <= 0x00652
2756                                 || 0x0941 <= i && i <= 0x0948
2757                                 || 0x0AC1 <= i && i <= 0x0ACD
2758                                 || 0x0C3E <= i && i <= 0x0C4F
2759                                 || 0x0E31 <= i && i <= 0x0E3F
2760                         )
2761                                 return false;
2762
2763                         return Char.GetUnicodeCategory ((char) i) ==
2764                                 UnicodeCategory.NonSpacingMark;
2765                 }
2766
2767                 // We can reuse IsIgnorableSymbol testcode
2768                 // for IsIgnorableNonSpacing.
2769                 #endregion
2770         }
2771
2772         struct CharMapEntry
2773         {
2774                 public byte Category;
2775                 public byte Level1;
2776                 public byte Level2; // It is always single byte.
2777                 public bool Defined;
2778
2779                 public CharMapEntry (byte category, byte level1, byte level2)
2780                 {
2781                         Category = category;
2782                         Level1 = level1;
2783                         Level2 = level2;
2784                         Defined = true;
2785                 }
2786         }
2787
2788         class JISCharacter
2789         {
2790                 public readonly int CP;
2791                 public readonly int JIS;
2792
2793                 public JISCharacter (int cp, int cpJIS)
2794                 {
2795                         CP = cp;
2796                         JIS = cpJIS;
2797                 }
2798         }
2799
2800         class JISComparer : IComparer
2801         {
2802                 public static readonly JISComparer Instance =
2803                         new JISComparer ();
2804
2805                 public int Compare (object o1, object o2)
2806                 {
2807                         JISCharacter j1 = (JISCharacter) o1;
2808                         JISCharacter j2 = (JISCharacter) o2;
2809                         return j2.JIS - j1.JIS;
2810                 }
2811         }
2812
2813         class NonJISCharacter
2814         {
2815                 public readonly int CP;
2816                 public readonly string Name;
2817
2818                 public NonJISCharacter (int cp, string name)
2819                 {
2820                         CP = cp;
2821                         Name = name;
2822                 }
2823         }
2824
2825         class NonJISComparer : IComparer
2826         {
2827                 public static readonly NonJISComparer Instance =
2828                         new NonJISComparer ();
2829
2830                 public int Compare (object o1, object o2)
2831                 {
2832                         NonJISCharacter j1 = (NonJISCharacter) o1;
2833                         NonJISCharacter j2 = (NonJISCharacter) o2;
2834                         return string.CompareOrdinal (j1.Name, j2.Name);
2835                 }
2836         }
2837
2838         class DecimalDictionaryValueComparer : IComparer
2839         {
2840                 public static readonly DecimalDictionaryValueComparer Instance
2841                         = new DecimalDictionaryValueComparer ();
2842
2843                 private DecimalDictionaryValueComparer ()
2844                 {
2845                 }
2846
2847                 public int Compare (object o1, object o2)
2848                 {
2849                         DictionaryEntry e1 = (DictionaryEntry) o1;
2850                         DictionaryEntry e2 = (DictionaryEntry) o2;
2851                         // FIXME: in case of 0, compare decomposition categories
2852                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
2853                         if (ret != 0)
2854                                 return ret;
2855                         int i1 = (int) e1.Key;
2856                         int i2 = (int) e2.Key;
2857                         return i1 - i2;
2858                 }
2859         }
2860
2861         class StringDictionaryValueComparer : IComparer
2862         {
2863                 public static readonly StringDictionaryValueComparer Instance
2864                         = new StringDictionaryValueComparer ();
2865
2866                 private StringDictionaryValueComparer ()
2867                 {
2868                 }
2869
2870                 public int Compare (object o1, object o2)
2871                 {
2872                         DictionaryEntry e1 = (DictionaryEntry) o1;
2873                         DictionaryEntry e2 = (DictionaryEntry) o2;
2874                         int ret = String.Compare ((string) e1.Value, (string) e2.Value);
2875                         if (ret != 0)
2876                                 return ret;
2877                         int i1 = (int) e1.Key;
2878                         int i2 = (int) e2.Key;
2879                         return i1 - i2;
2880                 }
2881         }
2882
2883         class UCAComparer : IComparer
2884         {
2885                 public static readonly UCAComparer Instance
2886                         = new UCAComparer ();
2887
2888                 private UCAComparer ()
2889                 {
2890                 }
2891
2892                 public int Compare (object o1, object o2)
2893                 {
2894                         char i1 = (char) o1;
2895                         char i2 = (char) o2;
2896
2897                         int l1 = CollationElementTable.GetSortKeyCount (i1);
2898                         int l2 = CollationElementTable.GetSortKeyCount (i2);
2899                         int l = l1 > l2 ? l2 : l1;
2900
2901                         for (int i = 0; i < l; i++) {
2902                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
2903                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
2904                                 int v = k1.Primary - k2.Primary;
2905                                 if (v != 0)
2906                                         return v;
2907                                 v = k1.Secondary - k2.Secondary;
2908                                 if (v != 0)
2909                                         return v;
2910                                 v = k1.Thirtiary - k2.Thirtiary;
2911                                 if (v != 0)
2912                                         return v;
2913                                 v = k1.Quarternary - k2.Quarternary;
2914                                 if (v != 0)
2915                                         return v;
2916                         }
2917                         return l1 - l2;
2918                 }
2919         }
2920 }