mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 //
   3 // There are two kinds of sort keys: those which are computed and those which
   4 // are laid out as an indexed array. Computed sort keys are:
   5 //
   6 //      - Surrogate
   7 //      - PrivateUse
   8 //
   9 // Also, for composite characters it should prepare a different index table.
  10 //
  11 // Though it is possible to "compute" level 3 weights, they are still dumped
  12 // to an array to avoid execution cost.
  13 //
  14
  15 //
  16 // * sortkey getter signature
  17 //
  18 //      int GetSortKey (string s, int index, SortKeyBuffer buf)
  19 //      Stores sort key for corresponding character element into buf and
  20 //      returns the length of the consumed _source_ character element in s.
  21 //
  22 // * character length to consume
  23 //
  24 //      If there are characters whose primary weight is 0, they are consumed
  25 //      and considered as a part of the character element.
  26 //
  27
  28 using System;
  29 using System.IO;
  30 using System.Collections;
  31 using System.Globalization;
  32 using System.Xml;
  33
  34 namespace Mono.Globalization.Unicode
  35 {
  36         internal class MSCompatSortKeyTableGenerator
  37         {
  38                 public static void Main (string [] args)
  39                 {
  40                         new MSCompatSortKeyTableGenerator ().Run (args);
  41                 }
  42
  43                 const int DecompositionWide = 1; // fixed
  44                 const int DecompositionSub = 2; // fixed
  45                 const int DecompositionSmall = 3;
  46                 const int DecompositionIsolated = 4;
  47                 const int DecompositionInitial = 5;
  48                 const int DecompositionFinal = 6;
  49                 const int DecompositionMedial = 7;
  50                 const int DecompositionNoBreak = 8;
  51                 const int DecompositionVertical = 9;
  52                 const int DecompositionFraction = 0xA;
  53                 const int DecompositionFont = 0xB;
  54                 const int DecompositionSuper = 0xC; // fixed
  55                 const int DecompositionFull = 0xE;
  56                 const int DecompositionNarrow = 0xD;
  57                 const int DecompositionCircle = 0xF;
  58                 const int DecompositionSquare = 0x10;
  59                 const int DecompositionCompat = 0x11;
  60
  61                 TextWriter Result = Console.Out;
  62
  63                 byte [] fillIndex = new byte [256]; // by category
  64                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  65
  66                 char [] specialIgnore = new char [] {
  67                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  68                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  69                         };
  70
  71                 // FIXME: need more love (as always)
  72                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
  73                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
  74                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  75                         '\u0292', '\u01BE', '\u0298'};
  76                 byte [] alphaWeights = new byte [] {
  77                         2, 9, 0xA, 0x1A, 0x21,
  78                         0x23, 0x25, 0x2C, 0x32, 0x35,
  79                         0x36, 0x48, 0x51, 0x70, 0x7C,
  80                         0x7E, 0x89, 0x8A, 0x91, 0x99,
  81                         0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
  82                         0xA9, 0xAA, 0xB3, 0xB4};
  83
  84                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
  85                 bool [] isUppercase = new bool [char.MaxValue + 1];
  86
  87                 byte [] decompType = new byte [char.MaxValue + 1];
  88                 int [] decompIndex = new int [char.MaxValue + 1];
  89                 int [] decompLength = new int [char.MaxValue + 1];
  90                 int [] decompValues;
  91                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
  92
  93                 byte [] diacritical = new byte [char.MaxValue + 1];
  94
  95                 string [] diacritics = new string [] {
  96                         // LATIN
  97                         "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
  98                         " CIRCUMFLEX;", " DIAERESIS;", " CARON;", "WITH BREVE;",
  99                         " DIALYTIKA AND TONOS;", "WITH MACRON;", " TILDE;", " RING ABOVE;",
 100                         " OGONEK;", " CEDILLA;",
 101                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 102                         " STROKE;", " CIRCUMFLEX AND ACUTE;",
 103                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 104                         " DIAERESIS AND GRAVE;",
 105                         " BREVE AND ACUTE;",
 106                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 107                         " MACRON AND ACUTE;",
 108                         " MACRON AND GRAVE;",
 109                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 110                         " RING ABOVE AND ACUTE",
 111                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 112                         " CIRCUMFLEX AND TILDE",
 113                         " TILDE AND DIAERESIS",
 114                         " STROKE AND ACUTE",
 115                         " BREVE AND TILDE",
 116                         " CEDILLA AND BREVE",
 117                         " OGONEK AND MACRON",
 118                         " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 119                         " DOUBLE GRAVE;",
 120                         " INVERTED BREVE",
 121                         " PRECEDED BY APOSTROPHE",
 122                         " HORN;",
 123                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 124                         " PALATAL HOOK",
 125                         " DOT BELOW;",
 126                         " RETROFLEX;", "DIAERESIS BELOW",
 127                         " RING BELOW",
 128                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 129                         " BREVE BELOW;", " HORN AND GRAVE",
 130                         " TILDE BELOW",
 131                         " DOT BELOW AND DOT ABOVE",
 132                         " RIGHT HALF RING", " HORN AND TILDE",
 133                         " CIRCUMFLEX AND DOT BELOW",
 134                         " BREVE AND DOT BELOW",
 135                         " DOT BELOW AND MACRON",
 136                         " HORN AND HOOK ABOVE",
 137                         " HORN AND DOT",
 138                         // CIRCLED, PARENTHESIZED and so on
 139                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
 140                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 141                         };
 142                 byte [] diacriticWeights = new byte [] {
 143                         // LATIN.
 144                         0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 145                         0x17, 0x19, 0x1A, 0x1B, 0x1C,
 146                         0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 147                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 148                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 149                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 150                         0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
 151                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
 152                         0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
 153                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 154                         0x95, 0xAA,
 155                         // CIRCLED, PARENTHESIZED and so on.
 156                         0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
 157                         };
 158
 159                 int [] numberSecondaryWeightBounds = new int [] {
 160                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 161                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 162                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 163                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 164                         0xE50, 0xE60, 0xED0, 0xEE0
 165                         };
 166
 167                 char [] orderedCyrillic;
 168                 char [] orderedGurmukhi;
 169                 char [] orderedGujarati;
 170                 char [] orderedGeorgian;
 171                 char [] orderedThaana;
 172
 173                 static readonly char [] orderedTamilConsonants = new char [] {
 174                         // based on traditional Tamil consonants, except for
 175                         // Grantha (where Microsoft breaks traditionalism).
 176                         // http://www.angelfire.com/empire/thamizh/padanGaL
 177                         '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
 178                         '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
 179                         '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
 180                         '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
 181                         '\u0BB9'};
 182
 183                 // cp -> level1 value
 184                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 185
 186                 // letterName -> cp
 187                 Hashtable arabicNameMap = new Hashtable ();
 188
 189                 // cp -> Hashtable [decompType] -> cp
 190                 Hashtable nfkdMap = new Hashtable ();
 191
 192                 // Latin letter -> ArrayList [int]
 193                 Hashtable latinMap = new Hashtable ();
 194
 195                 ArrayList jisJapanese = new ArrayList ();
 196                 ArrayList nonJisJapanese = new ArrayList ();
 197
 198                 ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00];
 199                 ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100];
 200                 ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00];
 201                 ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00];
 202                 byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00];
 203
 204                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 205
 206                 double [] unicodeAge = new double [char.MaxValue + 1];
 207
 208                 void Run (string [] args)
 209                 {
 210                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 211                         FillIgnorables ();
 212
 213                         ParseSources (dirname);
 214                         Console.Error.WriteLine ("parse done.");
 215
 216                         FillSecondaryValues ();
 217                         GenerateCore ();
 218                         Console.Error.WriteLine ("generation done.");
 219                         Serialize ();
 220                         Console.Error.WriteLine ("serialization done.");
 221                 }
 222
 223                 void Serialize ()
 224                 {
 225                         // Ignorables
 226                         Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
 227                         for (int i = 0; i <= char.MaxValue; i++) {
 228                                 byte value = ignorableFlags [i];
 229                                 if (value < 10)
 230                                         Result.Write ("{0},", value);
 231                                 else
 232                                         Result.Write ("0x{0:X02},", value);
 233                                 if ((i & 0xF) == 0xF)
 234                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 235                         }
 236                         Result.WriteLine ("};");
 237                         Result.WriteLine ();
 238
 239                         // Primary category
 240                         Result.WriteLine ("static byte [] categories = new byte [] {");
 241                         for (int i = 0; i < map.Length; i++) {
 242                                 byte value = map [i].Category;
 243                                 if (value < 10)
 244                                         Result.Write ("{0},", value);
 245                                 else
 246                                         Result.Write ("0x{0:X02},", value);
 247                                 if ((i & 0xF) == 0xF)
 248                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 249                         }
 250                         Result.WriteLine ("};");
 251                         Result.WriteLine ();
 252
 253                         // Primary weight value
 254                         Result.WriteLine ("static byte [] level1 = new byte [] {");
 255                         for (int i = 0; i < map.Length; i++) {
 256                                 byte value = map [i].Level1;
 257                                 if (value < 10)
 258                                         Result.Write ("{0},", value);
 259                                 else
 260                                         Result.Write ("0x{0:X02},", value);
 261                                 if ((i & 0xF) == 0xF)
 262                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 263                         }
 264                         Result.WriteLine ("};");
 265                         Result.WriteLine ();
 266
 267                         // Secondary weight
 268                         Result.WriteLine ("static byte [] level2 = new byte [] {");
 269                         for (int i = 0; i < map.Length; i++) {
 270                                 int value = map [i].Level2;
 271                                 if (value < 10)
 272                                         Result.Write ("{0},", value);
 273                                 else
 274                                         Result.Write ("0x{0:X02},", value);
 275                                 if ((i & 0xF) == 0xF)
 276                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 277                         }
 278                         Result.WriteLine ("};");
 279                         Result.WriteLine ();
 280
 281                         // Thirtiary weight
 282                         Result.WriteLine ("static byte [] level3 = new byte [] {");
 283                         for (int i = 0; i < map.Length; i++) {
 284                                 byte value = ComputeLevel3Weight ((char) i);
 285                                 if (value < 10)
 286                                         Result.Write ("{0},", value);
 287                                 else
 288                                         Result.Write ("0x{0:X02},", value);
 289                                 if ((i & 0xF) == 0xF)
 290                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 291                         }
 292                         Result.WriteLine ("};");
 293                         Result.WriteLine ();
 294
 295                         // Width insensitivity mappings
 296                         // (for now it is more lightweight than dumping the
 297                         // entire NFKD table).
 298                         Result.WriteLine ("static int [] widthCompat = new int [] {");
 299                         for (int i = 0; i < char.MaxValue; i++) {
 300                                 int value = 0;
 301                                 switch (decompType [i]) {
 302                                 case DecompositionNarrow:
 303                                 case DecompositionWide:
 304                                 case DecompositionSuper:
 305                                 case DecompositionSub:
 306                                         // they are always 1 char
 307                                         value = decompValues [decompIndex [i]];
 308                                         break;
 309                                 }
 310                                 if (value < 10)
 311                                         Result.Write ("{0},", value);
 312                                 else
 313                                         Result.Write ("0x{0:X04},", value);
 314                                 if ((i & 0xF) == 0xF)
 315                                         Result.WriteLine ("// {0:X04}", i - 0xF);
 316                         }
 317                         Result.WriteLine ("};");
 318                         Result.WriteLine ();
 319
 320                         // CJK
 321                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 322                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 323                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 324                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 325                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 326                 }
 327
 328                 void SerializeCJK (string name, ushort [] cjk, int max)
 329                 {
 330                         int offset = char.MaxValue - cjk.Length;
 331                         Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
 332                         for (int i = 0; i < cjk.Length; i++) {
 333                                 if (i + offset == max)
 334                                         break;
 335                                 ushort value = cjk [i];
 336                                 if (value < 10)
 337                                         Result.Write ("{0},", value);
 338                                 else
 339                                         Result.Write ("0x{0:X04},", value);
 340                                 if ((i & 0xF) == 0xF)
 341                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 342                         }
 343                         Result.WriteLine ("};");
 344                         Result.WriteLine ();
 345                 }
 346
 347                 void SerializeCJK (string name, byte [] cjk, int max)
 348                 {
 349                         int offset = char.MaxValue - cjk.Length;
 350                         Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
 351                         for (int i = 0; i < cjk.Length; i++) {
 352                                 if (i + offset == max)
 353                                         break;
 354                                 byte value = cjk [i];
 355                                 if (value < 10)
 356                                         Result.Write ("{0},", value);
 357                                 else
 358                                         Result.Write ("0x{0:X02},", value);
 359                                 if ((i & 0xF) == 0xF)
 360                                         Result.WriteLine ("// {0:X04}", i - 0xF + offset);
 361                         }
 362                         Result.WriteLine ("};");
 363                         Result.WriteLine ();
 364                 }
 365
 366                 #region Parse
 367
 368                 void ParseSources (string dirname)
 369                 {
 370                         string unidata =
 371                                 dirname + "/UnicodeData.txt";
 372                         string derivedCoreProps =
 373                                 dirname + "/DerivedCoreProperties.txt";
 374                         string scripts =
 375                                 dirname + "/Scripts.txt";
 376                         string cp932 =
 377                                 dirname + "/CP932.TXT";
 378                         string derivedAge =
 379                                 dirname + "/DerivedAge.txt";
 380                         string chXML = dirname + "/common/collation/zh.xml";
 381                         string jaXML = dirname + "/common/collation/ja.xml";
 382                         string koXML = dirname + "/common/collation/ko.xml";
 383
 384                         ParseDerivedAge (derivedAge);
 385                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 386                         ParseUnidata (unidata);
 387                         ParseDerivedCoreProperties (derivedCoreProps);
 388                         ParseScripts (scripts);
 389                         ParseCJK (chXML, jaXML, koXML);
 390                 }
 391
 392                 void ParseDerivedAge (string filename)
 393                 {
 394                         using (StreamReader file =
 395                                 new StreamReader (filename)) {
 396                                 while (file.Peek () >= 0) {
 397                                         string s = file.ReadLine ();
 398                                         int idx = s.IndexOf ('#');
 399                                         if (idx >= 0)
 400                                                 s = s.Substring (0, idx);
 401                                         idx = s.IndexOf (';');
 402                                         if (idx < 0)
 403                                                 continue;
 404
 405                                         string cpspec = s.Substring (0, idx);
 406                                         idx = cpspec.IndexOf ("..");
 407                                         NumberStyles nf = NumberStyles.HexNumber |
 408                                                 NumberStyles.AllowTrailingWhite;
 409                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 410                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 411                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 412
 413                                         // FIXME: use index
 414                                         if (cp > char.MaxValue)
 415                                                 continue;
 416
 417                                         for (int i = cp; i <= cpEnd; i++)
 418                                                 unicodeAge [i] = double.Parse (value);
 419                                 }
 420                         }
 421                 }
 422
 423                 void ParseUnidata (string filename)
 424                 {
 425                         ArrayList decompValues = new ArrayList ();
 426                         using (StreamReader unidata =
 427                                 new StreamReader (filename)) {
 428                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 429                                         try {
 430                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 431                                         } catch (Exception) {
 432                                                 Console.Error.WriteLine ("**** At line " + line);
 433                                                 throw;
 434                                         }
 435                                 }
 436                         }
 437                         this.decompValues = (int [])
 438                                 decompValues.ToArray (typeof (int));
 439                 }
 440
 441                 void ProcessUnidataLine (string s, ArrayList decompValues)
 442                 {
 443                         int idx = s.IndexOf ('#');
 444                         if (idx >= 0)
 445                                 s = s.Substring (0, idx);
 446                         idx = s.IndexOf (';');
 447                         if (idx < 0)
 448                                 return;
 449                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 450                         string [] values = s.Substring (idx + 1).Split (';');
 451
 452                         // FIXME: use index
 453                         if (cp > char.MaxValue)
 454                                 return;
 455                         if (IsIgnorable (cp))
 456                                 return;
 457
 458                         // isSmallCapital
 459                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 460                                 isSmallCapital [cp] = true;
 461
 462                         // latin mapping by character name
 463                         if (s.IndexOf ("LATIN") > 0) {
 464                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 465                                 int offset = lidx + 15;
 466                                 if (lidx < 0) {
 467                                         lidx = s.IndexOf ("LETTER TURNED ");
 468                                         offset = lidx + 14;
 469                                 }
 470                                 if (lidx < 0) {
 471                                         lidx = s.IndexOf ("LETTER ");
 472                                         offset = lidx + 7;
 473                                 }
 474                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 475                                 if ('A' <= c && c <= 'Z' &&
 476                                         (s.Length == offset + 1 || s [offset + 1] == ' ')) {
 477                                         ArrayList entry = (ArrayList) latinMap [c];
 478                                         if (entry == null) {
 479                                                 entry = new ArrayList ();
 480                                                 latinMap [c] = entry;
 481                                         }
 482                                         entry.Add (cp);
 483                                 }
 484                         }
 485
 486                         // diacritical weights by character name
 487                         for (int d = 0; d < diacritics.Length; d++)
 488                                 if (s.IndexOf (diacritics [d]) > 0)
 489                                         diacritical [cp] |= diacriticWeights [d];
 490                         // Two-step grep required for it.
 491                         if (s.IndexOf ("FULL STOP") > 0 &&
 492                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
 493                                 diacritical [cp] |= 0xF4;
 494
 495                         // Arabic letter name
 496                         if (0x0621 <= cp && cp <= 0x064A &&
 497                                 Char.GetUnicodeCategory ((char) cp)
 498                                 == UnicodeCategory.OtherLetter) {
 499                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
 500                                 switch (cp) {
 501                                 case 0x0621:
 502                                 case 0x0624:
 503                                 case 0x0626:
 504                                         // hamza, waw, yeh ... special cases.
 505                                         value = 0x07;
 506                                         break;
 507                                 case 0x0649:
 508                                 case 0x064A:
 509                                         value = 0x77; // special cases.
 510                                         break;
 511                                 default:
 512                                         // Get primary letter name i.e.
 513                                         // XXX part of ARABIC LETTER XXX yyy
 514                                         // e.g. that of "TEH MARBUTA" is "TEH".
 515                                         string letterName =
 516                                                 (cp == 0x0640) ?
 517                                                 // 0x0640 is special: it does
 518                                                 // not start with ARABIC LETTER
 519                                                 values [0] :
 520                                                 values [0].Substring (14);
 521                                         int tmpIdx = letterName.IndexOf (' ');
 522                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
 523 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
 524                                         if (arabicNameMap.ContainsKey (letterName))
 525                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
 526                                         else
 527                                                 arabicNameMap [letterName] = cp;
 528                                         break;
 529                                 }
 530                                 arabicLetterPrimaryValues [cp] = value;
 531                         }
 532
 533                         // Japanese square letter
 534                         if (0x3300 <= cp && cp <= 0x3357)
 535                                 if (!ExistsJIS (cp))
 536                                         nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
 537
 538                         // normalizationType
 539                         string decomp = values [4];
 540                         idx = decomp.IndexOf ('<');
 541                         if (idx >= 0) {
 542                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
 543                                 case "full":
 544                                         decompType [cp] = DecompositionFull;
 545                                         break;
 546                                 case "sub":
 547                                         decompType [cp] = DecompositionSub;
 548                                         break;
 549                                 case "super":
 550                                         decompType [cp] = DecompositionSuper;
 551                                         break;
 552                                 case "small":
 553                                         decompType [cp] = DecompositionSmall;
 554                                         break;
 555                                 case "isolated":
 556                                         decompType [cp] = DecompositionIsolated;
 557                                         break;
 558                                 case "initial":
 559                                         decompType [cp] = DecompositionInitial;
 560                                         break;
 561                                 case "final":
 562                                         decompType [cp] = DecompositionFinal;
 563                                         break;
 564                                 case "medial":
 565                                         decompType [cp] = DecompositionMedial;
 566                                         break;
 567                                 case "noBreak":
 568                                         decompType [cp] = DecompositionNoBreak;
 569                                         break;
 570                                 case "compat":
 571                                         decompType [cp] = DecompositionCompat;
 572                                         break;
 573                                 case "fraction":
 574                                         decompType [cp] = DecompositionFraction;
 575                                         break;
 576                                 case "font":
 577                                         decompType [cp] = DecompositionFont;
 578                                         break;
 579                                 case "circle":
 580                                         decompType [cp] = DecompositionCircle;
 581                                         break;
 582                                 case "square":
 583                                         decompType [cp] = DecompositionSquare;
 584                                         break;
 585                                 case "wide":
 586                                         decompType [cp] = DecompositionWide;
 587                                         break;
 588                                 case "narrow":
 589                                         decompType [cp] = DecompositionNarrow;
 590                                         break;
 591                                 case "vertical":
 592                                         decompType [cp] = DecompositionVertical;
 593                                         break;
 594                                 default:
 595                                         throw new Exception ("Support NFKD type : " + decomp);
 596                                 }
 597                         }
 598                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
 599                         if (decomp.Length > 0) {
 600
 601                                 string [] velems = decomp.Split (' ');
 602                                 int didx = decompValues.Count;
 603                                 decompIndex [cp] = didx;
 604                                 foreach (string v in velems)
 605                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
 606                                 decompLength [cp] = velems.Length;
 607
 608                                 // [decmpType] -> this_cp
 609                                 int targetCP = (int) decompValues [didx];
 610                                 // for "(x)" it specially maps to 'x' .
 611                                 // FIXME: check if it is sane
 612                                 if (velems.Length == 3 &&
 613                                         (int) decompValues [didx] == '(' &&
 614                                         (int) decompValues [didx + 2] == ')')
 615                                         targetCP = (int) decompValues [didx + 1];
 616                                 // special: 0x215F "1/"
 617                                 else if (cp == 0x215F)
 618                                         targetCP = '1';
 619                                 else if (velems.Length > 1 &&
 620                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
 621                                         // skip them, except for CJK ideograph compat
 622                                         targetCP = 0;
 623
 624                                 if (targetCP != 0) {
 625                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
 626                                         if (entry == null) {
 627                                                 entry = new Hashtable ();
 628                                                 nfkdMap [targetCP] = entry;
 629                                         }
 630                                         entry [(byte) decompType [cp]] = cp;
 631                                 }
 632                         }
 633                         // numeric values
 634                         if (values [5].Length > 0)
 635                                 decimalValue [cp] = decimal.Parse (values [5]);
 636                         else if (values [6].Length > 0)
 637                                 decimalValue [cp] = decimal.Parse (values [6]);
 638                         else if (values [7].Length > 0) {
 639                                 string decstr = values [7];
 640                                 idx = decstr.IndexOf ('/');
 641                                 if (cp == 0x215F) // special. "1/"
 642                                         decimalValue [cp] = 0x1;
 643                                 else if (idx > 0)
 644                                         // m/n
 645                                         decimalValue [cp] =
 646                                                 decimal.Parse (decstr.Substring (0, idx))
 647                                                 / decimal.Parse (decstr.Substring (idx + 1));
 648                                 else if (decstr [0] == '(' &&
 649                                         decstr [decstr.Length - 1] == ')')
 650                                         // (n)
 651                                         decimalValue [cp] =
 652                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
 653                                 else if (decstr [decstr.Length - 1] == '.')
 654                                         // n.
 655                                         decimalValue [cp] =
 656                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
 657                                 else
 658                                         decimalValue [cp] = decimal.Parse (decstr);
 659                         }
 660                 }
 661
 662                 // Reads the file named by filename and passes every line to
 663                 // ProcessDerivedCorePropLine. If a line fails, its 1-based number is
 664                 // written to stderr and the original exception is rethrown.
 665                 void ParseDerivedCoreProperties (string filename)
 666                 {
 667                         int lineNumber = 0;
 668                         using (StreamReader reader = new StreamReader (filename)) {
 669                                 while (reader.Peek () >= 0) {
 670                                         lineNumber++;
 671                                         try {
 672                                                 ProcessDerivedCorePropLine (reader.ReadLine ());
 673                                         } catch (Exception) {
 674                                                 // Point at the offending input line, then let
 675                                                 // the exception propagate unchanged.
 676                                                 Console.Error.WriteLine ("**** At line " + lineNumber);
 677                                                 throw;
 678                                         }
 679                                 }
 680                         }
 681                 }
 677
 678                 // Handles one data line of the form
 679                 //      XXXX[..YYYY] ; Property # comment
 680                 // Only the "Uppercase" property is consumed: every code point in the
 681                 // range gets its isUppercase flag set. Lines without ';' are ignored,
 682                 // and so are ranges that start beyond char.MaxValue.
 683                 void ProcessDerivedCorePropLine (string s)
 684                 {
 685                         // Strip the trailing comment, if any.
 686                         int hash = s.IndexOf ('#');
 687                         string data = hash < 0 ? s : s.Substring (0, hash);
 688
 689                         int semicolon = data.IndexOf (';');
 690                         if (semicolon < 0)
 691                                 return;
 692                         string range = data.Substring (0, semicolon);
 693                         string property = data.Substring (semicolon + 1).Trim ();
 694
 695                         // The range is a single hex code point or "first..last".
 696                         const NumberStyles hex = NumberStyles.HexNumber |
 697                                 NumberStyles.AllowTrailingWhite;
 698                         int dots = range.IndexOf ("..");
 699                         int first, last;
 700                         if (dots < 0) {
 701                                 first = int.Parse (range, hex);
 702                                 last = first;
 703                         } else {
 704                                 first = int.Parse (range.Substring (0, dots), hex);
 705                                 last = int.Parse (range.Substring (dots + 2), hex);
 706                         }
 707
 708                         // FIXME: use index
 709                         if (first > char.MaxValue)
 710                                 return;
 711
 712                         if (property == "Uppercase") {
 713                                 for (int cp = first; cp <= last; cp++)
 714                                         isUppercase [cp] = true;
 715                         }
 716                 }
 705
 706                 // Reads a script-assignment file whose data lines look like
 707                 //      XXXX[..YYYY] ; ScriptName # comment
 708                 // and gathers the non-ignorable characters of the Cyrillic, Gurmukhi,
 709                 // Gujarati, Georgian and Thaana scripts. Each list is then sorted with
 710                 // UCAComparer and stored in the matching ordered* array.
 711                 void ParseScripts (string filename)
 712                 {
 713                         ArrayList cyrillic = new ArrayList ();
 714                         ArrayList gurmukhi = new ArrayList ();
 715                         ArrayList gujarati = new ArrayList ();
 716                         ArrayList georgian = new ArrayList ();
 717                         ArrayList thaana = new ArrayList ();
 718                         NumberStyles hex = NumberStyles.HexNumber |
 719                                 NumberStyles.AllowTrailingWhite;
 720
 721                         using (StreamReader reader = new StreamReader (filename)) {
 722                                 while (reader.Peek () >= 0) {
 723                                         string line = reader.ReadLine ();
 724                                         int hash = line.IndexOf ('#');
 725                                         if (hash >= 0)
 726                                                 line = line.Substring (0, hash);
 727                                         int semicolon = line.IndexOf (';');
 728                                         if (semicolon < 0)
 729                                                 continue;
 730
 731                                         string range = line.Substring (0, semicolon);
 732                                         string script = line.Substring (semicolon + 1).Trim ();
 733                                         int dots = range.IndexOf ("..");
 734                                         int first = int.Parse (dots < 0 ? range : range.Substring (0, dots), hex);
 735                                         int last = dots < 0 ? first : int.Parse (range.Substring (dots + 2), hex);
 736
 737                                         // FIXME: use index
 738                                         if (first > char.MaxValue)
 739                                                 continue;
 740
 741                                         // Pick the list for this script; lines for any
 742                                         // other script are skipped.
 743                                         ArrayList bucket;
 744                                         switch (script) {
 745                                         case "Cyrillic":
 746                                                 bucket = cyrillic;
 747                                                 break;
 748                                         case "Gurmukhi":
 749                                                 bucket = gurmukhi;
 750                                                 break;
 751                                         case "Gujarati":
 752                                                 bucket = gujarati;
 753                                                 break;
 754                                         case "Georgian":
 755                                                 bucket = georgian;
 756                                                 break;
 757                                         case "Thaana":
 758                                                 bucket = thaana;
 759                                                 break;
 760                                         default:
 761                                                 continue;
 762                                         }
 763                                         for (int cp = first; cp <= last; cp++)
 764                                                 if (!IsIgnorable (cp))
 765                                                         bucket.Add ((char) cp);
 766                                 }
 767                         }
 768
 769                         // Sort every list first, then publish them as arrays.
 770                         ArrayList [] lists = new ArrayList [] {
 771                                 cyrillic, gurmukhi, gujarati, georgian, thaana };
 772                         foreach (ArrayList list in lists)
 773                                 list.Sort (UCAComparer.Instance);
 774                         orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
 775                         orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
 776                         orderedGujarati = (char []) gujarati.ToArray (typeof (char));
 777                         orderedGeorgian = (char []) georgian.ToArray (typeof (char));
 778                         orderedThaana = (char []) thaana.ToArray (typeof (char));
 779                 }
 777
 778                 // Loads the JIS ordering table from filename. Each data line holds a
 779                 // JIS code and a Unicode code point, both written as "0x"-prefixed
 780                 // hex numbers and separated by space(s). "# ..." comments and empty
 781                 // lines are skipped. Every pair is appended to jisJapanese as a
 782                 // JISCharacter.
 783                 // NOTE(review): only ' ' is recognized as the field separator, so
 784                 // tab-separated lines are silently skipped - confirm against the
 785                 // data file.
 786                 void ParseJISOrder (string filename)
 787                 {
 788                         using (StreamReader file =
 789                                 new StreamReader (filename)) {
 790                                 while (file.Peek () >= 0) {
 791                                         string s = file.ReadLine ();
 792                                         int idx = s.IndexOf ('#');
 793                                         if (idx >= 0)
 794                                                 s = s.Substring (0, idx).Trim ();
 795                                         if (s.Length == 0)
 796                                                 continue;
 797                                         idx = s.IndexOf (' ');
 798                                         if (idx < 0)
 799                                                 continue;
 800                                         // Both fields start with "0x", so cut it out.
 801                                         // Substring takes (start, length), not (start, end):
 802                                         // the JIS digits are the idx - 2 characters between
 803                                         // the "0x" prefix and the separator.
 804                                         int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
 805                                         // Trim before dropping "0x" so that more than one
 806                                         // separator space is tolerated.
 807                                         int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
 808                                         jisJapanese.Add (new JISCharacter (cp, jis));
 809                                 }
 810                         }
 811                 }
 799
 800                 // Fills the CJK weight tables from three LDML collation documents:
 801                 // zhXML (pinyin rules for cjkCHS, stroke rules for cjkCHT), jaXML
 802                 // (cjkJA) and koXML (cjkKO plus the level 2 table cjkKOlv2).
 803                 void ParseCJK (string zhXML, string jaXML, string koXML)
 804                 {
 805                         XmlDocument doc = new XmlDocument ();
 806                         doc.XmlResolver = null;
 807                         int offset;
 808
 809                         // Chinese Simplified
 810                         offset = char.MaxValue - cjkCHS.Length;
 811                         doc.Load (zhXML);
 812                         FillCJKWeightsFromRules ("chs", cjkCHS, offset,
 813                                 doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText,
 814                                 0x8008, '\u3100');
 815
 816                         // Chinese Traditional (same document, stroke collation)
 817                         offset = char.MaxValue - cjkCHT.Length;
 818                         FillCJKWeightsFromRules ("cht", cjkCHT, offset,
 819                                 doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText,
 820                                 0x8002, '\u4E00');
 821
 822                         // Japanese
 823                         offset = char.MaxValue - cjkJA.Length;
 824                         doc.Load (jaXML);
 825                         FillCJKWeightsFromRules ("ja", cjkJA, offset,
 826                                 doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText,
 827                                 0x8008, '\u4E00');
 828
 829                         // Korean
 830                         // Korean weight is somewhat complex. It first shifts the
 831                         // Hangul category from 52-x to 80-x (they are computed
 832                         // anyway). CJK ideographs are placed at the secondary
 833                         // weight, like XX YY 01 zz 01, where XX and YY are the
 834                         // corresponding "reset" value and zz is 41,43,45...
 835                         //
 836                         // Unlike chs, cht and ja, the Korean table entry is a
 837                         // combined ushort computed from the "reset" syllable
 838                         // (category and level 1); zz goes to cjkKOlv2.
 839                         ushort [] koTable = cjkKO;
 840                         offset = char.MaxValue - koTable.Length;
 841                         doc.Load (koXML);
 842                         XmlNodeList resets = doc.SelectNodes ("/ldml/collations/collation/rules/reset");
 843                         foreach (XmlElement reset in resets) {
 844                                 // The characters to place follow the <reset> element.
 845                                 XmlElement following = (XmlElement) reset.NextSibling;
 846                                 // Compute "category" and "level 1" for the target
 847                                 // "reset" Hangul syllable.
 848                                 int syllableIndex = ((int) reset.InnerText [0] - 0xAC00) + 1;
 849                                 ushort primary = (ushort)
 850                                         ((syllableIndex / 254) * 256 + (syllableIndex % 254) + 2);
 851                                 // Place the characters after the target.
 852                                 int secondary = 0x41;
 853                                 foreach (char c in following.InnerText) {
 854                                         int slot = (int) c - offset;
 855                                         koTable [slot] = primary;
 856                                         cjkKOlv2 [slot] = (byte) secondary;
 857                                         secondary += 2;
 858                                 }
 859                         }
 860                 }
 861
 862                 // Gives the characters of rules consecutive weights beginning with
 863                 // firstWeight, in the order they appear, storing each one at
 864                 // table [c - offset]. Whenever the next weight becomes a multiple
 865                 // of 256 it is advanced by 2. Characters below lowest get no entry
 866                 // and are only reported on stderr.
 867                 void FillCJKWeightsFromRules (string category, ushort [] table, int offset, string rules, int firstWeight, char lowest)
 868                 {
 869                         int weight = firstWeight;
 870                         foreach (char c in rules) {
 871                                 if (c < lowest) {
 872                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, weight);
 873                                         continue;
 874                                 }
 875                                 table [(int) c - offset] = (ushort) weight++;
 876                                 if (weight % 256 == 0)
 877                                         weight += 2;
 878                         }
 879                 }
 892
 893                 #endregion
 894
 895                 #region Generation
 896
 897                 // Records, for every char value from 0 to char.MaxValue, which kinds
 898                 // of "ignorable" it is, as bits of ignorableFlags:
 899                 //      1 - IsIgnorable, 2 - IsIgnorableSymbol, 4 - IsIgnorableNonSpacing
 900                 // Values in category OtherNotAssigned are left untouched.
 901                 void FillIgnorables ()
 902                 {
 903                         for (int cp = char.MinValue; cp <= char.MaxValue; cp++) {
 904                                 UnicodeCategory category = Char.GetUnicodeCategory ((char) cp);
 905                                 if (category != UnicodeCategory.OtherNotAssigned) {
 906                                         if (IsIgnorable (cp))
 907                                                 ignorableFlags [cp] |= 1;
 908                                         if (IsIgnorableSymbol (cp))
 909                                                 ignorableFlags [cp] |= 2;
 910                                         if (IsIgnorableNonSpacing (cp))
 911                                                 ignorableFlags [cp] |= 4;
 912                                 }
 913                         }
 914                 }
 911
 912                 // Assigns secondary weights (diacritical) to number characters.
 913                 // numberSecondaryWeightBounds is walked as [start, end) pairs: the
 914                 // first pair receives weight 0x38 and every following pair one more.
 915                 // Within a pair only characters accepted by Char.IsNumber are set.
 916                 void FillSecondaryValues ()
 917                 {
 918                         int [] bounds = numberSecondaryWeightBounds;
 919                         byte secondary = 0x38;
 920                         int pair = 0;
 921                         while (pair < bounds.Length) {
 922                                 int start = bounds [pair];
 923                                 int end = bounds [pair + 1]; // exclusive
 924                                 for (int cp = start; cp < end; cp++) {
 925                                         if (Char.IsNumber ((char) cp))
 926                                                 diacritical [cp] = secondary;
 927                                 }
 928                                 pair += 2;
 929                                 secondary++;
 930                         }
 931                 }
 922
 923                 void GenerateCore ()
 924                 {
 925                         UnicodeCategory uc;
 926
 927                         #region Specially ignored // 01
 928                         // This will raise "Defined" flag up.
 929                         foreach (char c in specialIgnore)
 930                                 map [(int) c] = new CharMapEntry (0, 0, 0);
 931                         #endregion
 932
 933
 934                         #region Variable weights
 935                         // Controls : 06 03 - 06 3D
 936                         fillIndex [6] = 3;
 937                         for (int i = 0; i < 65536; i++) {
 938                                 if (IsIgnorable (i))
 939                                         continue;
 940                                 char c = (char) i;
 941                                 uc = Char.GetUnicodeCategory (c);
 942                                 // NEL is whitespace but not ignored here.
 943                                 if (uc == UnicodeCategory.Control &&
 944                                         !Char.IsWhiteSpace (c) || c == '\u0085')
 945                                         AddCharMap (c, 6, 1);
 946                         }
 947
 948                         // Apostrophe 06 80
 949                         fillIndex [6] = 0x80;
 950                         AddCharMapGroup ('\'', 6, 1, 0);
 951                         AddCharMap ('\uFE63', 6, 1);
 952
 953                         // Hyphen/Dash : 06 81 - 06 90
 954                         for (int i = 0; i < char.MaxValue; i++) {
 955                                 if (Char.GetUnicodeCategory ((char) i)
 956                                         == UnicodeCategory.DashPunctuation)
 957                                         AddCharMapGroupTail ((char) i, 6, 1);
 958                         }
 959
 960                         // Arabic variable weight chars 06 A0 -
 961                         fillIndex [6] = 0xA0;
 962                         // vowels
 963                         for (int i = 0x64B; i <= 0x650; i++)
 964                                 AddCharMapGroupTail ((char) i, 6, 1);
 965                         // sukun
 966                         AddCharMapGroup ('\u0652', 6, 1, 0);
 967                         // shadda
 968                         AddCharMapGroup ('\u0651', 6, 1, 0);
 969                         #endregion
 970
 971
 972                         #region Nonspacing marks // 01
 973                         // FIXME: 01 03 - 01 B6 ... annoyance :(
 974
 975                         // Combining diacritical marks: 01 DC -
 976
 977                         fillIndex [0x1] = 0x41;
 978                         for (int i = 0x030E; i <= 0x0326; i++)
 979                                 if (!IsIgnorable (i))
 980                                         AddCharMap ((char) i, 0x1, 1);
 981                         for (int i = 0x0329; i <= 0x0334; i++)
 982                                 if (!IsIgnorable (i))
 983                                         AddCharMap ((char) i, 0x1, 1);
 984                         for (int i = 0x0339; i <= 0x0341; i++)
 985                                 if (!IsIgnorable (i))
 986                                         AddCharMap ((char) i, 0x1, 1);
 987                         fillIndex [0x1] = 0x72;
 988                         for (int i = 0x0346; i <= 0x0348; i++)
 989                                 if (!IsIgnorable (i))
 990                                         AddCharMap ((char) i, 0x1, 1);
 991                         for (int i = 0x02BE; i <= 0x02BF; i++)
 992                                 if (!IsIgnorable (i))
 993                                         AddCharMap ((char) i, 0x1, 1);
 994                         for (int i = 0x02C1; i <= 0x02C5; i++)
 995                                 if (!IsIgnorable (i))
 996                                         AddCharMap ((char) i, 0x1, 1);
 997                         for (int i = 0x02CE; i <= 0x02CF; i++)
 998                                 if (!IsIgnorable (i))
 999                                         AddCharMap ((char) i, 0x1, 1);
1000                         for (int i = 0x02D1; i <= 0x02D3; i++)
1001                                 if (!IsIgnorable (i))
1002                                         AddCharMap ((char) i, 0x1, 1);
1003                         AddCharMap ('\u02DE', 0x1, 1);
1004                         for (int i = 0x02E4; i <= 0x02E9; i++)
1005                                 if (!IsIgnorable (i))
1006                                         AddCharMap ((char) i, 0x1, 1);
1007
1008                         // LAMESPEC: It should not stop at '\u20E1'. There are
1009                         // a few more characters (that however results in
1010                         // overflow of level 2 unless we start before 0xDD).
1011                         fillIndex [0x1] = 0xDC;
1012                         for (int i = 0x20d0; i <= 0x20e1; i++)
1013                                 AddCharMap ((char) i, 0x1, 1);
1014                         #endregion
1015
1016
1017                         #region Whitespaces // 07 03 -
1018                         fillIndex [0x7] = 0x2;
1019                         AddCharMap (' ', 0x7, 2);
1020                         AddCharMap ('\u00A0', 0x7, 1);
1021                         for (int i = 9; i <= 0xD; i++)
1022                                 AddCharMap ((char) i, 0x7, 1);
1023                         for (int i = 0x2000; i <= 0x200B; i++)
1024                                 AddCharMap ((char) i, 0x7, 1);
1025
1026                         fillIndex [0x7] = 0x17;
1027                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
1028                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
1029
1030                         // Characters which are used to represent layout control.
1031                         // LAMESPEC: Windows developers seem to have thought
1032                         // that those characters are a kind of whitespace,
1033                         // while they aren't.
1034                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1035                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
1036                         #endregion
1037
1038
1039                         #region ASCII non-alphanumeric + 3001, 3002 // 07
1040                         // non-alphanumeric ASCII except for: + - < = > '
1041                         for (int i = 0x21; i < 0x7F; i++) {
1042                                 if (Char.IsLetterOrDigit ((char) i)
1043                                         || "+-<=>'".IndexOf ((char) i) >= 0)
1044                                         continue; // they are not added here.
1045                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
1046                                 // Insert 3001 after ',' and 3002 after '.'
1047                                 if (i == 0x2C)
1048                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
1049                                 else if (i == 0x2E)
1050                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
1051                                 else if (i == 0x3A)
1052                                         AddCharMap ('\uFE30', 0x7, 1, 0);
1053                         }
1054                         #endregion
1055
1056
1057                         // FIXME: for 07 xx we need more love.
1058
1059                         // FIXME: 08 should be more complete.
1060                         fillIndex [0x8] = 2;
1061                         for (int cp = 0; cp < char.MaxValue; cp++)
1062                                 if (Char.GetUnicodeCategory ((char) cp) ==
1063                                         UnicodeCategory.MathSymbol)
1064                                         AddCharMapGroup2 ((char) cp, 0x8, 1, 0);
1065
1066                         // FIXME: implement 09
1067
1068                         // FIXME: implement 0A
1069                         #region Symbols
1070                         fillIndex [0xA] = 2;
1071                         // byte currency symbols
1072                         for (int cp = 0; cp < 0x100; cp++) {
1073                                 uc = Char.GetUnicodeCategory ((char) cp);
1074                                 if (uc == UnicodeCategory.CurrencySymbol &&
1075                                         cp != '$')
1076                                         AddCharMapGroup2 ((char) cp, 0xA, 1, 0);
1077                         }
1078                         // byte other symbols
1079                         for (int cp = 0; cp < 0x100; cp++) {
1080                                 uc = Char.GetUnicodeCategory ((char) cp);
1081                                 if (uc == UnicodeCategory.OtherSymbol)
1082                                         AddCharMapGroup2 ((char) cp, 0xA, 1, 0);
1083                         }
1084                         #endregion
1085
1086                         #region Numbers // 0C 02 - 0C E1
1087                         fillIndex [0xC] = 2;
1088
1089                         // 9F8 : Bengali "one less than the denominator"
1090                         AddCharMap ('\u09F8', 0xC, 1);
1091
1092                         ArrayList numbers = new ArrayList ();
1093                         for (int i = 0; i < 65536; i++)
1094                                 if (!IsIgnorable (i) &&
1095                                         Char.IsNumber ((char) i) &&
1096                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1097                                         numbers.Add (i);
1098
1099                         ArrayList numberValues = new ArrayList ();
1100                         foreach (int i in numbers)
1101                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1102                         numberValues.Sort (DictionaryValueComparer.Instance);
1103
1104 //foreach (DictionaryEntry de in numberValues)
1105 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1106
1107                         decimal prevValue = -1;
1108                         foreach (DictionaryEntry de in numberValues) {
1109                                 int cp = (int) de.Key;
1110                                 decimal currValue = (decimal) de.Value;
1111                                 bool addnew = false;
1112                                 if (prevValue < currValue &&
1113                                         prevValue - (int) prevValue == 0 &&
1114                                         prevValue >= 1) {
1115
1116                                         addnew = true;
1117                                         // Process Hangzhou and Roman numbers
1118
1119                                         // There are some SPECIAL cases.
1120                                         if (currValue != 4) // no increment for 4
1121                                                 fillIndex [0xC]++;
1122
1123                                         int xcp;
1124                                         xcp = (int) prevValue + 0x2170 - 1;
1125                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1126                                         xcp = (int) prevValue + 0x2160 - 1;
1127                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1128                                         fillIndex [0xC] += 2;
1129                                         xcp = (int) prevValue + 0x3021 - 1;
1130                                         AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1131                                         fillIndex [0xC]++;
1132                                 }
1133                                 if (prevValue < currValue)
1134                                         prevValue = currValue;
1135                                 if (map [cp].Defined)
1136                                         continue;
1137                                 // Hangzhou and Roman numerals are added by the
1138                                 // code above, so they are skipped here.
1139                                 else if (0x3021 <= cp && cp < 0x302A
1140                                         || 0x2160 <= cp && cp < 0x216A
1141                                         || 0x2170 <= cp && cp < 0x217A)
1142                                         continue;
1143
1144                                 if (cp ==  0x215B) // FIXME: why?
1145                                         fillIndex [0xC] += 2;
1146                                 else if (cp == 0x3021) // FIXME: why?
1147                                         fillIndex [0xC]++;
1148                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1149
1150                                 if (addnew || cp <= '9') {
1151                                         int xcp;
1152                                         if (1 <= currValue && currValue <= 10) {
1153                                                 xcp = cp - 0x31 + 0x2776;
1154                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1155                                                 xcp = cp - 0x31 + 0x2780;
1156                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1157                                                 xcp = cp - 0x31 + 0x278A;
1158                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1159                                         }
1160                                         if (1 <= currValue && currValue <= 20) {
1161                                                 xcp = cp - 0x31 + 0x2460;
1162                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1163                                                 xcp = cp - 0x31 + 0x2474;
1164                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1165                                                 xcp = cp - 0x31 + 0x2488;
1166                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1167                                         }
1168                                 }
1169
1170                                 if (cp != 0x09E7 && cp != 0x09EA)
1171                                         fillIndex [0xC]++;
1172
1173                                 // Add special cases that are not regarded as
1174                                 // numbers in UnicodeCategory speak.
1175                                 if (cp == '5') {
1176                                         // TONE FIVE
1177                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1178                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1179                                 }
1180                                 else if (cp == '6') // FIXME: why?
1181                                         fillIndex [0xC]++;
1182                         }
1183
1184                         // 221E: infinity
1185                         fillIndex [0xC] = 0xFF;
1186                         AddCharMap ('\u221E', 0xC, 1);
1187                         #endregion
1188
1189                         #region Letters and NonSpacing Marks (general)
1190
1191                         // Latin alphabets
1192                         for (int i = 0; i < alphabets.Length; i++)
1193                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1194
1195                         // Greek and Coptic
1196                         fillIndex [0xF] = 02;
1197                         for (int i = 0x0380; i < 0x0390; i++)
1198                                 if (Char.IsLetter ((char) i))
1199                                         AddLetterMap ((char) i, 0xF, 1);
1200                         fillIndex [0xF] = 02;
1201                         for (int i = 0x0391; i < 0x03CF; i++)
1202                                 if (Char.IsLetter ((char) i))
1203                                         AddLetterMap ((char) i, 0xF, 1);
1204                         fillIndex [0xF] = 0x40;
1205                         for (int i = 0x03D0; i < 0x0400; i++)
1206                                 if (Char.IsLetter ((char) i))
1207                                         AddLetterMap ((char) i, 0xF, 1);
1208
1209                         // Cyrillic - UCA order w/ some modification
1210                         fillIndex [0x10] = 0x3;
1211                         // table which is mostly from UCA DUCET.
1212                         for (int i = 0; i < orderedCyrillic.Length; i++) {
1213                                 char c = orderedCyrillic [i];
1214                                 if (Char.IsLetter (c))
1215                                         AddLetterMap (c, 0x10, 3);
1216                         }
1217                         for (int i = 0x0460; i < 0x0481; i++) {
1218                                 if (Char.IsLetter ((char) i))
1219                                         AddLetterMap ((char) i, 0x10, 3);
1220                         }
1221
1222                         // Armenian
1223                         fillIndex [0x11] = 0x3;
1224                         for (int i = 0x0531; i < 0x0586; i++)
1225                                 if (Char.IsLetter ((char) i))
1226                                         AddLetterMap ((char) i, 0x11, 1);
1227
1228                         // Hebrew
1229                         // -Letters
1230                         fillIndex [0x12] = 0x3;
1231                         for (int i = 0x05D0; i < 0x05FF; i++)
1232                                 if (Char.IsLetter ((char) i))
1233                                         AddLetterMap ((char) i, 0x12, 1);
1234                         // -Accents
1235                         fillIndex [0x1] = 0x3;
1236                         for (int i = 0x0591; i <= 0x05C2; i++)
1237                                 if (i != 0x05BE)
1238                                         AddCharMap ((char) i, 0x1, 1);
1239
1240                         // Arabic
1241                         fillIndex [0x1] = 0x8E;
1242                         fillIndex [0x13] = 0x3;
1243                         for (int i = 0x0621; i <= 0x064A; i++) {
1244                                 // Abjad
1245                                 if (Char.GetUnicodeCategory ((char) i)
1246                                         != UnicodeCategory.OtherLetter) {
1247                                         // FIXME: arabic nonspacing marks are
1248                                         // in different order.
1249                                         AddCharMap ((char) i, 0x1, 1);
1250                                         continue;
1251                                 }
1252                                 map [i] = new CharMapEntry (0x13,
1253                                         (byte) arabicLetterPrimaryValues [i], 1);
1254                         }
1255                         fillIndex [0x13] = 0x84;
1256                         for (int i = 0x0674; i < 0x06D6; i++)
1257                                 if (Char.IsLetter ((char) i))
1258                                         AddLetterMap ((char) i, 0x13, 1);
1259
1260                         // Devanagari
1261                         // FIXME: it does seem straight codepoint mapping.
1262                         fillIndex [0x14] = 04;
1263                         for (int i = 0x0901; i < 0x0905; i++)
1264                                 if (!IsIgnorable (i))
1265                                         AddLetterMap ((char) i, 0x14, 2);
1266                         fillIndex [0x14] = 0xB;
1267                         for (int i = 0x0905; i < 0x093A; i++)
1268                                 if (Char.IsLetter ((char) i))
1269                                         AddLetterMap ((char) i, 0x14, 4);
1270                         for (int i = 0x093E; i < 0x094F; i++)
1271                                 if (Char.IsLetter ((char) i))
1272                                         AddLetterMap ((char) i, 0x14, 2);
1273
1274                         // Bengali
1275                         // -Letters
1276                         fillIndex [0x15] = 02;
1277                         for (int i = 0x0980; i < 0x9FF; i++) {
1278                                 if (IsIgnorable (i))
1279                                         continue;
1280                                 if (i == 0x09E0)
1281                                         fillIndex [0x15] = 0x3B;
1282                                 switch (Char.GetUnicodeCategory ((char) i)) {
1283                                 case UnicodeCategory.NonSpacingMark:
1284                                 case UnicodeCategory.DecimalDigitNumber:
1285                                 case UnicodeCategory.OtherNumber:
1286                                         continue;
1287                                 }
1288                                 AddLetterMap ((char) i, 0x15, 1);
1289                         }
1290                         // -Signs
1291                         fillIndex [0x1] = 0x3;
1292                         for (int i = 0x0981; i < 0x0A00; i++)
1293                                 if (Char.GetUnicodeCategory ((char) i) ==
1294                                         UnicodeCategory.NonSpacingMark)
1295                                         AddCharMap ((char) i, 0x1, 1);
1296
1297                         // Gurmukhi. orderedGurmukhi is from UCA
1298                         // FIXME: it does not look equivalent to UCA.
1299                         fillIndex [0x1] = 03;
1300                         fillIndex [0x16] = 02;
1301                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
1302                                 char c = orderedGurmukhi [i];
1303                                 if (IsIgnorable ((int) c))
1304                                         continue;
1305                                 if (!Char.IsLetter (c)) {
1306                                         AddLetterMap (c, 0x1, 1);
1307                                         continue;
1308                                 }
1309                                 if (c == '\u0A3C' || c == '\u0A4D' ||
1310                                         '\u0A66' <= c && c <= '\u0A71')
1311                                         continue;
1312                                 AddLetterMap (c, 0x16, 4);
1313                         }
1314
1315                         // Gujarati. orderedGujarati is from UCA
1316                         fillIndex [0x17] = 02;
1317                         for (int i = 0; i < orderedGujarati.Length; i++)
1318                                 AddLetterMap (orderedGujarati [i], 0x17, 4);
1319
1320                         // Oriya
1321                         fillIndex [0x18] = 02;
1322                         for (int i = 0x0B00; i < 0x0B7F; i++) {
1323                                 switch (Char.GetUnicodeCategory ((char) i)) {
1324                                 case UnicodeCategory.NonSpacingMark:
1325                                 case UnicodeCategory.DecimalDigitNumber:
1326                                         continue;
1327                                 }
1328                                 AddLetterMap ((char) i, 0x18, 1);
1329                         }
1330
1331                         // Tamil
1332                         fillIndex [0x19] = 2;
1333                         AddCharMap ('\u0BD7', 0x19, 0);
1334                         fillIndex [0x19] = 0xA;
1335                         // vowels. FIXME: 0x0BD7 > 0x0B94, so this loop never runs.
1336                         for (int i = 0x0BD7; i < 0x0B94; i++)
1337                                 if (Char.IsLetter ((char) i))
1338                                         AddCharMap ((char) i, 0x19, 2);
1339                         // special vowel
1340                         fillIndex [0x19] = 0x24;
1341                         AddCharMap ('\u0B94', 0x19, 0);
1342                         fillIndex [0x19] = 0x26;
1343                         // The array for Tamil consonants is a constant.
1344                         // Windows has a sequence almost similar to TAM from
1345                         // tamilnet, but it is a bit different in Grantha.
1346                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
1347                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1348                         // combining marks
1349                         fillIndex [0x19] = 0x82;
1350                         for (int i = 0x0BBE; i < 0x0BCD; i++)
1351                                 if (Char.GetUnicodeCategory ((char) i) ==
1352                                         UnicodeCategory.SpacingCombiningMark
1353                                         || i == 0x0BC0)
1354                                         AddLetterMap ((char) i, 0x19, 2);
1355
1356                         // Telugu
1357                         fillIndex [0x1A] = 0x4;
1358                         for (int i = 0x0C00; i < 0x0C62; i++) {
1359                                 if (i == 0x0C55 || i == 0x0C56)
1360                                         continue; // skip
1361                                 AddCharMap ((char) i, 0x1A, 3);
1362                                 char supp = (i == 0x0C0B) ? '\u0C60':
1363                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
1364                                 if (supp == char.MinValue)
1365                                         continue;
1366                                 AddCharMap (supp, 0x1A, 3);
1367                         }
1368
1369                         // Kannada
1370                         fillIndex [0x1B] = 4;
1371                         for (int i = 0x0C80; i < 0x0CE5; i++) {
1372                                 if (i == 0x0CD5 || i == 0x0CD6)
1373                                         continue; // ignore
1374                                 AddCharMap ((char) i, 0x1B, 3);
1375                         }
1376
1377                         // Malayalam
1378                         fillIndex [0x1C] = 2;
1379                         for (int i = 0x0D02; i < 0x0D61; i++)
1380                                 // FIXME: I avoided MSCompatUnicodeTable usage
1381                                 // here (it results in recursion). So check if
1382                                 // using NonSpacingMark makes sense or not.
1383                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1384 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1385                                         AddCharMap ((char) i, 0x1C, 1);
1386
1387                         // Thai ... note that it breaks 0x1E wall after E2B!
1388                         // Also, all Thai characters have level 2 value 3.
1389                         fillIndex [0x1E] = 2;
1390                         for (int i = 0xE44; i < 0xE48; i++)
1391                                 AddCharMap ((char) i, 0x1E, 1, 3);
1392                         for (int i = 0xE01; i < 0xE2B; i++)
1393                                 AddCharMap ((char) i, 0x1E, 6, 0);
1394                         fillIndex [0x1F] = 5;
1395                         for (int i = 0xE2B; i < 0xE30; i++)
1396                                 AddCharMap ((char) i, 0x1F, 6, 0);
1397                         for (int i = 0xE30; i < 0xE3B; i++)
1398                                 AddCharMap ((char) i, 0x1F, 1, 3);
1399                         // some Thai characters remain.
1400                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
1401                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1402                         foreach (char c in specialThai)
1403                                 AddCharMap (c, 0x1F, 1);
1404
1405                         // Lao
1406                         fillIndex [0x1F] = 2;
1407                         for (int i = 0xE80; i < 0xEDF; i++)
1408                                 if (Char.IsLetter ((char) i))
1409                                         AddCharMap ((char) i, 0x1F, 1);
1410
1411                         // Georgian. orderedGeorgian is from UCA DUCET.
1412                         fillIndex [0x21] = 5;
1413                         for (int i = 0; i < orderedGeorgian.Length; i++)
1414                                 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1415
1416                         // Japanese Kana.
1417                         fillIndex [0x22] = 2;
1418                         int kanaOffset = 0x3041;
1419                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1420
1421                         for (int gyo = 0; gyo < 9; gyo++) {
1422                                 for (int dan = 0; dan < 5; dan++) {
1423                                         if (gyo == 7 && dan % 2 == 1) {
1424                                                 // 'ya'-gyo
1425                                                 fillIndex [0x22]++;
1426                                                 kanaOffset -= 2; // There is no space for yi and ye.
1427                                                 continue;
1428                                         }
1429                                         int cp = kanaOffset + dan * kanaLines [gyo];
1430                                         // small lines (a-gyo, ya-gyo)
1431                                         if (gyo == 0 || gyo == 7) {
1432                                                 AddKanaMap (cp, 1); // small
1433                                                 AddKanaMap (cp + 1, 1);
1434                                         }
1435                                         else
1436                                                 AddKanaMap (cp, kanaLines [gyo]);
1437                                         fillIndex [0x22]++;
1438
1439                                         if (cp == 0x3061) {
1440                                                 // add small 'Tsu' (before normal one)
1441                                                 AddKanaMap (0x3063, 1);
1442                                                 kanaOffset++;
1443                                         }
1444                                 }
1445                                 fillIndex [0x22] += 3;
1446                                 kanaOffset += 5 * kanaLines [gyo];
1447                         }
1448
1449                         // Wa-gyo is almost special, so I just manually add.
1450                         AddLetterMap ((char) 0x308E, 0x22, 0);
1451                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1452                         AddLetterMap ((char) 0x308F, 0x22, 0);
1453                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1454                         fillIndex [0x22]++;
1455                         AddLetterMap ((char) 0x3090, 0x22, 0);
1456                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1457                         fillIndex [0x22] += 2;
1458                         // no "Wu" in Japanese.
1459                         AddLetterMap ((char) 0x3091, 0x22, 0);
1460                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1461                         fillIndex [0x22]++;
1462                         AddLetterMap ((char) 0x3092, 0x22, 0);
1463                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1464                         // Nn
1465                         fillIndex [0x22] = 0x80;
1466                         AddLetterMap ((char) 0x3093, 0x22, 0);
1467                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1468
1469                         // JIS Japanese square chars.
1470                         fillIndex [0x22] = 0x97;
1471                         jisJapanese.Sort (JISComparer.Instance);
1472                         foreach (JISCharacter j in jisJapanese)
1473                                 AddCharMap ((char) j.CP, 0x22, 1);
1474                         // non-JIS Japanese square chars.
1475                         nonJisJapanese.Sort (NonJISComparer.Instance);
1476                         foreach (NonJISCharacter j in nonJisJapanese)
1477                                 AddCharMap ((char) j.CP, 0x22, 1);
1478
1479                         // Bopomofo
1480                         fillIndex [0x23] = 0x02;
1481                         for (int i = 0x3105; i <= 0x312C; i++)
1482                                 AddCharMap ((char) i, 0x23, 1);
1483
1484                         // Estrangela: ancient Syriac
1485                         fillIndex [0x24] = 0x0B;
1486                         // FIXME: is 0x71E really alternative form?
1487                         ArrayList syriacAlternatives = new ArrayList (
1488                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1489                         for (int i = 0x0710; i <= 0x072C; i++) {
1490                                 if (i == 0x0711) // NonSpacingMark
1491                                         continue;
1492                                 if (syriacAlternatives.Contains (i))
1493                                         continue;
1494                                 AddCharMap ((char) i, 0x24, 4);
1495                                 // FIXME: why?
1496                                 if (i == 0x721)
1497                                         fillIndex [0x24]++;
1498                         }
1499                         foreach (int cp in syriacAlternatives)
1500                                 map [cp] = new CharMapEntry (0x24,
1501                                         (byte) (map [cp - 1].Level1 + 2),
1502                                         0);
1503
1504                         // Thaana
1505                         // FIXME: it turned out that it does not look like UCA
1506                         fillIndex [0x24] = 0x6E;
1507                         for (int i = 0; i < orderedThaana.Length; i++) {
1508                                 if (IsIgnorableNonSpacing (i))
1509                                         continue;
1510                                 AddCharMap (orderedThaana [i], 0x24, 2);
1511                         }
1512                         #endregion
1513
1514                         #region Level2 adjustment
1515                         // Arabic Hamzah
1516                         diacritical [0x624] = 0x5;
1517                         diacritical [0x626] = 0x7;
1518                         diacritical [0x622] = 0x9;
1519                         diacritical [0x623] = 0xA;
1520                         diacritical [0x625] = 0xB;
1521                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
1522                         diacritical [0x64A] = 0x7; // Yaa'
1523
1524
1525                         for (int i = 0; i < 0x10000; i++) {
1526                                 switch (map [i].Category) {
1527                                 case 0xE: // Latin diacritics
1528                                 case 0x22: // Japanese: circled characters
1529                                         map [i] = new CharMapEntry (
1530                                                 map [i].Category,
1531                                                 map [i].Level1,
1532                                                 diacritical [i]);
1533                                         break;
1534                                 case 0x13: // Arabic
1535                                         if (diacritical [i] == 0)
1536                                                 // default by 8
1537                                                 diacritical [i] = 0x8;
1538                                         map [i] = new CharMapEntry (0xE, map [i].Level1, diacritical [i]);
1539                                         break;
1540                                 }
1541                         }
1542                         #endregion
1543
1544                         // FIXME: Add more culture-specific letters (that are
1545                         // not supported in Windows collation) here.
1546
1547                         // Surrogate ... they are computed.
1548
1549                         // Hangul.
1550                         //
1551                         // Unlike UCA Windows Hangul sequence mixes Jongseong
1552                         // with Choseong sequence as well as Jungseong,
1553                         // adjusted to have the same primary weight for the
1554                         // same base character. So it is impossible to compute
1555                         // those sort keys.
1556                         //
1557                         // Here I introduce an ordered sequence of mixed
1558                         // 'commands' and 'characters' that is similar to
1559                         // LDML text:
1560                         //      - ',' increases primary weight.
1561                         //      - [A B] means a range, increasing index
1562                         //      - {A B} means a range, without increasing index
1563                         //      - '=' is no operation (it means the characters
1564                         //        of both sides have the same weight).
1565                         //      - '>' inserts a Hangul Syllable block that
1566                         //        contains 0x15 * 0x1C (0x24C) characters.
1567                         //      - '<' decreases the index
1568                         //      - '0'-'9' means skip count
1569                         //      - whitespaces are ignored
1570                         //
1571
1572                         string hangulSequence =
1573                         + "\u1100=\u11A8 > \u1101=\u11A9 >"
1574                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
1575                         + "<{\u1113 \u1116}, \u3165,"
1576                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
1577                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
1578                         + "\u11CA, \u1104, \u11CB > \u1105 >"
1579                         + "\u11B0, [\u11CC \u11D0], \u11B1, [\u11D1 \u11D2],"
1580                                 + "\u11B2, [\u11D3 \u11D5], \u11B3,"
1581                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
1582                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
1583                         + "[\u11DA \u11E2], \u1107=\u11B8 >"
1584                         + "<{\u111E \u1120}, \u3172,, \u3173, "
1585                                 + "\u11E3, \u1108 >"
1586                         + "\u11B9,,,,,,,,, [\u11E4 \u11E6],, \u1109=\u11BA,,,"
1587                                 + "\u3214=\u3274 <>"
1588                         + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
1589                                 + "\u11EA,, \u110A=\u11BB,,, >"
1590                         + "{\u1134 \u1140}, \u317E,,,,,, \u11EB,"
1591                         + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
1592                         + "\u11EE, \u11EC, \u11ED,,,,, \u11F1,, \u11F2,,,"
1593                                 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
1594                         + "\u110D,,  >"
1595                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
1596                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
1597                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
1598                         + "\u11F3, \u11F4, \u1112=\u11C2 >"
1599                         + "\u11F9, [\u11F5 \u11F8]"
1600                         ;
1601
1602                         byte hangulCat = 0x52;
1603                         fillIndex [hangulCat] = 0x2;
1604
1605                         int syllableBlock = 0;
1606                         for (int n = 0; n < hangulSequence.Length; n++) {
1607                                 char c = hangulSequence [n];
1608                                 int start, end;
1609                                 if (Char.IsWhiteSpace (c))
1610                                         continue;
1611                                 switch (c) {
1612                                 case '=':
1613                                         break; // NOP
1614                                 case ',':
1615                                         IncrementSequentialIndex (ref hangulCat);
1616                                         break;
1617                                 case '<':
1618                                         if (fillIndex [hangulCat] == 2)
1619                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
1620                                         fillIndex [hangulCat]--;
1621                                         break;
1622                                 case '>':
1623                                         IncrementSequentialIndex (ref hangulCat);
1624                                         for (int l = 0; l < 0x15; l++)
1625                                                 for (int v = 0; v < 0x1C; v++) {
1626                                                         AddCharMap (
1627                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
1628                                                         IncrementSequentialIndex (ref hangulCat);
1629                                                 }
1630                                         syllableBlock++;
1631                                         break;
1632                                 case '[':
1633                                         start = hangulSequence [n + 1];
1634                                         end = hangulSequence [n + 3];
1635                                         for (int i = start; i <= end; i++) {
1636                                                 AddCharMap ((char) i, hangulCat, 0);
1637                                                 if (end > i)
1638                                                         IncrementSequentialIndex (ref hangulCat);
1639                                         }
1640                                         n += 4; // consumes 5 characters for this operation
1641                                         break;
1642                                 case '{':
1643                                         start = hangulSequence [n + 1];
1644                                         end = hangulSequence [n + 3];
1645                                         for (int i = start; i <= end; i++)
1646                                                 AddCharMap ((char) i, hangulCat, 0);
1647                                         n += 4; // consumes 5 characters for this operation
1648                                         break;
1649                                 default:
1650                                         AddCharMap (c, hangulCat, 0);
1651                                         break;
1652                                 }
1653                         }
1654
1655                         // CJK unified ideograph.
1656                         byte cjkCat = 0x9E;
1657                         fillIndex [cjkCat] = 0x2;
1658                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
1659                                 if (!IsIgnorable (cp))
1660                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
1661                         // CJK Extensions goes here.
1662                         // LAMESPEC: With this Windows style CJK layout, it is
1663                         // impossible to add more CJK ideograph i.e. 0x9FA6-
1664                         // 0x9FBB can never be added w/o breaking compat.
1665                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
1666                                 if (!IsIgnorable (cp))
1667                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
1668
1669                         // PrivateUse ... computed.
1670                         // remaining Surrogate ... computed.
1671
1672                         #region Special "biggest" area (FF FF)
1673                         fillIndex [0xFF] = 0xFF;
1674                         char [] specialBiggest = new char [] {
1675                                 '\u3005', '\u3031', '\u3032', '\u309D',
1676                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1677                                 '\uFE7C', '\uFE7D', '\uFF70'};
1678                         foreach (char c in specialBiggest)
1679                                 AddCharMap (c, 0xFF, 0);
1680                         #endregion
1681
1682                         // Characters w/ diacritical marks (NFKD)
1683                         for (int i = 0; i <= char.MaxValue; i++) {
1684                                 if (map [i].Defined || IsIgnorable (i))
1685                                         continue;
1686                                 if (decompIndex [i] == 0)
1687                                         continue;
1688
1689                                 int start = decompIndex [i];
1690                                 int primaryChar = decompValues [start];
1691                                 if (map [primaryChar].Level1 == 0)
1692                                         continue;
1693                                 int secondary = 0;
1694                                 bool skip = false;
1695                                 for (int l = 1; l < decompLength [i]; l++) {
1696                                         int c = decompValues [start + l];
1697                                         if (map [c].Level1 != 0)
1698                                                 skip = true;
1699                                         secondary += diacritical [c];
1700                                 }
1701                                 if (skip)
1702                                         continue;
1703                                 map [i] = new CharMapEntry (
1704                                         map [primaryChar].Category,
1705                                         map [primaryChar].Level1,
1706                                         (byte) secondary);
1707
1708                         }
1709
1710
1711                         // FIXME: this is hack but those which are
1712                         // NonSpacingMark characters and still undefined
1713                         // are likely to be nonspacing.
1714                         for (int i = 0; i < char.MaxValue; i++)
1715                                 if (!map [i].Defined &&
1716                                         !IsIgnorable (i) &&
1717                                         Char.GetUnicodeCategory ((char) i) ==
1718                                         UnicodeCategory.NonSpacingMark)
1719                                         AddCharMap ((char) i, 1, 1);
1720                 }
1721
1722                 private void IncrementSequentialIndex (ref byte hangulCat)
1723                 {
1724                         fillIndex [hangulCat]++;
1725                         if (fillIndex [hangulCat] == 0) { // overflown
1726                                 hangulCat++;
1727                                 fillIndex [hangulCat] = 0x2;
1728                         }
1729                 }
1730
1731                 // Reset fillIndex to fixed value and call AddLetterMap().
1732                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
1733                 {
1734                         fillIndex [category] = alphaWeight;
1735                         AddLetterMap (c, category, 0);
1736
1737                         ArrayList al = latinMap [c] as ArrayList;
1738                         if (al == null)
1739                                 return;
1740
1741 //Console.Error.WriteLine ("PROCESSING {0}: {1} entries", c, al.Count);
1742 //foreach (int cp in al) Console.Error.WriteLine ("    {0:X04}", cp);
1743                         foreach (int cp in al)
1744                                 AddLetterMap ((char) cp, category, 0);
1745                 }
1746
1747                 private void AddKanaMap (int i, byte voices)
1748                 {
1749                         for (byte b = 0; b < voices; b++) {
1750                                 char c = (char) (i + b);
1751                                 byte arg = (byte) (b > 0 ? b + 2 : 0);
1752                                 // Hiragana
1753                                 AddLetterMapCore (c, 0x22, 0, arg);
1754                                 // Katakana
1755                                 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
1756                         }
1757                 }
1758
1759                 private void AddLetterMap (char c, byte category, byte updateCount)
1760                 {
1761                         AddLetterMapCore (c, category, updateCount, 0);
1762                 }
1763
1764                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
1765                 {
1766                         char c2;
1767                         // <small> updates index
1768                         c2 = ToSmallForm (c);
1769                         if (c2 != c)
1770                                 AddCharMapGroup2 (c2, category, updateCount, level2);
1771                         c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
1772                         if (c2 != c && !map [(int) c2].Defined)
1773                                 AddLetterMapCore (c2, category, 0, level2);
1774                         bool doUpdate = true;
1775                         if (!map [c].Defined)
1776                                 AddCharMapGroup2 (c, category, 0, level2);
1777                         else
1778                                 doUpdate = false;
1779                         if (doUpdate)
1780                                 fillIndex [category] += updateCount;
1781                 }
1782
1783                 private void AddCharMap (char c, byte category, byte increment)
1784                 {
1785                         AddCharMap (c, category, increment, 0);
1786                 }
1787
1788                 private void AddCharMap (char c, byte category, byte increment, byte alt)
1789                 {
1790                         if (IsIgnorable ((int) c) || map [(int) c].Defined) {
1791                                 return; // do nothing
1792                         }
1793
1794                         map [(int) c] = new CharMapEntry (category,
1795                                 category == 1 ? alt : fillIndex [category],
1796                                 category == 1 ? fillIndex [category] : alt);
1797                         fillIndex [category] += increment;
1798                 }
1799
1800                 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
1801                 {
1802                         char c2 = ToSmallFormTail (c);
1803                         if (c2 != c)
1804                                 AddCharMap (c2, category, updateCount, 0);
1805                         // itself
1806                         AddCharMap (c, category, updateCount, 0);
1807                         // <full>
1808                         c2 = ToFullWidthTail (c);
1809                         if (c2 != c)
1810                                 AddCharMapGroupTail (c2, category, updateCount);
1811                 }
1812
1813                 //
1814                 // Adds characters to table in the order below
1815                 // (+ increases weight):
1816                 //      (<small> +)
1817                 //      itself
1818                 //      <fraction>
1819                 //      <full> | <super> | <sub>
1820                 //      <circle> | <wide> (| <narrow>)
1821                 //      +
1822                 //      (vertical +)
1823                 //
1824                 // level2 is fixed (does not increase).
1825                         int [] sameWeightItems = new int [] {
1826                                 0, // canonically compatible
1827                                 DecompositionFraction,
1828                                 DecompositionFull,
1829                                 DecompositionSuper,
1830                                 DecompositionSub,
1831                                 DecompositionCircle,
1832                                 DecompositionWide,
1833                                 DecompositionNarrow,
1834                                 };
1835                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
1836                 {
1837                         char small = char.MinValue;
1838                         char vertical = char.MinValue;
1839                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
1840                         if (nfkd != null) {
1841                                 object smv = nfkd [(byte) DecompositionSmall];
1842                                 if (smv != null)
1843                                         small = (char) ((int) smv);
1844                                 object vv = nfkd [(byte) DecompositionVertical];
1845                                 if (vv != null)
1846                                         vertical = (char) ((int) vv);
1847                         }
1848
1849                         // <small> updates index
1850                         if (small != char.MinValue)
1851                                 AddCharMap (small, category, updateCount);
1852
1853                         // itself
1854                         AddCharMap (c, category, 0, level2);
1855
1856                         if (nfkd != null) {
1857                                 foreach (int weight in sameWeightItems) {
1858                                         object wv = nfkd [(byte) weight];
1859                                         if (wv != null)
1860                                                 AddCharMap ((char) ((int) wv), category, 0, level2);
1861                                 }
1862                         }
1863
1864                         // update index here.
1865                         fillIndex [category] += updateCount;
1866
1867                         if (vertical != char.MinValue)
1868                                 AddCharMap (vertical, category, updateCount, level2);
1869                 }
1870
1871                 private void AddCharMapCJK (char c, ref byte category)
1872                 {
1873                         AddCharMap (c, category, 0, 0);
1874                         IncrementSequentialIndex (ref category);
1875
1876                         // Special. I wonder why but Windows skips 9E F9.
1877                         if (category == 0x9E && fillIndex [category] == 0xF9)
1878                                 IncrementSequentialIndex (ref category);
1879                 }
1880
1881                 private void AddCharMapGroupCJK (char c, ref byte category)
1882                 {
1883                         AddCharMapCJK (c, ref category);
1884
1885                         // LAMESPEC: see below.
1886                         if (c == '\u52DE') {
1887                                 AddCharMapCJK ('\u3298', ref category);
1888                                 AddCharMapCJK ('\u3238', ref category);
1889                         }
1890                         if (c == '\u5BEB')
1891                                 AddCharMapCJK ('\u32A2', ref category);
1892                         if (c == '\u91AB')
1893                                 // Especially this mapping order totally does
1894                                 // not make sense to me.
1895                                 AddCharMapCJK ('\u32A9', ref category);
1896
1897                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
1898                         if (nfkd == null)
1899                                 return;
1900                         for (byte weight = 0; weight <= 17; weight++) {
1901                                 object wv = nfkd [weight];
1902                                 if (wv == null)
1903                                         continue;
1904                                 int w = (int) wv;
1905
1906                                 // Special: they are ignored in this area.
1907                                 // FIXME: check if it is sane
1908                                 if (0xF900 <= w && w <= 0xFAD9)
1909                                         continue;
1910                                 // LAMESPEC: on Windows some of CJK characters
1911                                 // in 3200-32B0 are incorrectly mapped. They
1912                                 // mix Chinise and Japanese Kanji when
1913                                 // ordering those characters.
1914                                 switch (w) {
1915                                 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
1916                                         continue;
1917                                 }
1918
1919                                 AddCharMapCJK ((char) w, ref category);
1920                         }
1921                 }
1922
1923                 // note that level2 is fixed
1924                 // different order than AddCharMapGroup2()
1925                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
1926                 {
1927 /*
1928                         // itself
1929                         AddCharMap (c, category, updateCount, level2);
1930
1931                         Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
1932                         if (nfkd == null)
1933                                 return;
1934
1935                         // Here type of i must be byte since the constants
1936                         // are stored as byte.
1937                         for (byte i = 1; i <= 17; i++) {
1938                                 if (nfkd.ContainsKey (i)) {
1939                                         int cp = (int) nfkd [i];
1940                                         if (decompLength [cp] == 1) {
1941                                                 AddCharMapGroup ((char) cp, category, updateCount, level2);
1942                                         }
1943                                 }
1944                         }
1945 */
1946                         AddCharMapGroup2 (c, category, updateCount, level2);
1947                 }
1948
1949                 char ToFullWidth (char c)
1950                 {
1951                         return ToDecomposed (c, DecompositionFull, false);
1952                 }
1953
1954                 char ToFullWidthTail (char c)
1955                 {
1956                         return ToDecomposed (c, DecompositionFull, true);
1957                 }
1958
1959                 char ToSmallForm (char c)
1960                 {
1961                         return ToDecomposed (c, DecompositionSmall, false);
1962                 }
1963
1964                 char ToSmallFormTail (char c)
1965                 {
1966                         return ToDecomposed (c, DecompositionSmall, true);
1967                 }
1968
1969                 char ToDecomposed (char c, byte d, bool tail)
1970                 {
1971                         if (decompType [(int) c] != d)
1972                                 return c;
1973                         int idx = decompIndex [(int) c];
1974                         if (tail)
1975                                 idx += decompLength [(int) c] - 1;
1976                         return (char) decompValues [idx];
1977                 }
1978
1979                 bool ExistsJIS (int cp)
1980                 {
1981                         foreach (JISCharacter j in jisJapanese)
1982                                 if (j.CP == cp)
1983                                         return true;
1984                         return false;
1985                 }
1986
1987                 #endregion
1988
1989                 #region Level 3 properties (Case/Width)
1990
1991                 private byte ComputeLevel3Weight (char c)
1992                 {
1993                         byte b = ComputeLevel3WeightRaw (c);
1994                         return b > 0 ? (byte) (b + 2) : b;
1995                 }
1996
1997                 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
1998                 {
1999                         // Korean
2000                         if ('\u11A8' <= c && c <= '\u11F9')
2001                                 return 2;
2002                         if ('\uFFA0' <= c && c <= '\uFFDC')
2003                                 return 4;
2004                         if ('\u3130' <= c && c <= '\u3164')
2005                                 return 5;
2006                         // numbers
2007                         if ('\u2776' <= c && c <= '\u277F')
2008                                 return 4;
2009                         if ('\u2780' <= c && c <= '\u2789')
2010                                 return 8;
2011                         if ('\u2776' <= c && c <= '\u2793')
2012                                 return 0xC;
2013                         if ('\u2160' <= c && c <= '\u216F')
2014                                 return 0x18;
2015                         if ('\u2181' <= c && c <= '\u2182')
2016                                 return 0x18;
2017                         // Arabic
2018                         if ('\u2135' <= c && c <= '\u2138')
2019                                 return 4;
2020                         if ('\uFE80' <= c && c < '\uFE8E') {
2021                                 // 2(Isolated)/8(Final)/0x18(Medial)
2022                                 switch (decompType [(int) c]) {
2023                                 case DecompositionIsolated:
2024                                         return 2;
2025                                 case DecompositionFinal:
2026                                         return 8;
2027                                 case DecompositionMedial:
2028                                         return 0x18;
2029                                 }
2030                         }
2031
2032                         // actually I dunno the reason why they have weights.
2033                         switch (c) {
2034                         case '\u01BC':
2035                                 return 0x10;
2036                         case '\u06A9':
2037                                 return 0x20;
2038                         case '\u06AA':
2039                                 return 0x28;
2040                         }
2041
2042                         byte ret = 0;
2043                         switch (c) {
2044                         case '\u03C2':
2045                         case '\u2104':
2046                         case '\u212B':
2047                                 ret |= 8;
2048                                 break;
2049                         case '\uFE42':
2050                                 ret |= 0xC;
2051                                 break;
2052                         }
2053
2054                         // misc
2055                         switch (decompType [(int) c]) {
2056                         case DecompositionWide: // <wide>
2057                         case DecompositionSub: // <sub>
2058                         case DecompositionSuper: // <super>
2059                                 ret |= decompType [(int) c];
2060                                 break;
2061                         }
2062                         if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2063                                 ret |= 8;
2064                         if (isUppercase [(int) c]) // DerivedCoreProperties
2065                                 ret |= 0x10;
2066
2067                         return ret;
2068                 }
2069
2070                 #endregion
2071
2072                 #region IsIgnorable
2073                 // FIXME: In the future use DerivedAge.txt to examine character
2074                 // versions and set those ones that have higher version than
2075                 // 1.0 as ignorable.
2076                 static bool IsIgnorable (int i)
2077                 {
2078                         switch (i) {
2079                         case 0:
2080                         // I guess, those characters are added between
2081                         // Unicode 1.0 (LCMapString) and Unicode 3.1
2082                         // (UnicodeCategory), so they used to be
2083                         // something like OtherNotAssigned as of Unicode 1.1.
2084                         case 0x2df: case 0x387:
2085                         case 0x3d7: case 0x3d8: case 0x3d9:
2086                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2087                         case 0x400: case 0x40d: case 0x450: case 0x45d:
2088                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
2089                         case 0x653: case 0x654: case 0x655: case 0x66d:
2090                         case 0xb56:
2091                         case 0x1e9b: case 0x202f: case 0x20ad:
2092                         case 0x20ae: case 0x20af:
2093                         case 0x20e2: case 0x20e3:
2094                         case 0x2139: case 0x213a: case 0x2183:
2095                         case 0x2425: case 0x2426: case 0x2619:
2096                         case 0x2670: case 0x2671: case 0x3007:
2097                         case 0x3190: case 0x3191:
2098                         case 0xfffc: case 0xfffd:
2099                                 return true;
2100                         // exceptional characters filtered by the
2101                         // following conditions. Originally those exceptional
2102                         // ranges are incorrect (they should not be ignored)
2103                         // and most of those characters are unfortunately in
2104                         // those ranges.
2105                         case 0x4d8: case 0x4d9:
2106                         case 0x4e8: case 0x4e9:
2107                         case 0x3036: case 0x303f:
2108                         case 0x337b: case 0xfb1e:
2109                                 return false;
2110                         }
2111
2112                         if (
2113                                 // The whole Sinhala characters.
2114                                 0x0D82 <= i && i <= 0x0DF4
2115                                 // The whole Tibetan characters.
2116                                 || 0x0F00 <= i && i <= 0x0FD1
2117                                 // The whole Myanmar characters.
2118                                 || 0x1000 <= i && i <= 0x1059
2119                                 // The whole Etiopic, Cherokee,
2120                                 // Canadian Syllablic, Ogham, Runic,
2121                                 // Tagalog, Hanunoo, Philippine,
2122                                 // Buhid, Tagbanwa, Khmer and Mongorian
2123                                 // characters.
2124                                 || 0x1200 <= i && i <= 0x1DFF
2125                                 // Greek extension characters.
2126                                 || 0x1F00 <= i && i <= 0x1FFF
2127                                 // The whole Braille characters.
2128                                 || 0x2800 <= i && i <= 0x28FF
2129                                 // CJK radical characters.
2130                                 || 0x2E80 <= i && i <= 0x2EF3
2131                                 // Kangxi radical characters.
2132                                 || 0x2F00 <= i && i <= 0x2FD5
2133                                 // Ideographic description characters.
2134                                 || 0x2FF0 <= i && i <= 0x2FFB
2135                                 // Bopomofo letter and final
2136                                 || 0x31A0 <= i && i <= 0x31B7
2137                                 // White square with quadrant characters.
2138                                 || 0x25F0 <= i && i <= 0x25F7
2139                                 // Ideographic telegraph symbols.
2140                                 || 0x32C0 <= i && i <= 0x32CB
2141                                 || 0x3358 <= i && i <= 0x3370
2142                                 || 0x33E0 <= i && i <= 0x33FF
2143                                 // The whole YI characters.
2144                                 || 0xA000 <= i && i <= 0xA48C
2145                                 || 0xA490 <= i && i <= 0xA4C6
2146                                 // American small ligatures
2147                                 || 0xFB13 <= i && i <= 0xFB17
2148                                 // hebrew, arabic, variation selector.
2149                                 || 0xFB1D <= i && i <= 0xFE2F
2150                                 // Arabic ligatures.
2151                                 || 0xFEF5 <= i && i <= 0xFEFC
2152                                 // FIXME: why are they excluded?
2153                                 || 0x01F6 <= i && i <= 0x01F9
2154                                 || 0x0218 <= i && i <= 0x0233
2155                                 || 0x02A9 <= i && i <= 0x02AD
2156                                 || 0x02EA <= i && i <= 0x02EE
2157                                 || 0x0349 <= i && i <= 0x036F
2158                                 || 0x0488 <= i && i <= 0x048F
2159                                 || 0x04D0 <= i && i <= 0x04FF
2160                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2161                                 || 0x06D6 <= i && i <= 0x06ED
2162                                 || 0x06FA <= i && i <= 0x06FE
2163                                 || 0x2048 <= i && i <= 0x204D
2164                                 || 0x20e4 <= i && i <= 0x20ea
2165                                 || 0x213C <= i && i <= 0x214B
2166                                 || 0x21EB <= i && i <= 0x21FF
2167                                 || 0x22F2 <= i && i <= 0x22FF
2168                                 || 0x237B <= i && i <= 0x239A
2169                                 || 0x239B <= i && i <= 0x23CF
2170                                 || 0x24EB <= i && i <= 0x24FF
2171                                 || 0x2596 <= i && i <= 0x259F
2172                                 || 0x25F8 <= i && i <= 0x25FF
2173                                 || 0x2672 <= i && i <= 0x2689
2174                                 || 0x2768 <= i && i <= 0x2775
2175                                 || 0x27d0 <= i && i <= 0x27ff
2176                                 || 0x2900 <= i && i <= 0x2aff
2177                                 || 0x3033 <= i && i <= 0x303F
2178                                 || 0x31F0 <= i && i <= 0x31FF
2179                                 || 0x3250 <= i && i <= 0x325F
2180                                 || 0x32B1 <= i && i <= 0x32BF
2181                                 || 0x3371 <= i && i <= 0x337B
2182                                 || 0xFA30 <= i && i <= 0xFA6A
2183                         )
2184                                 return true;
2185
2186                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2187                         switch (uc) {
2188                         case UnicodeCategory.PrivateUse:
2189                         case UnicodeCategory.Surrogate:
2190                                 return false;
2191                         // ignored by nature
2192                         case UnicodeCategory.Format:
2193                         case UnicodeCategory.OtherNotAssigned:
2194                                 return true;
2195                         default:
2196                                 return false;
2197                         }
2198                 }
2199
2200                 // To check IsIgnorable sanity, try the driver below under MS.NET.
2201
2202                 /*
2203                 public static void Main ()
2204                 {
2205                         for (int i = 0; i <= char.MaxValue; i++)
2206                                 Dump (i, IsIgnorable (i));
2207                 }
2208
2209                 static void Dump (int i, bool ignore)
2210                 {
2211                         switch (Char.GetUnicodeCategory ((char) i)) {
2212                         case UnicodeCategory.PrivateUse:
2213                         case UnicodeCategory.Surrogate:
2214                                 return; // check nothing
2215                         }
2216
2217                         string s1 = "";
2218                         string s2 = new string ((char) i, 10);
2219                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2220                         if ((ret == 0) == ignore)
2221                                 return;
2222                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2223                 }
2224                 */
2225                 #endregion // IsIgnorable
2226
2227                 #region IsIgnorableSymbol
2228                 static bool IsIgnorableSymbol (int i)
2229                 {
2230                         if (IsIgnorable (i))
2231                                 return true;
2232
2233                         switch (i) {
2234                         // *Letter
2235                         case 0x00b5: case 0x01C0: case 0x01C1:
2236                         case 0x01C2: case 0x01C3: case 0x01F6:
2237                         case 0x01F7: case 0x01F8: case 0x01F9:
2238                         case 0x02D0: case 0x02EE: case 0x037A:
2239                         case 0x03D7: case 0x03F3:
2240                         case 0x0400: case 0x040d:
2241                         case 0x0450: case 0x045d:
2242                         case 0x048C: case 0x048D:
2243                         case 0x048E: case 0x048F:
2244                         case 0x0587: case 0x0640: case 0x06E5:
2245                         case 0x06E6: case 0x06FA: case 0x06FB:
2246                         case 0x06FC: case 0x093D: case 0x0950:
2247                         case 0x1E9B: case 0x2139: case 0x3006:
2248                         case 0x3033: case 0x3034: case 0x3035:
2249                         case 0xFE7E: case 0xFE7F:
2250                         // OtherNumber
2251                         case 0x16EE: case 0x16EF: case 0x16F0:
2252                         // LetterNumber
2253                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2254                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2255                         case 0x3038: // HANGZHOU NUMERAL TEN
2256                         case 0x3039: // HANGZHOU NUMERAL TWENTY
2257                         case 0x303a: // HANGZHOU NUMERAL THIRTY
2258                         // OtherSymbol
2259                         case 0x2117:
2260                         case 0x327F:
2261                                 return true;
2262                         // ModifierSymbol
2263                         case 0x02B9: case 0x02BA: case 0x02C2:
2264                         case 0x02C3: case 0x02C4: case 0x02C5:
2265                         case 0x02C8: case 0x02CC: case 0x02CD:
2266                         case 0x02CE: case 0x02CF: case 0x02D2:
2267                         case 0x02D3: case 0x02D4: case 0x02D5:
2268                         case 0x02D6: case 0x02D7: case 0x02DE:
2269                         case 0x02E5: case 0x02E6: case 0x02E7:
2270                         case 0x02E8: case 0x02E9:
2271                         case 0x309B: case 0x309C:
2272                         // OtherPunctuation
2273                         case 0x055A: // American Apos
2274                         case 0x05C0: // Hebrew Punct
2275                         case 0x0E4F: // Thai FONGMAN
2276                         case 0x0E5A: // Thai ANGKHANKHU
2277                         case 0x0E5B: // Thai KHOMUT
2278                         // CurencySymbol
2279                         case 0x09F2: // Bengali Rupee Mark
2280                         case 0x09F3: // Bengali Rupee Sign
2281                         // MathSymbol
2282                         case 0x221e: // INF.
2283                         // OtherSymbol
2284                         case 0x0482:
2285                         case 0x09FA:
2286                         case 0x0B70:
2287                                 return false;
2288                         }
2289
2290                         // *Letter
2291                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2292 #if NET_2_0
2293                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2294                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2295 #endif
2296                         )
2297                                 return true;
2298
2299                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2300                         switch (uc) {
2301                         case UnicodeCategory.Surrogate:
2302                                 return false; // inconsistent
2303
2304                         case UnicodeCategory.SpacingCombiningMark:
2305                         case UnicodeCategory.EnclosingMark:
2306                         case UnicodeCategory.NonSpacingMark:
2307                         case UnicodeCategory.PrivateUse:
2308                                 // NonSpacingMark
2309                                 if (0x064B <= i && i <= 0x0652) // Arabic
2310                                         return true;
2311                                 return false;
2312
2313                         case UnicodeCategory.Format:
2314                         case UnicodeCategory.OtherNotAssigned:
2315                                 return true;
2316
2317                         default:
2318                                 bool use = false;
2319                                 // OtherSymbols
2320                                 if (
2321                                         // latin in a circle
2322                                         0x249A <= i && i <= 0x24E9
2323                                         || 0x2100 <= i && i <= 0x2132
2324                                         // Japanese
2325                                         || 0x3196 <= i && i <= 0x31A0
2326                                         // Korean
2327                                         || 0x3200 <= i && i <= 0x321C
2328                                         // Chinese/Japanese
2329                                         || 0x322A <= i && i <= 0x3243
2330                                         // CJK
2331                                         || 0x3260 <= i && i <= 0x32B0
2332                                         || 0x32D0 <= i && i <= 0x3357
2333                                         || 0x337B <= i && i <= 0x33DD
2334                                 )
2335                                         use = !Char.IsLetterOrDigit ((char) i);
2336                                 if (use)
2337                                         return false;
2338
2339                                 // This "Digit" rule is mystery.
2340                                 // It filters some symbols out.
2341                                 if (Char.IsLetterOrDigit ((char) i))
2342                                         return false;
2343                                 if (Char.IsNumber ((char) i))
2344                                         return false;
2345                                 if (Char.IsControl ((char) i)
2346                                         || Char.IsSeparator ((char) i)
2347                                         || Char.IsPunctuation ((char) i))
2348                                         return true;
2349                                 if (Char.IsSymbol ((char) i))
2350                                         return true;
2351
2352                                 // FIXME: should check more
2353                                 return false;
2354                         }
2355                 }
2356
2357                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
2358 /*
2359                 public static void Main ()
2360                 {
2361                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2362                         for (int i = 0; i <= char.MaxValue; i++) {
2363                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2364                                 if (uc == UnicodeCategory.Surrogate)
2365                                         continue;
2366
2367                                 bool ret = IsIgnorableSymbol (i);
2368
2369                                 string s1 = "TEST ";
2370                                 string s2 = "TEST " + (char) i;
2371
2372                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2373
2374                                 if (ret != (result == 0))
2375                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2376                                                 ret ? "should not ignore" :
2377                                                         "should ignore",
2378                                                 i,(char) i, uc);
2379                         }
2380                 }
2381 */
2382                 #endregion
2383
2384                 #region NonSpacing
2385                 static bool IsIgnorableNonSpacing (int i)
2386                 {
2387                         if (IsIgnorable (i))
2388                                 return true;
2389
2390                         switch (i) {
2391                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
2392                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
2393                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
2394                                 return true;
2395                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
2396                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
2397                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
2398                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
2399                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
2400                         case 0x0CCD: case 0x0E4E:
2401                                 return false;
2402                         }
2403
2404                         if (0x02b9 <= i && i <= 0x02c5
2405                                 || 0x02cc <= i && i <= 0x02d7
2406                                 || 0x02e4 <= i && i <= 0x02ef
2407                                 || 0x20DD <= i && i <= 0x20E0
2408                         )
2409                                 return true;
2410
2411                         if (0x064B <= i && i <= 0x00652
2412                                 || 0x0941 <= i && i <= 0x0948
2413                                 || 0x0AC1 <= i && i <= 0x0ACD
2414                                 || 0x0C3E <= i && i <= 0x0C4F
2415                                 || 0x0E31 <= i && i <= 0x0E3F
2416                         )
2417                                 return false;
2418
2419                         return Char.GetUnicodeCategory ((char) i) ==
2420                                 UnicodeCategory.NonSpacingMark;
2421                 }
2422
2423                 // We can reuse IsIgnorableSymbol testcode
2424                 // for IsIgnorableNonSpacing.
2425                 #endregion
2426         }
2427
2428         struct CharMapEntry
2429         {
2430                 public byte Category;
2431                 public byte Level1;
2432                 public byte Level2; // It is always single byte.
2433                 public bool Defined;
2434
2435                 public CharMapEntry (byte category, byte level1, byte level2)
2436                 {
2437                         Category = category;
2438                         Level1 = level1;
2439                         Level2 = level2;
2440                         Defined = true;
2441                 }
2442         }
2443
2444         class JISCharacter
2445         {
2446                 public readonly int CP;
2447                 public readonly int JIS;
2448
2449                 public JISCharacter (int cp, int cpJIS)
2450                 {
2451                         CP = cp;
2452                         JIS = cpJIS;
2453                 }
2454         }
2455
2456         class JISComparer : IComparer
2457         {
2458                 public static readonly JISComparer Instance =
2459                         new JISComparer ();
2460
2461                 public int Compare (object o1, object o2)
2462                 {
2463                         JISCharacter j1 = (JISCharacter) o1;
2464                         JISCharacter j2 = (JISCharacter) o2;
2465                         return j2.JIS - j1.JIS;
2466                 }
2467         }
2468
2469         class NonJISCharacter
2470         {
2471                 public readonly int CP;
2472                 public readonly string Name;
2473
2474                 public NonJISCharacter (int cp, string name)
2475                 {
2476                         CP = cp;
2477                         Name = name;
2478                 }
2479         }
2480
2481         class NonJISComparer : IComparer
2482         {
2483                 public static readonly NonJISComparer Instance =
2484                         new NonJISComparer ();
2485
2486                 public int Compare (object o1, object o2)
2487                 {
2488                         NonJISCharacter j1 = (NonJISCharacter) o1;
2489                         NonJISCharacter j2 = (NonJISCharacter) o2;
2490                         return string.CompareOrdinal (j1.Name, j2.Name);
2491                 }
2492         }
2493
2494         class DictionaryValueComparer : IComparer
2495         {
2496                 public static readonly DictionaryValueComparer Instance
2497                         = new DictionaryValueComparer ();
2498
2499                 private DictionaryValueComparer ()
2500                 {
2501                 }
2502
2503                 public /*static*/ int Compare (object o1, object o2)
2504                 {
2505                         DictionaryEntry e1 = (DictionaryEntry) o1;
2506                         DictionaryEntry e2 = (DictionaryEntry) o2;
2507                         // FIXME: in case of 0, compare decomposition categories
2508                         int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
2509                         if (ret != 0)
2510                                 return ret;
2511                         int i1 = (int) e1.Key;
2512                         int i2 = (int) e2.Key;
2513                         return i1 - i2;
2514                 }
2515         }
2516
2517         class UCAComparer : IComparer
2518         {
2519                 public static readonly UCAComparer Instance
2520                         = new UCAComparer ();
2521
2522                 private UCAComparer ()
2523                 {
2524                 }
2525
2526                 public int Compare (object o1, object o2)
2527                 {
2528                         char i1 = (char) o1;
2529                         char i2 = (char) o2;
2530
2531                         int l1 = CollationElementTable.GetSortKeyCount (i1);
2532                         int l2 = CollationElementTable.GetSortKeyCount (i2);
2533                         int l = l1 > l2 ? l2 : l1;
2534
2535                         for (int i = 0; i < l; i++) {
2536                                 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
2537                                 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
2538                                 int v = k1.Primary - k2.Primary;
2539                                 if (v != 0)
2540                                         return v;
2541                                 v = k1.Secondary - k2.Secondary;
2542                                 if (v != 0)
2543                                         return v;
2544                                 v = k1.Thirtiary - k2.Thirtiary;
2545                                 if (v != 0)
2546                                         return v;
2547                                 v = k1.Quarternary - k2.Quarternary;
2548                                 if (v != 0)
2549                                         return v;
2550                         }
2551                         return l1 - l2;
2552                 }
2553         }
2554 }