mcs/class/corlib/Mono.Globalization.Unicode/MSCompatUnicodeTable.cs

   1 using System;
   2 using System.Globalization;
   3
   4 namespace Mono.Globalization.Unicode
   5 {
   6         internal class MSCompatUnicodeTable
   7         {
   8                 #region IsIgnorable
   9                 public static bool IsIgnorable (int i)
  10                 {
  11                         switch (i) {
  12                         case 0:
  13                         // No idea why each of those is ignored.
  14                         case 0x2df: case 0x387:
  15                         case 0x3d7: case 0x3d8: case 0x3d9:
  16                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
  17                         case 0x400: case 0x40d: case 0x450: case 0x45d:
  18                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
  19                         case 0x653: case 0x654: case 0x655: case 0x66d:
  20                         case 0xb56:
  21                         case 0x1e9b: case 0x202f: case 0x20ad:
  22                         case 0x20ae: case 0x20af:
  23                         case 0x20e2: case 0x20e3:
  24                         case 0x2139: case 0x213a: case 0x2183:
  25                         case 0x2425: case 0x2426: case 0x2619:
  26                         case 0x2670: case 0x2671: case 0x3007:
  27                         case 0x3190: case 0x3191:
  28                         case 0xfffc: case 0xfffd:
  29                                 return true;
  30                         // exceptional characters filtered by the
  31                         // following conditions (no idea why though).
  32                         case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9:
  33                         case 0x70f: case 0x3036: case 0x303f:
  34                         case 0x337b: case 0xfb1e:
  35                                 return false;
  36                         }
  37
  38                         if (
  39                                 // The whole Sinhala characters.
  40                                 0x0D82 <= i && i <= 0x0DF4
  41                                 // The whole Tibetan characters.
  42                                 || 0x0F00 <= i && i <= 0x0FD1
  43                                 // The whole Myanmar characters.
  44                                 || 0x1000 <= i && i <= 0x1059
  45                                 // The whole Etiopic, Cherokee,
  46                                 // Canadian Syllablic, Ogham, Runic,
  47                                 // Tagalog, Hanunoo, Philippine,
  48                                 // Buhid, Tagbanwa, Khmer and Mongorian
  49                                 // characters.
  50                                 || 0x1200 <= i && i <= 0x1DFF
  51                                 // Greek extension characters.
  52                                 || 0x1F00 <= i && i <= 0x1FFF
  53                                 // The whole Braille characters.
  54                                 || 0x2800 <= i && i <= 0x28FF
  55                                 // CJK radical characters.
  56                                 || 0x2E80 <= i && i <= 0x2EF3
  57                                 // Kangxi radical characters.
  58                                 || 0x2F00 <= i && i <= 0x2FD5
  59                                 // Ideographic description characters.
  60                                 || 0x2FF0 <= i && i <= 0x2FFB
  61                                 // Bopomofo letter and final
  62                                 || 0x31A0 <= i && i <= 0x31B7
  63                                 // White square with quadrant characters.
  64                                 || 0x25F0 <= i && i <= 0x25F7
  65                                 // Ideographic telegraph symbols.
  66                                 || 0x32C0 <= i && i <= 0x32CB
  67                                 || 0x3358 <= i && i <= 0x3370
  68                                 || 0x33E0 <= i && i <= 0x33FF
  69                                 // The whole YI characters.
  70                                 || 0xA000 <= i && i <= 0xA48C
  71                                 || 0xA490 <= i && i <= 0xA4C6
  72                                 // American small ligatures
  73                                 || 0xFB13 <= i && i <= 0xFB17
  74                                 // hebrew, arabic, variation selector.
  75                                 || 0xFB1D <= i && i <= 0xFE2F
  76                                 // Arabic ligatures.
  77                                 || 0xFEF5 <= i && i <= 0xFEFC
  78                                 // FIXME: why are they excluded?
  79                                 || 0x01F6 <= i && i <= 0x01F9
  80                                 || 0x0218 <= i && i <= 0x0233
  81                                 || 0x02A9 <= i && i <= 0x02AD
  82                                 || 0x02EA <= i && i <= 0x02EE
  83                                 || 0x0349 <= i && i <= 0x036F
  84                                 || 0x0488 <= i && i <= 0x048F
  85                                 || 0x04D0 <= i && i <= 0x04FF
  86                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
  87                                 || 0x06D6 <= i && i <= 0x06ED
  88                                 || 0x06FA <= i && i <= 0x06FE
  89                                 || 0x2048 <= i && i <= 0x204D
  90                                 || 0x20e4 <= i && i <= 0x20ea
  91                                 || 0x213C <= i && i <= 0x214B
  92                                 || 0x21EB <= i && i <= 0x21FF
  93                                 || 0x22F2 <= i && i <= 0x22FF
  94                                 || 0x237B <= i && i <= 0x239A
  95                                 || 0x239B <= i && i <= 0x23CF
  96                                 || 0x24EB <= i && i <= 0x24FF
  97                                 || 0x2596 <= i && i <= 0x259F
  98                                 || 0x25F8 <= i && i <= 0x25FF
  99                                 || 0x2672 <= i && i <= 0x2689
 100                                 || 0x2768 <= i && i <= 0x2775
 101                                 || 0x27d0 <= i && i <= 0x27ff
 102                                 || 0x2900 <= i && i <= 0x2aff
 103                                 || 0x3033 <= i && i <= 0x303F
 104                                 || 0x31F0 <= i && i <= 0x31FF
 105                                 || 0x3250 <= i && i <= 0x325F
 106                                 || 0x32B1 <= i && i <= 0x32BF
 107                                 || 0x3371 <= i && i <= 0x337B
 108                                 || 0xFA30 <= i && i <= 0xFA6A
 109                         )
 110                                 return true;
 111
 112                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
 113                         switch (uc) {
 114                         // ignored by nature
 115                         case UnicodeCategory.PrivateUse:
 116                         case UnicodeCategory.Surrogate:
 117                                 return false;
 118                         case UnicodeCategory.Format:
 119                         case UnicodeCategory.OtherNotAssigned:
 120                                 return true;
 121                         default:
 122                                 return false;
 123                         }
 124                 }
 125
 126                 // To check IsIgnorable sanity, try the driver below under MS.NET.
 127
 128                 /*
 129                 public static void Main ()
 130                 {
 131                         for (int i = 0; i <= char.MaxValue; i++)
 132                                 Dump (i, IsIgnorable (i));
 133                 }
 134
 135                 static void Dump (int i, bool ignore)
 136                 {
 137                         switch (Char.GetUnicodeCategory ((char) i)) {
 138                         case UnicodeCategory.PrivateUse:
 139                         case UnicodeCategory.Surrogate:
 140                                 return; // check nothing
 141                         }
 142
 143                         string s1 = "";
 144                         string s2 = new string ((char) i, 10);
 145                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
 146                         if ((ret == 0) == ignore)
 147                                 return;
 148                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
 149                 }
 150                 */
 151                 #endregion // IsIgnorable
 152
 153                 #region IsIgnorableSymbol
 154                 public static bool IsIgnorableSymbol (int i)
 155                 {
 156                         if (IsIgnorable (i))
 157                                 return true;
 158
 159                         switch (i) {
 160                         // *Letter
 161                         case 0x00b5: case 0x01C0: case 0x01C1:
 162                         case 0x01C2: case 0x01C3: case 0x01F6:
 163                         case 0x01F7: case 0x01F8: case 0x01F9:
 164                         case 0x02D0: case 0x02EE: case 0x037A:
 165                         case 0x03D7: case 0x03F3:
 166                         case 0x0400: case 0x040d:
 167                         case 0x0450: case 0x045d:
 168                         case 0x048C: case 0x048D:
 169                         case 0x048E: case 0x048F:
 170                         case 0x0587: case 0x0640: case 0x06E5:
 171                         case 0x06E6: case 0x06FA: case 0x06FB:
 172                         case 0x06FC: case 0x093D: case 0x0950:
 173                         case 0x1E9B: case 0x2139: case 0x3006:
 174                         case 0x3033: case 0x3034: case 0x3035:
 175                         case 0xFE7E: case 0xFE7F:
 176                         // OtherNumber
 177                         case 0x16EE: case 0x16EF: case 0x16F0:
 178                         // LetterNumber
 179                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
 180                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
 181                         case 0x3038: // HANGZHOU NUMERAL TEN
 182                         case 0x3039: // HANGZHOU NUMERAL TWENTY
 183                         case 0x303a: // HANGZHOU NUMERAL THIRTY
 184                         // OtherSymbol
 185                         case 0x2117:
 186                         case 0x327F:
 187                                 return true;
 188                         // ModifierSymbol
 189                         case 0x02B9: case 0x02BA: case 0x02C2:
 190                         case 0x02C3: case 0x02C4: case 0x02C5:
 191                         case 0x02C8: case 0x02CC: case 0x02CD:
 192                         case 0x02CE: case 0x02CF: case 0x02D2:
 193                         case 0x02D3: case 0x02D4: case 0x02D5:
 194                         case 0x02D6: case 0x02D7: case 0x02DE:
 195                         case 0x02E5: case 0x02E6: case 0x02E7:
 196                         case 0x02E8: case 0x02E9:
 197                         case 0x309B: case 0x309C:
 198                         // OtherPunctuation
 199                         case 0x055A: // American Apos
 200                         case 0x05C0: // Hebrew Punct
 201                         case 0x0E4F: // Thai FONGMAN
 202                         case 0x0E5A: // Thai ANGKHANKHU
 203                         case 0x0E5B: // Thai KHOMUT
 204                         // CurencySymbol
 205                         case 0x09F2: // Bengali Rupee Mark
 206                         case 0x09F3: // Bengali Rupee Sign
 207                         // MathSymbol
 208                         case 0x221e: // INF.
 209                         // OtherSymbol
 210                         case 0x0482:
 211                         case 0x09FA:
 212                         case 0x0B70:
 213                                 return false;
 214                         }
 215
 216                         // *Letter
 217                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
 218 #if NET_2_0
 219                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
 220                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
 221 #endif
 222                         )
 223                                 return true;
 224
 225                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
 226                         switch (uc) {
 227                         case UnicodeCategory.Surrogate:
 228                                 return false; // inconsistent
 229
 230                         case UnicodeCategory.SpacingCombiningMark:
 231                         case UnicodeCategory.EnclosingMark:
 232                         case UnicodeCategory.NonSpacingMark:
 233                         case UnicodeCategory.PrivateUse:
 234                                 // NonSpacingMark
 235                                 if (0x064B <= i && i <= 0x0652) // Arabic
 236                                         return true;
 237                                 return false;
 238
 239                         case UnicodeCategory.Format:
 240                         case UnicodeCategory.OtherNotAssigned:
 241                                 return true;
 242
 243                         default:
 244                                 bool use = false;
 245                                 // OtherSymbols
 246                                 if (
 247                                         // latin in a circle
 248                                         0x249A <= i && i <= 0x24E9
 249                                         || 0x2100 <= i && i <= 0x2132
 250                                         // Japanese
 251                                         || 0x3196 <= i && i <= 0x31A0
 252                                         // Korean
 253                                         || 0x3200 <= i && i <= 0x321C
 254                                         // Chinese/Japanese
 255                                         || 0x322A <= i && i <= 0x3243
 256                                         // CJK
 257                                         || 0x3260 <= i && i <= 0x32B0
 258                                         || 0x32D0 <= i && i <= 0x3357
 259                                         || 0x337B <= i && i <= 0x33DD
 260                                 )
 261                                         use = !Char.IsLetterOrDigit ((char) i);
 262                                 if (use)
 263                                         return false;
 264
 265                                 // This "Digit" rule is mystery.
 266                                 // It filters some symbols out.
 267                                 if (Char.IsLetterOrDigit ((char) i))
 268                                         return false;
 269                                 if (Char.IsNumber ((char) i))
 270                                         return false;
 271                                 if (Char.IsControl ((char) i)
 272                                         || Char.IsSeparator ((char) i)
 273                                         || Char.IsPunctuation ((char) i))
 274                                         return true;
 275                                 if (Char.IsSymbol ((char) i))
 276                                         return true;
 277
 278                                 // FIXME: should check more
 279                                 return false;
 280                         }
 281                 }
 282
 283                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
 284 /*
 285                 public static void Main ()
 286                 {
 287                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
 288                         for (int i = 0; i <= char.MaxValue; i++) {
 289                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
 290                                 if (uc == UnicodeCategory.Surrogate)
 291                                         continue;
 292
 293                                 bool ret = IsIgnorableSymbol (i);
 294
 295                                 string s1 = "TEST ";
 296                                 string s2 = "TEST " + (char) i;
 297
 298                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
 299
 300                                 if (ret != (result == 0))
 301                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
 302                                                 ret ? "should not ignore" :
 303                                                         "should ignore",
 304                                                 i,(char) i, uc);
 305                         }
 306                 }
 307 */
 308                 #endregion
 309
 310                 #region NonSpacing
 311                 public static bool IsIgnorableNonSpacing (int i)
 312                 {
 313                         if (Mono.Globalization.Unicode.MSCompatUnicodeTable.IsIgnorable (i))
 314                                 return true;
 315
 316                         switch (i) {
 317                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
 318                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
 319                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
 320                                 return true;
 321                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
 322                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
 323                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
 324                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
 325                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
 326                         case 0x0CCD: case 0x0E4E:
 327                                 return false;
 328                         }
 329
 330                         if (0x02b9 <= i && i <= 0x02c5
 331                                 || 0x02cc <= i && i <= 0x02d7
 332                                 || 0x02e4 <= i && i <= 0x02ef
 333                                 || 0x20DD <= i && i <= 0x20E0
 334                         )
 335                                 return true;
 336
 337                         if (0x064B <= i && i <= 0x00652
 338                                 || 0x0941 <= i && i <= 0x0948
 339                                 || 0x0AC1 <= i && i <= 0x0ACD
 340                                 || 0x0C3E <= i && i <= 0x0C4F
 341                                 || 0x0E31 <= i && i <= 0x0E3F
 342                         )
 343                                 return false;
 344
 345                         return Char.GetUnicodeCategory ((char) i) ==
 346                                 UnicodeCategory.NonSpacingMark;
 347                 }
 348
 349                 // We can reuse IsIgnorableSymbol testcode
 350                 // for IsIgnorableNonSpacing.
 351                 #endregion
 352
 353                 public static int ToKanatypeInsensitive (int i)
 354                 {
 355                         // Note that IgnoreKanaType does not treat half-width
 356                         // katakana as equivalent to full-width ones.
 357
 358                         // Thus, it is so simple ;-)
 359                         return (0x3041 <= i && i <= 0x3094) ? i + 0x60 : i;
 360                 }
 361
 362                 public static int ToWidthInsensitive (int i)
 363                 {
 364                         return Normalization.ToWidthInsensitive (i);
 365                 }
 366         }
 367 }
 368