2005-04-25 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / MSCompatUnicodeTable.cs
1 using System;
2 using System.Globalization;
3
4 namespace Mono.Globalization.Unicode
5 {
6         internal class MSCompatUnicodeTable
7         {
8                 #region IsIgnorable
9                 public static bool IsIgnorable (int i)
10                 {
11                         switch (i) {
12                         case 0:
13                         // No idea why each of those is ignored.
14                         case 0x2df: case 0x387:
15                         case 0x3d7: case 0x3d8: case 0x3d9:
16                         case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
17                         case 0x400: case 0x40d: case 0x450: case 0x45d:
18                         case 0x587: case 0x58a: case 0x5c4: case 0x640:
19                         case 0x653: case 0x654: case 0x655: case 0x66d:
20                         case 0xb56:
21                         case 0x1e9b: case 0x202f: case 0x20ad:
22                         case 0x20ae: case 0x20af:
23                         case 0x20e2: case 0x20e3:
24                         case 0x2139: case 0x213a: case 0x2183:
25                         case 0x2425: case 0x2426: case 0x2619:
26                         case 0x2670: case 0x2671: case 0x3007:
27                         case 0x3190: case 0x3191:
28                         case 0xfffc: case 0xfffd:
29                                 return true;
30                         // exceptional characters filtered by the 
31                         // following conditions (no idea why though).
32                         case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9:
33                         case 0x70f: case 0x3036: case 0x303f:
34                         case 0x337b: case 0xfb1e:
35                                 return false;
36                         }
37
38                         if (
39                                 // The whole Sinhala characters.
40                                 0x0D82 <= i && i <= 0x0DF4
41                                 // The whole Tibetan characters.
42                                 || 0x0F00 <= i && i <= 0x0FD1
43                                 // The whole Myanmar characters.
44                                 || 0x1000 <= i && i <= 0x1059
45                                 // The whole Etiopic, Cherokee, 
46                                 // Canadian Syllablic, Ogham, Runic,
47                                 // Tagalog, Hanunoo, Philippine,
48                                 // Buhid, Tagbanwa, Khmer and Mongorian
49                                 // characters.
50                                 || 0x1200 <= i && i <= 0x1DFF
51                                 // Greek extension characters.
52                                 || 0x1F00 <= i && i <= 0x1FFF
53                                 // The whole Braille characters.
54                                 || 0x2800 <= i && i <= 0x28FF
55                                 // CJK radical characters.
56                                 || 0x2E80 <= i && i <= 0x2EF3
57                                 // Kangxi radical characters.
58                                 || 0x2F00 <= i && i <= 0x2FD5
59                                 // Ideographic description characters.
60                                 || 0x2FF0 <= i && i <= 0x2FFB
61                                 // Bopomofo letter and final
62                                 || 0x31A0 <= i && i <= 0x31B7
63                                 // White square with quadrant characters.
64                                 || 0x25F0 <= i && i <= 0x25F7
65                                 // Ideographic telegraph symbols.
66                                 || 0x32C0 <= i && i <= 0x32CB
67                                 || 0x3358 <= i && i <= 0x3370
68                                 || 0x33E0 <= i && i <= 0x33FF
69                                 // The whole YI characters.
70                                 || 0xA000 <= i && i <= 0xA48C
71                                 || 0xA490 <= i && i <= 0xA4C6
72                                 // American small ligatures
73                                 || 0xFB13 <= i && i <= 0xFB17
74                                 // hebrew, arabic, variation selector.
75                                 || 0xFB1D <= i && i <= 0xFE2F
76                                 // Arabic ligatures.
77                                 || 0xFEF5 <= i && i <= 0xFEFC
78                                 // FIXME: why are they excluded?
79                                 || 0x01F6 <= i && i <= 0x01F9
80                                 || 0x0218 <= i && i <= 0x0233
81                                 || 0x02A9 <= i && i <= 0x02AD
82                                 || 0x02EA <= i && i <= 0x02EE
83                                 || 0x0349 <= i && i <= 0x036F
84                                 || 0x0488 <= i && i <= 0x048F
85                                 || 0x04D0 <= i && i <= 0x04FF
86                                 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
87                                 || 0x06D6 <= i && i <= 0x06ED
88                                 || 0x06FA <= i && i <= 0x06FE
89                                 || 0x2048 <= i && i <= 0x204D
90                                 || 0x20e4 <= i && i <= 0x20ea
91                                 || 0x213C <= i && i <= 0x214B
92                                 || 0x21EB <= i && i <= 0x21FF
93                                 || 0x22F2 <= i && i <= 0x22FF
94                                 || 0x237B <= i && i <= 0x239A
95                                 || 0x239B <= i && i <= 0x23CF
96                                 || 0x24EB <= i && i <= 0x24FF
97                                 || 0x2596 <= i && i <= 0x259F
98                                 || 0x25F8 <= i && i <= 0x25FF
99                                 || 0x2672 <= i && i <= 0x2689
100                                 || 0x2768 <= i && i <= 0x2775
101                                 || 0x27d0 <= i && i <= 0x27ff
102                                 || 0x2900 <= i && i <= 0x2aff
103                                 || 0x3033 <= i && i <= 0x303F
104                                 || 0x31F0 <= i && i <= 0x31FF
105                                 || 0x3250 <= i && i <= 0x325F
106                                 || 0x32B1 <= i && i <= 0x32BF
107                                 || 0x3371 <= i && i <= 0x337B
108                                 || 0xFA30 <= i && i <= 0xFA6A
109                         )
110                                 return true;
111
112                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
113                         switch (uc) {
114                         // ignored by nature
115                         case UnicodeCategory.PrivateUse:
116                         case UnicodeCategory.Surrogate:
117                                 return false;
118                         case UnicodeCategory.Format:
119                         case UnicodeCategory.OtherNotAssigned:
120                                 return true;
121                         default:
122                                 return false;
123                         }
124                 }
125
126                 // To check IsIgnorable sanity, try the driver below under MS.NET.
127
128                 /*
129                 public static void Main ()
130                 {
131                         for (int i = 0; i <= char.MaxValue; i++)
132                                 Dump (i, IsIgnorable (i));
133                 }
134
135                 static void Dump (int i, bool ignore)
136                 {
137                         switch (Char.GetUnicodeCategory ((char) i)) {
138                         case UnicodeCategory.PrivateUse:
139                         case UnicodeCategory.Surrogate:
140                                 return; // check nothing
141                         }
142
143                         string s1 = "";
144                         string s2 = new string ((char) i, 10);
145                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
146                         if ((ret == 0) == ignore)
147                                 return;
148                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
149                 }
150                 */
151                 #endregion // IsIgnorable
152
153                 #region IsIgnorableSymbol
154                 public static bool IsIgnorableSymbol (int i)
155                 {
156                         if (IsIgnorable (i))
157                                 return true;
158
159                         switch (i) {
160                         // *Letter
161                         case 0x00b5: case 0x01C0: case 0x01C1:
162                         case 0x01C2: case 0x01C3: case 0x01F6:
163                         case 0x01F7: case 0x01F8: case 0x01F9:
164                         case 0x02D0: case 0x02EE: case 0x037A:
165                         case 0x03D7: case 0x03F3:
166                         case 0x0400: case 0x040d:
167                         case 0x0450: case 0x045d:
168                         case 0x048C: case 0x048D:
169                         case 0x048E: case 0x048F:
170                         case 0x0587: case 0x0640: case 0x06E5:
171                         case 0x06E6: case 0x06FA: case 0x06FB:
172                         case 0x06FC: case 0x093D: case 0x0950:
173                         case 0x1E9B: case 0x2139: case 0x3006:
174                         case 0x3033: case 0x3034: case 0x3035:
175                         case 0xFE7E: case 0xFE7F:
176                         // OtherNumber
177                         case 0x16EE: case 0x16EF: case 0x16F0:
178                         // LetterNumber
179                         case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
180                         case 0x3007: // IDEOGRAPHIC NUMBER ZERO
181                         case 0x3038: // HANGZHOU NUMERAL TEN
182                         case 0x3039: // HANGZHOU NUMERAL TWENTY
183                         case 0x303a: // HANGZHOU NUMERAL THIRTY
184                         // OtherSymbol
185                         case 0x2117:
186                         case 0x327F:
187                                 return true;
188                         // ModifierSymbol
189                         case 0x02B9: case 0x02BA: case 0x02C2:
190                         case 0x02C3: case 0x02C4: case 0x02C5:
191                         case 0x02C8: case 0x02CC: case 0x02CD:
192                         case 0x02CE: case 0x02CF: case 0x02D2:
193                         case 0x02D3: case 0x02D4: case 0x02D5:
194                         case 0x02D6: case 0x02D7: case 0x02DE:
195                         case 0x02E5: case 0x02E6: case 0x02E7:
196                         case 0x02E8: case 0x02E9:
197                         case 0x309B: case 0x309C:
198                         // OtherPunctuation
199                         case 0x055A: // American Apos
200                         case 0x05C0: // Hebrew Punct
201                         case 0x0E4F: // Thai FONGMAN
202                         case 0x0E5A: // Thai ANGKHANKHU
203                         case 0x0E5B: // Thai KHOMUT
204                         // CurencySymbol
205                         case 0x09F2: // Bengali Rupee Mark
206                         case 0x09F3: // Bengali Rupee Sign
207                         // MathSymbol
208                         case 0x221e: // INF.
209                         // OtherSymbol
210                         case 0x0482:
211                         case 0x09FA:
212                         case 0x0B70:
213                                 return false;
214                         }
215
216                         // *Letter
217                         if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
218 #if NET_2_0
219                                 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
220                                 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
221 #endif
222                         )
223                                 return true;
224
225                         UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
226                         switch (uc) {
227                         case UnicodeCategory.Surrogate:
228                                 return false; // inconsistent
229
230                         case UnicodeCategory.SpacingCombiningMark:
231                         case UnicodeCategory.EnclosingMark:
232                         case UnicodeCategory.NonSpacingMark:
233                         case UnicodeCategory.PrivateUse:
234                                 // NonSpacingMark
235                                 if (0x064B <= i && i <= 0x0652) // Arabic
236                                         return true;
237                                 return false;
238
239                         case UnicodeCategory.Format:
240                         case UnicodeCategory.OtherNotAssigned:
241                                 return true;
242
243                         default:
244                                 bool use = false;
245                                 // OtherSymbols
246                                 if (
247                                         // latin in a circle
248                                         0x249A <= i && i <= 0x24E9
249                                         || 0x2100 <= i && i <= 0x2132
250                                         // Japanese
251                                         || 0x3196 <= i && i <= 0x31A0
252                                         // Korean
253                                         || 0x3200 <= i && i <= 0x321C
254                                         // Chinese/Japanese
255                                         || 0x322A <= i && i <= 0x3243
256                                         // CJK
257                                         || 0x3260 <= i && i <= 0x32B0
258                                         || 0x32D0 <= i && i <= 0x3357
259                                         || 0x337B <= i && i <= 0x33DD
260                                 )
261                                         use = !Char.IsLetterOrDigit ((char) i);
262                                 if (use)
263                                         return false;
264
265                                 // This "Digit" rule is mystery.
266                                 // It filters some symbols out.
267                                 if (Char.IsLetterOrDigit ((char) i))
268                                         return false;
269                                 if (Char.IsNumber ((char) i))
270                                         return false;
271                                 if (Char.IsControl ((char) i)
272                                         || Char.IsSeparator ((char) i)
273                                         || Char.IsPunctuation ((char) i))
274                                         return true;
275                                 if (Char.IsSymbol ((char) i))
276                                         return true;
277
278                                 // FIXME: should check more
279                                 return false;
280                         }
281                 }
282
283                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
284 /*
285                 public static void Main ()
286                 {
287                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
288                         for (int i = 0; i <= char.MaxValue; i++) {
289                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
290                                 if (uc == UnicodeCategory.Surrogate)
291                                         continue;
292
293                                 bool ret = IsIgnorableSymbol (i);
294
295                                 string s1 = "TEST ";
296                                 string s2 = "TEST " + (char) i;
297
298                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
299
300                                 if (ret != (result == 0))
301                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
302                                                 ret ? "should not ignore" :
303                                                         "should ignore",
304                                                 i,(char) i, uc);
305                         }
306                 }
307 */
308                 #endregion
309
310                 #region NonSpacing
311                 public static bool IsIgnorableNonSpacing (int i)
312                 {
313                         if (Mono.Globalization.Unicode.MSCompatUnicodeTable.IsIgnorable (i))
314                                 return true;
315
316                         switch (i) {
317                         case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
318                         case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
319                         case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
320                                 return true;
321                         case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
322                         case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
323                         case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
324                         case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
325                         case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
326                         case 0x0CCD: case 0x0E4E:
327                                 return false;
328                         }
329
330                         if (0x02b9 <= i && i <= 0x02c5
331                                 || 0x02cc <= i && i <= 0x02d7
332                                 || 0x02e4 <= i && i <= 0x02ef
333                                 || 0x20DD <= i && i <= 0x20E0
334                         )
335                                 return true;
336
337                         if (0x064B <= i && i <= 0x00652
338                                 || 0x0941 <= i && i <= 0x0948
339                                 || 0x0AC1 <= i && i <= 0x0ACD
340                                 || 0x0C3E <= i && i <= 0x0C4F
341                                 || 0x0E31 <= i && i <= 0x0E3F
342                         )
343                                 return false;
344
345                         return Char.GetUnicodeCategory ((char) i) ==
346                                 UnicodeCategory.NonSpacingMark;
347                 }
348
349                 // We can reuse IsIgnorableSymbol testcode 
350                 // for IsIgnorableNonSpacing.
351                 #endregion
352
353                 public static int ToKanatypeInsensitive (int i)
354                 {
355                         // Note that IgnoreKanaType does not treat half-width
356                         // katakana as equivalent to full-width ones.
357
358                         // Thus, it is so simple ;-)
359                         return (0x3041 <= i && i <= 0x3094) ? i + 0x60 : i;
360                 }
361
362                 public static int ToWidthInsensitive (int i)
363                 {
364                         return Normalization.ToWidthInsensitive (i);
365                 }
366         }
367 }
368