2 using System.Globalization;
4 namespace Mono.Globalization.Unicode
6 internal class MSCompatUnicodeTable
// Tells whether code point i counts as "ignorable" for this MS-compatible table.
// NOTE(review): the control-flow scaffolding of this method (switch/if headers,
// braces and return statements) is not visible in this excerpt, so which of the
// tests below yield true and which yield false must be confirmed against the full file.
9 public static bool IsIgnorable (int i)
13 // No idea why each of those is ignored.
14 case 0x2df: case 0x387:
15 case 0x3d7: case 0x3d8: case 0x3d9:
16 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
17 case 0x400: case 0x40d: case 0x450: case 0x45d:
18 case 0x587: case 0x58a: case 0x5c4: case 0x640:
19 case 0x653: case 0x654: case 0x655: case 0x66d:
21 case 0x1e9b: case 0x202f: case 0x20ad:
22 case 0x20ae: case 0x20af:
23 case 0x20e2: case 0x20e3:
24 case 0x2139: case 0x213a: case 0x2183:
25 case 0x2425: case 0x2426: case 0x2619:
26 case 0x2670: case 0x2671: case 0x3007:
27 case 0x3190: case 0x3191:
28 case 0xfffc: case 0xfffd:
30 // exceptional characters filtered by the
31 // following conditions (no idea why though).
// Several of these lie inside ranges tested further down: 0x4d8, 0x4d9, 0x4e8 and
// 0x4e9 are within 0x04D0..0x04FF, 0x3036 and 0x303f within 0x3033..0x303F,
// 0x337b is the upper bound of 0x3371..0x337B, and 0xfb1e is within 0xFB1D..0xFE2F.
32 case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9:
33 case 0x70f: case 0x3036: case 0x303f:
34 case 0x337b: case 0xfb1e:
39 // The whole Sinhala characters.
40 0x0D82 <= i && i <= 0x0DF4
41 // The whole Tibetan characters.
42 || 0x0F00 <= i && i <= 0x0FD1
43 // The whole Myanmar characters.
44 || 0x1000 <= i && i <= 0x1059
45 // The whole Ethiopic, Cherokee,
46 // Canadian Syllabic, Ogham, Runic,
47 // Tagalog, Hanunoo, Philippine,
48 // Buhid, Tagbanwa, Khmer and Mongolian
50 || 0x1200 <= i && i <= 0x1DFF
51 // Greek extension characters.
52 || 0x1F00 <= i && i <= 0x1FFF
53 // The whole Braille characters.
54 || 0x2800 <= i && i <= 0x28FF
55 // CJK radical characters.
56 || 0x2E80 <= i && i <= 0x2EF3
57 // Kangxi radical characters.
58 || 0x2F00 <= i && i <= 0x2FD5
59 // Ideographic description characters.
60 || 0x2FF0 <= i && i <= 0x2FFB
61 // Bopomofo letter and final
62 || 0x31A0 <= i && i <= 0x31B7
63 // White square with quadrant characters.
64 || 0x25F0 <= i && i <= 0x25F7
65 // Ideographic telegraph symbols.
66 || 0x32C0 <= i && i <= 0x32CB
67 || 0x3358 <= i && i <= 0x3370
68 || 0x33E0 <= i && i <= 0x33FF
69 // The whole YI characters.
70 || 0xA000 <= i && i <= 0xA48C
71 || 0xA490 <= i && i <= 0xA4C6
72 // Armenian small ligatures
73 || 0xFB13 <= i && i <= 0xFB17
74 // Hebrew, Arabic, variation selector.
75 || 0xFB1D <= i && i <= 0xFE2F
77 || 0xFEF5 <= i && i <= 0xFEFC
78 // FIXME: why are they excluded?
79 || 0x01F6 <= i && i <= 0x01F9
80 || 0x0218 <= i && i <= 0x0233
81 || 0x02A9 <= i && i <= 0x02AD
82 || 0x02EA <= i && i <= 0x02EE
83 || 0x0349 <= i && i <= 0x036F
84 || 0x0488 <= i && i <= 0x048F
85 || 0x04D0 <= i && i <= 0x04FF
86 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
87 || 0x06D6 <= i && i <= 0x06ED
88 || 0x06FA <= i && i <= 0x06FE
89 || 0x2048 <= i && i <= 0x204D
90 || 0x20e4 <= i && i <= 0x20ea
91 || 0x213C <= i && i <= 0x214B
92 || 0x21EB <= i && i <= 0x21FF
93 || 0x22F2 <= i && i <= 0x22FF
// The next two ranges are contiguous (0x237B..0x239A followed by 0x239B..0x23CF).
94 || 0x237B <= i && i <= 0x239A
95 || 0x239B <= i && i <= 0x23CF
96 || 0x24EB <= i && i <= 0x24FF
97 || 0x2596 <= i && i <= 0x259F
98 || 0x25F8 <= i && i <= 0x25FF
99 || 0x2672 <= i && i <= 0x2689
100 || 0x2768 <= i && i <= 0x2775
101 || 0x27d0 <= i && i <= 0x27ff
102 || 0x2900 <= i && i <= 0x2aff
103 || 0x3033 <= i && i <= 0x303F
104 || 0x31F0 <= i && i <= 0x31FF
105 || 0x3250 <= i && i <= 0x325F
106 || 0x32B1 <= i && i <= 0x32BF
107 || 0x3371 <= i && i <= 0x337B
108 || 0xFA30 <= i && i <= 0xFA6A
// The (char) cast narrows i to a 16-bit char, so a value above char.MaxValue
// is not classified as itself by the category lookup below.
112 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
115 case UnicodeCategory.PrivateUse:
116 case UnicodeCategory.Surrogate:
118 case UnicodeCategory.Format:
119 case UnicodeCategory.OtherNotAssigned:
126 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Sanity-check driver for IsIgnorable: walks every value from 0 through
// char.MaxValue and hands it to Dump together with IsIgnorable's answer.
129 public static void Main ()
131 for (int i = 0; i <= char.MaxValue; i++)
132 Dump (i, IsIgnorable (i));
// Driver helper: compares two strings with the invariant culture's CompareInfo
// (CompareOptions.IgnoreCase) and relates the outcome to the 'ignore' flag that
// IsIgnorable produced for code point i.
// NOTE(review): the declaration of s1 and the statement guarded by the 'if'
// below are not visible in this excerpt; presumably matching results are
// skipped and only disagreements are printed - confirm against the full file.
135 static void Dump (int i, bool ignore)
137 switch (Char.GetUnicodeCategory ((char) i)) {
138 case UnicodeCategory.PrivateUse:
139 case UnicodeCategory.Surrogate:
140 return; // check nothing
// s2 is the character for i repeated 10 times.
144 string s2 = new string ((char) i, 10);
145 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
146 if ((ret == 0) == ignore)
// Output format: "o" or "x" (from 'ignore'), the code point in hex, and its Unicode category.
148 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
151 #endregion // IsIgnorable
153 #region IsIgnorableSymbol
// Tells whether code point i counts as an ignorable symbol; the driver that
// follows this method checks it against CompareOptions.IgnoreSymbols.
// NOTE(review): the control-flow scaffolding of this method (switch/if headers,
// braces, most return statements and the declaration of 'use') is not visible
// in this excerpt, so the outcome of each test below must be confirmed against
// the full file.
154 public static bool IsIgnorableSymbol (int i)
161 case 0x00b5: case 0x01C0: case 0x01C1:
162 case 0x01C2: case 0x01C3: case 0x01F6:
163 case 0x01F7: case 0x01F8: case 0x01F9:
164 case 0x02D0: case 0x02EE: case 0x037A:
165 case 0x03D7: case 0x03F3:
166 case 0x0400: case 0x040d:
167 case 0x0450: case 0x045d:
168 case 0x048C: case 0x048D:
169 case 0x048E: case 0x048F:
170 case 0x0587: case 0x0640: case 0x06E5:
171 case 0x06E6: case 0x06FA: case 0x06FB:
172 case 0x06FC: case 0x093D: case 0x0950:
173 case 0x1E9B: case 0x2139: case 0x3006:
174 case 0x3033: case 0x3034: case 0x3035:
175 case 0xFE7E: case 0xFE7F:
177 case 0x16EE: case 0x16EF: case 0x16F0:
179 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
180 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
181 case 0x3038: // HANGZHOU NUMERAL TEN
182 case 0x3039: // HANGZHOU NUMERAL TWENTY
183 case 0x303a: // HANGZHOU NUMERAL THIRTY
189 case 0x02B9: case 0x02BA: case 0x02C2:
190 case 0x02C3: case 0x02C4: case 0x02C5:
191 case 0x02C8: case 0x02CC: case 0x02CD:
192 case 0x02CE: case 0x02CF: case 0x02D2:
193 case 0x02D3: case 0x02D4: case 0x02D5:
194 case 0x02D6: case 0x02D7: case 0x02DE:
195 case 0x02E5: case 0x02E6: case 0x02E7:
196 case 0x02E8: case 0x02E9:
197 case 0x309B: case 0x309C:
199 case 0x055A: // Armenian Apostrophe
200 case 0x05C0: // Hebrew Punct
201 case 0x0E4F: // Thai FONGMAN
202 case 0x0E5A: // Thai ANGKHANKHU
203 case 0x0E5B: // Thai KHOMUT
205 case 0x09F2: // Bengali Rupee Mark
206 case 0x09F3: // Bengali Rupee Sign
// Note the mixed bounds in the ranges below: the 0xFE70 and 0xFA30 ranges use
// an exclusive upper bound (<), while the 0x0501 range is inclusive (<=).
217 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
219 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
220 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
// The (char) cast narrows i to a 16-bit char before the category lookup.
225 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
227 case UnicodeCategory.Surrogate:
228 return false; // inconsistent
230 case UnicodeCategory.SpacingCombiningMark:
231 case UnicodeCategory.EnclosingMark:
232 case UnicodeCategory.NonSpacingMark:
233 case UnicodeCategory.PrivateUse:
235 if (0x064B <= i && i <= 0x0652) // Arabic
239 case UnicodeCategory.Format:
240 case UnicodeCategory.OtherNotAssigned:
248 0x249A <= i && i <= 0x24E9
249 || 0x2100 <= i && i <= 0x2132
251 || 0x3196 <= i && i <= 0x31A0
253 || 0x3200 <= i && i <= 0x321C
255 || 0x322A <= i && i <= 0x3243
257 || 0x3260 <= i && i <= 0x32B0
258 || 0x32D0 <= i && i <= 0x3357
259 || 0x337B <= i && i <= 0x33DD
261 use = !Char.IsLetterOrDigit ((char) i);
265 // This "Digit" rule is a mystery.
266 // It filters some symbols out.
267 if (Char.IsLetterOrDigit ((char) i))
269 if (Char.IsNumber ((char) i))
271 if (Char.IsControl ((char) i)
272 || Char.IsSeparator ((char) i)
273 || Char.IsPunctuation ((char) i))
275 if (Char.IsSymbol ((char) i))
278 // FIXME: should check more
283 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Sanity-check driver for IsIgnorableSymbol: for every value from 0 through
// char.MaxValue (surrogates are tested for first) it compares IsIgnorableSymbol's
// answer with whether an invariant-culture comparison under
// CompareOptions.IgnoreSymbols returns 0, and prints a line when they disagree.
// NOTE(review): the declaration of s1, the statement under the surrogate check
// and the tail of the WriteLine call are not visible in this excerpt.
285 public static void Main ()
287 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
288 for (int i = 0; i <= char.MaxValue; i++) {
289 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
290 if (uc == UnicodeCategory.Surrogate)
293 bool ret = IsIgnorableSymbol (i);
// s2 is the literal prefix "TEST " followed by the character under test.
296 string s2 = "TEST " + (char) i;
298 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
// Report only when the prediction and the comparison result disagree.
300 if (ret != (result == 0))
301 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
302 ret ? "should not ignore" :
// Tells whether code point i counts as an ignorable non-spacing character.
// It consults IsIgnorable first, then a set of explicit code points and ranges;
// the last visible statement returns whether i's Unicode category is NonSpacingMark.
// NOTE(review): the switch/if scaffolding and the intermediate return statements
// are not visible in this excerpt, so the outcome of each individual test must be
// confirmed against the full file.
311 public static bool IsIgnorableNonSpacing (int i)
313 if (Mono.Globalization.Unicode.MSCompatUnicodeTable.IsIgnorable (i))
317 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
318 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
319 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
321 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
322 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
323 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
324 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
325 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
326 case 0x0CCD: case 0x0E4E:
330 if (0x02b9 <= i && i <= 0x02c5
331 || 0x02cc <= i && i <= 0x02d7
332 || 0x02e4 <= i && i <= 0x02ef
333 || 0x20DD <= i && i <= 0x20E0
// The literal 0x00652 below has an extra leading zero; its value is 0x0652,
// the same upper bound IsIgnorableSymbol uses for its Arabic range.
337 if (0x064B <= i && i <= 0x00652
338 || 0x0941 <= i && i <= 0x0948
339 || 0x0AC1 <= i && i <= 0x0ACD
340 || 0x0C3E <= i && i <= 0x0C4F
341 || 0x0E31 <= i && i <= 0x0E3F
// The (char) cast narrows i to a 16-bit char before the category lookup.
345 return Char.GetUnicodeCategory ((char) i) ==
346 UnicodeCategory.NonSpacingMark;
349 // We can reuse IsIgnorableSymbol testcode
350 // for IsIgnorableNonSpacing.
// Folds kana type for comparison: a code point in 0x3041..0x3094 (inclusive) is
// returned shifted up by 0x60 (i.e. into 0x30A1..0x30F4, the corresponding
// Katakana letters); every other value is returned unchanged.
353 public static int ToKanatypeInsensitive (int i)
355 // Note that IgnoreKanaType does not treat half-width
356 // katakana as equivalent to full-width ones.
358 // Thus, it is so simple ;-)
359 return (0x3041 <= i && i <= 0x3094) ? i + 0x60 : i;
// Thin wrapper: forwards i to Normalization.ToWidthInsensitive and returns
// that method's result unchanged.
362 public static int ToWidthInsensitive (int i)
364 return Normalization.ToWidthInsensitive (i);