3 // namespace: System.Text.RegularExpressions
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Globalization;
\r
12 namespace System.Text.RegularExpressions {
\r
14 enum Category : ushort {
\r
17 // canonical classes
\r
19 Any, // any character except newline .
\r
20 AnySingleline, // any character . (s option)
\r
21 Word, // any word character \w
\r
22 Digit, // any digit character \d
\r
23 WhiteSpace, // any whitespace character \s
\r
25 // ECMAScript classes
\r
30 EcmaWord, // [a-zA-Z_0-9]
\r
32 EcmaWhiteSpace, // [ \f\n\r\t\v]
\r
34 // unicode categories
\r
39 UnicodeZ, // Separator
\r
40 UnicodeP, // Punctuation
\r
44 UnicodeLu, // UppercaseLetter
\r
45 UnicodeLl, // LowercaseLetter
\r
46 UnicodeLt, // TitlecaseLetter
\r
47 UnicodeLm, // ModifierLetter
\r
48 UnicodeLo, // OtherLetter
\r
49 UnicodeMn, // NonspacingMark
\r
50 UnicodeMe, // EnclosingMark
\r
51 UnicodeMc, // SpacingMark
\r
52 UnicodeNd, // DecimalNumber
\r
53 UnicodeNl, // LetterNumber
\r
54 UnicodeNo, // OtherNumber
\r
55 UnicodeZs, // SpaceSeparator
\r
56 UnicodeZl, // LineSeparator
\r
57 UnicodeZp, // ParagraphSeparator
\r
58 UnicodePd, // DashPunctuation
\r
59 UnicodePs, // OpenPunctuation
\r
60 UnicodePi, // InitialPunctuation
\r
61 UnicodePe, // ClosePunctuation
\r
62 UnicodePf, // FinalPunctuation
\r
63 UnicodePc, // ConnectorPunctuation
\r
64 UnicodePo, // OtherPunctuation
\r
65 UnicodeSm, // MathSymbol
\r
66 UnicodeSc, // CurrencySymbol
\r
67 UnicodeSk, // ModifierSymbol
\r
68 UnicodeSo, // OtherSymbol
\r
69 UnicodeCc, // Control
\r
70 UnicodeCf, // Format
\r
71 UnicodeCo, // PrivateUse
\r
72 UnicodeCs, // Surrogate
\r
73 UnicodeCn, // Unassigned
\r
75 // unicode block ranges
\r
77 // notes: the categories marked with a star are valid unicode block ranges,
\r
78 // but don't seem to be accepted by the MS parser using the \p{...} format.
\r
82 UnicodeLatin1Supplement, // *
\r
83 UnicodeLatinExtendedA, // *
\r
84 UnicodeLatinExtendedB, // *
\r
85 UnicodeIPAExtensions,
\r
86 UnicodeSpacingModifierLetters,
\r
87 UnicodeCombiningDiacriticalMarks,
\r
113 UnicodeUnifiedCanadianAboriginalSyllabics,
\r
118 UnicodeLatinExtendedAdditional,
\r
119 UnicodeGreekExtended,
\r
120 UnicodeGeneralPunctuation,
\r
121 UnicodeSuperscriptsandSubscripts,
\r
122 UnicodeCurrencySymbols,
\r
123 UnicodeCombiningMarksforSymbols,
\r
124 UnicodeLetterlikeSymbols,
\r
125 UnicodeNumberForms,
\r
127 UnicodeMathematicalOperators,
\r
128 UnicodeMiscellaneousTechnical,
\r
129 UnicodeControlPictures,
\r
130 UnicodeOpticalCharacterRecognition,
\r
131 UnicodeEnclosedAlphanumerics,
\r
133 UnicodeBlockElements,
\r
134 UnicodeGeometricShapes,
\r
135 UnicodeMiscellaneousSymbols,
\r
137 UnicodeBraillePatterns,
\r
138 UnicodeCJKRadicalsSupplement,
\r
139 UnicodeKangxiRadicals,
\r
140 UnicodeIdeographicDescriptionCharacters,
\r
141 UnicodeCJKSymbolsandPunctuation,
\r
145 UnicodeHangulCompatibilityJamo,
\r
147 UnicodeBopomofoExtended,
\r
148 UnicodeEnclosedCJKLettersandMonths,
\r
149 UnicodeCJKCompatibility,
\r
150 UnicodeCJKUnifiedIdeographsExtensionA,
\r
151 UnicodeCJKUnifiedIdeographs,
\r
152 UnicodeYiSyllables,
\r
154 UnicodeHangulSyllables,
\r
155 UnicodeHighSurrogates,
\r
156 UnicodeHighPrivateUseSurrogates,
\r
157 UnicodeLowSurrogates,
\r
159 UnicodeCJKCompatibilityIdeographs,
\r
160 UnicodeAlphabeticPresentationForms,
\r
161 UnicodeArabicPresentationFormsA, // *
\r
162 UnicodeCombiningHalfMarks,
\r
163 UnicodeCJKCompatibilityForms,
\r
164 UnicodeSmallFormVariants,
\r
165 UnicodeArabicPresentationFormsB, // *
\r
167 UnicodeHalfwidthandFullwidthForms,
\r
172 UnicodeByzantineMusicalSymbols,
\r
173 UnicodeMusicalSymbols,
\r
174 UnicodeMathematicalAlphanumericSymbols,
\r
175 UnicodeCJKUnifiedIdeographsExtensionB,
\r
176 UnicodeCJKCompatibilityIdeographsSupplement,
\r
180 class CategoryUtils {
\r
181 public static Category CategoryFromName (string name) {
\r
183 if (name.StartsWith ("Is")) // remove prefix from block range
\r
184 name = name.Substring (2);
\r
186 return (Category)Enum.Parse (typeof (Category), "Unicode" + name);
\r
188 catch (ArgumentException) {
\r
189 return Category.None;
\r
193 public static bool IsCategory (Category cat, char c) {
\r
195 case Category.None:
\r
201 case Category.AnySingleline:
\r
204 case Category.Word:
\r
206 Char.IsLetterOrDigit (c) ||
\r
207 IsCategory (UnicodeCategory.ConnectorPunctuation, c);
\r
209 case Category.Digit:
\r
210 return Char.IsDigit (c);
\r
212 case Category.WhiteSpace:
\r
213 return Char.IsWhiteSpace (c);
\r
217 case Category.EcmaAny:
\r
220 case Category.EcmaAnySingleline:
\r
223 case Category.EcmaWord:
\r
225 'a' <= c && c <= 'z' ||
\r
226 'A' <= c && c <= 'Z' ||
\r
227 '0' <= c && c <= '9' ||
\r
230 case Category.EcmaDigit:
\r
232 '0' <= c && c <= '9'; // ECMAScript \d is ASCII [0-9]; the upper bound must be the char '9', not the number 9
\r
234 case Category.EcmaWhiteSpace:
\r
243 // Unicode categories...
\r
247 case Category.UnicodeLu: return IsCategory (UnicodeCategory.UppercaseLetter, c);
\r
248 case Category.UnicodeLl: return IsCategory (UnicodeCategory.LowercaseLetter, c);
\r
249 case Category.UnicodeLt: return IsCategory (UnicodeCategory.TitlecaseLetter, c);
\r
250 case Category.UnicodeLm: return IsCategory (UnicodeCategory.ModifierLetter, c);
\r
251 case Category.UnicodeLo: return IsCategory (UnicodeCategory.OtherLetter, c);
\r
255 case Category.UnicodeMn: return IsCategory (UnicodeCategory.NonSpacingMark, c);
\r
256 case Category.UnicodeMe: return IsCategory (UnicodeCategory.EnclosingMark, c);
\r
257 case Category.UnicodeMc: return IsCategory (UnicodeCategory.SpacingCombiningMark, c);
\r
258 case Category.UnicodeNd: return IsCategory (UnicodeCategory.DecimalDigitNumber, c);
\r
262 case Category.UnicodeNl: return IsCategory (UnicodeCategory.LetterNumber, c);
\r
263 case Category.UnicodeNo: return IsCategory (UnicodeCategory.OtherNumber, c);
\r
267 case Category.UnicodeZs: return IsCategory (UnicodeCategory.SpaceSeparator, c);
\r
268 case Category.UnicodeZl: return IsCategory (UnicodeCategory.LineSeparator, c);
\r
269 case Category.UnicodeZp: return IsCategory (UnicodeCategory.ParagraphSeparator, c);
\r
273 case Category.UnicodePd: return IsCategory (UnicodeCategory.DashPunctuation, c);
\r
274 case Category.UnicodePs: return IsCategory (UnicodeCategory.OpenPunctuation, c);
\r
275 case Category.UnicodePi: return IsCategory (UnicodeCategory.InitialQuotePunctuation, c);
\r
276 case Category.UnicodePe: return IsCategory (UnicodeCategory.ClosePunctuation, c);
\r
277 case Category.UnicodePf: return IsCategory (UnicodeCategory.FinalQuotePunctuation, c);
\r
278 case Category.UnicodePc: return IsCategory (UnicodeCategory.ConnectorPunctuation, c);
\r
279 case Category.UnicodePo: return IsCategory (UnicodeCategory.OtherPunctuation, c);
\r
283 case Category.UnicodeSm: return IsCategory (UnicodeCategory.MathSymbol, c);
\r
284 case Category.UnicodeSc: return IsCategory (UnicodeCategory.CurrencySymbol, c);
\r
285 case Category.UnicodeSk: return IsCategory (UnicodeCategory.ModifierSymbol, c);
\r
286 case Category.UnicodeSo: return IsCategory (UnicodeCategory.OtherSymbol, c);
\r
290 case Category.UnicodeCc: return IsCategory (UnicodeCategory.Control, c);
\r
291 case Category.UnicodeCf: return IsCategory (UnicodeCategory.Format, c);
\r
292 case Category.UnicodeCo: return IsCategory (UnicodeCategory.PrivateUse, c);
\r
293 case Category.UnicodeCs: return IsCategory (UnicodeCategory.Surrogate, c);
\r
294 case Category.UnicodeCn: return IsCategory (UnicodeCategory.OtherNotAssigned, c);
\r
296 case Category.UnicodeL: // letter
\r
298 IsCategory (UnicodeCategory.UppercaseLetter, c) ||
\r
299 IsCategory (UnicodeCategory.LowercaseLetter, c) ||
\r
300 IsCategory (UnicodeCategory.TitlecaseLetter, c) ||
\r
301 IsCategory (UnicodeCategory.ModifierLetter, c) ||
\r
302 IsCategory (UnicodeCategory.OtherLetter, c);
\r
304 case Category.UnicodeM: // mark
\r
306 IsCategory (UnicodeCategory.NonSpacingMark, c) ||
\r
307 IsCategory (UnicodeCategory.EnclosingMark, c) ||
\r
308 IsCategory (UnicodeCategory.SpacingCombiningMark, c);
\r
310 case Category.UnicodeN: // number
\r
312 IsCategory (UnicodeCategory.DecimalDigitNumber, c) ||
\r
313 IsCategory (UnicodeCategory.LetterNumber, c) ||
\r
314 IsCategory (UnicodeCategory.OtherNumber, c);
\r
316 case Category.UnicodeZ: // separator
\r
318 IsCategory (UnicodeCategory.SpaceSeparator, c) ||
\r
319 IsCategory (UnicodeCategory.LineSeparator, c) ||
\r
320 IsCategory (UnicodeCategory.ParagraphSeparator, c);
\r
322 case Category.UnicodeP: // punctuation
\r
324 IsCategory (UnicodeCategory.DashPunctuation, c) ||
\r
325 IsCategory (UnicodeCategory.OpenPunctuation, c) ||
\r
326 IsCategory (UnicodeCategory.InitialQuotePunctuation, c) ||
\r
327 IsCategory (UnicodeCategory.ClosePunctuation, c) ||
\r
328 IsCategory (UnicodeCategory.FinalQuotePunctuation, c) ||
\r
329 IsCategory (UnicodeCategory.ConnectorPunctuation, c) ||
\r
330 IsCategory (UnicodeCategory.OtherPunctuation, c);
\r
332 case Category.UnicodeS: // symbol
\r
334 IsCategory (UnicodeCategory.MathSymbol, c) ||
\r
335 IsCategory (UnicodeCategory.CurrencySymbol, c) ||
\r
336 IsCategory (UnicodeCategory.ModifierSymbol, c) ||
\r
337 IsCategory (UnicodeCategory.OtherSymbol, c);
\r
339 case Category.UnicodeC: // other
\r
341 IsCategory (UnicodeCategory.Control, c) ||
\r
342 IsCategory (UnicodeCategory.Format, c) ||
\r
343 IsCategory (UnicodeCategory.PrivateUse, c) ||
\r
344 IsCategory (UnicodeCategory.Surrogate, c) ||
\r
345 IsCategory (UnicodeCategory.OtherNotAssigned, c);
\r
347 // Unicode block ranges...
\r
349 case Category.UnicodeBasicLatin:
\r
350 return '\u0000' <= c && c <= '\u007F';
\r
352 case Category.UnicodeLatin1Supplement:
\r
353 return '\u0080' <= c && c <= '\u00FF';
\r
355 case Category.UnicodeLatinExtendedA:
\r
356 return '\u0100' <= c && c <= '\u017F';
\r
358 case Category.UnicodeLatinExtendedB:
\r
359 return '\u0180' <= c && c <= '\u024F';
\r
361 case Category.UnicodeIPAExtensions:
\r
362 return '\u0250' <= c && c <= '\u02AF';
\r
364 case Category.UnicodeSpacingModifierLetters:
\r
365 return '\u02B0' <= c && c <= '\u02FF';
\r
367 case Category.UnicodeCombiningDiacriticalMarks:
\r
368 return '\u0300' <= c && c <= '\u036F';
\r
370 case Category.UnicodeGreek:
\r
371 return '\u0370' <= c && c <= '\u03FF';
\r
373 case Category.UnicodeCyrillic:
\r
374 return '\u0400' <= c && c <= '\u04FF';
\r
376 case Category.UnicodeArmenian:
\r
377 return '\u0530' <= c && c <= '\u058F';
\r
379 case Category.UnicodeHebrew:
\r
380 return '\u0590' <= c && c <= '\u05FF';
\r
382 case Category.UnicodeArabic:
\r
383 return '\u0600' <= c && c <= '\u06FF';
\r
385 case Category.UnicodeSyriac:
\r
386 return '\u0700' <= c && c <= '\u074F';
\r
388 case Category.UnicodeThaana:
\r
389 return '\u0780' <= c && c <= '\u07BF';
\r
391 case Category.UnicodeDevanagari:
\r
392 return '\u0900' <= c && c <= '\u097F';
\r
394 case Category.UnicodeBengali:
\r
395 return '\u0980' <= c && c <= '\u09FF';
\r
397 case Category.UnicodeGurmukhi:
\r
398 return '\u0A00' <= c && c <= '\u0A7F';
\r
400 case Category.UnicodeGujarati:
\r
401 return '\u0A80' <= c && c <= '\u0AFF';
\r
403 case Category.UnicodeOriya:
\r
404 return '\u0B00' <= c && c <= '\u0B7F';
\r
406 case Category.UnicodeTamil:
\r
407 return '\u0B80' <= c && c <= '\u0BFF';
\r
409 case Category.UnicodeTelugu:
\r
410 return '\u0C00' <= c && c <= '\u0C7F';
\r
412 case Category.UnicodeKannada:
\r
413 return '\u0C80' <= c && c <= '\u0CFF';
\r
415 case Category.UnicodeMalayalam:
\r
416 return '\u0D00' <= c && c <= '\u0D7F';
\r
418 case Category.UnicodeSinhala:
\r
419 return '\u0D80' <= c && c <= '\u0DFF';
\r
421 case Category.UnicodeThai:
\r
422 return '\u0E00' <= c && c <= '\u0E7F';
\r
424 case Category.UnicodeLao:
\r
425 return '\u0E80' <= c && c <= '\u0EFF';
\r
427 case Category.UnicodeTibetan:
\r
428 return '\u0F00' <= c && c <= '\u0FFF';
\r
430 case Category.UnicodeMyanmar:
\r
431 return '\u1000' <= c && c <= '\u109F';
\r
433 case Category.UnicodeGeorgian:
\r
434 return '\u10A0' <= c && c <= '\u10FF';
\r
436 case Category.UnicodeHangulJamo:
\r
437 return '\u1100' <= c && c <= '\u11FF';
\r
439 case Category.UnicodeEthiopic:
\r
440 return '\u1200' <= c && c <= '\u137F';
\r
442 case Category.UnicodeCherokee:
\r
443 return '\u13A0' <= c && c <= '\u13FF';
\r
445 case Category.UnicodeUnifiedCanadianAboriginalSyllabics:
\r
446 return '\u1400' <= c && c <= '\u167F';
\r
448 case Category.UnicodeOgham:
\r
449 return '\u1680' <= c && c <= '\u169F';
\r
451 case Category.UnicodeRunic:
\r
452 return '\u16A0' <= c && c <= '\u16FF';
\r
454 case Category.UnicodeKhmer:
\r
455 return '\u1780' <= c && c <= '\u17FF';
\r
457 case Category.UnicodeMongolian:
\r
458 return '\u1800' <= c && c <= '\u18AF';
\r
460 case Category.UnicodeLatinExtendedAdditional:
\r
461 return '\u1E00' <= c && c <= '\u1EFF';
\r
463 case Category.UnicodeGreekExtended:
\r
464 return '\u1F00' <= c && c <= '\u1FFF';
\r
466 case Category.UnicodeGeneralPunctuation:
\r
467 return '\u2000' <= c && c <= '\u206F';
\r
469 case Category.UnicodeSuperscriptsandSubscripts:
\r
470 return '\u2070' <= c && c <= '\u209F';
\r
472 case Category.UnicodeCurrencySymbols:
\r
473 return '\u20A0' <= c && c <= '\u20CF';
\r
475 case Category.UnicodeCombiningMarksforSymbols:
\r
476 return '\u20D0' <= c && c <= '\u20FF';
\r
478 case Category.UnicodeLetterlikeSymbols:
\r
479 return '\u2100' <= c && c <= '\u214F';
\r
481 case Category.UnicodeNumberForms:
\r
482 return '\u2150' <= c && c <= '\u218F';
\r
484 case Category.UnicodeArrows:
\r
485 return '\u2190' <= c && c <= '\u21FF';
\r
487 case Category.UnicodeMathematicalOperators:
\r
488 return '\u2200' <= c && c <= '\u22FF';
\r
490 case Category.UnicodeMiscellaneousTechnical:
\r
491 return '\u2300' <= c && c <= '\u23FF';
\r
493 case Category.UnicodeControlPictures:
\r
494 return '\u2400' <= c && c <= '\u243F';
\r
496 case Category.UnicodeOpticalCharacterRecognition:
\r
497 return '\u2440' <= c && c <= '\u245F';
\r
499 case Category.UnicodeEnclosedAlphanumerics:
\r
500 return '\u2460' <= c && c <= '\u24FF';
\r
502 case Category.UnicodeBoxDrawing:
\r
503 return '\u2500' <= c && c <= '\u257F';
\r
505 case Category.UnicodeBlockElements:
\r
506 return '\u2580' <= c && c <= '\u259F';
\r
508 case Category.UnicodeGeometricShapes:
\r
509 return '\u25A0' <= c && c <= '\u25FF';
\r
511 case Category.UnicodeMiscellaneousSymbols:
\r
512 return '\u2600' <= c && c <= '\u26FF';
\r
514 case Category.UnicodeDingbats:
\r
515 return '\u2700' <= c && c <= '\u27BF';
\r
517 case Category.UnicodeBraillePatterns:
\r
518 return '\u2800' <= c && c <= '\u28FF';
\r
520 case Category.UnicodeCJKRadicalsSupplement:
\r
521 return '\u2E80' <= c && c <= '\u2EFF';
\r
523 case Category.UnicodeKangxiRadicals:
\r
524 return '\u2F00' <= c && c <= '\u2FDF';
\r
526 case Category.UnicodeIdeographicDescriptionCharacters:
\r
527 return '\u2FF0' <= c && c <= '\u2FFF';
\r
529 case Category.UnicodeCJKSymbolsandPunctuation:
\r
530 return '\u3000' <= c && c <= '\u303F';
\r
532 case Category.UnicodeHiragana:
\r
533 return '\u3040' <= c && c <= '\u309F';
\r
535 case Category.UnicodeKatakana:
\r
536 return '\u30A0' <= c && c <= '\u30FF';
\r
538 case Category.UnicodeBopomofo:
\r
539 return '\u3100' <= c && c <= '\u312F';
\r
541 case Category.UnicodeHangulCompatibilityJamo:
\r
542 return '\u3130' <= c && c <= '\u318F';
\r
544 case Category.UnicodeKanbun:
\r
545 return '\u3190' <= c && c <= '\u319F';
\r
547 case Category.UnicodeBopomofoExtended:
\r
548 return '\u31A0' <= c && c <= '\u31BF';
\r
550 case Category.UnicodeEnclosedCJKLettersandMonths:
\r
551 return '\u3200' <= c && c <= '\u32FF';
\r
553 case Category.UnicodeCJKCompatibility:
\r
554 return '\u3300' <= c && c <= '\u33FF';
\r
556 case Category.UnicodeCJKUnifiedIdeographsExtensionA:
\r
557 return '\u3400' <= c && c <= '\u4DB5';
\r
559 case Category.UnicodeCJKUnifiedIdeographs:
\r
560 return '\u4E00' <= c && c <= '\u9FFF';
\r
562 case Category.UnicodeYiSyllables:
\r
563 return '\uA000' <= c && c <= '\uA48F';
\r
565 case Category.UnicodeYiRadicals:
\r
566 return '\uA490' <= c && c <= '\uA4CF';
\r
568 case Category.UnicodeHangulSyllables:
\r
569 return '\uAC00' <= c && c <= '\uD7A3';
\r
571 case Category.UnicodeHighSurrogates:
\r
572 return '\uD800' <= c && c <= '\uDB7F';
\r
574 case Category.UnicodeHighPrivateUseSurrogates:
\r
575 return '\uDB80' <= c && c <= '\uDBFF';
\r
577 case Category.UnicodeLowSurrogates:
\r
578 return '\uDC00' <= c && c <= '\uDFFF';
\r
580 case Category.UnicodePrivateUse:
\r
581 return '\uE000' <= c && c <= '\uF8FF';
\r
583 case Category.UnicodeCJKCompatibilityIdeographs:
\r
584 return '\uF900' <= c && c <= '\uFAFF';
\r
586 case Category.UnicodeAlphabeticPresentationForms:
\r
587 return '\uFB00' <= c && c <= '\uFB4F';
\r
589 case Category.UnicodeArabicPresentationFormsA:
\r
590 return '\uFB50' <= c && c <= '\uFDFF';
\r
592 case Category.UnicodeCombiningHalfMarks:
\r
593 return '\uFE20' <= c && c <= '\uFE2F';
\r
595 case Category.UnicodeCJKCompatibilityForms:
\r
596 return '\uFE30' <= c && c <= '\uFE4F';
\r
598 case Category.UnicodeSmallFormVariants:
\r
599 return '\uFE50' <= c && c <= '\uFE6F';
\r
601 case Category.UnicodeArabicPresentationFormsB:
\r
602 return '\uFE70' <= c && c <= '\uFEFE';
\r
604 case Category.UnicodeHalfwidthandFullwidthForms:
\r
605 return '\uFF00' <= c && c <= '\uFFEF';
\r
607 case Category.UnicodeSpecials:
\r
609 '\uFEFF' <= c && c <= '\uFEFF' ||
\r
610 '\uFFF0' <= c && c <= '\uFFFD';
\r
612 // these block ranges begin above 0x10000
\r
614 case Category.UnicodeOldItalic:
\r
615 case Category.UnicodeGothic:
\r
616 case Category.UnicodeDeseret:
\r
617 case Category.UnicodeByzantineMusicalSymbols:
\r
618 case Category.UnicodeMusicalSymbols:
\r
619 case Category.UnicodeMathematicalAlphanumericSymbols:
\r
620 case Category.UnicodeCJKUnifiedIdeographsExtensionB:
\r
621 case Category.UnicodeCJKCompatibilityIdeographsSupplement:
\r
622 case Category.UnicodeTags:
\r
630 private static bool IsCategory (UnicodeCategory uc, char c) {
\r
631 if (Char.GetUnicodeCategory (c) == uc)
\r