3 // namespace: System.Text.RegularExpressions
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Globalization;
\r
12 namespace System.Text.RegularExpressions {
\r
14 enum Category : ushort {
\r
17 // canonical classes
\r
19 Any, // any character except newline .
\r
20 AnySingleline, // any character . (s option)
\r
21 Word, // any word character \w
\r
22 Digit, // any digit character \d
\r
23 WhiteSpace, // any whitespace character \s
\r
25 // ECMAScript classes
\r
30 EcmaWord, // [a-zA-Z_0-9]
\r
32 EcmaWhiteSpace, // [ \f\n\r\t\v]
\r
34 // unicode categories
\r
39 UnicodeZ, // Separator
\r
40 UnicodeP, // Punctuation
\r
44 UnicodeLu, // UppercaseLetter
\r
45 UnicodeLl, // LowercaseLetter
\r
46 UnicodeLt, // TitlecaseLetter
\r
47 UnicodeLm, // ModifierLetter
\r
48 UnicodeLo, // OtherLetter
\r
49 UnicodeMn, // NonspacingMark
\r
50 UnicodeMe, // EnclosingMark
\r
51 UnicodeMc, // SpacingMark
\r
52 UnicodeNd, // DecimalNumber
\r
53 UnicodeNl, // LetterNumber
\r
54 UnicodeNo, // OtherNumber
\r
55 UnicodeZs, // SpaceSeparator
\r
56 UnicodeZl, // LineSeparator
\r
57 UnicodeZp, // ParagraphSeparator
\r
58 UnicodePd, // DashPunctuation
\r
59 UnicodePs, // OpenPunctuation
\r
60 UnicodePi, // InitialPunctuation
\r
61 UnicodePe, // ClosePunctuation
\r
62 UnicodePf, // FinalPunctuation
\r
63 UnicodePc, // ConnectorPunctuation
\r
64 UnicodePo, // OtherPunctuation
\r
65 UnicodeSm, // MathSymbol
\r
66 UnicodeSc, // CurrencySymbol
\r
67 UnicodeSk, // ModifierSymbol
\r
68 UnicodeSo, // OtherSymbol
\r
69 UnicodeCc, // Control
\r
70 UnicodeCf, // Format
\r
71 UnicodeCo, // PrivateUse
\r
72 UnicodeCs, // Surrogate
\r
73 UnicodeCn, // Unassigned
\r
75 // unicode block ranges
\r
77 // notes: the categories marked with a star are valid unicode block ranges,
\r
78 // but don't seem to be accepted by the MS parser using the \p{...} format.
\r
82 UnicodeLatin1Supplement, // *
\r
83 UnicodeLatinExtendedA, // *
\r
84 UnicodeLatinExtendedB, // *
\r
85 UnicodeIPAExtensions,
\r
86 UnicodeSpacingModifierLetters,
\r
87 UnicodeCombiningDiacriticalMarks,
\r
113 UnicodeUnifiedCanadianAboriginalSyllabics,
\r
118 UnicodeLatinExtendedAdditional,
\r
119 UnicodeGreekExtended,
\r
120 UnicodeGeneralPunctuation,
\r
121 UnicodeSuperscriptsandSubscripts,
\r
122 UnicodeCurrencySymbols,
\r
123 UnicodeCombiningMarksforSymbols,
\r
124 UnicodeLetterlikeSymbols,
\r
125 UnicodeNumberForms,
\r
127 UnicodeMathematicalOperators,
\r
128 UnicodeMiscellaneousTechnical,
\r
129 UnicodeControlPictures,
\r
130 UnicodeOpticalCharacterRecognition,
\r
131 UnicodeEnclosedAlphanumerics,
\r
133 UnicodeBlockElements,
\r
134 UnicodeGeometricShapes,
\r
135 UnicodeMiscellaneousSymbols,
\r
137 UnicodeBraillePatterns,
\r
138 UnicodeCJKRadicalsSupplement,
\r
139 UnicodeKangxiRadicals,
\r
140 UnicodeIdeographicDescriptionCharacters,
\r
141 UnicodeCJKSymbolsandPunctuation,
\r
145 UnicodeHangulCompatibilityJamo,
\r
147 UnicodeBopomofoExtended,
\r
148 UnicodeEnclosedCJKLettersandMonths,
\r
149 UnicodeCJKCompatibility,
\r
150 UnicodeCJKUnifiedIdeographsExtensionA,
\r
151 UnicodeCJKUnifiedIdeographs,
\r
152 UnicodeYiSyllables,
\r
154 UnicodeHangulSyllables,
\r
155 UnicodeHighSurrogates,
\r
156 UnicodeHighPrivateUseSurrogates,
\r
157 UnicodeLowSurrogates,
\r
159 UnicodeCJKCompatibilityIdeographs,
\r
160 UnicodeAlphabeticPresentationForms,
\r
161 UnicodeArabicPresentationFormsA, // *
\r
162 UnicodeCombiningHalfMarks,
\r
163 UnicodeCJKCompatibilityForms,
\r
164 UnicodeSmallFormVariants,
\r
165 UnicodeArabicPresentationFormsB, // *
\r
167 UnicodeHalfwidthandFullwidthForms,
\r
172 UnicodeByzantineMusicalSymbols,
\r
173 UnicodeMusicalSymbols,
\r
174 UnicodeMathematicalAlphanumericSymbols,
\r
175 UnicodeCJKUnifiedIdeographsExtensionB,
\r
176 UnicodeCJKCompatibilityIdeographsSupplement,
\r
179 LastValue // Keep this with the higher value in the enumeration
\r
182 class CategoryUtils {
\r
183 public static Category CategoryFromName (string name) {
\r
185 if (name.StartsWith ("Is")) // remove prefix from block range
\r
186 name = name.Substring (2);
\r
188 return (Category)Enum.Parse (typeof (Category), "Unicode" + name);
\r
190 catch (ArgumentException) {
\r
191 return Category.None;
\r
195 public static bool IsCategory (Category cat, char c) {
\r
197 case Category.None:
\r
203 case Category.AnySingleline:
\r
206 case Category.Word:
\r
208 Char.IsLetterOrDigit (c) ||
\r
209 IsCategory (UnicodeCategory.ConnectorPunctuation, c);
\r
211 case Category.Digit:
\r
212 return Char.IsDigit (c);
\r
214 case Category.WhiteSpace:
\r
215 return Char.IsWhiteSpace (c);
\r
219 case Category.EcmaAny:
\r
222 case Category.EcmaAnySingleline:
\r
225 case Category.EcmaWord:
\r
227 'a' <= c && c <= 'z' ||
\r
228 'A' <= c && c <= 'Z' ||
\r
229 '0' <= c && c <= '9' ||
\r
232 case Category.EcmaDigit:
\r
234 '0' <= c && c <= '9'; // ECMAScript \d is ASCII [0-9]; the bound must be the char '9' -- the integer 9 (U+0009) made this range empty
\r
236 case Category.EcmaWhiteSpace:
\r
245 // Unicode categories...
\r
249 case Category.UnicodeLu: return IsCategory (UnicodeCategory.UppercaseLetter, c);
\r
250 case Category.UnicodeLl: return IsCategory (UnicodeCategory.LowercaseLetter, c);
\r
251 case Category.UnicodeLt: return IsCategory (UnicodeCategory.TitlecaseLetter, c);
\r
252 case Category.UnicodeLm: return IsCategory (UnicodeCategory.ModifierLetter, c);
\r
253 case Category.UnicodeLo: return IsCategory (UnicodeCategory.OtherLetter, c);
\r
257 case Category.UnicodeMn: return IsCategory (UnicodeCategory.NonSpacingMark, c);
\r
258 case Category.UnicodeMe: return IsCategory (UnicodeCategory.EnclosingMark, c);
\r
259 case Category.UnicodeMc: return IsCategory (UnicodeCategory.SpacingCombiningMark, c);
\r
260 case Category.UnicodeNd: return IsCategory (UnicodeCategory.DecimalDigitNumber, c);
\r
264 case Category.UnicodeNl: return IsCategory (UnicodeCategory.LetterNumber, c);
\r
265 case Category.UnicodeNo: return IsCategory (UnicodeCategory.OtherNumber, c);
\r
269 case Category.UnicodeZs: return IsCategory (UnicodeCategory.SpaceSeparator, c);
\r
270 case Category.UnicodeZl: return IsCategory (UnicodeCategory.LineSeparator, c);
\r
271 case Category.UnicodeZp: return IsCategory (UnicodeCategory.ParagraphSeparator, c);
\r
275 case Category.UnicodePd: return IsCategory (UnicodeCategory.DashPunctuation, c);
\r
276 case Category.UnicodePs: return IsCategory (UnicodeCategory.OpenPunctuation, c);
\r
277 case Category.UnicodePi: return IsCategory (UnicodeCategory.InitialQuotePunctuation, c);
\r
278 case Category.UnicodePe: return IsCategory (UnicodeCategory.ClosePunctuation, c);
\r
279 case Category.UnicodePf: return IsCategory (UnicodeCategory.FinalQuotePunctuation, c);
\r
280 case Category.UnicodePc: return IsCategory (UnicodeCategory.ConnectorPunctuation, c);
\r
281 case Category.UnicodePo: return IsCategory (UnicodeCategory.OtherPunctuation, c);
\r
285 case Category.UnicodeSm: return IsCategory (UnicodeCategory.MathSymbol, c);
\r
286 case Category.UnicodeSc: return IsCategory (UnicodeCategory.CurrencySymbol, c);
\r
287 case Category.UnicodeSk: return IsCategory (UnicodeCategory.ModifierSymbol, c);
\r
288 case Category.UnicodeSo: return IsCategory (UnicodeCategory.OtherSymbol, c);
\r
292 case Category.UnicodeCc: return IsCategory (UnicodeCategory.Control, c);
\r
293 case Category.UnicodeCf: return IsCategory (UnicodeCategory.Format, c);
\r
294 case Category.UnicodeCo: return IsCategory (UnicodeCategory.PrivateUse, c);
\r
295 case Category.UnicodeCs: return IsCategory (UnicodeCategory.Surrogate, c);
\r
296 case Category.UnicodeCn: return IsCategory (UnicodeCategory.OtherNotAssigned, c);
\r
298 case Category.UnicodeL: // letter
\r
300 IsCategory (UnicodeCategory.UppercaseLetter, c) ||
\r
301 IsCategory (UnicodeCategory.LowercaseLetter, c) ||
\r
302 IsCategory (UnicodeCategory.TitlecaseLetter, c) ||
\r
303 IsCategory (UnicodeCategory.ModifierLetter, c) ||
\r
304 IsCategory (UnicodeCategory.OtherLetter, c);
\r
306 case Category.UnicodeM: // mark
\r
308 IsCategory (UnicodeCategory.NonSpacingMark, c) ||
\r
309 IsCategory (UnicodeCategory.EnclosingMark, c) ||
\r
310 IsCategory (UnicodeCategory.SpacingCombiningMark, c);
\r
312 case Category.UnicodeN: // number
\r
314 IsCategory (UnicodeCategory.DecimalDigitNumber, c) ||
\r
315 IsCategory (UnicodeCategory.LetterNumber, c) ||
\r
316 IsCategory (UnicodeCategory.OtherNumber, c);
\r
318 case Category.UnicodeZ: // separator
\r
320 IsCategory (UnicodeCategory.SpaceSeparator, c) ||
\r
321 IsCategory (UnicodeCategory.LineSeparator, c) ||
\r
322 IsCategory (UnicodeCategory.ParagraphSeparator, c);
\r
324 case Category.UnicodeP: // punctuation
\r
326 IsCategory (UnicodeCategory.DashPunctuation, c) ||
\r
327 IsCategory (UnicodeCategory.OpenPunctuation, c) ||
\r
328 IsCategory (UnicodeCategory.InitialQuotePunctuation, c) ||
\r
329 IsCategory (UnicodeCategory.ClosePunctuation, c) ||
\r
330 IsCategory (UnicodeCategory.FinalQuotePunctuation, c) ||
\r
331 IsCategory (UnicodeCategory.ConnectorPunctuation, c) ||
\r
332 IsCategory (UnicodeCategory.OtherPunctuation, c);
\r
334 case Category.UnicodeS: // symbol
\r
336 IsCategory (UnicodeCategory.MathSymbol, c) ||
\r
337 IsCategory (UnicodeCategory.CurrencySymbol, c) ||
\r
338 IsCategory (UnicodeCategory.ModifierSymbol, c) ||
\r
339 IsCategory (UnicodeCategory.OtherSymbol, c);
\r
341 case Category.UnicodeC: // other
\r
343 IsCategory (UnicodeCategory.Control, c) ||
\r
344 IsCategory (UnicodeCategory.Format, c) ||
\r
345 IsCategory (UnicodeCategory.PrivateUse, c) ||
\r
346 IsCategory (UnicodeCategory.Surrogate, c) ||
\r
347 IsCategory (UnicodeCategory.OtherNotAssigned, c);
\r
349 // Unicode block ranges...
\r
351 case Category.UnicodeBasicLatin:
\r
352 return '\u0000' <= c && c <= '\u007F';
\r
354 case Category.UnicodeLatin1Supplement:
\r
355 return '\u0080' <= c && c <= '\u00FF';
\r
357 case Category.UnicodeLatinExtendedA:
\r
358 return '\u0100' <= c && c <= '\u017F';
\r
360 case Category.UnicodeLatinExtendedB:
\r
361 return '\u0180' <= c && c <= '\u024F';
\r
363 case Category.UnicodeIPAExtensions:
\r
364 return '\u0250' <= c && c <= '\u02AF';
\r
366 case Category.UnicodeSpacingModifierLetters:
\r
367 return '\u02B0' <= c && c <= '\u02FF';
\r
369 case Category.UnicodeCombiningDiacriticalMarks:
\r
370 return '\u0300' <= c && c <= '\u036F';
\r
372 case Category.UnicodeGreek:
\r
373 return '\u0370' <= c && c <= '\u03FF';
\r
375 case Category.UnicodeCyrillic:
\r
376 return '\u0400' <= c && c <= '\u04FF';
\r
378 case Category.UnicodeArmenian:
\r
379 return '\u0530' <= c && c <= '\u058F';
\r
381 case Category.UnicodeHebrew:
\r
382 return '\u0590' <= c && c <= '\u05FF';
\r
384 case Category.UnicodeArabic:
\r
385 return '\u0600' <= c && c <= '\u06FF';
\r
387 case Category.UnicodeSyriac:
\r
388 return '\u0700' <= c && c <= '\u074F';
\r
390 case Category.UnicodeThaana:
\r
391 return '\u0780' <= c && c <= '\u07BF';
\r
393 case Category.UnicodeDevanagari:
\r
394 return '\u0900' <= c && c <= '\u097F';
\r
396 case Category.UnicodeBengali:
\r
397 return '\u0980' <= c && c <= '\u09FF';
\r
399 case Category.UnicodeGurmukhi:
\r
400 return '\u0A00' <= c && c <= '\u0A7F';
\r
402 case Category.UnicodeGujarati:
\r
403 return '\u0A80' <= c && c <= '\u0AFF';
\r
405 case Category.UnicodeOriya:
\r
406 return '\u0B00' <= c && c <= '\u0B7F';
\r
408 case Category.UnicodeTamil:
\r
409 return '\u0B80' <= c && c <= '\u0BFF';
\r
411 case Category.UnicodeTelugu:
\r
412 return '\u0C00' <= c && c <= '\u0C7F';
\r
414 case Category.UnicodeKannada:
\r
415 return '\u0C80' <= c && c <= '\u0CFF';
\r
417 case Category.UnicodeMalayalam:
\r
418 return '\u0D00' <= c && c <= '\u0D7F';
\r
420 case Category.UnicodeSinhala:
\r
421 return '\u0D80' <= c && c <= '\u0DFF';
\r
423 case Category.UnicodeThai:
\r
424 return '\u0E00' <= c && c <= '\u0E7F';
\r
426 case Category.UnicodeLao:
\r
427 return '\u0E80' <= c && c <= '\u0EFF';
\r
429 case Category.UnicodeTibetan:
\r
430 return '\u0F00' <= c && c <= '\u0FFF';
\r
432 case Category.UnicodeMyanmar:
\r
433 return '\u1000' <= c && c <= '\u109F';
\r
435 case Category.UnicodeGeorgian:
\r
436 return '\u10A0' <= c && c <= '\u10FF';
\r
438 case Category.UnicodeHangulJamo:
\r
439 return '\u1100' <= c && c <= '\u11FF';
\r
441 case Category.UnicodeEthiopic:
\r
442 return '\u1200' <= c && c <= '\u137F';
\r
444 case Category.UnicodeCherokee:
\r
445 return '\u13A0' <= c && c <= '\u13FF';
\r
447 case Category.UnicodeUnifiedCanadianAboriginalSyllabics:
\r
448 return '\u1400' <= c && c <= '\u167F';
\r
450 case Category.UnicodeOgham:
\r
451 return '\u1680' <= c && c <= '\u169F';
\r
453 case Category.UnicodeRunic:
\r
454 return '\u16A0' <= c && c <= '\u16FF';
\r
456 case Category.UnicodeKhmer:
\r
457 return '\u1780' <= c && c <= '\u17FF';
\r
459 case Category.UnicodeMongolian:
\r
460 return '\u1800' <= c && c <= '\u18AF';
\r
462 case Category.UnicodeLatinExtendedAdditional:
\r
463 return '\u1E00' <= c && c <= '\u1EFF';
\r
465 case Category.UnicodeGreekExtended:
\r
466 return '\u1F00' <= c && c <= '\u1FFF';
\r
468 case Category.UnicodeGeneralPunctuation:
\r
469 return '\u2000' <= c && c <= '\u206F';
\r
471 case Category.UnicodeSuperscriptsandSubscripts:
\r
472 return '\u2070' <= c && c <= '\u209F';
\r
474 case Category.UnicodeCurrencySymbols:
\r
475 return '\u20A0' <= c && c <= '\u20CF';
\r
477 case Category.UnicodeCombiningMarksforSymbols:
\r
478 return '\u20D0' <= c && c <= '\u20FF';
\r
480 case Category.UnicodeLetterlikeSymbols:
\r
481 return '\u2100' <= c && c <= '\u214F';
\r
483 case Category.UnicodeNumberForms:
\r
484 return '\u2150' <= c && c <= '\u218F';
\r
486 case Category.UnicodeArrows:
\r
487 return '\u2190' <= c && c <= '\u21FF';
\r
489 case Category.UnicodeMathematicalOperators:
\r
490 return '\u2200' <= c && c <= '\u22FF';
\r
492 case Category.UnicodeMiscellaneousTechnical:
\r
493 return '\u2300' <= c && c <= '\u23FF';
\r
495 case Category.UnicodeControlPictures:
\r
496 return '\u2400' <= c && c <= '\u243F';
\r
498 case Category.UnicodeOpticalCharacterRecognition:
\r
499 return '\u2440' <= c && c <= '\u245F';
\r
501 case Category.UnicodeEnclosedAlphanumerics:
\r
502 return '\u2460' <= c && c <= '\u24FF';
\r
504 case Category.UnicodeBoxDrawing:
\r
505 return '\u2500' <= c && c <= '\u257F';
\r
507 case Category.UnicodeBlockElements:
\r
508 return '\u2580' <= c && c <= '\u259F';
\r
510 case Category.UnicodeGeometricShapes:
\r
511 return '\u25A0' <= c && c <= '\u25FF';
\r
513 case Category.UnicodeMiscellaneousSymbols:
\r
514 return '\u2600' <= c && c <= '\u26FF';
\r
516 case Category.UnicodeDingbats:
\r
517 return '\u2700' <= c && c <= '\u27BF';
\r
519 case Category.UnicodeBraillePatterns:
\r
520 return '\u2800' <= c && c <= '\u28FF';
\r
522 case Category.UnicodeCJKRadicalsSupplement:
\r
523 return '\u2E80' <= c && c <= '\u2EFF';
\r
525 case Category.UnicodeKangxiRadicals:
\r
526 return '\u2F00' <= c && c <= '\u2FDF';
\r
528 case Category.UnicodeIdeographicDescriptionCharacters:
\r
529 return '\u2FF0' <= c && c <= '\u2FFF';
\r
531 case Category.UnicodeCJKSymbolsandPunctuation:
\r
532 return '\u3000' <= c && c <= '\u303F';
\r
534 case Category.UnicodeHiragana:
\r
535 return '\u3040' <= c && c <= '\u309F';
\r
537 case Category.UnicodeKatakana:
\r
538 return '\u30A0' <= c && c <= '\u30FF';
\r
540 case Category.UnicodeBopomofo:
\r
541 return '\u3100' <= c && c <= '\u312F';
\r
543 case Category.UnicodeHangulCompatibilityJamo:
\r
544 return '\u3130' <= c && c <= '\u318F';
\r
546 case Category.UnicodeKanbun:
\r
547 return '\u3190' <= c && c <= '\u319F';
\r
549 case Category.UnicodeBopomofoExtended:
\r
550 return '\u31A0' <= c && c <= '\u31BF';
\r
552 case Category.UnicodeEnclosedCJKLettersandMonths:
\r
553 return '\u3200' <= c && c <= '\u32FF';
\r
555 case Category.UnicodeCJKCompatibility:
\r
556 return '\u3300' <= c && c <= '\u33FF';
\r
558 case Category.UnicodeCJKUnifiedIdeographsExtensionA:
\r
559 return '\u3400' <= c && c <= '\u4DB5';
\r
561 case Category.UnicodeCJKUnifiedIdeographs:
\r
562 return '\u4E00' <= c && c <= '\u9FFF';
\r
564 case Category.UnicodeYiSyllables:
\r
565 return '\uA000' <= c && c <= '\uA48F';
\r
567 case Category.UnicodeYiRadicals:
\r
568 return '\uA490' <= c && c <= '\uA4CF';
\r
570 case Category.UnicodeHangulSyllables:
\r
571 return '\uAC00' <= c && c <= '\uD7A3';
\r
573 case Category.UnicodeHighSurrogates:
\r
574 return '\uD800' <= c && c <= '\uDB7F';
\r
576 case Category.UnicodeHighPrivateUseSurrogates:
\r
577 return '\uDB80' <= c && c <= '\uDBFF';
\r
579 case Category.UnicodeLowSurrogates:
\r
580 return '\uDC00' <= c && c <= '\uDFFF';
\r
582 case Category.UnicodePrivateUse:
\r
583 return '\uE000' <= c && c <= '\uF8FF';
\r
585 case Category.UnicodeCJKCompatibilityIdeographs:
\r
586 return '\uF900' <= c && c <= '\uFAFF';
\r
588 case Category.UnicodeAlphabeticPresentationForms:
\r
589 return '\uFB00' <= c && c <= '\uFB4F';
\r
591 case Category.UnicodeArabicPresentationFormsA:
\r
592 return '\uFB50' <= c && c <= '\uFDFF';
\r
594 case Category.UnicodeCombiningHalfMarks:
\r
595 return '\uFE20' <= c && c <= '\uFE2F';
\r
597 case Category.UnicodeCJKCompatibilityForms:
\r
598 return '\uFE30' <= c && c <= '\uFE4F';
\r
600 case Category.UnicodeSmallFormVariants:
\r
601 return '\uFE50' <= c && c <= '\uFE6F';
\r
603 case Category.UnicodeArabicPresentationFormsB:
\r
604 return '\uFE70' <= c && c <= '\uFEFE';
\r
606 case Category.UnicodeHalfwidthandFullwidthForms:
\r
607 return '\uFF00' <= c && c <= '\uFFEF';
\r
609 case Category.UnicodeSpecials:
\r
611 '\uFEFF' <= c && c <= '\uFEFF' ||
\r
612 '\uFFF0' <= c && c <= '\uFFFD';
\r
614 // these block ranges begin above 0x10000
\r
616 case Category.UnicodeOldItalic:
\r
617 case Category.UnicodeGothic:
\r
618 case Category.UnicodeDeseret:
\r
619 case Category.UnicodeByzantineMusicalSymbols:
\r
620 case Category.UnicodeMusicalSymbols:
\r
621 case Category.UnicodeMathematicalAlphanumericSymbols:
\r
622 case Category.UnicodeCJKUnifiedIdeographsExtensionB:
\r
623 case Category.UnicodeCJKCompatibilityIdeographsSupplement:
\r
624 case Category.UnicodeTags:
\r
632 private static bool IsCategory (UnicodeCategory uc, char c) {
\r
633 if (Char.GetUnicodeCategory (c) == uc)
\r