3 // namespace: System.Text.RegularExpressions
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Collections;
\r
12 namespace System.Text.RegularExpressions.Syntax {
\r
13 // collection classes
\r
15 class ExpressionCollection : CollectionBase {
\r
16 public void Add (Expression e) {
\r
20 public Expression this[int i] {
\r
21 get { return (Expression)List[i]; }
\r
22 set { List[i] = value; }
\r
25 protected override void OnValidate (object o) {
\r
26 // allow null elements
\r
32 abstract class Expression {
\r
33 public abstract void Compile (ICompiler cmp, bool reverse);
\r
34 public abstract void GetWidth (out int min, out int max);
\r
36 public int GetFixedWidth () {
\r
38 GetWidth (out min, out max);
\r
46 public virtual AnchorInfo GetAnchorInfo (bool reverse) {
\r
47 return new AnchorInfo (this, GetFixedWidth ());
\r
50 public virtual bool IsComplex () {
\r
55 // composite expressions
\r
57 abstract class CompositeExpression : Expression {
\r
58 public CompositeExpression () {
\r
59 expressions = new ExpressionCollection ();
\r
62 protected ExpressionCollection Expressions {
\r
63 get { return expressions; }
\r
66 protected void GetWidth (out int min, out int max, int count) {
\r
67 min = Int32.MaxValue;
\r
71 for (int i = 0; i < count; ++ i) {
\r
72 Expression e = Expressions[i];
\r
78 e.GetWidth (out a, out b);
\r
79 if (a < min) min = a;
\r
80 if (b > max) max = b;
\r
87 private ExpressionCollection expressions;
\r
92 class Group : CompositeExpression {
\r
96 public Expression Expression {
\r
97 get { return Expressions[0]; }
\r
98 set { Expressions[0] = value; }
\r
101 public void AppendExpression (Expression e) {
\r
102 Expressions.Add (e);
\r
105 public override void Compile (ICompiler cmp, bool reverse) {
\r
106 int count = Expressions.Count;
\r
107 for (int i = 0; i < count; ++ i) {
\r
110 e = Expressions[count - i - 1];
\r
112 e = Expressions[i];
\r
114 e.Compile (cmp, reverse);
\r
118 public override void GetWidth (out int min, out int max) {
\r
122 foreach (Expression e in Expressions) {
\r
124 e.GetWidth (out a, out b);
\r
126 if (max == Int32.MaxValue || b == Int32.MaxValue)
\r
127 max = Int32.MaxValue;
\r
133 public override AnchorInfo GetAnchorInfo (bool reverse) {
\r
135 int width = GetFixedWidth ();
\r
137 ArrayList infos = new ArrayList ();
\r
138 IntervalCollection segments = new IntervalCollection ();
\r
140 // accumulate segments
\r
143 //foreach (Expression e in Expressions) {
\r
144 int count = Expressions.Count;
\r
145 for (int i = 0; i < count; ++ i) {
\r
148 e = Expressions [count - i - 1];
\r
150 e = Expressions [i];
\r
152 AnchorInfo info = e.GetAnchorInfo (reverse);
\r
155 if (info.IsPosition)
\r
156 return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
\r
158 if (info.IsSubstring)
\r
159 segments.Add (info.GetInterval (ptr));
\r
161 if (info.IsUnknownWidth)
\r
167 // normalize and find the longest segment
\r
169 segments.Normalize ();
\r
171 Interval longest = Interval.Empty;
\r
172 foreach (Interval segment in segments) {
\r
173 if (segment.Size > longest.Size)
\r
177 // now chain the substrings that made this segment together
\r
179 if (!longest.IsEmpty) {
\r
181 ArrayList strs = new ArrayList();
\r
182 bool ignore = false;
\r
186 //foreach (AnchorInfo info in infos) {
\r
187 for (int i = 0; i < infos.Count; ++ i) {
\r
190 info = (AnchorInfo)infos[i];
\r
192 if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
\r
193 //str += info.Substring; // TODO mark subexpressions
\r
194 strs.Add(info.Substring);
\r
195 ignore |= info.IgnoreCase;
\r
199 if (info.IsUnknownWidth)
\r
205 for (int i = 0; i< strs.Count; ++i)
\r
208 str += strs [strs.Count - i - 1];
\r
216 return new AnchorInfo (this, longest.low, width, str, ignore);
\r
219 return new AnchorInfo (this, width);
\r
222 public override bool IsComplex () {
\r
224 foreach (Expression e in Expressions) {
\r
225 comp |= e.IsComplex ();
\r
228 return comp | GetFixedWidth () <= 0;
\r
232 class RegularExpression : Group {
\r
233 public RegularExpression () {
\r
237 public int GroupCount {
\r
238 get { return group_count; }
\r
239 set { group_count = value; }
\r
242 public override void Compile (ICompiler cmp, bool reverse) {
\r
246 GetWidth (out min, out max);
\r
247 cmp.EmitInfo (group_count, min, max);
\r
249 // anchoring expression
\r
251 AnchorInfo info = GetAnchorInfo (reverse);
\r
253 // info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
\r
255 LinkRef pattern = cmp.NewLink ();
\r
256 cmp.EmitAnchor (reverse, info.Offset, pattern);
\r
258 if (info.IsPosition)
\r
259 cmp.EmitPosition (info.Position);
\r
260 else if (info.IsSubstring)
\r
261 cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
\r
267 cmp.ResolveLink (pattern);
\r
268 base.Compile (cmp, reverse);
\r
272 private int group_count;
\r
275 class CapturingGroup : Group {
\r
276 public CapturingGroup () {
\r
281 public int Number {
\r
282 get { return gid; }
\r
283 set { gid = value; }
\r
286 public string Name {
\r
287 get { return name; }
\r
288 set { name = value; }
\r
291 public bool IsNamed {
\r
292 get { return name != null; }
\r
295 public override void Compile (ICompiler cmp, bool reverse) {
\r
296 cmp.EmitOpen (gid);
\r
297 base.Compile (cmp, reverse);
\r
298 cmp.EmitClose (gid);
\r
301 public override bool IsComplex () {
\r
306 private string name;
\r
309 class BalancingGroup : CapturingGroup {
\r
310 public BalancingGroup () {
\r
311 this.balance = null;
\r
314 public CapturingGroup Balance {
\r
315 get { return balance; }
\r
316 set { balance = value; }
\r
319 public override void Compile (ICompiler cmp, bool reverse) {
\r
320 // can't invoke Group.Compile from here :(
\r
321 // so I'll just repeat the code
\r
323 int count = Expressions.Count;
\r
324 for (int i = 0; i < count; ++ i) {
\r
327 e = Expressions[count - i - 1];
\r
329 e = Expressions[i];
\r
331 e.Compile (cmp, reverse);
\r
334 cmp.EmitBalance (this.Number, balance.Number);
\r
337 private CapturingGroup balance;
\r
340 class NonBacktrackingGroup : Group {
\r
341 public NonBacktrackingGroup () {
\r
344 public override void Compile (ICompiler cmp, bool reverse) {
\r
345 LinkRef tail = cmp.NewLink ();
\r
347 cmp.EmitSub (tail);
\r
348 base.Compile (cmp, reverse);
\r
350 cmp.ResolveLink (tail);
\r
353 public override bool IsComplex () {
\r
360 class Repetition : CompositeExpression {
\r
361 public Repetition (int min, int max, bool lazy) {
\r
362 Expressions.Add (null);
\r
369 public Expression Expression {
\r
370 get { return Expressions[0]; }
\r
371 set { Expressions[0] = value; }
\r
374 public int Minimum {
\r
375 get { return min; }
\r
376 set { min = value; }
\r
379 public int Maximum {
\r
380 get { return max; }
\r
381 set { max = value; }
\r
385 get { return lazy; }
\r
386 set { lazy = value; }
\r
389 public override void Compile (ICompiler cmp, bool reverse) {
\r
390 if (Expression.IsComplex ()) {
\r
391 LinkRef until = cmp.NewLink ();
\r
393 cmp.EmitRepeat (min, max, lazy, until);
\r
394 Expression.Compile (cmp, reverse);
\r
395 cmp.EmitUntil (until);
\r
398 LinkRef tail = cmp.NewLink ();
\r
400 cmp.EmitFastRepeat (min, max, lazy, tail);
\r
401 Expression.Compile (cmp, reverse);
\r
403 cmp.ResolveLink (tail);
\r
407 public override void GetWidth (out int min, out int max) {
\r
408 Expression.GetWidth (out min, out max);
\r
409 min = min * this.min;
\r
410 if (max == Int32.MaxValue || this.max == 0xffff)
\r
411 max = Int32.MaxValue;
\r
413 max = max * this.max;
\r
416 public override AnchorInfo GetAnchorInfo (bool reverse) {
\r
417 int width = GetFixedWidth ();
\r
419 return new AnchorInfo (this, width);
\r
421 AnchorInfo info = Expression.GetAnchorInfo (reverse);
\r
422 if (info.IsPosition)
\r
423 return new AnchorInfo (this, info.Offset, width, info.Position);
\r
425 if (info.IsSubstring) {
\r
426 if (info.IsComplete) {
\r
428 for (int i = 0; i < Minimum; ++ i)
\r
429 str += info.Substring;
\r
431 return new AnchorInfo (this, 0, width, str, info.IgnoreCase);
\r
434 return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
\r
437 return new AnchorInfo (this, width);
\r
440 private int min, max;
\r
446 abstract class Assertion : CompositeExpression {
\r
447 public Assertion () {
\r
448 Expressions.Add (null); // true expression
\r
449 Expressions.Add (null); // false expression
\r
452 public Expression TrueExpression {
\r
453 get { return Expressions[0]; }
\r
454 set { Expressions[0] = value; }
\r
457 public Expression FalseExpression {
\r
458 get { return Expressions[1]; }
\r
459 set { Expressions[1] = value; }
\r
462 public override void GetWidth (out int min, out int max) {
\r
463 GetWidth (out min, out max, 2);
\r
465 if (TrueExpression == null || FalseExpression == null)
\r
470 class CaptureAssertion : Assertion {
\r
471 public CaptureAssertion () {
\r
474 public CapturingGroup CapturingGroup {
\r
475 get { return group; }
\r
476 set { group = value; }
\r
479 public override void Compile (ICompiler cmp, bool reverse) {
\r
480 int gid = group.Number;
\r
481 LinkRef tail = cmp.NewLink ();
\r
483 if (FalseExpression == null) {
\r
488 cmp.EmitIfDefined (gid, tail);
\r
489 TrueExpression.Compile (cmp, reverse);
\r
498 LinkRef false_expr = cmp.NewLink ();
\r
499 cmp.EmitIfDefined (gid, false_expr);
\r
500 TrueExpression.Compile (cmp, reverse);
\r
501 cmp.EmitJump (tail);
\r
502 cmp.ResolveLink (false_expr);
\r
503 FalseExpression.Compile (cmp, reverse);
\r
506 cmp.ResolveLink (tail);
\r
509 public override bool IsComplex () {
\r
511 if (TrueExpression != null)
\r
512 comp |= TrueExpression.IsComplex ();
\r
513 if (FalseExpression != null)
\r
514 comp |= FalseExpression.IsComplex ();
\r
516 return comp | GetFixedWidth () <= 0;
\r
519 private CapturingGroup group;
\r
522 class ExpressionAssertion : Assertion {
\r
523 public ExpressionAssertion () {
\r
524 Expressions.Add (null); // test expression
\r
527 public bool Reverse {
\r
528 get { return reverse; }
\r
529 set { reverse = value; }
\r
532 public bool Negate {
\r
533 get { return negate; }
\r
534 set { negate = value; }
\r
537 public Expression TestExpression {
\r
538 get { return Expressions[2]; }
\r
539 set { Expressions[2] = value; }
\r
542 public override void Compile (ICompiler cmp, bool reverse) {
\r
543 LinkRef true_expr = cmp.NewLink ();
\r
544 LinkRef false_expr = cmp.NewLink ();
\r
546 // test op: positive / negative
\r
549 cmp.EmitTest (true_expr, false_expr);
\r
551 cmp.EmitTest (false_expr, true_expr);
\r
553 // test expression: lookahead / lookbehind
\r
555 TestExpression.Compile (cmp, this.reverse);
\r
558 // target expressions
\r
560 if (TrueExpression == null) { // (?= ...)
\r
566 cmp.ResolveLink (false_expr);
\r
568 cmp.ResolveLink (true_expr);
\r
571 cmp.ResolveLink (true_expr);
\r
572 TrueExpression.Compile (cmp, reverse);
\r
574 if (FalseExpression == null) { // (?(...) ...)
\r
580 cmp.ResolveLink (false_expr);
\r
582 else { // (?(...) ... | ...)
\r
590 LinkRef tail = cmp.NewLink ();
\r
592 cmp.EmitJump (tail);
\r
593 cmp.ResolveLink (false_expr);
\r
594 FalseExpression.Compile (cmp, reverse);
\r
595 cmp.ResolveLink (tail);
\r
600 private bool reverse, negate;
\r
605 class Alternation : CompositeExpression {
\r
606 public Alternation () {
\r
609 public ExpressionCollection Alternatives {
\r
610 get { return Expressions; }
\r
613 public void AddAlternative (Expression e) {
\r
614 Alternatives.Add (e);
\r
617 public override void Compile (ICompiler cmp, bool reverse) {
\r
618 // LinkRef next = cmp.NewLink ();
\r
619 LinkRef tail = cmp.NewLink ();
\r
621 foreach (Expression e in Alternatives) {
\r
622 LinkRef next = cmp.NewLink ();
\r
623 cmp.EmitBranch (next);
\r
624 e.Compile (cmp, reverse);
\r
625 cmp.EmitJump (tail);
\r
626 cmp.ResolveLink (next);
\r
627 cmp.EmitBranchEnd();
\r
631 cmp.ResolveLink (tail);
\r
632 cmp.EmitAlternationEnd();
\r
635 public override void GetWidth (out int min, out int max) {
\r
636 GetWidth (out min, out max, Alternatives.Count);
\r
639 public override bool IsComplex () {
\r
641 foreach (Expression e in Alternatives) {
\r
642 comp |= e.IsComplex ();
\r
645 return comp | GetFixedWidth () <= 0;
\r
649 // terminal expressions
\r
651 class Literal : Expression {
\r
652 public Literal (string str, bool ignore) {
\r
654 this.ignore = ignore;
\r
657 public string String {
\r
658 get { return str; }
\r
659 set { str = value; }
\r
662 public bool IgnoreCase {
\r
663 get { return ignore; }
\r
664 set { ignore = value; }
\r
667 public override void Compile (ICompiler cmp, bool reverse) {
\r
668 if (str.Length == 0)
\r
671 if (str.Length == 1)
\r
672 cmp.EmitCharacter (str[0], false, ignore, reverse);
\r
674 cmp.EmitString (str, ignore, reverse);
\r
677 public override void GetWidth (out int min, out int max) {
\r
678 min = max = str.Length;
\r
681 public override AnchorInfo GetAnchorInfo (bool reverse) {
\r
682 return new AnchorInfo (this, 0, str.Length, str, ignore);
\r
685 public override bool IsComplex () {
\r
689 private string str;
\r
690 private bool ignore;
\r
693 class PositionAssertion : Expression {
\r
694 public PositionAssertion (Position pos) {
\r
698 public Position Position {
\r
699 get { return pos; }
\r
700 set { pos = value; }
\r
703 public override void Compile (ICompiler cmp, bool reverse) {
\r
704 cmp.EmitPosition (pos);
\r
707 public override void GetWidth (out int min, out int max) {
\r
711 public override bool IsComplex () {
\r
715 public override AnchorInfo GetAnchorInfo (bool revers) {
\r
717 case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
\r
718 return new AnchorInfo (this, 0, 0, pos);
\r
721 return new AnchorInfo (this, 0);
\r
725 private Position pos;
\r
728 class Reference : Expression {
\r
729 public Reference (bool ignore) {
\r
730 this.ignore = ignore;
\r
733 public CapturingGroup CapturingGroup {
\r
734 get { return group; }
\r
735 set { group = value; }
\r
738 public bool IgnoreCase {
\r
739 get { return ignore; }
\r
740 set { ignore = value; }
\r
743 public override void Compile (ICompiler cmp, bool reverse) {
\r
744 cmp.EmitReference (group.Number, ignore, reverse);
\r
747 public override void GetWidth (out int min, out int max) {
\r
748 //group.GetWidth (out min, out max);
\r
749 // TODO set width to referenced group for non-cyclical references
\r
751 max = Int32.MaxValue;
\r
754 public override bool IsComplex () {
\r
755 return true; // FIXME incorporate cyclic check
\r
758 private CapturingGroup group;
\r
759 private bool ignore;
\r
762 class CharacterClass : Expression {
\r
763 public CharacterClass (bool negate, bool ignore) {
\r
764 this.negate = negate;
\r
765 this.ignore = ignore;
\r
767 intervals = new IntervalCollection ();
\r
769 // initialize pos/neg category arrays
\r
771 Array cat_values = Enum.GetValues (typeof (Category));
\r
772 int cat_size = (int)(Category)cat_values.GetValue (cat_values.Length - 1) + 1;
\r
773 pos_cats = new bool[cat_size];
\r
774 neg_cats = new bool[cat_size];
\r
775 for (int i = 0; i < cat_size; ++ i) {
\r
776 pos_cats[i] = false;
\r
777 neg_cats[i] = false;
\r
781 public CharacterClass (Category cat, bool negate) : this (false, false) {
\r
782 this.AddCategory (cat, negate);
\r
785 public bool Negate {
\r
786 get { return negate; }
\r
787 set { negate = value; }
\r
790 public bool IgnoreCase {
\r
791 get { return ignore; }
\r
792 set { ignore = value; }
\r
795 public void AddCategory (Category cat, bool negate) {
\r
800 pos_cats[n] = false;
\r
802 neg_cats[n] = true;
\r
806 neg_cats[n] = false;
\r
808 pos_cats[n] = true;
\r
812 public void AddCharacter (char c) {
\r
813 // TODO: this is certainly not the most efficient way of doing things
\r
814 // TODO: but at least it produces correct results.
\r
818 public void AddRange (char lo, char hi) {
\r
819 Interval new_interval = new Interval (lo, hi);
\r
821 // ignore case is on. we must make sure our interval does not
\r
822 // use upper case. if it does, we must normalize the upper case
\r
823 // characters into lower case.
\r
825 if (upper_case_characters.Intersects (new_interval)) {
\r
826 Interval partial_new_interval;
\r
828 if (new_interval.low < upper_case_characters.low) {
\r
829 partial_new_interval = new Interval (upper_case_characters.low + distance_between_upper_and_lower_case,
\r
830 new_interval.high + distance_between_upper_and_lower_case);
\r
831 new_interval.high = upper_case_characters.low - 1;
\r
834 partial_new_interval = new Interval (new_interval.low + distance_between_upper_and_lower_case,
\r
835 upper_case_characters.high + distance_between_upper_and_lower_case);
\r
836 new_interval.low = upper_case_characters.high + 1;
\r
838 intervals.Add (partial_new_interval);
\r
840 else if (upper_case_characters.Contains (new_interval)) {
\r
841 new_interval.high += distance_between_upper_and_lower_case;
\r
842 new_interval.low += distance_between_upper_and_lower_case;
\r
845 intervals.Add (new_interval);
\r
848 public override void Compile (ICompiler cmp, bool reverse) {
\r
849 // create the meta-collection
\r
850 IntervalCollection meta =
\r
851 intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
\r
855 int count = meta.Count;
\r
856 for (int i = 0; i < pos_cats.Length; ++ i) {
\r
857 if (pos_cats[i]) ++ count;
\r
858 if (neg_cats[i]) ++ count;
\r
864 // emit in op for |meta| > 1
\r
866 LinkRef tail = cmp.NewLink ();
\r
873 for (int i = 0; i < pos_cats.Length; ++ i) {
\r
875 cmp.EmitCategory ((Category)i, negate, reverse);
\r
876 else if (neg_cats[i])
\r
877 cmp.EmitCategory ((Category)i, !negate, reverse);
\r
880 // emit character/range/sets from meta-collection
\r
882 foreach (Interval a in meta) {
\r
883 if (a.IsDiscontiguous) { // Set
\r
884 BitArray bits = new BitArray (a.Size);
\r
885 foreach (Interval b in intervals) {
\r
886 if (a.Contains (b)) {
\r
887 for (int i = b.low; i <= b.high; ++ i)
\r
888 bits[i - a.low] = true;
\r
892 cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
\r
894 else if (a.IsSingleton) // Character
\r
895 cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
\r
897 cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
\r
908 cmp.ResolveLink (tail);
\r
912 public override void GetWidth (out int min, out int max) {
\r
916 public override bool IsComplex () {
\r
922 private static double GetIntervalCost (Interval i) {
\r
923 // use op length as cost metric (=> optimize for space)
\r
925 if (i.IsDiscontiguous)
\r
926 return 3 + ((i.Size + 0xf) >> 4); // Set
\r
927 else if (i.IsSingleton)
\r
928 return 2; // Character
\r
933 private static Interval upper_case_characters = new Interval ((char)65, (char)90);
\r
934 private const int distance_between_upper_and_lower_case = 32;
\r
935 private bool negate, ignore;
\r
936 private bool[] pos_cats, neg_cats;
\r
937 private IntervalCollection intervals;
\r
941 private Expression expr;
\r
943 private Position pos;
\r
944 private int offset;
\r
946 private string str;
\r
948 private bool ignore;
\r
950 public AnchorInfo (Expression expr, int width) {
\r
953 this.width = width;
\r
956 this.ignore = false;
\r
957 this.pos = Position.Any;
\r
960 public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
\r
962 this.offset = offset;
\r
963 this.width = width;
\r
965 this.str = ignore ? str.ToLower () : str;
\r
967 this.ignore = ignore;
\r
968 this.pos = Position.Any;
\r
971 public AnchorInfo (Expression expr, int offset, int width, Position pos) {
\r
973 this.offset = offset;
\r
974 this.width = width;
\r
979 this.ignore = false;
\r
982 public Expression Expression {
\r
983 get { return expr; }
\r
986 public int Offset {
\r
987 get { return offset; }
\r
991 get { return width; }
\r
994 public int Length {
\r
995 get { return (str != null) ? str.Length : 0; }
\r
998 public bool IsUnknownWidth {
\r
999 get { return width < 0; }
\r
1002 public bool IsComplete {
\r
1003 get { return Length == Width; }
\r
1006 public string Substring {
\r
1007 get { return str; }
\r
1010 public bool IgnoreCase {
\r
1011 get { return ignore; }
\r
1014 public Position Position {
\r
1015 get { return pos; }
\r
1018 public bool IsSubstring {
\r
1019 get { return str != null; }
\r
1022 public bool IsPosition {
\r
1023 get { return pos != Position.Any; }
\r
1026 public Interval GetInterval () {
\r
1027 return GetInterval (0);
\r
1030 public Interval GetInterval (int start) {
\r
1032 return Interval.Empty;
\r
1034 return new Interval (start + Offset, start + Offset + Length - 1);
\r