3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 using System.Collections;
12 namespace System.Text.RegularExpressions.Syntax {
15 class ExpressionCollection : CollectionBase {
16 public void Add (Expression e) {
20 public Expression this[int i] {
21 get { return (Expression)List[i]; }
22 set { List[i] = value; }
25 protected override void OnValidate (object o) {
26 // allow null elements
32 abstract class Expression {
33 public abstract void Compile (ICompiler cmp, bool reverse);
34 public abstract void GetWidth (out int min, out int max);
36 public int GetFixedWidth () {
38 GetWidth (out min, out max);
46 public virtual AnchorInfo GetAnchorInfo (bool reverse) {
47 return new AnchorInfo (this, GetFixedWidth ());
50 public virtual bool IsComplex () {
55 // composite expressions
57 abstract class CompositeExpression : Expression {
58 public CompositeExpression () {
59 expressions = new ExpressionCollection ();
62 protected ExpressionCollection Expressions {
63 get { return expressions; }
66 protected void GetWidth (out int min, out int max, int count) {
71 for (int i = 0; i < count; ++ i) {
72 Expression e = Expressions[i];
78 e.GetWidth (out a, out b);
87 private ExpressionCollection expressions;
92 class Group : CompositeExpression {
96 public Expression Expression {
97 get { return Expressions[0]; }
98 set { Expressions[0] = value; }
101 public void AppendExpression (Expression e) {
105 public override void Compile (ICompiler cmp, bool reverse) {
106 int count = Expressions.Count;
107 for (int i = 0; i < count; ++ i) {
110 e = Expressions[count - i - 1];
114 e.Compile (cmp, reverse);
118 public override void GetWidth (out int min, out int max) {
122 foreach (Expression e in Expressions) {
124 e.GetWidth (out a, out b);
126 if (max == Int32.MaxValue || b == Int32.MaxValue)
127 max = Int32.MaxValue;
133 public override AnchorInfo GetAnchorInfo (bool reverse) {
135 int width = GetFixedWidth ();
137 ArrayList infos = new ArrayList ();
138 IntervalCollection segments = new IntervalCollection ();
140 // accumulate segments
143 //foreach (Expression e in Expressions) {
144 int count = Expressions.Count;
145 for (int i = 0; i < count; ++ i) {
148 e = Expressions [count - i - 1];
152 AnchorInfo info = e.GetAnchorInfo (reverse);
156 return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
158 if (info.IsSubstring)
159 segments.Add (info.GetInterval (ptr));
161 if (info.IsUnknownWidth)
167 // normalize and find the longest segment
169 segments.Normalize ();
171 Interval longest = Interval.Empty;
172 foreach (Interval segment in segments) {
173 if (segment.Size > longest.Size)
177 // now chain the substrings that made this segment together
179 if (!longest.IsEmpty) {
181 ArrayList strs = new ArrayList();
186 //foreach (AnchorInfo info in infos) {
187 for (int i = 0; i < infos.Count; ++ i) {
190 info = (AnchorInfo)infos[i];
192 if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
193 //str += info.Substring; // TODO mark subexpressions
194 strs.Add(info.Substring);
195 ignore |= info.IgnoreCase;
199 if (info.IsUnknownWidth)
205 for (int i = 0; i< strs.Count; ++i)
208 str += strs [strs.Count - i - 1];
216 return new AnchorInfo (this, longest.low, width, str, ignore);
219 return new AnchorInfo (this, width);
222 public override bool IsComplex () {
224 foreach (Expression e in Expressions) {
225 comp |= e.IsComplex ();
228 return comp | GetFixedWidth () <= 0;
232 class RegularExpression : Group {
233 public RegularExpression () {
237 public int GroupCount {
238 get { return group_count; }
239 set { group_count = value; }
242 public override void Compile (ICompiler cmp, bool reverse) {
246 GetWidth (out min, out max);
247 cmp.EmitInfo (group_count, min, max);
249 // anchoring expression
251 AnchorInfo info = GetAnchorInfo (reverse);
253 // info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
255 LinkRef pattern = cmp.NewLink ();
256 cmp.EmitAnchor (reverse, info.Offset, pattern);
259 cmp.EmitPosition (info.Position);
260 else if (info.IsSubstring)
261 cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
267 cmp.ResolveLink (pattern);
268 base.Compile (cmp, reverse);
272 private int group_count;
275 class CapturingGroup : Group {
276 public CapturingGroup () {
288 set { name = value; }
291 public bool IsNamed {
292 get { return name != null; }
295 public override void Compile (ICompiler cmp, bool reverse) {
297 base.Compile (cmp, reverse);
301 public override bool IsComplex () {
309 class BalancingGroup : CapturingGroup {
310 public BalancingGroup () {
314 public CapturingGroup Balance {
315 get { return balance; }
316 set { balance = value; }
319 public override void Compile (ICompiler cmp, bool reverse) {
320 // can't invoke Group.Compile from here :(
321 // so I'll just repeat the code
323 LinkRef tail = cmp.NewLink ();
325 cmp.EmitBalanceStart (this.Number, balance.Number, this.IsNamed, tail);
327 int count = Expressions.Count;
328 for (int i = 0; i < count; ++ i) {
331 e = Expressions[count - i - 1];
335 e.Compile (cmp, reverse);
339 cmp.ResolveLink(tail);
342 private CapturingGroup balance;
345 class NonBacktrackingGroup : Group {
346 public NonBacktrackingGroup () {
349 public override void Compile (ICompiler cmp, bool reverse) {
350 LinkRef tail = cmp.NewLink ();
353 base.Compile (cmp, reverse);
355 cmp.ResolveLink (tail);
358 public override bool IsComplex () {
365 class Repetition : CompositeExpression {
366 public Repetition (int min, int max, bool lazy) {
367 Expressions.Add (null);
374 public Expression Expression {
375 get { return Expressions[0]; }
376 set { Expressions[0] = value; }
391 set { lazy = value; }
394 public override void Compile (ICompiler cmp, bool reverse) {
395 if (Expression.IsComplex ()) {
396 LinkRef until = cmp.NewLink ();
398 cmp.EmitRepeat (min, max, lazy, until);
399 Expression.Compile (cmp, reverse);
400 cmp.EmitUntil (until);
403 LinkRef tail = cmp.NewLink ();
405 cmp.EmitFastRepeat (min, max, lazy, tail);
406 Expression.Compile (cmp, reverse);
408 cmp.ResolveLink (tail);
412 public override void GetWidth (out int min, out int max) {
413 Expression.GetWidth (out min, out max);
414 min = min * this.min;
415 if (max == Int32.MaxValue || this.max == 0xffff)
416 max = Int32.MaxValue;
418 max = max * this.max;
421 public override AnchorInfo GetAnchorInfo (bool reverse) {
422 int width = GetFixedWidth ();
424 return new AnchorInfo (this, width);
426 AnchorInfo info = Expression.GetAnchorInfo (reverse);
428 return new AnchorInfo (this, info.Offset, width, info.Position);
430 if (info.IsSubstring) {
431 if (info.IsComplete) {
433 for (int i = 0; i < Minimum; ++ i)
434 str += info.Substring;
436 return new AnchorInfo (this, 0, width, str, info.IgnoreCase);
439 return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
442 return new AnchorInfo (this, width);
445 private int min, max;
451 abstract class Assertion : CompositeExpression {
452 public Assertion () {
453 Expressions.Add (null); // true expression
454 Expressions.Add (null); // false expression
457 public Expression TrueExpression {
458 get { return Expressions[0]; }
459 set { Expressions[0] = value; }
462 public Expression FalseExpression {
463 get { return Expressions[1]; }
464 set { Expressions[1] = value; }
467 public override void GetWidth (out int min, out int max) {
468 GetWidth (out min, out max, 2);
470 if (TrueExpression == null || FalseExpression == null)
475 class CaptureAssertion : Assertion {
476 public CaptureAssertion () {
479 public CapturingGroup CapturingGroup {
480 get { return group; }
481 set { group = value; }
484 public override void Compile (ICompiler cmp, bool reverse) {
485 int gid = group.Number;
486 LinkRef tail = cmp.NewLink ();
488 if (FalseExpression == null) {
493 cmp.EmitIfDefined (gid, tail);
494 TrueExpression.Compile (cmp, reverse);
503 LinkRef false_expr = cmp.NewLink ();
504 cmp.EmitIfDefined (gid, false_expr);
505 TrueExpression.Compile (cmp, reverse);
507 cmp.ResolveLink (false_expr);
508 FalseExpression.Compile (cmp, reverse);
511 cmp.ResolveLink (tail);
514 public override bool IsComplex () {
516 if (TrueExpression != null)
517 comp |= TrueExpression.IsComplex ();
518 if (FalseExpression != null)
519 comp |= FalseExpression.IsComplex ();
521 return comp | GetFixedWidth () <= 0;
524 private CapturingGroup group;
527 class ExpressionAssertion : Assertion {
528 public ExpressionAssertion () {
529 Expressions.Add (null); // test expression
532 public bool Reverse {
533 get { return reverse; }
534 set { reverse = value; }
538 get { return negate; }
539 set { negate = value; }
542 public Expression TestExpression {
543 get { return Expressions[2]; }
544 set { Expressions[2] = value; }
547 public override void Compile (ICompiler cmp, bool reverse) {
548 LinkRef true_expr = cmp.NewLink ();
549 LinkRef false_expr = cmp.NewLink ();
551 // test op: positive / negative
554 cmp.EmitTest (true_expr, false_expr);
556 cmp.EmitTest (false_expr, true_expr);
558 // test expression: lookahead / lookbehind
560 TestExpression.Compile (cmp, this.reverse);
563 // target expressions
565 if (TrueExpression == null) { // (?= ...)
571 cmp.ResolveLink (false_expr);
573 cmp.ResolveLink (true_expr);
576 cmp.ResolveLink (true_expr);
577 TrueExpression.Compile (cmp, reverse);
579 if (FalseExpression == null) { // (?(...) ...)
585 cmp.ResolveLink (false_expr);
587 else { // (?(...) ... | ...)
595 LinkRef tail = cmp.NewLink ();
598 cmp.ResolveLink (false_expr);
599 FalseExpression.Compile (cmp, reverse);
600 cmp.ResolveLink (tail);
605 private bool reverse, negate;
610 class Alternation : CompositeExpression {
611 public Alternation () {
614 public ExpressionCollection Alternatives {
615 get { return Expressions; }
618 public void AddAlternative (Expression e) {
619 Alternatives.Add (e);
622 public override void Compile (ICompiler cmp, bool reverse) {
623 // LinkRef next = cmp.NewLink ();
624 LinkRef tail = cmp.NewLink ();
626 foreach (Expression e in Alternatives) {
627 LinkRef next = cmp.NewLink ();
628 cmp.EmitBranch (next);
629 e.Compile (cmp, reverse);
631 cmp.ResolveLink (next);
636 cmp.ResolveLink (tail);
637 cmp.EmitAlternationEnd();
640 public override void GetWidth (out int min, out int max) {
641 GetWidth (out min, out max, Alternatives.Count);
644 public override bool IsComplex () {
646 foreach (Expression e in Alternatives) {
647 comp |= e.IsComplex ();
650 return comp | GetFixedWidth () <= 0;
654 // terminal expressions
656 class Literal : Expression {
657 public Literal (string str, bool ignore) {
659 this.ignore = ignore;
662 public string String {
667 public bool IgnoreCase {
668 get { return ignore; }
669 set { ignore = value; }
672 public override void Compile (ICompiler cmp, bool reverse) {
677 cmp.EmitCharacter (str[0], false, ignore, reverse);
679 cmp.EmitString (str, ignore, reverse);
682 public override void GetWidth (out int min, out int max) {
683 min = max = str.Length;
686 public override AnchorInfo GetAnchorInfo (bool reverse) {
687 return new AnchorInfo (this, 0, str.Length, str, ignore);
690 public override bool IsComplex () {
698 class PositionAssertion : Expression {
699 public PositionAssertion (Position pos) {
703 public Position Position {
708 public override void Compile (ICompiler cmp, bool reverse) {
709 cmp.EmitPosition (pos);
712 public override void GetWidth (out int min, out int max) {
716 public override bool IsComplex () {
720 public override AnchorInfo GetAnchorInfo (bool revers) {
722 case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
723 return new AnchorInfo (this, 0, 0, pos);
726 return new AnchorInfo (this, 0);
730 private Position pos;
733 class Reference : Expression {
734 public Reference (bool ignore) {
735 this.ignore = ignore;
738 public CapturingGroup CapturingGroup {
739 get { return group; }
740 set { group = value; }
743 public bool IgnoreCase {
744 get { return ignore; }
745 set { ignore = value; }
748 public override void Compile (ICompiler cmp, bool reverse) {
749 cmp.EmitReference (group.Number, ignore, reverse);
752 public override void GetWidth (out int min, out int max) {
753 //group.GetWidth (out min, out max);
754 // TODO set width to referenced group for non-cyclical references
756 max = Int32.MaxValue;
759 public override bool IsComplex () {
760 return true; // FIXME incorporate cyclic check
763 private CapturingGroup group;
767 class CharacterClass : Expression {
768 public CharacterClass (bool negate, bool ignore) {
769 this.negate = negate;
770 this.ignore = ignore;
772 intervals = new IntervalCollection ();
774 // initialize pos/neg category arrays
776 int cat_size = (int) Category.LastValue;
777 pos_cats = new BitArray (cat_size);
778 neg_cats = new BitArray (cat_size);
781 public CharacterClass (Category cat, bool negate) : this (false, false) {
782 this.AddCategory (cat, negate);
786 get { return negate; }
787 set { negate = value; }
790 public bool IgnoreCase {
791 get { return ignore; }
792 set { ignore = value; }
795 public void AddCategory (Category cat, bool negate) {
805 public void AddCharacter (char c) {
806 // TODO: this is certainly not the most efficient way of doing things
807 // TODO: but at least it produces correct results.
811 public void AddRange (char lo, char hi) {
812 Interval new_interval = new Interval (lo, hi);
814 // ignore case is on. we must make sure our interval does not
815 // use upper case. if it does, we must normalize the upper case
816 // characters into lower case.
818 if (upper_case_characters.Intersects (new_interval)) {
819 Interval partial_new_interval;
821 if (new_interval.low < upper_case_characters.low) {
822 partial_new_interval = new Interval (upper_case_characters.low + distance_between_upper_and_lower_case,
823 new_interval.high + distance_between_upper_and_lower_case);
824 new_interval.high = upper_case_characters.low - 1;
827 partial_new_interval = new Interval (new_interval.low + distance_between_upper_and_lower_case,
828 upper_case_characters.high + distance_between_upper_and_lower_case);
829 new_interval.low = upper_case_characters.high + 1;
831 intervals.Add (partial_new_interval);
833 else if (upper_case_characters.Contains (new_interval)) {
834 new_interval.high += distance_between_upper_and_lower_case;
835 new_interval.low += distance_between_upper_and_lower_case;
838 intervals.Add (new_interval);
841 public override void Compile (ICompiler cmp, bool reverse) {
842 // create the meta-collection
843 IntervalCollection meta =
844 intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
848 int count = meta.Count;
849 for (int i = 0; i < pos_cats.Length; ++ i) {
850 if (pos_cats[i]) ++ count;
851 if (neg_cats[i]) ++ count;
857 // emit in op for |meta| > 1
859 LinkRef tail = cmp.NewLink ();
866 for (int i = 0; i < pos_cats.Length; ++ i) {
869 cmp.EmitCategory (Category.Any, negate, reverse);
871 cmp.EmitCategory ((Category)i, negate, reverse);
872 } else if (neg_cats[i]) {
873 cmp.EmitCategory ((Category)i, !negate, reverse);
877 // emit character/range/sets from meta-collection
879 foreach (Interval a in meta) {
880 if (a.IsDiscontiguous) { // Set
881 BitArray bits = new BitArray (a.Size);
882 foreach (Interval b in intervals) {
883 if (a.Contains (b)) {
884 for (int i = b.low; i <= b.high; ++ i)
885 bits[i - a.low] = true;
889 cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
891 else if (a.IsSingleton) // Character
892 cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
894 cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
905 cmp.ResolveLink (tail);
909 public override void GetWidth (out int min, out int max) {
913 public override bool IsComplex () {
919 private static double GetIntervalCost (Interval i) {
920 // use op length as cost metric (=> optimize for space)
922 if (i.IsDiscontiguous)
923 return 3 + ((i.Size + 0xf) >> 4); // Set
924 else if (i.IsSingleton)
925 return 2; // Character
930 private static Interval upper_case_characters = new Interval ((char)65, (char)90);
931 private const int distance_between_upper_and_lower_case = 32;
932 private bool negate, ignore;
933 private BitArray pos_cats, neg_cats;
934 private IntervalCollection intervals;
938 private Expression expr;
940 private Position pos;
947 public AnchorInfo (Expression expr, int width) {
954 this.pos = Position.Any;
957 public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
959 this.offset = offset;
962 this.str = ignore ? str.ToLower () : str;
964 this.ignore = ignore;
965 this.pos = Position.Any;
968 public AnchorInfo (Expression expr, int offset, int width, Position pos) {
970 this.offset = offset;
979 public Expression Expression {
984 get { return offset; }
988 get { return width; }
992 get { return (str != null) ? str.Length : 0; }
995 public bool IsUnknownWidth {
996 get { return width < 0; }
999 public bool IsComplete {
1000 get { return Length == Width; }
1003 public string Substring {
1007 public bool IgnoreCase {
1008 get { return ignore; }
1011 public Position Position {
1015 public bool IsSubstring {
1016 get { return str != null; }
1019 public bool IsPosition {
1020 get { return pos != Position.Any; }
1023 public Interval GetInterval () {
1024 return GetInterval (0);
1027 public Interval GetInterval (int start) {
1029 return Interval.Empty;
1031 return new Interval (start + Offset, start + Offset + Length - 1);