3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions.Syntax {
// Collection of Expression nodes built on CollectionBase.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
36 class ExpressionCollection : CollectionBase {
// Body not visible here; presumably stores e in the underlying list - confirm.
37 public void Add (Expression e) {
// Typed indexer: reads cast the stored element to Expression, writes go
// through the List property inherited from CollectionBase.
41 public Expression this[int i] {
42 get { return (Expression)List[i]; }
43 set { List[i] = value; }
// Validation hook override; per the comment below its purpose is to let
// null elements through (other classes in this file add null placeholders).
46 protected override void OnValidate (object o) {
47 // allow null elements
// Abstract base class of the expression nodes declared in this file.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
53 abstract class Expression {
// Emits this node through the given compiler; 'reverse' is forwarded to
// children and to the Emit* calls by the subclasses in this file.
54 public abstract void Compile (ICompiler cmp, bool reverse);
// Reports the node's minimum and maximum width through the out parameters.
55 public abstract void GetWidth (out int min, out int max);
// Calls GetWidth; how the return value is derived from min/max is not
// visible in this excerpt. Callers below treat a result <= 0 as "not fixed".
57 public int GetFixedWidth () {
59 GetWidth (out min, out max);
// Default anchor information: built only from this node and its fixed width.
67 public virtual AnchorInfo GetAnchorInfo (bool reverse) {
68 return new AnchorInfo (this, GetFixedWidth ());
// Implemented by subclasses; Repetition.Compile branches on this result.
71 public abstract bool IsComplex ();
74 // composite expressions
// Base class for expressions that own child expressions in an ExpressionCollection.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
76 abstract class CompositeExpression : Expression {
// Creates the (initially empty) child collection.
77 public CompositeExpression () {
78 expressions = new ExpressionCollection ();
// Read-only access to the child collection for subclasses.
81 protected ExpressionCollection Expressions {
82 get { return expressions; }
// Helper overload: queries the width of the first 'count' children.
// How the per-child values a/b are folded into min/max is not visible here.
85 protected void GetWidth (out int min, out int max, int count) {
90 for (int i = 0; i < count; ++ i) {
91 Expression e = Expressions[i];
97 e.GetWidth (out a, out b);
// Iterates over the children (the per-child test is not visible here) and
// otherwise reports complexity as "fixed width is <= 0".
107 public override bool IsComplex ()
109 foreach (Expression e in Expressions) {
113 return GetFixedWidth () <= 0;
// Backing field for the Expressions property.
116 private ExpressionCollection expressions;
// Composite whose children are compiled one after another and whose anchor
// information is assembled from the children's anchor information.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
121 class Group : CompositeExpression {
// Exposes the child at index 0 as a single expression.
125 public Expression Expression {
126 get { return Expressions[0]; }
127 set { Expressions[0] = value; }
// Body not visible here; presumably appends e to Expressions - confirm.
130 public void AppendExpression (Expression e) {
// Compiles every child with the same compiler and direction flag.
134 public override void Compile (ICompiler cmp, bool reverse) {
135 int count = Expressions.Count;
136 for (int i = 0; i < count; ++ i) {
// Picks the child counted from the end of the list; the condition selecting
// this ordering is not visible (presumably 'reverse' - confirm).
139 e = Expressions[count - i - 1];
143 e.Compile (cmp, reverse);
// Queries each child's width; the running maximum saturates at
// Int32.MaxValue as soon as either it or a child's maximum is Int32.MaxValue.
147 public override void GetWidth (out int min, out int max) {
151 foreach (Expression e in Expressions) {
153 e.GetWidth (out a, out b);
155 if (max == Int32.MaxValue || b == Int32.MaxValue)
156 max = Int32.MaxValue;
// Builds anchor information for the whole group from its children.
162 public override AnchorInfo GetAnchorInfo (bool reverse)
165 int width = GetFixedWidth ();
// 'infos' collects per-child AnchorInfo objects (it is indexed and cast in
// the chaining loop below); 'segments' collects the intervals of substring anchors.
167 ArrayList infos = new ArrayList ();
168 IntervalCollection segments = new IntervalCollection ();
170 // accumulate segments
172 int count = Expressions.Count;
173 for (int i = 0; i < count; ++ i) {
// Same end-relative indexing as in Compile; selecting condition not visible.
176 e = Expressions [count - i - 1];
180 AnchorInfo info = e.GetAnchorInfo (reverse);
// Early return with a position-based anchor, offset by the running 'ptr';
// the guarding condition is not visible (presumably info.IsPosition).
184 return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
// Substring anchors contribute their interval, shifted by 'ptr'.
186 if (info.IsSubstring)
187 segments.Add (info.GetInterval (ptr));
// Reaction to a child of unknown width is not visible in this excerpt.
189 if (info.IsUnknownWidth)
195 // normalize and find the longest segment
196 segments.Normalize ();
198 Interval longest = Interval.Empty;
199 foreach (Interval segment in segments) {
200 if (segment.Size > longest.Size)
// Fallback anchor carrying only the width; its guard is not visible here.
205 return new AnchorInfo (this, width);
207 // now chain the substrings that made this segment together
212 for (int i = 0; i < infos.Count; ++i) {
213 AnchorInfo info = (AnchorInfo) infos [i];
// Keeps substring anchors lying inside the longest segment, compacting them
// to the front of 'infos' and OR-ing their ignore-case flags together.
215 if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
216 ignore |= info.IgnoreCase;
217 infos [n_strings ++] = info;
220 if (info.IsUnknownWidth)
// Concatenates the kept substrings; the entry is read either from the end
// or from the front of the kept range (selecting condition not visible).
226 StringBuilder sb = new StringBuilder ();
227 for (int i = 0; i < n_strings; ++i) {
230 info = (AnchorInfo) infos [n_strings - i - 1];
232 info = (AnchorInfo) infos [i];
233 sb.Append (info.Substring);
// Exact fit: the concatenation covers the longest segment completely.
236 if (sb.Length == longest.Size)
237 return new AnchorInfo (this, longest.low, width, sb.ToString (), ignore);
238 // were the string segments overlapping?
239 if (sb.Length > longest.Size) {
// NOTE(review): diagnostic text is written to standard error from library
// code; consider removing it or routing it through a debug facility.
240 Console.Error.WriteLine ("overlapping?");
241 return new AnchorInfo (this, width);
// NOTE(review): a bare SystemException is thrown here; a more specific type
// such as InvalidOperationException would be conventional - check callers first.
243 throw new SystemException ("Shouldn't happen");
// Group subclass that additionally tracks a group count and emits info and
// anchor operations before compiling its children.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
247 class RegularExpression : Group {
// Constructor body not visible in this excerpt.
248 public RegularExpression () {
// Group count passed to ICompiler.EmitInfo during Compile.
252 public int GroupCount {
253 get { return group_count; }
254 set { group_count = value; }
// Emits, in order: the info record, the anchor, the anchoring test, and
// finally the group body via the base implementation.
257 public override void Compile (ICompiler cmp, bool reverse) {
261 GetWidth (out min, out max);
262 cmp.EmitInfo (group_count, min, max);
264 // anchoring expression
266 AnchorInfo info = GetAnchorInfo (reverse);
268 // info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
// 'pattern' is created before the anchor is emitted and resolved just before
// the group body is compiled.
270 LinkRef pattern = cmp.NewLink ();
271 cmp.EmitAnchor (reverse, info.Offset, pattern);
// Anchoring test: a position test or, for substring anchors, a string test.
// The guard of the first branch is not visible (presumably info.IsPosition).
274 cmp.EmitPosition (info.Position);
275 else if (info.IsSubstring)
276 cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
282 cmp.ResolveLink (pattern);
283 base.Compile (cmp, reverse);
// Backing field for GroupCount.
287 private int group_count;
// Group subclass with an optional name.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering); the fields, the property this setter belongs
// to, and the Number member used by other classes in this file are not visible.
290 class CapturingGroup : Group {
// Constructor body not visible in this excerpt.
291 public CapturingGroup () {
// Setter of a property whose header is not visible; it assigns the 'name' field.
303 set { name = value; }
// True when a name has been assigned (name is non-null).
306 public bool IsNamed {
307 get { return name != null; }
// Compiles the children via Group.Compile; any operations emitted around
// that call are not visible in this excerpt.
310 public override void Compile (ICompiler cmp, bool reverse) {
312 base.Compile (cmp, reverse);
// Body not visible in this excerpt.
316 public override bool IsComplex () {
// CapturingGroup subclass that refers to a second capturing group ('balance').
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
324 class BalancingGroup : CapturingGroup {
// Constructor body not visible in this excerpt.
325 public BalancingGroup () {
// The group whose Number is passed as the second argument of EmitBalanceStart.
// NOTE(review): Compile dereferences 'balance' without a visible null check.
329 public CapturingGroup Balance {
330 get { return balance; }
331 set { balance = value; }
// Emits a balance-start operation, compiles the children, then resolves 'tail'.
334 public override void Compile (ICompiler cmp, bool reverse) {
335 // can't invoke Group.Compile from here :(
336 // so I'll just repeat the code
338 LinkRef tail = cmp.NewLink ();
340 cmp.EmitBalanceStart (this.Number, balance.Number, this.IsNamed, tail);
// Same child loop as Group.Compile, repeated here per the comment above.
342 int count = Expressions.Count;
343 for (int i = 0; i < count; ++ i) {
// End-relative indexing; the selecting condition is not visible (presumably 'reverse').
346 e = Expressions[count - i - 1];
350 e.Compile (cmp, reverse);
354 cmp.ResolveLink(tail);
// Backing field for Balance.
357 private CapturingGroup balance;
// Group subclass whose compiled body is bracketed by a 'tail' link.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
360 class NonBacktrackingGroup : Group {
// Constructor body not visible in this excerpt.
361 public NonBacktrackingGroup () {
// Creates a link, compiles the children through Group.Compile, then resolves
// the link; the operations emitted between these calls are not visible here.
364 public override void Compile (ICompiler cmp, bool reverse) {
365 LinkRef tail = cmp.NewLink ();
368 base.Compile (cmp, reverse);
370 cmp.ResolveLink (tail);
// Body not visible in this excerpt.
373 public override bool IsComplex () {
// Composite with a single child (index 0) that is repeated between 'min' and
// 'max' times, optionally lazily.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
380 class Repetition : CompositeExpression {
// Reserves slot 0 with a null placeholder; assignments of the constructor
// arguments are not visible in this excerpt.
381 public Repetition (int min, int max, bool lazy) {
382 Expressions.Add (null);
// The repeated expression, stored in slot 0.
389 public Expression Expression {
390 get { return Expressions[0]; }
391 set { Expressions[0] = value; }
// Setter of a property whose header is not visible; it assigns the 'lazy' field.
406 set { lazy = value; }
// Emits either a general repeat (EmitRepeat ... EmitUntil) or a fast repeat
// (EmitFastRepeat ... tail link), based on Expression.IsComplex ().
409 public override void Compile (ICompiler cmp, bool reverse) {
410 if (Expression.IsComplex ()) {
411 LinkRef until = cmp.NewLink ();
413 cmp.EmitRepeat (min, max, lazy, until);
414 Expression.Compile (cmp, reverse);
415 cmp.EmitUntil (until);
// The lines below presumably form the else branch (its header is not visible).
418 LinkRef tail = cmp.NewLink ();
420 cmp.EmitFastRepeat (min, max, lazy, tail);
421 Expression.Compile (cmp, reverse);
423 cmp.ResolveLink (tail);
// Width of the repetition: the child's width scaled by the repeat bounds.
427 public override void GetWidth (out int min, out int max) {
428 Expression.GetWidth (out min, out max);
429 min = min * this.min;
// NOTE(review): 0xffff is a magic constant, presumably the "no upper bound"
// marker for this.max - confirm against where the bound is set.
430 if (max == Int32.MaxValue || this.max == 0xffff)
431 max = Int32.MaxValue;
// NOTE(review): this multiplication is not overflow-checked in the visible
// code (presumably the else branch of the test above).
433 max = max * this.max;
// Derives anchor information from the child's anchor information.
436 public override AnchorInfo GetAnchorInfo (bool reverse) {
437 int width = GetFixedWidth ();
// Width-only anchor; the guarding condition is not visible in this excerpt.
439 return new AnchorInfo (this, width);
441 AnchorInfo info = Expression.GetAnchorInfo (reverse);
// Position anchor taken over from the child; its guard is not visible
// (presumably info.IsPosition).
443 return new AnchorInfo (this, info.Offset, width, info.Position);
445 if (info.IsSubstring) {
446 if (info.IsComplete) {
// Starts from one copy of the child's substring; the loop body is not
// visible (presumably appends 'str' once per additional required repetition).
448 string str = info.Substring;
449 StringBuilder sb = new StringBuilder (str);
450 for (int i = 1; i < Minimum; ++ i)
453 return new AnchorInfo (this, 0, width, sb.ToString (), info.IgnoreCase);
// Otherwise the child's substring anchor is reused with this node's width.
456 return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
459 return new AnchorInfo (this, width);
// Repeat bounds; 'lazy' is declared on a line not visible in this excerpt.
462 private int min, max;
// Base class for conditional constructs with a "true" expression (slot 0) and
// a "false" expression (slot 1); either slot may hold null.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
468 abstract class Assertion : CompositeExpression {
// Reserves both slots with null placeholders.
469 public Assertion () {
470 Expressions.Add (null); // true expression
471 Expressions.Add (null); // false expression
// Expression stored in slot 0.
474 public Expression TrueExpression {
475 get { return Expressions[0]; }
476 set { Expressions[0] = value; }
// Expression stored in slot 1.
479 public Expression FalseExpression {
480 get { return Expressions[1]; }
481 set { Expressions[1] = value; }
// Width over the first two slots via the CompositeExpression helper; the
// adjustment made when either branch is null is not visible in this excerpt.
484 public override void GetWidth (out int min, out int max) {
485 GetWidth (out min, out max, 2);
487 if (TrueExpression == null || FalseExpression == null)
// Assertion whose test is "is the given capturing group defined"
// (compiled through ICompiler.EmitIfDefined).
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
492 class CaptureAssertion : Assertion {
// Constructor body not visible in this excerpt.
493 public CaptureAssertion () {
// The group whose Number is tested.
// NOTE(review): Compile dereferences 'group' without a visible null check.
496 public CapturingGroup CapturingGroup {
497 get { return group; }
498 set { group = value; }
// Emits the defined-test followed by the true branch and, when present, the
// false branch; 'tail' marks the end of the construct.
501 public override void Compile (ICompiler cmp, bool reverse) {
502 int gid = group.Number;
503 LinkRef tail = cmp.NewLink ();
// Without a false branch the test jumps straight to 'tail'.
505 if (FalseExpression == null) {
510 cmp.EmitIfDefined (gid, tail);
511 TrueExpression.Compile (cmp, reverse);
// With a false branch the test jumps to 'false_expr'; what is emitted between
// the two branches is not visible in this excerpt.
520 LinkRef false_expr = cmp.NewLink ();
521 cmp.EmitIfDefined (gid, false_expr);
522 TrueExpression.Compile (cmp, reverse);
524 cmp.ResolveLink (false_expr);
525 FalseExpression.Compile (cmp, reverse);
528 cmp.ResolveLink (tail);
// Checks each non-null branch for complexity (the branch results are not
// visible here), then falls back to "fixed width is <= 0".
531 public override bool IsComplex () {
532 if (TrueExpression != null && TrueExpression.IsComplex ())
534 if (FalseExpression != null && FalseExpression.IsComplex ())
536 return GetFixedWidth () <= 0;
// Backing field for CapturingGroup.
539 private CapturingGroup group;
// Assertion whose test is itself an expression (slot 2), with 'reverse' and
// 'negate' flags controlling how the test is compiled.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
542 class ExpressionAssertion : Assertion {
// Adds a third slot for the test expression after the two slots from Assertion.
543 public ExpressionAssertion () {
544 Expressions.Add (null); // test expression
// Direction used when compiling the test expression (see Compile).
547 public bool Reverse {
548 get { return reverse; }
549 set { reverse = value; }
// Accessors of a property whose header is not visible; they read and write 'negate'.
553 get { return negate; }
554 set { negate = value; }
// Expression stored in slot 2.
557 public Expression TestExpression {
558 get { return Expressions[2]; }
559 set { Expressions[2] = value; }
// Emits the test operation, the test expression, and then the target branches.
562 public override void Compile (ICompiler cmp, bool reverse) {
563 LinkRef true_expr = cmp.NewLink ();
564 LinkRef false_expr = cmp.NewLink ();
566 // test op: positive / negative
// The two calls differ only in link order; the selecting condition is not
// visible (presumably the 'negate' field).
569 cmp.EmitTest (true_expr, false_expr);
571 cmp.EmitTest (false_expr, true_expr);
573 // test expression: lookahead / lookbehind
// Uses this.reverse (the field), not the 'reverse' parameter of Compile.
575 TestExpression.Compile (cmp, this.reverse);
578 // target expressions
580 if (TrueExpression == null) { // (?= ...)
// Both links are resolved in this branch; what is emitted between the two
// resolutions is not visible in this excerpt.
586 cmp.ResolveLink (false_expr);
588 cmp.ResolveLink (true_expr);
// The lines below presumably form the else branch (its header is not visible);
// here the target branches use the 'reverse' parameter.
591 cmp.ResolveLink (true_expr);
592 TrueExpression.Compile (cmp, reverse);
594 if (FalseExpression == null) { // (?(...) ...)
600 cmp.ResolveLink (false_expr);
602 else { // (?(...) ... | ...)
610 LinkRef tail = cmp.NewLink ();
613 cmp.ResolveLink (false_expr);
614 FalseExpression.Compile (cmp, reverse);
615 cmp.ResolveLink (tail);
// Body not visible in this excerpt.
620 public override bool IsComplex ()
// Backing fields for the Reverse property and the negate property.
625 private bool reverse, negate;
// Composite whose children are alternatives, each compiled behind a branch operation.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
630 class Alternation : CompositeExpression {
// Constructor body not visible in this excerpt.
631 public Alternation () {
// Public view of the inherited child collection.
634 public ExpressionCollection Alternatives {
635 get { return Expressions; }
// Appends one alternative.
638 public void AddAlternative (Expression e) {
639 Alternatives.Add (e);
// For each alternative: emit a branch to 'next', compile the alternative,
// resolve 'next'. 'tail' is resolved after the loop, followed by the
// alternation-end operation.
642 public override void Compile (ICompiler cmp, bool reverse) {
643 // LinkRef next = cmp.NewLink ();
644 LinkRef tail = cmp.NewLink ();
646 foreach (Expression e in Alternatives) {
647 LinkRef next = cmp.NewLink ();
648 cmp.EmitBranch (next);
649 e.Compile (cmp, reverse);
// Whatever is emitted after each alternative (original line 650) is not
// visible in this excerpt.
651 cmp.ResolveLink (next);
656 cmp.ResolveLink (tail);
657 cmp.EmitAlternationEnd();
// Width over all alternatives via the CompositeExpression helper.
660 public override void GetWidth (out int min, out int max) {
661 GetWidth (out min, out max, Alternatives.Count);
665 // terminal expressions
// Terminal expression for a literal string with an ignore-case flag.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
667 class Literal : Expression {
// Stores the ignore flag; the assignment of 'str' is not visible in this excerpt.
668 public Literal (string str, bool ignore) {
670 this.ignore = ignore;
// Accessor body not visible here; presumably exposes the 'str' field - confirm.
673 public string String {
// Whether matching ignores case.
678 public bool IgnoreCase {
679 get { return ignore; }
680 set { ignore = value; }
// Emits either a single-character operation for str[0] or a string
// operation; the condition choosing between them is not visible here.
683 public override void Compile (ICompiler cmp, bool reverse) {
688 cmp.EmitCharacter (str[0], false, ignore, reverse);
690 cmp.EmitString (str, ignore, reverse);
// A literal has a fixed width equal to its length.
693 public override void GetWidth (out int min, out int max) {
694 min = max = str.Length;
// The literal is its own substring anchor: offset 0, width str.Length.
697 public override AnchorInfo GetAnchorInfo (bool reverse) {
698 return new AnchorInfo (this, 0, str.Length, str, ignore);
// Body not visible in this excerpt.
701 public override bool IsComplex () {
// Terminal expression for a position test.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
709 class PositionAssertion : Expression {
// Constructor body not visible here; presumably stores pos in the field - confirm.
710 public PositionAssertion (Position pos) {
// Accessor body not visible in this excerpt.
714 public Position Position {
// Emits the position test; 'reverse' is not used on the visible line.
719 public override void Compile (ICompiler cmp, bool reverse) {
720 cmp.EmitPosition (pos);
// Body not visible in this excerpt.
723 public override void GetWidth (out int min, out int max) {
// Body not visible in this excerpt.
727 public override bool IsComplex () {
// Start-of-string, start-of-line and start-of-scan positions yield a position
// anchor at offset 0 with width 0; the other visible return is a width-only anchor.
// NOTE(review): the parameter is spelled 'revers' here, while the base class
// declares it as 'reverse'.
731 public override AnchorInfo GetAnchorInfo (bool revers) {
733 case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
734 return new AnchorInfo (this, 0, 0, pos);
737 return new AnchorInfo (this, 0);
// The position this node tests.
741 private Position pos;
// Terminal expression that refers to a capturing group (a backreference).
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
744 class Reference : Expression {
// Stores the ignore-case flag; the group is assigned later through the property.
745 public Reference (bool ignore) {
746 this.ignore = ignore;
// The referenced group.
// NOTE(review): Compile dereferences 'group' without a visible null check.
749 public CapturingGroup CapturingGroup {
750 get { return group; }
751 set { group = value; }
// Whether matching ignores case.
754 public bool IgnoreCase {
755 get { return ignore; }
756 set { ignore = value; }
// Emits a reference operation for the group's number.
759 public override void Compile (ICompiler cmp, bool reverse) {
760 cmp.EmitReference (group.Number, ignore, reverse);
// The maximum is reported as unbounded; the assignment of 'min' is not
// visible in this excerpt. See the TODO below about using the group's width.
763 public override void GetWidth (out int min, out int max) {
764 //group.GetWidth (out min, out max);
765 // TODO set width to referenced group for non-cyclical references
767 max = Int32.MaxValue;
// Always reported as complex.
770 public override bool IsComplex () {
771 return true; // FIXME incorporate cyclic check
// Backing field for CapturingGroup.
774 private CapturingGroup group;
// Terminal expression for a character class, built from character intervals
// plus positive and negative category bit sets.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering), so bodies and closing braces are not all visible.
778 class CharacterClass : Expression {
// Creates an empty class with the given negate / ignore-case flags.
779 public CharacterClass (bool negate, bool ignore) {
780 this.negate = negate;
781 this.ignore = ignore;
783 intervals = new IntervalCollection ();
785 // initialize pos/neg category arrays
// Both bit arrays are sized by the numeric value of Category.LastValue.
787 int cat_size = (int) Category.LastValue;
788 pos_cats = new BitArray (cat_size);
789 neg_cats = new BitArray (cat_size);
// Convenience constructor: an otherwise empty class containing one category.
792 public CharacterClass (Category cat, bool negate) : this (false, false) {
793 this.AddCategory (cat, negate);
// Accessors of a property whose header is not visible; they read and write 'negate'.
797 get { return negate; }
798 set { negate = value; }
// Whether matching ignores case.
801 public bool IgnoreCase {
802 get { return ignore; }
803 set { ignore = value; }
// Body not visible here; presumably sets the bit for 'cat' in pos_cats or
// neg_cats depending on 'negate' - confirm.
806 public void AddCategory (Category cat, bool negate) {
// Body not visible in this excerpt apart from the TODO comments.
816 public void AddCharacter (char c) {
817 // TODO: this is certainly not the most efficient way of doing things
818 // TODO: but at least it produces correct results.
// Adds the interval [lo, hi], remapping upper-case characters to lower case.
822 public void AddRange (char lo, char hi) {
823 Interval new_interval = new Interval (lo, hi);
825 // ignore case is on. we must make sure our interval does not
826 // use upper case. if it does, we must normalize the upper case
827 // characters into lower case.
// NOTE(review): the guard that restricts this normalization to the
// ignore-case setting is not visible in this excerpt (presumably 'if (ignore)').
// NOTE(review): upper_case_characters covers only code points 65..90, so the
// remapping handles ASCII letters only.
829 if (upper_case_characters.Intersects (new_interval)) {
830 Interval partial_new_interval;
// Range starts below 'A': the overlapping upper part is added shifted by 32
// and the original range is cut to end just before 'A'.
832 if (new_interval.low < upper_case_characters.low) {
833 partial_new_interval = new Interval (upper_case_characters.low + distance_between_upper_and_lower_case,
834 new_interval.high + distance_between_upper_and_lower_case);
835 new_interval.high = upper_case_characters.low - 1;
// Otherwise (presumably the else branch): the overlapping lower part is added
// shifted by 32 and the original range is cut to start just after 'Z'.
838 partial_new_interval = new Interval (new_interval.low + distance_between_upper_and_lower_case,
839 upper_case_characters.high + distance_between_upper_and_lower_case);
840 new_interval.low = upper_case_characters.high + 1;
842 intervals.Add (partial_new_interval);
// Range lies entirely inside 'A'..'Z': shift the whole range by 32.
// NOTE(review): if Intersects is also true for contained ranges, this branch
// is never reached - confirm against Interval.Intersects.
844 else if (upper_case_characters.Contains (new_interval)) {
845 new_interval.high += distance_between_upper_and_lower_case;
846 new_interval.low += distance_between_upper_and_lower_case;
849 intervals.Add (new_interval);
// Emits operations for the intervals first and the categories second.
852 public override void Compile (ICompiler cmp, bool reverse) {
853 // create the meta-collection
// The meta-collection is derived from 'intervals' using GetIntervalCost as
// the cost function.
854 IntervalCollection meta =
855 intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
// Starts from the number of meta intervals; the loop inspects every set
// category bit (its body is not visible; presumably it increments 'count').
858 int count = meta.Count;
859 for (int i = 0; i < pos_cats.Length; ++ i) {
860 if (pos_cats[i] || neg_cats [i])
867 // emit in op for |meta| > 1
868 LinkRef tail = cmp.NewLink ();
872 // emit character/range/sets from meta-collection
873 // we emit these first so that any 'ignore' flags will be noticed by the evaluator
874 foreach (Interval a in meta) {
// Discontiguous meta interval: build a bit set, relative to a.low, of every
// character covered by the original intervals that 'a' contains.
875 if (a.IsDiscontiguous) { // Set
876 BitArray bits = new BitArray (a.Size);
877 foreach (Interval b in intervals) {
878 if (a.Contains (b)) {
879 for (int i = b.low; i <= b.high; ++ i)
880 bits[i - a.low] = true;
884 cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
886 else if (a.IsSingleton) // Character
887 cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
// Remaining case (presumably the final else): a contiguous range.
889 cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
// Category operations: one branch emits Category.AnySingleline instead of the
// indexed category (its condition is not visible here); negative categories
// use EmitNotCategory.
893 for (int i = 0; i < pos_cats.Length; ++ i) {
896 cmp.EmitCategory (Category.AnySingleline, negate, reverse);
898 cmp.EmitCategory ((Category)i, negate, reverse);
899 } else if (neg_cats[i]) {
900 cmp.EmitNotCategory ((Category)i, negate, reverse);
911 cmp.ResolveLink (tail);
// Body not visible in this excerpt.
915 public override void GetWidth (out int min, out int max) {
// Body not visible in this excerpt.
919 public override bool IsComplex () {
// Cost function handed to GetMetaCollection in Compile.
925 private static double GetIntervalCost (Interval i) {
926 // use op length as cost metric (=> optimize for space)
// Sets cost 3 plus one unit per 16 elements (rounded up); singletons cost 2.
// The cost of the remaining case is not visible in this excerpt.
928 if (i.IsDiscontiguous)
929 return 3 + ((i.Size + 0xf) >> 4); // Set
930 else if (i.IsSingleton)
931 return 2; // Character
// Code points 65..90 ('A'..'Z') and the offset from them to 'a'..'z'.
936 private static Interval upper_case_characters = new Interval ((char)65, (char)90);
937 private const int distance_between_upper_and_lower_case = 32;
938 private bool negate, ignore;
939 private BitArray pos_cats, neg_cats;
940 private IntervalCollection intervals;
// Members of the AnchorInfo type used throughout this file: an expression,
// an offset, a width, and either a substring or a position.
// NOTE(review): this listing omits some lines of the original file (see the
// gaps in the embedded numbering); the type's header, several fields, and
// several property headers are not visible.
944 private Expression expr;
946 private Position pos;
// Width-only anchor: no substring, position set to Position.Any. The other
// assignments of this constructor are not visible in this excerpt.
953 public AnchorInfo (Expression expr, int width) {
960 this.pos = Position.Any;
// Substring anchor. When 'ignore' is set the substring is stored lower-cased.
// NOTE(review): ToLower () without arguments uses the current culture;
// ToLowerInvariant () would be culture-independent - confirm what the matcher expects.
963 public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
965 this.offset = offset;
968 this.str = ignore ? str.ToLower () : str;
970 this.ignore = ignore;
971 this.pos = Position.Any;
// Position anchor; the assignments other than 'offset' are not visible here.
974 public AnchorInfo (Expression expr, int offset, int width, Position pos) {
976 this.offset = offset;
// Accessor body not visible here; presumably returns 'expr' - confirm.
985 public Expression Expression {
// Getters of properties whose headers are not visible (presumably Offset,
// Width and Length, going by the names used elsewhere in this block).
990 get { return offset; }
994 get { return width; }
// Length of the substring, or 0 when there is none.
998 get { return (str != null) ? str.Length : 0; }
// A negative width marks the width as unknown.
1001 public bool IsUnknownWidth {
1002 get { return width < 0; }
// True when the substring is as long as the anchor's width.
1005 public bool IsComplete {
1006 get { return Length == Width; }
// Accessor body not visible here; presumably returns 'str' - confirm.
1009 public string Substring {
1013 public bool IgnoreCase {
1014 get { return ignore; }
// Accessor body not visible here; presumably returns 'pos' - confirm.
1017 public Position Position {
// A non-null substring makes this a substring anchor.
1021 public bool IsSubstring {
1022 get { return str != null; }
// Any position other than Position.Any makes this a position anchor.
1025 public bool IsPosition {
1026 get { return pos != Position.Any; }
// Interval of the substring relative to start 0.
1029 public Interval GetInterval () {
1030 return GetInterval (0);
// Returns Interval.Empty under a condition not visible in this excerpt;
// otherwise the closed interval [start + Offset, start + Offset + Length - 1].
1033 public Interval GetInterval (int start) {
1035 return Interval.Empty;
1037 return new Interval (start + Offset, start + Offset + Length - 1);