3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions.Syntax {
36 class ExpressionCollection : CollectionBase {
37 public void Add (Expression e) {
41 public Expression this[int i] {
42 get { return (Expression)List[i]; }
43 set { List[i] = value; }
46 protected override void OnValidate (object o) {
47 // allow null elements
53 abstract class Expression {
54 public abstract void Compile (ICompiler cmp, bool reverse);
55 public abstract void GetWidth (out int min, out int max);
57 public int GetFixedWidth () {
59 GetWidth (out min, out max);
67 public virtual AnchorInfo GetAnchorInfo (bool reverse) {
68 return new AnchorInfo (this, GetFixedWidth ());
71 public abstract bool IsComplex ();
74 // composite expressions
76 abstract class CompositeExpression : Expression {
77 public CompositeExpression () {
78 expressions = new ExpressionCollection ();
81 protected ExpressionCollection Expressions {
82 get { return expressions; }
85 protected void GetWidth (out int min, out int max, int count) {
90 for (int i = 0; i < count; ++ i) {
91 Expression e = Expressions[i];
97 e.GetWidth (out a, out b);
107 public override bool IsComplex ()
109 foreach (Expression e in Expressions) {
113 return GetFixedWidth () <= 0;
116 private ExpressionCollection expressions;
121 class Group : CompositeExpression {
125 public Expression Expression {
126 get { return Expressions[0]; }
127 set { Expressions[0] = value; }
130 public void AppendExpression (Expression e) {
134 public override void Compile (ICompiler cmp, bool reverse) {
135 int count = Expressions.Count;
136 for (int i = 0; i < count; ++ i) {
139 e = Expressions[count - i - 1];
143 e.Compile (cmp, reverse);
147 public override void GetWidth (out int min, out int max) {
151 foreach (Expression e in Expressions) {
153 e.GetWidth (out a, out b);
155 if (max == Int32.MaxValue || b == Int32.MaxValue)
156 max = Int32.MaxValue;
162 public override AnchorInfo GetAnchorInfo (bool reverse)
165 int width = GetFixedWidth ();
167 ArrayList infos = new ArrayList ();
168 IntervalCollection segments = new IntervalCollection ();
170 // accumulate segments
172 int count = Expressions.Count;
173 for (int i = 0; i < count; ++ i) {
176 e = Expressions [count - i - 1];
180 AnchorInfo info = e.GetAnchorInfo (reverse);
184 return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
186 if (info.IsSubstring)
187 segments.Add (info.GetInterval (ptr));
189 if (info.IsUnknownWidth)
195 // normalize and find the longest segment
196 segments.Normalize ();
198 Interval longest = Interval.Empty;
199 foreach (Interval segment in segments) {
200 if (segment.Size > longest.Size)
205 return new AnchorInfo (this, width);
207 // now chain the substrings that made this segment together
212 for (int i = 0; i < infos.Count; ++i) {
213 AnchorInfo info = (AnchorInfo) infos [i];
215 if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
216 ignore |= info.IgnoreCase;
217 infos [n_strings ++] = info;
220 if (info.IsUnknownWidth)
226 StringBuilder sb = new StringBuilder ();
227 for (int i = 0; i < n_strings; ++i) {
230 info = (AnchorInfo) infos [n_strings - i - 1];
232 info = (AnchorInfo) infos [i];
233 sb.Append (info.Substring);
236 if (sb.Length == longest.Size)
237 return new AnchorInfo (this, longest.low, width, sb.ToString (), ignore);
238 // were the string segments overlapping?
239 if (sb.Length > longest.Size) {
240 Console.Error.WriteLine ("overlapping?");
241 return new AnchorInfo (this, width);
243 throw new SystemException ("Shouldn't happen");
247 class RegularExpression : Group {
248 public RegularExpression () {
252 public int GroupCount {
253 get { return group_count; }
254 set { group_count = value; }
257 public override void Compile (ICompiler cmp, bool reverse) {
261 GetWidth (out min, out max);
262 cmp.EmitInfo (group_count, min, max);
264 // anchoring expression
266 AnchorInfo info = GetAnchorInfo (reverse);
268 // info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
270 LinkRef pattern = cmp.NewLink ();
271 cmp.EmitAnchor (reverse, info.Offset, pattern);
274 cmp.EmitPosition (info.Position);
275 else if (info.IsSubstring)
276 cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
282 cmp.ResolveLink (pattern);
283 base.Compile (cmp, reverse);
287 private int group_count;
290 class CapturingGroup : Group, IComparable {
291 public CapturingGroup () {
303 set { name = value; }
306 public bool IsNamed {
307 get { return name != null; }
310 public override void Compile (ICompiler cmp, bool reverse) {
312 base.Compile (cmp, reverse);
316 public override bool IsComplex () {
320 public int CompareTo (object other)
322 return gid - ((CapturingGroup) other).gid;
329 class BalancingGroup : CapturingGroup {
330 public BalancingGroup () {
334 public CapturingGroup Balance {
335 get { return balance; }
336 set { balance = value; }
339 public override void Compile (ICompiler cmp, bool reverse) {
340 // can't invoke Group.Compile from here :(
341 // so I'll just repeat the code
343 LinkRef tail = cmp.NewLink ();
345 cmp.EmitBalanceStart (this.Index, balance.Index, this.IsNamed, tail);
347 int count = Expressions.Count;
348 for (int i = 0; i < count; ++ i) {
351 e = Expressions[count - i - 1];
355 e.Compile (cmp, reverse);
359 cmp.ResolveLink(tail);
362 private CapturingGroup balance;
365 class NonBacktrackingGroup : Group {
366 public NonBacktrackingGroup () {
369 public override void Compile (ICompiler cmp, bool reverse) {
370 LinkRef tail = cmp.NewLink ();
373 base.Compile (cmp, reverse);
375 cmp.ResolveLink (tail);
378 public override bool IsComplex () {
385 class Repetition : CompositeExpression {
386 public Repetition (int min, int max, bool lazy) {
387 Expressions.Add (null);
394 public Expression Expression {
395 get { return Expressions[0]; }
396 set { Expressions[0] = value; }
411 set { lazy = value; }
414 public override void Compile (ICompiler cmp, bool reverse) {
415 if (Expression.IsComplex ()) {
416 LinkRef until = cmp.NewLink ();
418 cmp.EmitRepeat (min, max, lazy, until);
419 Expression.Compile (cmp, reverse);
420 cmp.EmitUntil (until);
423 LinkRef tail = cmp.NewLink ();
425 cmp.EmitFastRepeat (min, max, lazy, tail);
426 Expression.Compile (cmp, reverse);
428 cmp.ResolveLink (tail);
432 public override void GetWidth (out int min, out int max) {
433 Expression.GetWidth (out min, out max);
434 min = min * this.min;
435 if (max == Int32.MaxValue || this.max == 0xffff)
436 max = Int32.MaxValue;
438 max = max * this.max;
441 public override AnchorInfo GetAnchorInfo (bool reverse) {
442 int width = GetFixedWidth ();
444 return new AnchorInfo (this, width);
446 AnchorInfo info = Expression.GetAnchorInfo (reverse);
448 return new AnchorInfo (this, info.Offset, width, info.Position);
450 if (info.IsSubstring) {
451 if (info.IsComplete) {
453 string str = info.Substring;
454 StringBuilder sb = new StringBuilder (str);
455 for (int i = 1; i < Minimum; ++ i)
458 return new AnchorInfo (this, 0, width, sb.ToString (), info.IgnoreCase);
461 return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
464 return new AnchorInfo (this, width);
467 private int min, max;
473 abstract class Assertion : CompositeExpression {
474 public Assertion () {
475 Expressions.Add (null); // true expression
476 Expressions.Add (null); // false expression
479 public Expression TrueExpression {
480 get { return Expressions[0]; }
481 set { Expressions[0] = value; }
484 public Expression FalseExpression {
485 get { return Expressions[1]; }
486 set { Expressions[1] = value; }
489 public override void GetWidth (out int min, out int max) {
490 GetWidth (out min, out max, 2);
492 if (TrueExpression == null || FalseExpression == null)
497 class CaptureAssertion : Assertion {
498 public CaptureAssertion (Literal l) {
502 public CapturingGroup CapturingGroup {
503 get { return group; }
504 set { group = value; }
507 public override void Compile (ICompiler cmp, bool reverse) {
509 Alternate.Compile (cmp, reverse);
513 int gid = group.Index;
514 LinkRef tail = cmp.NewLink ();
516 if (FalseExpression == null) {
521 cmp.EmitIfDefined (gid, tail);
522 TrueExpression.Compile (cmp, reverse);
531 LinkRef false_expr = cmp.NewLink ();
532 cmp.EmitIfDefined (gid, false_expr);
533 TrueExpression.Compile (cmp, reverse);
535 cmp.ResolveLink (false_expr);
536 FalseExpression.Compile (cmp, reverse);
539 cmp.ResolveLink (tail);
542 public override bool IsComplex () {
544 return Alternate.IsComplex ();
545 if (TrueExpression != null && TrueExpression.IsComplex ())
547 if (FalseExpression != null && FalseExpression.IsComplex ())
549 return GetFixedWidth () <= 0;
552 ExpressionAssertion Alternate {
554 if (alternate == null) {
555 alternate = new ExpressionAssertion ();
556 alternate.TrueExpression = TrueExpression;
557 alternate.FalseExpression = FalseExpression;
558 alternate.TestExpression = literal;
564 private ExpressionAssertion alternate;
565 private CapturingGroup group;
566 private Literal literal;
569 class ExpressionAssertion : Assertion {
570 public ExpressionAssertion () {
571 Expressions.Add (null); // test expression
574 public bool Reverse {
575 get { return reverse; }
576 set { reverse = value; }
580 get { return negate; }
581 set { negate = value; }
584 public Expression TestExpression {
585 get { return Expressions[2]; }
586 set { Expressions[2] = value; }
589 public override void Compile (ICompiler cmp, bool reverse) {
590 LinkRef true_expr = cmp.NewLink ();
591 LinkRef false_expr = cmp.NewLink ();
593 // test op: positive / negative
596 cmp.EmitTest (true_expr, false_expr);
598 cmp.EmitTest (false_expr, true_expr);
600 // test expression: lookahead / lookbehind
602 TestExpression.Compile (cmp, this.reverse);
605 // target expressions
607 if (TrueExpression == null) { // (?= ...)
613 cmp.ResolveLink (false_expr);
615 cmp.ResolveLink (true_expr);
618 cmp.ResolveLink (true_expr);
619 TrueExpression.Compile (cmp, reverse);
621 if (FalseExpression == null) { // (?(...) ...)
627 cmp.ResolveLink (false_expr);
629 else { // (?(...) ... | ...)
637 LinkRef tail = cmp.NewLink ();
640 cmp.ResolveLink (false_expr);
641 FalseExpression.Compile (cmp, reverse);
642 cmp.ResolveLink (tail);
647 public override bool IsComplex ()
652 private bool reverse, negate;
657 class Alternation : CompositeExpression {
658 public Alternation () {
661 public ExpressionCollection Alternatives {
662 get { return Expressions; }
665 public void AddAlternative (Expression e) {
666 Alternatives.Add (e);
669 public override void Compile (ICompiler cmp, bool reverse) {
670 // LinkRef next = cmp.NewLink ();
671 LinkRef tail = cmp.NewLink ();
673 foreach (Expression e in Alternatives) {
674 LinkRef next = cmp.NewLink ();
675 cmp.EmitBranch (next);
676 e.Compile (cmp, reverse);
678 cmp.ResolveLink (next);
683 cmp.ResolveLink (tail);
684 cmp.EmitAlternationEnd();
687 public override void GetWidth (out int min, out int max) {
688 GetWidth (out min, out max, Alternatives.Count);
692 // terminal expressions
694 class Literal : Expression {
695 public Literal (string str, bool ignore) {
697 this.ignore = ignore;
700 public string String {
705 public bool IgnoreCase {
706 get { return ignore; }
707 set { ignore = value; }
710 public static void CompileLiteral (string str, ICompiler cmp, bool ignore, bool reverse)
716 cmp.EmitCharacter (str[0], false, ignore, reverse);
718 cmp.EmitString (str, ignore, reverse);
721 public override void Compile (ICompiler cmp, bool reverse)
723 CompileLiteral (str, cmp, ignore, reverse);
726 public override void GetWidth (out int min, out int max) {
727 min = max = str.Length;
730 public override AnchorInfo GetAnchorInfo (bool reverse) {
731 return new AnchorInfo (this, 0, str.Length, str, ignore);
734 public override bool IsComplex () {
742 class PositionAssertion : Expression {
743 public PositionAssertion (Position pos) {
747 public Position Position {
752 public override void Compile (ICompiler cmp, bool reverse) {
753 cmp.EmitPosition (pos);
756 public override void GetWidth (out int min, out int max) {
760 public override bool IsComplex () {
764 public override AnchorInfo GetAnchorInfo (bool revers) {
766 case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
767 return new AnchorInfo (this, 0, 0, pos);
770 return new AnchorInfo (this, 0);
774 private Position pos;
777 class Reference : Expression {
778 public Reference (bool ignore) {
779 this.ignore = ignore;
782 public CapturingGroup CapturingGroup {
783 get { return group; }
784 set { group = value; }
787 public bool IgnoreCase {
788 get { return ignore; }
789 set { ignore = value; }
792 public override void Compile (ICompiler cmp, bool reverse) {
793 cmp.EmitReference (group.Index, ignore, reverse);
796 public override void GetWidth (out int min, out int max) {
797 //group.GetWidth (out min, out max);
798 // TODO set width to referenced group for non-cyclical references
800 max = Int32.MaxValue;
803 public override bool IsComplex () {
804 return true; // FIXME incorporate cyclic check
807 private CapturingGroup group;
811 class BackslashNumber : Reference {
815 public BackslashNumber (bool ignore, bool ecma)
821 // Precondition: groups [num_str] == null
822 public bool ResolveReference (string num_str, Hashtable groups)
826 for (int i = 1; i < num_str.Length; ++i) {
827 if (groups [num_str.Substring (0, i)] != null)
831 CapturingGroup = (CapturingGroup) groups [num_str.Substring (0, last_i)];
832 literal = num_str.Substring (last_i);
836 if (num_str.Length == 1)
841 int as_octal = Parser.ParseOctal (num_str, ref ptr);
842 // Since ParseOctal reads at most 3 digits, as_octal <= octal 0777
845 if (as_octal > 0xff && ecma) {
850 literal = ((char) as_octal) + num_str.Substring (ptr);
854 public override void Compile (ICompiler cmp, bool reverse)
856 if (CapturingGroup != null)
857 base.Compile (cmp, reverse);
859 Literal.CompileLiteral (literal, cmp, IgnoreCase, reverse);
863 class CharacterClass : Expression {
864 public CharacterClass (bool negate, bool ignore) {
865 this.negate = negate;
866 this.ignore = ignore;
868 intervals = new IntervalCollection ();
870 // initialize pos/neg category arrays
872 int cat_size = (int) Category.LastValue;
873 pos_cats = new BitArray (cat_size);
874 neg_cats = new BitArray (cat_size);
877 public CharacterClass (Category cat, bool negate) : this (false, false) {
878 this.AddCategory (cat, negate);
882 get { return negate; }
883 set { negate = value; }
886 public bool IgnoreCase {
887 get { return ignore; }
888 set { ignore = value; }
891 public void AddCategory (Category cat, bool negate) {
901 public void AddCharacter (char c) {
902 // TODO: this is certainly not the most efficient way of doing things
903 // TODO: but at least it produces correct results.
907 public void AddRange (char lo, char hi) {
908 Interval new_interval = new Interval (lo, hi);
910 // ignore case is on. we must make sure our interval does not
911 // use upper case. if it does, we must normalize the upper case
912 // characters into lower case.
914 if (upper_case_characters.Intersects (new_interval)) {
915 Interval partial_new_interval;
917 if (new_interval.low < upper_case_characters.low) {
918 partial_new_interval = new Interval (upper_case_characters.low + distance_between_upper_and_lower_case,
919 new_interval.high + distance_between_upper_and_lower_case);
920 new_interval.high = upper_case_characters.low - 1;
923 partial_new_interval = new Interval (new_interval.low + distance_between_upper_and_lower_case,
924 upper_case_characters.high + distance_between_upper_and_lower_case);
925 new_interval.low = upper_case_characters.high + 1;
927 intervals.Add (partial_new_interval);
929 else if (upper_case_characters.Contains (new_interval)) {
930 new_interval.high += distance_between_upper_and_lower_case;
931 new_interval.low += distance_between_upper_and_lower_case;
934 intervals.Add (new_interval);
937 public override void Compile (ICompiler cmp, bool reverse) {
938 // create the meta-collection
939 IntervalCollection meta =
940 intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
943 int count = meta.Count;
944 for (int i = 0; i < pos_cats.Length; ++ i) {
945 if (pos_cats[i] || neg_cats [i])
952 // emit in op for |meta| > 1
953 LinkRef tail = cmp.NewLink ();
957 // emit character/range/sets from meta-collection
958 // we emit these first so that any 'ignore' flags will be noticed by the evaluator
959 foreach (Interval a in meta) {
960 if (a.IsDiscontiguous) { // Set
961 BitArray bits = new BitArray (a.Size);
962 foreach (Interval b in intervals) {
963 if (a.Contains (b)) {
964 for (int i = b.low; i <= b.high; ++ i)
965 bits[i - a.low] = true;
969 cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
971 else if (a.IsSingleton) // Character
972 cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
974 cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
978 for (int i = 0; i < pos_cats.Length; ++ i) {
981 cmp.EmitCategory (Category.AnySingleline, negate, reverse);
983 cmp.EmitCategory ((Category)i, negate, reverse);
984 } else if (neg_cats[i]) {
985 cmp.EmitNotCategory ((Category)i, negate, reverse);
996 cmp.ResolveLink (tail);
1000 public override void GetWidth (out int min, out int max) {
1004 public override bool IsComplex () {
1010 private static double GetIntervalCost (Interval i) {
1011 // use op length as cost metric (=> optimize for space)
1013 if (i.IsDiscontiguous)
1014 return 3 + ((i.Size + 0xf) >> 4); // Set
1015 else if (i.IsSingleton)
1016 return 2; // Character
1021 private static Interval upper_case_characters = new Interval ((char)65, (char)90);
1022 private const int distance_between_upper_and_lower_case = 32;
1023 private bool negate, ignore;
1024 private BitArray pos_cats, neg_cats;
1025 private IntervalCollection intervals;
1029 private Expression expr;
1031 private Position pos;
1036 private bool ignore;
1038 public AnchorInfo (Expression expr, int width) {
1044 this.ignore = false;
1045 this.pos = Position.Any;
1048 public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
1050 this.offset = offset;
1053 this.str = ignore ? str.ToLower () : str;
1055 this.ignore = ignore;
1056 this.pos = Position.Any;
1059 public AnchorInfo (Expression expr, int offset, int width, Position pos) {
1061 this.offset = offset;
1067 this.ignore = false;
1070 public Expression Expression {
1071 get { return expr; }
1075 get { return offset; }
1079 get { return width; }
1083 get { return (str != null) ? str.Length : 0; }
1086 public bool IsUnknownWidth {
1087 get { return width < 0; }
1090 public bool IsComplete {
1091 get { return Length == Width; }
1094 public string Substring {
1098 public bool IgnoreCase {
1099 get { return ignore; }
1102 public Position Position {
1106 public bool IsSubstring {
1107 get { return str != null; }
1110 public bool IsPosition {
1111 get { return pos != Position.Any; }
1114 public Interval GetInterval () {
1115 return GetInterval (0);
1118 public Interval GetInterval (int start) {
1120 return Interval.Empty;
1122 return new Interval (start + Offset, start + Offset + Length - 1);