3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions.Syntax {
// Typed collection of Expression nodes layered over CollectionBase.
// Entries may be null: other classes in this file reserve slots with Add (null).
36 class ExpressionCollection : CollectionBase {
// Adds e to the collection. The body is not visible in this excerpt;
// presumably it forwards to the underlying List -- TODO confirm.
37 public void Add (Expression e) {
// Typed indexer: reads cast the untyped List entry to Expression,
// writes store the value directly into List.
41 public Expression this[int i] {
42 get { return (Expression)List[i]; }
43 set { List[i] = value; }
// Overrides the CollectionBase validation hook so that null entries are accepted.
46 protected override void OnValidate (object o) {
47 // allow null elements
// Abstract base class for every node of the parsed regular-expression tree.
53 abstract class Expression {
// Emits this node through the given compiler. Composite nodes in this file
// forward the same 'reverse' flag to their children and to the Emit* calls.
54 public abstract void Compile (ICompiler cmp, bool reverse);
// Reports the minimum and maximum width of this node through the out parameters.
// Int32.MaxValue is used elsewhere in this file as the "unbounded" maximum.
55 public abstract void GetWidth (out int min, out int max);
// Reduces GetWidth's (min, max) pair to a single width value.
// NOTE(review): the return statements are not visible here. Callers test the
// result with "<= 0" and AnchorInfo treats a negative width as unknown, so
// presumably a negative value signals "not fixed" -- confirm.
57 public int GetFixedWidth () {
59 GetWidth (out min, out max);
// Default anchor information: carries only this node and its fixed width
// (no substring, no position).
67 public virtual AnchorInfo GetAnchorInfo (bool reverse) {
68 return new AnchorInfo (this, GetFixedWidth ());
// Tells whether the node is "complex". Repetition.Compile uses the answer to
// choose between EmitRepeat/EmitUntil (complex) and EmitFastRepeat (otherwise).
71 public abstract bool IsComplex ();
74 // composite expressions
// Base class for nodes that own child expressions in an ExpressionCollection.
76 abstract class CompositeExpression : Expression {
// Starts with an empty child collection.
77 public CompositeExpression () {
78 expressions = new ExpressionCollection ();
// The child collection, exposed to subclasses only.
81 protected ExpressionCollection Expressions {
82 get { return expressions; }
// Helper for subclasses: queries the width of the first 'count' children.
// NOTE(review): how the per-child results (a, b) are folded into min/max is
// not visible in this excerpt -- consult the full source before relying on it.
85 protected void GetWidth (out int min, out int max, int count) {
90 for (int i = 0; i < count; ++ i) {
91 Expression e = Expressions[i];
97 e.GetWidth (out a, out b);
// Walks the children (the per-child test is not visible here) and otherwise
// falls back to: complex when the fixed width is <= 0.
107 public override bool IsComplex ()
109 foreach (Expression e in Expressions) {
113 return GetFixedWidth () <= 0;
// Backing store for Expressions.
116 private ExpressionCollection expressions;
// Composite node whose Compile emits each of its children in turn.
121 class Group : CompositeExpression {
// Shortcut for the first child (index 0).
125 public Expression Expression {
126 get { return Expressions[0]; }
127 set { Expressions[0] = value; }
// Adds e to the group's children. Body not visible in this excerpt;
// presumably Expressions.Add (e) -- TODO confirm.
130 public void AppendExpression (Expression e) {
// Compiles every child, forwarding 'reverse' unchanged.
134 public override void Compile (ICompiler cmp, bool reverse) {
135 int count = Expressions.Count;
136 for (int i = 0; i < count; ++ i) {
// This branch walks the children back to front (index count - i - 1).
// Presumably it is the reverse == true case; the condition is not visible -- confirm.
139 e = Expressions[count - i - 1];
143 e.Compile (cmp, reverse);
// Queries the width of every child. Only the saturation of max is visible here.
147 public override void GetWidth (out int min, out int max) {
151 foreach (Expression e in Expressions) {
153 e.GetWidth (out a, out b);
// An unbounded child (Int32.MaxValue) makes the whole group unbounded.
155 if (max == Int32.MaxValue || b == Int32.MaxValue)
156 max = Int32.MaxValue;
// Derives anchor information for the group from its children:
//  - a child position anchor is returned re-based by 'ptr';
//  - otherwise child substrings are collected as intervals, the longest
//    normalized interval is selected, and the substrings inside it are
//    concatenated into a single anchor string.
// NOTE(review): 'ptr', 'ignore' and 'n_strings' are declared and updated on
// lines that are not visible in this excerpt.
162 public override AnchorInfo GetAnchorInfo (bool reverse)
165 int width = GetFixedWidth ();
167 ArrayList infos = new ArrayList ();
168 IntervalCollection segments = new IntervalCollection ();
170 // accumulate segments
172 int count = Expressions.Count;
173 for (int i = 0; i < count; ++ i) {
// Back-to-front child selection, mirroring Compile.
176 e = Expressions [count - i - 1];
180 AnchorInfo info = e.GetAnchorInfo (reverse);
// A child position anchor becomes the group's anchor, offset by 'ptr'.
184 return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
186 if (info.IsSubstring)
187 segments.Add (info.GetInterval (ptr));
189 if (info.IsUnknownWidth)
195 // normalize and find the longest segment
196 segments.Normalize ();
198 Interval longest = Interval.Empty;
199 foreach (Interval segment in segments) {
200 if (segment.Size > longest.Size)
// Fallback result: width-only anchor info (no substring, no position).
205 return new AnchorInfo (this, width);
207 // now chain the substrings that made this segment together
212 for (int i = 0; i < infos.Count; ++i) {
213 AnchorInfo info = (AnchorInfo) infos [i];
215 if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
// The combined anchor ignores case as soon as any contributing piece does.
216 ignore |= info.IgnoreCase;
// Compacts the selected infos to the front of the list, in place.
217 infos [n_strings ++] = info;
220 if (info.IsUnknownWidth)
226 StringBuilder sb = new StringBuilder ();
227 for (int i = 0; i < n_strings; ++i) {
// The selected pieces are appended either back to front or front to back;
// the condition choosing between the two is not visible here.
230 info = (AnchorInfo) infos [n_strings - i - 1];
232 info = (AnchorInfo) infos [i];
233 sb.Append (info.Substring);
// The pieces tile the longest interval exactly: return the chained substring.
236 if (sb.Length == longest.Size)
237 return new AnchorInfo (this, longest.low, width, sb.ToString (), ignore);
238 // were the string segments overlapping?
239 if (sb.Length > longest.Size) {
// NOTE(review): diagnostic written to stderr from library code; the method
// then degrades to a width-only anchor.
240 Console.Error.WriteLine ("overlapping?");
241 return new AnchorInfo (this, width);
// NOTE(review): throws the general SystemException type; a more specific
// exception type would be clearer to callers.
243 throw new SystemException ("Shouldn't happen");
// Root group of a pattern: emits the info and anchor prologue, then the group body.
247 class RegularExpression : Group {
248 public RegularExpression () {
// Value passed to EmitInfo as the first argument when compiling.
252 public int GroupCount {
253 get { return group_count; }
254 set { group_count = value; }
// Emits, in order: the info record (group count and min/max width), the
// anchor built from GetAnchorInfo, and finally the group body via base.Compile.
257 public override void Compile (ICompiler cmp, bool reverse) {
261 GetWidth (out min, out max);
262 cmp.EmitInfo (group_count, min, max);
264 // anchoring expression
266 AnchorInfo info = GetAnchorInfo (reverse);
268 // info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
// 'pattern' is resolved just before the body is emitted (see ResolveLink below).
270 LinkRef pattern = cmp.NewLink ();
271 cmp.EmitAnchor (reverse, info.Offset, pattern);
// The anchor payload is either a position or a substring; the condition
// guarding EmitPosition is not visible here (presumably info.IsPosition).
274 cmp.EmitPosition (info.Position);
275 else if (info.IsSubstring)
276 cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
282 cmp.ResolveLink (pattern);
283 base.Compile (cmp, reverse);
// Backing field for GroupCount.
287 private int group_count;
// Group that can carry a name; BalancingGroup derives from it and other nodes
// in this file read its Number.
// NOTE(review): most members (including Number and the Name getter) are on
// lines that are not visible in this excerpt.
290 class CapturingGroup : Group {
291 public CapturingGroup () {
// Setter of the name property (header and getter not visible here).
303 set { name = value; }
// True when a name has been assigned (name is non-null).
306 public bool IsNamed {
307 get { return name != null; }
// Compiles the group body through Group.Compile; any code emitted around
// that call is not visible in this excerpt.
310 public override void Compile (ICompiler cmp, bool reverse) {
312 base.Compile (cmp, reverse);
// Body not visible in this excerpt.
316 public override bool IsComplex () {
// Capturing group that additionally refers to a second group ('Balance'),
// whose number is passed to EmitBalanceStart.
324 class BalancingGroup : CapturingGroup {
325 public BalancingGroup () {
// The group whose Number is emitted as the balance target.
329 public CapturingGroup Balance {
330 get { return balance; }
331 set { balance = value; }
// Emits the balance-start op, then the children (same loop as Group.Compile),
// then resolves the 'tail' link.
334 public override void Compile (ICompiler cmp, bool reverse) {
335 // can't invoke Group.Compile from here :(
336 // so I'll just repeat the code
338 LinkRef tail = cmp.NewLink ();
// NOTE(review): dereferences 'balance' without a null check; assumes Balance
// was assigned before Compile -- confirm against the parser.
340 cmp.EmitBalanceStart (this.Number, balance.Number, this.IsNamed, tail);
342 int count = Expressions.Count;
343 for (int i = 0; i < count; ++ i) {
// Back-to-front child selection; the guarding condition is not visible here.
346 e = Expressions[count - i - 1];
350 e.Compile (cmp, reverse);
354 cmp.ResolveLink(tail);
// Backing field for Balance.
357 private CapturingGroup balance;
// Group whose compiled body is bracketed by a link that is resolved right
// after the body.
360 class NonBacktrackingGroup : Group {
361 public NonBacktrackingGroup () {
// Allocates 'tail', compiles the children via Group.Compile, then resolves
// 'tail'. The ops emitted around base.Compile are not visible in this excerpt.
364 public override void Compile (ICompiler cmp, bool reverse) {
365 LinkRef tail = cmp.NewLink ();
368 base.Compile (cmp, reverse);
370 cmp.ResolveLink (tail);
// Body not visible in this excerpt.
373 public override bool IsComplex () {
// Quantifier node: repeats its single child between min and max times,
// optionally lazily.
380 class Repetition : CompositeExpression {
// Reserves slot 0 for the repeated expression; it is filled in later through
// the Expression property.
381 public Repetition (int min, int max, bool lazy) {
382 Expressions.Add (null);
// The repeated expression (child slot 0).
389 public Expression Expression {
390 get { return Expressions[0]; }
391 set { Expressions[0] = value; }
// Setter of the laziness property (header and getter not visible here).
406 set { lazy = value; }
// A complex child is wrapped in EmitRepeat ... EmitUntil; otherwise
// EmitFastRepeat is used and its 'tail' link is resolved after the child.
409 public override void Compile (ICompiler cmp, bool reverse) {
410 if (Expression.IsComplex ()) {
411 LinkRef until = cmp.NewLink ();
413 cmp.EmitRepeat (min, max, lazy, until);
414 Expression.Compile (cmp, reverse);
415 cmp.EmitUntil (until);
418 LinkRef tail = cmp.NewLink ();
420 cmp.EmitFastRepeat (min, max, lazy, tail);
421 Expression.Compile (cmp, reverse);
423 cmp.ResolveLink (tail);
// Width is the child's width scaled by the repeat counts.
427 public override void GetWidth (out int min, out int max) {
428 Expression.GetWidth (out min, out max);
// NOTE(review): plain int multiplication; overflow is not checked here.
429 min = min * this.min;
// A repeat maximum of 0xffff is treated like an unbounded child:
// the resulting maximum becomes Int32.MaxValue.
430 if (max == Int32.MaxValue || this.max == 0xffff)
431 max = Int32.MaxValue;
433 max = max * this.max;
// Anchor info for the repetition, derived from the child's anchor info.
// Several guarding conditions are on lines not visible in this excerpt.
436 public override AnchorInfo GetAnchorInfo (bool reverse) {
437 int width = GetFixedWidth ();
439 return new AnchorInfo (this, width);
441 AnchorInfo info = Expression.GetAnchorInfo (reverse);
// Position anchors of the child are passed through with this node's width.
443 return new AnchorInfo (this, info.Offset, width, info.Position);
445 if (info.IsSubstring) {
446 if (info.IsComplete) {
// The child is entirely a literal string: build the anchor string starting
// from one copy; the loop runs Minimum - 1 times (its body is not visible
// here, presumably appending 'str' each time -- confirm).
448 string str = info.Substring;
449 StringBuilder sb = new StringBuilder (str);
450 for (int i = 1; i < Minimum; ++ i)
453 return new AnchorInfo (this, 0, width, sb.ToString (), info.IgnoreCase);
// Otherwise reuse the child's substring and offset as-is.
456 return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
459 return new AnchorInfo (this, width);
// Repeat bounds; 'lazy' is declared on a line not visible here.
462 private int min, max;
// Base class for conditional nodes with a "true" branch (slot 0) and a
// "false" branch (slot 1); either slot may stay null.
468 abstract class Assertion : CompositeExpression {
// Reserves both branch slots up front so the indexed accessors below are valid.
469 public Assertion () {
470 Expressions.Add (null); // true expression
471 Expressions.Add (null); // false expression
// Branch stored in child slot 0.
474 public Expression TrueExpression {
475 get { return Expressions[0]; }
476 set { Expressions[0] = value; }
// Branch stored in child slot 1.
479 public Expression FalseExpression {
480 get { return Expressions[1]; }
481 set { Expressions[1] = value; }
// Width over the two branch slots, computed by the CompositeExpression helper.
// The adjustment applied when a branch is missing is not visible in this excerpt.
484 public override void GetWidth (out int min, out int max) {
485 GetWidth (out min, out max, 2);
487 if (TrueExpression == null || FalseExpression == null)
// Conditional that tests a capturing group (EmitIfDefined on its Number).
// It can also delegate to an equivalent ExpressionAssertion ('Alternate')
// whose test expression is the stored literal.
492 class CaptureAssertion : Assertion {
// Constructor body not visible in this excerpt; presumably stores 'l' in
// 'literal' -- TODO confirm.
493 public CaptureAssertion (Literal l) {
// The group whose Number is tested by EmitIfDefined.
497 public CapturingGroup CapturingGroup {
498 get { return group; }
499 set { group = value; }
// Either delegates to Alternate (condition not visible; presumably when no
// group is set), or emits IfDefined on the group followed by the branch(es).
502 public override void Compile (ICompiler cmp, bool reverse) {
504 Alternate.Compile (cmp, reverse);
508 int gid = group.Number;
509 LinkRef tail = cmp.NewLink ();
511 if (FalseExpression == null) {
// No false branch: the link handed to IfDefined is 'tail' itself.
516 cmp.EmitIfDefined (gid, tail);
517 TrueExpression.Compile (cmp, reverse);
// Both branches: IfDefined is given 'false_expr', which is resolved just
// before the false branch is emitted.
526 LinkRef false_expr = cmp.NewLink ();
527 cmp.EmitIfDefined (gid, false_expr);
528 TrueExpression.Compile (cmp, reverse);
530 cmp.ResolveLink (false_expr);
531 FalseExpression.Compile (cmp, reverse);
534 cmp.ResolveLink (tail);
// Delegates to Alternate in one case (condition not visible here); otherwise
// checks each non-null branch and falls back to fixed width <= 0.
537 public override bool IsComplex () {
539 return Alternate.IsComplex ();
540 if (TrueExpression != null && TrueExpression.IsComplex ())
542 if (FalseExpression != null && FalseExpression.IsComplex ())
544 return GetFixedWidth () <= 0;
// Lazily built ExpressionAssertion with the same branches and the literal as
// its test expression. It is created once and cached in 'alternate'.
547 ExpressionAssertion Alternate {
549 if (alternate == null) {
550 alternate = new ExpressionAssertion ();
551 alternate.TrueExpression = TrueExpression;
552 alternate.FalseExpression = FalseExpression;
553 alternate.TestExpression = literal;
// Cache for Alternate.
559 private ExpressionAssertion alternate;
// Backing field for CapturingGroup.
560 private CapturingGroup group;
// Literal used as the test expression of Alternate.
561 private Literal literal;
// Assertion driven by a test expression (child slot 2): lookahead/lookbehind
// and expression-conditional constructs, per the comments in Compile.
564 class ExpressionAssertion : Assertion {
// Adds a third child slot after the two branch slots created by Assertion.
565 public ExpressionAssertion () {
566 Expressions.Add (null); // test expression
// Direction in which the test expression itself is compiled (see Compile).
569 public bool Reverse {
570 get { return reverse; }
571 set { reverse = value; }
// Accessors for 'negate' (property header not visible in this excerpt).
575 get { return negate; }
576 set { negate = value; }
// The expression being tested (child slot 2).
579 public Expression TestExpression {
580 get { return Expressions[2]; }
581 set { Expressions[2] = value; }
// Emits the test op, then the test expression, then the branch targets.
// Note the two 'reverse' values: the parameter is used for the branches,
// while the field (this.reverse) is used for the test expression.
584 public override void Compile (ICompiler cmp, bool reverse) {
585 LinkRef true_expr = cmp.NewLink ();
586 LinkRef false_expr = cmp.NewLink ();
588 // test op: positive / negative
// The two EmitTest calls differ only in the order of the link arguments.
// The selecting condition is not visible here (presumably 'negate').
591 cmp.EmitTest (true_expr, false_expr);
593 cmp.EmitTest (false_expr, true_expr);
595 // test expression: lookahead / lookbehind
597 TestExpression.Compile (cmp, this.reverse);
600 // target expressions
602 if (TrueExpression == null) { // (?= ...)
608 cmp.ResolveLink (false_expr);
610 cmp.ResolveLink (true_expr);
613 cmp.ResolveLink (true_expr);
614 TrueExpression.Compile (cmp, reverse);
616 if (FalseExpression == null) { // (?(...) ...)
622 cmp.ResolveLink (false_expr);
624 else { // (?(...) ... | ...)
632 LinkRef tail = cmp.NewLink ();
635 cmp.ResolveLink (false_expr);
636 FalseExpression.Compile (cmp, reverse);
637 cmp.ResolveLink (tail);
// Body not visible in this excerpt.
642 public override bool IsComplex ()
// Backing fields for Reverse and the negate property.
647 private bool reverse, negate;
// Choice between several alternative expressions.
652 class Alternation : CompositeExpression {
653 public Alternation () {
// The alternatives are simply the composite's child collection.
656 public ExpressionCollection Alternatives {
657 get { return Expressions; }
// Appends one more alternative.
660 public void AddAlternative (Expression e) {
661 Alternatives.Add (e);
// For each alternative: emit a branch op carrying a fresh 'next' link, compile
// the alternative, then resolve 'next'. After the loop, 'tail' is resolved
// and the alternation end is emitted.
// NOTE(review): a statement between the alternative and ResolveLink (next) is
// not visible in this excerpt (presumably a jump to 'tail').
664 public override void Compile (ICompiler cmp, bool reverse) {
665 // LinkRef next = cmp.NewLink ();
666 LinkRef tail = cmp.NewLink ();
668 foreach (Expression e in Alternatives) {
669 LinkRef next = cmp.NewLink ();
670 cmp.EmitBranch (next);
671 e.Compile (cmp, reverse);
673 cmp.ResolveLink (next);
678 cmp.ResolveLink (tail);
679 cmp.EmitAlternationEnd();
// Width over all alternatives, computed by the CompositeExpression helper.
682 public override void GetWidth (out int min, out int max) {
683 GetWidth (out min, out max, Alternatives.Count);
687 // terminal expressions
// Terminal node for a literal string, optionally matched case-insensitively.
689 class Literal : Expression {
// Stores the case flag; the assignment of 'str' is on a line not visible here.
690 public Literal (string str, bool ignore) {
692 this.ignore = ignore;
// Accessor for the literal text (body not visible in this excerpt).
695 public string String {
// Whether the literal is matched ignoring case.
700 public bool IgnoreCase {
701 get { return ignore; }
702 set { ignore = value; }
// Emits either a single character op (using str[0]) or a string op. The
// selecting condition is not visible here (presumably the string's length).
705 public override void Compile (ICompiler cmp, bool reverse) {
710 cmp.EmitCharacter (str[0], false, ignore, reverse);
712 cmp.EmitString (str, ignore, reverse);
// A literal has a fixed width: its length.
715 public override void GetWidth (out int min, out int max) {
716 min = max = str.Length;
// The whole literal is its own anchor substring at offset 0.
719 public override AnchorInfo GetAnchorInfo (bool reverse) {
720 return new AnchorInfo (this, 0, str.Length, str, ignore);
// Body not visible in this excerpt.
723 public override bool IsComplex () {
// Terminal node for a position test; it emits a position op.
731 class PositionAssertion : Expression {
// Constructor body not visible in this excerpt; presumably stores 'pos'.
732 public PositionAssertion (Position pos) {
// Accessor for the tested position (body not visible in this excerpt).
736 public Position Position {
// Emits the position op; 'reverse' is not used by the visible code.
741 public override void Compile (ICompiler cmp, bool reverse) {
742 cmp.EmitPosition (pos);
// Body not visible in this excerpt.
745 public override void GetWidth (out int min, out int max) {
// Body not visible in this excerpt.
749 public override bool IsComplex () {
// Start-of-string, start-of-line and start-of-scan positions are reported as
// position anchors at offset 0 with width 0. The other visible return is a
// plain width-0 anchor info.
// NOTE(review): the parameter is spelled 'revers' (unused in the visible code).
753 public override AnchorInfo GetAnchorInfo (bool revers) {
755 case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
756 return new AnchorInfo (this, 0, 0, pos);
759 return new AnchorInfo (this, 0);
// The position being asserted.
763 private Position pos;
// Terminal node for a reference to a capturing group (emitted via EmitReference).
766 class Reference : Expression {
// The referenced group is assigned later through the CapturingGroup property.
767 public Reference (bool ignore) {
768 this.ignore = ignore;
// The referenced group; its Number is what gets emitted.
771 public CapturingGroup CapturingGroup {
772 get { return group; }
773 set { group = value; }
// Whether the reference is matched ignoring case.
776 public bool IgnoreCase {
777 get { return ignore; }
778 set { ignore = value; }
// NOTE(review): dereferences 'group' without a null check; assumes
// CapturingGroup was assigned before Compile.
781 public override void Compile (ICompiler cmp, bool reverse) {
782 cmp.EmitReference (group.Number, ignore, reverse);
// The maximum width is reported as unbounded; the assignment of 'min' is on a
// line not visible here.
785 public override void GetWidth (out int min, out int max) {
786 //group.GetWidth (out min, out max);
787 // TODO set width to referenced group for non-cyclical references
789 max = Int32.MaxValue;
// References are always treated as complex.
792 public override bool IsComplex () {
793 return true; // FIXME incorporate cyclic check
// Backing field for CapturingGroup; 'ignore' is declared on a line not visible here.
796 private CapturingGroup group;
// Terminal node for a character class: a set of character intervals plus
// positive and negated categories, with class-level negate and ignore-case flags.
800 class CharacterClass : Expression {
// Creates an empty class with the given flags.
801 public CharacterClass (bool negate, bool ignore) {
802 this.negate = negate;
803 this.ignore = ignore;
805 intervals = new IntervalCollection ();
807 // initialize pos/neg category arrays
// One bit per category value below Category.LastValue, in each array.
809 int cat_size = (int) Category.LastValue;
810 pos_cats = new BitArray (cat_size);
811 neg_cats = new BitArray (cat_size);
// Convenience constructor for a class made of a single category. The class
// itself is created non-negated and case-sensitive; 'negate' is passed on to
// AddCategory instead.
814 public CharacterClass (Category cat, bool negate) : this (false, false) {
815 this.AddCategory (cat, negate);
// Accessors for the class-level 'negate' flag (property header not visible
// in this excerpt).
819 get { return negate; }
820 set { negate = value; }
// Whether the class is matched ignoring case.
823 public bool IgnoreCase {
824 get { return ignore; }
825 set { ignore = value; }
// Body not visible in this excerpt; presumably records 'cat' in pos_cats or
// neg_cats depending on 'negate' -- TODO confirm.
828 public void AddCategory (Category cat, bool negate) {
// Body not visible in this excerpt apart from the TODO below.
838 public void AddCharacter (char c) {
839 // TODO: this is certainly not the most efficient way of doing things
840 // TODO: but at least it produces correct results.
// Adds the range [lo, hi] to the class. The visible code rewrites the part of
// the range that falls into 'A'..'Z' to the corresponding lower-case range
// before storing it.
// NOTE(review): the guard for this block (presumably "if (ignore)") is not
// visible here. Only ASCII upper case (65..90) is normalized.
// NOTE(review): the split logic assumes Intersects means a partial overlap on
// exactly one side -- verify against Interval.Intersects.
844 public void AddRange (char lo, char hi) {
845 Interval new_interval = new Interval (lo, hi);
847 // ignore case is on. we must make sure our interval does not
848 // use upper case. if it does, we must normalize the upper case
849 // characters into lower case.
851 if (upper_case_characters.Intersects (new_interval)) {
852 Interval partial_new_interval;
// Range starts below 'A': the upper-case tail is shifted to lower case and
// the original range is cut off just before 'A'.
854 if (new_interval.low < upper_case_characters.low) {
855 partial_new_interval = new Interval (upper_case_characters.low + distance_between_upper_and_lower_case,
856 new_interval.high + distance_between_upper_and_lower_case);
857 new_interval.high = upper_case_characters.low - 1;
// Otherwise: the upper-case head is shifted to lower case and the original
// range is made to start just after 'Z'.
860 partial_new_interval = new Interval (new_interval.low + distance_between_upper_and_lower_case,
861 upper_case_characters.high + distance_between_upper_and_lower_case);
862 new_interval.low = upper_case_characters.high + 1;
864 intervals.Add (partial_new_interval);
// Range lies entirely inside 'A'..'Z': shift the whole range to lower case.
866 else if (upper_case_characters.Contains (new_interval)) {
867 new_interval.high += distance_between_upper_and_lower_case;
868 new_interval.low += distance_between_upper_and_lower_case;
871 intervals.Add (new_interval);
// Emits the class: first the character/range/set ops built from the intervals,
// then one category op per set bit in pos_cats / neg_cats.
874 public override void Compile (ICompiler cmp, bool reverse) {
875 // create the meta-collection
// The intervals are regrouped using GetIntervalCost as the cost function.
876 IntervalCollection meta =
877 intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
// 'count' starts at the number of meta intervals; the loop below visits every
// category bit that is set (the update of 'count' is not visible here).
880 int count = meta.Count;
881 for (int i = 0; i < pos_cats.Length; ++ i) {
882 if (pos_cats[i] || neg_cats [i])
889 // emit in op for |meta| > 1
890 LinkRef tail = cmp.NewLink ();
894 // emit character/range/sets from meta-collection
895 // we emit these first so that any 'ignore' flags will be noticed by the evaluator
896 foreach (Interval a in meta) {
897 if (a.IsDiscontiguous) { // Set
// Build a bitmap relative to a.low, marking every character of each original
// interval that lies inside this meta interval.
898 BitArray bits = new BitArray (a.Size);
899 foreach (Interval b in intervals) {
900 if (a.Contains (b)) {
901 for (int i = b.low; i <= b.high; ++ i)
902 bits[i - a.low] = true;
906 cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
908 else if (a.IsSingleton) // Character
909 cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
911 cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
// Category ops: positive categories use EmitCategory and negated ones use
// EmitNotCategory. One case emits Category.AnySingleline instead of the
// indexed category; its condition is not visible in this excerpt.
915 for (int i = 0; i < pos_cats.Length; ++ i) {
918 cmp.EmitCategory (Category.AnySingleline, negate, reverse);
920 cmp.EmitCategory ((Category)i, negate, reverse);
921 } else if (neg_cats[i]) {
922 cmp.EmitNotCategory ((Category)i, negate, reverse);
933 cmp.ResolveLink (tail);
// Body not visible in this excerpt.
937 public override void GetWidth (out int min, out int max) {
// Body not visible in this excerpt.
941 public override bool IsComplex () {
// Cost function handed to GetMetaCollection.
947 private static double GetIntervalCost (Interval i) {
948 // use op length as cost metric (=> optimize for space)
// Set: 3 plus one unit per started group of 16 characters in the interval.
950 if (i.IsDiscontiguous)
951 return 3 + ((i.Size + 0xf) >> 4); // Set
952 else if (i.IsSingleton)
953 return 2; // Character
// ASCII 'A' (65) .. 'Z' (90).
958 private static Interval upper_case_characters = new Interval ((char)65, (char)90);
// Offset from an ASCII upper-case letter to its lower-case counterpart.
959 private const int distance_between_upper_and_lower_case = 32;
960 private bool negate, ignore;
// Bitmaps of positive and negated categories, indexed by (int) Category.
961 private BitArray pos_cats, neg_cats;
// Character intervals that belong to the class.
962 private IntervalCollection intervals;
// Members of AnchorInfo (the class header is not visible in this excerpt).
// An instance describes what is known about where an expression can match:
// an optional literal substring, an optional position, an offset and a width.
966 private Expression expr;
968 private Position pos;
// Width-only info: no position (Position.Any). The remaining assignments are
// on lines not visible here.
975 public AnchorInfo (Expression expr, int width) {
982 this.pos = Position.Any;
// Substring info: the expression is known to contain 'str' at 'offset'.
985 public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
987 this.offset = offset;
// For case-insensitive anchors the substring is stored in lower case.
// NOTE(review): String.ToLower () uses the current culture; confirm this is
// consistent with how the matcher folds case.
990 this.str = ignore ? str.ToLower () : str;
992 this.ignore = ignore;
993 this.pos = Position.Any;
// Position info: the expression is anchored at position 'pos'; never ignore-case.
996 public AnchorInfo (Expression expr, int offset, int width, Position pos) {
998 this.offset = offset;
1004 this.ignore = false;
// The expression this info was computed for.
1007 public Expression Expression {
1008 get { return expr; }
// Getter of the offset property (header not visible here).
1012 get { return offset; }
// Getter of the width property (header not visible here).
1016 get { return width; }
// Length of the substring, or 0 when there is none (property header not
// visible here; used below as 'Length').
1020 get { return (str != null) ? str.Length : 0; }
// A negative width means the width is not known.
1023 public bool IsUnknownWidth {
1024 get { return width < 0; }
// True when the substring spans the expression's whole width.
1027 public bool IsComplete {
1028 get { return Length == Width; }
// Accessor for the stored substring (body not visible in this excerpt).
1031 public string Substring {
1035 public bool IgnoreCase {
1036 get { return ignore; }
// Accessor for the stored position (body not visible in this excerpt).
1039 public Position Position {
// True when a substring is present.
1043 public bool IsSubstring {
1044 get { return str != null; }
// True when a specific position (anything but Position.Any) is present.
1047 public bool IsPosition {
1048 get { return pos != Position.Any; }
// Interval covered by the substring, relative to start 0.
1051 public Interval GetInterval () {
1052 return GetInterval (0);
// Interval covered by the substring when this expression begins at 'start':
// [start + Offset, start + Offset + Length - 1]. Interval.Empty is returned in
// a case whose condition is not visible here (presumably no substring).
1055 public Interval GetInterval (int start) {
1057 return Interval.Empty;
1059 return new Interval (start + Offset, start + Offset + Length - 1);