3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions.Syntax {
// Typed collection of Expression nodes layered on CollectionBase.
// NOTE(review): several lines of this class are not visible in this view.
36 class ExpressionCollection : CollectionBase {
// Adds an expression to the collection (body not visible here).
37 public void Add (Expression e) {
// Typed indexer over the underlying List: reads cast the stored object to
// Expression, writes store the value at the same index.
41 public Expression this[int i] {
42 get { return (Expression)List[i]; }
43 set { List[i] = value; }
// Overrides the CollectionBase validation hook; per the original comment
// below, the intent is to let null elements through.
46 protected override void OnValidate (object o) {
47 // allow null elements
// Abstract base class for the syntax-tree nodes declared in this file.
53 abstract class Expression {
// Emits this node through the supplied compiler. 'reverse' is forwarded
// unchanged by callers (presumably right-to-left matching -- confirm).
54 public abstract void Compile (ICompiler cmp, bool reverse);
// Reports this node's width bounds through the two out parameters.
55 public abstract void GetWidth (out int min, out int max);
// Convenience wrapper around GetWidth.
// NOTE(review): the return logic is not visible here; callers in this file
// test the result with "<= 0" and AnchorInfo treats a negative width as
// unknown, so this presumably returns a negative value when min != max.
57 public int GetFixedWidth () {
59 GetWidth (out min, out max);
// Default anchor information: constructed from this node and its fixed
// width only. Subclasses override this to supply more.
67 public virtual AnchorInfo GetAnchorInfo (bool reverse) {
68 return new AnchorInfo (this, GetFixedWidth ());
// Implemented by each subclass; Repetition.Compile uses the result to choose
// between EmitRepeat/EmitUntil and EmitFastRepeat.
71 public abstract bool IsComplex ();
74 // composite expressions
// Base class for nodes that own child expressions, held in a private
// ExpressionCollection created by the constructor.
76 abstract class CompositeExpression : Expression {
77 public CompositeExpression () {
78 expressions = new ExpressionCollection ();
// Child expressions, exposed to subclasses only.
81 protected ExpressionCollection Expressions {
82 get { return expressions; }
// Width helper for subclasses: queries GetWidth on the first 'count'
// children.
// NOTE(review): the lines that combine the per-child results into min/max
// are not visible here.
85 protected void GetWidth (out int min, out int max, int count) {
90 for (int i = 0; i < count; ++ i) {
91 Expression e = Expressions[i];
97 e.GetWidth (out a, out b);
// Iterates over the children (loop body not visible) and otherwise reports
// the node as complex when GetFixedWidth () is zero or negative.
107 public override bool IsComplex ()
109 foreach (Expression e in Expressions) {
113 return GetFixedWidth () <= 0;
116 private ExpressionCollection expressions;
// Composite node whose children are compiled one after another.
// NOTE(review): many lines of this class are not visible in this view; the
// comments below describe only the statements that are shown.
121 class Group : CompositeExpression {
// Shortcut for the first child, Expressions[0].
125 public Expression Expression {
126 get { return Expressions[0]; }
127 set { Expressions[0] = value; }
// Adds another child expression (body not visible here).
130 public void AppendExpression (Expression e) {
// Compiles every child with the same compiler and 'reverse' flag. One
// visible path picks children from the end (index count - i - 1),
// presumably when 'reverse' is set -- the selecting condition is not visible.
134 public override void Compile (ICompiler cmp, bool reverse) {
135 int count = Expressions.Count;
136 for (int i = 0; i < count; ++ i) {
139 e = Expressions[count - i - 1];
143 e.Compile (cmp, reverse);
// Queries each child's width. Int32.MaxValue acts as the "unbounded"
// marker: once the running max or a child's max equals it, max stays there.
147 public override void GetWidth (out int min, out int max) {
151 foreach (Expression e in Expressions) {
153 e.GetWidth (out a, out b);
155 if (max == Int32.MaxValue || b == Int32.MaxValue)
156 max = Int32.MaxValue;
// Builds anchor information for the whole group from its children:
//  1. collect each child's AnchorInfo, returning early with a position
//     anchor on one path and recording substring intervals in 'segments';
//  2. normalize the intervals and pick the largest one;
//  3. concatenate the substrings that fall inside that interval.
// Where none of the visible returns applies, the method throws.
162 public override AnchorInfo GetAnchorInfo (bool reverse)
165 int width = GetFixedWidth ();
167 ArrayList infos = new ArrayList ();
168 IntervalCollection segments = new IntervalCollection ();
170 // accumulate segments
172 int count = Expressions.Count;
173 for (int i = 0; i < count; ++ i) {
176 e = Expressions [count - i - 1];
180 AnchorInfo info = e.GetAnchorInfo (reverse);
184 return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
186 if (info.IsSubstring)
187 segments.Add (info.GetInterval (ptr));
189 if (info.IsUnknownWidth)
195 // normalize and find the longest segment
196 segments.Normalize ();
198 Interval longest = Interval.Empty;
199 foreach (Interval segment in segments) {
200 if (segment.Size > longest.Size)
205 return new AnchorInfo (this, width);
207 // now chain the substrings that made this segment together
// Substring infos lying inside 'longest' are compacted to the front of
// 'infos' (n_strings counts them); 'ignore' becomes true if any of them is
// case-insensitive.
212 for (int i = 0; i < infos.Count; ++i) {
213 AnchorInfo info = (AnchorInfo) infos [i];
215 if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
216 ignore |= info.IgnoreCase;
217 infos [n_strings ++] = info;
220 if (info.IsUnknownWidth)
// Concatenate the kept substrings; the two visible assignments read them in
// opposite orders (the condition choosing between them is not visible).
226 StringBuilder sb = new StringBuilder ();
227 for (int i = 0; i < n_strings; ++i) {
230 info = (AnchorInfo) infos [n_strings - i - 1];
232 info = (AnchorInfo) infos [i];
233 sb.Append (info.Substring);
// An exact length match yields a substring anchor starting at longest.low.
236 if (sb.Length == longest.Size)
237 return new AnchorInfo (this, longest.low, width, sb.ToString (), ignore);
238 // were the string segments overlapping?
// NOTE(review): this path writes a diagnostic to stderr before falling back
// to a width-only anchor.
239 if (sb.Length > longest.Size) {
240 Console.Error.WriteLine ("overlapping?");
241 return new AnchorInfo (this, width);
243 throw new SystemException ("Shouldn't happen");
// Group subclass that additionally carries a group count and emits info and
// anchor operations ahead of the group body.
247 class RegularExpression : Group {
248 public RegularExpression () {
// Value passed to ICompiler.EmitInfo by Compile below.
252 public int GroupCount {
253 get { return group_count; }
254 set { group_count = value; }
// Visible emission order: EmitInfo (group count and width bounds), then
// EmitAnchor with a link to the pattern start, then either EmitPosition or
// EmitString for the anchor, then the link is resolved and the Group body is
// compiled via base.Compile.
257 public override void Compile (ICompiler cmp, bool reverse) {
261 GetWidth (out min, out max);
262 cmp.EmitInfo (group_count, min, max);
264 // anchoring expression
266 AnchorInfo info = GetAnchorInfo (reverse);
268 // info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
270 LinkRef pattern = cmp.NewLink ();
271 cmp.EmitAnchor (reverse, info.Offset, pattern);
274 cmp.EmitPosition (info.Position);
275 else if (info.IsSubstring)
276 cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
282 cmp.ResolveLink (pattern);
283 base.Compile (cmp, reverse);
287 private int group_count;
// Group subclass carrying a name; BalancingGroup.Compile and other callers
// in this file also read a Number member whose declaration is not visible.
// NOTE(review): most members of this class are not visible in this view.
290 class CapturingGroup : Group {
291 public CapturingGroup () {
303 set { name = value; }
// True when a name has been assigned (name is non-null).
306 public bool IsNamed {
307 get { return name != null; }
// Compiles the Group body via base.Compile; any statements around that call
// are not visible here.
310 public override void Compile (ICompiler cmp, bool reverse) {
312 base.Compile (cmp, reverse);
316 public override bool IsComplex () {
// CapturingGroup subclass that refers to a second capturing group, 'balance'.
324 class BalancingGroup : CapturingGroup {
325 public BalancingGroup () {
// The other capturing group; its Number is passed to EmitBalanceStart.
329 public CapturingGroup Balance {
330 get { return balance; }
331 set { balance = value; }
// Emits EmitBalanceStart with this group's number, the balance group's
// number, the IsNamed flag and a tail link; then compiles the children with
// the same loop shape as Group.Compile; finally resolves the tail link.
// NOTE(review): 'balance' is dereferenced without a visible null check.
334 public override void Compile (ICompiler cmp, bool reverse) {
335 // can't invoke Group.Compile from here :(
336 // so I'll just repeat the code
338 LinkRef tail = cmp.NewLink ();
340 cmp.EmitBalanceStart (this.Number, balance.Number, this.IsNamed, tail);
342 int count = Expressions.Count;
343 for (int i = 0; i < count; ++ i) {
346 e = Expressions[count - i - 1];
350 e.Compile (cmp, reverse);
354 cmp.ResolveLink(tail);
357 private CapturingGroup balance;
// Group subclass whose Compile wraps the group body in a tail link.
360 class NonBacktrackingGroup : Group {
361 public NonBacktrackingGroup () {
// Creates a tail link, compiles the Group body via base.Compile, then
// resolves the link. The operation emitted between NewLink and base.Compile
// is not visible here.
364 public override void Compile (ICompiler cmp, bool reverse) {
365 LinkRef tail = cmp.NewLink ();
368 base.Compile (cmp, reverse);
370 cmp.ResolveLink (tail);
373 public override bool IsComplex () {
// Composite node that repeats a single child expression between min and max
// times, with a lazy flag.
380 class Repetition : CompositeExpression {
// Reserves slot 0 of Expressions for the repeated expression (initially null).
381 public Repetition (int min, int max, bool lazy) {
382 Expressions.Add (null);
// The repeated expression, stored in Expressions[0].
389 public Expression Expression {
390 get { return Expressions[0]; }
391 set { Expressions[0] = value; }
406 set { lazy = value; }
// Two emission shapes, chosen by Expression.IsComplex ():
//  - complex child: EmitRepeat, child body, EmitUntil;
//  - otherwise:     EmitFastRepeat, child body, resolve the tail link.
409 public override void Compile (ICompiler cmp, bool reverse) {
410 if (Expression.IsComplex ()) {
411 LinkRef until = cmp.NewLink ();
413 cmp.EmitRepeat (min, max, lazy, until);
414 Expression.Compile (cmp, reverse);
415 cmp.EmitUntil (until);
418 LinkRef tail = cmp.NewLink ();
420 cmp.EmitFastRepeat (min, max, lazy, tail);
421 Expression.Compile (cmp, reverse);
423 cmp.ResolveLink (tail);
// Scales the child's width bounds by the repeat counts. A child max of
// Int32.MaxValue, or a repeat max of 0xffff, makes the result
// Int32.MaxValue; both values are treated here as "unbounded" markers.
427 public override void GetWidth (out int min, out int max) {
428 Expression.GetWidth (out min, out max);
429 min = min * this.min;
430 if (max == Int32.MaxValue || this.max == 0xffff)
431 max = Int32.MaxValue;
433 max = max * this.max;
// Derives anchor info from the child's anchor info. For a complete substring
// anchor, a repeated string is built with a loop bounded by Minimum;
// otherwise the child's offset and substring are reused. Some guarding
// conditions are not visible here.
436 public override AnchorInfo GetAnchorInfo (bool reverse) {
437 int width = GetFixedWidth ();
439 return new AnchorInfo (this, width);
441 AnchorInfo info = Expression.GetAnchorInfo (reverse);
443 return new AnchorInfo (this, info.Offset, width, info.Position);
445 if (info.IsSubstring) {
446 if (info.IsComplete) {
448 string str = info.Substring;
449 StringBuilder sb = new StringBuilder (str);
450 for (int i = 1; i < Minimum; ++ i)
453 return new AnchorInfo (this, 0, width, sb.ToString (), info.IgnoreCase);
456 return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
459 return new AnchorInfo (this, width);
462 private int min, max;
// Base class for conditional nodes: slot 0 of Expressions holds the "true"
// branch and slot 1 the "false" branch, both initially null.
468 abstract class Assertion : CompositeExpression {
469 public Assertion () {
470 Expressions.Add (null); // true expression
471 Expressions.Add (null); // false expression
// Branch stored in Expressions[0].
474 public Expression TrueExpression {
475 get { return Expressions[0]; }
476 set { Expressions[0] = value; }
// Branch stored in Expressions[1]; other code in this file checks it
// against null before use.
479 public Expression FalseExpression {
480 get { return Expressions[1]; }
481 set { Expressions[1] = value; }
// Computes the width over the first two children, then applies an
// adjustment when either branch is null (the adjustment is not visible here).
484 public override void GetWidth (out int min, out int max) {
485 GetWidth (out min, out max, 2);
487 if (TrueExpression == null || FalseExpression == null)
// Assertion that tests a capturing group, with an ExpressionAssertion
// fallback ("Alternate") built from the same branches and a literal.
// NOTE(review): the condition choosing between the group path and the
// Alternate path is not visible in this view.
492 class CaptureAssertion : Assertion {
493 public CaptureAssertion (Literal l) {
// The group whose Number is passed to EmitIfDefined in Compile.
497 public CapturingGroup CapturingGroup {
498 get { return group; }
499 set { group = value; }
// One visible path delegates to Alternate.Compile. The other emits
// EmitIfDefined on the group's number:
//  - no false branch: jump to 'tail' when the test fails, else run the true
//    branch;
//  - with a false branch: jump to 'false_expr', which is resolved just
//    before the false branch is compiled.
// 'tail' is resolved last.
502 public override void Compile (ICompiler cmp, bool reverse) {
504 Alternate.Compile (cmp, reverse);
508 int gid = group.Number;
509 LinkRef tail = cmp.NewLink ();
511 if (FalseExpression == null) {
516 cmp.EmitIfDefined (gid, tail);
517 TrueExpression.Compile (cmp, reverse);
526 LinkRef false_expr = cmp.NewLink ();
527 cmp.EmitIfDefined (gid, false_expr);
528 TrueExpression.Compile (cmp, reverse);
530 cmp.ResolveLink (false_expr);
531 FalseExpression.Compile (cmp, reverse);
534 cmp.ResolveLink (tail);
// Either defers to Alternate.IsComplex () or checks each non-null branch;
// the final fallback is GetFixedWidth () <= 0.
537 public override bool IsComplex () {
539 return Alternate.IsComplex ();
540 if (TrueExpression != null && TrueExpression.IsComplex ())
542 if (FalseExpression != null && FalseExpression.IsComplex ())
544 return GetFixedWidth () <= 0;
// Creates the fallback ExpressionAssertion on first use, copying both
// branches and using 'literal' as its test expression.
547 ExpressionAssertion Alternate {
549 if (alternate == null) {
550 alternate = new ExpressionAssertion ();
551 alternate.TrueExpression = TrueExpression;
552 alternate.FalseExpression = FalseExpression;
553 alternate.TestExpression = literal;
559 private ExpressionAssertion alternate;
560 private CapturingGroup group;
561 private Literal literal;
// Assertion whose condition is itself an expression, stored in slot 2 of
// Expressions after the two branch slots added by the base constructor.
564 class ExpressionAssertion : Assertion {
565 public ExpressionAssertion () {
566 Expressions.Add (null); // test expression
// Direction used when compiling the test expression (see Compile).
569 public bool Reverse {
570 get { return reverse; }
571 set { reverse = value; }
// Accessor for the 'negate' flag (property header not visible here).
575 get { return negate; }
576 set { negate = value; }
// The condition expression, Expressions[2].
579 public Expression TestExpression {
580 get { return Expressions[2]; }
581 set { Expressions[2] = value; }
// Emits EmitTest with the true/false links in one order or the other
// (presumably selected by 'negate' -- the condition is not visible), compiles
// the test expression with this.reverse rather than the method argument,
// then lays out the branches and resolves the links according to which of
// TrueExpression / FalseExpression are null.
584 public override void Compile (ICompiler cmp, bool reverse) {
585 LinkRef true_expr = cmp.NewLink ();
586 LinkRef false_expr = cmp.NewLink ();
588 // test op: positive / negative
591 cmp.EmitTest (true_expr, false_expr);
593 cmp.EmitTest (false_expr, true_expr);
595 // test expression: lookahead / lookbehind
597 TestExpression.Compile (cmp, this.reverse);
600 // target expressions
602 if (TrueExpression == null) { // (?= ...)
608 cmp.ResolveLink (false_expr);
610 cmp.ResolveLink (true_expr);
613 cmp.ResolveLink (true_expr);
614 TrueExpression.Compile (cmp, reverse);
616 if (FalseExpression == null) { // (?(...) ...)
622 cmp.ResolveLink (false_expr);
624 else { // (?(...) ... | ...)
632 LinkRef tail = cmp.NewLink ();
635 cmp.ResolveLink (false_expr);
636 FalseExpression.Compile (cmp, reverse);
637 cmp.ResolveLink (tail);
642 public override bool IsComplex ()
647 private bool reverse, negate;
// Composite node whose children are alternatives.
652 class Alternation : CompositeExpression {
653 public Alternation () {
// The alternatives; this is the inherited Expressions collection.
656 public ExpressionCollection Alternatives {
657 get { return Expressions; }
660 public void AddAlternative (Expression e) {
661 Alternatives.Add (e);
// For each alternative: EmitBranch with a fresh 'next' link, compile the
// alternative, resolve 'next'. After the loop 'tail' is resolved and
// EmitAlternationEnd is emitted. Statements between the compile and the
// ResolveLink (next) call are not visible here.
664 public override void Compile (ICompiler cmp, bool reverse) {
665 // LinkRef next = cmp.NewLink ();
666 LinkRef tail = cmp.NewLink ();
668 foreach (Expression e in Alternatives) {
669 LinkRef next = cmp.NewLink ();
670 cmp.EmitBranch (next);
671 e.Compile (cmp, reverse);
673 cmp.ResolveLink (next);
678 cmp.ResolveLink (tail);
679 cmp.EmitAlternationEnd();
// Width over all alternatives, via the CompositeExpression helper.
682 public override void GetWidth (out int min, out int max) {
683 GetWidth (out min, out max, Alternatives.Count);
687 // terminal expressions
// Terminal node for a literal string with an ignore-case flag.
689 class Literal : Expression {
690 public Literal (string str, bool ignore) {
692 this.ignore = ignore;
695 public string String {
// Case-insensitivity flag, forwarded to the compiler in Compile.
700 public bool IgnoreCase {
701 get { return ignore; }
702 set { ignore = value; }
// Emits either a single character (str[0], never negated) or the whole
// string; the condition choosing between the two is not visible here.
705 public override void Compile (ICompiler cmp, bool reverse) {
710 cmp.EmitCharacter (str[0], false, ignore, reverse);
712 cmp.EmitString (str, ignore, reverse);
// Width is fixed: min and max both equal the string length.
715 public override void GetWidth (out int min, out int max) {
716 min = max = str.Length;
// The literal anchors itself: offset 0, width str.Length, substring str.
719 public override AnchorInfo GetAnchorInfo (bool reverse) {
720 return new AnchorInfo (this, 0, str.Length, str, ignore);
723 public override bool IsComplex () {
// Terminal node wrapping a Position value.
731 class PositionAssertion : Expression {
732 public PositionAssertion (Position pos) {
736 public Position Position {
// Emits the stored position; 'reverse' is not used by the visible code.
741 public override void Compile (ICompiler cmp, bool reverse) {
742 cmp.EmitPosition (pos);
745 public override void GetWidth (out int min, out int max) {
749 public override bool IsComplex () {
// The three start-type positions listed below yield a position anchor at
// offset 0 with width 0; the other visible return is a width-only anchor of
// width 0.
// NOTE(review): the parameter is spelled 'revers' and is not used by the
// visible code.
753 public override AnchorInfo GetAnchorInfo (bool revers) {
755 case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
756 return new AnchorInfo (this, 0, 0, pos);
759 return new AnchorInfo (this, 0);
763 private Position pos;
// Terminal node referring back to a capturing group.
766 class Reference : Expression {
767 public Reference (bool ignore) {
768 this.ignore = ignore;
// The referenced group; its Number is read in Compile.
771 public CapturingGroup CapturingGroup {
772 get { return group; }
773 set { group = value; }
776 public bool IgnoreCase {
777 get { return ignore; }
778 set { ignore = value; }
// Emits a reference to the group's number with the ignore-case flag.
// NOTE(review): 'group' is dereferenced without a visible null check.
781 public override void Compile (ICompiler cmp, bool reverse) {
782 cmp.EmitReference (group.Number, ignore, reverse);
// Reports an unbounded max (Int32.MaxValue); per the TODO below, the
// referenced group's own width is not consulted yet. The min assignment is
// not visible here.
785 public override void GetWidth (out int min, out int max) {
786 //group.GetWidth (out min, out max);
787 // TODO set width to referenced group for non-cyclical references
789 max = Int32.MaxValue;
// Always reported as complex for now (see FIXME).
792 public override bool IsComplex () {
793 return true; // FIXME incorporate cyclic check
796 private CapturingGroup group;
// Reference subclass constructed with an additional 'ecma' flag.
// NOTE(review): only the class header and constructor signature are visible
// in this view; the rest of the class is not shown.
800 class BackslashNumber : Reference {
804 public BackslashNumber (bool ignore, bool ecma)
// Terminal node for a character class: a set of character intervals plus
// two bit arrays of positive and negative categories, with negate and
// ignore-case flags.
// NOTE(review): many lines of this class are not visible in this view.
811 class CharacterClass : Expression {
// Creates an empty class; both category bit arrays are sized from
// Category.LastValue.
812 public CharacterClass (bool negate, bool ignore) {
813 this.negate = negate;
814 this.ignore = ignore;
816 intervals = new IntervalCollection ();
818 // initialize pos/neg category arrays
820 int cat_size = (int) Category.LastValue;
821 pos_cats = new BitArray (cat_size);
822 neg_cats = new BitArray (cat_size);
// Convenience constructor: a non-negated, case-sensitive class containing a
// single category.
825 public CharacterClass (Category cat, bool negate) : this (false, false) {
826 this.AddCategory (cat, negate);
830 get { return negate; }
831 set { negate = value; }
834 public bool IgnoreCase {
835 get { return ignore; }
836 set { ignore = value; }
839 public void AddCategory (Category cat, bool negate) {
849 public void AddCharacter (char c) {
850 // TODO: this is certainly not the most efficient way of doing things
851 // TODO: but at least it produces correct results.
// Adds the interval [lo, hi]. The visible branches rewrite any part of the
// interval that touches 'A'..'Z' (code points 65..90) by shifting it up by
// 32, i.e. onto the ASCII lower-case letters. Per the original comment this
// applies when ignore-case is on; the guarding condition itself is not
// visible here.
855 public void AddRange (char lo, char hi) {
856 Interval new_interval = new Interval (lo, hi);
858 // ignore case is on. we must make sure our interval does not
859 // use upper case. if it does, we must normalize the upper case
860 // characters into lower case.
862 if (upper_case_characters.Intersects (new_interval)) {
863 Interval partial_new_interval;
865 if (new_interval.low < upper_case_characters.low) {
866 partial_new_interval = new Interval (upper_case_characters.low + distance_between_upper_and_lower_case,
867 new_interval.high + distance_between_upper_and_lower_case);
868 new_interval.high = upper_case_characters.low - 1;
871 partial_new_interval = new Interval (new_interval.low + distance_between_upper_and_lower_case,
872 upper_case_characters.high + distance_between_upper_and_lower_case);
873 new_interval.low = upper_case_characters.high + 1;
875 intervals.Add (partial_new_interval);
877 else if (upper_case_characters.Contains (new_interval)) {
878 new_interval.high += distance_between_upper_and_lower_case;
879 new_interval.low += distance_between_upper_and_lower_case;
882 intervals.Add (new_interval);
// Emits the class: builds a cost-based meta-collection of the intervals,
// scans both category arrays for set bits, emits one op per meta interval
// (set / single character / range), then one op per category, and finally
// resolves the tail link.
885 public override void Compile (ICompiler cmp, bool reverse) {
886 // create the meta-collection
887 IntervalCollection meta =
888 intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
891 int count = meta.Count;
892 for (int i = 0; i < pos_cats.Length; ++ i) {
893 if (pos_cats[i] || neg_cats [i])
900 // emit in op for |meta| > 1
901 LinkRef tail = cmp.NewLink ();
905 // emit character/range/sets from meta-collection
906 // we emit these first so that any 'ignore' flags will be noticed by the evaluator
907 foreach (Interval a in meta) {
// For a discontiguous meta interval, build a bitmap relative to a.low with
// one bit set per character of every contained source interval.
908 if (a.IsDiscontiguous) { // Set
909 BitArray bits = new BitArray (a.Size);
910 foreach (Interval b in intervals) {
911 if (a.Contains (b)) {
912 for (int i = b.low; i <= b.high; ++ i)
913 bits[i - a.low] = true;
917 cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
919 else if (a.IsSingleton) // Character
920 cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
922 cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
// Categories: positive ones go through EmitCategory (one visible path
// substitutes Category.AnySingleline), negative ones through
// EmitNotCategory.
926 for (int i = 0; i < pos_cats.Length; ++ i) {
929 cmp.EmitCategory (Category.AnySingleline, negate, reverse);
931 cmp.EmitCategory ((Category)i, negate, reverse);
932 } else if (neg_cats[i]) {
933 cmp.EmitNotCategory ((Category)i, negate, reverse);
944 cmp.ResolveLink (tail);
948 public override void GetWidth (out int min, out int max) {
952 public override bool IsComplex () {
// Cost function handed to GetMetaCollection. Visible costs: a set costs 3
// plus one unit per 16 characters of its size (rounded up); a single
// character costs 2. The cost of a plain range is not visible here.
958 private static double GetIntervalCost (Interval i) {
959 // use op length as cost metric (=> optimize for space)
961 if (i.IsDiscontiguous)
962 return 3 + ((i.Size + 0xf) >> 4); // Set
963 else if (i.IsSingleton)
964 return 2; // Character
// Code points 65..90 are 'A'..'Z'; 32 is the offset from an ASCII upper-case
// letter to its lower-case counterpart.
969 private static Interval upper_case_characters = new Interval ((char)65, (char)90);
970 private const int distance_between_upper_and_lower_case = 32;
971 private bool negate, ignore;
972 private BitArray pos_cats, neg_cats;
973 private IntervalCollection intervals;
// Members of AnchorInfo (the type header is not visible in this view).
// An instance describes how an expression can anchor a match: by a
// substring, by a position, or by width only.
977 private Expression expr;
979 private Position pos;
// Width-only anchor: no specific position (Position.Any).
986 public AnchorInfo (Expression expr, int width) {
993 this.pos = Position.Any;
// Substring anchor. When 'ignore' is set the substring is stored
// lower-cased.
// NOTE(review): String.ToLower () with no arguments uses the current
// culture.
996 public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
998 this.offset = offset;
1001 this.str = ignore ? str.ToLower () : str;
1003 this.ignore = ignore;
1004 this.pos = Position.Any;
// Position anchor: never case-insensitive.
1007 public AnchorInfo (Expression expr, int offset, int width, Position pos) {
1009 this.offset = offset;
1015 this.ignore = false;
1018 public Expression Expression {
1019 get { return expr; }
1023 get { return offset; }
1027 get { return width; }
// Length of the anchor substring, or 0 when there is none.
1031 get { return (str != null) ? str.Length : 0; }
// A negative width means the width is not known.
1034 public bool IsUnknownWidth {
1035 get { return width < 0; }
// True when the substring length equals the width.
1038 public bool IsComplete {
1039 get { return Length == Width; }
1042 public string Substring {
1046 public bool IgnoreCase {
1047 get { return ignore; }
1050 public Position Position {
// True when a substring is present.
1054 public bool IsSubstring {
1055 get { return str != null; }
// True when the position is anything other than Position.Any.
1058 public bool IsPosition {
1059 get { return pos != Position.Any; }
// Interval covered by the substring, measured from 0.
1062 public Interval GetInterval () {
1063 return GetInterval (0);
// Interval covered by the substring relative to 'start':
// [start + Offset, start + Offset + Length - 1]. Interval.Empty is returned
// on a path whose condition is not visible here.
1066 public Interval GetInterval (int start) {
1068 return Interval.Empty;
1070 return new Interval (start + Offset, start + Offset + Length - 1);