3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System.Collections;
33 namespace System.Text.RegularExpressions.Syntax {
36 class ExpressionCollection : CollectionBase {
37 public void Add (Expression e) {
41 public Expression this[int i] {
42 get { return (Expression)List[i]; }
43 set { List[i] = value; }
46 protected override void OnValidate (object o) {
47 // allow null elements
53 abstract class Expression {
54 public abstract void Compile (ICompiler cmp, bool reverse);
55 public abstract void GetWidth (out int min, out int max);
57 public int GetFixedWidth () {
59 GetWidth (out min, out max);
67 public virtual AnchorInfo GetAnchorInfo (bool reverse) {
68 return new AnchorInfo (this, GetFixedWidth ());
71 public virtual bool IsComplex () {
76 // composite expressions
78 abstract class CompositeExpression : Expression {
79 public CompositeExpression () {
80 expressions = new ExpressionCollection ();
83 protected ExpressionCollection Expressions {
84 get { return expressions; }
87 protected void GetWidth (out int min, out int max, int count) {
92 for (int i = 0; i < count; ++ i) {
93 Expression e = Expressions[i];
99 e.GetWidth (out a, out b);
100 if (a < min) min = a;
101 if (b > max) max = b;
108 private ExpressionCollection expressions;
113 class Group : CompositeExpression {
117 public Expression Expression {
118 get { return Expressions[0]; }
119 set { Expressions[0] = value; }
122 public void AppendExpression (Expression e) {
126 public override void Compile (ICompiler cmp, bool reverse) {
127 int count = Expressions.Count;
128 for (int i = 0; i < count; ++ i) {
131 e = Expressions[count - i - 1];
135 e.Compile (cmp, reverse);
139 public override void GetWidth (out int min, out int max) {
143 foreach (Expression e in Expressions) {
145 e.GetWidth (out a, out b);
147 if (max == Int32.MaxValue || b == Int32.MaxValue)
148 max = Int32.MaxValue;
154 public override AnchorInfo GetAnchorInfo (bool reverse) {
156 int width = GetFixedWidth ();
158 ArrayList infos = new ArrayList ();
159 IntervalCollection segments = new IntervalCollection ();
161 // accumulate segments
164 //foreach (Expression e in Expressions) {
165 int count = Expressions.Count;
166 for (int i = 0; i < count; ++ i) {
169 e = Expressions [count - i - 1];
173 AnchorInfo info = e.GetAnchorInfo (reverse);
177 return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
179 if (info.IsSubstring)
180 segments.Add (info.GetInterval (ptr));
182 if (info.IsUnknownWidth)
188 // normalize and find the longest segment
190 segments.Normalize ();
192 Interval longest = Interval.Empty;
193 foreach (Interval segment in segments) {
194 if (segment.Size > longest.Size)
198 // now chain the substrings that made this segment together
200 if (!longest.IsEmpty) {
202 ArrayList strs = new ArrayList();
207 //foreach (AnchorInfo info in infos) {
208 for (int i = 0; i < infos.Count; ++ i) {
211 info = (AnchorInfo)infos[i];
213 if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
214 //str += info.Substring; // TODO mark subexpressions
215 strs.Add(info.Substring);
216 ignore |= info.IgnoreCase;
220 if (info.IsUnknownWidth)
226 for (int i = 0; i< strs.Count; ++i)
229 str += strs [strs.Count - i - 1];
237 return new AnchorInfo (this, longest.low, width, str, ignore);
240 return new AnchorInfo (this, width);
243 public override bool IsComplex () {
245 foreach (Expression e in Expressions) {
246 comp |= e.IsComplex ();
249 return comp | GetFixedWidth () <= 0;
253 class RegularExpression : Group {
254 public RegularExpression () {
258 public int GroupCount {
259 get { return group_count; }
260 set { group_count = value; }
263 public override void Compile (ICompiler cmp, bool reverse) {
267 GetWidth (out min, out max);
268 cmp.EmitInfo (group_count, min, max);
270 // anchoring expression
272 AnchorInfo info = GetAnchorInfo (reverse);
274 // info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
276 LinkRef pattern = cmp.NewLink ();
277 cmp.EmitAnchor (reverse, info.Offset, pattern);
280 cmp.EmitPosition (info.Position);
281 else if (info.IsSubstring)
282 cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
288 cmp.ResolveLink (pattern);
289 base.Compile (cmp, reverse);
293 private int group_count;
296 class CapturingGroup : Group {
297 public CapturingGroup () {
309 set { name = value; }
312 public bool IsNamed {
313 get { return name != null; }
316 public override void Compile (ICompiler cmp, bool reverse) {
318 base.Compile (cmp, reverse);
322 public override bool IsComplex () {
330 class BalancingGroup : CapturingGroup {
331 public BalancingGroup () {
335 public CapturingGroup Balance {
336 get { return balance; }
337 set { balance = value; }
340 public override void Compile (ICompiler cmp, bool reverse) {
341 // can't invoke Group.Compile from here :(
342 // so I'll just repeat the code
344 LinkRef tail = cmp.NewLink ();
346 cmp.EmitBalanceStart (this.Number, balance.Number, this.IsNamed, tail);
348 int count = Expressions.Count;
349 for (int i = 0; i < count; ++ i) {
352 e = Expressions[count - i - 1];
356 e.Compile (cmp, reverse);
360 cmp.ResolveLink(tail);
363 private CapturingGroup balance;
366 class NonBacktrackingGroup : Group {
367 public NonBacktrackingGroup () {
370 public override void Compile (ICompiler cmp, bool reverse) {
371 LinkRef tail = cmp.NewLink ();
374 base.Compile (cmp, reverse);
376 cmp.ResolveLink (tail);
379 public override bool IsComplex () {
386 class Repetition : CompositeExpression {
387 public Repetition (int min, int max, bool lazy) {
388 Expressions.Add (null);
395 public Expression Expression {
396 get { return Expressions[0]; }
397 set { Expressions[0] = value; }
412 set { lazy = value; }
415 public override void Compile (ICompiler cmp, bool reverse) {
416 if (Expression.IsComplex ()) {
417 LinkRef until = cmp.NewLink ();
419 cmp.EmitRepeat (min, max, lazy, until);
420 Expression.Compile (cmp, reverse);
421 cmp.EmitUntil (until);
424 LinkRef tail = cmp.NewLink ();
426 cmp.EmitFastRepeat (min, max, lazy, tail);
427 Expression.Compile (cmp, reverse);
429 cmp.ResolveLink (tail);
433 public override void GetWidth (out int min, out int max) {
434 Expression.GetWidth (out min, out max);
435 min = min * this.min;
436 if (max == Int32.MaxValue || this.max == 0xffff)
437 max = Int32.MaxValue;
439 max = max * this.max;
442 public override AnchorInfo GetAnchorInfo (bool reverse) {
443 int width = GetFixedWidth ();
445 return new AnchorInfo (this, width);
447 AnchorInfo info = Expression.GetAnchorInfo (reverse);
449 return new AnchorInfo (this, info.Offset, width, info.Position);
451 if (info.IsSubstring) {
452 if (info.IsComplete) {
454 for (int i = 0; i < Minimum; ++ i)
455 str += info.Substring;
457 return new AnchorInfo (this, 0, width, str, info.IgnoreCase);
460 return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
463 return new AnchorInfo (this, width);
466 private int min, max;
472 abstract class Assertion : CompositeExpression {
473 public Assertion () {
474 Expressions.Add (null); // true expression
475 Expressions.Add (null); // false expression
478 public Expression TrueExpression {
479 get { return Expressions[0]; }
480 set { Expressions[0] = value; }
483 public Expression FalseExpression {
484 get { return Expressions[1]; }
485 set { Expressions[1] = value; }
488 public override void GetWidth (out int min, out int max) {
489 GetWidth (out min, out max, 2);
491 if (TrueExpression == null || FalseExpression == null)
496 class CaptureAssertion : Assertion {
497 public CaptureAssertion () {
500 public CapturingGroup CapturingGroup {
501 get { return group; }
502 set { group = value; }
505 public override void Compile (ICompiler cmp, bool reverse) {
506 int gid = group.Number;
507 LinkRef tail = cmp.NewLink ();
509 if (FalseExpression == null) {
514 cmp.EmitIfDefined (gid, tail);
515 TrueExpression.Compile (cmp, reverse);
524 LinkRef false_expr = cmp.NewLink ();
525 cmp.EmitIfDefined (gid, false_expr);
526 TrueExpression.Compile (cmp, reverse);
528 cmp.ResolveLink (false_expr);
529 FalseExpression.Compile (cmp, reverse);
532 cmp.ResolveLink (tail);
535 public override bool IsComplex () {
537 if (TrueExpression != null)
538 comp |= TrueExpression.IsComplex ();
539 if (FalseExpression != null)
540 comp |= FalseExpression.IsComplex ();
542 return comp | GetFixedWidth () <= 0;
545 private CapturingGroup group;
548 class ExpressionAssertion : Assertion {
549 public ExpressionAssertion () {
550 Expressions.Add (null); // test expression
553 public bool Reverse {
554 get { return reverse; }
555 set { reverse = value; }
559 get { return negate; }
560 set { negate = value; }
563 public Expression TestExpression {
564 get { return Expressions[2]; }
565 set { Expressions[2] = value; }
568 public override void Compile (ICompiler cmp, bool reverse) {
569 LinkRef true_expr = cmp.NewLink ();
570 LinkRef false_expr = cmp.NewLink ();
572 // test op: positive / negative
575 cmp.EmitTest (true_expr, false_expr);
577 cmp.EmitTest (false_expr, true_expr);
579 // test expression: lookahead / lookbehind
581 TestExpression.Compile (cmp, this.reverse);
584 // target expressions
586 if (TrueExpression == null) { // (?= ...)
592 cmp.ResolveLink (false_expr);
594 cmp.ResolveLink (true_expr);
597 cmp.ResolveLink (true_expr);
598 TrueExpression.Compile (cmp, reverse);
600 if (FalseExpression == null) { // (?(...) ...)
606 cmp.ResolveLink (false_expr);
608 else { // (?(...) ... | ...)
616 LinkRef tail = cmp.NewLink ();
619 cmp.ResolveLink (false_expr);
620 FalseExpression.Compile (cmp, reverse);
621 cmp.ResolveLink (tail);
626 private bool reverse, negate;
631 class Alternation : CompositeExpression {
632 public Alternation () {
635 public ExpressionCollection Alternatives {
636 get { return Expressions; }
639 public void AddAlternative (Expression e) {
640 Alternatives.Add (e);
643 public override void Compile (ICompiler cmp, bool reverse) {
644 // LinkRef next = cmp.NewLink ();
645 LinkRef tail = cmp.NewLink ();
647 foreach (Expression e in Alternatives) {
648 LinkRef next = cmp.NewLink ();
649 cmp.EmitBranch (next);
650 e.Compile (cmp, reverse);
652 cmp.ResolveLink (next);
657 cmp.ResolveLink (tail);
658 cmp.EmitAlternationEnd();
661 public override void GetWidth (out int min, out int max) {
662 GetWidth (out min, out max, Alternatives.Count);
665 public override bool IsComplex () {
667 foreach (Expression e in Alternatives) {
668 comp |= e.IsComplex ();
671 return comp | GetFixedWidth () <= 0;
675 // terminal expressions
677 class Literal : Expression {
678 public Literal (string str, bool ignore) {
680 this.ignore = ignore;
683 public string String {
688 public bool IgnoreCase {
689 get { return ignore; }
690 set { ignore = value; }
693 public override void Compile (ICompiler cmp, bool reverse) {
698 cmp.EmitCharacter (str[0], false, ignore, reverse);
700 cmp.EmitString (str, ignore, reverse);
703 public override void GetWidth (out int min, out int max) {
704 min = max = str.Length;
707 public override AnchorInfo GetAnchorInfo (bool reverse) {
708 return new AnchorInfo (this, 0, str.Length, str, ignore);
711 public override bool IsComplex () {
719 class PositionAssertion : Expression {
720 public PositionAssertion (Position pos) {
724 public Position Position {
729 public override void Compile (ICompiler cmp, bool reverse) {
730 cmp.EmitPosition (pos);
733 public override void GetWidth (out int min, out int max) {
737 public override bool IsComplex () {
741 public override AnchorInfo GetAnchorInfo (bool revers) {
743 case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
744 return new AnchorInfo (this, 0, 0, pos);
747 return new AnchorInfo (this, 0);
751 private Position pos;
754 class Reference : Expression {
755 public Reference (bool ignore) {
756 this.ignore = ignore;
759 public CapturingGroup CapturingGroup {
760 get { return group; }
761 set { group = value; }
764 public bool IgnoreCase {
765 get { return ignore; }
766 set { ignore = value; }
769 public override void Compile (ICompiler cmp, bool reverse) {
770 cmp.EmitReference (group.Number, ignore, reverse);
773 public override void GetWidth (out int min, out int max) {
774 //group.GetWidth (out min, out max);
775 // TODO set width to referenced group for non-cyclical references
777 max = Int32.MaxValue;
780 public override bool IsComplex () {
781 return true; // FIXME incorporate cyclic check
784 private CapturingGroup group;
788 class CharacterClass : Expression {
789 public CharacterClass (bool negate, bool ignore) {
790 this.negate = negate;
791 this.ignore = ignore;
793 intervals = new IntervalCollection ();
795 // initialize pos/neg category arrays
797 int cat_size = (int) Category.LastValue;
798 pos_cats = new BitArray (cat_size);
799 neg_cats = new BitArray (cat_size);
802 public CharacterClass (Category cat, bool negate) : this (false, false) {
803 this.AddCategory (cat, negate);
807 get { return negate; }
808 set { negate = value; }
811 public bool IgnoreCase {
812 get { return ignore; }
813 set { ignore = value; }
816 public void AddCategory (Category cat, bool negate) {
826 public void AddCharacter (char c) {
827 // TODO: this is certainly not the most efficient way of doing things
828 // TODO: but at least it produces correct results.
832 public void AddRange (char lo, char hi) {
833 Interval new_interval = new Interval (lo, hi);
835 // ignore case is on. we must make sure our interval does not
836 // use upper case. if it does, we must normalize the upper case
837 // characters into lower case.
839 if (upper_case_characters.Intersects (new_interval)) {
840 Interval partial_new_interval;
842 if (new_interval.low < upper_case_characters.low) {
843 partial_new_interval = new Interval (upper_case_characters.low + distance_between_upper_and_lower_case,
844 new_interval.high + distance_between_upper_and_lower_case);
845 new_interval.high = upper_case_characters.low - 1;
848 partial_new_interval = new Interval (new_interval.low + distance_between_upper_and_lower_case,
849 upper_case_characters.high + distance_between_upper_and_lower_case);
850 new_interval.low = upper_case_characters.high + 1;
852 intervals.Add (partial_new_interval);
854 else if (upper_case_characters.Contains (new_interval)) {
855 new_interval.high += distance_between_upper_and_lower_case;
856 new_interval.low += distance_between_upper_and_lower_case;
859 intervals.Add (new_interval);
862 public override void Compile (ICompiler cmp, bool reverse) {
863 // create the meta-collection
864 IntervalCollection meta =
865 intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
869 int count = meta.Count;
870 for (int i = 0; i < pos_cats.Length; ++ i) {
871 if (pos_cats[i]) ++ count;
872 if (neg_cats[i]) ++ count;
878 // emit in op for |meta| > 1
880 LinkRef tail = cmp.NewLink ();
887 for (int i = 0; i < pos_cats.Length; ++ i) {
890 cmp.EmitCategory (Category.Any, negate, reverse);
892 cmp.EmitCategory ((Category)i, negate, reverse);
893 } else if (neg_cats[i]) {
894 cmp.EmitCategory ((Category)i, !negate, reverse);
898 // emit character/range/sets from meta-collection
900 foreach (Interval a in meta) {
901 if (a.IsDiscontiguous) { // Set
902 BitArray bits = new BitArray (a.Size);
903 foreach (Interval b in intervals) {
904 if (a.Contains (b)) {
905 for (int i = b.low; i <= b.high; ++ i)
906 bits[i - a.low] = true;
910 cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
912 else if (a.IsSingleton) // Character
913 cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
915 cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
926 cmp.ResolveLink (tail);
930 public override void GetWidth (out int min, out int max) {
934 public override bool IsComplex () {
940 private static double GetIntervalCost (Interval i) {
941 // use op length as cost metric (=> optimize for space)
943 if (i.IsDiscontiguous)
944 return 3 + ((i.Size + 0xf) >> 4); // Set
945 else if (i.IsSingleton)
946 return 2; // Character
951 private static Interval upper_case_characters = new Interval ((char)65, (char)90);
952 private const int distance_between_upper_and_lower_case = 32;
953 private bool negate, ignore;
954 private BitArray pos_cats, neg_cats;
955 private IntervalCollection intervals;
959 private Expression expr;
961 private Position pos;
968 public AnchorInfo (Expression expr, int width) {
975 this.pos = Position.Any;
978 public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
980 this.offset = offset;
983 this.str = ignore ? str.ToLower () : str;
985 this.ignore = ignore;
986 this.pos = Position.Any;
989 public AnchorInfo (Expression expr, int offset, int width, Position pos) {
991 this.offset = offset;
1000 public Expression Expression {
1001 get { return expr; }
1005 get { return offset; }
1009 get { return width; }
1013 get { return (str != null) ? str.Length : 0; }
1016 public bool IsUnknownWidth {
1017 get { return width < 0; }
1020 public bool IsComplete {
1021 get { return Length == Width; }
1024 public string Substring {
1028 public bool IgnoreCase {
1029 get { return ignore; }
1032 public Position Position {
1036 public bool IsSubstring {
1037 get { return str != null; }
1040 public bool IsPosition {
1041 get { return pos != Position.Any; }
1044 public Interval GetInterval () {
1045 return GetInterval (0);
1048 public Interval GetInterval (int start) {
1050 return Interval.Empty;
1052 return new Interval (start + Offset, start + Offset + Length - 1);