3 // namespace: System.Text.RegularExpressions
\r
6 // author: Dan Lewis (dlewis@gmx.co.uk)
\r
10 using System.Collections;
\r
12 namespace System.Text.RegularExpressions.Syntax {
\r
13 // collection classes
\r
15 class ExpressionCollection : CollectionBase {
\r
16 public void Add (Expression e) {
\r
20 public Expression this[int i] {
\r
21 get { return (Expression)List[i]; }
\r
22 set { List[i] = value; }
\r
25 protected override void OnValidate (object o) {
\r
26 // allow null elements
\r
32 abstract class Expression {
\r
33 public abstract void Compile (ICompiler cmp, bool reverse);
\r
34 public abstract void GetWidth (out int min, out int max);
\r
36 public int GetFixedWidth () {
\r
38 GetWidth (out min, out max);
\r
46 public virtual AnchorInfo GetAnchorInfo () {
\r
47 return new AnchorInfo (this, GetFixedWidth ());
\r
50 public virtual bool IsComplex () {
\r
55 // composite expressions
\r
57 abstract class CompositeExpression : Expression {
\r
58 public CompositeExpression () {
\r
59 expressions = new ExpressionCollection ();
\r
62 protected ExpressionCollection Expressions {
\r
63 get { return expressions; }
\r
66 protected void GetWidth (out int min, out int max, int count) {
\r
67 min = Int32.MaxValue;
\r
71 for (int i = 0; i < count; ++ i) {
\r
72 Expression e = Expressions[i];
\r
78 e.GetWidth (out a, out b);
\r
79 if (a < min) min = a;
\r
80 if (b > max) max = b;
\r
87 private ExpressionCollection expressions;
\r
92 class Group : CompositeExpression {
\r
96 public Expression Expression {
\r
97 get { return Expressions[0]; }
\r
98 set { Expressions[0] = value; }
\r
101 public void AppendExpression (Expression e) {
\r
102 Expressions.Add (e);
\r
105 public override void Compile (ICompiler cmp, bool reverse) {
\r
106 int count = Expressions.Count;
\r
107 for (int i = 0; i < count; ++ i) {
\r
110 e = Expressions[count - i - 1];
\r
112 e = Expressions[i];
\r
114 e.Compile (cmp, reverse);
\r
118 public override void GetWidth (out int min, out int max) {
\r
122 foreach (Expression e in Expressions) {
\r
124 e.GetWidth (out a, out b);
\r
126 if (max == Int32.MaxValue || b == Int32.MaxValue)
\r
127 max = Int32.MaxValue;
\r
133 public override AnchorInfo GetAnchorInfo () {
\r
135 int width = GetFixedWidth ();
\r
137 ArrayList infos = new ArrayList ();
\r
138 IntervalCollection segments = new IntervalCollection ();
\r
140 // accumulate segments
\r
143 foreach (Expression e in Expressions) {
\r
144 AnchorInfo info = e.GetAnchorInfo ();
\r
147 if (info.IsPosition)
\r
148 return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
\r
150 if (info.IsSubstring)
\r
151 segments.Add (info.GetInterval (ptr));
\r
153 if (info.IsUnknownWidth)
\r
159 // normalize and find the longest segment
\r
161 segments.Normalize ();
\r
163 Interval longest = Interval.Empty;
\r
164 foreach (Interval segment in segments) {
\r
165 if (segment.Size > longest.Size)
\r
169 // now chain the substrings that made this segment together
\r
171 if (!longest.IsEmpty) {
\r
173 bool ignore = false;
\r
176 foreach (AnchorInfo info in infos) {
\r
177 if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
\r
178 str += info.Substring; // TODO mark subexpressions
\r
179 ignore |= info.IgnoreCase;
\r
182 if (info.IsUnknownWidth)
\r
188 return new AnchorInfo (this, longest.low, width, str, ignore);
\r
191 return new AnchorInfo (this, width);
\r
194 public override bool IsComplex () {
\r
196 foreach (Expression e in Expressions) {
\r
197 comp |= e.IsComplex ();
\r
200 return comp | GetFixedWidth () <= 0;
\r
204 class RegularExpression : Group {
\r
205 public RegularExpression () {
\r
209 public int GroupCount {
\r
210 get { return group_count; }
\r
211 set { group_count = value; }
\r
214 public override void Compile (ICompiler cmp, bool reverse) {
\r
218 GetWidth (out min, out max);
\r
219 cmp.EmitInfo (group_count, min, max);
\r
221 // anchoring expression
\r
223 AnchorInfo info = GetAnchorInfo ();
\r
225 info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
\r
227 LinkRef pattern = cmp.NewLink ();
\r
228 cmp.EmitAnchor (info.Offset, pattern);
\r
230 if (info.IsPosition)
\r
231 cmp.EmitPosition (info.Position);
\r
232 else if (info.IsSubstring)
\r
233 cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
\r
239 cmp.ResolveLink (pattern);
\r
240 base.Compile (cmp, reverse);
\r
244 private int group_count;
\r
247 class CapturingGroup : Group {
\r
248 public CapturingGroup () {
\r
253 public int Number {
\r
254 get { return gid; }
\r
255 set { gid = value; }
\r
258 public string Name {
\r
259 get { return name; }
\r
260 set { name = value; }
\r
263 public bool IsNamed {
\r
264 get { return name != null; }
\r
267 public override void Compile (ICompiler cmp, bool reverse) {
\r
268 cmp.EmitOpen (gid);
\r
269 base.Compile (cmp, reverse);
\r
270 cmp.EmitClose (gid);
\r
273 public override bool IsComplex () {
\r
278 private string name;
\r
281 class BalancingGroup : CapturingGroup {
\r
282 public BalancingGroup () {
\r
283 this.balance = null;
\r
286 public CapturingGroup Balance {
\r
287 get { return balance; }
\r
288 set { balance = value; }
\r
291 public override void Compile (ICompiler cmp, bool reverse) {
\r
292 // can't invoke Group.Compile from here :(
\r
293 // so I'll just repeat the code
\r
295 int count = Expressions.Count;
\r
296 for (int i = 0; i < count; ++ i) {
\r
299 e = Expressions[count - i - 1];
\r
301 e = Expressions[i];
\r
303 e.Compile (cmp, reverse);
\r
306 cmp.EmitBalance (this.Number, balance.Number);
\r
309 private CapturingGroup balance;
\r
312 class NonBacktrackingGroup : Group {
\r
313 public NonBacktrackingGroup () {
\r
316 public override void Compile (ICompiler cmp, bool reverse) {
\r
317 LinkRef tail = cmp.NewLink ();
\r
319 cmp.EmitSub (tail);
\r
320 base.Compile (cmp, reverse);
\r
322 cmp.ResolveLink (tail);
\r
325 public override bool IsComplex () {
\r
332 class Repetition : CompositeExpression {
\r
333 public Repetition (int min, int max, bool lazy) {
\r
334 Expressions.Add (null);
\r
341 public Expression Expression {
\r
342 get { return Expressions[0]; }
\r
343 set { Expressions[0] = value; }
\r
346 public int Minimum {
\r
347 get { return min; }
\r
348 set { min = value; }
\r
351 public int Maximum {
\r
352 get { return max; }
\r
353 set { max = value; }
\r
357 get { return lazy; }
\r
358 set { lazy = value; }
\r
361 public override void Compile (ICompiler cmp, bool reverse) {
\r
362 if (Expression.IsComplex ()) {
\r
363 LinkRef until = cmp.NewLink ();
\r
365 cmp.EmitRepeat (min, max, lazy, until);
\r
366 Expression.Compile (cmp, reverse);
\r
367 cmp.EmitUntil (until);
\r
370 LinkRef tail = cmp.NewLink ();
\r
372 cmp.EmitFastRepeat (min, max, lazy, tail);
\r
373 Expression.Compile (cmp, reverse);
\r
375 cmp.ResolveLink (tail);
\r
379 public override void GetWidth (out int min, out int max) {
\r
380 Expression.GetWidth (out min, out max);
\r
381 min = min * this.min;
\r
382 if (max == Int32.MaxValue || this.max == 0xffff)
\r
383 max = Int32.MaxValue;
\r
385 max = max * this.max;
\r
388 public override AnchorInfo GetAnchorInfo () {
\r
389 int width = GetFixedWidth ();
\r
391 return new AnchorInfo (this, width);
\r
393 AnchorInfo info = Expression.GetAnchorInfo ();
\r
394 if (info.IsPosition)
\r
395 return new AnchorInfo (this, info.Offset, width, info.Position);
\r
397 if (info.IsSubstring) {
\r
398 if (info.IsComplete) {
\r
400 for (int i = 0; i < Minimum; ++ i)
\r
401 str += info.Substring;
\r
403 return new AnchorInfo (this, 0, width, str, info.IgnoreCase);
\r
406 return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
\r
409 return new AnchorInfo (this, width);
\r
412 private int min, max;
\r
418 abstract class Assertion : CompositeExpression {
\r
419 public Assertion () {
\r
420 Expressions.Add (null); // true expression
\r
421 Expressions.Add (null); // false expression
\r
424 public Expression TrueExpression {
\r
425 get { return Expressions[0]; }
\r
426 set { Expressions[0] = value; }
\r
429 public Expression FalseExpression {
\r
430 get { return Expressions[1]; }
\r
431 set { Expressions[1] = value; }
\r
434 public override void GetWidth (out int min, out int max) {
\r
435 GetWidth (out min, out max, 2);
\r
437 if (TrueExpression == null || FalseExpression == null)
\r
442 class CaptureAssertion : Assertion {
\r
443 public CaptureAssertion () {
\r
446 public CapturingGroup CapturingGroup {
\r
447 get { return group; }
\r
448 set { group = value; }
\r
451 public override void Compile (ICompiler cmp, bool reverse) {
\r
452 int gid = group.Number;
\r
453 LinkRef tail = cmp.NewLink ();
\r
455 if (FalseExpression == null) {
\r
460 cmp.EmitIfDefined (gid, tail);
\r
461 TrueExpression.Compile (cmp, reverse);
\r
470 LinkRef false_expr = cmp.NewLink ();
\r
471 cmp.EmitIfDefined (gid, false_expr);
\r
472 TrueExpression.Compile (cmp, reverse);
\r
473 cmp.EmitJump (tail);
\r
474 cmp.ResolveLink (false_expr);
\r
475 FalseExpression.Compile (cmp, reverse);
\r
478 cmp.ResolveLink (tail);
\r
481 public override bool IsComplex () {
\r
483 if (TrueExpression != null)
\r
484 comp |= TrueExpression.IsComplex ();
\r
485 if (FalseExpression != null)
\r
486 comp |= FalseExpression.IsComplex ();
\r
488 return comp | GetFixedWidth () <= 0;
\r
491 private CapturingGroup group;
\r
494 class ExpressionAssertion : Assertion {
\r
495 public ExpressionAssertion () {
\r
496 Expressions.Add (null); // test expression
\r
499 public bool Reverse {
\r
500 get { return reverse; }
\r
501 set { reverse = value; }
\r
504 public bool Negate {
\r
505 get { return negate; }
\r
506 set { negate = value; }
\r
509 public Expression TestExpression {
\r
510 get { return Expressions[2]; }
\r
511 set { Expressions[2] = value; }
\r
514 public override void Compile (ICompiler cmp, bool reverse) {
\r
515 LinkRef true_expr = cmp.NewLink ();
\r
516 LinkRef false_expr = cmp.NewLink ();
\r
518 // test op: positive / negative
\r
521 cmp.EmitTest (true_expr, false_expr);
\r
523 cmp.EmitTest (false_expr, true_expr);
\r
525 // test expression: lookahead / lookbehind
\r
527 TestExpression.Compile (cmp, reverse ^ this.reverse);
\r
530 // target expressions
\r
532 if (TrueExpression == null) { // (?= ...)
\r
538 cmp.ResolveLink (false_expr);
\r
540 cmp.ResolveLink (true_expr);
\r
543 cmp.ResolveLink (true_expr);
\r
544 TrueExpression.Compile (cmp, reverse);
\r
546 if (FalseExpression == null) { // (?(...) ...)
\r
552 cmp.ResolveLink (false_expr);
\r
554 else { // (?(...) ... | ...)
\r
562 LinkRef tail = cmp.NewLink ();
\r
564 cmp.EmitJump (tail);
\r
565 cmp.ResolveLink (false_expr);
\r
566 FalseExpression.Compile (cmp, reverse);
\r
567 cmp.ResolveLink (tail);
\r
572 private bool reverse, negate;
\r
577 class Alternation : CompositeExpression {
\r
578 public Alternation () {
\r
581 public ExpressionCollection Alternatives {
\r
582 get { return Expressions; }
\r
585 public void AddAlternative (Expression e) {
\r
586 Alternatives.Add (e);
\r
589 public override void Compile (ICompiler cmp, bool reverse) {
\r
590 LinkRef next = cmp.NewLink ();
\r
591 LinkRef tail = cmp.NewLink ();
\r
593 foreach (Expression e in Alternatives) {
\r
594 cmp.EmitBranch (next);
\r
595 e.Compile (cmp, reverse);
\r
596 cmp.EmitJump (tail);
\r
597 cmp.ResolveLink (next);
\r
601 cmp.ResolveLink (tail);
\r
604 public override void GetWidth (out int min, out int max) {
\r
605 GetWidth (out min, out max, Alternatives.Count);
\r
608 public override bool IsComplex () {
\r
610 foreach (Expression e in Alternatives) {
\r
611 comp |= e.IsComplex ();
\r
614 return comp | GetFixedWidth () <= 0;
\r
618 // terminal expressions
\r
620 class Literal : Expression {
\r
621 public Literal (string str, bool ignore) {
\r
623 this.ignore = ignore;
\r
626 public string String {
\r
627 get { return str; }
\r
628 set { str = value; }
\r
631 public bool IgnoreCase {
\r
632 get { return ignore; }
\r
633 set { ignore = value; }
\r
636 public override void Compile (ICompiler cmp, bool reverse) {
\r
637 if (str.Length == 0)
\r
640 if (str.Length == 1)
\r
641 cmp.EmitCharacter (str[0], false, ignore, reverse);
\r
643 cmp.EmitString (str, ignore, reverse);
\r
646 public override void GetWidth (out int min, out int max) {
\r
647 min = max = str.Length;
\r
650 public override AnchorInfo GetAnchorInfo () {
\r
651 return new AnchorInfo (this, 0, str.Length, str, ignore);
\r
654 public override bool IsComplex () {
\r
658 private string str;
\r
659 private bool ignore;
\r
662 class PositionAssertion : Expression {
\r
663 public PositionAssertion (Position pos) {
\r
667 public Position Position {
\r
668 get { return pos; }
\r
669 set { pos = value; }
\r
672 public override void Compile (ICompiler cmp, bool reverse) {
\r
673 cmp.EmitPosition (pos);
\r
676 public override void GetWidth (out int min, out int max) {
\r
680 public override bool IsComplex () {
\r
684 public override AnchorInfo GetAnchorInfo () {
\r
686 case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
\r
687 return new AnchorInfo (this, 0, 0, pos);
\r
690 return new AnchorInfo (this, 0);
\r
694 private Position pos;
\r
697 class Reference : Expression {
\r
698 public Reference (bool ignore) {
\r
699 this.ignore = ignore;
\r
702 public CapturingGroup CapturingGroup {
\r
703 get { return group; }
\r
704 set { group = value; }
\r
707 public bool IgnoreCase {
\r
708 get { return ignore; }
\r
709 set { ignore = value; }
\r
712 public override void Compile (ICompiler cmp, bool reverse) {
\r
713 cmp.EmitReference (group.Number, ignore, reverse);
\r
716 public override void GetWidth (out int min, out int max) {
\r
717 //group.GetWidth (out min, out max);
\r
718 // TODO set width to referenced group for non-cyclical references
\r
720 max = Int32.MaxValue;
\r
723 public override bool IsComplex () {
\r
724 return true; // FIXME incorporate cyclic check
\r
727 private CapturingGroup group;
\r
728 private bool ignore;
\r
731 class CharacterClass : Expression {
\r
732 public CharacterClass (bool negate, bool ignore) {
\r
733 this.negate = negate;
\r
734 this.ignore = ignore;
\r
736 intervals = new IntervalCollection ();
\r
738 // initialize pos/neg category arrays
\r
740 Array cat_values = Enum.GetValues (typeof (Category));
\r
741 int cat_size = (int)(Category)cat_values.GetValue (cat_values.Length - 1) + 1;
\r
742 pos_cats = new bool[cat_size];
\r
743 neg_cats = new bool[cat_size];
\r
744 for (int i = 0; i < cat_size; ++ i) {
\r
745 pos_cats[i] = false;
\r
746 neg_cats[i] = false;
\r
750 public CharacterClass (Category cat, bool negate) : this (false, false) {
\r
751 this.AddCategory (cat, negate);
\r
754 public bool Negate {
\r
755 get { return negate; }
\r
756 set { negate = value; }
\r
759 public bool IgnoreCase {
\r
760 get { return ignore; }
\r
761 set { ignore = value; }
\r
764 public void AddCategory (Category cat, bool negate) {
\r
769 pos_cats[n] = false;
\r
771 neg_cats[n] = true;
\r
775 neg_cats[n] = false;
\r
777 pos_cats[n] = true;
\r
781 public void AddCharacter (char c) {
\r
782 intervals.Add (new Interval (c, c));
\r
785 public void AddRange (char lo, char hi) {
\r
786 intervals.Add (new Interval (lo, hi));
\r
789 public override void Compile (ICompiler cmp, bool reverse) {
\r
790 // create the meta-collection
\r
792 IntervalCollection meta =
\r
793 intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
\r
797 int count = meta.Count;
\r
798 for (int i = 0; i < pos_cats.Length; ++ i) {
\r
799 if (pos_cats[i]) ++ count;
\r
800 if (neg_cats[i]) ++ count;
\r
806 // emit in op for |meta| > 1
\r
808 LinkRef tail = cmp.NewLink ();
\r
814 for (int i = 0; i < pos_cats.Length; ++ i) {
\r
816 cmp.EmitCategory ((Category)i, negate, reverse);
\r
817 else if (neg_cats[i])
\r
818 cmp.EmitCategory ((Category)i, !negate, reverse);
\r
821 // emit character/range/sets from meta-collection
\r
823 foreach (Interval a in meta) {
\r
824 if (a.IsDiscontiguous) { // Set
\r
825 BitArray bits = new BitArray (a.Size);
\r
826 foreach (Interval b in intervals) {
\r
827 if (a.Contains (b)) {
\r
828 for (int i = b.low; i <= b.high; ++ i)
\r
829 bits[i - a.low] = true;
\r
833 cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
\r
835 else if (a.IsSingleton) // Character
\r
836 cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
\r
838 cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
\r
849 cmp.ResolveLink (tail);
\r
853 public override void GetWidth (out int min, out int max) {
\r
857 public override bool IsComplex () {
\r
863 private static double GetIntervalCost (Interval i) {
\r
864 // use op length as cost metric (=> optimize for space)
\r
866 if (i.IsDiscontiguous)
\r
867 return 3 + ((i.Size + 0xf) >> 4); // Set
\r
868 else if (i.IsSingleton)
\r
869 return 2; // Character
\r
874 private bool negate, ignore;
\r
875 private bool[] pos_cats, neg_cats;
\r
876 private IntervalCollection intervals;
\r
880 private Expression expr;
\r
882 private Position pos;
\r
883 private int offset;
\r
885 private string str;
\r
887 private bool ignore;
\r
889 public AnchorInfo (Expression expr, int width) {
\r
892 this.width = width;
\r
895 this.ignore = false;
\r
896 this.pos = Position.Any;
\r
899 public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
\r
901 this.offset = offset;
\r
902 this.width = width;
\r
904 this.str = ignore ? str.ToLower () : str;
\r
906 this.ignore = ignore;
\r
907 this.pos = Position.Any;
\r
910 public AnchorInfo (Expression expr, int offset, int width, Position pos) {
\r
912 this.offset = offset;
\r
913 this.width = width;
\r
918 this.ignore = false;
\r
921 public Expression Expression {
\r
922 get { return expr; }
\r
925 public int Offset {
\r
926 get { return offset; }
\r
930 get { return width; }
\r
933 public int Length {
\r
934 get { return (str != null) ? str.Length : 0; }
\r
937 public bool IsUnknownWidth {
\r
938 get { return width < 0; }
\r
941 public bool IsComplete {
\r
942 get { return Length == Width; }
\r
945 public string Substring {
\r
946 get { return str; }
\r
949 public bool IgnoreCase {
\r
950 get { return ignore; }
\r
953 public Position Position {
\r
954 get { return pos; }
\r
957 public bool IsSubstring {
\r
958 get { return str != null; }
\r
961 public bool IsPosition {
\r
962 get { return pos != Position.Any; }
\r
965 public Interval GetInterval () {
\r
966 return GetInterval (0);
\r
969 public Interval GetInterval (int start) {
\r
971 return Interval.Empty;
\r
973 return new Interval (start + Offset, start + Offset + Length - 1);
\r