// // assembly: System // namespace: System.Text.RegularExpressions // file: syntax.cs // // author: Dan Lewis (dlewis@gmx.co.uk) // (c) 2002 using System; using System.Collections; namespace System.Text.RegularExpressions.Syntax { // collection classes class ExpressionCollection : CollectionBase { public void Add (Expression e) { List.Add (e); } public Expression this[int i] { get { return (Expression)List[i]; } set { List[i] = value; } } protected override void OnValidate (object o) { // allow null elements } } // abstract classes abstract class Expression { public abstract void Compile (ICompiler cmp, bool reverse); public abstract void GetWidth (out int min, out int max); public int GetFixedWidth () { int min, max; GetWidth (out min, out max); if (min == max) return min; return -1; } public virtual AnchorInfo GetAnchorInfo () { return new AnchorInfo (this, GetFixedWidth ()); } public virtual bool IsComplex () { return true; } } // composite expressions abstract class CompositeExpression : Expression { public CompositeExpression () { expressions = new ExpressionCollection (); } protected ExpressionCollection Expressions { get { return expressions; } } protected void GetWidth (out int min, out int max, int count) { min = Int32.MaxValue; max = 0; bool empty = true; for (int i = 0; i < count; ++ i) { Expression e = Expressions[i]; if (e == null) continue; empty = false; int a, b; e.GetWidth (out a, out b); if (a < min) min = a; if (b > max) max = b; } if (empty) min = max = 0; } private ExpressionCollection expressions; } // groups class Group : CompositeExpression { public Group () { } public Expression Expression { get { return Expressions[0]; } set { Expressions[0] = value; } } public void AppendExpression (Expression e) { Expressions.Add (e); } public override void Compile (ICompiler cmp, bool reverse) { int count = Expressions.Count; for (int i = 0; i < count; ++ i) { Expression e; if (reverse) e = Expressions[count - i - 1]; else e = Expressions[i]; e.Compile (cmp, reverse); } } public override void GetWidth (out int min, out int max) { min = 0; max = 0; foreach (Expression e in Expressions) { int a, b; e.GetWidth (out a, out b); min += a; if (max == Int32.MaxValue || b == Int32.MaxValue) max = Int32.MaxValue; else max += b; } } public override AnchorInfo GetAnchorInfo () { int ptr; int width = GetFixedWidth (); ArrayList infos = new ArrayList (); IntervalCollection segments = new IntervalCollection (); // accumulate segments ptr = 0; foreach (Expression e in Expressions) { AnchorInfo info = e.GetAnchorInfo (); infos.Add (info); if (info.IsPosition) return new AnchorInfo (this, ptr + info.Offset, width, info.Position); if (info.IsSubstring) segments.Add (info.GetInterval (ptr)); if (info.IsUnknownWidth) break; ptr += info.Width; } // normalize and find the longest segment segments.Normalize (); Interval longest = Interval.Empty; foreach (Interval segment in segments) { if (segment.Size > longest.Size) longest = segment; } // now chain the substrings that made this segment together if (!longest.IsEmpty) { string str = ""; bool ignore = false; ptr = 0; foreach (AnchorInfo info in infos) { if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) { str += info.Substring; // TODO mark subexpressions ignore |= info.IgnoreCase; } if (info.IsUnknownWidth) break; ptr += info.Width; } return new AnchorInfo (this, longest.low, width, str, ignore); } return new AnchorInfo (this, width); } public override bool IsComplex () { bool comp = false; foreach (Expression e in Expressions) { comp |= e.IsComplex (); } return comp | GetFixedWidth () <= 0; } } class RegularExpression : Group { public RegularExpression () { group_count = 0; } public int GroupCount { get { return group_count; } set { group_count = value; } } public override void Compile (ICompiler cmp, bool reverse) { // info block int min, max; GetWidth (out min, out max); cmp.EmitInfo (group_count, min, max); // anchoring expression AnchorInfo info = GetAnchorInfo (); if (reverse) info = new AnchorInfo (this, GetFixedWidth ()); // FIXME LinkRef pattern = cmp.NewLink (); cmp.EmitAnchor (info.Offset, pattern); if (info.IsPosition) cmp.EmitPosition (info.Position); else if (info.IsSubstring) cmp.EmitString (info.Substring, info.IgnoreCase, reverse); cmp.EmitTrue (); // pattern cmp.ResolveLink (pattern); base.Compile (cmp, reverse); cmp.EmitTrue (); } private int group_count; } class CapturingGroup : Group { public CapturingGroup () { this.gid = 0; this.name = null; } public int Number { get { return gid; } set { gid = value; } } public string Name { get { return name; } set { name = value; } } public bool IsNamed { get { return name != null; } } public override void Compile (ICompiler cmp, bool reverse) { cmp.EmitOpen (gid); base.Compile (cmp, reverse); cmp.EmitClose (gid); } public override bool IsComplex () { return true; } private int gid; private string name; } class BalancingGroup : CapturingGroup { public BalancingGroup () { this.balance = null; } public CapturingGroup Balance { get { return balance; } set { balance = value; } } public override void Compile (ICompiler cmp, bool reverse) { // can't invoke Group.Compile from here :( // so I'll just repeat the code int count = Expressions.Count; for (int i = 0; i < count; ++ i) { Expression e; if (reverse) e = Expressions[count - i - 1]; else e = Expressions[i]; e.Compile (cmp, reverse); } cmp.EmitBalance (this.Number, balance.Number); } private CapturingGroup balance; } class NonBacktrackingGroup : Group { public NonBacktrackingGroup () { } public override void Compile (ICompiler cmp, bool reverse) { LinkRef tail = cmp.NewLink (); cmp.EmitSub (tail); base.Compile (cmp, reverse); cmp.EmitTrue (); cmp.ResolveLink (tail); } public override bool IsComplex () { return true; } } // repetition class Repetition : CompositeExpression { public Repetition (int min, int max, bool lazy) { Expressions.Add (null); this.min = min; this.max = max; this.lazy = lazy; } public Expression Expression { get { return Expressions[0]; } set { Expressions[0] = value; } } public int Minimum { get { return min; } set { min = value; } } public int Maximum { get { return max; } set { max = value; } } public bool Lazy { get { return lazy; } set { lazy = value; } } public override void Compile (ICompiler cmp, bool reverse) { if (Expression.IsComplex ()) { LinkRef until = cmp.NewLink (); cmp.EmitRepeat (min, max, lazy, until); Expression.Compile (cmp, reverse); cmp.EmitUntil (until); } else { LinkRef tail = cmp.NewLink (); cmp.EmitFastRepeat (min, max, lazy, tail); Expression.Compile (cmp, reverse); cmp.EmitTrue (); cmp.ResolveLink (tail); } } public override void GetWidth (out int min, out int max) { Expression.GetWidth (out min, out max); min = min * this.min; if (max == Int32.MaxValue || this.max == 0xffff) max = Int32.MaxValue; else max = max * this.max; } public override AnchorInfo GetAnchorInfo () { int width = GetFixedWidth (); if (Minimum == 0) return new AnchorInfo (this, width); AnchorInfo info = Expression.GetAnchorInfo (); if (info.IsPosition) return new AnchorInfo (this, info.Offset, width, info.Position); if (info.IsSubstring) { if (info.IsComplete) { string str = ""; for (int i = 0; i < Minimum; ++ i) str += info.Substring; return new AnchorInfo (this, 0, width, str, info.IgnoreCase); } return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase); } return new AnchorInfo (this, width); } private int min, max; private bool lazy; } // assertions abstract class Assertion : CompositeExpression { public Assertion () { Expressions.Add (null); // true expression Expressions.Add (null); // false expression } public Expression TrueExpression { get { return Expressions[0]; } set { Expressions[0] = value; } } public Expression FalseExpression { get { return Expressions[1]; } set { Expressions[1] = value; } } public override void GetWidth (out int min, out int max) { GetWidth (out min, out max, 2); if (TrueExpression == null || FalseExpression == null) min = 0; } } class CaptureAssertion : Assertion { public CaptureAssertion () { } public CapturingGroup CapturingGroup { get { return group; } set { group = value; } } public override void Compile (ICompiler cmp, bool reverse) { int gid = group.Number; LinkRef tail = cmp.NewLink (); if (FalseExpression == null) { // IfDefined :1 // // 1: cmp.EmitIfDefined (gid, tail); TrueExpression.Compile (cmp, reverse); } else { // IfDefined :1 // // Jump :2 // 1: // 2: LinkRef false_expr = cmp.NewLink (); cmp.EmitIfDefined (gid, false_expr); TrueExpression.Compile (cmp, reverse); cmp.EmitJump (tail); cmp.ResolveLink (false_expr); FalseExpression.Compile (cmp, reverse); } cmp.ResolveLink (tail); } public override bool IsComplex () { bool comp = false; if (TrueExpression != null) comp |= TrueExpression.IsComplex (); if (FalseExpression != null) comp |= FalseExpression.IsComplex (); return comp | GetFixedWidth () <= 0; } private CapturingGroup group; } class ExpressionAssertion : Assertion { public ExpressionAssertion () { Expressions.Add (null); // test expression } public bool Reverse { get { return reverse; } set { reverse = value; } } public bool Negate { get { return negate; } set { negate = value; } } public Expression TestExpression { get { return Expressions[2]; } set { Expressions[2] = value; } } public override void Compile (ICompiler cmp, bool reverse) { LinkRef true_expr = cmp.NewLink (); LinkRef false_expr = cmp.NewLink (); // test op: positive / negative if (!negate) cmp.EmitTest (true_expr, false_expr); else cmp.EmitTest (false_expr, true_expr); // test expression: lookahead / lookbehind TestExpression.Compile (cmp, reverse ^ this.reverse); cmp.EmitTrue (); // target expressions if (TrueExpression == null) { // (?= ...) // Test :1, :2 // // :2 False // :1 cmp.ResolveLink (false_expr); cmp.EmitFalse (); cmp.ResolveLink (true_expr); } else { cmp.ResolveLink (true_expr); TrueExpression.Compile (cmp, reverse); if (FalseExpression == null) { // (?(...) ...) // Test :1, :2 // // :1 // :2 cmp.ResolveLink (false_expr); } else { // (?(...) ... | ...) // Test :1, :2 // // :1 // Jump :3 // :2 // :3 LinkRef tail = cmp.NewLink (); cmp.EmitJump (tail); cmp.ResolveLink (false_expr); FalseExpression.Compile (cmp, reverse); cmp.ResolveLink (tail); } } } private bool reverse, negate; } // alternation class Alternation : CompositeExpression { public Alternation () { } public ExpressionCollection Alternatives { get { return Expressions; } } public void AddAlternative (Expression e) { Alternatives.Add (e); } public override void Compile (ICompiler cmp, bool reverse) { LinkRef next = cmp.NewLink (); LinkRef tail = cmp.NewLink (); foreach (Expression e in Alternatives) { cmp.EmitBranch (next); e.Compile (cmp, reverse); cmp.EmitJump (tail); cmp.ResolveLink (next); } cmp.EmitFalse (); cmp.ResolveLink (tail); } public override void GetWidth (out int min, out int max) { GetWidth (out min, out max, Alternatives.Count); } public override bool IsComplex () { bool comp = false; foreach (Expression e in Alternatives) { comp |= e.IsComplex (); } return comp | GetFixedWidth () <= 0; } } // terminal expressions class Literal : Expression { public Literal (string str, bool ignore) { this.str = str; this.ignore = ignore; } public string String { get { return str; } set { str = value; } } public bool IgnoreCase { get { return ignore; } set { ignore = value; } } public override void Compile (ICompiler cmp, bool reverse) { if (str.Length == 0) return; if (str.Length == 1) cmp.EmitCharacter (str[0], false, ignore, reverse); else cmp.EmitString (str, ignore, reverse); } public override void GetWidth (out int min, out int max) { min = max = str.Length; } public override AnchorInfo GetAnchorInfo () { return new AnchorInfo (this, 0, str.Length, str, ignore); } public override bool IsComplex () { return false; } private string str; private bool ignore; } class PositionAssertion : Expression { public PositionAssertion (Position pos) { this.pos = pos; } public Position Position { get { return pos; } set { pos = value; } } public override void Compile (ICompiler cmp, bool reverse) { cmp.EmitPosition (pos); } public override void GetWidth (out int min, out int max) { min = max = 0; } public override bool IsComplex () { return false; } public override AnchorInfo GetAnchorInfo () { switch (pos) { case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan: return new AnchorInfo (this, 0, 0, pos); default: return new AnchorInfo (this, 0); } } private Position pos; } class Reference : Expression { public Reference (bool ignore) { this.ignore = ignore; } public CapturingGroup CapturingGroup { get { return group; } set { group = value; } } public bool IgnoreCase { get { return ignore; } set { ignore = value; } } public override void Compile (ICompiler cmp, bool reverse) { cmp.EmitReference (group.Number, ignore, reverse); } public override void GetWidth (out int min, out int max) { //group.GetWidth (out min, out max); // TODO set width to referenced group for non-cyclical references min = 0; max = Int32.MaxValue; } public override bool IsComplex () { return true; // FIXME incorporate cyclic check } private CapturingGroup group; private bool ignore; } class CharacterClass : Expression { public CharacterClass (bool negate, bool ignore) { this.negate = negate; this.ignore = ignore; intervals = new IntervalCollection (); // initialize pos/neg category arrays Array cat_values = Enum.GetValues (typeof (Category)); int cat_size = (int)(Category)cat_values.GetValue (cat_values.Length - 1) + 1; pos_cats = new bool[cat_size]; neg_cats = new bool[cat_size]; for (int i = 0; i < cat_size; ++ i) { pos_cats[i] = false; neg_cats[i] = false; } } public CharacterClass (Category cat, bool negate) : this (false, false) { this.AddCategory (cat, negate); } public bool Negate { get { return negate; } set { negate = value; } } public bool IgnoreCase { get { return ignore; } set { ignore = value; } } public void AddCategory (Category cat, bool negate) { int n = (int)cat; if (negate) { if (pos_cats[n]) pos_cats[n] = false; neg_cats[n] = true; } else { if (neg_cats[n]) neg_cats[n] = false; pos_cats[n] = true; } } public void AddCharacter (char c) { // TODO: this is certainly not the most efficient way of doing things // TODO: but at least it produces correct results. AddRange (c, c); } public void AddRange (char lo, char hi) { Interval new_interval = new Interval (lo, hi); // ignore case is on. we must make sure our interval does not // use upper case. if it does, we must normalize the upper case // characters into lower case. if (ignore) { if (upper_case_characters.Intersects (new_interval)) { Interval partial_new_interval; if (new_interval.low < upper_case_characters.low) { partial_new_interval = new Interval (upper_case_characters.low + distance_between_upper_and_lower_case, new_interval.high + distance_between_upper_and_lower_case); new_interval.high = upper_case_characters.low - 1; } else { partial_new_interval = new Interval (new_interval.low + distance_between_upper_and_lower_case, upper_case_characters.high + distance_between_upper_and_lower_case); new_interval.low = upper_case_characters.high + 1; } intervals.Add (partial_new_interval); } else if (upper_case_characters.Contains (new_interval)) { new_interval.high += distance_between_upper_and_lower_case; new_interval.low += distance_between_upper_and_lower_case; } } intervals.Add (new_interval); } public override void Compile (ICompiler cmp, bool reverse) { // create the meta-collection IntervalCollection meta = intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost)); // count ops int count = meta.Count; for (int i = 0; i < pos_cats.Length; ++ i) { if (pos_cats[i]) ++ count; if (neg_cats[i]) ++ count; } if (count == 0) return; // emit in op for |meta| > 1 LinkRef tail = cmp.NewLink (); if (count > 1) cmp.EmitIn (tail); // emit categories for (int i = 0; i < pos_cats.Length; ++ i) { if (pos_cats[i]) cmp.EmitCategory ((Category)i, negate, reverse); else if (neg_cats[i]) cmp.EmitCategory ((Category)i, !negate, reverse); } // emit character/range/sets from meta-collection foreach (Interval a in meta) { if (a.IsDiscontiguous) { // Set BitArray bits = new BitArray (a.Size); foreach (Interval b in intervals) { if (a.Contains (b)) { for (int i = b.low; i <= b.high; ++ i) bits[i - a.low] = true; } } cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse); } else if (a.IsSingleton) // Character cmp.EmitCharacter ((char)a.low, negate, ignore, reverse); else // Range cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse); } // finish up if (count > 1) { if (negate) cmp.EmitTrue (); else cmp.EmitFalse (); cmp.ResolveLink (tail); } } public override void GetWidth (out int min, out int max) { min = max = 1; } public override bool IsComplex () { return false; } // private private static double GetIntervalCost (Interval i) { // use op length as cost metric (=> optimize for space) if (i.IsDiscontiguous) return 3 + ((i.Size + 0xf) >> 4); // Set else if (i.IsSingleton) return 2; // Character else return 3; // Range } private static Interval upper_case_characters = new Interval ((char)65, (char)90); private const int distance_between_upper_and_lower_case = 32; private bool negate, ignore; private bool[] pos_cats, neg_cats; private IntervalCollection intervals; } class AnchorInfo { private Expression expr; private Position pos; private int offset; private string str; private int width; private bool ignore; public AnchorInfo (Expression expr, int width) { this.expr = expr; this.offset = 0; this.width = width; this.str = null; this.ignore = false; this.pos = Position.Any; } public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) { this.expr = expr; this.offset = offset; this.width = width; this.str = ignore ? str.ToLower () : str; this.ignore = ignore; this.pos = Position.Any; } public AnchorInfo (Expression expr, int offset, int width, Position pos) { this.expr = expr; this.offset = offset; this.width = width; this.pos = pos; this.str = null; this.ignore = false; } public Expression Expression { get { return expr; } } public int Offset { get { return offset; } } public int Width { get { return width; } } public int Length { get { return (str != null) ? str.Length : 0; } } public bool IsUnknownWidth { get { return width < 0; } } public bool IsComplete { get { return Length == Width; } } public string Substring { get { return str; } } public bool IgnoreCase { get { return ignore; } } public Position Position { get { return pos; } } public bool IsSubstring { get { return str != null; } } public bool IsPosition { get { return pos != Position.Any; } } public Interval GetInterval () { return GetInterval (0); } public Interval GetInterval (int start) { if (!IsSubstring) return Interval.Empty; return new Interval (start + Offset, start + Offset + Length - 1); } } }