1 //------------------------------------------------------------------------------
2 // <copyright file="XPathScanner.cs" company="Microsoft">
3 // Copyright (c) Microsoft Corporation. All rights reserved.
5 // <owner current="true" primary="true">Microsoft</owner>
6 // <spec>http://www.w3.org/TR/xpath#exprlex</spec>
7 //------------------------------------------------------------------------------
9 using System.Diagnostics;
11 namespace System.Xml.Xsl.XPath {
12 using Res = System.Xml.Utils.Res;
// Kinds of lexemes reported by XPathScanner.
14 // Extends XPathOperator enumeration
15 internal enum LexKind {
16 Unknown, // Unknown lexeme
18 And, // Operator 'and'
26 Minus, // Operator '-'
27 Multiply, // Operator '*'
28 Divide, // Operator 'div'
29 Modulo, // Operator 'mod'
30 UnaryMinus, // Not used
31 Union, // Operator '|'
36 SlashSlash, // Operator '//'
37 Number, // Number (numeric literal)
40 Name, // NameTest, NodeType, FunctionName, AxisName, second part of VariableReference
41 String, // Literal (string literal)
42 Eof, // End of the expression
// Alias of Name; CheckToken and LexKindToString assert that their argument is not below this value.
44 FirstStringable = Name,
// The members below use the code of the character they stand for as their numeric value.
55 Star = '*', // NameTest
56 Slash = '/', // Operator '/'
57 Dollar = '$', // First part of VariableReference
58 RBrace = '}', // Used for AVTs
// Lexical scanner over an XPath expression string.
61 internal sealed class XPathScanner {
62 private string xpathExpr; // Expression text being scanned
67 private string prefix; // Prefix of the current Name lexeme; empty string when there is none
68 private string stringValue; // Text of the current String lexeme, without the enclosing quotes
69 private bool canBeFunction; // True when '(' was found by the look-ahead after the current name
71 private int prevLexEnd; // Value of curIndex recorded at the start of NextLex
72 private LexKind prevKind; // Kind of the preceding lexeme; consulted by CheckOperator
73 private XPathAxis axis; // Axis of the current lexeme; XPathAxis.Unknown when it is not an axis
75 private XmlCharType xmlCharType = XmlCharType.Instance; // Character class checks (whitespace, NCName characters)
// Creates a scanner that starts at index 0 of the expression.
77 public XPathScanner(string xpathExpr) : this(xpathExpr, 0) {}
// Creates a scanner over xpathExpr (asserted to be non-null) positioned at index startFrom.
79 public XPathScanner(string xpathExpr, int startFrom) {
80 Debug.Assert(xpathExpr != null);
81 this.xpathExpr = xpathExpr;
// No lexeme has been read yet.
82 this.kind = LexKind.Unknown;
83 SetSourceIndex(startFrom);
// The expression text being scanned.
87 public string Source { get { return xpathExpr; } }
// Kind of the current lexeme.
88 public LexKind Kind { get { return kind; } }
// Index in Source at which the current lexeme starts.
89 public int LexStart { get { return lexStart; } }
// Number of characters between the start of the current lexeme and the current scan position.
90 public int LexSize { get { return curIndex - lexStart; } }
// Scan position that was recorded when NextLex was last entered.
91 public int PrevLexEnd { get { return prevLexEnd; } }
// Moves the scanner to the given index of the expression (callers use it to rewind to a saved position).
// The index may equal the expression length, i.e. the position just past the last character.
93 private void SetSourceIndex(int index) {
94 Debug.Assert(0 <= index && index <= xpathExpr.Length);
// Loads the character at curIndex into curChar while curIndex is inside the expression.
// The entry assertion admits curIndex == -1, so curIndex is presumably advanced before the read - confirm.
99 private void NextChar() {
100 Debug.Assert(-1 <= curIndex && curIndex < xpathExpr.Length);
102 if (curIndex < xpathExpr.Length) {
103 curChar = xpathExpr[curIndex];
// Otherwise the scanner must be exactly at the end of the expression.
105 Debug.Assert(curIndex == xpathExpr.Length);
110 #if XML10_FIFTH_EDITION
// Returns the character that follows the current one, when there is one, without changing curIndex.
// Compiled only when XML10_FIFTH_EDITION is defined.
111 private char PeekNextChar() {
112 Debug.Assert(-1 <= curIndex && curIndex <= xpathExpr.Length);
113 if (curIndex + 1 < xpathExpr.Length) {
114 return xpathExpr[curIndex + 1];
// Only meaningful while the current lexeme is a Name and the name has been set.
124 Debug.Assert(kind == LexKind.Name);
125 Debug.Assert(name != null);
// Prefix of the current Name lexeme; NextLex sets it to the empty string when the name has no prefix.
130 public string Prefix {
// Only meaningful while the current lexeme is a Name and the prefix has been set.
132 Debug.Assert(kind == LexKind.Name);
133 Debug.Assert(prefix != null);
// Text of the current lexeme as it appears in the expression.
138 public string RawValue {
// At end of input the display string for Eof is returned rather than a substring.
140 if (kind == LexKind.Eof) {
141 return LexKindToString(kind);
143 return xpathExpr.Substring(lexStart, curIndex - lexStart);
// Value of the current String lexeme (ScanString stores the text between the quotes).
148 public string StringValue {
// Only meaningful while the current lexeme is a String and the value has been set.
150 Debug.Assert(kind == LexKind.String);
151 Debug.Assert(stringValue != null);
156 // Returns true if the character following a QName (possibly after intervening
157 // ExprWhitespace) is '('. In this case the token must be recognized as a NodeType
158 // or a FunctionName unless it is an OperatorName. This distinction cannot be done
159 // without knowing the previous lexeme. For example, "or" in "... or (1 != 0)" may
160 // be an OperatorName or a FunctionName.
161 public bool CanBeFunction {
// Only meaningful while the current lexeme is a Name.
163 Debug.Assert(kind == LexKind.Name);
164 return canBeFunction;
// Axis of the current lexeme.
168 public XPathAxis Axis {
// Only meaningful while the current lexeme is an Axis, in which case the axis is a known one.
170 Debug.Assert(kind == LexKind.Axis);
171 Debug.Assert(axis != XPathAxis.Unknown);
// Loops for as long as the current character is whitespace according to XmlCharType.
176 private void SkipSpace() {
177 while (xmlCharType.IsWhiteSpace(curChar)) {
// Returns true when ch is one of the ASCII digits '0'..'9'.
182 private static bool IsAsciiDigit(char ch) {
// Single-comparison range check: for characters below '0' the difference is negative and
// (in the default unchecked context) converts to a large unsigned value, which fails the test.
183 return (uint)(ch - '0') <= 9;
// Scans the next lexeme of the expression and records its kind in 'kind'.
// For Name lexemes it also fills in name, prefix, canBeFunction and axis.
186 public void NextLex() {
// Remember the scan position at entry; it is exposed through PrevLexEnd.
187 prevLexEnd = curIndex;
// Single-character punctuation: the LexKind value is the code of the character itself.
196 case '(': case ')': case '[': case ']':
197 case '@': case ',': case '$': case '}':
198 kind = (LexKind)curChar;
// '.' at this point yields DotDot (presumably this is the character after a leading '.').
203 if (curChar == '.') {
204 kind = LexKind.DotDot;
206 } else if (IsAsciiDigit(curChar)) {
// A digit follows: rewind to the start of the lexeme, presumably so that it is scanned as a number.
207 SetSourceIndex(lexStart);
// ':' at this point yields ColonColon; the other path reports Unknown.
215 if (curChar == ':') {
216 kind = LexKind.ColonColon;
219 kind = LexKind.Unknown;
// '/' at this point yields SlashSlash; otherwise the lexeme is a single Slash.
229 if (curChar == '/') {
230 kind = LexKind.SlashSlash;
233 kind = LexKind.Slash;
237 kind = LexKind.Union;
245 kind = LexKind.Minus;
// NOTE(review): the '=' tests below presumably look for the second character of a
// two-character comparison operator; confirm against the full method.
254 if (curChar == '=') {
258 kind = LexKind.Unknown;
263 if (curChar == '=') {
272 if (curChar == '=') {
281 kind = LexKind.String;
284 case '0': case '1': case '2': case '3':
285 case '4': case '5': case '6': case '7':
287 kind = LexKind.Number;
// A lexeme that begins with an NCName start character is scanned as a name.
291 if (xmlCharType.IsStartNCNameSingleChar(curChar)
292 #if XML10_FIFTH_EDITION
293 || xmlCharType.IsNCNameHighSurrogateChar(curChar)
297 this.name = ScanNCName();
298 this.prefix = string.Empty;
299 this.canBeFunction = false;
300 this.axis = XPathAxis.Unknown;
301 bool colonColon = false;
// Position just after the first NCName, used to rewind after look-ahead.
302 int saveSourceIndex = curIndex;
304 // "foo:bar" or "foo:*" -- one lexeme (no spaces allowed)
305 // "foo::" or "foo ::" -- two lexemes, reported as one (AxisName)
306 // "foo:?" or "foo :?" -- lexeme "foo" reported
307 if (curChar == ':') {
309 if (curChar == ':') { // "foo::" -> OperatorName, AxisName
312 SetSourceIndex(saveSourceIndex);
313 } else { // "foo:bar", "foo:*" or "foo:?"
314 if (curChar == '*') {
// "foo:*": the NCName scanned so far is the prefix.
316 this.prefix = this.name;
318 } else if (xmlCharType.IsStartNCNameSingleChar(curChar)
319 #if XML10_FIFTH_EDITION
320 || xmlCharType.IsNCNameHighSurrogateChar(curChar)
323 this.prefix = this.name;
324 this.name = ScanNCName();
325 // Look ahead for '(' to determine whether QName can be a FunctionName
326 saveSourceIndex = curIndex;
328 this.canBeFunction = (curChar == '(');
329 SetSourceIndex(saveSourceIndex);
330 } else { // "foo:?" -> OperatorName, NameTest
331 // Return "foo" and leave ":" to be reported later as an unknown lexeme
332 SetSourceIndex(saveSourceIndex);
337 if (curChar == ':') { // "foo ::" or "foo :?"
339 if (curChar == ':') {
343 SetSourceIndex(saveSourceIndex);
345 this.canBeFunction = (curChar == '(');
// Operator names take precedence; an axis is looked up only when the name was followed by "::".
348 if (!CheckOperator(false) && colonColon) {
349 this.axis = CheckAxis();
// Fallback: the lexeme kind is Unknown.
352 kind = LexKind.Unknown;
// Decides whether the current '*' (star == true) or NCName should be treated as an operator,
// following the disambiguation rule quoted further down. NextLex calls it with star == false for
// names and treats a true result as "this name is an operator".
359 private bool CheckOperator(bool star) {
363 opKind = LexKind.Multiply;
// A prefixed name, or one longer than three characters, cannot be any of the operator names below.
365 if (prefix.Length != 0 || name.Length > 3)
369 case "or" : opKind = LexKind.Or; break;
370 case "and": opKind = LexKind.And; break;
371 case "div": opKind = LexKind.Divide; break;
372 case "mod": opKind = LexKind.Modulo; break;
373 default : return false;
377 // If there is a preceding token and the preceding token is not one of '@', '::', '(', '[', ',' or an Operator,
378 // then a '*' must be recognized as a MultiplyOperator and an NCName must be recognized as an OperatorName.
379 if (prevKind <= LexKind.LastOperator)
384 case LexKind.SlashSlash:
386 case LexKind.ColonColon:
387 case LexKind.LParens:
388 case LexKind.LBracket:
// Maps the current name to an XPath axis. The lexeme kind is set to Axis up front;
// when the name is not one of the axis names listed here, the kind is reset to Name
// and XPathAxis.Unknown is returned.
398 private XPathAxis CheckAxis() {
399 this.kind = LexKind.Axis;
401 case "ancestor" : return XPathAxis.Ancestor;
402 case "ancestor-or-self" : return XPathAxis.AncestorOrSelf;
403 case "attribute" : return XPathAxis.Attribute;
404 case "child" : return XPathAxis.Child;
405 case "descendant" : return XPathAxis.Descendant;
406 case "descendant-or-self" : return XPathAxis.DescendantOrSelf;
407 case "following" : return XPathAxis.Following;
408 case "following-sibling" : return XPathAxis.FollowingSibling;
409 case "namespace" : return XPathAxis.Namespace;
410 case "parent" : return XPathAxis.Parent;
411 case "preceding" : return XPathAxis.Preceding;
412 case "preceding-sibling" : return XPathAxis.PrecedingSibling;
413 case "self" : return XPathAxis.Self;
// Not an axis name: undo the optimistic kind assignment made at the top.
414 default : this.kind = LexKind.Name; return XPathAxis.Unknown;
// Scans a numeric literal: a run of digits, optionally followed by '.' and more digits.
// Must be entered with the current character being a digit or '.'.
418 private void ScanNumber() {
419 Debug.Assert(IsAsciiDigit(curChar) || curChar == '.');
// Integer part.
420 while (IsAsciiDigit(curChar)) {
423 if (curChar == '.') {
// Fractional part.
425 while (IsAsciiDigit(curChar)) {
// Clearing bit 0x20 folds 'e' onto 'E', so this matches an exponent marker in either case.
429 if ((curChar & (~0x20)) == 'E') {
431 if (curChar == '+' || curChar == '-') {
434 while (IsAsciiDigit(curChar)) {
// Exponent notation is reported as an error (presumably only on the exponent path above - confirm).
437 throw CreateException(Res.XPath_ScientificNotation);
// Scans a string literal. The current character is used as the delimiter to search for,
// so it is expected to be the opening quote.
441 private void ScanString() {
// First character after the opening quote.
442 int startIdx = curIndex + 1;
// Position of the next occurrence of the same quote character.
443 int endIdx = xpathExpr.IndexOf(curChar, startIdx);
// Unterminated literal: move to the end of the expression and report the error.
446 SetSourceIndex(xpathExpr.Length);
447 throw CreateException(Res.XPath_UnclosedString);
// The value is the text between the quotes; scanning resumes just past the closing quote.
450 this.stringValue = xpathExpr.Substring(startIdx, endIdx - startIdx);
451 SetSourceIndex(endIdx + 1);
// Scans an NCName beginning at the current character and returns its text.
// Must be entered with the current character being an NCName start character.
454 private string ScanNCName() {
455 Debug.Assert(xmlCharType.IsStartNCNameSingleChar(curChar)
456 #if XML10_FIFTH_EDITION
457 || xmlCharType.IsNCNameHighSurrogateChar(curChar)
// Index of the first character of the name.
460 int start = curIndex;
462 if (xmlCharType.IsNCNameSingleChar(curChar)) {
465 #if XML10_FIFTH_EDITION
466 else if (xmlCharType.IsNCNameSurrogateChar(PeekNextChar(), curChar)) {
// The name spans from 'start' up to, but not including, the current position.
475 return xpathExpr.Substring(start, curIndex - start);
// NOTE(review): presumably verifies that the current lexeme is of kind t and then moves on to the next lexeme - confirm against the body.
478 public void PassToken(LexKind t) {
// Reports an error when the expected lexeme kind t is not what the scanner has
// (presumably the throws below are guarded by a comparison of kind with t - confirm).
// t must be a kind that LexKindToString can render.
483 public void CheckToken(LexKind t) {
484 Debug.Assert(LexKind.FirstStringable <= t);
// End of expression was expected: the message carries only the text that was found.
486 if (t == LexKind.Eof) {
487 throw CreateException(Res.XPath_EofExpected, RawValue);
// Otherwise the message carries both the expected token and the text that was found.
489 throw CreateException(Res.XPath_TokenExpected, LexKindToString(t), RawValue);
// Returns a display string for a lexeme kind, for use in error messages.
494 // May be called for the following tokens: Name, String, Eof, Comma, LParens, RParens, LBracket, RBracket, RBrace
495 private string LexKindToString(LexKind t) {
496 Debug.Assert(LexKind.FirstStringable <= t);
// Kinds above LastNonChar have a character code as their value, so the character itself is the display string.
498 if (LexKind.LastNonChar < t) {
499 Debug.Assert("()[].@,*/$}".IndexOf((char)t) >= 0);
500 return new String((char)t, 1);
// The remaining kinds get a descriptive placeholder.
504 case LexKind.Name : return "<name>";
505 case LexKind.String : return "<string literal>";
506 case LexKind.Eof : return "<eof>";
508 Debug.Fail("Unexpected LexKind: " + t.ToString());
// Builds (does not throw) an XPathCompileException for the current lexeme: it is given the
// expression text, the lexeme start index, the current scan index, the resource id and its arguments.
513 public XPathCompileException CreateException(string resId, params string[] args) {
514 return new XPathCompileException(xpathExpr, lexStart, curIndex, resId, args);