2 // TokenStream.cs: Port of Mozilla's Rhino TokenStream
3 // This class implements the JScript scanner
6 // Cesar Lopez Nataren (cesar@ciencias.unam.mx)
8 // (C) 2004, Cesar Lopez Nataren
12 // Permission is hereby granted, free of charge, to any person obtaining
13 // a copy of this software and associated documentation files (the
14 // "Software"), to deal in the Software without restriction, including
15 // without limitation the rights to use, copy, modify, merge, publish,
16 // distribute, sublicense, and/or sell copies of the Software, and to
17 // permit persons to whom the Software is furnished to do so, subject to
18 // the following conditions:
20 // The above copyright notice and this permission notice shall be
21 // included in all copies or substantial portions of the Software.
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
27 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
28 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
29 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
34 using System.Collections;
35 using System.Globalization;
37 namespace Microsoft.JScript {
38 internal class TokenStream {
// Gets or sets the source_name field. The value is included in the
// messages printed by ReportCurrentLineWarning and ReportCurrentLineError.
44 internal string SourceName {
45 get { return source_name; }
46 set { source_name = value; }
// Gets or sets the line_number field. The value is included in the
// messages printed by ReportCurrentLineWarning and ReportCurrentLineError.
50 internal int LineNumber {
51 get { return line_number; }
52 set { line_number = value; }
57 get { return hit_eof; }
// Gets or sets the token_number field.
// NOTE(review): presumably a running count of the tokens handed out by
// GetToken -- confirm against the code that updates token_number.
61 internal int TokenNumber {
62 get { return token_number; }
63 set { token_number = value; }
71 int string_buffer_top;
72 char [] string_buffer = new char [128];
74 // Room to back up to '<' after a failed match of the last '-' in "<!--"
75 int [] unget_buffer = new int [3];
82 char [] source_buffer;
86 static int EOF_CHAR = -1;
87 static int EOL_HINT_MASK = 0xdfd0;
89 StreamReader source_reader;
// Returns the _string field. GetToken assigns it the interned text of an
// identifier or string literal, and, for a regular expression literal, the
// part of the scan buffer that precedes the flags.
94 internal string GetString {
95 get { return _string; }
98 static bool reserved_keyword_as_identifier;
// Returns the number field.
// NOTE(review): presumably the value of the most recently scanned numeric
// literal -- confirm against the code that assigns number.
101 internal double GetNumber {
102 get { return number; }
107 internal int GetOp ()
112 internal bool allow_reg_exp;
114 internal string reg_exp_flags;
120 internal TokenStream (StreamReader source_reader, string source_string, string source_name, int line_number)
122 pushback_token = Token.EOF;
123 SourceName = source_name;
124 this.line_number = line_number;
125 if (source_reader != null) {
126 if (source_string != null)
128 this.source_reader = source_reader;
129 source_buffer = new char [512];
132 if (source_string == null)
134 this.source_string = source_string;
135 source_end = source_string.Length;
// Returns true when StringToKeyword maps s to anything other than
// Token.EOF. Names that StringToKeyword maps to Token.RESERVED therefore
// count as keywords as well.
140 static bool IsKeyword (string s)
142 return Token.EOF != StringToKeyword (s);
145 static int StringToKeyword (string name)
147 // The following assumes that Token.EOF == 0
149 Id_break = Token.BREAK,
150 Id_case = Token.CASE,
151 Id_continue = Token.CONTINUE,
152 Id_default = Token.DEFAULT,
153 Id_delete = Token.DELPROP,
155 Id_else = Token.ELSE,
156 Id_export = Token.EXPORT,
157 Id_false = Token.FALSE,
159 Id_function = Token.FUNCTION,
163 Id_null = Token.NULL,
164 Id_return = Token.RETURN,
165 Id_switch = Token.SWITCH,
166 Id_this = Token.THIS,
167 Id_true = Token.TRUE,
168 Id_typeof = Token.TYPEOF,
170 Id_void = Token.VOID,
171 Id_while = Token.WHILE,
172 Id_with = Token.WITH,
174 // the following are #ifdef RESERVE_JAVA_KEYWORDS in jsscan.c
175 Id_abstract = Token.RESERVED,
176 Id_boolean = Token.RESERVED,
177 Id_byte = Token.RESERVED,
178 Id_catch = Token.CATCH,
179 Id_char = Token.RESERVED,
180 Id_class = Token.RESERVED,
181 Id_const = Token.RESERVED,
182 Id_debugger = Token.RESERVED,
183 Id_double = Token.RESERVED,
184 Id_enum = Token.RESERVED,
185 Id_extends = Token.RESERVED,
186 Id_final = Token.RESERVED,
187 Id_finally = Token.FINALLY,
188 Id_float = Token.RESERVED,
189 Id_goto = Token.RESERVED,
190 Id_implements = Token.RESERVED,
191 Id_import = Token.IMPORT,
192 Id_instanceof = Token.INSTANCEOF,
193 Id_int = Token.RESERVED,
194 Id_interface = Token.RESERVED,
195 Id_long = Token.RESERVED,
196 Id_native = Token.RESERVED,
197 Id_package = Token.RESERVED,
198 Id_private = Token.RESERVED,
199 Id_protected = Token.RESERVED,
200 Id_public = Token.RESERVED,
201 Id_short = Token.RESERVED,
202 Id_static = Token.RESERVED,
203 Id_super = Token.RESERVED,
204 Id_synchronized = Token.RESERVED,
205 Id_throw = Token.THROW,
206 Id_throws = Token.RESERVED,
207 Id_transient = Token.RESERVED,
209 Id_volatile = Token.RESERVED;
216 string X = String.Empty;
227 } else if (c == 'n') {
232 } else if (c == 'o') {
242 if (s [2] == 'r' && s [1] == 'o') {
248 if (s [2] == 't' && s [1] == 'n') {
254 if (s [2] == 'w' && s [1] == 'e') {
260 if (s [2] == 'y' && s [1] == 'r') {
266 if (s [2] == 'r' && s [1] == 'a') {
282 if (s [2] == 's' && s [1] == 'a') {
286 } else if (c == 'r') {
287 if (s [2] == 'a' && s [1] == 'h') {
296 if (s [2] == 's' && s [1] == 'l') {
300 } else if (c == 'm') {
301 if (s [2] == 'u' && s [1] == 'n') {
318 if (s [2] == 'u' && s [1] == 'r') {
322 } else if (c == 's') {
323 if (s [2] == 'i' && s [1] == 'h') {
362 } else if (c == 'f') {
372 } else if (c == 's') {
402 } else if (c == 'r') {
498 } else if (c == 'p') {
501 } else if (c == 't') {
511 } else if (c == 'n') {
518 id = Id_synchronized;
523 if (X != null && X != s && !X.Equals (s))
534 // Pop the next token from the stream if it matches to_match; otherwise push it back
536 internal bool MatchToken (int to_match)
538 int token = GetToken ();
539 if (token == to_match)
541 // did not match, push back the token
543 pushback_token = token;
547 internal void UnGetToken (int tt)
549 // Cannot unread more than one token
550 if (pushback_token != Token.EOF && tt != Token.ERROR)
556 internal int PeekToken ()
558 int result = GetToken ();
559 pushback_token = result;
564 internal int PeekTokenSameLine ()
566 significant_eol = true;
567 int result = GetToken ();
568 pushback_token = result;
570 significant_eol = false;
574 internal int GetToken ()
579 // Check for pushed-back token
580 if (pushback_token != Token.EOF) {
581 int result = pushback_token;
582 pushback_token = Token.EOF;
583 if (result != Token.EOL || significant_eol)
589 // Eat whitespace, possibly sensitive to newlines
594 else if (c == '\n') {
598 } else if (!IsJSSpace (c)) {
605 // identifier/keyword/instanceof?
606 // watch out for starting with a <backslash>
607 bool identifier_start;
608 bool is_unicode_escape_start = false;
613 identifier_start = true;
614 is_unicode_escape_start = true;
615 string_buffer_top = 0;
617 identifier_start = false;
622 identifier_start = IsJavaIdentifierStart ((char) c);
623 if (identifier_start) {
624 string_buffer_top = 0;
629 if (identifier_start) {
630 bool contains_escape = is_unicode_escape_start;
632 if (is_unicode_escape_start) {
633 // strictly speaking we should probably push-back
634 // all the bad characters if the <backslash>uXXXX
635 // sequence is malformed. But since there isn't a
636 // correct context(is there?) for a bad Unicode
637 // escape sequence in an identifier, we can report
640 for (int i = 0; i != 4; ++i) {
642 escape_val = (escape_val << 4) | xDigitToInt (c);
643 // The next check takes care of c < 0 and bad escapes
647 if (escape_val < 0) {
648 ReportCurrentLineError ("msg.invalid.escape");
651 AddToString (escape_val);
652 is_unicode_escape_start = false;
658 is_unicode_escape_start = true;
659 contains_escape = true;
661 ReportCurrentLineError ("msg.illegal.character");
665 if (c == EOF_CHAR || !IsJavaIdentifierPart ((char) c))
673 string str = GetStringFromBuffer ();
674 if (!contains_escape) {
675 // OPT we shouldn't have to make a string (object!) to
676 // check if it's a keyword.
678 // Return the corresponding token if it's a keyword
679 int result = StringToKeyword (str);
680 if (result != Token.EOF) {
681 if (result != Token.RESERVED)
683 else if (!reserved_keyword_as_identifier)
686 // If implementation permits to use future reserved
687 // keywords in violation with the EcmaScript,
688 // treat it as name but issue warning
689 ReportCurrentLineWarning ("msg.reserved.keyword", str);
690 Console.WriteLine ("Warning: using future reserved keyword as name");
694 _string = String.Intern (str);
699 if (IsDigit (c) || (c == '.' && IsDigit (PeekChar ()))) {
700 string_buffer_top = 0;
705 if (c == 'x' || c == 'X') {
708 } else if (IsDigit (c))
715 while (0 <= xDigitToInt (c)) {
720 while ('0' <= c && c <= '9') {
722 * We permit 08 and 09 as decimal numbers, which
723 * makes our behavior a superset of the ECMA
724 * numeric grammar. We might not always be so
725 * permissive, so we warn about it.
727 if (_base == 8 && c >= '8') {
728 ReportCurrentLineWarning ("msg.bad.octal.literal", c == '8' ? "8" : "9");
736 bool is_integer = true;
738 if (_base == 10 && (c == '.' || c == 'e' || c == 'E')) {
744 } while (IsDigit (c));
746 if (c == 'e' || c == 'E') {
749 if (c == '+' || c == '-') {
754 ReportCurrentLineError ("msg.missing.exponent");
760 } while (IsDigit (c));
764 string num_string = GetStringFromBuffer ();
767 if (_base == 10 && !is_integer) {
769 // Use C# conversion to number from string
770 dval = Double.Parse (num_string, CultureInfo.InvariantCulture);
771 } catch (FormatException ex) {
772 ReportCurrentLineError ("msg.caught.nfe");
774 } catch (OverflowException) {
778 dval = StringToNumber (num_string, 0, _base);
785 if (c == '"' || c == '\'') {
786 // We attempt to accumulate a string the fast way, by
787 // building it directly out of the reader. But if there
788 // are any escaped characters in the string, we revert to
789 // building it out of a StringBuffer.
792 string_buffer_top = 0;
795 strLoop: while (c != quote_char) {
796 if (c == '\n' || c == EOF_CHAR) {
798 ReportCurrentLineError ("msg.unterminated.string.lit");
803 // We've hit an escaped character
808 case 'b': c = '\b'; break;
809 case 'f': c = '\f'; break;
810 case 'n': c = '\n'; break;
811 case 'r': c = '\r'; break;
812 case 't': c = '\t'; break;
814 // \v a late addition to the ECMA spec,
815 // it is not in Java, so use 0xb
816 case 'v': c = 0xb; break;
819 // Get 4 hex digits; if the u escape is not
820 // followed by 4 hex digits, use 'u' + the
821 // literal character sequence that follows.
822 int escape_start = string_buffer_top;
825 for (int i = 0; i != 4; ++i) {
827 escape_val = (escape_val << 4) | xDigitToInt (c);
832 // prepare for replace of stored 'u' sequence
834 string_buffer_top = escape_start;
838 // Get 2 hex digits, defaulting to 'x'+literal
839 // sequence, as above.
841 escape_val = xDigitToInt (c);
842 if (escape_val < 0) {
848 escape_val = (escape_val << 4) | xDigitToInt (c);
849 if (escape_val < 0) {
853 } else // got 2 hex digits
858 if ('0' <= c && c < '8') {
861 if ('0' <= c && c < '8') {
862 val = 8 * val + c - '0';
864 if ('0' <= c && c < '8' && val <= 037) {
865 // c is 3rd char of octal sequence only
866 // if the resulting val <= 0377
867 val = 8 * val + c - '0';
880 string str = GetStringFromBuffer ();
881 _string = String.Intern (str);
886 case ';': return Token.SEMI;
887 case '[': return Token.LB;
888 case ']': return Token.RB;
889 case '{': return Token.LC;
890 case '}': return Token.RC;
891 case '(': return Token.LP;
892 case ')': return Token.RP;
893 case ',': return Token.COMMA;
894 case '?': return Token.HOOK;
895 case ':': return Token.COLON;
896 case '.': return Token.DOT;
901 else if (MatchChar ('=')) {
903 return Token.ASSIGNOP;
908 if (MatchChar ('=')) {
910 return Token.ASSIGNOP;
917 else if (MatchChar ('=')) {
919 return Token.ASSIGNOP;
924 if (MatchChar ('=')) {
933 if (MatchChar ('=')) {
942 /* NB: treat HTML begin-comment as comment-till-eol */
943 if (MatchChar ('!')) {
944 if (MatchChar ('-')) {
945 if (MatchChar ('-')) {
953 if (MatchChar ('<')) {
954 if (MatchChar ('=')) {
956 return Token.ASSIGNOP;
967 if (MatchChar ('>')) {
968 if (MatchChar ('>')) {
969 if (MatchChar ('=')) {
971 return Token.ASSIGNOP;
975 if (MatchChar ('=')) {
977 return Token.ASSIGNOP;
989 if (MatchChar ('=')) {
991 return Token.ASSIGNOP;
996 // is it a // comment?
997 if (MatchChar ('/')) {
1001 if (MatchChar ('*')) {
1002 bool look_for_slash = false;
1005 if (c == EOF_CHAR) {
1006 ReportCurrentLineError ("msg.unterminated.comment");
1008 } else if (c == '*')
1009 look_for_slash = true;
1010 else if (c == '/') {
1014 look_for_slash = false;
1019 if (allow_reg_exp) {
1020 string_buffer_top = 0;
1021 while ((c = GetChar ()) != '/') {
1022 if (c == '\n' || c == EOF_CHAR) {
1024 ReportCurrentLineError ("msg.unterminated.re.lit");
1033 int re_end = string_buffer_top;
1036 if (MatchChar ('g'))
1038 else if (MatchChar ('i'))
1040 else if (MatchChar ('m'))
1046 if (IsAlpha (PeekChar ())) {
1047 ReportCurrentLineError ("msg.invalid.re.flag");
1051 _string = to_string (string_buffer).Substring (0, re_end);
1052 reg_exp_flags = to_string (string_buffer).Substring (re_end, string_buffer_top - re_end);
1053 return Token.REGEXP;
1056 if (MatchChar ('=')) {
1058 return Token.ASSIGNOP;
1063 if (MatchChar ('=')) {
1065 return Token.ASSIGNOP;
1070 return Token.BITNOT;
1073 if (MatchChar ('=')) {
1075 return Token.ASSIGNOP;
1076 } else if (MatchChar ('+'))
1082 if (MatchChar ('=')) {
1085 } else if (MatchChar ('-')) {
1087 // treat HTML end-comment after possible whitespace
1088 // after line start as comment-until-eol
1089 if (MatchChar ('>')) {
1101 ReportCurrentLineError ("msg.illegal.character");
1108 static bool IsAlpha (int c)
1114 return 'a' <= c && c <= 'z';
1117 double StringToNumber (string s, int start, int radix)
1119 char digit_max = '9';
1120 char lower_case_bound = 'a';
1121 char upper_case_bound = 'A';
1125 lower_case_bound = (char) ('a' + radix - 10);
1126 upper_case_bound = (char) ('A' + radix - 10);
1132 for (end = start; end < len; end++) {
1135 if ('0' <= c && c <= digit_max)
1136 new_digit = c - '0';
1137 else if ('a' <= c && c < lower_case_bound)
1138 new_digit = c - 'a' + 10;
1139 else if ('A' <= c && c < upper_case_bound)
1140 new_digit = c - 'A' + 10;
1143 sum = sum * radix + new_digit;
1149 if (sum >= 9007199254740992.0) {
1151 /* If we're accumulating a decimal number and the number
1152 * is >= 2^53, then the result from the repeated multiply-add
1153 * above may be inaccurate. Call Double.Parse to get the correct
1157 return Double.Parse (s, CultureInfo.InvariantCulture);
1158 } catch (FormatException fe) {
1161 } else if (radix == 2 || radix == 4 || radix == 8 ||
1162 radix == 16 || radix == 32) {
1163 /* The number may also be inaccurate for one of these bases.
1164 * This happens if the addition in value*radix + digit causes
1165 * a round-down to an even least significant mantissa bit
1166 * when the first dropped bit is a one. If any of the
1167 * following digits in the number (which haven't been added
1168 * in yet) are nonzero then the correct action would have
1169 * been to round up instead of down. An example of this
1170 * occurs when reading the number 0x1000000000000081, which
1171 * rounds to 0x1000000000000000 instead of 0x1000000000000100.
1173 int bit_shift_in_char = 1;
1176 const int SKIP_LEADING_ZEROS = 0;
1177 const int FIRST_EXACT_53_BITS = 1;
1178 const int AFTER_BIT_53 = 2;
1179 const int ZEROS_AFTER_54 = 3;
1180 const int MIXED_AFTER_54 = 4;
1182 int state = SKIP_LEADING_ZEROS;
1183 int exact_bits_limit = 53;
1184 double factor = 0.0;
1186 // bit54 is the 54th bit (the first dropped from the mantissa)
1190 if (bit_shift_in_char == 1) {
1193 digit = s [start++];
1194 if ('0' <= digit && digit <= '9')
1196 else if ('a' <= digit && digit <= 'z')
1200 bit_shift_in_char = radix;
1202 bit_shift_in_char >>= 1;
1203 bool bit = (digit & bit_shift_in_char) != 0;
1206 case SKIP_LEADING_ZEROS:
1210 state = FIRST_EXACT_53_BITS;
1213 case FIRST_EXACT_53_BITS:
1218 if (exact_bits_limit == 0) {
1220 state = AFTER_BIT_53;
1226 state = ZEROS_AFTER_54;
1228 // FIXME: check if this works
1229 case ZEROS_AFTER_54:
1230 case MIXED_AFTER_54:
1231 if (state == ZEROS_AFTER_54 && bit) {
1232 state = MIXED_AFTER_54;
1240 case SKIP_LEADING_ZEROS:
1243 case FIRST_EXACT_53_BITS:
1247 case ZEROS_AFTER_54:
1248 // x1.1 -> x1 + 1 (round up)
1249 // x0.1 -> x0 (round down)
1254 case MIXED_AFTER_54:
1255 // x.100...1.. -> x + 1 (round up)
1256 // x.0anything -> x (round down)
1263 /* We don't worry about inaccurate numbers for any other base. */
// Returns true when c is an ASCII decimal digit ('0' through '9').
// c is an int, so EOF_CHAR (-1) may be passed and yields false.
1268 bool IsDigit (int c)
1270 return '0' <= c && c <= '9';
1273 static int xDigitToInt (int c)
1275 // use 0..9 < A..Z < a..z
1280 } else if (c <= 'F') {
1282 return c - ('A' - 10);
1283 } else if (c <= 'f') {
1285 return c - ('a' - 10);
1291 internal static bool IsJSSpace (int c)
1294 return c == 0x20 || c == 0x9 || c == 0xC || c == 0xB;
1296 return c == 0xA0 || Char.GetUnicodeCategory ((char) c) == UnicodeCategory.SpaceSeparator;
// Returns true when c is one of the four line terminators recognized here:
// LF ('\n'), CR ('\r'), U+2028 or U+2029.
1299 internal static bool IsJSLineTerminator (int c)
1301 return c == '\n' || c == '\r' || c == 0x2028 || c == 0x2029;
// Returns true when c lies above the ASCII range (c > 127) and its Unicode
// category is Format. The range test comes first, so ASCII input never
// reaches the category lookup.
1304 static bool IsJSFormatChar (int c)
1306 return (c > 127) && (Char.GetUnicodeCategory ((char) c) == UnicodeCategory.Format);
// Builds a new string from the first string_buffer_top characters of
// string_buffer. Neither the buffer nor string_buffer_top is modified.
1309 string GetStringFromBuffer ()
1311 return new string (string_buffer, 0, string_buffer_top);
// Appends c (narrowed to char) to string_buffer and advances
// string_buffer_top by one. When the buffer is full it is first replaced
// by one of twice the length that holds the same characters.
1314 void AddToString (int c)
1316 int N = string_buffer_top;
// Buffer full: grow before writing at index N.
1317 if (N == string_buffer.Length) {
1318 char [] tmp = new char [string_buffer.Length * 2];
1319 Array.Copy (string_buffer, 0, tmp, 0, N);
1320 string_buffer = tmp;
// Store the character and publish the new length.
1322 string_buffer [N] = (char) c;
1323 string_buffer_top = N + 1;
1326 void UnGetChar (int c)
1328 // cannot unread across a line boundary
1329 if (unget_cursor != 0 && unget_buffer [unget_cursor - 1] == '\n')
1331 unget_buffer [unget_cursor++] = c;
1334 bool MatchChar (int test)
1355 if (unget_cursor != 0)
1356 return unget_buffer [--unget_cursor];
1360 if (source_string != null) {
1361 if (source_cursor == source_end) {
1365 c = source_string [source_cursor++];
1367 if (source_cursor == source_end) {
1368 if (!FillSourceBuffer ()) {
1373 c = source_buffer [source_cursor++];
1376 if (line_end_char >= 0) {
1377 if (line_end_char == '\r' && c == '\n') {
1378 line_end_char = '\n';
1382 line_start = source_cursor - 1;
1387 if (c == '\n' || c == '\r') {
1392 if (IsJSFormatChar (c))
1394 if ((c & EOL_HINT_MASK) == 0 && IsJSLineTerminator (c)) {
1405 // skip to end of line
1407 while ((c = GetChar ()) != EOF_CHAR && c != '\n')
1412 bool FillSourceBuffer ()
1414 if (source_string == null)
1416 if (source_end == source_buffer.Length) {
1417 if (line_start != 0) {
1418 Array.Copy (source_buffer, line_start, source_buffer, 0, source_end - line_start);
1419 source_end -= line_start;
1420 source_cursor -= line_start;
1423 char [] tmp = new char [source_buffer.Length * 2];
1424 Array.Copy (source_buffer, 0, tmp, 0, source_end);
1425 source_buffer = tmp;
1428 int n = source_reader.Read (source_buffer, source_end, source_buffer.Length - source_end);
// Writes one warning line to the console containing, in order, message,
// SourceName, LineNumber and str. message is printed verbatim; callers in
// this file pass message keys such as "msg.reserved.keyword".
1435 internal void ReportCurrentLineWarning (string message, string str)
1437 Console.WriteLine ("warning: {0}, {1}, {2}, {3}", message, SourceName, LineNumber, str);
// Writes one error line to the console in the form
// "<SourceName> (<LineNumber>, 0): error: <message>". The column is always
// printed as 0, and message is printed verbatim.
1440 internal void ReportCurrentLineError (string message)
1442 Console.WriteLine ("{0} ({1}, 0): error: {2}", SourceName, LineNumber, message);
// Returns true when c is accepted as a non-initial identifier character:
// a letter, a decimal digit, a character whose Unicode category is
// CurrencySymbol, ConnectorPunctuation, LetterNumber or NonSpacingMark,
// or any character for which IsIdentifierIgnorable returns true.
1445 // FIXME: we don't check for combining mark yet
1446 static bool IsJavaIdentifierPart (char c)
1448 UnicodeCategory unicode_category = Char.GetUnicodeCategory (c);
1449 return Char.IsLetter (c) || unicode_category == UnicodeCategory.CurrencySymbol ||
1450 unicode_category == UnicodeCategory.ConnectorPunctuation || Char.IsDigit (c) ||
1451 unicode_category == UnicodeCategory.LetterNumber ||
1452 unicode_category == UnicodeCategory.NonSpacingMark || IsIdentifierIgnorable (c);
// Returns true for the control-character ranges U+0000-U+0008,
// U+000E-U+001B and U+007F-U+009F, and for any character whose Unicode
// category is Format.
1455 static bool IsIdentifierIgnorable (char c)
1457 return (c >= '\u0000' && c <= '\u0008') || (c >= '\u000E' && c <= '\u001B') ||
1458 (c >= '\u007F' && c <= '\u009F') || Char.GetUnicodeCategory (c) == UnicodeCategory.Format;
// Returns true when c is accepted as the first character of an identifier:
// a letter, or a character whose Unicode category is LetterNumber,
// CurrencySymbol or ConnectorPunctuation.
1461 static bool IsJavaIdentifierStart (char c)
1463 UnicodeCategory unicode_category = Char.GetUnicodeCategory (c);
1464 return Char.IsLetter (c) || unicode_category == UnicodeCategory.LetterNumber ||
1465 unicode_category == UnicodeCategory.CurrencySymbol ||
1466 unicode_category == UnicodeCategory.ConnectorPunctuation;
1469 internal static string to_string (Array a)
1471 string s = String.Empty;
1472 foreach (char c in a)