2 // cs-tokenizer.cs: The Tokenizer for the C# compiler
\r
4 // Author: Miguel de Icaza (miguel@gnu.org)
\r
6 // Licensed under the terms of the GNU GPL
\r
8 // (C) 2001 Ximian, Inc (http://www.ximian.com)
\r
14 Do something with the integer and float suffixes, pass full datatype?
\r
15 Make sure we accept the proper Unicode ranges, per the spec.
\r
19 I was returning Token.ERROR on errors and setting an
\r
20 internal error string with the details, but it might make sense
\r
21 to just use exceptions.
\r
23 Change of mind: I think I want to keep returning errors *UNLESS* the
\r
24 parser is catching errors from the tokenizer (at that point, there is
\r
25 not really any reason to use exceptions) so that I can continue the
\r
30 I think I have solved the problem. The idea is to not even *bother*
\r
31 about handling data types a lot here (except for fitting data into
\r
32 the proper places), but let the upper layer handle it.
\r
34 Ie, treat LITERAL_CHARACTER, LITERAL_INTEGER, LITERAL_FLOAT, LITERAL_DOUBLE, and
\r
35 return them as `LITERAL_LITERAL' with maybe subdetail information
\r
41 using System.Collections;
\r
43 using System.Globalization;
\r
48 /// Tokenizer for C# source code.
\r
51 public class Tokenizer : yyParser.yyInput
\r
53 StreamReader reader;
\r
54 public string ref_name;
\r
55 public int ref_line = 1;
\r
56 public int line = 1;
\r
58 public int current_token;
\r
59 bool handle_get_set = false;
\r
62 // Returns a verbose representation of the current location
\r
64 public string location {
\r
68 if (current_token == Token.ERROR)
\r
69 det = "detail: " + error_details;
\r
73 // return "Line: "+line+" Col: "+col + "\n" +
\r
74 // "VirtLine: "+ref_line +
\r
75 // " Token: "+current_token + " " + det;
\r
77 return ref_name + " " + "(" + line + "," + col + "), Token:" + current_token + " " + det;
\r
81 public bool properties {
\r
83 return handle_get_set;
\r
87 handle_get_set = value;
\r
94 static Hashtable keywords;
\r
95 static NumberStyles styles;
\r
96 static NumberFormatInfo csharp_format_info;
\r
99 // Values for the associated token returned
\r
101 System.Text.StringBuilder number;
\r
106 // Details about the error encountered by the tokenizer
\r
108 string error_details;
\r
110 public string error {
\r
112 return error_details;
\r
128 static void initTokens ()
\r
130 keywords = new Hashtable ();
\r
132 keywords.Add ("abstract", Token.ABSTRACT);
\r
133 keywords.Add ("as", Token.AS);
\r
134 keywords.Add ("add", Token.ADD);
\r
135 keywords.Add ("base", Token.BASE);
\r
136 keywords.Add ("bool", Token.BOOL);
\r
137 keywords.Add ("break", Token.BREAK);
\r
138 keywords.Add ("byte", Token.BYTE);
\r
139 keywords.Add ("case", Token.CASE);
\r
140 keywords.Add ("catch", Token.CATCH);
\r
141 keywords.Add ("char", Token.CHAR);
\r
142 keywords.Add ("checked", Token.CHECKED);
\r
143 keywords.Add ("class", Token.CLASS);
\r
144 keywords.Add ("const", Token.CONST);
\r
145 keywords.Add ("continue", Token.CONTINUE);
\r
146 keywords.Add ("decimal", Token.DECIMAL);
\r
147 keywords.Add ("default", Token.DEFAULT);
\r
148 keywords.Add ("delegate", Token.DELEGATE);
\r
149 keywords.Add ("do", Token.DO);
\r
150 keywords.Add ("double", Token.DOUBLE);
\r
151 keywords.Add ("else", Token.ELSE);
\r
152 keywords.Add ("enum", Token.ENUM);
\r
153 keywords.Add ("event", Token.EVENT);
\r
154 keywords.Add ("explicit", Token.EXPLICIT);
\r
155 keywords.Add ("extern", Token.EXTERN);
\r
156 keywords.Add ("false", Token.FALSE);
\r
157 keywords.Add ("finally", Token.FINALLY);
\r
158 keywords.Add ("fixed", Token.FIXED);
\r
159 keywords.Add ("float", Token.FLOAT);
\r
160 keywords.Add ("for", Token.FOR);
\r
161 keywords.Add ("foreach", Token.FOREACH);
\r
162 keywords.Add ("goto", Token.GOTO);
\r
163 keywords.Add ("get", Token.GET);
\r
164 keywords.Add ("if", Token.IF);
\r
165 keywords.Add ("implicit", Token.IMPLICIT);
\r
166 keywords.Add ("in", Token.IN);
\r
167 keywords.Add ("int", Token.INT);
\r
168 keywords.Add ("interface", Token.INTERFACE);
\r
169 keywords.Add ("internal", Token.INTERNAL);
\r
170 keywords.Add ("is", Token.IS);
\r
171 keywords.Add ("lock", Token.LOCK);
\r
172 keywords.Add ("long", Token.LONG);
\r
173 keywords.Add ("namespace", Token.NAMESPACE);
\r
174 keywords.Add ("new", Token.NEW);
\r
175 keywords.Add ("null", Token.NULL);
\r
176 keywords.Add ("object", Token.OBJECT);
\r
177 keywords.Add ("operator", Token.OPERATOR);
\r
178 keywords.Add ("out", Token.OUT);
\r
179 keywords.Add ("override", Token.OVERRIDE);
\r
180 keywords.Add ("params", Token.PARAMS);
\r
181 keywords.Add ("private", Token.PRIVATE);
\r
182 keywords.Add ("protected", Token.PROTECTED);
\r
183 keywords.Add ("public", Token.PUBLIC);
\r
184 keywords.Add ("readonly", Token.READONLY);
\r
185 keywords.Add ("ref", Token.REF);
\r
186 keywords.Add ("remove", Token.REMOVE);
\r
187 keywords.Add ("return", Token.RETURN);
\r
188 keywords.Add ("sbyte", Token.SBYTE);
\r
189 keywords.Add ("sealed", Token.SEALED);
\r
190 keywords.Add ("set", Token.SET);
\r
191 keywords.Add ("short", Token.SHORT);
\r
192 keywords.Add ("sizeof", Token.SIZEOF);
\r
193 keywords.Add ("static", Token.STATIC);
\r
194 keywords.Add ("string", Token.STRING);
\r
195 keywords.Add ("struct", Token.STRUCT);
\r
196 keywords.Add ("switch", Token.SWITCH);
\r
197 keywords.Add ("this", Token.THIS);
\r
198 keywords.Add ("throw", Token.THROW);
\r
199 keywords.Add ("true", Token.TRUE);
\r
200 keywords.Add ("try", Token.TRY);
\r
201 keywords.Add ("typeof", Token.TYPEOF);
\r
202 keywords.Add ("uint", Token.UINT);
\r
203 keywords.Add ("ulong", Token.ULONG);
\r
204 keywords.Add ("unchecked", Token.UNCHECKED);
\r
205 keywords.Add ("unsafe", Token.UNSAFE);
\r
206 keywords.Add ("ushort", Token.USHORT);
\r
207 keywords.Add ("using", Token.USING);
\r
208 keywords.Add ("virtual", Token.VIRTUAL);
\r
209 keywords.Add ("void", Token.VOID);
\r
210 keywords.Add ("while", Token.WHILE);
\r
214 // Class initializer
\r
216 static Tokenizer ()
\r
219 csharp_format_info = new NumberFormatInfo ();
\r
220 csharp_format_info.CurrencyDecimalSeparator = ".";
\r
221 styles = NumberStyles.AllowExponent | NumberStyles.AllowDecimalPoint;
\r
224 bool is_keyword (string name)
\r
228 res = keywords.Contains (name);
\r
229 if ((name == "get" || name == "set") && handle_get_set == false)
\r
234 int getKeyword (string name)
\r
236 return (int) (keywords [name]);
\r
239 public Location Location {
\r
241 return new Location (ref_line);
\r
245 public Tokenizer (System.IO.Stream input, string fname)
\r
247 this.ref_name = fname;
\r
248 reader = new System.IO.StreamReader (input);
\r
251 Location.Push (fname);
\r
254 bool is_identifier_start_character (char c)
\r
256 return Char.IsLetter (c) || c == '_' ;
\r
259 bool is_identifier_part_character (char c)
\r
261 return (Char.IsLetter (c) || Char.IsDigit (c) || c == '_');
\r
264 int is_punct (char c, ref bool doread)
\r
266 int idx = "{}[](),:;~+-*/%&|^!=<>?".IndexOf (c);
\r
274 return Token.OPEN_BRACE;
\r
276 return Token.CLOSE_BRACE;
\r
278 return Token.OPEN_BRACKET;
\r
280 return Token.CLOSE_BRACKET;
\r
282 return Token.OPEN_PARENS;
\r
284 return Token.CLOSE_PARENS;
\r
286 return Token.COMMA;
\r
288 return Token.COLON;
\r
290 return Token.SEMICOLON;
\r
292 return Token.TILDE;
\r
294 return Token.INTERR;
\r
303 t = Token.OP_ADD_ASSIGN;
\r
313 t = Token.OP_SUB_ASSIGN;
\r
315 return Token.OP_PTR;
\r
317 return Token.MINUS;
\r
325 return Token.OP_NE;
\r
333 return Token.OP_EQ;
\r
335 return Token.ASSIGN;
\r
341 return Token.OP_AND;
\r
342 } else if (d == '='){
\r
344 return Token.OP_AND_ASSIGN;
\r
346 return Token.BITWISE_AND;
\r
352 return Token.OP_OR;
\r
353 } else if (d == '='){
\r
355 return Token.OP_OR_ASSIGN;
\r
357 return Token.BITWISE_OR;
\r
363 return Token.OP_MULT_ASSIGN;
\r
371 return Token.OP_DIV_ASSIGN;
\r
379 return Token.OP_MOD_ASSIGN;
\r
381 return Token.PERCENT;
\r
387 return Token.OP_XOR_ASSIGN;
\r
389 return Token.CARRET;
\r
399 return Token.OP_SHIFT_LEFT_ASSIGN;
\r
401 return Token.OP_SHIFT_LEFT;
\r
402 } else if (d == '='){
\r
404 return Token.OP_LE;
\r
406 return Token.OP_LT;
\r
416 return Token.OP_SHIFT_RIGHT_ASSIGN;
\r
418 return Token.OP_SHIFT_RIGHT;
\r
419 } else if (d == '='){
\r
421 return Token.OP_GE;
\r
423 return Token.OP_GT;
\r
425 return Token.ERROR;
\r
428 bool decimal_digits (int c)
\r
431 bool seen_digits = false;
\r
434 number.Append ((char) c);
\r
436 while ((d = peekChar ()) != -1){
\r
437 if (Char.IsDigit ((char)d)){
\r
438 number.Append ((char) d);
\r
440 seen_digits = true;
\r
444 return seen_digits;
\r
447 void hex_digits (int c)
\r
452 number.Append ((char) c);
\r
453 while ((d = peekChar ()) != -1){
\r
454 char e = Char.ToUpper ((char) d);
\r
456 if (Char.IsDigit (e) ||
\r
457 (e >= 'A' && e <= 'F')){
\r
458 number.Append ((char) e);
\r
465 int real_type_suffix (int c)
\r
470 case 'F': case 'f':
\r
471 t = Token.LITERAL_FLOAT;
\r
473 case 'D': case 'd':
\r
474 t = Token.LITERAL_DOUBLE;
\r
476 case 'M': case 'm':
\r
477 t= Token.LITERAL_DECIMAL;
\r
486 int integer_type_suffix (int c)
\r
488 // FIXME: Handle U and L suffixes.
\r
489 // We also need to see in which kind of
\r
490 // Int the thing fits better according to the spec.
\r
491 return Token.LITERAL_INTEGER;
\r
494 void adjust_int (int t)
\r
496 val = new System.Int32();
\r
497 val = System.Int32.Parse (number.ToString (), 0);
\r
500 int adjust_real (int t)
\r
502 string s = number.ToString ();
\r
505 case Token.LITERAL_DECIMAL:
\r
506 val = new System.Decimal ();
\r
507 val = System.Decimal.Parse (
\r
508 s, styles, csharp_format_info);
\r
510 case Token.LITERAL_DOUBLE:
\r
511 val = new System.Double ();
\r
512 val = System.Double.Parse (
\r
513 s, styles, csharp_format_info);
\r
515 case Token.LITERAL_FLOAT:
\r
516 val = new System.Double ();
\r
517 val = (float) System.Double.Parse (
\r
518 s, styles, csharp_format_info);
\r
522 val = new System.Double ();
\r
523 val = System.Double.Parse (
\r
524 s, styles, csharp_format_info);
\r
525 t = Token.LITERAL_DOUBLE;
\r
532 // Invoked if we know we have .digits or digits
\r
534 int is_number (int c)
\r
536 bool is_real = false;
\r
537 number = new System.Text.StringBuilder ();
\r
542 if (Char.IsDigit ((char)c)){
\r
543 if (c == '0' && peekChar () == 'x' || peekChar () == 'X'){
\r
548 string s = number.ToString ();
\r
550 ul = System.UInt64.Parse (s, NumberStyles.HexNumber);
\r
551 if ((ul & 0xffffffff00000000) == 0){
\r
552 uint ui = (uint) ul;
\r
554 if ((ui & 0x80000000) != 0)
\r
559 if ((ul & 0x8000000000000000) != 0)
\r
565 return integer_type_suffix (peekChar ());
\r
567 decimal_digits (c);
\r
572 // We need to handle the case of
\r
573 // "1.1" vs "1.string" (LITERAL_FLOAT vs NUMBER DOT IDENTIFIER)
\r
576 if (decimal_digits ('.')){
\r
581 number.Length -= 1;
\r
582 adjust_int (Token.LITERAL_INTEGER);
\r
583 return Token.LITERAL_INTEGER;
\r
587 if (c == 'e' || c == 'E'){
\r
589 number.Append ("e");
\r
594 number.Append ((char) c);
\r
597 } else if (c == '-'){
\r
598 number.Append ((char) c);
\r
602 decimal_digits (-1);
\r
606 type = real_type_suffix (c);
\r
607 if (type == Token.NONE && !is_real){
\r
608 type = integer_type_suffix (c);
\r
616 return adjust_real (type);
\r
618 Console.WriteLine ("This should not be reached");
\r
619 throw new Exception ("Is Number should never reach this point");
\r
655 error_details = "cs1009: Unrecognized escape sequence " + (char)d;
\r
664 if (putback_char != -1){
\r
665 int x = putback_char;
\r
670 return reader.Read ();
\r
675 if (putback_char != -1)
\r
676 return putback_char;
\r
677 return reader.Peek ();
\r
680 void putback (int c)
\r
682 if (putback_char != -1)
\r
683 throw new Exception ("This should not happen putback on putback");
\r
687 public bool advance ()
\r
689 return peekChar () != -1;
\r
692 public Object Value {
\r
698 public Object value ()
\r
703 public int token ()
\r
705 current_token = xtoken ();
\r
706 return current_token;
\r
709 public int xtoken ()
\r
712 bool allow_keyword_as_ident = false;
\r
713 bool doread = false;
\r
717 for (;(c = getChar ()) != -1; col++) {
\r
719 if (is_identifier_start_character ((char) c)){
\r
720 System.Text.StringBuilder id = new System.Text.StringBuilder ();
\r
723 id.Append ((char) c);
\r
725 while ((c = peekChar ()) != -1) {
\r
726 if (is_identifier_part_character ((char) c)){
\r
727 id.Append ((char)getChar ());
\r
733 ids = id.ToString ();
\r
735 if (!is_keyword (ids) || allow_keyword_as_ident) {
\r
737 return Token.IDENTIFIER;
\r
740 // true, false and null are in the hash anyway.
\r
741 return getKeyword (ids);
\r
746 if (Char.IsDigit ((char) peekChar ()))
\r
747 return is_number (c);
\r
751 if (Char.IsDigit ((char) c))
\r
752 return is_number (c);
\r
754 // Handle double-slash comments.
\r
756 int d = peekChar ();
\r
760 while ((d = getChar ()) != -1 && (d != '\n'))
\r
765 } else if (d == '*'){
\r
768 while ((d = getChar ()) != -1){
\r
769 if (d == '*' && peekChar () == '/'){
\r
784 /* For now, ignore pre-processor commands */
\r
785 if (col == 1 && c == '#'){
\r
786 System.Text.StringBuilder s = new System.Text.StringBuilder ();
\r
788 while ((c = getChar ()) != -1 && (c != '\n')){
\r
789 s.Append ((char) c);
\r
791 if (String.Compare (s.ToString (), 0, "line", 0, 4) == 0){
\r
792 string arg = s.ToString ().Substring (5);
\r
795 if ((pos = arg.IndexOf (' ')) != -1 && pos != 0){
\r
796 ref_line = System.Int32.Parse (arg.Substring (0, pos));
\r
799 char [] quotes = { '\"' };
\r
801 ref_name = arg.Substring (pos);
\r
802 ref_name.TrimStart (quotes);
\r
803 ref_name.TrimEnd (quotes);
\r
805 ref_line = System.Int32.Parse (arg);
\r
812 if ((t = is_punct ((char)c, ref doread)) != Token.ERROR){
\r
821 System.Text.StringBuilder s = new System.Text.StringBuilder ();
\r
823 while ((c = getChar ()) != -1){
\r
825 val = s.ToString ();
\r
826 return Token.LITERAL_STRING;
\r
831 return Token.ERROR;
\r
832 s.Append ((char) c);
\r
839 error_details = "CS1011: Empty character literal";
\r
840 return Token.ERROR;
\r
844 return Token.ERROR;
\r
845 val = new System.Char ();
\r
849 error_details = "CS1012: Too many characters in character literal";
\r
850 // Try to recover, read until newline or next "'"
\r
851 while ((c = getChar ()) != -1){
\r
852 if (c == '\n' || c == '\'')
\r
856 return Token.ERROR;
\r
858 return Token.LITERAL_CHARACTER;
\r
868 if (c == ' ' || c == '\t' || c == '\f' || c == '\v' || c == '\r'){
\r
870 col = (((col + 8) / 8) * 8) - 1;
\r
876 allow_keyword_as_ident = true;
\r
880 error_details = ((char)c).ToString ();
\r
882 return Token.ERROR;
\r