2 // cs-tokenizer.cs: The Tokenizer for the C# compiler
\r
4 // Author: Miguel de Icaza (miguel@gnu.org)
\r
6 // Licensed under the terms of the GNU GPL
\r
8 // (C) 2001 Ximian, Inc (http://www.ximian.com)
\r
14 Do something with the integer and float suffixes, pass full datatype?
\r
15 Make sure we accept the proper Unicode ranges, per the spec.
\r
19 * Data type handling
\r
21 Currently I am returning different tokens for the various
\r
22 kinds of floating point types (float, double, decimal) and I
\r
23 am only returning a single token for all integer values
\r
24 (integer, unsigned int, etc.) as an experiment to see
\r
25 which mechanism is better.
\r
27 I do not know yet how I will be doing the mapping of "int"
\r
28 to things like System.Int32 and so on. I am confused. MAN
\r
31 Indeed, this might be the core of the problem, I should
\r
32 *probably* just return a TYPE token and have the value of
\r
33 the token be stuff like `System.Int32', `System.UInt32',
\r
34 `System.Double' and so on. I will see.
\r
38 I was returning Token.ERROR on errors and setting an
\r
39 internal error string with the details, but it might make sense
\r
40 to just use exceptions.
\r
42 Change of mind: I think I want to keep returning errors *UNLESS* the
\r
43 parser is catching errors from the tokenizer (at that point, there is
\r
44 not really any reason to use exceptions) so that I can continue the
\r
49 I think I have solved the problem. The idea is to not even *bother*
\r
50 about handling data types a lot here (except for fitting data into
\r
51 the proper places), but let the upper layer handle it.
\r
53 I.e., treat LITERAL_CHARACTER, LITERAL_INTEGER, LITERAL_FLOAT, LITERAL_DOUBLE, and
\r
54 return them as `LITERAL_LITERAL' with maybe subdetail information
\r
60 using System.Collections;
\r
62 using System.Globalization;
\r
67 /// Tokenizer for C# source code.
\r
70 public class Tokenizer : yyParser.yyInput
\r
72 StreamReader reader;
\r
73 public string ref_name;
\r
74 public int ref_line = 1;
\r
75 public int line = 1;
\r
77 public int current_token;
\r
78 bool handle_get_set = false;
\r
81 // Returns a verbose representation of the current location
\r
83 public string location {
\r
87 if (current_token == Token.ERROR)
\r
88 det = "detail: " + error_details;
\r
92 //return "Line: "+line+" Col: "+col + "\n" +
\r
93 // "VirtLine: "+ref_line +
\r
94 // " Token: "+current_token + " " + det;
\r
96 return ref_name + " " + "(" + line + "," + col + ")";
\r
100 public bool properties {
\r
102 return handle_get_set;
\r
106 handle_get_set = value;
\r
113 static Hashtable keywords;
\r
114 static NumberStyles styles;
\r
115 static NumberFormatInfo csharp_format_info;
\r
118 // Values for the associated token returned
\r
120 System.Text.StringBuilder number;
\r
125 // Details about the error encountered by the tokenizer
\r
127 string error_details;
\r
129 public string error {
\r
131 return error_details;
\r
//
// Fills the static `keywords' table, mapping each reserved word's
// exact spelling to its Token constant.
//
// Keys must match the identifier text byte for byte: the table is
// probed with the scanned identifier string, so stray whitespace
// in a key makes that keyword unreachable (the "lock" entry used
// to be registered as "lock " and therefore never matched).
//
// "get" and "set" are registered here as well; is_keyword decides
// whether they are treated as keywords depending on handle_get_set.
//
static void initTokens ()
{
	keywords = new Hashtable ();

	keywords.Add ("abstract", Token.ABSTRACT);
	keywords.Add ("as", Token.AS);
	keywords.Add ("add", Token.ADD);
	keywords.Add ("base", Token.BASE);
	keywords.Add ("bool", Token.BOOL);
	keywords.Add ("break", Token.BREAK);
	keywords.Add ("byte", Token.BYTE);
	keywords.Add ("case", Token.CASE);
	keywords.Add ("catch", Token.CATCH);
	keywords.Add ("char", Token.CHAR);
	keywords.Add ("checked", Token.CHECKED);
	keywords.Add ("class", Token.CLASS);
	keywords.Add ("const", Token.CONST);
	keywords.Add ("continue", Token.CONTINUE);
	keywords.Add ("decimal", Token.DECIMAL);
	keywords.Add ("default", Token.DEFAULT);
	keywords.Add ("delegate", Token.DELEGATE);
	keywords.Add ("do", Token.DO);
	keywords.Add ("double", Token.DOUBLE);
	keywords.Add ("else", Token.ELSE);
	keywords.Add ("enum", Token.ENUM);
	keywords.Add ("event", Token.EVENT);
	keywords.Add ("explicit", Token.EXPLICIT);
	keywords.Add ("extern", Token.EXTERN);
	keywords.Add ("false", Token.FALSE);
	keywords.Add ("finally", Token.FINALLY);
	keywords.Add ("fixed", Token.FIXED);
	keywords.Add ("float", Token.FLOAT);
	keywords.Add ("for", Token.FOR);
	keywords.Add ("foreach", Token.FOREACH);
	keywords.Add ("goto", Token.GOTO);
	keywords.Add ("get", Token.GET);
	keywords.Add ("if", Token.IF);
	keywords.Add ("implicit", Token.IMPLICIT);
	keywords.Add ("in", Token.IN);
	keywords.Add ("int", Token.INT);
	keywords.Add ("interface", Token.INTERFACE);
	keywords.Add ("internal", Token.INTERNAL);
	keywords.Add ("is", Token.IS);
	// Key fixed: no trailing blank, otherwise "lock" is never recognized.
	keywords.Add ("lock", Token.LOCK);
	keywords.Add ("long", Token.LONG);
	keywords.Add ("namespace", Token.NAMESPACE);
	keywords.Add ("new", Token.NEW);
	keywords.Add ("null", Token.NULL);
	keywords.Add ("object", Token.OBJECT);
	keywords.Add ("operator", Token.OPERATOR);
	keywords.Add ("out", Token.OUT);
	keywords.Add ("override", Token.OVERRIDE);
	keywords.Add ("params", Token.PARAMS);
	keywords.Add ("private", Token.PRIVATE);
	keywords.Add ("protected", Token.PROTECTED);
	keywords.Add ("public", Token.PUBLIC);
	keywords.Add ("readonly", Token.READONLY);
	keywords.Add ("ref", Token.REF);
	keywords.Add ("remove", Token.REMOVE);
	keywords.Add ("return", Token.RETURN);
	keywords.Add ("sbyte", Token.SBYTE);
	keywords.Add ("sealed", Token.SEALED);
	keywords.Add ("set", Token.SET);
	keywords.Add ("short", Token.SHORT);
	keywords.Add ("sizeof", Token.SIZEOF);
	keywords.Add ("static", Token.STATIC);
	keywords.Add ("string", Token.STRING);
	keywords.Add ("struct", Token.STRUCT);
	keywords.Add ("switch", Token.SWITCH);
	keywords.Add ("this", Token.THIS);
	keywords.Add ("throw", Token.THROW);
	keywords.Add ("true", Token.TRUE);
	keywords.Add ("try", Token.TRY);
	keywords.Add ("typeof", Token.TYPEOF);
	keywords.Add ("uint", Token.UINT);
	keywords.Add ("ulong", Token.ULONG);
	keywords.Add ("unchecked", Token.UNCHECKED);
	keywords.Add ("unsafe", Token.UNSAFE);
	keywords.Add ("ushort", Token.USHORT);
	keywords.Add ("using", Token.USING);
	keywords.Add ("virtual", Token.VIRTUAL);
	keywords.Add ("void", Token.VOID);
	keywords.Add ("while", Token.WHILE);
}
\r
233 // Class initializer
\r
235 static Tokenizer ()
\r
238 csharp_format_info = new NumberFormatInfo ();
\r
239 csharp_format_info.CurrencyDecimalSeparator = ".";
\r
240 styles = NumberStyles.AllowExponent | NumberStyles.AllowDecimalPoint;
\r
243 bool is_keyword (string name)
\r
247 res = keywords.Contains (name);
\r
248 if ((name == "get" || name == "set") && handle_get_set == false)
\r
//
// Looks `name' up in the keyword table and returns the token
// number stored for it.  The stored value is unboxed with a cast,
// so the name is expected to be present in the table.
//
int getKeyword (string name)
{
	object boxed_token = keywords [name];

	return (int) boxed_token;
}
\r
258 public Location Location {
\r
260 return new Location (ref_name, col, ref_line);
\r
264 public Tokenizer (System.IO.Stream input, string fname)
\r
266 this.ref_name = fname;
\r
267 reader = new System.IO.StreamReader (input);
\r
//
// Returns true if `c' may begin an identifier: an underscore, a
// letter (Unicode classes Lu, Ll, Lt, Lm, Lo -- what Char.IsLetter
// reports) or a letter-number (class Nl, e.g. Roman numerals), as
// the C# lexical grammar allows.
//
bool is_identifier_start_character (char c)
{
	if (c == '_' || Char.IsLetter (c))
		return true;

	return Char.GetUnicodeCategory (c) == UnicodeCategory.LetterNumber;
}

//
// Returns true if `c' may appear after the first character of an
// identifier: anything accepted as a start character, a decimal
// digit (Nd), a combining mark (Mn, Mc) or connector punctuation
// (Pc).
//
// Formatting characters (Cf) are intentionally still rejected: the
// grammar permits them but expects them to be dropped from the
// identifier text, which this predicate alone cannot do.
//
bool is_identifier_part_character (char c)
{
	if (c == '_' || Char.IsLetter (c) || Char.IsDigit (c))
		return true;

	switch (Char.GetUnicodeCategory (c)){
	case UnicodeCategory.LetterNumber:
	case UnicodeCategory.NonSpacingMark:
	case UnicodeCategory.SpacingCombiningMark:
	case UnicodeCategory.ConnectorPunctuation:
		return true;
	default:
		return false;
	}
}
\r
281 int is_punct (char c, ref bool doread)
\r
283 int idx = "{}[](),:;~+-*/%&|^!=<>?".IndexOf (c);
\r
291 return Token.OPEN_BRACE;
\r
293 return Token.CLOSE_BRACE;
\r
295 return Token.OPEN_BRACKET;
\r
297 return Token.CLOSE_BRACKET;
\r
299 return Token.OPEN_PARENS;
\r
301 return Token.CLOSE_PARENS;
\r
303 return Token.COMMA;
\r
305 return Token.COLON;
\r
307 return Token.SEMICOLON;
\r
309 return Token.TILDE;
\r
311 return Token.INTERR;
\r
320 t = Token.OP_ADD_ASSIGN;
\r
330 t = Token.OP_SUB_ASSIGN;
\r
332 return Token.OP_PTR;
\r
334 return Token.MINUS;
\r
342 return Token.OP_NE;
\r
350 return Token.OP_EQ;
\r
352 return Token.ASSIGN;
\r
358 return Token.OP_AND;
\r
359 } else if (d == '='){
\r
361 return Token.OP_AND_ASSIGN;
\r
363 return Token.BITWISE_AND;
\r
369 return Token.OP_OR;
\r
370 } else if (d == '='){
\r
372 return Token.OP_OR_ASSIGN;
\r
374 return Token.BITWISE_OR;
\r
380 return Token.OP_MULT_ASSIGN;
\r
388 return Token.OP_DIV_ASSIGN;
\r
396 return Token.OP_MOD_ASSIGN;
\r
398 return Token.PERCENT;
\r
404 return Token.OP_XOR_ASSIGN;
\r
406 return Token.CARRET;
\r
416 return Token.OP_SHIFT_LEFT_ASSIGN;
\r
418 return Token.OP_SHIFT_LEFT;
\r
419 } else if (d == '='){
\r
421 return Token.OP_LE;
\r
423 return Token.OP_LT;
\r
433 return Token.OP_SHIFT_RIGHT_ASSIGN;
\r
435 return Token.OP_SHIFT_RIGHT;
\r
436 } else if (d == '='){
\r
438 return Token.OP_GE;
\r
440 return Token.OP_GT;
\r
442 return Token.ERROR;
\r
445 bool decimal_digits (int c)
\r
448 bool seen_digits = false;
\r
451 number.Append ((char) c);
\r
453 while ((d = peekChar ()) != -1){
\r
454 if (Char.IsDigit ((char)d)){
\r
455 number.Append ((char) d);
\r
457 seen_digits = true;
\r
461 return seen_digits;
\r
464 void hex_digits (int c)
\r
469 number.Append ((char) c);
\r
470 while ((d = peekChar ()) != -1){
\r
471 char e = Char.ToUpper ((char) d);
\r
473 if (Char.IsDigit (e) ||
\r
474 (e >= 'A' && e <= 'F')){
\r
475 number.Append ((char) e);
\r
482 int real_type_suffix (int c)
\r
487 case 'F': case 'f':
\r
488 t = Token.LITERAL_FLOAT;
\r
490 case 'D': case 'd':
\r
491 t = Token.LITERAL_DOUBLE;
\r
493 case 'M': case 'm':
\r
494 t= Token.LITERAL_DECIMAL;
\r
503 int integer_type_suffix (int c)
\r
505 // FIXME: Handle U and L suffixes.
\r
506 // We also need to see in which kind of
\r
507 // Int the thing fits better according to the spec.
\r
508 return Token.LITERAL_INTEGER;
\r
//
// Parses the digits accumulated in `number' as an Int32 and
// stores the result in `val'.  The parameter `t' is not used by
// the body.
//
void adjust_int (int t)
{
	// Reset to a zero Int32 first, exactly as before, so `val'
	// holds an integer even if the parse below throws.
	val = new System.Int32 ();

	string digits = number.ToString ();
	val = System.Int32.Parse (digits, NumberStyles.None);
}
\r
517 int adjust_real (int t)
\r
519 string s = number.ToString ();
\r
522 case Token.LITERAL_DECIMAL:
\r
523 val = new System.Decimal ();
\r
524 val = System.Decimal.Parse (
\r
525 s, styles, csharp_format_info);
\r
527 case Token.LITERAL_DOUBLE:
\r
528 val = new System.Double ();
\r
529 val = System.Double.Parse (
\r
530 s, styles, csharp_format_info);
\r
532 case Token.LITERAL_FLOAT:
\r
533 val = new System.Double ();
\r
534 val = (float) System.Double.Parse (
\r
535 s, styles, csharp_format_info);
\r
539 val = new System.Double ();
\r
540 val = System.Double.Parse (
\r
541 s, styles, csharp_format_info);
\r
542 t = Token.LITERAL_DOUBLE;
\r
549 // Invoked if we know we have .digits or digits
\r
// Scans a numeric literal starting at `c' into `number' and returns the
// literal's token.  NOTE(review): this listing has lost lines from the
// original body; only the hex-prefix test below was changed.
551 int is_number (int c)
\r
553 bool is_real = false;
\r
554 number = new System.Text.StringBuilder ();
\r
559 if (Char.IsDigit ((char)c)){
\r
// The two lookahead comparisons must be grouped: && binds tighter than ||,
// so without the parentheses any digit followed by 'X' counted as "0X".
560 if (c == '0' && (peekChar () == 'x' || peekChar () == 'X')){
\r
563 val = new System.Int32 ();
\r
564 val = System.Int32.Parse (number.ToString (), NumberStyles.HexNumber);
\r
565 return integer_type_suffix (peekChar ());
\r
567 decimal_digits (c);
\r
572 // We need to handle the case of
\r
573 // "1.1" vs "1.string" (LITERAL_FLOAT vs NUMBER DOT IDENTIFIER)
\r
576 if (decimal_digits ('.')){
\r
581 number.Length -= 1;
\r
582 adjust_int (Token.LITERAL_INTEGER);
\r
583 return Token.LITERAL_INTEGER;
\r
587 if (c == 'e' || c == 'E'){
\r
589 number.Append ("e");
\r
594 number.Append ((char) c);
\r
597 } else if (c == '-'){
\r
598 number.Append ((char) c);
\r
602 decimal_digits (-1);
\r
606 type = real_type_suffix (c);
\r
607 if (type == Token.NONE && !is_real){
\r
608 type = integer_type_suffix (c);
\r
616 return adjust_real (type);
\r
618 Console.WriteLine ("This should not be reached");
\r
619 throw new Exception ("Is Number should never reach this point");
\r
655 error_details = "cs1009: Unrecognized escape sequence " + (char)d;
\r
664 if (putback_char != -1){
\r
665 int x = putback_char;
\r
670 return reader.Read ();
\r
675 if (putback_char != -1)
\r
676 return putback_char;
\r
677 return reader.Peek ();
\r
680 void putback (int c)
\r
682 if (putback_char != -1)
\r
683 throw new Exception ("This should not happen putback on putback");
\r
//
// Reports whether there is more input: true unless peeking at the
// next character yields -1.
//
public bool advance ()
{
	int upcoming = peekChar ();
	bool exhausted = (upcoming == -1);

	return !exhausted;
}
\r
692 public Object Value {
\r
698 public Object value ()
\r
//
// Scans the next token with xtoken (), records it in
// `current_token' and returns it.
//
public int token ()
{
	int scanned = xtoken ();

	current_token = scanned;
	return scanned;
}
\r
// Main scanner loop: reads characters with getChar () and returns the next
// token.  NOTE(review): this listing has lost lines from the original
// body; only the `#line' file-name handling below was changed.
709 public int xtoken ()
\r
712 bool allow_keyword_as_ident = false;
\r
713 bool doread = false;
\r
717 for (;(c = getChar ()) != -1; col++) {
\r
719 if (is_identifier_start_character ((char) c)){
\r
720 System.Text.StringBuilder id = new System.Text.StringBuilder ();
\r
723 id.Append ((char) c);
\r
725 while ((c = peekChar ()) != -1) {
\r
726 if (is_identifier_part_character ((char) c)){
\r
727 id.Append ((char)getChar ());
\r
733 ids = id.ToString ();
\r
735 if (!is_keyword (ids) || allow_keyword_as_ident) {
\r
737 return Token.IDENTIFIER;
\r
740 // true, false and null are in the hash anyway.
\r
741 return getKeyword (ids);
\r
746 if (Char.IsDigit ((char) peekChar ()))
\r
747 return is_number (c);
\r
751 if (Char.IsDigit ((char) c))
\r
752 return is_number (c);
\r
754 // Handle double-slash comments.
\r
756 int d = peekChar ();
\r
760 while ((d = getChar ()) != -1 && (d != '\n'))
\r
765 } else if (d == '*'){
\r
768 while ((d = getChar ()) != -1){
\r
769 if (d == '*' && peekChar () == '/'){
\r
784 /* For now, ignore pre-processor commands */
\r
785 if (col == 1 && c == '#'){
\r
786 System.Text.StringBuilder s = new System.Text.StringBuilder ();
\r
788 while ((c = getChar ()) != -1 && (c != '\n')){
\r
789 s.Append ((char) c);
\r
791 if (String.Compare (s.ToString (), 0, "line", 0, 4) == 0){
\r
792 string arg = s.ToString ().Substring (5);
\r
795 if ((pos = arg.IndexOf (' ')) != -1 && pos != 0){
\r
796 ref_line = System.Int32.Parse (arg.Substring (0, pos));
\r
799 char [] quotes = { '\"' };
\r
// Strings are immutable: TrimStart/TrimEnd return the trimmed copy, so the
// result has to be assigned back.  The substring starts at the separating
// blank, so surrounding whitespace is removed first; otherwise the opening
// quote is not the first character and would never be trimmed.
801 ref_name = arg.Substring (pos).Trim ();
\r
802 ref_name = ref_name.TrimStart (quotes);
\r
803 ref_name = ref_name.TrimEnd (quotes);
\r
805 ref_line = System.Int32.Parse (arg);
\r
812 if ((t = is_punct ((char)c, ref doread)) != Token.ERROR){
\r
821 System.Text.StringBuilder s = new System.Text.StringBuilder ();
\r
823 while ((c = getChar ()) != -1){
\r
825 val = s.ToString ();
\r
826 return Token.LITERAL_STRING;
\r
831 return Token.ERROR;
\r
832 s.Append ((char) c);
\r
839 error_details = "CS1011: Empty character literal";
\r
840 return Token.ERROR;
\r
844 return Token.ERROR;
\r
845 val = new System.Char ();
\r
849 error_details = "CS1012: Too many characters in character literal";
\r
850 // Try to recover, read until newline or next "'"
\r
851 while ((c = getChar ()) != -1){
\r
852 if (c == '\n' || c == '\'')
\r
856 return Token.ERROR;
\r
858 return Token.LITERAL_CHARACTER;
\r
868 if (c == ' ' || c == '\t' || c == '\f' || c == '\v' || c == '\r'){
\r
870 col = (((col + 8) / 8) * 8) - 1;
\r
876 allow_keyword_as_ident = true;
\r
880 error_details = ((char)c).ToString ();
\r
882 return Token.ERROR;
\r
889 public class Location {
\r
890 public readonly string Name;
\r
891 public readonly int Col;
\r
892 public readonly int Row;
\r
894 public Location (string name, int col, int row)
\r
902 // Whether the Location is Null
\r
904 static public bool IsNull (Location l)
\r
909 static public Location Null {
\r