2 // cs-tokenizer.cs: The Tokenizer for the C# compiler
\r
4 // Author: Miguel de Icaza (miguel@gnu.org)
\r
6 // Licensed under the terms of the GNU GPL
\r
8 // (C) 2001 Ximian, Inc (http://www.ximian.com)
\r
14 Do something with the integer and float suffixes, pass full datatype?
\r
15 Make sure we accept the proper Unicode ranges, per the spec.
\r
19 * Data type handling
\r
21 Currently I am returning different tokens for the various
\r
22 kinds of floating point types (float, double, decimal) and I
\r
23 am only returning a single token for all integer values
\r
24 (integer, unsigned int, etc) as an experiment as to see
\r
25 which mechanism is better.
\r
27 I do not know yet how I will be doing the mapping of "int"
\r
28 to things like System.Int32 and so on. I am confused. MAN
\r
31 Indeed, this might be the core of the problem, I should
\r
32 *probably* just return a TYPE token and have the value of
\r
33 the token be stuff like `System.Int32', `System.UInt32',
\r
34 `System.Double' and so on. I will see.
\r
38 I was returning Token.ERROR on errors and setting an
\r
39 internal error string with the details, but it might make sense
\r
40 to just use exceptions.
\r
42 Change of mind: I think I want to keep returning errors *UNLESS* the
\r
43 parser is catching errors from the tokenizer (at that point, there is
\r
44 not really any reason to use exceptions) so that I can continue the
\r
49 I think I have solved the problem. The idea is to not even *bother*
\r
50 about handling data types a lot here (except for fitting data into
\r
51 the proper places), but let the upper layer handle it.
\r
53 Ie, treat LITERAL_CHARACTER, LITERAL_INTEGER, LITERAL_FLOAT, LITERAL_DOUBLE, and
\r
54 return them as `LITERAL_LITERAL' with maybe subdetail information
\r
61 using System.Collections;
\r
63 using System.Globalization;
\r
70 /// Tokenizer for C# source code.
\r
73 public class Tokenizer : yyParser.yyInput
\r
75 StreamReader reader;
\r
76 public string ref_name;
\r
77 public int ref_line = 1;
\r
78 public int line = 1;
\r
80 public int current_token;
\r
81 bool handle_get_set = false;
\r
83 public string location {
\r
87 if (current_token == Token.ERROR)
\r
88 det = "detail: " + error_details;
\r
92 return "Line: "+line+" Col: "+col + "\n" +
\r
93 "VirtLine: "+ref_line +
\r
94 " Token: "+current_token + " " + det;
\r
98 public bool properties {
\r
100 return handle_get_set;
\r
104 handle_get_set = value;
\r
111 static Hashtable keywords;
\r
112 static NumberStyles styles;
\r
113 static NumberFormatInfo csharp_format_info;
\r
116 // Values for the associated token returned
\r
118 System.Text.StringBuilder number;
\r
123 // Details about the error encountered by the tokenizer
\r
125 string error_details;
\r
127 public string error {
\r
129 return error_details;
\r
//
// Builds the static keyword table: every reserved word of the language
// is mapped to its Token code.
//
// The keys must match the identifier text exactly, because lookups are
// done with the identifier string as scanned from the input.  (The
// entry for `lock' used to be registered as "lock " with a trailing
// blank, so it could never match.)
//
// `get' and `set' are registered here as well; is_keyword additionally
// consults handle_get_set before treating them as keywords.
//
static void initTokens ()
{
	keywords = new Hashtable ();

	keywords.Add ("abstract", Token.ABSTRACT);
	keywords.Add ("as", Token.AS);
	keywords.Add ("base", Token.BASE);
	keywords.Add ("bool", Token.BOOL);
	keywords.Add ("break", Token.BREAK);
	keywords.Add ("byte", Token.BYTE);
	keywords.Add ("case", Token.CASE);
	keywords.Add ("catch", Token.CATCH);
	keywords.Add ("char", Token.CHAR);
	keywords.Add ("checked", Token.CHECKED);
	keywords.Add ("class", Token.CLASS);
	keywords.Add ("const", Token.CONST);
	keywords.Add ("continue", Token.CONTINUE);
	keywords.Add ("decimal", Token.DECIMAL);
	keywords.Add ("default", Token.DEFAULT);
	keywords.Add ("delegate", Token.DELEGATE);
	keywords.Add ("do", Token.DO);
	keywords.Add ("double", Token.DOUBLE);
	keywords.Add ("else", Token.ELSE);
	keywords.Add ("enum", Token.ENUM);
	keywords.Add ("event", Token.EVENT);
	keywords.Add ("explicit", Token.EXPLICIT);
	keywords.Add ("extern", Token.EXTERN);
	keywords.Add ("false", Token.FALSE);
	keywords.Add ("finally", Token.FINALLY);
	keywords.Add ("fixed", Token.FIXED);
	keywords.Add ("float", Token.FLOAT);
	keywords.Add ("for", Token.FOR);
	keywords.Add ("foreach", Token.FOREACH);
	keywords.Add ("goto", Token.GOTO);
	keywords.Add ("get", Token.GET);
	keywords.Add ("if", Token.IF);
	keywords.Add ("implicit", Token.IMPLICIT);
	keywords.Add ("in", Token.IN);
	keywords.Add ("int", Token.INT);
	keywords.Add ("interface", Token.INTERFACE);
	keywords.Add ("internal", Token.INTERNAL);
	keywords.Add ("is", Token.IS);
	// Key must be exactly "lock": no surrounding whitespace.
	keywords.Add ("lock", Token.LOCK);
	keywords.Add ("long", Token.LONG);
	keywords.Add ("namespace", Token.NAMESPACE);
	keywords.Add ("new", Token.NEW);
	keywords.Add ("null", Token.NULL);
	keywords.Add ("object", Token.OBJECT);
	keywords.Add ("operator", Token.OPERATOR);
	keywords.Add ("out", Token.OUT);
	keywords.Add ("override", Token.OVERRIDE);
	keywords.Add ("params", Token.PARAMS);
	keywords.Add ("private", Token.PRIVATE);
	keywords.Add ("protected", Token.PROTECTED);
	keywords.Add ("public", Token.PUBLIC);
	keywords.Add ("readonly", Token.READONLY);
	keywords.Add ("ref", Token.REF);
	keywords.Add ("return", Token.RETURN);
	keywords.Add ("sbyte", Token.SBYTE);
	keywords.Add ("sealed", Token.SEALED);
	keywords.Add ("set", Token.SET);
	keywords.Add ("short", Token.SHORT);
	keywords.Add ("sizeof", Token.SIZEOF);
	keywords.Add ("static", Token.STATIC);
	keywords.Add ("string", Token.STRING);
	keywords.Add ("struct", Token.STRUCT);
	keywords.Add ("switch", Token.SWITCH);
	keywords.Add ("this", Token.THIS);
	keywords.Add ("throw", Token.THROW);
	keywords.Add ("true", Token.TRUE);
	keywords.Add ("try", Token.TRY);
	keywords.Add ("typeof", Token.TYPEOF);
	keywords.Add ("uint", Token.UINT);
	keywords.Add ("ulong", Token.ULONG);
	keywords.Add ("unchecked", Token.UNCHECKED);
	keywords.Add ("unsafe", Token.UNSAFE);
	keywords.Add ("ushort", Token.USHORT);
	keywords.Add ("using", Token.USING);
	keywords.Add ("virtual", Token.VIRTUAL);
	keywords.Add ("void", Token.VOID);
	keywords.Add ("while", Token.WHILE);
}
\r
229 // Class initializer
\r
231 static Tokenizer ()
\r
234 csharp_format_info = new NumberFormatInfo ();
\r
235 csharp_format_info.CurrencyDecimalSeparator = ".";
\r
236 styles = NumberStyles.AllowExponent | NumberStyles.AllowDecimalPoint;
\r
239 bool is_keyword (string name)
\r
243 res = keywords.Contains (name);
\r
244 if ((name == "get" || name == "set") && handle_get_set == false)
\r
//
// Returns the token code stored in the keyword table for `name'.
// The table entry is unboxed directly, so `name' has to be a key that
// is present in the table.
//
int getKeyword (string name)
{
	object code = keywords [name];

	return (int) code;
}
\r
254 public Tokenizer (System.IO.Stream input, string fname)
\r
256 this.ref_name = fname;
\r
257 reader = new System.IO.StreamReader (input);
\r
//
// True when `c' can be the first character of an identifier:
// an underscore or anything CharacterInfo classifies as a letter.
//
bool is_identifier_start_character (char c)
{
	if (c == '_')
		return true;

	return CharacterInfo.IsLetter (c);
}
\r
//
// True when `c' can appear after the first character of an identifier:
// a letter, a digit (both as classified by CharacterInfo) or an
// underscore.
//
bool is_identifier_part_character (char c)
{
	if (CharacterInfo.IsLetter (c))
		return true;

	if (CharacterInfo.IsDigit (c))
		return true;

	return c == '_';
}
\r
271 int is_punct (char c, ref bool doread)
\r
273 int idx = "{}[](),:;~+-*/%&|^!=<>?".IndexOf (c);
\r
281 return Token.OPEN_BRACE;
\r
283 return Token.CLOSE_BRACE;
\r
285 return Token.OPEN_BRACKET;
\r
287 return Token.CLOSE_BRACKET;
\r
289 return Token.OPEN_PARENS;
\r
291 return Token.CLOSE_PARENS;
\r
293 return Token.COMMA;
\r
295 return Token.COLON;
\r
297 return Token.SEMICOLON;
\r
299 return Token.TILDE;
\r
301 return Token.INTERR;
\r
310 t = Token.OP_ADD_ASSIGN;
\r
320 t = Token.OP_SUB_ASSIGN;
\r
322 return Token.OP_PTR;
\r
324 return Token.MINUS;
\r
332 return Token.OP_NE;
\r
340 return Token.OP_EQ;
\r
342 return Token.ASSIGN;
\r
348 return Token.OP_AND;
\r
349 } else if (d == '='){
\r
351 return Token.OP_AND_ASSIGN;
\r
353 return Token.BITWISE_AND;
\r
359 return Token.OP_OR;
\r
360 } else if (d == '='){
\r
362 return Token.OP_OR_ASSIGN;
\r
364 return Token.BITWISE_OR;
\r
370 return Token.OP_MULT_ASSIGN;
\r
378 return Token.OP_DIV_ASSIGN;
\r
386 return Token.OP_MOD_ASSIGN;
\r
388 return Token.PERCENT;
\r
394 return Token.OP_XOR_ASSIGN;
\r
396 return Token.CARRET;
\r
406 return Token.OP_SHIFT_LEFT_ASSIGN;
\r
408 return Token.OP_SHIFT_LEFT;
\r
409 } else if (d == '='){
\r
411 return Token.OP_LE;
\r
413 return Token.OP_LT;
\r
423 return Token.OP_SHIFT_RIGHT_ASSIGN;
\r
425 return Token.OP_SHIFT_RIGHT;
\r
426 } else if (d == '='){
\r
428 return Token.OP_GE;
\r
430 return Token.OP_GT;
\r
432 return Token.ERROR;
\r
435 bool decimal_digits (int c)
\r
438 bool seen_digits = false;
\r
441 number.Append ((char) c);
\r
443 while ((d = peekChar ()) != -1){
\r
444 if (CharacterInfo.IsDigit ((char)d)){
\r
445 number.Append ((char) d);
\r
447 seen_digits = true;
\r
451 return seen_digits;
\r
454 void hex_digits (int c)
\r
459 number.Append ((char) c);
\r
460 while ((d = peekChar ()) != -1){
\r
461 char e = Char.ToUpper ((char) d);
\r
463 if (CharacterInfo.IsDigit (e) ||
\r
464 (e >= 'A' && e <= 'F')){
\r
465 number.Append ((char) e);
\r
472 int real_type_suffix (int c)
\r
477 case 'F': case 'f':
\r
478 t = Token.LITERAL_FLOAT;
\r
480 case 'D': case 'd':
\r
481 t = Token.LITERAL_DOUBLE;
\r
483 case 'M': case 'm':
\r
484 t= Token.LITERAL_DECIMAL;
\r
//
// Classifies the suffix of an integer literal.
//
// FIXME: the U and L suffixes are not handled yet, and the literal is
// not checked against the integer type it fits best according to the
// spec.  For now `c' is ignored and every literal is reported as a
// plain LITERAL_INTEGER.
//
int integer_type_suffix (int c)
{
	int kind = Token.LITERAL_INTEGER;

	return kind;
}
\r
//
// Converts the digits accumulated in `number' into a boxed Int32 and
// stores it in `val'.  The text is parsed with NumberStyles.None, so
// only plain decimal digits are accepted.  `t' is not consulted.
//
void adjust_int (int t)
{
	val = new System.Int32 ();

	string digits = number.ToString ();
	val = System.Int32.Parse (digits, NumberStyles.None);
}
\r
507 int adjust_real (int t)
\r
509 string s = number.ToString ();
\r
511 Console.WriteLine (s);
\r
513 case Token.LITERAL_DECIMAL:
\r
514 val = new System.Decimal ();
\r
515 val = System.Decimal.Parse (
\r
516 s, styles, csharp_format_info);
\r
518 case Token.LITERAL_DOUBLE:
\r
519 val = new System.Double ();
\r
520 val = System.Double.Parse (
\r
521 s, styles, csharp_format_info);
\r
523 case Token.LITERAL_FLOAT:
\r
524 val = new System.Double ();
\r
525 val = (float) System.Double.Parse (
\r
526 s, styles, csharp_format_info);
\r
530 val = new System.Double ();
\r
531 val = System.Double.Parse (
\r
532 s, styles, csharp_format_info);
\r
533 t = Token.LITERAL_DOUBLE;
\r
540 // Invoked if we know we have .digits or digits
\r
542 int is_number (int c)
\r
544 bool is_real = false;
\r
545 number = new System.Text.StringBuilder ();
\r
550 if (CharacterInfo.IsDigit ((char)c)){
\r
551 if (peekChar () == 'x' || peekChar () == 'X'){
\r
554 val = new System.Int32 ();
\r
555 val = System.Int32.Parse (number.ToString (), NumberStyles.HexNumber);
\r
556 return integer_type_suffix (peekChar ());
\r
558 decimal_digits (c);
\r
563 // We need to handle the case of
\r
564 // "1.1" vs "1.string" (LITERAL_FLOAT vs NUMBER DOT IDENTIFIER)
\r
567 if (decimal_digits ('.')){
\r
572 number.Length -= 1;
\r
573 adjust_int (Token.LITERAL_INTEGER);
\r
574 return Token.LITERAL_INTEGER;
\r
578 if (c == 'e' || c == 'E'){
\r
580 number.Append ("e");
\r
585 number.Append ((char) c);
\r
588 } else if (c == '-'){
\r
589 number.Append ((char) c);
\r
593 decimal_digits (-1);
\r
597 type = real_type_suffix (c);
\r
598 if (type == Token.NONE && !is_real){
\r
599 type = integer_type_suffix (c);
\r
607 return adjust_real (type);
\r
609 Console.WriteLine ("This should not be reached");
\r
610 throw new Exception ("Is Number should never reach this point");
\r
646 error_details = "cs1009: Unrecognized escape sequence " + (char)d;
\r
655 if (putback_char != -1){
\r
656 int x = putback_char;
\r
661 return reader.Read ();
\r
666 if (putback_char != -1)
\r
667 return putback_char;
\r
668 return reader.Peek ();
\r
671 void putback (int c)
\r
673 if (putback_char != -1)
\r
674 throw new Exception ("This should not happen putback on putback");
\r
//
// Reports whether there is more input to tokenize: true unless
// peekChar signals the end of the input with -1.
//
public bool advance ()
{
	int next = peekChar ();

	return next != -1;
}
\r
683 public Object Value {
\r
689 public Object value ()
\r
//
// Scans the next token with xtoken, remembers it in `current_token'
// and returns it.
//
public int token ()
{
	int scanned = xtoken ();

	current_token = scanned;
	return scanned;
}
\r
700 public int xtoken ()
\r
703 bool allow_keyword = false;
\r
704 bool doread = false;
\r
708 for (;(c = getChar ()) != -1; col++) {
\r
710 if (is_identifier_start_character ((char) c)){
\r
711 System.Text.StringBuilder id = new System.Text.StringBuilder ();
\r
714 id.Append ((char) c);
\r
716 while ((c = peekChar ()) != -1) {
\r
717 if (is_identifier_part_character ((char) c)){
\r
718 id.Append ((char)getChar ());
\r
724 ids = id.ToString ();
\r
726 if (!is_keyword (ids)){
\r
727 val = id.ToString ();
\r
728 return Token.IDENTIFIER;
\r
731 if (allow_keyword) {
\r
733 return Token.IDENTIFIER;
\r
738 else if (ids == "false")
\r
739 return Token.FALSE;
\r
740 else if (ids == "null")
\r
743 return getKeyword (ids);
\r
747 if (CharacterInfo.IsDigit ((char) peekChar ()))
\r
748 return is_number (c);
\r
752 if (CharacterInfo.IsDigit ((char) c))
\r
753 return is_number (c);
\r
755 // Handle double-slash comments.
\r
757 int d = peekChar ();
\r
761 while ((d = getChar ()) != -1 && (d != '\n'))
\r
766 } else if (d == '*'){
\r
769 while ((d = getChar ()) != -1){
\r
770 if (d == '*' && peekChar () == '/'){
\r
785 /* For now, ignore pre-processor commands */
\r
786 if (col == 1 && c == '#'){
\r
787 System.Text.StringBuilder s = new System.Text.StringBuilder ();
\r
789 while ((c = getChar ()) != -1 && (c != '\n')){
\r
790 s.Append ((char) c);
\r
792 if (String.Compare (s.ToString (), 0, "line", 0, 4) == 0){
\r
793 string arg = s.ToString ().Substring (5);
\r
796 if ((pos = arg.IndexOf (' ')) != -1 && pos != 0){
\r
797 ref_line = System.Int32.Parse (arg.Substring (0, pos));
\r
800 char [] quotes = { '\"' };
\r
802 ref_name = arg.Substring (pos);
\r
803 ref_name.TrimStart (quotes);
\r
804 ref_name.TrimEnd (quotes);
\r
806 ref_line = System.Int32.Parse (arg);
\r
813 if ((t = is_punct ((char)c, ref doread)) != Token.ERROR){
\r
822 System.Text.StringBuilder s = new System.Text.StringBuilder ();
\r
824 while ((c = getChar ()) != -1){
\r
826 val = s.ToString ();
\r
827 return Token.LITERAL_STRING;
\r
832 return Token.ERROR;
\r
833 s.Append ((char) c);
\r
840 error_details = "CS1011: Empty character literal";
\r
841 return Token.ERROR;
\r
845 return Token.ERROR;
\r
846 val = new System.Char ();
\r
850 error_details = "CS1012: Too many characters in character literal";
\r
851 // Try to recover, read until newline or next "'"
\r
852 while ((c = getChar ()) != -1){
\r
853 if (c == '\n' || c == '\'')
\r
857 return Token.ERROR;
\r
859 return Token.LITERAL_CHARACTER;
\r
869 if (c == ' ' || c == '\t' || c == '\f' || c == '\v' || c == '\r'){
\r
871 col = (((col + 8) / 8) * 8) - 1;
\r
877 allow_keyword = true;
\r
881 error_details = ((char)c).ToString ();
\r
883 return Token.ERROR;
\r