2 // cs-tokenizer.cs: The Tokenizer for the C# compiler
\r
3 // This also implements the preprocessor
\r
5 // Author: Miguel de Icaza (miguel@gnu.org)
\r
7 // Licensed under the terms of the GNU GPL
\r
9 // (C) 2001 Ximian, Inc (http://www.ximian.com)
\r
14 * Make sure we accept the proper Unicode ranges, per the spec.
\r
20 using System.Collections;
\r
22 using System.Globalization;
\r
24 namespace Mono.CSharp
\r
27 /// Tokenizer for C# source code.
\r
30 public class Tokenizer : yyParser.yyInput
\r
32 StreamReader reader;
\r
33 public string ref_name;
\r
34 public int ref_line = 1;
\r
35 public int line = 1;
\r
37 public int current_token;
\r
38 bool handle_get_set = false;
\r
39 bool handle_remove_add = false;
\r
42 // Returns a verbose representation of the current location
\r
44 public string location {
\r
48 if (current_token == Token.ERROR)
\r
49 det = "detail: " + error_details;
\r
53 // return "Line: "+line+" Col: "+col + "\n" +
\r
54 // "VirtLine: "+ref_line +
\r
55 // " Token: "+current_token + " " + det;
\r
57 return ref_name + " " + "(" + line + "," + col + "), Token:" + current_token + " " + det;
\r
61 public bool PropertyParsing {
\r
63 return handle_get_set;
\r
67 handle_get_set = value;
\r
71 public bool EventParsing {
\r
73 return handle_remove_add;
\r
77 handle_remove_add = value;
\r
84 static Hashtable keywords;
\r
85 static NumberStyles styles;
\r
86 static NumberFormatInfo csharp_format_info;
\r
89 // Values for the associated token returned
\r
91 System.Text.StringBuilder number;
\r
100 const int TAKING = 1;
\r
101 const int TAKEN_BEFORE = 2;
\r
102 const int ELSE_SEEN = 4;
\r
103 const int PARENT_TAKING = 8;
\r
106 // pre-processor if stack state:
\r
111 // Details about the error encountered by the tokenizer
\r
113 string error_details;
\r
115 public string error {
\r
117 return error_details;
\r
133 static void InitTokens ()
\r
135 keywords = new Hashtable ();
\r
137 keywords.Add ("abstract", Token.ABSTRACT);
\r
138 keywords.Add ("as", Token.AS);
\r
139 keywords.Add ("add", Token.ADD);
\r
140 keywords.Add ("base", Token.BASE);
\r
141 keywords.Add ("bool", Token.BOOL);
\r
142 keywords.Add ("break", Token.BREAK);
\r
143 keywords.Add ("byte", Token.BYTE);
\r
144 keywords.Add ("case", Token.CASE);
\r
145 keywords.Add ("catch", Token.CATCH);
\r
146 keywords.Add ("char", Token.CHAR);
\r
147 keywords.Add ("checked", Token.CHECKED);
\r
148 keywords.Add ("class", Token.CLASS);
\r
149 keywords.Add ("const", Token.CONST);
\r
150 keywords.Add ("continue", Token.CONTINUE);
\r
151 keywords.Add ("decimal", Token.DECIMAL);
\r
152 keywords.Add ("default", Token.DEFAULT);
\r
153 keywords.Add ("delegate", Token.DELEGATE);
\r
154 keywords.Add ("do", Token.DO);
\r
155 keywords.Add ("double", Token.DOUBLE);
\r
156 keywords.Add ("else", Token.ELSE);
\r
157 keywords.Add ("enum", Token.ENUM);
\r
158 keywords.Add ("event", Token.EVENT);
\r
159 keywords.Add ("explicit", Token.EXPLICIT);
\r
160 keywords.Add ("extern", Token.EXTERN);
\r
161 keywords.Add ("false", Token.FALSE);
\r
162 keywords.Add ("finally", Token.FINALLY);
\r
163 keywords.Add ("fixed", Token.FIXED);
\r
164 keywords.Add ("float", Token.FLOAT);
\r
165 keywords.Add ("for", Token.FOR);
\r
166 keywords.Add ("foreach", Token.FOREACH);
\r
167 keywords.Add ("goto", Token.GOTO);
\r
168 keywords.Add ("get", Token.GET);
\r
169 keywords.Add ("if", Token.IF);
\r
170 keywords.Add ("implicit", Token.IMPLICIT);
\r
171 keywords.Add ("in", Token.IN);
\r
172 keywords.Add ("int", Token.INT);
\r
173 keywords.Add ("interface", Token.INTERFACE);
\r
174 keywords.Add ("internal", Token.INTERNAL);
\r
175 keywords.Add ("is", Token.IS);
\r
176 keywords.Add ("lock", Token.LOCK);
\r
177 keywords.Add ("long", Token.LONG);
\r
178 keywords.Add ("namespace", Token.NAMESPACE);
\r
179 keywords.Add ("new", Token.NEW);
\r
180 keywords.Add ("null", Token.NULL);
\r
181 keywords.Add ("object", Token.OBJECT);
\r
182 keywords.Add ("operator", Token.OPERATOR);
\r
183 keywords.Add ("out", Token.OUT);
\r
184 keywords.Add ("override", Token.OVERRIDE);
\r
185 keywords.Add ("params", Token.PARAMS);
\r
186 keywords.Add ("private", Token.PRIVATE);
\r
187 keywords.Add ("protected", Token.PROTECTED);
\r
188 keywords.Add ("public", Token.PUBLIC);
\r
189 keywords.Add ("readonly", Token.READONLY);
\r
190 keywords.Add ("ref", Token.REF);
\r
191 keywords.Add ("remove", Token.REMOVE);
\r
192 keywords.Add ("return", Token.RETURN);
\r
193 keywords.Add ("sbyte", Token.SBYTE);
\r
194 keywords.Add ("sealed", Token.SEALED);
\r
195 keywords.Add ("set", Token.SET);
\r
196 keywords.Add ("short", Token.SHORT);
\r
197 keywords.Add ("sizeof", Token.SIZEOF);
\r
198 keywords.Add ("static", Token.STATIC);
\r
199 keywords.Add ("string", Token.STRING);
\r
200 keywords.Add ("struct", Token.STRUCT);
\r
201 keywords.Add ("switch", Token.SWITCH);
\r
202 keywords.Add ("this", Token.THIS);
\r
203 keywords.Add ("throw", Token.THROW);
\r
204 keywords.Add ("true", Token.TRUE);
\r
205 keywords.Add ("try", Token.TRY);
\r
206 keywords.Add ("typeof", Token.TYPEOF);
\r
207 keywords.Add ("uint", Token.UINT);
\r
208 keywords.Add ("ulong", Token.ULONG);
\r
209 keywords.Add ("unchecked", Token.UNCHECKED);
\r
210 keywords.Add ("unsafe", Token.UNSAFE);
\r
211 keywords.Add ("ushort", Token.USHORT);
\r
212 keywords.Add ("using", Token.USING);
\r
213 keywords.Add ("virtual", Token.VIRTUAL);
\r
214 keywords.Add ("void", Token.VOID);
\r
215 keywords.Add ("while", Token.WHILE);
\r
219 // Class initializer
\r
221 static Tokenizer ()
\r
224 csharp_format_info = new NumberFormatInfo ();
\r
225 csharp_format_info.NumberDecimalSeparator = "."; // styles has no currency flags, so the parsers consult the number separator, not the currency one
\r
226 styles = NumberStyles.AllowExponent | NumberStyles.AllowDecimalPoint;
\r
229 bool is_keyword (string name)
\r
233 res = keywords.Contains (name);
\r
234 if (handle_get_set == false && (name == "get" || name == "set"))
\r
236 if (handle_remove_add == false && (name == "remove" || name == "add"))
\r
241 int GetKeyword (string name)
\r
243 return (int) (keywords [name]);
\r
246 public Location Location {
\r
248 return new Location (ref_line);
\r
252 public Tokenizer (System.IO.Stream input, string fname, ArrayList defs)
\r
254 this.ref_name = fname;
\r
255 reader = new System.IO.StreamReader (input);
\r
259 defines = new Hashtable ();
\r
260 foreach (string def in defs)
\r
261 defines [def] = true;
\r
265 // FIXME: This could be `Location.Push' but we have to
\r
266 // find out why the MS compiler allows this
\r
268 Mono.CSharp.Location.Push (fname);
\r
271 bool is_identifier_start_character (char c)
\r
273 return Char.IsLetter (c) || c == '_' ;
\r
276 bool is_identifier_part_character (char c)
\r
278 return (Char.IsLetter (c) || Char.IsDigit (c) || c == '_');
\r
281 int is_punct (char c, ref bool doread)
\r
283 int idx = "{}[](),:;~+-*/%&|^!=<>?".IndexOf (c);
\r
291 return Token.OPEN_BRACE;
\r
293 return Token.CLOSE_BRACE;
\r
295 return Token.OPEN_BRACKET;
\r
297 return Token.CLOSE_BRACKET;
\r
299 return Token.OPEN_PARENS;
\r
301 return Token.CLOSE_PARENS;
\r
303 return Token.COMMA;
\r
305 return Token.COLON;
\r
307 return Token.SEMICOLON;
\r
309 return Token.TILDE;
\r
311 return Token.INTERR;
\r
320 t = Token.OP_ADD_ASSIGN;
\r
330 t = Token.OP_SUB_ASSIGN;
\r
332 return Token.OP_PTR;
\r
334 return Token.MINUS;
\r
342 return Token.OP_NE;
\r
350 return Token.OP_EQ;
\r
352 return Token.ASSIGN;
\r
358 return Token.OP_AND;
\r
359 } else if (d == '='){
\r
361 return Token.OP_AND_ASSIGN;
\r
363 return Token.BITWISE_AND;
\r
369 return Token.OP_OR;
\r
370 } else if (d == '='){
\r
372 return Token.OP_OR_ASSIGN;
\r
374 return Token.BITWISE_OR;
\r
380 return Token.OP_MULT_ASSIGN;
\r
388 return Token.OP_DIV_ASSIGN;
\r
396 return Token.OP_MOD_ASSIGN;
\r
398 return Token.PERCENT;
\r
404 return Token.OP_XOR_ASSIGN;
\r
406 return Token.CARRET;
\r
416 return Token.OP_SHIFT_LEFT_ASSIGN;
\r
418 return Token.OP_SHIFT_LEFT;
\r
419 } else if (d == '='){
\r
421 return Token.OP_LE;
\r
423 return Token.OP_LT;
\r
433 return Token.OP_SHIFT_RIGHT_ASSIGN;
\r
435 return Token.OP_SHIFT_RIGHT;
\r
436 } else if (d == '='){
\r
438 return Token.OP_GE;
\r
440 return Token.OP_GT;
\r
442 return Token.ERROR;
\r
445 bool decimal_digits (int c)
\r
448 bool seen_digits = false;
\r
451 number.Append ((char) c);
\r
453 while ((d = peekChar ()) != -1){
\r
454 if (Char.IsDigit ((char)d)){
\r
455 number.Append ((char) d);
\r
457 seen_digits = true;
\r
461 return seen_digits;
\r
464 void hex_digits (int c)
\r
469 number.Append ((char) c);
\r
470 while ((d = peekChar ()) != -1){
\r
471 char e = Char.ToUpper ((char) d);
\r
473 if (Char.IsDigit (e) || (e >= 'A' && e <= 'F')){
\r
474 number.Append ((char) e);
\r
481 int real_type_suffix (int c)
\r
486 case 'F': case 'f':
\r
487 t = Token.LITERAL_FLOAT;
\r
489 case 'D': case 'd':
\r
490 t = Token.LITERAL_DOUBLE;
\r
492 case 'M': case 'm':
\r
493 t= Token.LITERAL_DECIMAL;
\r
502 int integer_type_suffix (ulong ul, int c)
\r
504 bool is_unsigned = false;
\r
505 bool is_long = false;
\r
508 bool scanning = true;
\r
511 case 'U': case 'u':
\r
514 is_unsigned = true;
\r
521 // if we have not seen anything in between
\r
522 // report this error
\r
526 "the 'l' suffix is easily confused with digit `1'," +
\r
527 " use 'L' for clarity");
\r
543 } while (scanning);
\r
546 if (is_long && is_unsigned){
\r
548 return Token.LITERAL_INTEGER;
\r
549 } else if (is_unsigned){
\r
550 // uint if possible, or ulong else.
\r
552 if ((ul & 0xffffffff00000000) == 0)
\r
556 } else if (is_long){
\r
557 // long if possible, ulong otherwise
\r
558 if ((ul & 0x8000000000000000) != 0)
\r
563 // int, uint, long or ulong in that order
\r
564 if ((ul & 0xffffffff00000000) == 0){
\r
565 uint ui = (uint) ul;
\r
567 if ((ui & 0x80000000) != 0)
\r
572 if ((ul & 0x8000000000000000) != 0)
\r
578 return Token.LITERAL_INTEGER;
\r
582 // given `c' as the next char in the input decide whether
\r
583 // we need to convert to a special type, and then choose
\r
584 // the best representation for the integer
\r
586 int adjust_int (int c)
\r
588 ulong ul = System.UInt64.Parse (number.ToString ());
\r
589 return integer_type_suffix (ul, c);
\r
592 int adjust_real (int t)
\r
594 string s = number.ToString ();
\r
597 case Token.LITERAL_DECIMAL:
\r
598 val = new System.Decimal ();
\r
599 val = System.Decimal.Parse (
\r
600 s, styles, csharp_format_info);
\r
602 case Token.LITERAL_DOUBLE:
\r
603 val = new System.Double ();
\r
604 val = System.Double.Parse (
\r
605 s, styles, csharp_format_info);
\r
607 case Token.LITERAL_FLOAT:
\r
608 val = new System.Double ();
\r
609 val = (float) System.Double.Parse (
\r
610 s, styles, csharp_format_info);
\r
614 val = new System.Double ();
\r
615 val = System.Double.Parse (
\r
616 s, styles, csharp_format_info);
\r
617 t = Token.LITERAL_DOUBLE;
\r
624 // Invoked if we know we have .digits or digits
\r
626 int is_number (int c)
\r
628 bool is_real = false;
\r
629 number = new System.Text.StringBuilder ();
\r
634 if (Char.IsDigit ((char)c)){
\r
635 if (c == '0' && (peekChar () == 'x' || peekChar () == 'X')){ // hex prefix: the leading '0' is required for both 'x' and 'X'
\r
640 string s = number.ToString ();
\r
642 ul = System.UInt64.Parse (s, NumberStyles.HexNumber);
\r
643 return integer_type_suffix (ul, peekChar ());
\r
645 decimal_digits (c);
\r
650 // We need to handle the case of
\r
651 // "1.1" vs "1.string" (LITERAL_FLOAT vs NUMBER DOT IDENTIFIER)
\r
654 if (decimal_digits ('.')){
\r
659 number.Length -= 1;
\r
660 return adjust_int (-1);
\r
664 if (c == 'e' || c == 'E'){
\r
666 number.Append ("e");
\r
671 number.Append ((char) c);
\r
674 } else if (c == '-'){
\r
675 number.Append ((char) c);
\r
679 decimal_digits (-1);
\r
683 type = real_type_suffix (c);
\r
684 if (type == Token.NONE && !is_real){
\r
686 return adjust_int (c);
\r
691 return adjust_real (type);
\r
693 Console.WriteLine ("This should not be reached");
\r
694 throw new Exception ("Is Number should never reach this point");
\r
730 error_details = "cs1009: Unrecognized escape sequence " + (char)d;
\r
739 if (putback_char != -1){
\r
740 int x = putback_char;
\r
745 return reader.Read ();
\r
750 if (putback_char != -1)
\r
751 return putback_char;
\r
752 return reader.Peek ();
\r
755 void putback (int c)
\r
757 if (putback_char != -1)
\r
758 throw new Exception ("This should not happen putback on putback");
\r
762 public bool advance ()
\r
764 return peekChar () != -1;
\r
767 public Object Value {
\r
773 public Object value ()
\r
778 public int token ()
\r
780 current_token = xtoken ();
\r
781 return current_token;
\r
784 static StringBuilder static_cmd_arg = new System.Text.StringBuilder ();
\r
786 void get_cmd_arg (out string cmd, out string arg)
\r
791 static_cmd_arg.Length = 0;
\r
793 while ((c = getChar ()) != -1 && (c != '\n') && (c != ' '))
\r
794 static_cmd_arg.Append ((char) c);
\r
796 cmd = static_cmd_arg.ToString ();
\r
804 // skip over white space
\r
805 while ((c = getChar ()) != -1 && (c != '\n') && (c == ' '))
\r
814 static_cmd_arg.Length = 0;
\r
815 static_cmd_arg.Append ((char) c);
\r
817 while ((c = getChar ()) != -1 && (c != '\n'))
\r
818 static_cmd_arg.Append ((char) c);
\r
824 arg = static_cmd_arg.ToString ().Trim ();
\r
828 // Handles the #line directive
\r
830 bool PreProcessLine (string arg)
\r
835 if (arg == "default"){
\r
836 line = ref_line = line;
\r
843 if ((pos = arg.IndexOf (' ')) != -1 && pos != 0){
\r
844 ref_line = System.Int32.Parse (arg.Substring (0, pos));
\r
847 char [] quotes = { '\"' };
\r
849 ref_name = arg.Substring (pos).Trim ().Trim (quotes); // strings are immutable: keep the trimmed result; drop the separating blank first so the leading quote is reachable
\r
853 ref_line = System.Int32.Parse (arg);
\r
863 // Handles #define and #undef
\r
865 void PreProcessDefinition (bool is_define, string arg)
\r
867 if (arg == "" || arg == "true" || arg == "false"){
\r
868 Report.Error(1001, Location, "Missing identifer to pre-processor directive");
\r
873 if (defines == null)
\r
874 defines = new Hashtable ();
\r
877 if (defines == null)
\r
879 if (defines.Contains (arg))
\r
880 defines.Remove (arg);
\r
884 bool eval_val (string s)
\r
891 if (defines == null)
\r
893 if (defines.Contains (s))
\r
900 // Evaluates an expression for `#if' or `#elif'
\r
902 bool eval (string s)
\r
904 return eval_val (s);
\r
907 void report1028 (string extra)
\r
911 "Unexpected processor directive (" + extra + ")");
\r
915 // if true, then the code continues processing the code
\r
916 // if false, the code stays in a loop until another directive is
\r
919 bool handle_preprocessing_directive ()
\r
921 char [] blank = { ' ', '\t' };
\r
924 get_cmd_arg (out cmd, out arg);
\r
928 if (!PreProcessLine (arg))
\r
931 "Argument to #line directive is missing or invalid");
\r
935 PreProcessDefinition (true, arg);
\r
939 PreProcessDefinition (false, arg);
\r
943 Report.Error (1029, Location, "#error: '" + arg + "'");
\r
947 Report.Warning (1030, Location, "#warning: '" + arg + "'");
\r
959 Report.Error (1517, Location, "Invalid pre-processor directive");
\r
962 bool taking = false;
\r
963 if (ifstack == null)
\r
964 ifstack = new Stack ();
\r
966 if (ifstack.Count == 0){
\r
969 int state = (int) ifstack.Peek ();
\r
970 if ((state & TAKING) != 0)
\r
974 if (eval (arg) && taking){
\r
975 ifstack.Push (TAKING | TAKEN_BEFORE | PARENT_TAKING);
\r
978 ifstack.Push (taking ? PARENT_TAKING : 0);
\r
983 if (ifstack == null || ifstack.Count == 0){
\r
984 report1028 ("no #if for this #endif");
\r
988 if (ifstack.Count == 0)
\r
991 int state = (int) ifstack.Peek ();
\r
993 if ((state & TAKING) != 0)
\r
1001 if (ifstack == null || ifstack.Count == 0){
\r
1002 report1028 ("no #if for this #elif");
\r
1005 int state = (int) ifstack.Peek ();
\r
1007 if ((state & ELSE_SEEN) != 0){
\r
1008 report1028 ("#elif not valid after #else");
\r
1012 if ((state & (TAKEN_BEFORE | TAKING)) != 0)
\r
1015 if (eval (arg) && ((state & PARENT_TAKING) != 0)){
\r
1016 state = (int) ifstack.Pop ();
\r
1017 ifstack.Push (state | TAKING | TAKEN_BEFORE);
\r
1024 if (ifstack == null || ifstack.Count == 0){
\r
1027 "Unexpected processor directive (no #if for this #else)");
\r
1030 int state = (int) ifstack.Peek ();
\r
1032 if ((state & ELSE_SEEN) != 0){
\r
1033 report1028 ("#else within #else");
\r
1038 ifstack.Push (state | ELSE_SEEN);
\r
1040 if ((state & TAKEN_BEFORE) == 0){
\r
1041 if ((state & PARENT_TAKING) != 0)
\r
1050 Report.Error (1024, "Preprocessor directive expected");
\r
1054 public int xtoken ()
\r
1057 bool allow_keyword_as_ident = false;
\r
1058 bool doread = false;
\r
1062 // Optimization: eliminate col and implement #directive semantics correctly.
\r
1063 for (;(c = getChar ()) != -1; col++) {
\r
1064 if (is_identifier_start_character ((char) c)){
\r
1065 System.Text.StringBuilder id = new System.Text.StringBuilder ();
\r
1068 id.Append ((char) c);
\r
1070 while ((c = peekChar ()) != -1) {
\r
1071 if (is_identifier_part_character ((char) c)){
\r
1072 id.Append ((char)getChar ());
\r
1078 ids = id.ToString ();
\r
1080 if (!is_keyword (ids) || allow_keyword_as_ident) {
\r
1082 return Token.IDENTIFIER;
\r
1085 // true, false and null are in the hash anyway.
\r
1086 return GetKeyword (ids);
\r
1091 if (Char.IsDigit ((char) peekChar ()))
\r
1092 return is_number (c);
\r
1096 if (Char.IsDigit ((char) c))
\r
1097 return is_number (c);
\r
1099 // Handle double-slash comments.
\r
1101 int d = peekChar ();
\r
1105 while ((d = getChar ()) != -1 && (d != '\n'))
\r
1111 } else if (d == '*'){
\r
1114 while ((d = getChar ()) != -1){
\r
1115 if (d == '*' && peekChar () == '/'){
\r
1130 /* For now, ignore pre-processor commands */
\r
1131 // FIXME: In C# the '#' is not limited to appear
\r
1132 // on the first column.
\r
1133 if (col <= 1 && c == '#'){
\r
1136 bool cont = handle_preprocessing_directive ();
\r
1144 for (;(c = getChar ()) != -1; col++){
\r
1149 } else if (col == 1 && c == '#'){
\r
1154 Report.Error (1027, Location, "#endif expected");
\r
1158 if ((t = is_punct ((char)c, ref doread)) != Token.ERROR){
\r
1167 System.Text.StringBuilder s = new System.Text.StringBuilder ();
\r
1169 while ((c = getChar ()) != -1){
\r
1171 val = s.ToString ();
\r
1172 return Token.LITERAL_STRING;
\r
1177 return Token.ERROR;
\r
1178 s.Append ((char) c);
\r
1185 error_details = "CS1011: Empty character literal";
\r
1186 return Token.ERROR;
\r
1190 return Token.ERROR;
\r
1191 val = new System.Char ();
\r
1195 error_details = "CS1012: Too many characters in character literal";
\r
1196 // Try to recover, read until newline or next "'"
\r
1197 while ((c = getChar ()) != -1){
\r
1198 if (c == '\n' || c == '\'')
\r
1202 return Token.ERROR;
\r
1204 return Token.LITERAL_CHARACTER;
\r
1214 if (c == ' ' || c == '\t' || c == '\f' || c == '\v' || c == '\r'){
\r
1216 col = (((col + 8) / 8) * 8) - 1;
\r
1222 allow_keyword_as_ident = true;
\r
1226 error_details = ((char)c).ToString ();
\r
1228 return Token.ERROR;
\r
1231 if (ifstack != null && ifstack.Count >= 1) // each open #if holds one stack entry, so a single unterminated #if must be reported too
\r
1232 Report.Error (1027, Location, "#endif expected");
\r