//
// cs-tokenizer.cs: The Tokenizer for the C# compiler
//
// Author: Miguel de Icaza (miguel@gnu.org)
//
// Licensed under the terms of the GNU GPL
//
// (C) 2001 Ximian, Inc (http://www.ximian.com)
//
/*
Todo:
Do something with the integer and float suffixes, pass full datatype?
Make sure we accept the proper Unicode ranges, per the spec.
* Error reporting.
I was returning Token.ERROR on errors and setting an
internal error string with the details, but it might make sense
to just use exceptions.
Change of mind: I think I want to keep returning errors *UNLESS* the
parser is catching errors from the tokenizer (at that point, there is
not really any reason to use exceptions) so that I can continue the
parsing
* IDEA
I think I have solved the problem. The idea is to not even *bother*
about handling data types a lot here (except for fitting data into
the proper places), but let the upper layer handle it.
Ie, treat LITERAL_CHARACTER, LITERAL_INTEGER, LITERAL_FLOAT, LITERAL_DOUBLE, and
return then as `LITERAL_LITERAL' with maybe subdetail information
*/
using System;
using System.Text;
using System.Collections;
using System.IO;
using System.Globalization;
namespace CIR
{
///
/// Tokenizer for C# source code.
///
public class Tokenizer : yyParser.yyInput
{
StreamReader reader;
public string ref_name;
public int ref_line = 1;
public int line = 1;
public int col = 1;
public int current_token;
bool handle_get_set = false;
//
// Returns a verbose representation of the current location
//
public string location {
get {
string det;
if (current_token == Token.ERROR)
det = "detail: " + error_details;
else
det = "";
//return "Line: "+line+" Col: "+col + "\n" +
// "VirtLine: "+ref_line +
// " Token: "+current_token + " " + det;
return ref_name + " " + "(" + line + "," + col + ")";
}
}
public bool properties {
get {
return handle_get_set;
}
set {
handle_get_set = value;
}
}
//
// Class variables
//
static Hashtable keywords;
static NumberStyles styles;
static NumberFormatInfo csharp_format_info;
//
// Values for the associated token returned
//
System.Text.StringBuilder number;
int putback_char;
Object val;
//
// Details about the error encoutered by the tokenizer
//
string error_details;
public string error {
get {
return error_details;
}
}
public int Line {
get {
return line;
}
}
public int Col {
get {
return col;
}
}
static void initTokens ()
{
keywords = new Hashtable ();
keywords.Add ("abstract", Token.ABSTRACT);
keywords.Add ("as", Token.AS);
keywords.Add ("add", Token.ADD);
keywords.Add ("base", Token.BASE);
keywords.Add ("bool", Token.BOOL);
keywords.Add ("break", Token.BREAK);
keywords.Add ("byte", Token.BYTE);
keywords.Add ("case", Token.CASE);
keywords.Add ("catch", Token.CATCH);
keywords.Add ("char", Token.CHAR);
keywords.Add ("checked", Token.CHECKED);
keywords.Add ("class", Token.CLASS);
keywords.Add ("const", Token.CONST);
keywords.Add ("continue", Token.CONTINUE);
keywords.Add ("decimal", Token.DECIMAL);
keywords.Add ("default", Token.DEFAULT);
keywords.Add ("delegate", Token.DELEGATE);
keywords.Add ("do", Token.DO);
keywords.Add ("double", Token.DOUBLE);
keywords.Add ("else", Token.ELSE);
keywords.Add ("enum", Token.ENUM);
keywords.Add ("event", Token.EVENT);
keywords.Add ("explicit", Token.EXPLICIT);
keywords.Add ("extern", Token.EXTERN);
keywords.Add ("false", Token.FALSE);
keywords.Add ("finally", Token.FINALLY);
keywords.Add ("fixed", Token.FIXED);
keywords.Add ("float", Token.FLOAT);
keywords.Add ("for", Token.FOR);
keywords.Add ("foreach", Token.FOREACH);
keywords.Add ("goto", Token.GOTO);
keywords.Add ("get", Token.GET);
keywords.Add ("if", Token.IF);
keywords.Add ("implicit", Token.IMPLICIT);
keywords.Add ("in", Token.IN);
keywords.Add ("int", Token.INT);
keywords.Add ("interface", Token.INTERFACE);
keywords.Add ("internal", Token.INTERNAL);
keywords.Add ("is", Token.IS);
keywords.Add ("lock ", Token.LOCK );
keywords.Add ("long", Token.LONG);
keywords.Add ("namespace", Token.NAMESPACE);
keywords.Add ("new", Token.NEW);
keywords.Add ("null", Token.NULL);
keywords.Add ("object", Token.OBJECT);
keywords.Add ("operator", Token.OPERATOR);
keywords.Add ("out", Token.OUT);
keywords.Add ("override", Token.OVERRIDE);
keywords.Add ("params", Token.PARAMS);
keywords.Add ("private", Token.PRIVATE);
keywords.Add ("protected", Token.PROTECTED);
keywords.Add ("public", Token.PUBLIC);
keywords.Add ("readonly", Token.READONLY);
keywords.Add ("ref", Token.REF);
keywords.Add ("remove", Token.REMOVE);
keywords.Add ("return", Token.RETURN);
keywords.Add ("sbyte", Token.SBYTE);
keywords.Add ("sealed", Token.SEALED);
keywords.Add ("set", Token.SET);
keywords.Add ("short", Token.SHORT);
keywords.Add ("sizeof", Token.SIZEOF);
keywords.Add ("static", Token.STATIC);
keywords.Add ("string", Token.STRING);
keywords.Add ("struct", Token.STRUCT);
keywords.Add ("switch", Token.SWITCH);
keywords.Add ("this", Token.THIS);
keywords.Add ("throw", Token.THROW);
keywords.Add ("true", Token.TRUE);
keywords.Add ("try", Token.TRY);
keywords.Add ("typeof", Token.TYPEOF);
keywords.Add ("uint", Token.UINT);
keywords.Add ("ulong", Token.ULONG);
keywords.Add ("unchecked", Token.UNCHECKED);
keywords.Add ("unsafe", Token.UNSAFE);
keywords.Add ("ushort", Token.USHORT);
keywords.Add ("using", Token.USING);
keywords.Add ("virtual", Token.VIRTUAL);
keywords.Add ("void", Token.VOID);
keywords.Add ("while", Token.WHILE);
}
//
// Class initializer
//
static Tokenizer ()
{
initTokens ();
csharp_format_info = new NumberFormatInfo ();
csharp_format_info.CurrencyDecimalSeparator = ".";
styles = NumberStyles.AllowExponent | NumberStyles.AllowDecimalPoint;
}
bool is_keyword (string name)
{
bool res;
res = keywords.Contains (name);
if ((name == "get" || name == "set") && handle_get_set == false)
return false;
return res;
}
int getKeyword (string name)
{
return (int) (keywords [name]);
}
public Location Location {
get {
return new Location (ref_line);
}
}
public Tokenizer (System.IO.Stream input, string fname)
{
this.ref_name = fname;
reader = new System.IO.StreamReader (input);
putback_char = -1;
Location.Push (fname);
}
bool is_identifier_start_character (char c)
{
return Char.IsLetter (c) || c == '_' ;
}
bool is_identifier_part_character (char c)
{
return (Char.IsLetter (c) || Char.IsDigit (c) || c == '_');
}
int is_punct (char c, ref bool doread)
{
int idx = "{}[](),:;~+-*/%&|^!=<>?".IndexOf (c);
int d;
int t;
doread = false;
switch (c){
case '{':
return Token.OPEN_BRACE;
case '}':
return Token.CLOSE_BRACE;
case '[':
return Token.OPEN_BRACKET;
case ']':
return Token.CLOSE_BRACKET;
case '(':
return Token.OPEN_PARENS;
case ')':
return Token.CLOSE_PARENS;
case ',':
return Token.COMMA;
case ':':
return Token.COLON;
case ';':
return Token.SEMICOLON;
case '~':
return Token.TILDE;
case '?':
return Token.INTERR;
}
d = peekChar ();
if (c == '+'){
if (d == '+')
t = Token.OP_INC;
else if (d == '=')
t = Token.OP_ADD_ASSIGN;
else
return Token.PLUS;
doread = true;
return t;
}
if (c == '-'){
if (d == '-')
t = Token.OP_DEC;
else if (d == '=')
t = Token.OP_SUB_ASSIGN;
else if (d == '>')
return Token.OP_PTR;
else
return Token.MINUS;
doread = true;
return t;
}
if (c == '!'){
if (d == '='){
doread = true;
return Token.OP_NE;
}
return Token.BANG;
}
if (c == '='){
if (d == '='){
doread = true;
return Token.OP_EQ;
}
return Token.ASSIGN;
}
if (c == '&'){
if (d == '&'){
doread = true;
return Token.OP_AND;
} else if (d == '='){
doread = true;
return Token.OP_AND_ASSIGN;
}
return Token.BITWISE_AND;
}
if (c == '|'){
if (d == '|'){
doread = true;
return Token.OP_OR;
} else if (d == '='){
doread = true;
return Token.OP_OR_ASSIGN;
}
return Token.BITWISE_OR;
}
if (c == '*'){
if (d == '='){
doread = true;
return Token.OP_MULT_ASSIGN;
}
return Token.STAR;
}
if (c == '/'){
if (d == '='){
doread = true;
return Token.OP_DIV_ASSIGN;
}
return Token.DIV;
}
if (c == '%'){
if (d == '='){
doread = true;
return Token.OP_MOD_ASSIGN;
}
return Token.PERCENT;
}
if (c == '^'){
if (d == '='){
doread = true;
return Token.OP_XOR_ASSIGN;
}
return Token.CARRET;
}
if (c == '<'){
if (d == '<'){
getChar ();
d = peekChar ();
if (d == '='){
doread = true;
return Token.OP_SHIFT_LEFT_ASSIGN;
}
return Token.OP_SHIFT_LEFT;
} else if (d == '='){
doread = true;
return Token.OP_LE;
}
return Token.OP_LT;
}
if (c == '>'){
if (d == '>'){
getChar ();
d = peekChar ();
if (d == '='){
doread = true;
return Token.OP_SHIFT_RIGHT_ASSIGN;
}
return Token.OP_SHIFT_RIGHT;
} else if (d == '='){
doread = true;
return Token.OP_GE;
}
return Token.OP_GT;
}
return Token.ERROR;
}
bool decimal_digits (int c)
{
int d;
bool seen_digits = false;
if (c != -1)
number.Append ((char) c);
while ((d = peekChar ()) != -1){
if (Char.IsDigit ((char)d)){
number.Append ((char) d);
getChar ();
seen_digits = true;
} else
break;
}
return seen_digits;
}
void hex_digits (int c)
{
int d;
if (c != -1)
number.Append ((char) c);
while ((d = peekChar ()) != -1){
char e = Char.ToUpper ((char) d);
if (Char.IsDigit (e) ||
(e >= 'A' && e <= 'F')){
number.Append ((char) e);
getChar ();
} else
break;
}
}
int real_type_suffix (int c)
{
int t;
switch (c){
case 'F': case 'f':
t = Token.LITERAL_FLOAT;
break;
case 'D': case 'd':
t = Token.LITERAL_DOUBLE;
break;
case 'M': case 'm':
t= Token.LITERAL_DECIMAL;
break;
default:
return Token.NONE;
}
//getChar ();
return t;
}
int integer_type_suffix (int c)
{
// FIXME: Handle U and L suffixes.
// We also need to see in which kind of
// Int the thing fits better according to the spec.
return Token.LITERAL_INTEGER;
}
void adjust_int (int t)
{
val = new System.Int32();
val = System.Int32.Parse (number.ToString (), 0);
}
int adjust_real (int t)
{
string s = number.ToString ();
switch (t){
case Token.LITERAL_DECIMAL:
val = new System.Decimal ();
val = System.Decimal.Parse (
s, styles, csharp_format_info);
break;
case Token.LITERAL_DOUBLE:
val = new System.Double ();
val = System.Double.Parse (
s, styles, csharp_format_info);
break;
case Token.LITERAL_FLOAT:
val = new System.Double ();
val = (float) System.Double.Parse (
s, styles, csharp_format_info);
break;
case Token.NONE:
val = new System.Double ();
val = System.Double.Parse (
s, styles, csharp_format_info);
t = Token.LITERAL_DOUBLE;
break;
}
return t;
}
//
// Invoked if we know we have .digits or digits
//
int is_number (int c)
{
bool is_real = false;
number = new System.Text.StringBuilder ();
int type;
number.Length = 0;
if (Char.IsDigit ((char)c)){
if (c == '0' && peekChar () == 'x' || peekChar () == 'X'){
ulong ul;
getChar ();
hex_digits (-1);
string s = number.ToString ();
ul = System.UInt64.Parse (s, NumberStyles.HexNumber);
if ((ul & 0xffffffff00000000) == 0){
uint ui = (uint) ul;
if ((ui & 0x80000000) != 0)
val = ui;
else
val = (int) ui;
} else {
if ((ul & 0x8000000000000000) != 0)
val = ul;
else
val = (long) ul;
}
return integer_type_suffix (peekChar ());
}
decimal_digits (c);
c = getChar ();
}
//
// We need to handle the case of
// "1.1" vs "1.string" (LITERAL_FLOAT vs NUMBER DOT IDENTIFIER)
//
if (c == '.'){
if (decimal_digits ('.')){
is_real = true;
c = peekChar ();
} else {
putback ('.');
number.Length -= 1;
adjust_int (Token.LITERAL_INTEGER);
return Token.LITERAL_INTEGER;
}
}
if (c == 'e' || c == 'E'){
is_real = true;
number.Append ("e");
getChar ();
c = peekChar ();
if (c == '+'){
number.Append ((char) c);
getChar ();
c = peekChar ();
} else if (c == '-'){
number.Append ((char) c);
getChar ();
c = peekChar ();
}
decimal_digits (-1);
c = peekChar ();
}
type = real_type_suffix (c);
if (type == Token.NONE && !is_real){
type = integer_type_suffix (c);
adjust_int (type);
putback (c);
return type;
} else
is_real = true;
if (is_real)
return adjust_real (type);
Console.WriteLine ("This should not be reached");
throw new Exception ("Is Number should never reach this point");
}
int escape (int c)
{
int d;
int v;
d = peekChar ();
if (c != '\\')
return c;
switch (d){
case 'a':
v = '\a'; break;
case 'b':
v = '\b'; break;
case 'n':
v = '\n'; break;
case 't':
v = '\t'; break;
case 'v':
v = '\v'; break;
case 'r':
v = 'c'; break;
case '\\':
v = '\\'; break;
case 'f':
v = '\f'; break;
case '0':
v = 0; break;
case '"':
v = '"'; break;
case '\'':
v = '\''; break;
default:
error_details = "cs1009: Unrecognized escape sequence " + (char)d;
return -1;
}
getChar ();
return v;
}
int getChar ()
{
if (putback_char != -1){
int x = putback_char;
putback_char = -1;
return x;
}
return reader.Read ();
}
int peekChar ()
{
if (putback_char != -1)
return putback_char;
return reader.Peek ();
}
void putback (int c)
{
if (putback_char != -1)
throw new Exception ("This should not happen putback on putback");
putback_char = c;
}
public bool advance ()
{
return peekChar () != -1;
}
public Object Value {
get {
return val;
}
}
public Object value ()
{
return val;
}
public int token ()
{
current_token = xtoken ();
return current_token;
}
public int xtoken ()
{
int t;
bool allow_keyword_as_ident = false;
bool doread = false;
int c;
val = null;
for (;(c = getChar ()) != -1; col++) {
if (is_identifier_start_character ((char) c)){
System.Text.StringBuilder id = new System.Text.StringBuilder ();
string ids;
id.Append ((char) c);
while ((c = peekChar ()) != -1) {
if (is_identifier_part_character ((char) c)){
id.Append ((char)getChar ());
col++;
} else
break;
}
ids = id.ToString ();
if (!is_keyword (ids) || allow_keyword_as_ident) {
val = ids;
return Token.IDENTIFIER;
}
// true, false and null are in the hash anyway.
return getKeyword (ids);
}
if (c == '.'){
if (Char.IsDigit ((char) peekChar ()))
return is_number (c);
return Token.DOT;
}
if (Char.IsDigit ((char) c))
return is_number (c);
// Handle double-slash comments.
if (c == '/'){
int d = peekChar ();
if (d == '/'){
getChar ();
while ((d = getChar ()) != -1 && (d != '\n'))
col++;
line++;
ref_line++;
continue;
} else if (d == '*'){
getChar ();
while ((d = getChar ()) != -1){
if (d == '*' && peekChar () == '/'){
getChar ();
col++;
break;
}
if (d == '\n'){
line++;
ref_line++;
}
col++;
}
continue;
}
}
/* For now, ignore pre-processor commands */
if (col == 1 && c == '#'){
System.Text.StringBuilder s = new System.Text.StringBuilder ();
while ((c = getChar ()) != -1 && (c != '\n')){
s.Append ((char) c);
}
if (String.Compare (s.ToString (), 0, "line", 0, 4) == 0){
string arg = s.ToString ().Substring (5);
int pos;
if ((pos = arg.IndexOf (' ')) != -1 && pos != 0){
ref_line = System.Int32.Parse (arg.Substring (0, pos));
pos++;
char [] quotes = { '\"' };
ref_name = arg.Substring (pos);
ref_name.TrimStart (quotes);
ref_name.TrimEnd (quotes);
} else
ref_line = System.Int32.Parse (arg);
}
line++;
ref_line++;
continue;
}
if ((t = is_punct ((char)c, ref doread)) != Token.ERROR){
if (doread){
getChar ();
col++;
}
return t;
}
if (c == '"'){
System.Text.StringBuilder s = new System.Text.StringBuilder ();
while ((c = getChar ()) != -1){
if (c == '"'){
val = s.ToString ();
return Token.LITERAL_STRING;
}
c = escape (c);
if (c == -1)
return Token.ERROR;
s.Append ((char) c);
}
}
if (c == '\''){
c = getChar ();
if (c == '\''){
error_details = "CS1011: Empty character literal";
return Token.ERROR;
}
c = escape (c);
if (c == -1)
return Token.ERROR;
val = new System.Char ();
val = (char) c;
c = getChar ();
if (c != '\''){
error_details = "CS1012: Too many characters in character literal";
// Try to recover, read until newline or next "'"
while ((c = getChar ()) != -1){
if (c == '\n' || c == '\'')
break;
}
return Token.ERROR;
}
return Token.LITERAL_CHARACTER;
}
// white space
if (c == '\n'){
line++;
ref_line++;
col = 0;
continue;
}
if (c == ' ' || c == '\t' || c == '\f' || c == '\v' || c == '\r'){
if (c == '\t')
col = (((col + 8) / 8) * 8) - 1;
continue;
}
if (c == '@'){
allow_keyword_as_ident = true;
continue;
}
error_details = ((char)c).ToString ();
return Token.ERROR;
}
return Token.EOF;
}
}
//
// Keeps track of the location in the program
//
// This uses a compact representation and a couple of auxiliary
// structures to keep track of tokens to (file,line) mappings.
//
// We could probably also keep track of columns by storing those
// in 8 bits (and say, map anything after char 255 to be `255+').
//
public struct Location {
public int token;
static Hashtable map;
static ArrayList list;
static int global_count;
static int module_base;
static void Reset ()
{
map = new Hashtable ();
list = new ArrayList ();
global_count = 0;
module_base = 0;
}
static Location ()
{
Reset ();
}
public void Push (string name)
{
map.Add (global_count, name);
list.Add (global_count);
module_base = global_count;
}
public Location (int row)
{
if (row < 0)
token = -1;
else {
token = module_base + row;
global_count = token;
}
}
//
// Whether the Location is Null
//
static public bool IsNull (Location l)
{
return l.token == -1;
}
static public Location Null {
get {
return new Location (-1);
}
}
public string Name {
get {
if (token < 0)
return "Internal";
foreach (int b in list){
if (token < b)
continue;
return (string) map [b];
}
return "Unknown";
}
}
public int Row {
get {
if (token < 0)
return 1;
foreach (int b in list){
if (token < b)
continue;
return token - b;
}
return 0;
}
}
}
}