// ==++== // // Copyright (c) Microsoft Corporation. All rights reserved. // // ==--== /*============================================================ ** ** CLASS: Tokenizer.cs ** ** [....] ** ** ** PURPOSE: Tokenize "Elementary XML", that is, XML without ** attributes or DTDs, in other words, XML with ** elements only. ** ** ===========================================================*/ namespace System.Security.Util { using System.Text; using System; using System.IO; using System.Globalization; using System.Diagnostics.Contracts; internal sealed class Tokenizer { // There are five externally knowable token types: bras, kets, // slashes, cstrs, and equals. internal const byte bra = 0; internal const byte ket = 1; internal const byte slash = 2; internal const byte cstr = 3; internal const byte equals = 4; internal const byte quest = 5; internal const byte bang = 6; internal const byte dash = 7; // these are the assorted text characters that the tokenizer must // understand to do its job internal const int intOpenBracket = (int) '<'; internal const int intCloseBracket = (int) '>'; internal const int intSlash = (int) '/'; internal const int intEquals = (int) '='; internal const int intQuote = (int) '\"'; internal const int intQuest = (int) '?'; internal const int intBang = (int) '!'; internal const int intDash = (int) '-'; internal const int intTab = (int) '\t'; internal const int intCR = (int) '\r'; internal const int intLF = (int) '\n'; internal const int intSpace = (int) ' '; // this tells us where we will be getting our input from // and what the encoding is (if any) private enum TokenSource { UnicodeByteArray, // this is little-endian unicode (there are other encodings) UTF8ByteArray, ASCIIByteArray, CharArray, String, NestedStrings, Other } // byte streams can come in 3 different flavors internal enum ByteTokenEncoding { UnicodeTokens, // this is little-endian unicode (there are other encodings) UTF8Tokens, ByteTokens } public int LineNo; // these variables represent the input state of the of the tokenizer private int _inProcessingTag; private byte[] _inBytes; private char[] _inChars; private String _inString; private int _inIndex; private int _inSize; private int _inSavedCharacter; private TokenSource _inTokenSource; private ITokenReader _inTokenReader; // these variables are used to build and deliver tokenizer output strings private StringMaker _maker = null; private String[] _searchStrings; private String[] _replaceStrings; private int _inNestedIndex; private int _inNestedSize; private String _inNestedString; //================================================================ // Constructor uses given ICharInputStream // internal void BasicInitialization() { LineNo = 1 ; _inProcessingTag = 0; _inSavedCharacter = -1; _inIndex = 0; _inSize = 0; _inNestedSize = 0; _inNestedIndex = 0; _inTokenSource = TokenSource.Other; _maker = System.SharedStatics.GetSharedStringMaker(); } public void Recycle() { System.SharedStatics.ReleaseSharedStringMaker(ref _maker); // will set _maker to null } internal Tokenizer (String input) { BasicInitialization(); _inString = input; _inSize = input.Length; _inTokenSource = TokenSource.String; } internal Tokenizer (String input, String[] searchStrings, String[] replaceStrings) { BasicInitialization(); _inString = input; _inSize = _inString.Length; _inTokenSource = TokenSource.NestedStrings; _searchStrings = searchStrings; _replaceStrings = replaceStrings; #if DEBUG Contract.Assert(searchStrings.Length == replaceStrings.Length, "different number of search/replace strings"); Contract.Assert(searchStrings.Length != 0, "no search replace strings, shouldn't be using this ctor"); for (int istr=0; istr= 3 , "XML Slug too small"); Contract.Assert( str[0] == '{', "XML Slug doesn't start with '{'" ); Contract.Assert( str[str.Length-1] == '}', "XML Slug doesn't end with '}'" ); str = replaceStrings[istr]; Contract.Assert( str != null, "XML Replacement null"); Contract.Assert( str.Length >= 1, "XML Replacement empty"); } #endif } internal Tokenizer (byte[] array, ByteTokenEncoding encoding, int startIndex) { BasicInitialization(); _inBytes = array; _inSize = array.Length; _inIndex = startIndex; switch (encoding) { case ByteTokenEncoding.UnicodeTokens: _inTokenSource = TokenSource.UnicodeByteArray; break; case ByteTokenEncoding.UTF8Tokens: _inTokenSource = TokenSource.UTF8ByteArray; break; case ByteTokenEncoding.ByteTokens: _inTokenSource = TokenSource.ASCIIByteArray; break; default: throw new ArgumentException(Environment.GetResourceString("Arg_EnumIllegalVal", (int)encoding)); } } internal Tokenizer (char[] array) { BasicInitialization(); _inChars = array; _inSize = array.Length; _inTokenSource = TokenSource.CharArray; } internal Tokenizer (StreamReader input) { BasicInitialization(); _inTokenReader = new StreamTokenReader(input); } internal void ChangeFormat( System.Text.Encoding encoding ) { if (encoding == null) { return; } Contract.Assert( _inSavedCharacter == -1, "There was a lookahead character at the stream change point, that means the parser is changing encodings too late" ); switch (_inTokenSource) { case TokenSource.UnicodeByteArray: case TokenSource.UTF8ByteArray: case TokenSource.ASCIIByteArray: // these are the ones we can change on the fly if (encoding == System.Text.Encoding.Unicode) { _inTokenSource = TokenSource.UnicodeByteArray; return; } if (encoding == System.Text.Encoding.UTF8) { _inTokenSource = TokenSource.UTF8ByteArray; return; } #if FEATURE_ASCII if (encoding == System.Text.Encoding.ASCII) { _inTokenSource = TokenSource.ASCIIByteArray; return; } #endif break; case TokenSource.String: case TokenSource.CharArray: case TokenSource.NestedStrings: // these are already unicode and encoding changes are moot // they can't be further decoded return; } // if we're here it means we don't know how to change // to the desired encoding with the memory that we have // we'll have to do this the hard way -- that means // creating a suitable stream from what we've got // this is thankfully the rare case as UTF8 and unicode // dominate the scene Stream stream = null; switch (_inTokenSource) { case TokenSource.UnicodeByteArray: case TokenSource.UTF8ByteArray: case TokenSource.ASCIIByteArray: stream = new MemoryStream(_inBytes, _inIndex, _inSize - _inIndex); break; case TokenSource.CharArray: case TokenSource.String: case TokenSource.NestedStrings: Contract.Assert(false, "attempting to change encoding on a non-changable source, should have been prevented earlier" ); return; default: StreamTokenReader reader = _inTokenReader as StreamTokenReader; if (reader == null) { Contract.Assert(false, "A new input source type has been added to the Tokenizer but it doesn't support encoding changes"); return; } stream = reader._in.BaseStream; Contract.Assert( reader._in.CurrentEncoding != null, "Tokenizer's StreamReader does not have an encoding" ); String fakeReadString = new String(' ', reader.NumCharEncountered); stream.Position = reader._in.CurrentEncoding.GetByteCount( fakeReadString ); break; } Contract.Assert(stream != null, "The XML stream with new encoding was not properly initialized for kind of input we had"); // we now have an initialized memory stream based on whatever source we had before _inTokenReader = new StreamTokenReader( new StreamReader( stream, encoding ) ); _inTokenSource = TokenSource.Other; } internal void GetTokens( TokenizerStream stream, int maxNum, bool endAfterKet ) { while (maxNum == -1 || stream.GetTokenCount() < maxNum) { int i = -1; byte ch; int cb = 0; bool inLiteral = false; bool inQuotedString = false; StringMaker m = _maker; m._outStringBuilder = null; m._outIndex = 0; BEGINNING: if (_inSavedCharacter != -1) { i = _inSavedCharacter; _inSavedCharacter = -1; } else switch (_inTokenSource) { case TokenSource.UnicodeByteArray: if (_inIndex + 1 >= _inSize) { stream.AddToken( -1 ); return; } i = (int)((_inBytes[_inIndex+1]<<8) + _inBytes[_inIndex]); _inIndex += 2; break; case TokenSource.UTF8ByteArray: if (_inIndex >= _inSize) { stream.AddToken( -1 ); return; } i = (int)(_inBytes[_inIndex++]); // single byte -- case, early out as we're done already if ((i & 0x80) == 0x00) break; // to decode the lead byte switch on the high nibble // shifted down so the switch gets dense integers switch ((i & 0xf0) >>4) { case 0x8: // 1000 (together these 4 make 10xxxxx) case 0x9: // 1001 case 0xa: // 1010 case 0xb: // 1011 // trail byte is an error throw new XmlSyntaxException( LineNo ); case 0xc: // 1100 (these two make 110xxxxx) case 0xd: // 1101 // two byte encoding (1 trail byte) i &= 0x1f; cb = 2; break; case 0xe: // 1110 (this gets us 1110xxxx) // three byte encoding (2 trail bytes) i &= 0x0f; cb = 3; break; case 0xf: // 1111 (and finally 1111xxxx) // 4 byte encoding is an error throw new XmlSyntaxException( LineNo ); } // at least one trail byte, fetch it if (_inIndex >= _inSize) throw new XmlSyntaxException (LineNo, Environment.GetResourceString( "XMLSyntax_UnexpectedEndOfFile" )); ch = _inBytes[_inIndex++]; // must be trail byte encoding if ((ch & 0xc0) != 0x80) throw new XmlSyntaxException( LineNo ); i = (i<<6) | (ch & 0x3f); // done now if 2 byte encoding, otherwise go for 3 if (cb == 2) break; if (_inIndex >= _inSize) throw new XmlSyntaxException (LineNo, Environment.GetResourceString( "XMLSyntax_UnexpectedEndOfFile" )); ch = _inBytes[_inIndex++]; // must be trail byte encoding if ((ch & 0xc0) != 0x80) throw new XmlSyntaxException( LineNo ); i = (i<<6) | (ch & 0x3f); break; case TokenSource.ASCIIByteArray: if (_inIndex >= _inSize) { stream.AddToken( -1 ); return; } i = (int)(_inBytes[_inIndex++]); break; case TokenSource.CharArray: if (_inIndex >= _inSize) { stream.AddToken( -1 ); return; } i = (int)(_inChars[_inIndex++]); break; case TokenSource.String: if (_inIndex >= _inSize) { stream.AddToken( -1 ); return; } i = (int)(_inString[_inIndex++]); break; case TokenSource.NestedStrings: if (_inNestedSize != 0) { if (_inNestedIndex < _inNestedSize) { i = _inNestedString[_inNestedIndex++]; break; } _inNestedSize = 0; } if (_inIndex >= _inSize) { stream.AddToken( -1 ); return; } i = (int)(_inString[_inIndex++]); if (i != '{') break; for (int istr=0; istr < _searchStrings.Length; istr++) { if (0==String.Compare(_searchStrings[istr], 0, _inString, _inIndex-1, _searchStrings[istr].Length, StringComparison.Ordinal)) { _inNestedString = _replaceStrings[istr]; _inNestedSize = _inNestedString.Length; _inNestedIndex = 1; i = _inNestedString[0]; _inIndex += _searchStrings[istr].Length - 1; break; } } break; default: i = _inTokenReader.Read(); if (i == -1) { stream.AddToken( -1 ); return; } break; } if (!inLiteral) { switch (i) { // skip whitespace case intSpace: case intTab: case intCR: goto BEGINNING; // count linefeeds case intLF: LineNo++; goto BEGINNING; case intOpenBracket: _inProcessingTag++; stream.AddToken( bra ); continue; case intCloseBracket: _inProcessingTag--; stream.AddToken( ket ); if (endAfterKet) return; continue; case intEquals: stream.AddToken( equals ); continue; case intSlash: if (_inProcessingTag != 0) { stream.AddToken( slash ); continue; } break; case intQuest: if (_inProcessingTag != 0) { stream.AddToken( quest ); continue; } break; case intBang: if (_inProcessingTag != 0) { stream.AddToken( bang ); continue; } break; case intDash: if (_inProcessingTag != 0) { stream.AddToken( dash ); continue; } break; case intQuote: inLiteral = true; inQuotedString = true; goto BEGINNING; } } else { switch (i) { case intOpenBracket: if (!inQuotedString) { _inSavedCharacter = i; stream.AddToken( cstr ); stream.AddString( this.GetStringToken() ); continue; } break; case intCloseBracket: case intEquals: case intSlash: if (!inQuotedString && _inProcessingTag != 0) { _inSavedCharacter = i; stream.AddToken( cstr ); stream.AddString( this.GetStringToken() ); continue; } break; case intQuote: if (inQuotedString) { stream.AddToken( cstr ); stream.AddString( this.GetStringToken() ); continue; } break; case intTab: case intCR: case intSpace: if (!inQuotedString) { stream.AddToken( cstr ); stream.AddString( this.GetStringToken() ); continue; } break; // count linefeeds case intLF: LineNo++; if (!inQuotedString) { stream.AddToken( cstr ); stream.AddString( this.GetStringToken() ); continue; } break; } } inLiteral = true; // add character to the string if (m._outIndex < StringMaker.outMaxSize) { // easy case m._outChars[m._outIndex++] = (char)i; } else { if (m._outStringBuilder == null) { // OK, first check if we have to init the StringBuilder m._outStringBuilder = new StringBuilder(); } // OK, copy from _outChars to _outStringBuilder m._outStringBuilder.Append(m._outChars, 0, StringMaker.outMaxSize); // reset _outChars pointer m._outChars[0] = (char)i; m._outIndex = 1; } goto BEGINNING; } } [Serializable] internal sealed class StringMaker { String[] aStrings; uint cStringsMax; uint cStringsUsed; public StringBuilder _outStringBuilder; public char[] _outChars; public int _outIndex; public const int outMaxSize = 512; static uint HashString(String str) { uint hash = 0; int l = str.Length; // rotate in string character for (int i=0; i < l; i++) { hash = (hash << 3) ^ (uint)str[i] ^ (hash >> 29); } return hash; } static uint HashCharArray(char[] a, int l) { uint hash = 0; // rotate in a character for (int i=0; i < l; i++) { hash = (hash << 3) ^ (uint)a[i] ^ (hash >> 29); } return hash; } public StringMaker() { cStringsMax = 2048; cStringsUsed = 0; aStrings = new String[cStringsMax]; _outChars = new char[outMaxSize]; } bool CompareStringAndChars(String str, char [] a, int l) { if (str.Length != l) return false; for (int i=0; i (cStringsMax / 4) * 3) { // we need to rehash uint cNewMax = cStringsMax * 2; String [] aStringsNew = new String[cNewMax]; for (int i=0; i < cStringsMax;i++) { if (aStrings[i] != null) { hash = HashString(aStrings[i]) % cNewMax; while (aStringsNew[hash] != null) { // slot full, skip if (++hash >= cNewMax) hash = 0; } aStringsNew[hash] = aStrings[i]; } } // all done, cutover to the new hash table cStringsMax = cNewMax; aStrings = aStringsNew; } hash = HashCharArray(a, l) % cStringsMax; String str; while ((str = aStrings[hash]) != null) { if (CompareStringAndChars(str, a, l)) return str; if (++hash >= cStringsMax) hash = 0; } str = new String(a,0,l); aStrings[hash] = str; cStringsUsed++; return str; } } //================================================================ // // private String GetStringToken () { return _maker.MakeString(); } internal interface ITokenReader { int Read(); } internal class StreamTokenReader : ITokenReader { internal StreamReader _in; internal int _numCharRead; internal StreamTokenReader(StreamReader input) { _in = input; _numCharRead = 0; } public virtual int Read() { int value = _in.Read(); if (value != -1) _numCharRead++; return value; } internal int NumCharEncountered { get { return _numCharRead; } } } } internal sealed class TokenizerShortBlock { internal short[] m_block = new short[16]; internal TokenizerShortBlock m_next = null; } internal sealed class TokenizerStringBlock { internal String[] m_block = new String[16]; internal TokenizerStringBlock m_next = null; } internal sealed class TokenizerStream { private int m_countTokens; private TokenizerShortBlock m_headTokens; private TokenizerShortBlock m_lastTokens; private TokenizerShortBlock m_currentTokens; private int m_indexTokens; private TokenizerStringBlock m_headStrings; private TokenizerStringBlock m_currentStrings; private int m_indexStrings; #if _DEBUG private bool m_bLastWasCStr; #endif internal TokenizerStream() { m_countTokens = 0; m_headTokens = new TokenizerShortBlock(); m_headStrings = new TokenizerStringBlock(); Reset(); } internal void AddToken( short token ) { if (m_currentTokens.m_block.Length <= m_indexTokens) { m_currentTokens.m_next = new TokenizerShortBlock(); m_currentTokens = m_currentTokens.m_next; m_indexTokens = 0; } m_countTokens++; m_currentTokens.m_block[m_indexTokens++] = token; } internal void AddString( String str ) { if (m_currentStrings.m_block.Length <= m_indexStrings) { m_currentStrings.m_next = new TokenizerStringBlock(); m_currentStrings = m_currentStrings.m_next; m_indexStrings = 0; } m_currentStrings.m_block[m_indexStrings++] = str; } internal void Reset() { m_lastTokens = null; m_currentTokens = m_headTokens; m_currentStrings = m_headStrings; m_indexTokens = 0; m_indexStrings = 0; #if _DEBUG m_bLastWasCStr = false; #endif } internal short GetNextFullToken() { if (m_currentTokens.m_block.Length <= m_indexTokens) { m_lastTokens = m_currentTokens; m_currentTokens = m_currentTokens.m_next; m_indexTokens = 0; } return m_currentTokens.m_block[m_indexTokens++]; } internal short GetNextToken() { short retval = (short)(GetNextFullToken() & 0x00FF); #if _DEBUG Contract.Assert( !m_bLastWasCStr, "CStr token not followed by GetNextString()" ); m_bLastWasCStr = (retval == Tokenizer.cstr); #endif return retval; } internal String GetNextString() { if (m_currentStrings.m_block.Length <= m_indexStrings) { m_currentStrings = m_currentStrings.m_next; m_indexStrings = 0; } #if _DEBUG m_bLastWasCStr = false; #endif return m_currentStrings.m_block[m_indexStrings++]; } internal void ThrowAwayNextString() { GetNextString(); } internal void TagLastToken( short tag ) { if (m_indexTokens == 0) m_lastTokens.m_block[m_lastTokens.m_block.Length-1] = (short)((ushort)m_lastTokens.m_block[m_lastTokens.m_block.Length-1] | (ushort)tag); else m_currentTokens.m_block[m_indexTokens-1] = (short)((ushort)m_currentTokens.m_block[m_indexTokens-1] | (ushort)tag); } internal int GetTokenCount() { return m_countTokens; } internal void GoToPosition( int position ) { Reset(); for (int count = 0; count < position; ++count) { if (GetNextToken() == Tokenizer.cstr) ThrowAwayNextString(); } } } }