X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mcs%2Fclass%2Fcorlib%2FSystem.Text%2FUTF8Encoding.cs;h=1944d98b2c081d34ec5f4410521f07f4fc144f63;hb=a93fa79df7dcaebf723bb3359c915291135faf2f;hp=52447e392e9e1be72ded9b338eedcf2791c23bf4;hpb=a3ea7ceb4d4f5e2cb8ea421313e8939640fb898c;p=mono.git diff --git a/mcs/class/corlib/System.Text/UTF8Encoding.cs b/mcs/class/corlib/System.Text/UTF8Encoding.cs index 52447e392e9..1944d98b2c0 100644 --- a/mcs/class/corlib/System.Text/UTF8Encoding.cs +++ b/mcs/class/corlib/System.Text/UTF8Encoding.cs @@ -30,11 +30,8 @@ using System; using System.Runtime.InteropServices; [Serializable] -[MonoTODO ("Fix serialization compatibility with MS.NET")] -#if NET_2_0 -[MonoTODO ("EncoderFallback is not handled")] +[MonoLimitation ("Serialization format not compatible with .NET")] [ComVisible (true)] -#endif public class UTF8Encoding : Encoding { // Magic number used by Windows for UTF-8. @@ -42,9 +39,6 @@ public class UTF8Encoding : Encoding // Internal state. private bool emitIdentifier; -#if !NET_2_0 - private bool throwOnInvalid; -#endif // Constructors. 
public UTF8Encoding () : this (false, false) {} @@ -55,411 +49,624 @@ public class UTF8Encoding : Encoding : base (UTF8_CODE_PAGE) { emitIdentifier = encoderShouldEmitUTF8Identifier; -#if NET_2_0 if (throwOnInvalidBytes) - SetFallbackInternal (null, new DecoderExceptionFallback ()); + SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback); else - SetFallbackInternal (null, new DecoderReplacementFallback (String.Empty)); -#else - throwOnInvalid = throwOnInvalidBytes; -#endif + SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback); web_name = body_name = header_name = "utf-8"; encoding_name = "Unicode (UTF-8)"; is_browser_save = true; is_browser_display = true; is_mail_news_display = true; + is_mail_news_save = true; windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE; } - #region GetByteCount() - - // Internal version of "GetByteCount" which can handle a rolling - // state between multiple calls to this method. - private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush) - { - // Validate the parameters. - if (chars == null) { - throw new ArgumentNullException ("chars"); - } - if (index < 0 || index > chars.Length) { - throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array")); + /////////////////////////////////////////////////////////////////////// + // INTERNAL DECODING FUNCTION (UTF8 -> CHAR/UTF16) + /////////////////////////////////////////////////////////////////////// + + internal enum DecoderStatus { + Ok, + InsufficientSpace, + InvalidChar, + InvalidSequence, + InvalidStart, + InputRunOut, + SurrogateFound, + Overlong, + }; + + // following method decodes an utf8 character from a byte buffer. + // NOTE: If 'chars' is null, this function only counts bytes and chars + // without writing anything. + // NOTE: BOM (0xEF 0xBB 0xBF) is not yet supported. 
+ // See http://www.cl.cam.ac.uk/~mgk25/unicode.html + private unsafe static DecoderStatus InternalGetChar ( + byte* bytes, int byteCount, + char* chars, int charCount, + out int bytesProcessed, out int charsProcessed, + ref uint leftBytes, ref uint leftBits, ref uint procBytes) + { + uint ch; + bool checkByte; + + // reset counters + bytesProcessed = 0; + charsProcessed = 0; + + // Fetch the start character from the byte buffer. + if (leftBytes == 0) { + if (byteCount == 0) + return DecoderStatus.InputRunOut; + ch = (uint) (*bytes++); + bytesProcessed++; + byteCount--; + procBytes = ch; + if (ch < (uint) 0x0080) { + // Single-byte UTF-8 character. + leftBits = ch; + leftBytes = 0; + } else if (ch == (uint) 0xc0 || ch == (uint) 0xc1) { + // invalid start + return DecoderStatus.InvalidChar; + } else if ((ch & (uint) 0xE0) == (uint) 0xC0) { + // Double-byte UTF-8 character. + leftBits = ((ch & (uint) 0x1F) << 6*1); + leftBytes = 1; + } else if ((ch & (uint) 0xF0) == (uint) 0xE0) { + // Three-byte UTF-8 character. + leftBits = ((ch & (uint) 0x0F) << 6*2); + leftBytes = 2; + } else if ((ch & (uint) 0xF8) == (uint) 0xF0) { + // Four-byte UTF-8 character. + leftBits = ((ch & (uint) 0x07) << 6*3); + leftBytes = 3; + // extra check for detecting as soon as + // possible too big four-byte utf chars + if (leftBits >= (uint) 0x110000) + return DecoderStatus.InvalidChar; + } else { + // Invalid five-or-six-byte or start char + // NOTE: I keep here the code for 5/6 bytes if + // needed, but technically these combinations + // are invalid in UTF-8 sequences. 
+ // (ch & (uint) 0xFC) == (uint) 0xF8 => + // leftBits = ch & (uint) 0x03; + // leftBytes = 4; + // (ch & (uint) 0xFE) == (uint) 0xFC => + // leftBits = ch & (uint) 0x01; + // leftBytes = 5; + leftBits = leftBytes = 0; + return DecoderStatus.InvalidStart; + } + checkByte = (leftBytes > 0 && leftBits == 0); + } else { + // restore state + checkByte = (leftBytes >> 4) != 0; + leftBytes &= (uint) 0x0f; } - if (count < 0 || count > (chars.Length - index)) { - throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array")); + + // process the required bytes... + for (; leftBytes > 0; leftBytes--) { + if (byteCount == 0) { + leftBytes = ((uint) (checkByte ? 0x10 : 0x00)) | leftBytes; + return DecoderStatus.InputRunOut; + } + ch = (uint) (*bytes++); + if ((ch & (uint) 0xC0) != (uint) 0x80) { + // Invalid UTF-8 sequence: clear and restart. + // NOTE: we return before counting the + // processed bytes for restarting + // decoding later at this point + return DecoderStatus.InvalidSequence; + } + bytesProcessed++; + byteCount--; + procBytes = (procBytes << 8) | ch; + if (checkByte && ((~((uint) 0x1f >> (int) leftBytes - 2)) & ch) == 0x80) { + // detected an overlong sequence :( + return DecoderStatus.Overlong; + } + checkByte = false; + leftBits = leftBits | ((ch & (uint) 0x3F) << (6*(int) (leftBytes - 1))); + if (leftBits >= (uint) 0x110000) { + // this UTF-8 is too big ... + return DecoderStatus.InvalidChar; + } + if ((leftBits & 0xF800) == 0xD800) { + // UTF-8 doesn't use surrogate characters + return DecoderStatus.SurrogateFound; + } } - if (index == chars.Length) { - if (flush && leftOver != '\0') { - // Flush the left-over surrogate pair start. 
- leftOver = '\0'; - return 3; + // convert this character to UTF-16 + if (leftBits < (uint) 0x10000) { + if(chars != null) { + if(charCount < 1) + return DecoderStatus.InsufficientSpace; + *chars = (char) leftBits; + } + charsProcessed++; + } else { + if(chars != null) { + if(charCount < 2) + return DecoderStatus.InsufficientSpace; + leftBits -= (uint) 0x10000; + *chars++ = (char) ((leftBits >> 10) + (uint) 0xD800); + *chars++ = (char) ((leftBits & (uint) 0x3FF) + (uint) 0xDC00); } - return 0; + charsProcessed += 2; } - unsafe { - fixed (char* cptr = chars) { - return InternalGetByteCount (cptr + index, count, ref leftOver, flush); + // we've read a complete char... reset decoder status and finish + leftBytes = leftBits = procBytes = 0; + return DecoderStatus.Ok; + } + + internal unsafe static DecoderStatus InternalGetChars ( + byte* bytes, int byteCount, + char* chars, int charCount, + DecoderFallbackBuffer fallbackBuffer, + out int bytesProcessed, out int charsProcessed, + ref uint leftBytes, ref uint leftBits, ref uint procBytes) + { + DecoderStatus s; + int t_bytesProcessed, t_charsProcessed; + + // Validate parameters + if (bytes == null) + throw new ArgumentNullException ("bytes"); + if (byteCount < 0) + throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative")); + if (charCount < 0) + throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative")); + + // reset counters + charsProcessed = 0; + bytesProcessed = 0; + + // byte processing loop + while(byteCount - bytesProcessed > 0 && (chars == null || charCount - charsProcessed > 0)) { + // fetch a char from the input byte array + s = chars != null + ? 
InternalGetChar ( + bytes + bytesProcessed, byteCount - bytesProcessed, + chars + charsProcessed, charCount - charsProcessed, + out t_bytesProcessed, out t_charsProcessed, + ref leftBytes, ref leftBits, ref procBytes) + : InternalGetChar ( + bytes + bytesProcessed, byteCount - bytesProcessed, + null, 0, + out t_bytesProcessed, out t_charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + + // update counters + charsProcessed += t_charsProcessed; + bytesProcessed += t_bytesProcessed; + + switch(s) { + case DecoderStatus.Ok: + break; // everything OK :D + + case DecoderStatus.InsufficientSpace: + throw new ArgumentException ("Insufficient Space", "chars"); + + case DecoderStatus.Overlong: + case DecoderStatus.InvalidSequence: + case DecoderStatus.InvalidStart: + case DecoderStatus.InvalidChar: + case DecoderStatus.SurrogateFound: + // Invalid UTF-8 characters and sequences... + // now we build a 'bytesUnknown' array with the + // stored bytes in 'procBytes'. + int extra = 0; + for (uint t = procBytes; t != 0; extra++) + t = t >> 8; + byte [] bytesUnknown = new byte [extra]; + for (int i = extra; i > 0; i--) + bytesUnknown [i - 1] = (byte) ((procBytes >> (8 * (extra - i))) & 0xff); + // partial reset: this condition avoids + // infinite loops + if (s == DecoderStatus.InvalidSequence) + leftBytes = 0; + // call the fallback and cross fingers + fallbackBuffer.Fallback (bytesUnknown, bytesProcessed - extra); + if(chars != null) { + while (fallbackBuffer.Remaining > 0) { + if (charsProcessed >= charCount) + throw new ArgumentException ("Insufficient Space", "chars/fallback"); + chars [charsProcessed++] = fallbackBuffer.GetNextChar (); + } + } else + charsProcessed += fallbackBuffer.Remaining; + fallbackBuffer.Reset (); + // recovery was succesful, reset decoder state + leftBits = leftBytes = procBytes = 0; + break; + + case DecoderStatus.InputRunOut: + return DecoderStatus.InputRunOut; } } + return DecoderStatus.Ok; } + // Get the characters that result from 
decoding a byte buffer. + internal unsafe static DecoderStatus InternalGetChars ( + byte[] bytes, int byteIndex, int byteCount, + char[] chars, int charIndex, + DecoderFallbackBuffer fallbackBuffer, + out int bytesProcessed, out int charsProcessed, + ref uint leftBytes, ref uint leftBits, ref uint procBytes) + { + // Validate the parameters. + if (bytes == null) + throw new ArgumentNullException ("bytes"); + if (byteIndex < 0 || byteIndex >= bytes.Length) + throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array")); + if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) + throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array")); + if (charIndex < 0 || charIndex > (chars != null && chars.Length > 0 ? chars.Length - 1 : 0)) + throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array")); + + fixed (char* cptr = chars) { + fixed (byte* bptr = bytes) { + return InternalGetChars ( + bptr + byteIndex, byteCount, + chars != null ? cptr + charIndex : null, + chars != null ? chars.Length - charIndex : 0, + fallbackBuffer, + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + } + } + } - private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush) + /////////////////////////////////////////////////////////////////////// + // INTERNAL ENCODING FUNCTION (CHAR/UTF16 -> UTF8) + /////////////////////////////////////////////////////////////////////// + + internal enum EncoderStatus { + Ok, + InputRunOut, + InsufficientSpace, + InvalidChar, + InvalidSurrogate, + }; + + // following method encodes an utf8 character into a byte buffer. + // NOTE: If 'bytes' is null, this function only counts bytes and chars + // without writing anything. + // NOTE: BOM (0xEF 0xBB 0xBF) is not yet supported. 
+ // See http://www.cl.cam.ac.uk/~mgk25/unicode.html + private unsafe static EncoderStatus InternalGetByte ( + char* chars, int charCount, + byte* bytes, int byteCount, + out int charsProcessed, out int bytesProcessed, ref uint leftChar) { - int index = 0; - - // Determine the lengths of all characters. - char ch; - int length = 0; - char pair = leftOver; - while (count > 0) { - ch = chars[index]; - if (pair == 0) { - if (ch < '\u0080') { - // fast path optimization - int end = index + count; - for (; index < end; index++, count--) { - if (chars [index] < '\x80') - ++length; - else - break; - } - continue; - //length++; - } else if (ch < '\u0800') { - length += 2; - } else if (ch >= '\uD800' && ch <= '\uDBFF') { - // This is the start of a surrogate pair. - pair = ch; - } else { - length += 3; + uint ch; + + // reset counters + charsProcessed = 0; + bytesProcessed = 0; + + // process one char (this block executes twice if a surrogate is found) +again: + if (charCount < 1) + return EncoderStatus.InputRunOut; + + ch = *chars++; + + if (leftChar == 0) { + // char counting is inside if for reason discused in else + charsProcessed++; + charCount--; + if (ch < (uint) 0x80) { + if (bytes != null) { + if(byteCount < 1) + return EncoderStatus.InsufficientSpace; + *bytes++ = (byte) ch; + byteCount--; } - } else if (ch >= '\uDC00' && ch <= '\uDFFF') { - if (pair != 0) { - // We have a surrogate pair. - length += 4; - pair = '\0'; - } else { - // We have a surrogate tail without - // leading surrogate. In NET_2_0 it - // uses fallback. In NET_1_1 we output - // wrong surrogate. 
- length += 3; - pair = '\0'; + bytesProcessed++; + } else if (ch < (uint) 0x0800) { + if (bytes != null) { + if (byteCount < 2) + return EncoderStatus.InsufficientSpace; + *bytes++ = (byte) ((uint) 0xC0 | (ch >> 6) & 0x3f); + *bytes++ = (byte) ((uint) 0x80 | ch & 0x3f); + byteCount -= 2; } + bytesProcessed += 2; + } else if (ch < (uint) 0xD800 || ch > (uint) 0xDFFF) { + if (bytes != null) { + if (byteCount < 3) + return EncoderStatus.InsufficientSpace; + *bytes++ = (byte) ((uint) 0xE0 | (ch >> 12)); + *bytes++ = (byte) ((uint) 0x80 | ((ch >> 6) & 0x3F)); + *bytes++ = (byte) ((uint) 0x80 | (ch & 0x3F)); + byteCount -= 3; + } + bytesProcessed += 3; + } else if (ch <= (uint) 0xDBFF) { + // This is a surrogate char, repeat please + leftChar = ch; + goto again; + } else { + // We have a surrogate tail without + // leading surrogate. + return EncoderStatus.InvalidChar; + } + } else { + if (ch >= (uint) 0xDC00 && ch <= (uint) 0xDFFF) { + // We have a correct surrogate pair. + ch = 0x10000 + (uint) ch - (uint) 0xDC00 + + ((leftChar - (uint) 0xD800) << 10); + if (bytes != null) { + if (byteCount < 4) + return EncoderStatus.InsufficientSpace; + *bytes++ = (byte) (0xF0 | (ch >> 18)); + *bytes++ = (byte) (0x80 | ((ch >> 12) & 0x3F)); + *bytes++ = (byte) (0x80 | ((ch >> 6) & 0x3F)); + *bytes++ = (byte) (0x80 | (ch & 0x3F)); + byteCount -= 4; + } + bytesProcessed += 4; } else { // We have a surrogate start followed by a // regular character. Technically, this is - // invalid, but we have to do something. - // We write out the surrogate start and then - // re-visit the current character again. - length += 3; - pair = '\0'; - continue; + // invalid, so we fail :( + return EncoderStatus.InvalidSurrogate; } - ++index; - --count; - } - if (flush) { - if (pair != '\0') - // Flush the left-over surrogate pair start. 
- length += 3; - leftOver = '\0'; + // increment counters; this is done after processing + // the surrogate: in case of a bad surrogate the + // encoding should restart on the faulty char (maybe + // the correct surrogate has been lost, and in this + // case the best option is to restart processing on the + // erroneus char to avoid losing more chars during the + // encoding. + charsProcessed++; + charCount--; + leftChar = 0; } - else - leftOver = pair; - - // Return the final length to the caller. - return length; + return EncoderStatus.Ok; } - // Get the number of bytes needed to encode a character buffer. - public override int GetByteCount (char[] chars, int index, int count) + internal unsafe static EncoderStatus InternalGetBytes ( + char* chars, int charCount, + byte* bytes, int byteCount, + EncoderFallbackBuffer fallbackBuffer, + out int charsProcessed, out int bytesProcessed, + ref uint leftChar) { - char dummy = '\0'; - return InternalGetByteCount (chars, index, count, ref dummy, true); - } + EncoderStatus s; + int t_charsProcessed, t_bytesProcessed; -#if !NET_2_0 - // Convenience wrappers for "GetByteCount". - public override int GetByteCount (String s) - { - // Validate the parameters. - if (s == null) { - throw new ArgumentNullException ("s"); - } + // Validate the parameters + if (chars == null) + throw new ArgumentNullException ("bytes"); + if (charCount < 0) + throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative")); + if (byteCount < 0) + throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative")); - unsafe { - fixed (char* cptr = s) { - char dummy = '\0'; - return InternalGetByteCount (cptr, s.Length, ref dummy, true); + // reset counters + charsProcessed = 0; + bytesProcessed = 0; + + // char processing loop + while (charCount - charsProcessed > 0) { + s = bytes != null + ? 
InternalGetByte ( + chars + charsProcessed, charCount - charsProcessed, + bytes + bytesProcessed, byteCount - bytesProcessed, + out t_charsProcessed, out t_bytesProcessed, ref leftChar) + : InternalGetByte ( + chars + charsProcessed, charCount - charsProcessed, + null, 0, + out t_charsProcessed, out t_bytesProcessed, ref leftChar); + + charsProcessed += t_charsProcessed; + bytesProcessed += t_bytesProcessed; + + switch (s) { + case EncoderStatus.Ok: + break; // everything OK :D + + case EncoderStatus.InsufficientSpace: + throw new ArgumentException ("Insufficient Space", "bytes"); + + case EncoderStatus.InputRunOut: + return EncoderStatus.InputRunOut; + + case EncoderStatus.InvalidChar: + case EncoderStatus.InvalidSurrogate: + // we've found an invalid char or surrogate + if (fallbackBuffer == null) { + // without a fallbackBuffer abort + // returning 'InvalidChar' or + // 'InvalidSurrogate' + return s; + } + if(t_charsProcessed >= 1) { + // one-char invalid UTF-16 or an + // invalid surrogate + fallbackBuffer.Fallback ( + chars [charsProcessed - 1], + charsProcessed - 1); + } else { + // we've read a two-char invalid UTF-16 + // but in this buffer we have only the + // invalid surrogate tail + fallbackBuffer.Fallback ( + (char) leftChar, + -1); + } + // if we've arrived here we are working in + // replacement mode: build a replacement + // fallback_chars buffer + char[] fallback_chars = new char [fallbackBuffer.Remaining]; + for (int i = 0; i < fallback_chars.Length; i++) + fallback_chars [i] = fallbackBuffer.GetNextChar (); + fallbackBuffer.Reset (); + // and encode it into UTF8 bytes... + fixed (char *fb_chars = fallback_chars) { + leftChar = 0; + switch (bytes != null + ? 
InternalGetBytes (fb_chars, fallback_chars.Length, + bytes + bytesProcessed, byteCount - bytesProcessed, + null, out t_charsProcessed, out t_bytesProcessed, + ref leftChar) + : InternalGetBytes (fb_chars, fallback_chars.Length, + null, 0, + null, out t_charsProcessed, out t_bytesProcessed, + ref leftChar)) { + case EncoderStatus.Ok: + // everything OK :D + bytesProcessed += t_bytesProcessed; + break; + case EncoderStatus.InsufficientSpace: + throw new ArgumentException ("Insufficient Space", "fallback buffer bytes"); + case EncoderStatus.InputRunOut: + case EncoderStatus.InvalidChar: + case EncoderStatus.InvalidSurrogate: + throw new ArgumentException ("Fallback chars are pure evil.", "fallback buffer bytes"); + } + } + // partial reset of encoder state + leftChar = 0; + break; } } + return EncoderStatus.Ok; } -#endif -#if NET_2_0 - [CLSCompliant (false)] - [ComVisible (false)] - public unsafe override int GetByteCount (char* chars, int count) + internal unsafe static EncoderStatus InternalGetBytes ( + char[] chars, int charIndex, int charCount, + byte[] bytes, int byteIndex, + EncoderFallbackBuffer fallbackBuffer, + out int charsProcessed, out int bytesProcessed, + ref uint leftChar) { if (chars == null) throw new ArgumentNullException ("chars"); - if (count == 0) - return 0; - char dummy = '\0'; - return InternalGetByteCount (chars, count, ref dummy, true); - } -#endif - - #endregion - - #region GetBytes() - - // Internal version of "GetBytes" which can handle a rolling - // state between multiple calls to this method. - private static int InternalGetBytes (char[] chars, int charIndex, - int charCount, byte[] bytes, - int byteIndex, ref char leftOver, - bool flush) - { - // Validate the parameters. 
- if (chars == null) { - throw new ArgumentNullException ("chars"); - } - if (bytes == null) { - throw new ArgumentNullException ("bytes"); - } - if (charIndex < 0 || charIndex > chars.Length) { + if (charIndex < 0 || charIndex >= chars.Length) throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array")); - } - if (charCount < 0 || charCount > (chars.Length - charIndex)) { + if (charCount < 0 || charCount > (chars.Length - charIndex)) throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array")); - } - if (byteIndex < 0 || byteIndex > bytes.Length) { + if (byteIndex < 0 || byteIndex > (bytes != null && bytes.Length > 0 ? bytes.Length - 1 : 0)) throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array")); - } - - if (charIndex == chars.Length) { - if (flush && leftOver != '\0') { -#if NET_2_0 - // FIXME: use EncoderFallback. - // - // By default it is empty, so I do nothing for now. - leftOver = '\0'; -#else - // Flush the left-over surrogate pair start. - if (byteIndex >= bytes.Length - 3) - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [byteIndex++] = 0xEF; - bytes [byteIndex++] = 0xBB; - bytes [byteIndex++] = 0xBF; - leftOver = '\0'; - return 3; -#endif - } - return 0; - } unsafe { - fixed (char* cptr = chars) { - if (bytes.Length == byteIndex) - return InternalGetBytes ( - cptr + charIndex, charCount, - null, 0, ref leftOver, flush); + fixed (char *cptr = chars) { fixed (byte *bptr = bytes) { return InternalGetBytes ( cptr + charIndex, charCount, - bptr + byteIndex, bytes.Length - byteIndex, - ref leftOver, flush); + bytes != null ? bptr + byteIndex : null, + bytes != null ? 
bytes.Length - byteIndex : 0, + fallbackBuffer, + out charsProcessed, out bytesProcessed, + ref leftChar); } } } } - private unsafe static int InternalGetBytes (char* chars, int charCount, - byte* bytes, int byteCount, - ref char leftOver, bool flush) - { - int charIndex = 0; - int byteIndex = 0; - - // Convert the characters into bytes. - // Convert the characters into bytes. - char ch; - int length = byteCount; - char pair = leftOver; - int posn = byteIndex; - int code = 0; - - while (charCount > 0) { - // Fetch the next UTF-16 character pair value. - ch = chars [charIndex]; - if (pair == '\0') { - if (ch < '\uD800' || ch >= '\uE000') { - if (ch < '\x80') { // fast path optimization - int end = charIndex + charCount; - for (; charIndex < end; posn++, charIndex++, charCount--) { - if (chars [charIndex] < '\x80') - bytes [posn] = (byte) chars [charIndex]; - else - break; - } - continue; - } - code = ch; - } - else if (ch < '\uDC00') { - // surrogate start - pair = ch; - ++charIndex; - --charCount; - continue; - } else { // ch <= '\uDFFF' - // We have a surrogate tail without leading - // surrogate. In NET_2_0 it uses fallback. - // In NET_1_1 we output wrong surrogate. - if (posn > length - 3) { - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - } - bytes [posn++] = (byte) (0xE0 | (ch >> 12)); - bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (ch & 0x3F)); - ++charIndex; - --charCount; - continue; - } - } else { - if ('\uDC00' <= ch && ch <= '\uDFFF') - code = 0x10000 + (int) ch - 0xDC00 + - (((int) pair - 0xD800) << 10); - else { - // We have a surrogate start followed by a - // regular character. Technically, this is - // invalid, but we have to do something. - // We write out the surrogate start and then - // re-visit the current character again. 
- if (posn > length - 3) { - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - } - bytes [posn++] = (byte) (0xE0 | (pair >> 12)); - bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (pair & 0x3F)); - pair = '\0'; - continue; - } - pair = '\0'; - } - ++charIndex; - --charCount; - - // Encode the character pair value. - if (code < 0x0080) { - if (posn >= length) - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [posn++] = (byte)code; - } else if (code < 0x0800) { - if ((posn + 2) > length) - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [posn++] = (byte) (0xC0 | (code >> 6)); - bytes [posn++] = (byte) (0x80 | (code & 0x3F)); - } else if (code < 0x10000) { - if (posn > length - 3) - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [posn++] = (byte) (0xE0 | (code >> 12)); - bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (code & 0x3F)); - } else { - if (posn > length - 4) - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - bytes [posn++] = (byte) (0xF0 | (code >> 18)); - bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F)); - bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (code & 0x3F)); - } - } - - if (flush) { - if (pair != '\0') { - // Flush the left-over incomplete surrogate. - if (posn > length - 3) { - throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes"); - } - bytes [posn++] = (byte) (0xE0 | (pair >> 12)); - bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F)); - bytes [posn++] = (byte) (0x80 | (pair & 0x3F)); - } - leftOver = '\0'; - } - else - leftOver = pair; -Char.IsLetterOrDigit (pair); + #region GetByteCount() - // Return the final count to the caller. - return posn - byteIndex; + // Get the number of bytes needed to encode a character buffer. 
+ public override int GetByteCount (char[] chars, int index, int count) + { + uint leftChar = 0; + int charsProcessed, bytesProcessed; + InternalGetBytes (chars, index, count, + null, 0, + EncoderFallback.CreateFallbackBuffer (), + out charsProcessed, out bytesProcessed, + ref leftChar); + return bytesProcessed; } - private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail) + + [CLSCompliant (false)] + [ComVisible (false)] + public unsafe override int GetByteCount (char* chars, int count) { - throw new NotImplementedException (); + int charsProcessed, bytesProcessed; + uint leftChar = 0; + if (chars == null) + throw new ArgumentNullException ("chars"); + if (count < 0) + throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array")); + InternalGetBytes (chars, count, + null, 0, + EncoderFallback.CreateFallbackBuffer (), + out charsProcessed, out bytesProcessed, + ref leftChar); + return bytesProcessed; } + #endregion + + #region GetBytes() + // Get the bytes that result from encoding a character buffer. public override int GetBytes (char[] chars, int charIndex, int charCount, - byte[] bytes, int byteIndex) + byte[] bytes, int byteIndex) { - char leftOver = '\0'; - return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true); + int charsProcessed, bytesProcessed; + uint leftChar = 0; + if (bytes == null) { + throw new ArgumentNullException ("bytes"); + } + + InternalGetBytes (chars, charIndex, charCount, + bytes, byteIndex, + EncoderFallback.CreateFallbackBuffer (), + out charsProcessed, out bytesProcessed, + ref leftChar); + return bytesProcessed; } // Convenience wrappers for "GetBytes". - public override int GetBytes (String s, int charIndex, int charCount, - byte[] bytes, int byteIndex) + public unsafe override int GetBytes (String s, int charIndex, int charCount, + byte[] bytes, int byteIndex) { - // Validate the parameters. 
- if (s == null) { + int charsProcessed, bytesProcessed; + uint leftChar = 0; + if (s == null) throw new ArgumentNullException ("s"); - } - if (bytes == null) { + if (bytes == null) throw new ArgumentNullException ("bytes"); - } - if (charIndex < 0 || charIndex > s.Length) { + if (charIndex < 0 || charIndex >= s.Length) throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex")); - } - if (charCount < 0 || charCount > (s.Length - charIndex)) { + if (charCount < 0 || charCount > (s.Length - charIndex)) throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange")); - } - if (byteIndex < 0 || byteIndex > bytes.Length) { + if (byteIndex < 0 || byteIndex > (bytes.Length > 0 ? bytes.Length - 1 : 0)) throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array")); - } - - if (charIndex == s.Length) - return 0; - unsafe { - fixed (char* cptr = s) { - char dummy = '\0'; - if (bytes.Length == byteIndex) - return InternalGetBytes ( - cptr + charIndex, charCount, - null, 0, ref dummy, true); + fixed (char *cptr = s) { fixed (byte *bptr = bytes) { - return InternalGetBytes ( + InternalGetBytes ( cptr + charIndex, charCount, bptr + byteIndex, bytes.Length - byteIndex, - ref dummy, true); + EncoderFallback.CreateFallbackBuffer (), + out charsProcessed, out bytesProcessed, + ref leftChar); } } } + return bytesProcessed; } -#if NET_2_0 [CLSCompliant (false)] [ComVisible (false)] public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount) { + int charsProcessed, bytesProcessed; + uint leftChar = 0; if (chars == null) throw new ArgumentNullException ("chars"); if (charCount < 0) @@ -468,534 +675,85 @@ Char.IsLetterOrDigit (pair); throw new ArgumentNullException ("bytes"); if (byteCount < 0) throw new IndexOutOfRangeException ("charCount"); - - if (charCount == 0) - return 0; - - char dummy = '\0'; - if (byteCount == 0) - return InternalGetBytes (chars, charCount, null, 0, ref dummy, true); - else - return 
InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true); + InternalGetBytes ( + chars, charCount, bytes, byteCount, + EncoderFallback.CreateFallbackBuffer (), + out charsProcessed, out bytesProcessed, + ref leftChar); + return bytesProcessed; } -#endif #endregion - // Internal version of "GetCharCount" which can handle a rolling - // state between multiple calls to this method. -#if NET_2_0 - private unsafe static int InternalGetCharCount ( - byte[] bytes, int index, int count, uint leftOverBits, - uint leftOverCount, object provider, - ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush) -#else - private unsafe static int InternalGetCharCount ( - byte[] bytes, int index, int count, uint leftOverBits, - uint leftOverCount, bool throwOnInvalid, bool flush) -#endif - { - // Validate the parameters. - if (bytes == null) { - throw new ArgumentNullException ("bytes"); - } - if (index < 0 || index > bytes.Length) { - throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array")); - } - if (count < 0 || count > (bytes.Length - index)) { - throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array")); - } - - if (count == 0) - return 0; - fixed (byte *bptr = bytes) -#if NET_2_0 - return InternalGetCharCount (bptr + index, count, - leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush); -#else - return InternalGetCharCount (bptr + index, count, - leftOverBits, leftOverCount, throwOnInvalid, flush); -#endif - } - -#if NET_2_0 - private unsafe static int InternalGetCharCount ( - byte* bytes, int count, uint leftOverBits, - uint leftOverCount, object provider, - ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush) -#else - private unsafe static int InternalGetCharCount ( - byte* bytes, int count, uint leftOverBits, - uint leftOverCount, bool throwOnInvalid, bool flush) -#endif - { - int index = 0; - - int length = 0; - - if (leftOverCount == 0) { - int end = index + count; - 
for (; index < end; index++, count--) { - if (bytes [index] < 0x80) - length++; - else - break; - } - } - - // Determine the number of characters that we have. - uint ch; - uint leftBits = leftOverBits; - uint leftSoFar = (leftOverCount & (uint)0x0F); - uint leftSize = ((leftOverCount >> 4) & (uint)0x0F); - while (count > 0) { - ch = (uint)(bytes[index++]); - --count; - if (leftSize == 0) { - // Process a UTF-8 start character. - if (ch < (uint)0x0080) { - // Single-byte UTF-8 character. - ++length; - } else if ((ch & (uint)0xE0) == (uint)0xC0) { - // Double-byte UTF-8 character. - leftBits = (ch & (uint)0x1F); - leftSoFar = 1; - leftSize = 2; - } else if ((ch & (uint)0xF0) == (uint)0xE0) { - // Three-byte UTF-8 character. - leftBits = (ch & (uint)0x0F); - leftSoFar = 1; - leftSize = 3; - } else if ((ch & (uint)0xF8) == (uint)0xF0) { - // Four-byte UTF-8 character. - leftBits = (ch & (uint)0x07); - leftSoFar = 1; - leftSize = 4; - } else if ((ch & (uint)0xFC) == (uint)0xF8) { - // Five-byte UTF-8 character. - leftBits = (ch & (uint)0x03); - leftSoFar = 1; - leftSize = 5; - } else if ((ch & (uint)0xFE) == (uint)0xFC) { - // Six-byte UTF-8 character. - leftBits = (ch & (uint)0x03); - leftSoFar = 1; - leftSize = 6; - } else { - // Invalid UTF-8 start character. -#if NET_2_0 - length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - } - } else { - // Process an extra byte in a multi-byte sequence. - if ((ch & (uint)0xC0) == (uint)0x80) { - leftBits = ((leftBits << 6) | (ch & (uint)0x3F)); - if (++leftSoFar >= leftSize) { - // We have a complete character now. - if (leftBits < (uint)0x10000) { - // is it an overlong ? 
- bool overlong = false; - switch (leftSize) { - case 2: - overlong = (leftBits <= 0x7F); - break; - case 3: - overlong = (leftBits <= 0x07FF); - break; - case 4: - overlong = (leftBits <= 0xFFFF); - break; - case 5: - overlong = (leftBits <= 0x1FFFFF); - break; - case 6: - overlong = (leftBits <= 0x03FFFFFF); - break; - } - if (overlong) { -#if NET_2_0 - length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Overlong"), leftBits.ToString ()); -#endif - } - else - ++length; - } else if (leftBits < (uint)0x110000) { - length += 2; - } else { -#if NET_2_0 - length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - } - leftSize = 0; - } - } else { - // Invalid UTF-8 sequence: clear and restart. -#if NET_2_0 - length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - leftSize = 0; - --index; - ++count; - } - } - } - if (flush && leftSize != 0) { - // We had left-over bytes that didn't make up - // a complete UTF-8 character sequence. -#if NET_2_0 - length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - } - - // Return the final length to the caller. 
- return length; - } - -#if NET_2_0 - // for GetCharCount() - static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int index) - { - if (buffer == null) { - DecoderFallback fb = provider as DecoderFallback; - if (fb != null) - buffer = fb.CreateFallbackBuffer (); - else - buffer = ((Decoder) provider).FallbackBuffer; - } - if (bufferArg == null) - bufferArg = new byte [1]; - bufferArg [0] = bytes [index]; - buffer.Fallback (bufferArg, 0); - return buffer.Remaining; - } - - // for GetChars() - static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int byteIndex, - char* chars, ref int charIndex) - { - if (buffer == null) { - DecoderFallback fb = provider as DecoderFallback; - if (fb != null) - buffer = fb.CreateFallbackBuffer (); - else - buffer = ((Decoder) provider).FallbackBuffer; - } - if (bufferArg == null) - bufferArg = new byte [1]; - bufferArg [0] = bytes [byteIndex]; - buffer.Fallback (bufferArg, 0); - while (buffer.Remaining > 0) - chars [charIndex++] = buffer.GetNextChar (); - } -#endif + #region GetCharCount() // Get the number of characters needed to decode a byte buffer. 
public override int GetCharCount (byte[] bytes, int index, int count) { -#if NET_2_0 - DecoderFallbackBuffer buf = null; - byte [] bufferArg = null; - return InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true); -#else - return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true); -#endif + int bytesProcessed, charsProcessed; + uint leftBytes = 0, leftBits = 0, procBytes = 0; + InternalGetChars ( + bytes, index, count, + null, 0, + DecoderFallback.CreateFallbackBuffer(), + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + return charsProcessed; } -#if NET_2_0 [CLSCompliant (false)] [ComVisible (false)] public unsafe override int GetCharCount (byte* bytes, int count) { - DecoderFallbackBuffer buf = null; - byte [] bufferArg = null; - return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true); + int bytesProcessed, charsProcessed; + uint leftBytes = 0, leftBits = 0, procBytes = 0; + InternalGetChars ( + bytes, count, + null, 0, + DecoderFallback.CreateFallbackBuffer(), + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + return charsProcessed; } -#endif - // Get the characters that result from decoding a byte buffer. -#if NET_2_0 - private unsafe static int InternalGetChars ( - byte[] bytes, int byteIndex, int byteCount, char[] chars, - int charIndex, ref uint leftOverBits, ref uint leftOverCount, - object provider, - ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush) -#else - private unsafe static int InternalGetChars ( - byte[] bytes, int byteIndex, int byteCount, char[] chars, - int charIndex, ref uint leftOverBits, ref uint leftOverCount, - bool throwOnInvalid, bool flush) -#endif - { - // Validate the parameters. 
- if (bytes == null) { - throw new ArgumentNullException ("bytes"); - } - if (chars == null) { - throw new ArgumentNullException ("chars"); - } - if (byteIndex < 0 || byteIndex > bytes.Length) { - throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array")); - } - if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) { - throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array")); - } - if (charIndex < 0 || charIndex > chars.Length) { - throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array")); - } - - if (charIndex == chars.Length) - return 0; - - fixed (char* cptr = chars) { -#if NET_2_0 - if (byteCount == 0 || byteIndex == bytes.Length) - return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush); - // otherwise... - fixed (byte* bptr = bytes) - return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush); -#else - if (byteCount == 0 || byteIndex == bytes.Length) - return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush); - // otherwise... 
- fixed (byte* bptr = bytes) - return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush); -#endif - } - } - -#if NET_2_0 - private unsafe static int InternalGetChars ( - byte* bytes, int byteCount, char* chars, int charCount, - ref uint leftOverBits, ref uint leftOverCount, - object provider, - ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush) -#else - private unsafe static int InternalGetChars ( - byte* bytes, int byteCount, char* chars, int charCount, - ref uint leftOverBits, ref uint leftOverCount, - bool throwOnInvalid, bool flush) -#endif - { - int charIndex = 0, byteIndex = 0; - int length = charCount; - int posn = charIndex; - - if (leftOverCount == 0) { - int end = byteIndex + byteCount; - for (; byteIndex < end; posn++, byteIndex++, byteCount--) { - if (bytes [byteIndex] < 0x80) - chars [posn] = (char) bytes [byteIndex]; - else - break; - } - } - - // Convert the bytes into the output buffer. - uint ch; - uint leftBits = leftOverBits; - uint leftSoFar = (leftOverCount & (uint)0x0F); - uint leftSize = ((leftOverCount >> 4) & (uint)0x0F); - - int byteEnd = byteIndex + byteCount; - for(; byteIndex < byteEnd; byteIndex++) { - // Fetch the next character from the byte buffer. - ch = (uint)(bytes[byteIndex]); - if (leftSize == 0) { - // Process a UTF-8 start character. - if (ch < (uint)0x0080) { - // Single-byte UTF-8 character. - if (posn >= length) { - throw new ArgumentException (_("Arg_InsufficientSpace"), "chars"); - } - chars[posn++] = (char)ch; - } else if ((ch & (uint)0xE0) == (uint)0xC0) { - // Double-byte UTF-8 character. - leftBits = (ch & (uint)0x1F); - leftSoFar = 1; - leftSize = 2; - } else if ((ch & (uint)0xF0) == (uint)0xE0) { - // Three-byte UTF-8 character. - leftBits = (ch & (uint)0x0F); - leftSoFar = 1; - leftSize = 3; - } else if ((ch & (uint)0xF8) == (uint)0xF0) { - // Four-byte UTF-8 character. 
- leftBits = (ch & (uint)0x07); - leftSoFar = 1; - leftSize = 4; - } else if ((ch & (uint)0xFC) == (uint)0xF8) { - // Five-byte UTF-8 character. - leftBits = (ch & (uint)0x03); - leftSoFar = 1; - leftSize = 5; - } else if ((ch & (uint)0xFE) == (uint)0xFC) { - // Six-byte UTF-8 character. - leftBits = (ch & (uint)0x03); - leftSoFar = 1; - leftSize = 6; - } else { - // Invalid UTF-8 start character. -#if NET_2_0 - Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - } - } else { - // Process an extra byte in a multi-byte sequence. - if ((ch & (uint)0xC0) == (uint)0x80) { - leftBits = ((leftBits << 6) | (ch & (uint)0x3F)); - if (++leftSoFar >= leftSize) { - // We have a complete character now. - if (leftBits < (uint)0x10000) { - // is it an overlong ? - bool overlong = false; - switch (leftSize) { - case 2: - overlong = (leftBits <= 0x7F); - break; - case 3: - overlong = (leftBits <= 0x07FF); - break; - case 4: - overlong = (leftBits <= 0xFFFF); - break; - case 5: - overlong = (leftBits <= 0x1FFFFF); - break; - case 6: - overlong = (leftBits <= 0x03FFFFFF); - break; - } - if (overlong) { -#if NET_2_0 - Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Overlong"), leftBits.ToString ()); -#endif - } - else if ((leftBits & 0xF800) == 0xD800) { - // UTF-8 doesn't use surrogate characters -#if NET_2_0 - Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - } - else { - if (posn >= length) { - throw new ArgumentException - (_("Arg_InsufficientSpace"), "chars"); - } - chars[posn++] = (char)leftBits; - } - } else if (leftBits < (uint)0x110000) { - if ((posn + 2) > length) { - throw new 
ArgumentException - (_("Arg_InsufficientSpace"), "chars"); - } - leftBits -= (uint)0x10000; - chars[posn++] = (char)((leftBits >> 10) + - (uint)0xD800); - chars[posn++] = - (char)((leftBits & (uint)0x3FF) + (uint)0xDC00); - } else { -#if NET_2_0 - Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - } - leftSize = 0; - } - } else { - // Invalid UTF-8 sequence: clear and restart. -#if NET_2_0 - Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - leftSize = 0; - --byteIndex; - } - } - } - if (flush && leftSize != 0) { - // We had left-over bytes that didn't make up - // a complete UTF-8 character sequence. -#if NET_2_0 - Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn); -#else - if (throwOnInvalid) - throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes"); -#endif - } - leftOverBits = leftBits; - leftOverCount = (leftSoFar | (leftSize << 4)); - - // Return the final length to the caller. - return posn - charIndex; - } + #endregion // Get the characters that result from decoding a byte buffer. 
public override int GetChars (byte[] bytes, int byteIndex, int byteCount, - char[] chars, int charIndex) + char[] chars, int charIndex) { - uint leftOverBits = 0; - uint leftOverCount = 0; -#if NET_2_0 - DecoderFallbackBuffer buf = null; - byte [] bufferArg = null; - return InternalGetChars (bytes, byteIndex, byteCount, chars, - charIndex, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, ref bufferArg, true); -#else - return InternalGetChars (bytes, byteIndex, byteCount, chars, - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true); -#endif + int bytesProcessed, charsProcessed; + uint leftBytes = 0, leftBits = 0, procBytes = 0; + InternalGetChars ( + bytes, byteIndex, byteCount, + chars, charIndex, + DecoderFallback.CreateFallbackBuffer(), + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + return charsProcessed; } -#if NET_2_0 [CLSCompliant (false)] [ComVisible (false)] public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount) { - DecoderFallbackBuffer buf = null; - byte [] bufferArg = null; - uint leftOverBits = 0; - uint leftOverCount = 0; - return InternalGetChars (bytes, byteCount, chars, - charCount, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, ref bufferArg, true); + int bytesProcessed, charsProcessed; + uint leftBytes = 0, leftBits = 0, procBytes = 0; + InternalGetChars ( + bytes, byteCount, + chars, charCount, + DecoderFallback.CreateFallbackBuffer(), + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + return charsProcessed; } -#endif // Get the maximum number of bytes needed to encode a // specified number of characters. 
public override int GetMaxByteCount (int charCount) { - if (charCount < 0) { + if (charCount < 0) throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative")); - } return charCount * 4; } @@ -1003,40 +761,32 @@ Char.IsLetterOrDigit (pair); // specified number of bytes. public override int GetMaxCharCount (int byteCount) { - if (byteCount < 0) { + if (byteCount < 0) throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative")); - } return byteCount; } // Get a UTF8-specific decoder that is attached to this instance. public override Decoder GetDecoder () { -#if NET_2_0 return new UTF8Decoder (DecoderFallback); -#else - return new UTF8Decoder (throwOnInvalid); -#endif } // Get a UTF8-specific encoder that is attached to this instance. public override Encoder GetEncoder () { - return new UTF8Encoder (emitIdentifier); + return new UTF8Encoder (EncoderFallback, emitIdentifier); } // Get the UTF8 preamble. + // XXX: why does this method return a preamble or void array depending + // on 'emitIdentifier' attribute? public override byte[] GetPreamble () { - if (emitIdentifier) { - byte[] pre = new byte [3]; - pre[0] = (byte)0xEF; - pre[1] = (byte)0xBB; - pre[2] = (byte)0xBF; - return pre; - } else { - return new byte [0]; - } + if (emitIdentifier) + return new byte [] { 0xEF, 0xBB, 0xBF }; + + return EmptyArray.Value; } // Determine if this object is equal to another. 
@@ -1044,16 +794,10 @@ Char.IsLetterOrDigit (pair); { UTF8Encoding enc = (value as UTF8Encoding); if (enc != null) { -#if NET_2_0 - return (codePage == enc.codePage && - emitIdentifier == enc.emitIdentifier && - DecoderFallback == enc.DecoderFallback && - EncoderFallback == enc.EncoderFallback); -#else return (codePage == enc.codePage && - emitIdentifier == enc.emitIdentifier && - throwOnInvalid == enc.throwOnInvalid); -#endif + emitIdentifier == enc.emitIdentifier && + DecoderFallback.Equals (enc.DecoderFallback) && + EncoderFallback.Equals (enc.EncoderFallback)); } else { return false; } @@ -1065,89 +809,146 @@ Char.IsLetterOrDigit (pair); return base.GetHashCode (); } -#if NET_2_0 - [MonoTODO] - public override int GetByteCount (string s) + public override int GetByteCount (string chars) { // hmm, does this override make any sense? - return base.GetByteCount (s); + return base.GetByteCount (chars); } - [MonoTODO] [ComVisible (false)] public override string GetString (byte [] bytes, int index, int count) { // hmm, does this override make any sense? return base.GetString (bytes, index, count); } -#endif - -#if !NET_2_0 - public override byte [] GetBytes (String s) - { - if (s == null) - throw new ArgumentNullException ("s"); - - int length = GetByteCount (s); - byte [] bytes = new byte [length]; - GetBytes (s, 0, s.Length, bytes, 0); - return bytes; - } -#endif // UTF-8 decoder implementation. [Serializable] private class UTF8Decoder : Decoder { -#if !NET_2_0 - private bool throwOnInvalid; -#endif - private uint leftOverBits; - private uint leftOverCount; + // internal decoder state + private uint leftBytes; + private uint leftBits; + private uint procBytes; // Constructor. 
-#if NET_2_0 public UTF8Decoder (DecoderFallback fallback) -#else - public UTF8Decoder (bool throwOnInvalid) -#endif { -#if NET_2_0 Fallback = fallback; -#else - this.throwOnInvalid = throwOnInvalid; -#endif - leftOverBits = 0; - leftOverCount = 0; + leftBytes = 0; + leftBits = 0; + procBytes = 0; } // Override inherited methods. public override int GetCharCount (byte[] bytes, int index, int count) { -#if NET_2_0 - DecoderFallbackBuffer buf = null; - byte [] bufferArg = null; - return InternalGetCharCount (bytes, index, count, - leftOverBits, leftOverCount, this, ref buf, ref bufferArg, false); -#else - return InternalGetCharCount (bytes, index, count, - leftOverBits, leftOverCount, throwOnInvalid, false); -#endif + int bytesProcessed, charsProcessed; + InternalGetChars ( + bytes, index, count, + null, 0, + this.FallbackBuffer, + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + return charsProcessed; + } + + [ComVisibleAttribute(false)] + public override int GetCharCount (byte[] bytes, int index, int count, bool flush) + { + int r = GetCharCount (bytes, index, count); + if (flush) + leftBytes = leftBits = procBytes = 0; + return r; } + + [ComVisibleAttribute(false)] + public unsafe override int GetCharCount (byte* bytes, int count, bool flush) + { + int bytesProcessed, charsProcessed; + InternalGetChars ( + bytes, count, + null, 0, + this.FallbackBuffer, + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + if (flush) + leftBytes = leftBits = procBytes = 0; + return charsProcessed; + } + + [ComVisibleAttribute(false)] + public unsafe override int GetChars (byte* bytes, int byteCount, + char* chars, int charCount, bool flush) + { + int bytesProcessed, charsProcessed; + InternalGetChars ( + bytes, byteCount, + chars, charCount, + this.FallbackBuffer, + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + if (flush) + leftBytes = leftBits = procBytes = 0; + 
return charsProcessed; + } + public override int GetChars (byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex) { -#if NET_2_0 - DecoderFallbackBuffer buf = null; - byte [] bufferArg = null; - return InternalGetChars (bytes, byteIndex, byteCount, - chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref buf, ref bufferArg, false); -#else - return InternalGetChars (bytes, byteIndex, byteCount, - chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false); -#endif + int bytesProcessed, charsProcessed; + InternalGetChars ( + bytes, byteIndex, byteCount, + chars, charIndex, + this.FallbackBuffer, + out bytesProcessed, out charsProcessed, + ref leftBytes, ref leftBits, ref procBytes); + return charsProcessed; + } + + public override int GetChars (byte[] bytes, int byteIndex, + int byteCount, char[] chars, int charIndex, bool flush) + { + int r = GetChars (bytes, byteIndex, byteCount, chars, charIndex); + if (flush) + leftBytes = leftBits = procBytes = 0; + return r; } + public override void Reset () + { + base.Reset(); + leftBytes = 0; + leftBits = 0; + procBytes = 0; + } + + public unsafe override void Convert ( + byte* bytes, int byteCount, + char* chars, int charCount, bool flush, + out int bytesUsed, out int charsUsed, out bool completed) + { + if (chars == null) + throw new ArgumentNullException ("chars"); + if (charCount < 0) + throw new IndexOutOfRangeException ("charCount"); + if (bytes == null) + throw new ArgumentNullException ("bytes"); + if (byteCount < 0) + throw new IndexOutOfRangeException ("charCount"); + UTF8Encoding.InternalGetChars ( + bytes, byteCount, + chars, charCount, + this.FallbackBuffer, + out bytesUsed, out charsUsed, + ref leftBytes, ref leftBits, ref procBytes); + // only completed if all bytes have been processed and + // successfully converted to chars!! 
+ completed = (byteCount == bytesUsed); + // flush state + if (flush) + leftBytes = leftBits = procBytes = 0; + } } // class UTF8Decoder // UTF-8 encoder implementation. @@ -1155,48 +956,149 @@ Char.IsLetterOrDigit (pair); private class UTF8Encoder : Encoder { private bool emitIdentifier; - private char leftOverForCount; - private char leftOverForConv; + + // internal encoder state + private uint leftChar; + private bool emittedIdentifier; // Constructor. - public UTF8Encoder (bool emitIdentifier) + public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier) { + this.Fallback = fallback; + this.leftChar = 0; this.emitIdentifier = emitIdentifier; - leftOverForCount = '\0'; - leftOverForConv = '\0'; + this.emittedIdentifier = false; } // Override inherited methods. + [ComVisibleAttribute(false)] + public unsafe override int GetByteCount (char* chars, int count, bool flush) + { + int charsProcessed, bytesProcessed, preambleSize = 0; + if (emitIdentifier && !emittedIdentifier) { + preambleSize = 3; + emittedIdentifier = true; + } + InternalGetBytes (chars, count, + null, 0, + this.FallbackBuffer, + out charsProcessed, out bytesProcessed, + ref leftChar); + if (flush) + leftChar = 0; + return bytesProcessed + preambleSize; + } + public override int GetByteCount (char[] chars, int index, int count, bool flush) { - return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush); + int charsProcessed, bytesProcessed, preambleSize = 0; + if (emitIdentifier && !emittedIdentifier) { + preambleSize = 3; + emittedIdentifier = true; + } + InternalGetBytes (chars, index, count, + null, 0, + this.FallbackBuffer, + out charsProcessed, out bytesProcessed, + ref leftChar); + if (flush) + leftChar = 0; + return bytesProcessed + preambleSize; } + + [ComVisibleAttribute(false)] + public unsafe override int GetBytes (char* chars, int charCount, + byte* bytes, int byteCount, bool flush) + { + int charsProcessed, bytesProcessed, preambleSize = 0; + if (emitIdentifier 
&& !emittedIdentifier) { + if (byteCount < 3) + throw new ArgumentException ("Insufficient Space", "UTF8 preamble"); + *bytes++ = 0xEF; + *bytes++ = 0xBB; + *bytes++ = 0xBF; + preambleSize = 3; + emittedIdentifier = true; + byteCount -= 3; + } + InternalGetBytes (chars, charCount, + bytes, byteCount, + this.FallbackBuffer, + out charsProcessed, out bytesProcessed, + ref leftChar); + if (flush) + leftChar = 0; + return bytesProcessed + preambleSize; + } + public override int GetBytes (char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush) { - int result; - result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush); - emitIdentifier = false; - return result; + int charsProcessed, bytesProcessed, preambleSize = 0; + if (emitIdentifier && !emittedIdentifier) { + if (bytes.Length - byteIndex < 3) + throw new ArgumentException ("Insufficient Space", "UTF8 preamble"); + bytes[byteIndex++] = 0xEF; + bytes[byteIndex++] = 0xBB; + bytes[byteIndex++] = 0xBF; + preambleSize = 3; + emittedIdentifier = true; + } + InternalGetBytes (chars, charIndex, charCount, + bytes, byteIndex, + this.FallbackBuffer, + out charsProcessed, out bytesProcessed, + ref leftChar); + if (flush) + leftChar = 0; + return bytesProcessed + preambleSize; } -#if NET_2_0 - public unsafe override int GetByteCount (char* chars, int count, bool flush) + public override void Reset () { - return InternalGetByteCount (chars, count, ref leftOverForCount, flush); + base.Reset(); + this.leftChar = 0; + this.emittedIdentifier = false; } - public unsafe override int GetBytes (char* chars, int charCount, - byte* bytes, int byteCount, bool flush) + public unsafe override void Convert ( + char* chars, int charCount, + byte* bytes, int byteCount, bool flush, + out int charsUsed, out int bytesUsed, out bool completed) { - int result; - result = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush); - emitIdentifier = false; 
- return result; + int preambleSize = 0; + if (bytes == null) + throw new ArgumentNullException ("bytes"); + if (byteCount < 0) + throw new IndexOutOfRangeException ("charCount"); + if (chars == null) + throw new ArgumentNullException ("chars"); + if (charCount < 0) + throw new IndexOutOfRangeException ("charCount"); + if (emitIdentifier && !emittedIdentifier) { + if (byteCount < 3) + throw new ArgumentException ("Insufficient Space", "UTF8 preamble"); + *bytes++ = 0xEF; + *bytes++ = 0xBB; + *bytes++ = 0xBF; + preambleSize = 3; + emittedIdentifier = true; + byteCount -= 3; + } + InternalGetBytes ( + chars, charCount, + bytes, byteCount, + this.FallbackBuffer, + out charsUsed, out bytesUsed, + ref leftChar); + // only completed if all chars have been processed and + // successfully converted to bytes!! + completed = (charCount == charsUsed); + bytesUsed += preambleSize; + if (flush) + leftChar = 0; } -#endif - } // class UTF8Encoder }; // class UTF8Encoding