2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
30 using System.Runtime.InteropServices;
// Encoding subclass for UTF-8 (the file header names it as the
// implementation of "System.Text.UTF8Encoding").
33 [MonoTODO ("Serialization format not compatible with .NET")]
35 [MonoTODO ("EncoderFallback is not handled")]
38 public class UTF8Encoding : Encoding
40 // Magic number used by Windows for UTF-8.
41 internal const int UTF8_CODE_PAGE = 65001;
// Copied from the encoderShouldEmitUTF8Identifier constructor argument;
// read by GetPreamble, GetEncoder and Equals.
44 private bool emitIdentifier;
// Copied from the throwOnInvalidBytes constructor argument; handed to the
// decoding helpers and to UTF8Decoder.
46 private bool throwOnInvalid;
// Parameterless form: both options are passed as false.
50 public UTF8Encoding () : this (false, false) {}
// One-argument form: the caller picks the identifier option,
// throwOnInvalidBytes is passed as false.
51 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
52 : this (encoderShouldEmitUTF8Identifier, false) {}
// Main constructor; the two shorter constructors delegate to it.
//   encoderShouldEmitUTF8Identifier - stored in emitIdentifier.
//   throwOnInvalidBytes             - selects the decoder fallback and is
//                                     stored in throwOnInvalid.
54 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
55 : base (UTF8_CODE_PAGE)
57 emitIdentifier = encoderShouldEmitUTF8Identifier;
// Invalid input must throw: install the exception-raising decoder fallback.
59 if (throwOnInvalidBytes)
60 SetFallbackInternal (null, new DecoderExceptionFallback ());
// Replacement fallback that substitutes U+FFFD; presumably the
// alternative to the branch above - TODO confirm.
62 SetFallbackInternal (null, new DecoderReplacementFallback ("\uFFFD"));
64 throwOnInvalid = throwOnInvalidBytes;
// Descriptive names and capability flags for this encoding.
67 web_name = body_name = header_name = "utf-8";
68 encoding_name = "Unicode (UTF-8)";
69 is_browser_save = true;
70 is_browser_display = true;
71 is_mail_news_display = true;
72 is_mail_news_save = true;
// NOTE(review): uses the UnicodeEncoding code page constant rather than
// UTF8_CODE_PAGE - looks deliberate, confirm.
73 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
76 #region GetByteCount()
78 // Internal version of "GetByteCount" which can handle a rolling
79 // state between multiple calls to this method.
// Array entry point: checks the arguments, then pins "chars" and forwards
// to the pointer overload starting at chars[index].
//   leftOver - pending surrogate start carried between calls ('\0' = none).
//   flush    - whether a pending surrogate start must be accounted for when
//              there is no more input.
// Throws ArgumentNullException for a null array and
// ArgumentOutOfRangeException for an index/count outside the array.
80 private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
82 // Validate the parameters.
84 throw new ArgumentNullException ("chars");
86 if (index < 0 || index > chars.Length) {
87 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
89 if (count < 0 || count > (chars.Length - index)) {
90 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
// Nothing to scan when index sits at the end of the array; only a pending
// surrogate start can still contribute.
93 if (index == chars.Length) {
94 if (flush && leftOver != '\0') {
95 // Flush the left-over surrogate pair start.
// Pin the array so the pointer overload can walk it.
103 fixed (char* cptr = chars) {
104 return InternalGetByteCount (cptr + index, count, ref leftOver, flush);
// Pointer-based worker: walks [chars, chars + count) and classifies every
// UTF-16 unit by range - below 0x80, below 0x800, any other non-surrogate,
// surrogate start (up to 0xDBFF), or surrogate tail.
// NOTE(review): the per-class byte counts should mirror what
// InternalGetBytes writes for the same ranges - confirm.
109 private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
112 char* end = chars + count;
113 while (chars < end) {
115 for (; chars < end; chars++) {
116 if (*chars < '\x80') {
118 } else if (*chars < '\x800') {
120 } else if (*chars < '\uD800' || *chars > '\uDFFF') {
122 } else if (*chars <= '\uDBFF') {
123 // This is a surrogate start char, exit the inner loop only
124 // if we don't find the complete surrogate pair.
// The pair is complete when the very next unit is still inside the buffer
// and lies in the surrogate-tail range 0xDC00..0xDFFF.
125 if (chars + 1 < end && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
134 // We have a surrogate tail without
135 // leading surrogate. In NET_2_0 it
136 // uses fallback. In NET_1_1 we output
// A surrogate start is pending: does the current unit complete the pair?
143 if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
144 // We have a correct surrogate pair.
148 // We have a surrogate start followed by a
149 // regular character. Technically, this is
150 // invalid, but we have to do something.
151 // We write out the surrogate start and then
152 // re-visit the current character again.
159 // Flush the left-over surrogate pair start.
160 if (leftOver != '\0') {
168 // Get the number of bytes needed to encode a character buffer.
// One-shot form: forwards to InternalGetByteCount with flush = true and a
// throw-away leftOver variable.
169 public override int GetByteCount (char[] chars, int index, int count)
172 return InternalGetByteCount (chars, index, count, ref dummy, true);
176 // Convenience wrappers for "GetByteCount".
// String form: pins the string and counts all chars.Length characters with
// flush = true. Throws ArgumentNullException ("chars").
177 public override int GetByteCount (String chars)
179 // Validate the parameters.
181 throw new ArgumentNullException ("chars");
185 fixed (char* cptr = chars) {
187 return InternalGetByteCount (cptr, chars.Length, ref dummy, true);
// Pointer form: forwards to the pointer worker with flush = true.
// Throws ArgumentNullException ("chars").
194 [CLSCompliant (false)]
196 public unsafe override int GetByteCount (char* chars, int count)
199 throw new ArgumentNullException ("chars");
203 return InternalGetByteCount (chars, count, ref dummy, true);
211 // Internal version of "GetBytes" which can handle a rolling
212 // state between multiple calls to this method.
// Array entry point: checks the arguments, then pins both arrays and
// forwards to the pointer overload.
//   chars/charIndex/charCount - input range to encode.
//   bytes/byteIndex           - output array and first slot to write.
//   leftOver                  - pending surrogate start carried between
//                               calls ('\0' = none).
//   flush                     - whether a pending surrogate start must be
//                               written out when the input is exhausted.
// Returns the value produced by the pointer overload.
// Throws ArgumentNullException for a null array, ArgumentOutOfRangeException
// for an index/count outside its array, and ArgumentException when the
// output array is too small.
213 private static int InternalGetBytes (char[] chars, int charIndex,
214 int charCount, byte[] bytes,
215 int byteIndex, ref char leftOver,
218 // Validate the parameters.
220 throw new ArgumentNullException ("chars");
223 throw new ArgumentNullException ("bytes");
225 if (charIndex < 0 || charIndex > chars.Length) {
226 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
228 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
229 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
231 if (byteIndex < 0 || byteIndex > bytes.Length) {
232 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// No input left: only a pending surrogate start can still produce output.
235 if (charIndex == chars.Length) {
236 if (flush && leftOver != '\0') {
238 // FIXME: use EncoderFallback.
240 // By default it is empty, so I do nothing for now.
243 // Flush the left-over surrogate pair start.
// Exactly three bytes are written below, so the buffer is too small only
// when fewer than three slots remain; the earlier ">=" test also rejected
// a buffer with exactly three free slots.
244 if (byteIndex > bytes.Length - 3)
245 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
// NOTE(review): EF BB BF is the UTF-8 form of U+FEFF, not of a surrogate
// start; EF BF BD (U+FFFD) may have been intended - confirm.
246 bytes [byteIndex++] = 0xEF;
247 bytes [byteIndex++] = 0xBB;
248 bytes [byteIndex++] = 0xBF;
257 fixed (char* cptr = chars) {
// With no room left in "bytes" there is nothing to pin; pass a null output
// pointer with zero capacity instead.
258 if (bytes.Length == byteIndex)
259 return InternalGetBytes (
260 cptr + charIndex, charCount,
261 null, 0, ref leftOver, flush);
262 fixed (byte *bptr = bytes) {
263 return InternalGetBytes (
264 cptr + charIndex, charCount,
265 bptr + byteIndex, bytes.Length - byteIndex,
266 ref leftOver, flush);
// Pointer-based worker: encodes [chars, chars + count) into
// [bytes, bytes + bcount) and returns the number of bytes written.
// Every write group is preceded by a capacity test against end_bytes.
272 private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, ref char leftOver, bool flush)
274 char* end = chars + count;
275 byte* end_bytes = bytes + bcount;
276 while (chars < end) {
278 for (; chars < end; chars++) {
281 if (bytes >= end_bytes)
284 } else if (ch < '\x800') {
// Two-byte form: 110xxxxx 10xxxxxx.
285 if (bytes + 1 >= end_bytes)
287 bytes [0] = (byte) (0xC0 | (ch >> 6));
288 bytes [1] = (byte) (0x80 | (ch & 0x3F));
290 } else if (ch < '\uD800' || ch > '\uDFFF') {
// Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
291 if (bytes + 2 >= end_bytes)
293 bytes [0] = (byte) (0xE0 | (ch >> 12));
294 bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
295 bytes [2] = (byte) (0x80 | (ch & 0x3F));
297 } else if (ch <= '\uDBFF') {
298 // This is a surrogate char, exit the inner loop.
303 // We have a surrogate tail without
304 // leading surrogate. In NET_2_0 it
305 // uses fallback. In NET_1_1 we output
307 if (bytes + 2 >= end_bytes)
309 bytes [0] = (byte) (0xE0 | (ch >> 12));
310 bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
311 bytes [2] = (byte) (0x80 | (ch & 0x3F));
// A surrogate start is pending in leftOver: does the current unit finish
// the pair?
317 if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
318 // We have a correct surrogate pair.
// Combine the two halves into one code point at or above 0x10000.
319 int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
// Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
320 if (bytes + 3 >= end_bytes)
322 bytes [0] = (byte) (0xF0 | (ch >> 18));
323 bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
324 bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
325 bytes [3] = (byte) (0x80 | (ch & 0x3F));
329 // We have a surrogate start followed by a
330 // regular character. Technically, this is
331 // invalid, but we have to do something.
332 // We write out the surrogate start and then
333 // re-visit the current character again.
335 if (bytes + 2 >= end_bytes)
337 bytes [0] = (byte) (0xE0 | (ch >> 12));
338 bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
339 bytes [2] = (byte) (0x80 | (ch & 0x3F));
346 // Flush the left-over surrogate pair start.
347 if (leftOver != '\0') {
349 if (bytes + 2 < end_bytes) {
350 bytes [0] = (byte) (0xE0 | (ch >> 12));
351 bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
352 bytes [2] = (byte) (0x80 | (ch & 0x3F));
// (end_bytes - bcount) is the original start of the output buffer, so the
// difference is the number of bytes produced.
360 return (int)(bytes - (end_bytes - bcount));
// Presumably the shared target of the failed capacity tests above - confirm.
362 throw new ArgumentException ("Insufficient Space", "bytes");
365 // Get the bytes that result from encoding a character buffer.
// One-shot form: starts with no pending surrogate and forwards to
// InternalGetBytes with flush = true.
366 public override int GetBytes (char[] chars, int charIndex, int charCount,
367 byte[] bytes, int byteIndex)
369 char leftOver = '\0';
370 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
373 // Convenience wrappers for "GetBytes".
// String form: validates the arguments, pins the string (and the byte array
// when it still has room) and forwards to the pointer worker with
// flush = true.
// Throws ArgumentNullException for null arguments and
// ArgumentOutOfRangeException for an index/count outside its buffer.
374 public override int GetBytes (String s, int charIndex, int charCount,
375 byte[] bytes, int byteIndex)
377 // Validate the parameters.
379 throw new ArgumentNullException ("s");
382 throw new ArgumentNullException ("bytes");
384 if (charIndex < 0 || charIndex > s.Length) {
385 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
387 if (charCount < 0 || charCount > (s.Length - charIndex)) {
388 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
390 if (byteIndex < 0 || byteIndex > bytes.Length) {
391 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// Start index at the end of the string: there is no input to encode.
394 if (charIndex == s.Length)
398 fixed (char* cptr = s) {
// No room left in "bytes": pass a null output pointer with zero capacity.
400 if (bytes.Length == byteIndex)
401 return InternalGetBytes (
402 cptr + charIndex, charCount,
403 null, 0, ref dummy, true);
404 fixed (byte *bptr = bytes) {
405 return InternalGetBytes (
406 cptr + charIndex, charCount,
407 bptr + byteIndex, bytes.Length - byteIndex,
// Pointer form of GetBytes: validates the four arguments and forwards to
// the pointer worker with flush = true.
// Throws ArgumentNullException for a null pointer and
// IndexOutOfRangeException for a rejected count.
415 [CLSCompliant (false)]
417 public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
420 throw new ArgumentNullException ("chars");
422 throw new IndexOutOfRangeException ("charCount");
424 throw new ArgumentNullException ("bytes");
// This is the count check that follows the "bytes" check, so it must name
// byteCount; it previously repeated "charCount".
426 throw new IndexOutOfRangeException ("byteCount");
// Null output pointer with zero capacity.
433 return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
435 return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
441 // Internal version of "GetCharCount" which can handle a rolling
442 // state between multiple calls to this method.
// Array entry point: validates the arguments, pins "bytes" and forwards to
// the pointer overload starting at bytes[index].
// Two alternative signatures follow (fallback-based and
// throwOnInvalid-based); presumably selected by conditional compilation -
// TODO confirm.
444 private unsafe static int InternalGetCharCount (
445 byte[] bytes, int index, int count, uint leftOverBits,
446 uint leftOverCount, object provider,
447 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
449 private unsafe static int InternalGetCharCount (
450 byte[] bytes, int index, int count, uint leftOverBits,
451 uint leftOverCount, bool throwOnInvalid, bool flush)
454 // Validate the parameters.
456 throw new ArgumentNullException ("bytes");
458 if (index < 0 || index > bytes.Length) {
459 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
461 if (count < 0 || count > (bytes.Length - index)) {
462 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
467 fixed (byte *bptr = bytes)
// Fallback-based variant.
469 return InternalGetCharCount (bptr + index, count,
470 leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
// throwOnInvalid-based variant.
472 return InternalGetCharCount (bptr + index, count,
473 leftOverBits, leftOverCount, throwOnInvalid, flush);
// Pointer-based worker that counts the characters a byte range decodes to.
// leftOverBits/leftOverCount are the decoder state from an earlier call and
// are taken by value here. Two alternative signatures follow; presumably
// selected by conditional compilation - TODO confirm.
478 private unsafe static int InternalGetCharCount (
479 byte* bytes, int count, uint leftOverBits,
480 uint leftOverCount, object provider,
481 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
483 private unsafe static int InternalGetCharCount (
484 byte* bytes, int count, uint leftOverBits,
485 uint leftOverCount, bool throwOnInvalid, bool flush)
// Fast path when no partial sequence is pending: scan the leading run of
// bytes below 0x80.
492 if (leftOverCount == 0) {
493 int end = index + count;
494 for (; index < end; index++, count--) {
495 if (bytes [index] < 0x80)
502 // Determine the number of characters that we have.
// leftOverCount packs two 4-bit fields: the low nibble is the number of
// bytes consumed so far, the next nibble is the sequence size.
504 uint leftBits = leftOverBits;
505 uint leftSoFar = (leftOverCount & (uint)0x0F);
506 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
508 ch = (uint)(bytes[index++]);
511 // Process a UTF-8 start character.
512 if (ch < (uint)0x0080) {
513 // Single-byte UTF-8 character.
515 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
516 // Double-byte UTF-8 character.
517 leftBits = (ch & (uint)0x1F);
520 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
521 // Three-byte UTF-8 character.
522 leftBits = (ch & (uint)0x0F);
525 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
526 // Four-byte UTF-8 character.
527 leftBits = (ch & (uint)0x07);
530 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
531 // Five-byte UTF-8 character.
532 leftBits = (ch & (uint)0x03);
535 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
536 // Six-byte UTF-8 character.
537 leftBits = (ch & (uint)0x03);
541 // Invalid UTF-8 start character.
// index was already advanced past the offending byte, hence index - 1.
543 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
546 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
550 // Process an extra byte in a multi-byte sequence.
// Continuation bytes look like 10xxxxxx; their low six bits are appended.
551 if ((ch & (uint)0xC0) == (uint)0x80) {
552 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
553 if (++leftSoFar >= leftSize) {
554 // We have a complete character now.
555 if (leftBits < (uint)0x10000) {
556 // is it an overlong ?
557 bool overlong = false;
560 overlong = (leftBits <= 0x7F);
563 overlong = (leftBits <= 0x07FF);
566 overlong = (leftBits <= 0xFFFF);
569 overlong = (leftBits <= 0x1FFFFF);
572 overlong = (leftBits <= 0x03FFFFFF);
577 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
580 throw new ArgumentException (_("Overlong"), leftBits.ToString ());
585 } else if (leftBits < (uint)0x110000) {
589 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
592 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
598 // Invalid UTF-8 sequence: clear and restart.
600 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
603 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
611 if (flush && leftSize != 0) {
612 // We had left-over bytes that didn't make up
613 // a complete UTF-8 character sequence.
615 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
618 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
622 // Return the final length to the caller.
627 // for GetCharCount()
// Feeds "size" bytes starting at bytes[index] to the decoder fallback, one
// byte at a time, and accumulates buffer.Remaining after each byte.
//   provider  - either a DecoderFallback or a Decoder (see below).
//   buffer    - fallback buffer, created/fetched on first use and kept in
//               the caller's variable through the ref parameter.
//   bufferArg - one-byte scratch array, allocated on first use.
628 static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
630 if (buffer == null) {
631 DecoderFallback fb = provider as DecoderFallback;
// provider is a DecoderFallback: ask it for a fresh buffer.
633 buffer = fb.CreateFallbackBuffer ();
// Otherwise provider is cast to Decoder and its own buffer is used.
635 buffer = ((Decoder) provider).FallbackBuffer;
637 if (bufferArg == null)
638 bufferArg = new byte [1];
640 for (int i = 0; i < size; i++) {
641 bufferArg [0] = bytes [(int) index + i];
642 buffer.Fallback (bufferArg, 0);
643 ret += buffer.Remaining;
// Writing counterpart of the counting Fallback: feeds "size" bytes starting
// at bytes[byteIndex] to the decoder fallback one at a time and copies every
// character the fallback produces to chars[charIndex], advancing charIndex.
// NOTE(review): no capacity check on "chars" is visible in this method -
// confirm that callers guarantee enough room.
650 static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
651 char* chars, ref int charIndex)
653 if (buffer == null) {
654 DecoderFallback fb = provider as DecoderFallback;
// provider is a DecoderFallback: ask it for a fresh buffer.
656 buffer = fb.CreateFallbackBuffer ();
// Otherwise provider is cast to Decoder and its own buffer is used.
658 buffer = ((Decoder) provider).FallbackBuffer;
660 if (bufferArg == null)
661 bufferArg = new byte [1];
662 for (int i = 0; i < size; i++) {
663 bufferArg [0] = bytes [byteIndex + i];
664 buffer.Fallback (bufferArg, 0);
// Drain everything the fallback produced for this byte.
665 while (buffer.Remaining > 0)
666 chars [charIndex++] = buffer.GetNextChar ();
672 // Get the number of characters needed to decode a byte buffer.
// One-shot form: no carried-over state (both state arguments are 0) and
// flush = true. Two alternative calls follow (fallback-based and
// throwOnInvalid-based); presumably selected by conditional compilation -
// TODO confirm.
673 public override int GetCharCount (byte[] bytes, int index, int count)
676 DecoderFallbackBuffer buf = null;
677 byte [] bufferArg = null;
678 return InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
680 return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
// Pointer form: forwards to the pointer worker with no carried-over state
// and flush = true.
// NOTE(review): no argument validation is visible here - confirm.
685 [CLSCompliant (false)]
687 public unsafe override int GetCharCount (byte* bytes, int count)
689 DecoderFallbackBuffer buf = null;
690 byte [] bufferArg = null;
691 return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
695 // Get the characters that result from decoding a byte buffer.
// Array entry point: validates the arguments, pins the arrays and forwards
// to the pointer overload. leftOverBits/leftOverCount carry the decoder
// state in and out by reference. Two alternative signatures follow;
// presumably selected by conditional compilation - TODO confirm.
697 private unsafe static int InternalGetChars (
698 byte[] bytes, int byteIndex, int byteCount, char[] chars,
699 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
701 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
703 private unsafe static int InternalGetChars (
704 byte[] bytes, int byteIndex, int byteCount, char[] chars,
705 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
706 bool throwOnInvalid, bool flush)
709 // Validate the parameters.
711 throw new ArgumentNullException ("bytes");
714 throw new ArgumentNullException ("chars");
716 if (byteIndex < 0 || byteIndex > bytes.Length) {
717 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
719 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
720 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
722 if (charIndex < 0 || charIndex > chars.Length) {
723 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// Output index already at the end of "chars".
726 if (charIndex == chars.Length)
729 fixed (char* cptr = chars) {
// Fallback-based variant: with no input bytes a null input pointer is
// passed instead of pinning "bytes".
731 if (byteCount == 0 || byteIndex == bytes.Length)
732 return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
734 fixed (byte* bptr = bytes)
735 return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
// throwOnInvalid-based variant of the same two calls.
737 if (byteCount == 0 || byteIndex == bytes.Length)
738 return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
740 fixed (byte* bptr = bytes)
741 return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
// Pointer-based worker: decodes byteCount bytes into at most charCount
// chars, stores the decoder state back into leftOverBits/leftOverCount and
// returns the number of chars written. Two alternative signatures follow;
// presumably selected by conditional compilation - TODO confirm.
747 private unsafe static int InternalGetChars (
748 byte* bytes, int byteCount, char* chars, int charCount,
749 ref uint leftOverBits, ref uint leftOverCount,
751 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
753 private unsafe static int InternalGetChars (
754 byte* bytes, int byteCount, char* chars, int charCount,
755 ref uint leftOverBits, ref uint leftOverCount,
756 bool throwOnInvalid, bool flush)
// Both indexes are relative to the pointers passed in; "length" is the
// output capacity and "posn" the next output slot.
759 int charIndex = 0, byteIndex = 0;
760 int length = charCount;
761 int posn = charIndex;
// Fast path when no partial sequence is pending: copy the leading run of
// bytes below 0x80 straight to the output.
// NOTE(review): no posn < length test is visible in this loop - confirm the
// output cannot be overrun here.
763 if (leftOverCount == 0) {
764 int end = byteIndex + byteCount;
765 for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
766 if (bytes [byteIndex] < 0x80)
767 chars [posn] = (char) bytes [byteIndex];
773 // Convert the bytes into the output buffer.
// leftOverCount packs two 4-bit fields: the low nibble is the number of
// bytes consumed so far, the next nibble is the sequence size.
775 uint leftBits = leftOverBits;
776 uint leftSoFar = (leftOverCount & (uint)0x0F);
777 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
779 int byteEnd = byteIndex + byteCount;
780 for(; byteIndex < byteEnd; byteIndex++) {
781 // Fetch the next character from the byte buffer.
782 ch = (uint)(bytes[byteIndex]);
784 // Process a UTF-8 start character.
785 if (ch < (uint)0x0080) {
786 // Single-byte UTF-8 character.
787 if (posn >= length) {
788 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
790 chars[posn++] = (char)ch;
791 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
792 // Double-byte UTF-8 character.
793 leftBits = (ch & (uint)0x1F);
796 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
797 // Three-byte UTF-8 character.
798 leftBits = (ch & (uint)0x0F);
801 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
802 // Four-byte UTF-8 character.
803 leftBits = (ch & (uint)0x07);
806 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
807 // Five-byte UTF-8 character.
808 leftBits = (ch & (uint)0x03);
811 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
812 // Six-byte UTF-8 character.
813 leftBits = (ch & (uint)0x03);
817 // Invalid UTF-8 start character.
819 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
822 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
826 // Process an extra byte in a multi-byte sequence.
// Continuation bytes look like 10xxxxxx; their low six bits are appended.
827 if ((ch & (uint)0xC0) == (uint)0x80) {
828 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
829 if (++leftSoFar >= leftSize) {
830 // We have a complete character now.
831 if (leftBits < (uint)0x10000) {
832 // is it an overlong ?
833 bool overlong = false;
836 overlong = (leftBits <= 0x7F);
839 overlong = (leftBits <= 0x07FF);
842 overlong = (leftBits <= 0xFFFF);
845 overlong = (leftBits <= 0x1FFFFF);
848 overlong = (leftBits <= 0x03FFFFFF);
// NOTE(review): byteIndex has not been advanced past the current byte here,
// unlike "index" in InternalGetCharCount; check that byteIndex - leftSoFar
// is the intended start of the sequence.
853 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
856 throw new ArgumentException (_("Overlong"), leftBits.ToString ());
// Values 0xD800..0xDFFF are the surrogate range.
859 else if ((leftBits & 0xF800) == 0xD800) {
860 // UTF-8 doesn't use surrogate characters
862 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
865 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
869 if (posn >= length) {
870 throw new ArgumentException
871 (_("Arg_InsufficientSpace"), "chars");
873 chars[posn++] = (char)leftBits;
// Code points from 0x10000 up to 0x10FFFF are emitted as two chars (a
// surrogate pair), so two free slots are required.
875 } else if (leftBits < (uint)0x110000) {
876 if ((posn + 2) > length) {
877 throw new ArgumentException
878 (_("Arg_InsufficientSpace"), "chars");
880 leftBits -= (uint)0x10000;
881 chars[posn++] = (char)((leftBits >> 10) +
884 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
887 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
890 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
896 // Invalid UTF-8 sequence: clear and restart.
898 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
901 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
908 if (flush && leftSize != 0) {
909 // We had left-over bytes that didn't make up
910 // a complete UTF-8 character sequence.
912 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
915 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// Hand the (possibly partial) sequence state back to the caller, packed
// the same way it was unpacked above.
918 leftOverBits = leftBits;
919 leftOverCount = (leftSoFar | (leftSize << 4));
921 // Return the final length to the caller.
922 return posn - charIndex;
925 // Get the characters that result from decoding a byte buffer.
// One-shot form: starts from an empty decoder state and forwards to
// InternalGetChars with flush = true. Two alternative calls follow
// (fallback-based and throwOnInvalid-based); presumably selected by
// conditional compilation - TODO confirm.
926 public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
927 char[] chars, int charIndex)
929 uint leftOverBits = 0;
930 uint leftOverCount = 0;
932 DecoderFallbackBuffer buf = null;
933 byte [] bufferArg = null;
934 return InternalGetChars (bytes, byteIndex, byteCount, chars,
935 charIndex, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, ref bufferArg, true);
937 return InternalGetChars (bytes, byteIndex, byteCount, chars,
938 charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
// Pointer form: starts from an empty decoder state and forwards to the
// pointer worker with flush = true.
// NOTE(review): no argument validation is visible here - confirm.
943 [CLSCompliant (false)]
945 public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
947 DecoderFallbackBuffer buf = null;
948 byte [] bufferArg = null;
949 uint leftOverBits = 0;
950 uint leftOverCount = 0;
951 return InternalGetChars (bytes, byteCount, chars,
952 charCount, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, ref bufferArg, true);
956 // Get the maximum number of bytes needed to encode a
957 // specified number of characters.
// Throws ArgumentOutOfRangeException ("charCount").
958 public override int GetMaxByteCount (int charCount)
961 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
// Upper bound of four bytes per char.
// NOTE(review): the multiplication can exceed the int range for very large
// charCount - confirm whether that needs guarding.
963 return charCount * 4;
966 // Get the maximum number of characters needed to decode a
967 // specified number of bytes.
// Throws ArgumentOutOfRangeException ("byteCount").
968 public override int GetMaxCharCount (int byteCount)
971 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
976 // Get a UTF8-specific decoder that is attached to this instance.
// Two alternative constructions follow: one passes this encoding's
// DecoderFallback, the other its throwOnInvalid flag; presumably selected
// by conditional compilation - TODO confirm.
977 public override Decoder GetDecoder ()
980 return new UTF8Decoder (DecoderFallback);
982 return new UTF8Decoder (throwOnInvalid);
986 // Get a UTF8-specific encoder that is attached to this instance.
// emitIdentifier is passed along, but UTF8Encoder's constructor only has a
// commented-out assignment for it.
987 public override Encoder GetEncoder ()
989 return new UTF8Encoder (emitIdentifier);
992 // Get the UTF8 preamble.
// A three-byte array is built when emitIdentifier is set; an empty array is
// returned otherwise.
993 public override byte[] GetPreamble ()
995 if (emitIdentifier) {
996 byte[] pre = new byte [3];
1002 return new byte [0];
1006 // Determine if this object is equal to another.
// "value" is equal when it is a UTF8Encoding with the same code page, the
// same emitIdentifier setting and equivalent invalid-input handling. Two
// alternative comparisons follow (fallback-based and throwOnInvalid-based);
// presumably selected by conditional compilation - TODO confirm.
1007 public override bool Equals (Object value)
1009 UTF8Encoding enc = (value as UTF8Encoding);
// The fallbacks are compared by value through Object.Equals, which also
// copes with null on either side. The constructor creates a new fallback
// object for every encoding, so the earlier "==" reference comparison made
// two identically configured encodings compare unequal.
1012 return (codePage == enc.codePage &&
1013 emitIdentifier == enc.emitIdentifier &&
1014 Object.Equals (DecoderFallback, enc.DecoderFallback) &&
1015 Object.Equals (EncoderFallback, enc.EncoderFallback));
1017 return (codePage == enc.codePage &&
1018 emitIdentifier == enc.emitIdentifier &&
1019 throwOnInvalid == enc.throwOnInvalid);
1026 // Get the hash code for this object.
// Delegates to the base class; the fields compared by Equals are not mixed
// into the hash here.
1027 public override int GetHashCode ()
1029 return base.GetHashCode ();
// Pass-through to the base implementation.
// NOTE(review): another GetByteCount (String) appears earlier in this
// class; presumably the two live in mutually exclusive
// conditional-compilation branches - confirm.
1033 public override int GetByteCount (string chars)
1035 // hmm, does this override make any sense?
1036 return base.GetByteCount (chars);
// Pass-through to the base implementation.
1039 [ComVisible (false)]
1040 public override string GetString (byte [] bytes, int index, int count)
1042 // hmm, does this override make any sense?
1043 return base.GetString (bytes, index, count);
// Encodes a whole string into a new array: sizes the array with
// GetByteCount (s), then fills it with GetBytes (s, 0, s.Length, bytes, 0).
// Throws ArgumentNullException ("s").
1048 public override byte [] GetBytes (String s)
1051 throw new ArgumentNullException ("s");
1053 int length = GetByteCount (s);
1054 byte [] bytes = new byte [length];
1055 GetBytes (s, 0, s.Length, bytes, 0);
1060 // UTF-8 decoder implementation.
// Keeps the partial-sequence state (leftOverBits/leftOverCount) between
// calls so that input can be supplied in pieces; both methods pass
// flush = false to the shared helpers.
1062 private class UTF8Decoder : Decoder
1065 private bool throwOnInvalid;
// Decoder state in the form the InternalGetCharCount/InternalGetChars
// helpers take and, for InternalGetChars, update.
1067 private uint leftOverBits;
1068 private uint leftOverCount;
// Two alternative constructors follow (fallback-based and
// throwOnInvalid-based); presumably selected by conditional compilation -
// TODO confirm.
1072 public UTF8Decoder (DecoderFallback fallback)
1074 public UTF8Decoder (bool throwOnInvalid)
1078 Fallback = fallback;
1080 this.throwOnInvalid = throwOnInvalid;
1086 // Override inherited methods.
// Counting passes the state by value, so it leaves the decoder state alone.
1087 public override int GetCharCount (byte[] bytes, int index, int count)
1090 DecoderFallbackBuffer buf = null;
1091 byte [] bufferArg = null;
1092 return InternalGetCharCount (bytes, index, count,
1093 leftOverBits, leftOverCount, this, ref buf, ref bufferArg, false);
1095 return InternalGetCharCount (bytes, index, count,
1096 leftOverBits, leftOverCount, throwOnInvalid, false);
// Conversion passes the state by reference, so it is updated for the next
// call.
1099 public override int GetChars (byte[] bytes, int byteIndex,
1100 int byteCount, char[] chars, int charIndex)
1103 DecoderFallbackBuffer buf = null;
1104 byte [] bufferArg = null;
1105 return InternalGetChars (bytes, byteIndex, byteCount,
1106 chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref buf, ref bufferArg, false);
1108 return InternalGetChars (bytes, byteIndex, byteCount,
1109 chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
1113 } // class UTF8Decoder
1115 // UTF-8 encoder implementation.
// Keeps a pending surrogate start between calls so that input can be
// supplied in pieces. Counting and conversion each have their own pending
// value, so a GetByteCount call does not change what GetBytes sees.
1117 private class UTF8Encoder : Encoder
1119 // private bool emitIdentifier;
1120 private char leftOverForCount;
1121 private char leftOverForConv;
// The emitIdentifier argument is accepted, but its only use is the
// commented-out assignment below.
1124 public UTF8Encoder (bool emitIdentifier)
1126 // this.emitIdentifier = emitIdentifier;
1127 leftOverForCount = '\0';
1128 leftOverForConv = '\0';
1131 // Override inherited methods.
1132 public override int GetByteCount (char[] chars, int index,
1133 int count, bool flush)
1135 return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
1137 public override int GetBytes (char[] chars, int charIndex,
1138 int charCount, byte[] bytes, int byteIndex, bool flush)
1141 result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
1142 // emitIdentifier = false;
// Pointer forms of the two methods above.
1147 public unsafe override int GetByteCount (char* chars, int count, bool flush)
1149 return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
1152 public unsafe override int GetBytes (char* chars, int charCount,
1153 byte* bytes, int byteCount, bool flush)
1156 result = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
1157 // emitIdentifier = false;
1162 } // class UTF8Encoder
1164 }; // class UTF8Encoding
1166 }; // namespace System.Text