2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
30 using System.Runtime.InteropServices;
33 [MonoLimitation ("Serialization format not compatible with .NET")]
35 public class UTF8Encoding : Encoding
37 // Magic number used by Windows for UTF-8.
// Passed to the base Encoding constructor by the UTF8Encoding constructor.
38 internal const int UTF8_CODE_PAGE = 65001;
// Copy of the constructor's encoderShouldEmitUTF8Identifier argument;
// also compared in Equals.
41 private bool emitIdentifier;
// Default constructor: delegates with both flags set to false.
44 public UTF8Encoding () : this (false, false) {}
// Delegates with throwOnInvalidBytes set to false.
45 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
46 : this (encoderShouldEmitUTF8Identifier, false) {}
// Full constructor: registers the UTF-8 code page with the base class,
// records the identifier flag, selects the encoder/decoder fallbacks and
// fills in the descriptive names and capability flags.
48 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
49 : base (UTF8_CODE_PAGE)
51 emitIdentifier = encoderShouldEmitUTF8Identifier;
// NOTE(review): the line between the two SetFallbackInternal calls is not
// visible in this extract; presumably it is an `else`, so the exception
// fallbacks apply only when throwOnInvalidBytes is true and the
// standard-safe fallbacks otherwise - confirm against the full file.
52 if (throwOnInvalidBytes)
53 SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
55 SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
// The same name is used for the web, body and header names.
57 web_name = body_name = header_name = "utf-8";
58 encoding_name = "Unicode (UTF-8)";
59 is_browser_save = true;
60 is_browser_display = true;
61 is_mail_news_display = true;
62 is_mail_news_save = true;
// The Windows code page is taken from UnicodeEncoding, not UTF8_CODE_PAGE.
63 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
66 #region GetByteCount()
68 // Internal version of "GetByteCount" which can handle a rolling
69 // state between multiple calls to this method.
// Array-based entry point: validates the arguments, pins the array and
// forwards to the pointer-based overload. "leftOver" holds a pending
// surrogate pair start between calls; "flush" says whether to flush it.
70 private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
72 // Validate the parameters.
// NOTE(review): the null-check condition guarding this throw is not
// visible in this extract.
74 throw new ArgumentNullException ("chars");
// index == chars.Length is accepted (an empty range at the end).
76 if (index < 0 || index > chars.Length) {
77 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
// count must fit in the part of the array that follows index.
79 if (count < 0 || count > (chars.Length - index)) {
80 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
// Range starts at the very end of the array: handled here without pinning.
// NOTE(review): the statements of this branch are not visible in this extract.
83 if (index == chars.Length) {
84 if (flush && leftOver != '\0') {
85 // Flush the left-over surrogate pair start.
// Pin the array and count starting from the element at "index".
93 fixed (char* cptr = chars) {
94 return InternalGetByteCount (cptr + index, count, fallback, ref leftOver, flush);
// Pointer-based byte counter: walks "count" UTF-16 code units starting at
// "chars" and classifies each by range. Invalid surrogate usage is routed
// through the encoder fallback, and the replacement characters are counted
// by a recursive call.
// NOTE(review): several statements (length increments, leftOver handling,
// the final return) are not visible in this extract.
99 private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
// One past the last input code unit.
102 char* end = chars + count;
// Passed by reference to GetFallbackChars when a fallback is needed.
104 EncoderFallbackBuffer buffer = null;
105 while (chars < end) {
107 for (; chars < end; chars++) {
// Ranges tested in order: below U+0080, below U+0800, any other
// non-surrogate, then surrogate start (U+D800..U+DBFF).
108 if (*chars < '\x80') {
110 } else if (*chars < '\x800') {
112 } else if (*chars < '\uD800' || *chars > '\uDFFF') {
114 } else if (*chars <= '\uDBFF') {
115 // This is a surrogate start char, exit the inner loop only
116 // if we don't find the complete surrogate pair.
// Look ahead one unit (if any) for a tail in U+DC00..U+DFFF.
117 if (chars + 1 < end && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
126 // We have a surrogate tail without
127 // leading surrogate.
128 char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
129 fixed (char *fb_chars = fallback_chars) {
// Count the bytes of the replacement characters, flushing at the end.
131 length += InternalGetByteCount (fb_chars, fallback_chars.Length, fallback, ref dummy, true);
138 if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
139 // We have a correct surrogate pair.
143 // We have a surrogate start followed by a
144 // regular character. Technically, this is
145 // invalid, but we have to do something.
146 // We write out the surrogate start and then
147 // re-visit the current character again.
148 char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
149 fixed (char *fb_chars = fallback_chars) {
151 length += InternalGetByteCount (fb_chars, fallback_chars.Length, fallback, ref dummy, true);
158 // Flush the left-over surrogate pair start.
159 if (leftOver != '\0') {
// Runs the encoder fallback for the character at "chars" and returns the
// replacement characters it produces as a new array. "start" is the
// beginning of the input and is used to compute the character's index.
167 unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
// NOTE(review): the guard around this assignment is not visible here;
// presumably the buffer is created only when "buffer" is null - confirm.
170 buffer = fallback.CreateFallbackBuffer ();
// Report the offending character with its offset from "start".
172 buffer.Fallback (*chars, (int) (chars - start));
// Drain every remaining replacement character from the buffer.
174 char [] fallback_chars = new char [buffer.Remaining];
175 for (int i = 0; i < fallback_chars.Length; i++)
176 fallback_chars [i] = buffer.GetNextChar ();
180 return fallback_chars;
183 // Get the number of bytes needed to encode a character buffer.
// Uses this encoding's EncoderFallback and always flushes (flush = true).
// NOTE(review): the declaration of "dummy" is not visible in this extract.
184 public override int GetByteCount (char[] chars, int index, int count)
187 return InternalGetByteCount (chars, index, count, EncoderFallback, ref dummy, true);
// Pointer overload of GetByteCount: throws ArgumentNullException for
// "chars", otherwise counts with this encoding's EncoderFallback and
// flush = true.
// NOTE(review): the guard conditions and the "dummy" declaration are not
// visible in this extract.
191 [CLSCompliant (false)]
193 public unsafe override int GetByteCount (char* chars, int count)
196 throw new ArgumentNullException ("chars");
200 return InternalGetByteCount (chars, count, EncoderFallback, ref dummy, true);
207 // Internal version of "GetBytes" which can handle a rolling
208 // state between multiple calls to this method.
// Array-based entry point: validates the arguments, pins both arrays and
// forwards to the pointer-based overload.
// NOTE(review): the "byteIndex" parameter line and the null-check
// conditions are not visible in this extract.
209 private static int InternalGetBytes (char[] chars, int charIndex,
210 int charCount, byte[] bytes,
212 EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
213 ref char leftOver, bool flush)
215 // Validate the parameters.
217 throw new ArgumentNullException ("chars");
220 throw new ArgumentNullException ("bytes");
// Each index may equal its array's length (an empty range at the end).
222 if (charIndex < 0 || charIndex > chars.Length) {
223 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
225 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
226 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
228 if (byteIndex < 0 || byteIndex > bytes.Length) {
229 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// Input range starts at the very end of the char array.
// NOTE(review): the statements of this branch are not visible in this extract.
232 if (charIndex == chars.Length) {
233 if (flush && leftOver != '\0') {
234 // FIXME: use EncoderFallback.
236 // By default it is empty, so I do nothing for now.
243 fixed (char* cptr = chars) {
// No room in the output: call with a null byte pointer and zero
// capacity instead of pinning "bytes".
244 if (bytes.Length == byteIndex)
245 return InternalGetBytes (
246 cptr + charIndex, charCount,
247 null, 0, fallback, ref buffer, ref leftOver, flush);
// Otherwise pin the output; capacity is what remains after byteIndex.
248 fixed (byte *bptr = bytes) {
249 return InternalGetBytes (
250 cptr + charIndex, charCount,
251 bptr + byteIndex, bytes.Length - byteIndex,
252 fallback, ref buffer,
253 ref leftOver, flush);
// Pointer-based encoder: converts "count" UTF-16 code units at "chars"
// into UTF-8 at "bytes" (capacity "bcount"). Returns the number of bytes
// written; throws ArgumentException ("Insufficient Space") when the output
// cannot hold the result. Invalid surrogate usage goes through the encoder
// fallback and the replacement characters are encoded recursively.
// NOTE(review): several statements (single-byte store, pointer advances,
// jumps to the failure path, leftOver updates) are not visible in this
// extract.
259 private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
261 char* end = chars + count;
// Remember the output start so the remaining capacity can be recomputed.
263 byte* start_bytes = bytes;
264 byte* end_bytes = bytes + bcount;
265 while (chars < end) {
267 for (; chars < end; chars++) {
// Capacity check before a one-byte write.
270 if (bytes >= end_bytes)
273 } else if (ch < '\x800') {
// Two bytes are written below, so bytes[1] must be in range.
274 if (bytes + 1 >= end_bytes)
// 110xxxxx 10xxxxxx
276 bytes [0] = (byte) (0xC0 | (ch >> 6));
277 bytes [1] = (byte) (0x80 | (ch & 0x3F));
279 } else if (ch < '\uD800' || ch > '\uDFFF') {
// Three bytes are written below, so bytes[2] must be in range.
280 if (bytes + 2 >= end_bytes)
// 1110xxxx 10xxxxxx 10xxxxxx
282 bytes [0] = (byte) (0xE0 | (ch >> 12));
283 bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
284 bytes [2] = (byte) (0x80 | (ch & 0x3F));
286 } else if (ch <= '\uDBFF') {
287 // This is a surrogate char, exit the inner loop.
292 // We have a surrogate tail without
293 // leading surrogate.
294 char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
// Check that the encoded replacement fits before writing it.
296 if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
298 fixed (char *fb_chars = fallback_chars) {
// Encode the replacement into the remaining space and advance past it.
299 bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
306 if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
307 // We have a correct surrogate pair.
// Combine the stored lead (leftOver) and this tail into one code point.
308 int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
// Four bytes are written below, so bytes[3] must be in range.
309 if (bytes + 3 >= end_bytes)
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
311 bytes [0] = (byte) (0xF0 | (ch >> 18));
312 bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
313 bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
314 bytes [3] = (byte) (0x80 | (ch & 0x3F));
318 // We have a surrogate start followed by a
319 // regular character. Technically, this is
320 // invalid, but we have to do something.
321 // We write out the surrogate start and then
322 // re-visit the current character again.
323 char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
325 if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
327 fixed (char *fb_chars = fallback_chars) {
// NOTE(review): unlike the lone-tail path above, the return value is
// discarded on this line, so "bytes" is not advanced here past the
// replacement output - confirm whether that is intended.
328 InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
337 // Flush the left-over surrogate pair start.
338 if (leftOver != '\0') {
// Written as a three-byte sequence when three bytes of room remain.
340 if (bytes + 2 < end_bytes) {
341 bytes [0] = (byte) (0xE0 | (ch >> 12));
342 bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
343 bytes [2] = (byte) (0x80 | (ch & 0x3F));
// end_bytes - bcount is the original output start, so this is the
// number of bytes written.
351 return (int)(bytes - (end_bytes - bcount));
// Failure path for insufficient output space.
353 throw new ArgumentException ("Insufficient Space", "bytes");
// Encode a range of a character array into a byte array. Starts with no
// pending surrogate and no fallback buffer, and flushes at the end.
public override int GetBytes (char[] chars, int charIndex, int charCount,
	byte[] bytes, int byteIndex)
{
	EncoderFallbackBuffer fallbackBuffer = null;
	char pendingSurrogate = '\0';
	return InternalGetBytes (
		chars, charIndex, charCount,
		bytes, byteIndex,
		EncoderFallback, ref fallbackBuffer, ref pendingSurrogate,
		true);
}
365 // Convenience wrappers for "GetBytes".
// String overload: validates the arguments, pins the string and the output
// array, and encodes with this encoding's EncoderFallback.
// NOTE(review): the null-check conditions, the early-return statement, the
// "dummy" declaration and the end of the final call are not visible in
// this extract.
366 public override int GetBytes (String s, int charIndex, int charCount,
367 byte[] bytes, int byteIndex)
369 // Validate the parameters.
371 throw new ArgumentNullException ("s");
374 throw new ArgumentNullException ("bytes");
376 if (charIndex < 0 || charIndex > s.Length) {
377 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
379 if (charCount < 0 || charCount > (s.Length - charIndex)) {
380 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
382 if (byteIndex < 0 || byteIndex > bytes.Length) {
383 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// Range starts at the very end of the string.
386 if (charIndex == s.Length)
390 fixed (char* cptr = s) {
392 EncoderFallbackBuffer buffer = null;
// No room in the output: call with a null byte pointer and zero capacity.
393 if (bytes.Length == byteIndex)
394 return InternalGetBytes (
395 cptr + charIndex, charCount,
396 null, 0, EncoderFallback, ref buffer, ref dummy, true);
// Otherwise pin the output; capacity is what remains after byteIndex.
397 fixed (byte *bptr = bytes) {
398 return InternalGetBytes (
399 cptr + charIndex, charCount,
400 bptr + byteIndex, bytes.Length - byteIndex,
401 EncoderFallback, ref buffer,
// Pointer overload of GetBytes: validates the arguments and encodes with
// this encoding's EncoderFallback, flushing at the end.
// NOTE(review): the guard conditions and the "dummy" declaration are not
// visible in this extract.
408 [CLSCompliant (false)]
410 public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
413 throw new ArgumentNullException ("chars");
415 throw new IndexOutOfRangeException ("charCount");
417 throw new ArgumentNullException ("bytes");
// NOTE(review): this second IndexOutOfRangeException repeats "charCount"
// and follows the "bytes" null check; it presumably guards byteCount and
// should name "byteCount" - confirm against the (missing) condition.
419 throw new IndexOutOfRangeException ("charCount");
425 EncoderFallbackBuffer buffer = null;
// Two call shapes: one with a null output pointer and zero capacity, one
// with the caller's buffer and byteCount.
427 return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
429 return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
434 // Internal version of "GetCharCount" which can handle a rolling
435 // state between multiple calls to this method.
// Array-based entry point: validates the arguments, pins the array and
// forwards to the pointer-based overload. leftOverBits/leftOverCount are
// passed by value, so counting does not modify the caller's state.
// "provider" is handed on to Fallback, which treats it as either a
// DecoderFallback or a Decoder.
436 private unsafe static int InternalGetCharCount (
437 byte[] bytes, int index, int count, uint leftOverBits,
438 uint leftOverCount, object provider,
439 ref DecoderFallbackBuffer fallbackBuffer, bool flush)
441 // Validate the parameters.
// NOTE(review): the null-check condition is not visible in this extract.
443 throw new ArgumentNullException ("bytes");
445 if (index < 0 || index > bytes.Length) {
446 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
448 if (count < 0 || count > (bytes.Length - index)) {
449 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
// Pin the array and count starting from the byte at "index".
454 fixed (byte *bptr = bytes)
455 return InternalGetCharCount (bptr + index, count,
456 leftOverBits, leftOverCount, provider, ref fallbackBuffer, flush);
// Pointer-based character counter: decodes "byteCount" bytes of UTF-8 only
// far enough to count the UTF-16 code units they produce. Invalid input is
// reported through Fallback, whose result is added to the total.
// NOTE(review): several statements (local declarations, length increments,
// leftSoFar/leftSize updates, switch cases, the final return) are not
// visible in this extract.
459 private unsafe static int InternalGetCharCount (
460 byte* bytes, int byteCount, uint leftOverBits,
461 uint leftOverCount, object provider,
462 ref DecoderFallbackBuffer fallbackBuffer, bool flush)
// Initial loop over leading bytes below 0x80, entered only when no partial
// sequence is pending.
468 if (leftOverCount == 0) {
469 int end = byteIndex + byteCount;
470 for (; byteIndex < end; byteIndex++, byteCount--) {
471 if (bytes [byteIndex] < 0x80)
478 // Determine the number of characters that we have.
// leftOverCount packs two 4-bit fields: low nibble = bytes seen so far,
// next nibble = total size of the pending sequence.
480 uint leftBits = leftOverBits;
481 uint leftSoFar = (leftOverCount & (uint)0x0F);
482 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
484 int byteEnd = byteIndex + byteCount;
485 for(; byteIndex < byteEnd; byteIndex++) {
486 // Fetch the next character from the byte buffer.
487 ch = (uint)(bytes[byteIndex]);
489 // Process a UTF-8 start character.
490 if (ch < (uint)0x0080) {
491 // Single-byte UTF-8 character.
// Lead byte 110xxxxx: keep the 5 payload bits.
493 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
494 // Double-byte UTF-8 character.
495 leftBits = (ch & (uint)0x1F);
// Lead byte 1110xxxx: keep the 4 payload bits.
498 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
499 // Three-byte UTF-8 character.
500 leftBits = (ch & (uint)0x0F);
// Lead byte 11110xxx: keep the 3 payload bits.
503 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
504 // Four-byte UTF-8 character.
505 leftBits = (ch & (uint)0x07);
// Lead byte 111110xx: keep the 2 payload bits.
508 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
509 // Five-byte UTF-8 character.
510 leftBits = (ch & (uint)0x03);
// Lead byte 1111110x: bit 1 is zero whenever this test passes, so the
// 0x03 mask yields the same value as 0x01 would.
513 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
514 // Six-byte UTF-8 character.
515 leftBits = (ch & (uint)0x03);
519 // Invalid UTF-8 start character.
520 length += Fallback (provider, ref fallbackBuffer, bytes, byteIndex, 1);
523 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx; append their 6 payload bits.
524 if ((ch & (uint)0xC0) == (uint)0x80) {
525 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
526 if (++leftSoFar >= leftSize) {
527 // We have a complete character now.
528 if (leftBits < (uint)0x10000) {
529 // is it an overlong ?
// Each assignment compares the decoded value with the largest value
// a shorter encoding could hold.
// NOTE(review): the switch/case lines selecting among these are not
// visible in this extract.
530 bool overlong = false;
533 overlong = (leftBits <= 0x7F);
536 overlong = (leftBits <= 0x07FF);
539 overlong = (leftBits <= 0xFFFF);
542 overlong = (leftBits <= 0x1FFFFF);
545 overlong = (leftBits <= 0x03FFFFFF);
// Report the bytes of the sequence consumed so far to the fallback.
549 length += Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar);
550 --byteIndex; //process byte again
// 0xD800..0xDFFF is the surrogate range.
552 else if ((leftBits & 0xF800) == 0xD800) {
553 // UTF-8 doesn't use surrogate characters
554 length += Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar);
// Values from 0x10000 up to, but not including, 0x110000.
558 } else if (leftBits < (uint)0x110000) {
561 length += Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar);
566 // Invalid UTF-8 sequence: clear and restart.
567 length += Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar);
573 if (flush && leftSize != 0) {
574 // We had left-over bytes that didn't make up
575 // a complete UTF-8 character sequence.
576 length += Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar);
579 // Return the final length to the caller.
583 // for GetCharCount()
// Feeds "size" bytes starting at bytes[index] to the decoder fallback and
// reads how many replacement characters it reports as remaining.
// NOTE(review): the branch structure around the two buffer assignments and
// the statements after "ret" (presumably a reset and the return) are not
// visible in this extract.
584 static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, byte* bytes, long index, uint size)
// The fallback buffer is obtained only when the caller has not supplied one.
586 if (buffer == null) {
// "provider" is either a DecoderFallback (a new buffer is created from it)
// or a Decoder (its FallbackBuffer is used).
587 DecoderFallback fb = provider as DecoderFallback;
589 buffer = fb.CreateFallbackBuffer ();
591 buffer = ((Decoder) provider).FallbackBuffer;
// Copy the offending bytes into a managed array for the fallback call.
594 var bufferArg = new byte [size];
596 for (int i = 0; i < size; i++)
597 bufferArg [i] = bytes [(int) index + i];
599 buffer.Fallback (bufferArg, 0);
600 int ret = buffer.Remaining;
// Fallback variant used when decoding into an output buffer: feeds "size"
// bytes starting at bytes[byteIndex] to the decoder fallback and appends
// each replacement character at chars[charIndex], advancing charIndex.
// NOTE(review): no output capacity is passed in, so no bounds check on
// "chars" is visible here - confirm that callers guarantee enough room.
// NOTE(review): the branch structure around the two buffer assignments is
// not visible in this extract.
607 static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, byte* bytes, long byteIndex, uint size,
608 char* chars, ref int charIndex)
// The fallback buffer is obtained only when the caller has not supplied one.
610 if (buffer == null) {
// "provider" is either a DecoderFallback (a new buffer is created from it)
// or a Decoder (its FallbackBuffer is used).
611 DecoderFallback fb = provider as DecoderFallback;
613 buffer = fb.CreateFallbackBuffer ();
615 buffer = ((Decoder) provider).FallbackBuffer;
// Copy the offending bytes into a managed array for the fallback call.
618 var bufferArg = new byte [size];
620 for (int i = 0; i < size; i++)
621 bufferArg [i] = bytes [byteIndex + i];
623 buffer.Fallback (bufferArg, 0);
// Drain the replacement characters into the output.
624 while (buffer.Remaining > 0)
625 chars [charIndex++] = buffer.GetNextChar ();
// Count the characters produced by decoding a range of a byte array.
// Starts with no pending bits or sequence state, uses this encoding's
// DecoderFallback, and flushes at the end.
public override int GetCharCount (byte[] bytes, int index, int count)
{
	DecoderFallbackBuffer fallbackBuffer = null;
	int charCount = InternalGetCharCount (
		bytes, index, count,
		0, 0,
		DecoderFallback, ref fallbackBuffer, true);
	return charCount;
}
636 [CLSCompliant (false)]
// Pointer overload: count the characters produced by decoding "count"
// bytes, with no pending sequence state and flushing at the end.
public unsafe override int GetCharCount (byte* bytes, int count)
{
	DecoderFallbackBuffer fallbackBuffer = null;
	int charCount = InternalGetCharCount (
		bytes, count,
		0, 0,
		DecoderFallback, ref fallbackBuffer, true);
	return charCount;
}
644 // Get the characters that result from decoding a byte buffer.
// Array-based entry point: validates the arguments, pins both arrays and
// forwards to the pointer-based overload. leftOverBits/leftOverCount are
// passed by reference so the decoding state survives between calls.
// NOTE(review): the "provider" parameter line, the null-check conditions
// and the early-return statement are not visible in this extract.
645 private unsafe static int InternalGetChars (
646 byte[] bytes, int byteIndex, int byteCount, char[] chars,
647 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
649 ref DecoderFallbackBuffer fallbackBuffer, bool flush)
651 // Validate the parameters.
653 throw new ArgumentNullException ("bytes");
656 throw new ArgumentNullException ("chars");
658 if (byteIndex < 0 || byteIndex > bytes.Length) {
659 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
661 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
662 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
664 if (charIndex < 0 || charIndex > chars.Length) {
665 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// No output room and no input.
668 if (charIndex == chars.Length && byteCount == 0)
671 fixed (char* cptr = chars) {
// No input bytes: call with a null byte pointer so pending state can
// still be processed.
672 if (byteCount == 0 || byteIndex == bytes.Length)
673 return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, flush);
// Otherwise pin the input; output capacity is what remains after charIndex.
675 fixed (byte* bptr = bytes)
676 return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, flush);
// Pointer-based decoder: converts "byteCount" bytes of UTF-8 at "bytes"
// into UTF-16 at "chars" (capacity "charCount"). Returns the number of
// chars written and stores the partial-sequence state back into
// leftOverBits/leftOverCount. Throws ArgumentException when the output is
// too small; invalid input is routed through Fallback.
// NOTE(review): several statements (the "provider" parameter line, local
// declarations, leftSoFar/leftSize updates, switch cases, state resets)
// are not visible in this extract.
680 private unsafe static int InternalGetChars (
681 byte* bytes, int byteCount, char* chars, int charCount,
682 ref uint leftOverBits, ref uint leftOverCount,
684 ref DecoderFallbackBuffer fallbackBuffer, bool flush)
// Both indices are relative to the pointers passed in.
686 int charIndex = 0, byteIndex = 0;
// "length" is the output capacity; "posn" is the next output position.
687 int length = charCount;
688 int posn = charIndex;
// Initial loop copying leading bytes below 0x80 straight to the output,
// entered only when no partial sequence is pending.
690 if (leftOverCount == 0) {
691 int end = byteIndex + byteCount;
692 for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
693 if (bytes [byteIndex] < 0x80) {
694 if (posn >= length) {
695 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
697 chars [posn] = (char) bytes [byteIndex];
704 // Convert the bytes into the output buffer.
// leftOverCount packs two 4-bit fields: low nibble = bytes seen so far,
// next nibble = total size of the pending sequence.
706 uint leftBits = leftOverBits;
707 uint leftSoFar = (leftOverCount & (uint)0x0F);
708 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
710 int byteEnd = byteIndex + byteCount;
711 for(; byteIndex < byteEnd; byteIndex++) {
712 // Fetch the next character from the byte buffer.
713 ch = (uint)(bytes[byteIndex]);
715 // Process a UTF-8 start character.
716 if (ch < (uint)0x0080) {
717 // Single-byte UTF-8 character.
718 if (posn >= length) {
719 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
721 chars[posn++] = (char)ch;
// Lead byte 110xxxxx: keep the 5 payload bits.
722 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
723 // Double-byte UTF-8 character.
724 leftBits = (ch & (uint)0x1F);
// Lead byte 1110xxxx: keep the 4 payload bits.
727 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
728 // Three-byte UTF-8 character.
729 leftBits = (ch & (uint)0x0F);
// Lead byte 11110xxx: keep the 3 payload bits.
732 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
733 // Four-byte UTF-8 character.
734 leftBits = (ch & (uint)0x07);
// Lead byte 111110xx: keep the 2 payload bits.
737 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
738 // Five-byte UTF-8 character.
739 leftBits = (ch & (uint)0x03);
// Lead byte 1111110x: bit 1 is zero whenever this test passes, so the
// 0x03 mask yields the same value as 0x01 would.
742 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
743 // Six-byte UTF-8 character.
744 leftBits = (ch & (uint)0x03);
748 // Invalid UTF-8 start character.
749 Fallback (provider, ref fallbackBuffer, bytes, byteIndex, 1, chars, ref posn);
752 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx; append their 6 payload bits.
753 if ((ch & (uint)0xC0) == (uint)0x80) {
754 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
755 if (++leftSoFar >= leftSize) {
756 // We have a complete character now.
757 if (leftBits < (uint)0x10000) {
758 // is it an overlong ?
// Each assignment compares the decoded value with the largest value
// a shorter encoding could hold.
// NOTE(review): the switch/case lines selecting among these are not
// visible in this extract.
759 bool overlong = false;
762 overlong = (leftBits <= 0x7F);
765 overlong = (leftBits <= 0x07FF);
768 overlong = (leftBits <= 0xFFFF);
771 overlong = (leftBits <= 0x1FFFFF);
774 overlong = (leftBits <= 0x03FFFFFF);
// Hand the bytes of the sequence consumed so far to the fallback,
// which writes its replacement characters at posn.
778 Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
779 --byteIndex; //process byte again
// 0xD800..0xDFFF is the surrogate range.
781 else if ((leftBits & 0xF800) == 0xD800) {
782 // UTF-8 doesn't use surrogate characters
783 Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
// Value below 0x10000: emit it as a single char.
786 if (posn >= length) {
787 throw new ArgumentException
788 (_("Arg_InsufficientSpace"), "chars");
790 chars[posn++] = (char)leftBits;
// Value from 0x10000 up to, but not including, 0x110000: emitted as
// two chars, so two free slots are required.
792 } else if (leftBits < (uint)0x110000) {
793 if ((posn + 2) > length) {
794 throw new ArgumentException
795 (_("Arg_InsufficientSpace"), "chars");
// Split the 20-bit offset into two 10-bit halves.
// NOTE(review): the continuation of the first assignment and the
// start of the second are not visible in this extract.
797 leftBits -= (uint)0x10000;
798 chars[posn++] = (char)((leftBits >> 10) +
801 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
803 Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
808 // Invalid UTF-8 sequence: clear and restart.
809 Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
815 if (flush && leftSize != 0) {
816 // We had left-over bytes that didn't make up
817 // a complete UTF-8 character sequence.
818 Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
// Store the decoding state back for the caller, packed as on entry.
820 leftOverBits = leftBits;
821 leftOverCount = (leftSoFar | (leftSize << 4));
823 // Return the final length to the caller.
824 return posn - charIndex;
// Get the characters that result from decoding a byte buffer.
// Decodes bytes[byteIndex .. byteIndex + byteCount) into chars starting at
// charIndex and returns the value produced by InternalGetChars. Decoding
// starts with no pending bits or sequence state, uses this encoding's
// DecoderFallback, and flushes at the end. Argument validation is done by
// InternalGetChars.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
	char[] chars, int charIndex)
{
	// Fresh state for a one-shot conversion; the values written back by
	// InternalGetChars are discarded.
	uint leftOverBits = 0;
	uint leftOverCount = 0;
	DecoderFallbackBuffer buf = null;
	return InternalGetChars (bytes, byteIndex, byteCount, chars,
		charIndex, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, true);
}
839 [CLSCompliant (false)]
// Pointer overload: decode "byteCount" bytes into a char buffer of
// capacity "charCount", with no pending sequence state and flushing at
// the end.
public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
{
	uint pendingBits = 0;
	uint pendingCount = 0;
	DecoderFallbackBuffer fallbackBuffer = null;
	int written = InternalGetChars (
		bytes, byteCount,
		chars, charCount,
		ref pendingBits, ref pendingCount,
		DecoderFallback, ref fallbackBuffer, true);
	return written;
}
850 // Get the maximum number of bytes needed to encode a
851 // specified number of characters.
// Returns (charCount + 1) * 3, with the character count first multiplied by
// EncoderFallback.MaxCharCount when that exceeds 1.
// NOTE(review): the guard condition for the throw is not visible in this
// extract (presumably charCount < 0).
// NOTE(review): the arithmetic is plain int with no visible overflow check,
// so very large charCount values can wrap - confirm whether a range check
// is wanted.
852 public override int GetMaxByteCount (int charCount)
855 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
858 // Add 1 to charCount since there may be a lead surrogate left from the previous call to GetBytes/Encoder.Convert
859 charCount = charCount + 1;
// Each input character may expand to MaxCharCount replacement characters.
860 if (EncoderFallback.MaxCharCount > 1) {
861 charCount = charCount * EncoderFallback.MaxCharCount;
// Three bytes per UTF-16 code unit.
864 return charCount * 3;
867 // Get the maximum number of characters needed to decode a
868 // specified number of bytes.
// Computes byteCount + 1, multiplied by DecoderFallback.MaxCharCount when
// that exceeds 1.
// NOTE(review): the guard condition for the throw and the return statement
// are not visible in this extract.
869 public override int GetMaxCharCount (int byteCount)
872 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
875 // Add 1 to byteCount since there may be the bytes from part of a surrogate pair left from the previous call to GetChars/Decoder.Convert
876 int maxCharCount = byteCount + 1;
// Each input byte may expand to MaxCharCount replacement characters.
877 if (DecoderFallback.MaxCharCount > 1) {
878 maxCharCount = maxCharCount * DecoderFallback.MaxCharCount;
// Create a new UTF8Decoder bound to this encoding instance.
public override Decoder GetDecoder ()
{
	Decoder decoder = new UTF8Decoder (this);
	return decoder;
}
// Create a new UTF8Encoder bound to this encoding instance.
public override Encoder GetEncoder ()
{
	Encoder encoder = new UTF8Encoder (this);
	return encoder;
}
896 // Get the UTF8 preamble.
// Two return paths: the three-byte sequence EF BB BF, or the shared empty
// byte array.
// NOTE(review): the condition choosing between them is not visible in this
// extract; presumably it tests emitIdentifier - confirm.
897 public override byte[] GetPreamble ()
900 return new byte [] { 0xEF, 0xBB, 0xBF };
902 return EmptyArray<byte>.Value;
905 // Determine if this object is equal to another.
// The visible comparison treats two UTF8Encoding objects as equal when the
// code page, the emitIdentifier flag and both fallbacks all match.
// NOTE(review): the null check on "enc" and the non-matching return path
// are not visible in this extract.
906 public override bool Equals (Object value)
// "as" yields null when value is not a UTF8Encoding.
908 UTF8Encoding enc = (value as UTF8Encoding);
910 return (codePage == enc.codePage &&
911 emitIdentifier == enc.emitIdentifier &&
912 DecoderFallback.Equals (enc.DecoderFallback) &&
913 EncoderFallback.Equals (enc.EncoderFallback));
// Hash code for this object: defers entirely to the base class.
public override int GetHashCode ()
{
	int hash = base.GetHashCode ();
	return hash;
}
public override int GetByteCount (string chars)
{
	// Pass-through override: the base class does the counting.
	int byteCount = base.GetByteCount (chars);
	return byteCount;
}
public override string GetString (byte [] bytes, int index, int count)
{
	// Pass-through override: the base class does the decoding.
	string decoded = base.GetString (bytes, index, count);
	return decoded;
}
938 // UTF-8 decoder implementation.
// Stateful decoder: keeps the bits and byte counts of a partially decoded
// sequence between calls, and passes itself as the fallback provider.
940 private class UTF8Decoder : EncodingDecoder
// Payload bits accumulated so far for the pending sequence.
942 private uint leftOverBits;
// Passed to InternalGetCharCount/InternalGetChars as "leftOverCount";
// InternalGetChars packs the pending sequence's progress and size into it.
943 private uint leftOverCount;
// NOTE(review): the constructor body is not visible in this extract.
946 public UTF8Decoder (Encoding encoding)
953 // Override inherited methods.
// Counts without flushing; the state fields are passed by value, so this
// call does not modify them.
954 public override int GetCharCount (byte[] bytes, int index, int count)
956 DecoderFallbackBuffer buf = null;
957 return InternalGetCharCount (bytes, index, count,
958 leftOverBits, leftOverCount, this, ref buf, false);
// Decodes without flushing; the state fields are passed by reference so
// an incomplete trailing sequence carries over to the next call.
960 public override int GetChars (byte[] bytes, int byteIndex,
961 int byteCount, char[] chars, int charIndex)
963 DecoderFallbackBuffer buf = null;
964 return InternalGetChars (bytes, byteIndex, byteCount,
965 chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref buf, false);
968 } // class UTF8Decoder
970 // UTF-8 encoder implementation.
// Stateful encoder: keeps a pending surrogate start between calls, with
// separate state for counting and for converting.
// NOTE(review): the "result" declarations, return statements and base
// constructor call are not visible in this extract.
972 private class UTF8Encoder : EncodingEncoder
974 // private bool emitIdentifier;
// Pending surrogate start used by the GetByteCount overloads.
975 private char leftOverForCount;
// Pending surrogate start used by the GetBytes overloads.
976 private char leftOverForConv;
979 public UTF8Encoder (UTF8Encoding encoding)
// Start with no pending surrogate in either state.
982 leftOverForCount = '\0';
983 leftOverForConv = '\0';
986 // Override inherited methods.
// Counts using this encoder's Fallback and the counting state.
987 public override int GetByteCount (char[] chars, int index,
988 int count, bool flush)
990 return InternalGetByteCount (chars, index, count, Fallback, ref leftOverForCount, flush);
// Encodes using this encoder's Fallback and the conversion state; the
// fallback buffer passed in starts as null on each call.
992 public override int GetBytes (char[] chars, int charIndex,
993 int charCount, byte[] bytes, int byteIndex, bool flush)
996 EncoderFallbackBuffer buffer = null;
997 result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, Fallback, ref buffer, ref leftOverForConv, flush);
998 // emitIdentifier = false;
// Pointer overload of GetByteCount; shares the counting state.
1002 public unsafe override int GetByteCount (char* chars, int count, bool flush)
1004 return InternalGetByteCount (chars, count, Fallback, ref leftOverForCount, flush);
// Pointer overload of GetBytes; shares the conversion state.
1007 public unsafe override int GetBytes (char* chars, int charCount,
1008 byte* bytes, int byteCount, bool flush)
1011 EncoderFallbackBuffer buffer = null;
1012 result = InternalGetBytes (chars, charCount, bytes, byteCount, Fallback, ref buffer, ref leftOverForConv, flush);
1013 // emitIdentifier = false;
1016 } // class UTF8Encoder
1018 }; // class UTF8Encoding
1020 }; // namespace System.Text