2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
30 using System.Runtime.InteropServices;
33 [MonoTODO ("Fix serialization compatibility with MS.NET")]
35 [MonoTODO ("EncoderFallback is not handled")]
38 public class UTF8Encoding : Encoding
40 // Magic number used by Windows for UTF-8.
// Passed to the base Encoding constructor by the two-argument constructor.
41 internal const int UTF8_CODE_PAGE = 65001;
// Whether the UTF-8 identifier bytes are reported: set by the constructor,
// consulted by GetPreamble() and handed to the encoder in GetEncoder().
44 private bool emitIdentifier;
// Copy of the constructor's throwOnInvalidBytes argument; passed to the
// throwOnInvalid-based overloads of the decoding helpers.
46 private bool throwOnInvalid;
// Constructors.  The parameterless and one-argument overloads forward to
// the two-argument constructor with throwOnInvalidBytes = false.
50 public UTF8Encoding () : this (false, false) {}
51 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
52 : this (encoderShouldEmitUTF8Identifier, false) {}
// Main constructor: hands UTF8_CODE_PAGE to the base class, records the
// identifier flag and the invalid-byte policy, then fills in the
// descriptive names and flags of the encoding.
54 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
55 : base (UTF8_CODE_PAGE)
57 emitIdentifier = encoderShouldEmitUTF8Identifier;
// NOTE(review): the two SetFallbackInternal calls look like the two arms of
// an if/else (exception fallback when throwOnInvalidBytes is set, an
// empty-string replacement fallback otherwise), and the throwOnInvalid
// assignment looks like an alternative build variant.  The lines that
// separate them are not visible in this view -- confirm against the full file.
59 if (throwOnInvalidBytes)
60 SetFallbackInternal (null, new DecoderExceptionFallback ());
62 SetFallbackInternal (null, new DecoderReplacementFallback (String.Empty));
64 throwOnInvalid = throwOnInvalidBytes;
// Names and capability flags reported for this encoding.
67 web_name = body_name = header_name = "utf-8";
68 encoding_name = "Unicode (UTF-8)";
69 is_browser_save = true;
70 is_browser_display = true;
71 is_mail_news_display = true;
// NOTE(review): this stores UnicodeEncoding's code page constant rather
// than UTF8_CODE_PAGE -- confirm that is intended.
72 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
75 #region GetByteCount()
77 // Internal version of "GetByteCount" which can handle a rolling
78 // state between multiple calls to this method.
//
// Array front end: validates chars/index/count, then pins the array and
// delegates to the pointer overload starting at chars[index].
//   leftOver - pending surrogate start carried between calls (by ref).
//   flush    - when true, a pending leftOver is accounted for at the end.
// Throws ArgumentNullException ("chars") and ArgumentOutOfRangeException
// ("index" / "count") for invalid arguments.
79 private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
81 // Validate the parameters.
// NOTE(review): the null test guarding this throw is not visible in this view.
83 throw new ArgumentNullException ("chars");
85 if (index < 0 || index > chars.Length) {
86 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
88 if (count < 0 || count > (chars.Length - index)) {
89 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
// Nothing left to read: only a pending surrogate can still contribute.
92 if (index == chars.Length) {
93 if (flush && leftOver != '\0') {
94 // Flush the left-over surrogate pair start.
// Pin the array so the pointer overload can walk it directly.
102 fixed (char* cptr = chars) {
103 return InternalGetByteCount (cptr + index, count, ref leftOver, flush);
// Pointer-based byte counter.  Walks `count` UTF-16 code units starting at
// `chars`, resuming from any surrogate start stored in `leftOver`.
// NOTE(review): the statements that add to the running length are not
// visible in this view; the comments below describe only the branch
// conditions that are.
109 private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
113 // Determine the lengths of all characters.
// Resume with whatever surrogate start the previous call left behind.
116 char pair = leftOver;
121 // fast path optimization
// Scan the leading run of ASCII (< 0x80) code units.
122 int end = index + count;
123 for (; index < end; index++, count--) {
124 if (chars [index] < '\x80')
// Code units below U+0800.
131 } else if (ch < '\u0800') {
// U+D800..U+DBFF: high (leading) surrogate.
133 } else if (ch >= '\uD800' && ch <= '\uDBFF') {
134 // This is the start of a surrogate pair.
// U+DC00..U+DFFF: low (trailing) surrogate.
139 } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
141 // We have a surrogate pair.
145 // We have a surrogate tail without
146 // leading surrogate. In NET_2_0 it
147 // uses fallback. In NET_1_1 we output
153 // We have a surrogate start followed by a
154 // regular character. Technically, this is
155 // invalid, but we have to do something.
156 // We write out the surrogate start and then
157 // re-visit the current character again.
167 // Flush the left-over surrogate pair start.
174 // Return the final length to the caller.
178 // Get the number of bytes needed to encode a character buffer.
// Stateless wrapper: delegates to InternalGetByteCount with flush = true.
// `dummy` is the throw-away left-over state passed by ref (its declaration
// is not visible in this view).
179 public override int GetByteCount (char[] chars, int index, int count)
182 return InternalGetByteCount (chars, index, count, ref dummy, true);
186 // Convenience wrappers for "GetByteCount".
// String overload: pins the string and counts all s.Length code units with
// flush = true.  Throws ArgumentNullException ("s"); the null test guarding
// that throw is not visible in this view.
187 public override int GetByteCount (String s)
189 // Validate the parameters.
191 throw new ArgumentNullException ("s");
195 fixed (char* cptr = s) {
197 return InternalGetByteCount (cptr, s.Length, ref dummy, true);
// Pointer overload of GetByteCount: counts `count` code units starting at
// `chars` with flush = true.  Throws ArgumentNullException ("chars"); the
// guard for that throw is not visible in this view.
204 [CLSCompliant (false)]
206 public unsafe override int GetByteCount (char* chars, int count)
209 throw new ArgumentNullException ("chars");
213 return InternalGetByteCount (chars, count, ref dummy, true);
221 // Internal version of "GetBytes" which can handle a rolling
222 // state between multiple calls to this method.
//
// Array front end: validates the arguments, handles the "no characters
// left" case, then pins both arrays and delegates to the pointer overload.
// When byteIndex is at the end of `bytes`, a null byte pointer with zero
// capacity is passed instead of pinning.
// Throws ArgumentNullException ("chars" / "bytes"),
// ArgumentOutOfRangeException ("charIndex" / "charCount" / "byteIndex") and
// ArgumentException (insufficient space).
223 private static int InternalGetBytes (char[] chars, int charIndex,
224 int charCount, byte[] bytes,
225 int byteIndex, ref char leftOver,
228 // Validate the parameters.
// NOTE(review): the null tests guarding the next two throws are not visible
// in this view.
230 throw new ArgumentNullException ("chars");
233 throw new ArgumentNullException ("bytes");
235 if (charIndex < 0 || charIndex > chars.Length) {
236 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
238 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
239 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
241 if (byteIndex < 0 || byteIndex > bytes.Length) {
242 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// No characters to read: only a pending surrogate start can produce output.
245 if (charIndex == chars.Length) {
246 if (flush && leftOver != '\0') {
248 // FIXME: use EncoderFallback.
250 // By default it is empty, so I do nothing for now.
253 // Flush the left-over surrogate pair start.
// NOTE(review): exactly three bytes are written below, but `>=` also
// rejects a buffer with exactly three free bytes (byteIndex ==
// bytes.Length - 3); `>` looks like the intended comparison.
// NOTE(review): the bytes written are EF BB BF (the same values as
// GetPreamble returns), not an encoding of `leftOver` -- confirm intent.
254 if (byteIndex >= bytes.Length - 3)
255 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
256 bytes [byteIndex++] = 0xEF;
257 bytes [byteIndex++] = 0xBB;
258 bytes [byteIndex++] = 0xBF;
// Pin the input and delegate; an exhausted output array is represented by
// a null pointer with zero capacity.
267 fixed (char* cptr = chars) {
268 if (bytes.Length == byteIndex)
269 return InternalGetBytes (
270 cptr + charIndex, charCount,
271 null, 0, ref leftOver, flush);
272 fixed (byte *bptr = bytes) {
273 return InternalGetBytes (
274 cptr + charIndex, charCount,
275 bptr + byteIndex, bytes.Length - byteIndex,
276 ref leftOver, flush);
// Pointer-based UTF-16 -> UTF-8 converter.
//   chars/charCount - input code units.
//   bytes/byteCount - output buffer and its capacity.
//   leftOver        - pending surrogate start carried between calls (by ref).
//   flush           - when true, an unpaired pending surrogate is written out.
// Returns the number of bytes written (posn - byteIndex).  Throws
// ArgumentException (insufficient space) when a multi-byte sequence does not
// fit into the remaining capacity.
// NOTE(review): byteIndex, charIndex, ch and code are used below but their
// declarations are not visible in this view.
282 private unsafe static int InternalGetBytes (char* chars, int charCount,
283 byte* bytes, int byteCount,
284 ref char leftOver, bool flush)
289 // Convert the characters into bytes.
290 // Convert the characters into bytes.
292 int length = byteCount;
293 char pair = leftOver;
294 int posn = byteIndex;
297 while (charCount > 0) {
298 // Fetch the next UTF-16 character pair value.
299 ch = chars [charIndex];
// Anything outside U+D800..U+DFFF is a non-surrogate code unit.
301 if (ch < '\uD800' || ch >= '\uE000') {
302 if (ch < '\x80') { // fast path optimization
// NOTE(review): no comparison of posn against length is visible inside
// this ASCII copy loop -- confirm the output pointer cannot be overrun.
303 int end = charIndex + charCount;
304 for (; charIndex < end; posn++, charIndex++, charCount--) {
305 if (chars [charIndex] < '\x80')
306 bytes [posn] = (byte) chars [charIndex];
// Below U+DC00 here means U+D800..U+DBFF: a surrogate start.
314 else if (ch < '\uDC00') {
320 } else { // ch <= '\uDFFF'
321 // We have a surrogate tail without leading
322 // surrogate. In NET_2_0 it uses fallback.
323 // In NET_1_1 we output wrong surrogate.
324 if (posn > length - 3) {
325 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
// Three-byte form applied to the lone surrogate value.
327 bytes [posn++] = (byte) (0xE0 | (ch >> 12));
328 bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
329 bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
// A surrogate start is pending: combine it with a following tail into a
// supplementary code point (0x10000 + low 10 bits + high 10 bits << 10).
335 if ('\uDC00' <= ch && ch <= '\uDFFF')
336 code = 0x10000 + (int) ch - 0xDC00 +
337 (((int) pair - 0xD800) << 10);
339 // We have a surrogate start followed by a
340 // regular character. Technically, this is
341 // invalid, but we have to do something.
342 // We write out the surrogate start and then
343 // re-visit the current character again.
344 if (posn > length - 3) {
345 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
347 bytes [posn++] = (byte) (0xE0 | (pair >> 12));
348 bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
349 bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
358 // Encode the character pair value.
// One-byte form.
361 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
362 bytes [posn++] = (byte)code;
// Two-byte form: 110xxxxx 10xxxxxx.
363 } else if (code < 0x0800) {
364 if ((posn + 2) > length)
365 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
366 bytes [posn++] = (byte) (0xC0 | (code >> 6));
367 bytes [posn++] = (byte) (0x80 | (code & 0x3F));
// Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
368 } else if (code < 0x10000) {
369 if (posn > length - 3)
370 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
371 bytes [posn++] = (byte) (0xE0 | (code >> 12));
372 bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
373 bytes [posn++] = (byte) (0x80 | (code & 0x3F));
// Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
375 if (posn > length - 4)
376 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
377 bytes [posn++] = (byte) (0xF0 | (code >> 18));
378 bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
379 bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
380 bytes [posn++] = (byte) (0x80 | (code & 0x3F));
386 // Flush the left-over incomplete surrogate.
387 if (posn > length - 3) {
388 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
390 bytes [posn++] = (byte) (0xE0 | (pair >> 12));
391 bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
392 bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
// NOTE(review): the result of this call is discarded, so the statement
// appears to have no effect -- confirm and consider removing it.
398 Char.IsLetterOrDigit (pair);
400 // Return the final count to the caller.
401 return posn - byteIndex;
// Placeholder for an encoder-side fallback taking a surrogate lead/tail
// pair: it unconditionally throws NotImplementedException.  No caller of
// this overload is visible in this view.
404 private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
406 throw new NotImplementedException ();
// Encode a range of a character array into a byte array.
//
// This is the stateless entry point: it starts with no pending surrogate
// and always flushes, so the whole conversion is completed in one call.
// Argument validation is performed by InternalGetBytes.
//
// Returns the number of bytes written into `bytes` starting at byteIndex.
public override int GetBytes (char[] chars, int charIndex, int charCount,
			      byte[] bytes, int byteIndex)
{
	// Throw-away conversion state; nothing is carried between calls.
	char pendingSurrogate = '\0';
	return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pendingSurrogate, true);
}
417 // Convenience wrappers for "GetBytes".
// String overload: validates the arguments, pins the string and delegates
// to the pointer-based InternalGetBytes with flush = true.  When byteIndex
// is at the end of `bytes`, a null output pointer with zero capacity is
// passed instead of pinning the array.
// Throws ArgumentNullException ("s" / "bytes") and
// ArgumentOutOfRangeException ("charIndex" / "charCount" / "byteIndex").
418 public override int GetBytes (String s, int charIndex, int charCount,
419 byte[] bytes, int byteIndex)
421 // Validate the parameters.
// NOTE(review): the null tests guarding the next two throws are not visible
// in this view.
423 throw new ArgumentNullException ("s");
426 throw new ArgumentNullException ("bytes");
428 if (charIndex < 0 || charIndex > s.Length) {
429 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
431 if (charCount < 0 || charCount > (s.Length - charIndex)) {
432 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
434 if (byteIndex < 0 || byteIndex > bytes.Length) {
435 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// Start index at the end of the string: nothing to convert (the statement
// executed for this case is not visible in this view).
438 if (charIndex == s.Length)
442 fixed (char* cptr = s) {
444 if (bytes.Length == byteIndex)
445 return InternalGetBytes (
446 cptr + charIndex, charCount,
447 null, 0, ref dummy, true);
448 fixed (byte *bptr = bytes) {
449 return InternalGetBytes (
450 cptr + charIndex, charCount,
451 bptr + byteIndex, bytes.Length - byteIndex,
// Encode `charCount` UTF-16 code units starting at `chars` into the buffer
// at `bytes`, which can hold at most `byteCount` bytes.
//
// Stateless: starts with no pending surrogate and always flushes.
//
// Returns the number of bytes written.
//
// Throws:
//   ArgumentNullException     - chars or bytes is null.
//   IndexOutOfRangeException  - charCount or byteCount is negative.  The
//                               exception type is kept as it was; the
//                               message now names the offending argument
//                               (the byteCount check used to say "charCount").
[CLSCompliant (false)]
public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");
	if (charCount < 0)
		throw new IndexOutOfRangeException ("charCount");
	if (bytes == null)
		throw new ArgumentNullException ("bytes");
	if (byteCount < 0)
		throw new IndexOutOfRangeException ("byteCount");

	// Nothing to convert.
	if (charCount == 0)
		return 0;

	// Throw-away conversion state; nothing is carried between calls.
	char dummy = '\0';

	// With no output capacity, pass a null buffer so the worker never
	// dereferences `bytes`.
	if (byteCount == 0)
		return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);

	return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
}
485 // Internal version of "GetCharCount" which can handle a rolling
486 // state between multiple calls to this method.
//
// Array front end: validates bytes/index/count, pins the array and delegates
// to the pointer overload starting at bytes[index].
// NOTE(review): two signatures and two delegating calls appear back to back
// (one taking a fallback provider/buffer, one taking throwOnInvalid).  They
// look like alternative build variants whose separating lines are not
// visible in this view -- confirm against the full file.
// Throws ArgumentNullException ("bytes") and ArgumentOutOfRangeException
// ("index" / "count").
488 private unsafe static int InternalGetCharCount (
489 byte[] bytes, int index, int count, uint leftOverBits,
490 uint leftOverCount, object provider,
491 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
493 private unsafe static int InternalGetCharCount (
494 byte[] bytes, int index, int count, uint leftOverBits,
495 uint leftOverCount, bool throwOnInvalid, bool flush)
498 // Validate the parameters.
// NOTE(review): the null test guarding this throw is not visible in this view.
500 throw new ArgumentNullException ("bytes");
502 if (index < 0 || index > bytes.Length) {
503 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
505 if (count < 0 || count > (bytes.Length - index)) {
506 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
// Pin the input so the pointer overload can walk it directly.
511 fixed (byte *bptr = bytes)
513 return InternalGetCharCount (bptr + index, count,
514 leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
516 return InternalGetCharCount (bptr + index, count,
517 leftOverBits, leftOverCount, throwOnInvalid, flush);
// Pointer-based character counter for a UTF-8 byte run.
//   leftOverBits  - payload bits gathered so far for a partial sequence.
//   leftOverCount - packed state: low nibble = bytes consumed so far,
//                   next nibble = total sequence size (see the unpacking
//                   into leftSoFar / leftSize below).
//   flush         - when true, an incomplete trailing sequence is reported
//                   as invalid.
// Invalid input is reported either through the decoder fallback (its
// Remaining count is added to the length) or by throwing ArgumentException.
// NOTE(review): two signatures appear back to back and each error site shows
// both a Fallback call and a throw; these look like alternative build
// variants whose separating lines are not visible in this view.  The
// statements that add to `length` for valid characters are not visible
// either.
522 private unsafe static int InternalGetCharCount (
523 byte* bytes, int count, uint leftOverBits,
524 uint leftOverCount, object provider,
525 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
527 private unsafe static int InternalGetCharCount (
528 byte* bytes, int count, uint leftOverBits,
529 uint leftOverCount, bool throwOnInvalid, bool flush)
// Fast path: with no partial sequence pending, scan the leading ASCII run.
536 if (leftOverCount == 0) {
537 int end = index + count;
538 for (; index < end; index++, count--) {
539 if (bytes [index] < 0x80)
546 // Determine the number of characters that we have.
548 uint leftBits = leftOverBits;
549 uint leftSoFar = (leftOverCount & (uint)0x0F);
550 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
552 ch = (uint)(bytes[index++]);
555 // Process a UTF-8 start character.
556 if (ch < (uint)0x0080) {
557 // Single-byte UTF-8 character.
// Lead byte 110xxxxx: keep the 5 payload bits.
559 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
560 // Double-byte UTF-8 character.
561 leftBits = (ch & (uint)0x1F);
// Lead byte 1110xxxx: keep the 4 payload bits.
564 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
565 // Three-byte UTF-8 character.
566 leftBits = (ch & (uint)0x0F);
// Lead byte 11110xxx: keep the 3 payload bits.
569 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
570 // Four-byte UTF-8 character.
571 leftBits = (ch & (uint)0x07);
// Lead byte 111110xx: keep the 2 payload bits.
574 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
575 // Five-byte UTF-8 character.
576 leftBits = (ch & (uint)0x03);
// Lead byte 1111110x: only one payload bit exists; because bit 1 is zero
// for 0xFC/0xFD, masking with 0x03 yields the same value as 0x01.
579 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
580 // Six-byte UTF-8 character.
581 leftBits = (ch & (uint)0x03);
585 // Invalid UTF-8 start character.
587 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
590 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
594 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx; append their 6 payload bits.
595 if ((ch & (uint)0xC0) == (uint)0x80) {
596 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
597 if (++leftSoFar >= leftSize) {
598 // We have a complete character now.
599 if (leftBits < (uint)0x10000) {
600 // is it an overlong ?
// Each threshold below is the largest value a shorter sequence could have
// encoded; presumably selected by sequence size (the selecting lines are
// not visible in this view).
601 bool overlong = false;
604 overlong = (leftBits <= 0x7F);
607 overlong = (leftBits <= 0x07FF);
610 overlong = (leftBits <= 0xFFFF);
613 overlong = (leftBits <= 0x1FFFFF);
616 overlong = (leftBits <= 0x03FFFFFF);
621 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
624 throw new ArgumentException (_("Overlong"), leftBits.ToString ());
// 0x10000..0x10FFFF: supplementary-plane code point.
629 } else if (leftBits < (uint)0x110000) {
633 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
636 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
642 // Invalid UTF-8 sequence: clear and restart.
644 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
647 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
655 if (flush && leftSize != 0) {
656 // We had left-over bytes that didn't make up
657 // a complete UTF-8 character sequence.
659 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index);
662 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
666 // Return the final length to the caller.
// Fallback helper for the GetCharCount() path.
//
// Feeds the single invalid byte at bytes[index] to the decoder fallback and
// reports how many replacement characters it will produce.
//
//   provider  - either a DecoderFallback (a new buffer is created from it)
//               or a Decoder (its FallbackBuffer is used).
//   buffer    - cached fallback buffer; obtained on first use and handed
//               back to the caller through the ref parameter.
//   bufferArg - cached one-element scratch array, allocated on first use.
//
// Returns buffer.Remaining after the fallback has been invoked.
static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int index)
{
	if (buffer == null) {
		// The provider is one of two kinds; pick the matching way of
		// getting at a fallback buffer.
		DecoderFallback fallback = provider as DecoderFallback;
		buffer = (fallback != null)
			? fallback.CreateFallbackBuffer ()
			: ((Decoder) provider).FallbackBuffer;
	}

	if (bufferArg == null)
		bufferArg = new byte [1];

	// Hand exactly the offending byte to the fallback.
	bufferArg [0] = bytes [index];
	buffer.Fallback (bufferArg, 0);

	return buffer.Remaining;
}
// Fallback helper for the GetChars() path.
//
// Feeds the single invalid byte at bytes[byteIndex] to the decoder fallback
// and copies every replacement character it yields into `chars`, advancing
// charIndex past what was written.  No capacity check is performed here, so
// the caller must guarantee that `chars` has room.
//
//   provider  - either a DecoderFallback (a new buffer is created from it)
//               or a Decoder (its FallbackBuffer is used).
//   buffer    - cached fallback buffer; obtained on first use.
//   bufferArg - cached one-element scratch array, allocated on first use.
static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int byteIndex,
	char* chars, ref int charIndex)
{
	if (buffer == null) {
		DecoderFallback fallback = provider as DecoderFallback;
		buffer = (fallback != null)
			? fallback.CreateFallbackBuffer ()
			: ((Decoder) provider).FallbackBuffer;
	}

	if (bufferArg == null)
		bufferArg = new byte [1];

	// Hand exactly the offending byte to the fallback.
	bufferArg [0] = bytes [byteIndex];
	buffer.Fallback (bufferArg, 0);

	// Drain the replacement characters into the output.
	while (buffer.Remaining > 0) {
		chars [charIndex] = buffer.GetNextChar ();
		charIndex++;
	}
}
708 // Get the number of characters needed to decode a byte buffer.
// Stateless wrapper: starts with no left-over bits (0, 0) and flush = true.
// NOTE(review): two delegating returns appear back to back (fallback-based
// and throwOnInvalid-based); they look like alternative build variants whose
// separating lines are not visible in this view.
709 public override int GetCharCount (byte[] bytes, int index, int count)
712 DecoderFallbackBuffer buf = null;
713 byte [] bufferArg = null;
714 return InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
716 return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
// Pointer overload of GetCharCount: counts the characters in `count` bytes
// starting at `bytes`, with no left-over state and flush = true, using this
// encoding's DecoderFallback for invalid input.  No argument validation is
// visible in this overload.
721 [CLSCompliant (false)]
723 public unsafe override int GetCharCount (byte* bytes, int count)
725 DecoderFallbackBuffer buf = null;
726 byte [] bufferArg = null;
727 return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
731 // Get the characters that result from decoding a byte buffer.
//
// Array front end: validates the arguments, pins `chars` (and `bytes` when
// there is input) and delegates to the pointer overload.  The output
// capacity passed on is chars.Length - charIndex.  With no input
// (byteCount == 0 or byteIndex at the end) a null byte pointer is passed.
// NOTE(review): two signatures and two sets of delegating calls appear back
// to back (fallback-based and throwOnInvalid-based); they look like
// alternative build variants whose separating lines are not visible in this
// view.
// Throws ArgumentNullException ("bytes" / "chars") and
// ArgumentOutOfRangeException ("byteIndex" / "byteCount" / "charIndex").
733 private unsafe static int InternalGetChars (
734 byte[] bytes, int byteIndex, int byteCount, char[] chars,
735 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
737 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
739 private unsafe static int InternalGetChars (
740 byte[] bytes, int byteIndex, int byteCount, char[] chars,
741 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
742 bool throwOnInvalid, bool flush)
745 // Validate the parameters.
// NOTE(review): the null tests guarding the next two throws are not visible
// in this view.
747 throw new ArgumentNullException ("bytes");
750 throw new ArgumentNullException ("chars");
752 if (byteIndex < 0 || byteIndex > bytes.Length) {
753 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
755 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
756 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
758 if (charIndex < 0 || charIndex > chars.Length) {
759 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// Output position at the very end of `chars` (the statement executed for
// this case is not visible in this view).
762 if (charIndex == chars.Length)
765 fixed (char* cptr = chars) {
767 if (byteCount == 0 || byteIndex == bytes.Length)
768 return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
770 fixed (byte* bptr = bytes)
771 return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
773 if (byteCount == 0 || byteIndex == bytes.Length)
774 return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
776 fixed (byte* bptr = bytes)
777 return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
// Pointer-based UTF-8 -> UTF-16 converter.
//   bytes/byteCount - input bytes.
//   chars/charCount - output buffer and its capacity.
//   leftOverBits    - payload bits of a partial sequence, carried by ref.
//   leftOverCount   - packed state carried by ref: low nibble = bytes
//                     consumed so far, next nibble = total sequence size.
//   flush           - when true, an incomplete trailing sequence is reported
//                     as invalid.
// Returns the number of characters written (posn - charIndex).  Throws
// ArgumentException when the output is too small, and (in the throwing
// variant) for invalid or overlong input.
// NOTE(review): two signatures appear back to back and each error site shows
// both a Fallback call and a throw; these look like alternative build
// variants whose separating lines are not visible in this view.
783 private unsafe static int InternalGetChars (
784 byte* bytes, int byteCount, char* chars, int charCount,
785 ref uint leftOverBits, ref uint leftOverCount,
787 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
789 private unsafe static int InternalGetChars (
790 byte* bytes, int byteCount, char* chars, int charCount,
791 ref uint leftOverBits, ref uint leftOverCount,
792 bool throwOnInvalid, bool flush)
795 int charIndex = 0, byteIndex = 0;
796 int length = charCount;
797 int posn = charIndex;
// Fast path: with no partial sequence pending, copy the leading ASCII run.
// NOTE(review): no comparison of posn against length is visible inside this
// loop -- confirm the output pointer cannot be overrun.
799 if (leftOverCount == 0) {
800 int end = byteIndex + byteCount;
801 for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
802 if (bytes [byteIndex] < 0x80)
803 chars [posn] = (char) bytes [byteIndex];
809 // Convert the bytes into the output buffer.
811 uint leftBits = leftOverBits;
812 uint leftSoFar = (leftOverCount & (uint)0x0F);
813 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
815 int byteEnd = byteIndex + byteCount;
816 for(; byteIndex < byteEnd; byteIndex++) {
817 // Fetch the next character from the byte buffer.
818 ch = (uint)(bytes[byteIndex]);
820 // Process a UTF-8 start character.
821 if (ch < (uint)0x0080) {
822 // Single-byte UTF-8 character.
823 if (posn >= length) {
824 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
826 chars[posn++] = (char)ch;
// Lead byte 110xxxxx: keep the 5 payload bits.
827 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
828 // Double-byte UTF-8 character.
829 leftBits = (ch & (uint)0x1F);
// Lead byte 1110xxxx: keep the 4 payload bits.
832 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
833 // Three-byte UTF-8 character.
834 leftBits = (ch & (uint)0x0F);
// Lead byte 11110xxx: keep the 3 payload bits.
837 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
838 // Four-byte UTF-8 character.
839 leftBits = (ch & (uint)0x07);
// Lead byte 111110xx: keep the 2 payload bits.
842 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
843 // Five-byte UTF-8 character.
844 leftBits = (ch & (uint)0x03);
// Lead byte 1111110x: only one payload bit exists; because bit 1 is zero
// for 0xFC/0xFD, masking with 0x03 yields the same value as 0x01.
847 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
848 // Six-byte UTF-8 character.
849 leftBits = (ch & (uint)0x03);
853 // Invalid UTF-8 start character.
855 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
858 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
862 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx; append their 6 payload bits.
863 if ((ch & (uint)0xC0) == (uint)0x80) {
864 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
865 if (++leftSoFar >= leftSize) {
866 // We have a complete character now.
867 if (leftBits < (uint)0x10000) {
868 // is it an overlong ?
// Each threshold below is the largest value a shorter sequence could have
// encoded; presumably selected by sequence size (the selecting lines are
// not visible in this view).
869 bool overlong = false;
872 overlong = (leftBits <= 0x7F);
875 overlong = (leftBits <= 0x07FF);
878 overlong = (leftBits <= 0xFFFF);
881 overlong = (leftBits <= 0x1FFFFF);
884 overlong = (leftBits <= 0x03FFFFFF);
889 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
892 throw new ArgumentException (_("Overlong"), leftBits.ToString ());
// Values in 0xD800..0xDFFF are surrogate code units and are rejected.
895 else if ((leftBits & 0xF800) == 0xD800) {
896 // UTF-8 doesn't use surrogate characters
898 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
901 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
905 if (posn >= length) {
906 throw new ArgumentException
907 (_("Arg_InsufficientSpace"), "chars");
909 chars[posn++] = (char)leftBits;
// 0x10000..0x10FFFF: emit a surrogate pair, which needs two output slots.
911 } else if (leftBits < (uint)0x110000) {
912 if ((posn + 2) > length) {
913 throw new ArgumentException
914 (_("Arg_InsufficientSpace"), "chars");
916 leftBits -= (uint)0x10000;
917 chars[posn++] = (char)((leftBits >> 10) +
920 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
923 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
926 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
932 // Invalid UTF-8 sequence: clear and restart.
934 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
937 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
944 if (flush && leftSize != 0) {
945 // We had left-over bytes that didn't make up
946 // a complete UTF-8 character sequence.
948 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
951 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// Hand the partial-sequence state back to the caller for the next call.
954 leftOverBits = leftBits;
955 leftOverCount = (leftSoFar | (leftSize << 4));
957 // Return the final length to the caller.
958 return posn - charIndex;
961 // Get the characters that result from decoding a byte buffer.
// Stateless wrapper: starts with zeroed left-over state and flush = true.
// NOTE(review): two delegating returns appear back to back (fallback-based
// and throwOnInvalid-based); they look like alternative build variants whose
// separating lines are not visible in this view.
962 public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
963 char[] chars, int charIndex)
965 uint leftOverBits = 0;
966 uint leftOverCount = 0;
968 DecoderFallbackBuffer buf = null;
969 byte [] bufferArg = null;
970 return InternalGetChars (bytes, byteIndex, byteCount, chars,
971 charIndex, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, ref bufferArg, true);
973 return InternalGetChars (bytes, byteIndex, byteCount, chars,
974 charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
// Pointer overload of GetChars: decodes `byteCount` bytes into a buffer of
// `charCount` characters, starting with zeroed left-over state and
// flush = true, using this encoding's DecoderFallback for invalid input.
// No argument validation is visible in this overload.
979 [CLSCompliant (false)]
981 public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
983 DecoderFallbackBuffer buf = null;
984 byte [] bufferArg = null;
985 uint leftOverBits = 0;
986 uint leftOverCount = 0;
987 return InternalGetChars (bytes, byteCount, chars,
988 charCount, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, ref bufferArg, true);
// Get the maximum number of bytes needed to encode a
// specified number of characters.
//
// The bound used is four bytes per UTF-16 code unit.
//
// Throws ArgumentOutOfRangeException ("charCount") when charCount is
// negative, or when the resulting byte count does not fit in an int.
// Previously the multiplication silently wrapped around for large inputs
// and returned a meaningless (possibly negative) value.
public override int GetMaxByteCount (int charCount)
{
	if (charCount < 0)
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));

	// Compute in 64 bits so the range check itself cannot overflow.
	long maxBytes = (long) charCount * 4;
	if (maxBytes > int.MaxValue)
		throw new ArgumentOutOfRangeException ("charCount");

	return (int) maxBytes;
}
1002 // Get the maximum number of characters needed to decode a
1003 // specified number of bytes.
// Throws ArgumentOutOfRangeException ("byteCount") for a negative count.
// NOTE(review): the return statement is not visible in this view.
1004 public override int GetMaxCharCount (int byteCount)
1006 if (byteCount < 0) {
1007 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
1012 // Get a UTF8-specific decoder that is attached to this instance.
// Returns a new UTF8Decoder on every call.
// NOTE(review): two returns appear back to back (one passing the
// DecoderFallback, one passing throwOnInvalid); they look like alternative
// build variants whose separating lines are not visible in this view.
1013 public override Decoder GetDecoder ()
1016 return new UTF8Decoder (DecoderFallback);
1018 return new UTF8Decoder (throwOnInvalid);
1022 // Get a UTF8-specific encoder that is attached to this instance.
// Returns a new UTF8Encoder on every call, seeded with this encoding's
// emitIdentifier flag.
1023 public override Encoder GetEncoder ()
1025 return new UTF8Encoder (emitIdentifier);
// Get the UTF8 preamble.
//
// Returns the three identifier bytes EF BB BF when this encoding was
// constructed to emit the UTF-8 identifier, and an empty array otherwise.
// A fresh array is allocated on every call.
public override byte[] GetPreamble ()
{
	if (!emitIdentifier)
		return new byte [0];

	return new byte [] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
}
1042 // Determine if this object is equal to another.
// `value` is converted with `as`, so a non-UTF8Encoding argument yields a
// null `enc` (the null handling is not visible in this view).
// NOTE(review): two return expressions appear back to back (one comparing
// the fallbacks, one comparing throwOnInvalid); they look like alternative
// build variants whose separating lines are not visible in this view.
// NOTE(review): the fallbacks are compared with `==`.  The constructor
// allocates a new decoder fallback object per instance, so unless the
// fallback types overload `==` this is a reference comparison and two
// identically configured encodings would compare unequal -- confirm, and
// consider Equals().
1043 public override bool Equals (Object value)
1045 UTF8Encoding enc = (value as UTF8Encoding);
1048 return (codePage == enc.codePage &&
1049 emitIdentifier == enc.emitIdentifier &&
1050 DecoderFallback == enc.DecoderFallback &&
1051 EncoderFallback == enc.EncoderFallback);
1053 return (codePage == enc.codePage &&
1054 emitIdentifier == enc.emitIdentifier &&
1055 throwOnInvalid == enc.throwOnInvalid);
1062 // Get the hash code for this object.
// Defers entirely to the base class; none of this class's own fields
// (emitIdentifier, throwOnInvalid) contribute to the hash.
1063 public override int GetHashCode ()
1065 return base.GetHashCode ();
// String overload of GetByteCount that simply forwards to the base class
// implementation.
1070 public override int GetByteCount (string s)
1072 // hmm, does this override make any sense?
1073 return base.GetByteCount (s);
// GetString override that simply forwards to the base class implementation.
1077 [ComVisible (false)]
1078 public override string GetString (byte [] bytes, int index, int count)
1080 // hmm, does this override make any sense?
1081 return base.GetString (bytes, index, count);
// Encode an entire string into a newly allocated byte array.
//
// The array is sized with GetByteCount (s) and then filled by the
// five-argument GetBytes overload, so its length is exactly the number of
// bytes GetByteCount reported.
//
// Throws ArgumentNullException ("s") when s is null.
public override byte [] GetBytes (String s)
{
	if (s == null)
		throw new ArgumentNullException ("s");

	byte [] encoded = new byte [GetByteCount (s)];
	GetBytes (s, 0, s.Length, encoded, 0);
	return encoded;
}
1098 // UTF-8 decoder implementation.
// Stateful decoder: keeps the bits and packed progress count of a partial
// multi-byte sequence between calls and always calls the shared helpers
// with flush = false.
// NOTE(review): two constructors and two delegating returns per method
// appear back to back (fallback-based and throwOnInvalid-based); they look
// like alternative build variants whose separating lines are not visible in
// this view.
1100 private class UTF8Decoder : Decoder
1103 private bool throwOnInvalid;
// Partial-sequence state handed to the InternalGet* helpers.
1105 private uint leftOverBits;
1106 private uint leftOverCount;
1110 public UTF8Decoder (DecoderFallback fallback)
1112 public UTF8Decoder (bool throwOnInvalid)
1116 Fallback = fallback;
1118 this.throwOnInvalid = throwOnInvalid;
1124 // Override inherited methods.
// Counting passes the state by value, so it does not alter the stored
// left-over fields.
1125 public override int GetCharCount (byte[] bytes, int index, int count)
1128 DecoderFallbackBuffer buf = null;
1129 byte [] bufferArg = null;
1130 return InternalGetCharCount (bytes, index, count,
1131 leftOverBits, leftOverCount, this, ref buf, ref bufferArg, false);
1133 return InternalGetCharCount (bytes, index, count,
1134 leftOverBits, leftOverCount, throwOnInvalid, false);
// Conversion passes the state by ref, so it is updated for the next call.
1137 public override int GetChars (byte[] bytes, int byteIndex,
1138 int byteCount, char[] chars, int charIndex)
1141 DecoderFallbackBuffer buf = null;
1142 byte [] bufferArg = null;
1143 return InternalGetChars (bytes, byteIndex, byteCount,
1144 chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref buf, ref bufferArg, false);
1146 return InternalGetChars (bytes, byteIndex, byteCount,
1147 chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
1151 } // class UTF8Decoder
1153 // UTF-8 encoder implementation.
// Stateful encoder: keeps a pending surrogate start between calls, with
// separate state for counting and for converting so that a GetByteCount
// call does not disturb an in-progress GetBytes sequence.
1155 private class UTF8Encoder : Encoder
// NOTE(review): emitIdentifier is assigned in the constructor and cleared
// after each GetBytes call, but no read of it is visible in this class.
1157 private bool emitIdentifier;
1158 private char leftOverForCount;
1159 private char leftOverForConv;
1162 public UTF8Encoder (bool emitIdentifier)
1164 this.emitIdentifier = emitIdentifier;
1165 leftOverForCount = '\0';
1166 leftOverForConv = '\0';
1169 // Override inherited methods.
1170 public override int GetByteCount (char[] chars, int index,
1171 int count, bool flush)
1173 return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
// NOTE(review): `result` is assigned here but its declaration and the
// return statement are not visible in this view.
1175 public override int GetBytes (char[] chars, int charIndex,
1176 int charCount, byte[] bytes, int byteIndex, bool flush)
1179 result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
1180 emitIdentifier = false;
// Pointer-based overloads; same state handling as the array versions.
1185 public unsafe override int GetByteCount (char* chars, int count, bool flush)
1187 return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
1190 public unsafe override int GetBytes (char* chars, int charCount,
1191 byte* bytes, int byteCount, bool flush)
1194 result = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
1195 emitIdentifier = false;
1200 } // class UTF8Encoder
1202 }; // class UTF8Encoding
1204 }; // namespace System.Text