2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
30 using System.Runtime.InteropServices;
// UTF-8 text encoding built on the Encoding base class. The MonoLimitation
// attribute records that its serialization format differs from .NET's.
33 [MonoLimitation ("Serialization format not compatible with .NET")]
35 public class UTF8Encoding : Encoding
37 // Magic number used by Windows for UTF-8.
38 internal const int UTF8_CODE_PAGE = 65001;
// Whether the caller asked for the UTF-8 identifier (0xEF 0xBB 0xBF);
// set by the constructor, compared in Equals and handed to UTF8Encoder
// by GetEncoder.
41 private bool emitIdentifier;
// Default constructor: identifier not requested, throwOnInvalidBytes off.
44 public UTF8Encoding () : this (false, false) {}
// Convenience constructor: the caller chooses whether the UTF-8 identifier
// is requested; throwOnInvalidBytes is left off.
45 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
46 : this (encoderShouldEmitUTF8Identifier, false) {}
// Main constructor. Stores the identifier preference, installs the
// encoder/decoder fallbacks and fills in the descriptive encoding metadata.
// throwOnInvalidBytes selects the exception fallbacks; the
// StandardSafeFallback pair is presumably the alternative branch -- confirm.
48 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
49 : base (UTF8_CODE_PAGE)
51 emitIdentifier = encoderShouldEmitUTF8Identifier;
52 if (throwOnInvalidBytes)
53 SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
55 SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
57 web_name = body_name = header_name = "utf-8";
58 encoding_name = "Unicode (UTF-8)";
59 is_browser_save = true;
60 is_browser_display = true;
61 is_mail_news_display = true;
62 is_mail_news_save = true;
// NOTE(review): this assigns UnicodeEncoding's code page constant rather
// than UTF8_CODE_PAGE; presumably intentional -- confirm.
63 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
66 ///////////////////////////////////////////////////////////////////////
67 // INTERNAL DECODING FUNCTION (UTF8 -> CHAR/UTF16)
68 ///////////////////////////////////////////////////////////////////////
// Result codes of the internal decoding helpers. Values referenced by the
// decoding code in this file: Ok, InputRunOut, InsufficientSpace,
// InvalidChar, InvalidStart, InvalidSequence, Overlong and SurrogateFound.
70 internal enum DecoderStatus {
81 // The following method decodes one UTF-8 character from a byte buffer.
82 // NOTE: If 'charCount' is < 0, this function only counts bytes and
83 // chars without writing anything.
84 // NOTE: BOM (0xEF 0xBB 0xBF) is not yet supported.
85 // See http://www.cl.cam.ac.uk/~mgk25/unicode.html
//
// State carried between calls (all three are reset to 0 once a character
// has been completed):
//   leftBytes - continuation bytes still expected (low 4 bits); bit 0x10
//               preserves the 'checkByte' flag when the input runs out
//   leftBits  - code point bits accumulated so far
//   procBytes - raw bytes of the current sequence consumed so far,
//               packed 8 bits each
// Returns a DecoderStatus describing how decoding of this character ended.
86 private unsafe static DecoderStatus InternalGetChar (
87 byte* bytes, int byteCount,
88 char* chars, int charCount,
89 out int bytesProcessed, out int charsProcessed,
90 ref uint leftBytes, ref uint leftBits, ref uint procBytes)
99 // Fetch the start character from the byte buffer.
100 if (leftBytes == 0) {
102 return DecoderStatus.InputRunOut;
103 ch = (uint) (*bytes++);
107 if (ch < (uint) 0x0080) {
108 // Single-byte UTF-8 character.
// Lead bytes 0xC0 and 0xC1 are rejected up front.
111 } else if (ch == (uint) 0xc0 || ch == (uint) 0xc1) {
113 return DecoderStatus.InvalidChar;
114 } else if ((ch & (uint) 0xE0) == (uint) 0xC0) {
115 // Double-byte UTF-8 character.
116 leftBits = ((ch & (uint) 0x1F) << 6*1);
118 } else if ((ch & (uint) 0xF0) == (uint) 0xE0) {
119 // Three-byte UTF-8 character.
120 leftBits = ((ch & (uint) 0x0F) << 6*2);
122 } else if ((ch & (uint) 0xF8) == (uint) 0xF0) {
123 // Four-byte UTF-8 character.
124 leftBits = ((ch & (uint) 0x07) << 6*3);
126 // extra check for detecting as soon as
127 // possible too big four-byte utf chars
128 if (leftBits >= (uint) 0x110000)
129 return DecoderStatus.InvalidChar;
131 // Invalid five-or-six-byte or start char
132 // NOTE: I keep here the code for 5/6 bytes if
133 // needed, but technically these combinations
134 // are invalid in UTF-8 sequences.
135 // (ch & (uint) 0xFC) == (uint) 0xF8 =>
136 // leftBits = ch & (uint) 0x03;
138 // (ch & (uint) 0xFE) == (uint) 0xFC =>
139 // leftBits = ch & (uint) 0x01;
141 leftBits = leftBytes = 0;
142 return DecoderStatus.InvalidStart;
// The overlong test on the next byte is only armed when the lead byte
// contributed no payload bits.
144 checkByte = (leftBytes > 0 && leftBits == 0);
// Presumably the resume path (leftBytes != 0 on entry): recover the saved
// checkByte flag from bit 0x10 and keep only the remaining-byte count.
147 checkByte = (leftBytes >> 4) != 0;
148 leftBytes &= (uint) 0x0f;
151 // process the required bytes...
152 for (; leftBytes > 0; leftBytes--) {
153 if (byteCount == 0) {
// Out of input in the middle of a sequence: stash checkByte in bit 0x10
// of leftBytes so the next call can pick it up again.
154 leftBytes = ((uint) (checkByte ? 0x10 : 0x00)) | leftBytes;
155 return DecoderStatus.InputRunOut;
157 ch = (uint) (*bytes++);
// A continuation byte must have the bit pattern 10xxxxxx.
158 if ((ch & (uint) 0xC0) != (uint) 0x80) {
159 // Invalid UTF-8 sequence: clear and restart.
160 // NOTE: we return before counting the
161 // processed bytes for restarting
162 // decoding later at this point
163 return DecoderStatus.InvalidSequence;
167 procBytes = (procBytes << 8) | ch;
// '-' binds tighter than '>>', so the mask is 0x1f shifted right by
// (leftBytes - 2).
168 if (checkByte && ((~((uint) 0x1f >> (int) leftBytes - 2)) & ch) == 0x80) {
169 // detected an overlong sequence :(
170 return DecoderStatus.Overlong;
// Each continuation byte supplies 6 payload bits, positioned by how many
// bytes are still outstanding.
173 leftBits = leftBits | ((ch & (uint) 0x3F) << (6*(int) (leftBytes - 1)));
174 if (leftBits >= (uint) 0x110000) {
175 // this UTF-8 is too big ...
176 return DecoderStatus.InvalidChar;
178 if ((leftBits & 0xF800) == 0xD800) {
179 // UTF-8 doesn't use surrogate characters
180 return DecoderStatus.SurrogateFound;
184 // convert this character to UTF-16
185 if (leftBits < (uint) 0x10000) {
186 if (charCount >= 0) {
188 return DecoderStatus.InsufficientSpace;
189 *chars = (char) leftBits;
// Presumably the branch for values of 0x10000 and above, which are written
// as a high/low surrogate pair.
193 if (charCount >= 0) {
195 return DecoderStatus.InsufficientSpace;
196 leftBits -= (uint) 0x10000;
197 *chars++ = (char) ((leftBits >> 10) + (uint) 0xD800);
198 *chars++ = (char) ((leftBits & (uint) 0x3FF) + (uint) 0xDC00);
203 // we've read a complete char... reset decoder status and finish
204 leftBytes = leftBits = procBytes = 0;
205 return DecoderStatus.Ok;
208 // This function is called when we want to flush the decoder state
209 // (i.e. in case of invalid UTF-8 characters or interrupted sequences)
// It rebuilds the pending raw bytes from 'procBytes', hands them to the
// fallback buffer and then writes (or only counts) the replacement chars,
// advancing 'charsProcessed' accordingly. On success the decoder state
// (leftBits, leftBytes, procBytes) is cleared.
210 internal unsafe static DecoderStatus InternalGetCharsFlush (
211 char* chars, int charCount,
212 DecoderFallbackBuffer fallbackBuffer,
214 int bytesProcessed, ref int charsProcessed,
215 ref uint leftBytes, ref uint leftBits, ref uint procBytes)
217 // if there is nothing to flush, then exit silently
219 return DecoderStatus.Ok;
220 // now we build a 'bytesUnknown' array with the
221 // stored bytes in 'procBytes'.
// 'extra' presumably ends up as the number of bytes packed in procBytes.
223 for (uint t = procBytes; t != 0; extra++)
225 byte [] bytesUnknown = new byte [extra];
// Unpack procBytes so that its most significant stored byte lands at
// index 0 and its least significant byte at index extra - 1.
226 for (int i = extra; i > 0; i--)
227 bytesUnknown [i - 1] = (byte) ((procBytes >> (8 * (extra - i))) & 0xff);
228 // partial reset: this condition avoids infinite loops
229 if (s == DecoderStatus.InvalidSequence)
231 // call the fallback and cross fingers
232 fallbackBuffer.Fallback (bytesUnknown, bytesProcessed - extra);
// Drain the fallback chars into the output, stopping as soon as the
// destination is full.
234 while (fallbackBuffer.Remaining > 0) {
235 if (charsProcessed >= charCount)
236 return DecoderStatus.InsufficientSpace;
237 chars [charsProcessed++] = fallbackBuffer.GetNextChar ();
// Presumably the counting branch: only the number of fallback chars is
// added, nothing is written.
240 charsProcessed += fallbackBuffer.Remaining;
241 fallbackBuffer.Reset ();
243 // recovery was successful, flush decoder state
244 leftBits = leftBytes = procBytes = 0;
246 return DecoderStatus.Ok;
249 // InternalGetChars processor. Can decode or count space needed for
250 // decoding, depending on the enabled mode:
252 // enabled when charCount >= 0 (but chars may be null)
254 // enabled when chars == null && charCount < 0
//
// Drives the decoding loop: one character is fetched per iteration, the
// per-character counts are added to bytesProcessed/charsProcessed, and
// error statuses are routed through InternalGetCharsFlush so the fallback
// can produce replacement output. leftBytes/leftBits/procBytes carry the
// partial-sequence state across calls.
255 internal unsafe static DecoderStatus InternalGetChars (
256 byte* bytes, int byteCount,
257 char* chars, int charCount,
258 DecoderFallbackBuffer fallbackBuffer,
259 out int bytesProcessed, out int charsProcessed,
260 ref uint leftBytes, ref uint leftBits, ref uint procBytes,
264 int t_bytesProcessed, t_charsProcessed;
266 // Validate parameters
268 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
270 if (byteCount > 0 && bytes == null)
271 throw new ArgumentNullException ("bytes");
274 throw new ArgumentNullException ("chars");
277 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
284 // byte processing loop
285 while (byteCount - bytesProcessed > 0) {
286 // fetch a char from the input byte array
// Two call shapes follow: the first passes the output window
// (chars + charsProcessed, remaining capacity); the second is presumably
// the counting variant.
289 bytes + bytesProcessed, byteCount - bytesProcessed,
290 chars + charsProcessed, charCount - charsProcessed,
291 out t_bytesProcessed, out t_charsProcessed,
292 ref leftBytes, ref leftBits, ref procBytes)
294 bytes + bytesProcessed, byteCount - bytesProcessed,
296 out t_bytesProcessed, out t_charsProcessed,
297 ref leftBytes, ref leftBits, ref procBytes);
299 // if not enough space return here
300 // NOTE: maybe we should restore the original encoder
301 // state ... we should check what ms do in this case
302 if(s == DecoderStatus.InsufficientSpace)
303 return DecoderStatus.InsufficientSpace;
// Commit the progress made on this character before acting on the status.
306 charsProcessed += t_charsProcessed;
307 bytesProcessed += t_bytesProcessed;
310 case DecoderStatus.Ok:
311 break; // everything OK :D
// Every kind of malformed input is handled the same way: flush through the
// fallback buffer.
313 case DecoderStatus.Overlong:
314 case DecoderStatus.InvalidSequence:
315 case DecoderStatus.InvalidStart:
316 case DecoderStatus.InvalidChar:
317 case DecoderStatus.SurrogateFound:
318 s = InternalGetCharsFlush (
322 bytesProcessed, ref charsProcessed,
323 ref leftBytes, ref leftBits, ref procBytes);
324 if (s != DecoderStatus.Ok)
// Input ended mid-sequence: the pending bytes are either flushed or
// InputRunOut is reported (the selecting condition is presumably the
// caller's flush request -- confirm).
328 case DecoderStatus.InputRunOut:
330 ? InternalGetCharsFlush (
334 bytesProcessed, ref charsProcessed,
335 ref leftBytes, ref leftBits, ref procBytes)
336 : DecoderStatus.InputRunOut;
340 ? InternalGetCharsFlush (
344 bytesProcessed, ref charsProcessed,
345 ref leftBytes, ref leftBits, ref procBytes)
// Pointer-based decode entry point: rejects out-of-range byteCount/charCount
// with ArgumentOutOfRangeException and then forwards to InternalGetChars.
349 internal unsafe static DecoderStatus InternalGetCharsDecode (
350 byte* bytes, int byteCount,
351 char* chars, int charCount,
352 DecoderFallbackBuffer fallbackBuffer,
353 out int bytesProcessed, out int charsProcessed,
354 ref uint leftBytes, ref uint leftBits, ref uint procBytes,
358 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
360 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
362 return InternalGetChars (
366 out bytesProcessed, out charsProcessed,
367 ref leftBytes, ref leftBits, ref procBytes,
// Array-based decode entry point: validates the arrays and index/count
// arguments, pins both arrays and forwards to InternalGetChars. The output
// capacity handed down is the space left in 'chars' after charIndex.
371 internal unsafe static DecoderStatus InternalGetCharsDecode (
372 byte[] bytes, int byteIndex, int byteCount,
373 char[] chars, int charIndex,
374 DecoderFallbackBuffer fallbackBuffer,
375 out int bytesProcessed, out int charsProcessed,
376 ref uint leftBytes, ref uint leftBits, ref uint procBytes,
380 throw new ArgumentNullException ("bytes");
382 throw new ArgumentNullException ("chars");
// An index equal to the array length is accepted (empty range at the end).
383 if (byteIndex < 0 || byteIndex > bytes.Length)
384 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
385 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
386 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
387 if (charIndex < 0 || charIndex > chars.Length)
388 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
390 fixed (char* cptr = chars) {
391 fixed (byte* bptr = bytes) {
392 return InternalGetChars (
393 bptr + byteIndex, byteCount,
394 cptr + charIndex, chars.Length - charIndex,
396 out bytesProcessed, out charsProcessed,
397 ref leftBytes, ref leftBits, ref procBytes,
// Pointer-based counting entry point: rejects an out-of-range byteCount and
// forwards to InternalGetChars; the result of interest is charsProcessed.
403 internal unsafe static DecoderStatus InternalGetCharsCount (
404 byte* bytes, int byteCount,
405 DecoderFallbackBuffer fallbackBuffer,
406 out int bytesProcessed, out int charsProcessed,
407 ref uint leftBytes, ref uint leftBits, ref uint procBytes,
411 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
413 return InternalGetChars (
417 out bytesProcessed, out charsProcessed,
418 ref leftBytes, ref leftBits, ref procBytes,
// Array-based counting entry point: validates 'bytes' and the index/count
// range, pins the array and forwards to InternalGetChars.
422 internal unsafe static DecoderStatus InternalGetCharsCount (
423 byte[] bytes, int byteIndex, int byteCount,
424 DecoderFallbackBuffer fallbackBuffer,
425 out int bytesProcessed, out int charsProcessed,
426 ref uint leftBytes, ref uint leftBits, ref uint procBytes,
430 throw new ArgumentNullException ("bytes");
431 if (byteIndex < 0 || byteIndex > bytes.Length)
432 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
433 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
434 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
436 fixed (byte* bptr = bytes) {
437 return InternalGetChars (
438 bptr + byteIndex, byteCount,
441 out bytesProcessed, out charsProcessed,
442 ref leftBytes, ref leftBits, ref procBytes,
447 ///////////////////////////////////////////////////////////////////////
448 // INTERNAL ENCODING FUNCTION (CHAR/UTF16 -> UTF8)
449 ///////////////////////////////////////////////////////////////////////
// Result codes of the internal encoding helpers. Values referenced by the
// encoding code in this file: Ok, InputRunOut, InsufficientSpace,
// InvalidChar and InvalidSurrogate.
451 internal enum EncoderStatus {
459 // The following method encodes one character as UTF-8 into a byte buffer.
460 // NOTE: If 'byteCount' is < 0, this function only counts used bytes
461 // without writing anything.
462 // NOTE: BOM (0xEF 0xBB 0xBF) is not yet supported.
463 // See http://www.cl.cam.ac.uk/~mgk25/unicode.html
//
// 'leftChar' carries a pending leading surrogate between iterations/calls;
// it is combined with the following trailing surrogate to form a four-byte
// sequence. Returns an EncoderStatus describing how the character ended.
464 private unsafe static EncoderStatus InternalGetByte (
465 char* chars, int charCount,
466 byte* bytes, int byteCount,
467 out int charsProcessed, out int bytesProcessed, ref uint leftChar)
475 // process one char (this block executes twice if a surrogate is found)
478 return EncoderStatus.InputRunOut;
483 // char counting is inside if for reason discussed in else
// One byte: values below 0x80.
486 if (ch < (uint) 0x80) {
487 if (byteCount >= 0) {
489 return EncoderStatus.InsufficientSpace;
490 *bytes++ = (byte) ch;
// Two bytes: values below 0x800. Note that '&' binds tighter than '|' in
// the two expressions below.
494 } else if (ch < (uint) 0x0800) {
495 if (byteCount >= 0) {
497 return EncoderStatus.InsufficientSpace;
498 *bytes++ = (byte) ((uint) 0xC0 | (ch >> 6) & 0x3f);
499 *bytes++ = (byte) ((uint) 0x80 | ch & 0x3f);
// Three bytes: everything else outside the surrogate range 0xD800-0xDFFF.
503 } else if (ch < (uint) 0xD800 || ch > (uint) 0xDFFF) {
504 if (byteCount >= 0) {
506 return EncoderStatus.InsufficientSpace;
507 *bytes++ = (byte) ((uint) 0xE0 | (ch >> 12));
508 *bytes++ = (byte) ((uint) 0x80 | ((ch >> 6) & 0x3F));
509 *bytes++ = (byte) ((uint) 0x80 | (ch & 0x3F));
// Leading surrogate (0xD800-0xDBFF).
513 } else if (ch <= (uint) 0xDBFF) {
514 // This is a surrogate char, repeat please
518 // We have a surrogate tail without
519 // leading surrogate.
521 return EncoderStatus.InvalidChar;
// Presumably reached when a leading surrogate is pending in leftChar.
524 if (ch >= (uint) 0xDC00 && ch <= (uint) 0xDFFF) {
525 // We have a correct surrogate pair.
// Combine the pending leading surrogate and this trailing surrogate into a
// single code point at or above 0x10000.
526 ch = 0x10000 + (uint) ch - (uint) 0xDC00
527 + ((leftChar - (uint) 0xD800) << 10);
528 if (byteCount >= 0) {
530 return EncoderStatus.InsufficientSpace;
531 *bytes++ = (byte) (0xF0 | (ch >> 18));
532 *bytes++ = (byte) (0x80 | ((ch >> 12) & 0x3F));
533 *bytes++ = (byte) (0x80 | ((ch >> 6) & 0x3F));
534 *bytes++ = (byte) (0x80 | (ch & 0x3F));
539 // We have a surrogate start followed by a
540 // regular character. Technically, this is
541 // invalid, so we fail :(
542 return EncoderStatus.InvalidSurrogate;
544 // increment counters; this is done after processing
545 // the surrogate: in case of a bad surrogate the
546 // encoding should restart on the faulty char (maybe
547 // the correct surrogate has been lost, and in this
548 // case the best option is to restart processing on the
549 // erroneous char to avoid losing more chars during the
555 return EncoderStatus.Ok;
558 // This function is called when we want to flush the encoder state
559 // (i.e. in case of invalid UTF-16 characters or dangling surrogates)
// The offending char held in 'leftChar' is passed to the fallback buffer;
// the replacement chars it yields are then encoded themselves and their
// byte count is added to 'bytesProcessed'.
560 internal unsafe static EncoderStatus InternalGetBytesFlush (
561 byte* bytes, int byteCount,
562 EncoderFallbackBuffer fallbackBuffer,
563 int charsProcessed, ref int bytesProcessed,
566 int t_charsProcessed, t_bytesProcessed;
568 // in normal circumstances fallbackBuffer never is null, except
569 // when we have called InternalGetBytes from this function
570 // (for avoiding infinite recursive calls)
571 if (fallbackBuffer == null)
572 return EncoderStatus.Ok;
574 // if there is nothing to flush, then return silently
576 return EncoderStatus.Ok;
578 // invalid UTF-16 or invalid surrogate
579 fallbackBuffer.Fallback ((char) leftChar, charsProcessed - 1);
580 // if we've arrived here we are working in replacement mode:
581 // build a replacement fallback_chars buffer
582 char[] fallback_chars = new char [fallbackBuffer.Remaining];
583 for (int i = 0; i < fallback_chars.Length; i++)
584 fallback_chars [i] = fallbackBuffer.GetNextChar ();
585 fallbackBuffer.Reset ();
586 // and encode it into UTF8 bytes...
587 fixed (char *fb_chars = fallback_chars) {
// The nested encode is chosen by whether an output buffer exists; in both
// shapes 'null' is passed as the fallback buffer, which makes a nested
// flush return immediately (see the null check at the top).
589 switch (bytes != null
591 fb_chars, fallback_chars.Length,
592 bytes + bytesProcessed, byteCount - bytesProcessed,
593 null, out t_charsProcessed, out t_bytesProcessed,
597 fb_chars, fallback_chars.Length,
599 null, out t_charsProcessed, out t_bytesProcessed,
602 case EncoderStatus.Ok:
604 bytesProcessed += t_bytesProcessed;
606 case EncoderStatus.InsufficientSpace:
607 return EncoderStatus.InsufficientSpace;
// Replacement text that cannot itself be encoded is treated as a caller
// error.
608 case EncoderStatus.InputRunOut:
609 case EncoderStatus.InvalidChar:
610 case EncoderStatus.InvalidSurrogate:
611 throw new ArgumentException ("Fallback chars are pure evil.", "fallback buffer bytes");
614 // flush encoder state
616 return EncoderStatus.Ok;
619 // InternalGetBytes processor. Can encode or count space needed for
620 // encoding, depending on the enabled mode:
622 // enabled when byteCount >= 0 (but bytes may be null)
624 // enabled when bytes == null && byteCount < 0
//
// Drives the encoding loop: one character is encoded per iteration, the
// per-character counts are added to charsProcessed/bytesProcessed, and
// invalid input is routed through InternalGetBytesFlush so the fallback can
// supply replacement output.
625 internal unsafe static EncoderStatus InternalGetBytes (
626 char* chars, int charCount,
627 byte* bytes, int byteCount,
628 EncoderFallbackBuffer fallbackBuffer,
629 out int charsProcessed, out int bytesProcessed,
634 int t_charsProcessed, t_bytesProcessed;
636 // Validate the parameters
638 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
640 if (charCount > 0 && chars == null)
641 throw new ArgumentNullException ("chars");
644 throw new ArgumentNullException ("bytes");
647 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
654 // char processing loop
655 while (charCount - charsProcessed > 0) {
// Two call shapes follow: the first passes the output window
// (bytes + bytesProcessed, remaining capacity); the second is presumably
// the counting variant.
658 chars + charsProcessed, charCount - charsProcessed,
659 bytes + bytesProcessed, byteCount - bytesProcessed,
660 out t_charsProcessed, out t_bytesProcessed, ref leftChar)
662 chars + charsProcessed, charCount - charsProcessed,
664 out t_charsProcessed, out t_bytesProcessed, ref leftChar);
666 // if not enough space return here
667 // NOTE: maybe we should restore the original encoder
668 // state ... we should check what ms do in this case
669 if(s == EncoderStatus.InsufficientSpace)
670 return EncoderStatus.InsufficientSpace;
// Commit the progress made on this character before acting on the status.
673 charsProcessed += t_charsProcessed;
674 bytesProcessed += t_bytesProcessed;
677 case EncoderStatus.Ok:
678 break; // everything OK :D
// Input ended with a pending surrogate: it is either flushed or InputRunOut
// is reported (the selecting condition is presumably the caller's flush
// request -- confirm).
680 case EncoderStatus.InputRunOut:
682 ? InternalGetBytesFlush (
685 charsProcessed, ref bytesProcessed,
687 : EncoderStatus.InputRunOut;
689 case EncoderStatus.InvalidChar:
690 case EncoderStatus.InvalidSurrogate:
691 s = InternalGetBytesFlush (
694 charsProcessed, ref bytesProcessed,
696 if (s != EncoderStatus.Ok)
702 ? InternalGetBytesFlush (
705 charsProcessed, ref bytesProcessed,
// Pointer-based encode entry point: rejects out-of-range charCount/byteCount
// with ArgumentOutOfRangeException and then forwards to InternalGetBytes.
710 internal unsafe static EncoderStatus InternalGetBytesEncode (
711 char* chars, int charCount,
712 byte* bytes, int byteCount,
713 EncoderFallbackBuffer fallbackBuffer,
714 out int charsProcessed, out int bytesProcessed,
719 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
721 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
723 return InternalGetBytes (
727 out charsProcessed, out bytesProcessed,
// Array-based encode entry point: validates the arrays and index/count
// arguments, pins both arrays and forwards to InternalGetBytes. The output
// capacity handed down is the space left in 'bytes' after byteIndex.
732 internal unsafe static EncoderStatus InternalGetBytesEncode (
733 char[] chars, int charIndex, int charCount,
734 byte[] bytes, int byteIndex,
735 EncoderFallbackBuffer fallbackBuffer,
736 out int charsProcessed, out int bytesProcessed,
741 throw new ArgumentNullException ("chars");
743 throw new ArgumentNullException ("bytes");
// An index equal to the array length is accepted (empty range at the end).
744 if (charIndex < 0 || charIndex > chars.Length)
745 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
746 if (charCount < 0 || charCount > (chars.Length - charIndex))
747 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
748 if (byteIndex < 0 || byteIndex > bytes.Length)
749 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
752 fixed (char *cptr = chars) {
753 fixed (byte *bptr = bytes) {
754 return InternalGetBytes (
755 cptr + charIndex, charCount,
756 bptr + byteIndex, bytes.Length - byteIndex,
758 out charsProcessed, out bytesProcessed,
// Pointer-based counting entry point: rejects an out-of-range charCount and
// forwards to InternalGetBytes; the result of interest is bytesProcessed.
766 internal unsafe static EncoderStatus InternalGetBytesCount (
767 char* chars, int charCount,
768 EncoderFallbackBuffer fallbackBuffer,
769 out int charsProcessed, out int bytesProcessed,
774 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
776 return InternalGetBytes (
780 out charsProcessed, out bytesProcessed,
// Array-based counting entry point: validates 'chars' and the index/count
// range, pins the array and forwards to InternalGetBytes.
785 internal unsafe static EncoderStatus InternalGetBytesCount (
786 char[] chars, int charIndex, int charCount,
787 EncoderFallbackBuffer fallbackBuffer,
788 out int charsProcessed, out int bytesProcessed,
793 throw new ArgumentNullException ("chars");
794 if (charIndex < 0 || charIndex > chars.Length)
795 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
796 if (charCount < 0 || charCount > (chars.Length - charIndex))
797 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
799 fixed (char *cptr = chars) {
800 return InternalGetBytes (
801 cptr + charIndex, charCount,
804 out charsProcessed, out bytesProcessed,
810 #region GetByteCount()
812 // Get the number of bytes needed to encode a character buffer.
// A fresh encoder fallback buffer is created for each call. The status
// returned by InternalGetBytesCount is not inspected; only the byte total
// is returned.
813 public override int GetByteCount (char[] chars, int index, int count)
816 int charsProcessed, bytesProcessed;
817 InternalGetBytesCount (
819 EncoderFallback.CreateFallbackBuffer (),
820 out charsProcessed, out bytesProcessed,
823 return bytesProcessed;
// Pointer overload of GetByteCount: counts the bytes needed for 'count'
// chars starting at 'chars', using a fresh encoder fallback buffer. The
// status returned by InternalGetBytesCount is not inspected.
827 [CLSCompliant (false)]
829 public unsafe override int GetByteCount (char* chars, int count)
831 int charsProcessed, bytesProcessed;
833 InternalGetBytesCount (
835 EncoderFallback.CreateFallbackBuffer (),
836 out charsProcessed, out bytesProcessed,
839 return bytesProcessed;
846 // Get the bytes that result from encoding a character buffer.
// Returns the number of bytes written. Throws ArgumentException when the
// destination array is too small; argument validation itself is done by
// InternalGetBytesEncode.
847 public override int GetBytes (char[] chars, int charIndex, int charCount,
848 byte[] bytes, int byteIndex)
850 int charsProcessed, bytesProcessed;
852 if (InternalGetBytesEncode (
853 chars, charIndex, charCount,
855 EncoderFallback.CreateFallbackBuffer (),
856 out charsProcessed, out bytesProcessed,
// The literal 'true' is the final argument of the call (presumably the
// flush flag, since the encoder class passes its 'flush' in this position).
858 true) == EncoderStatus.InsufficientSpace)
859 throw new ArgumentException ("Insufficient Space", "bytes");
860 return bytesProcessed;
863 // Convenience wrappers for "GetBytes".
// String overload: validates the arguments, pins the string and the
// destination array and encodes through the pointer-based
// InternalGetBytesEncode. Returns the number of bytes written; throws
// ArgumentException when the destination is too small.
864 public unsafe override int GetBytes (String s, int charIndex, int charCount,
865 byte[] bytes, int byteIndex)
867 int charsProcessed, bytesProcessed;
869 EncoderStatus status;
871 throw new ArgumentNullException ("s");
// NOTE(review): '>=' rejects charIndex == s.Length even when charCount is 0
// (for example an empty string), whereas the array overloads use '>' for
// the same check -- confirm whether this asymmetry is intended.
872 if (charIndex < 0 || charIndex >= s.Length)
873 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
874 if (charCount < 0 || charCount > (s.Length - charIndex))
875 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
// NOTE(review): 'bytes' is dereferenced here; no null check for it is
// visible in this method -- confirm.
876 if (byteIndex < 0 || byteIndex > bytes.Length)
877 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
879 fixed (char *cptr = s) {
880 fixed (byte *bptr = bytes) {
881 status = InternalGetBytesEncode (
882 cptr + charIndex, charCount,
883 bptr + byteIndex, bytes.Length - byteIndex,
884 EncoderFallback.CreateFallbackBuffer (),
885 out charsProcessed, out bytesProcessed,
891 if (status == EncoderStatus.InsufficientSpace)
892 throw new ArgumentException ("Insufficient Space", "bytes");
893 return bytesProcessed;
// Pointer overload of GetBytes: encodes 'charCount' chars into at most
// 'byteCount' bytes and returns the number of bytes written. Throws
// ArgumentException when the destination is too small.
896 [CLSCompliant (false)]
898 public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
900 int charsProcessed, bytesProcessed;
902 if (InternalGetBytesEncode (
903 chars, charCount, bytes, byteCount,
904 EncoderFallback.CreateFallbackBuffer (),
905 out charsProcessed, out bytesProcessed,
907 true) == EncoderStatus.InsufficientSpace)
908 throw new ArgumentException ("Insufficient Space", "bytes");
909 return bytesProcessed;
914 #region GetCharCount()
916 // Get the number of characters needed to decode a byte buffer.
// Uses throw-away decoder state (all zero) and a fresh decoder fallback
// buffer, so each call is independent. The status returned by
// InternalGetCharsCount is not inspected.
917 public override int GetCharCount (byte[] bytes, int index, int count)
919 int bytesProcessed, charsProcessed;
920 uint leftBytes = 0, leftBits = 0, procBytes = 0;
921 InternalGetCharsCount (
923 DecoderFallback.CreateFallbackBuffer (),
924 out bytesProcessed, out charsProcessed,
925 ref leftBytes, ref leftBits, ref procBytes,
927 return charsProcessed;
// Pointer overload of GetCharCount: counts the chars produced by decoding
// 'count' bytes, starting from zeroed local decoder state and a fresh
// decoder fallback buffer.
930 [CLSCompliant (false)]
932 public unsafe override int GetCharCount (byte* bytes, int count)
934 int bytesProcessed, charsProcessed;
935 uint leftBytes = 0, leftBits = 0, procBytes = 0;
936 InternalGetCharsCount (
938 DecoderFallback.CreateFallbackBuffer (),
939 out bytesProcessed, out charsProcessed,
940 ref leftBytes, ref leftBits, ref procBytes,
942 return charsProcessed;
947 // Get the characters that result from decoding a byte buffer.
// Decodes with zeroed local state and a fresh decoder fallback buffer and
// returns the number of chars written.
948 public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
949 char[] chars, int charIndex)
951 int bytesProcessed, charsProcessed;
952 uint leftBytes = 0, leftBits = 0, procBytes = 0;
954 if (InternalGetCharsDecode (
955 bytes, byteIndex, byteCount,
957 DecoderFallback.CreateFallbackBuffer (),
958 out bytesProcessed, out charsProcessed,
959 ref leftBytes, ref leftBits, ref procBytes,
960 true) == DecoderStatus.InsufficientSpace)
// NOTE(review): the buffer that ran out of room is 'chars', yet the
// exception names "bytes" -- confirm whether the parameter name is intended.
961 throw new ArgumentException ("Insufficient Space", "bytes");
963 return charsProcessed;
// Pointer overload of GetChars: decodes with zeroed local state and a fresh
// decoder fallback buffer and returns the number of chars written.
966 [CLSCompliant (false)]
968 public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
970 int bytesProcessed, charsProcessed;
971 uint leftBytes = 0, leftBits = 0, procBytes = 0;
973 if (InternalGetCharsDecode (
976 DecoderFallback.CreateFallbackBuffer (),
977 out bytesProcessed, out charsProcessed,
978 ref leftBytes, ref leftBits, ref procBytes,
979 true) == DecoderStatus.InsufficientSpace)
// NOTE(review): the buffer that ran out of room is 'chars', yet the
// exception names "bytes" -- confirm whether the parameter name is intended.
980 throw new ArgumentException ("Insufficient Space", "bytes");
982 return charsProcessed;
985 // Get the maximum number of bytes needed to encode a
986 // specified number of characters.
// The bound used is four bytes per char.
// NOTE(review): no overflow guard is visible for 'charCount * 4' -- confirm
// how very large counts should behave.
987 public override int GetMaxByteCount (int charCount)
990 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
991 return charCount * 4;
994 // Get the maximum number of characters needed to decode a
995 // specified number of bytes.
// Throws ArgumentOutOfRangeException for an invalid byteCount (presumably a
// negative one, given the "ArgRange_NonNegative" message key).
996 public override int GetMaxCharCount (int byteCount)
999 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
1003 // Get a UTF8-specific decoder that is attached to this instance.
// Each call creates a new UTF8Decoder configured with this encoding's
// DecoderFallback.
1004 public override Decoder GetDecoder ()
1006 return new UTF8Decoder (DecoderFallback);
1009 // Get a UTF8-specific encoder that is attached to this instance.
// Each call creates a new UTF8Encoder configured with this encoding's
// EncoderFallback and the emitIdentifier preference.
1010 public override Encoder GetEncoder ()
1012 return new UTF8Encoder (EncoderFallback, emitIdentifier);
1015 // Get the UTF8 preamble.
1016 // XXX: why does this method return a preamble or void array depending
1017 // on 'emitIdentifier' attribute?
// Returns either a new three-byte array 0xEF 0xBB 0xBF or the shared empty
// byte array.
1018 public override byte[] GetPreamble ()
1021 return new byte [] { 0xEF, 0xBB, 0xBF };
1023 return EmptyArray<byte>.Value;
1026 // Determine if this object is equal to another.
// Two UTF8Encoding instances compare equal when the code page, the
// emitIdentifier setting and both fallbacks match. 'as' yields null for any
// other type (the null handling is presumably just below the cast).
1027 public override bool Equals (Object value)
1029 UTF8Encoding enc = (value as UTF8Encoding);
1031 return (codePage == enc.codePage &&
1032 emitIdentifier == enc.emitIdentifier &&
1033 DecoderFallback.Equals (enc.DecoderFallback) &&
1034 EncoderFallback.Equals (enc.EncoderFallback));
1040 // Get the hash code for this object.
// Delegates entirely to the base class implementation.
1041 public override int GetHashCode ()
1043 return base.GetHashCode ();
// String overload of GetByteCount; it adds nothing of its own and simply
// forwards to the base class.
1046 public override int GetByteCount (string chars)
1048 // hmm, does this override make any sense?
1049 return base.GetByteCount (chars);
// GetString override; it adds nothing of its own and simply forwards to the
// base class.
1052 [ComVisible (false)]
1053 public override string GetString (byte [] bytes, int index, int count)
1055 // hmm, does this override make any sense?
1056 return base.GetString (bytes, index, count);
1059 // UTF-8 decoder implementation.
// Stateful decoder: the three fields below keep a partially read UTF-8
// sequence between calls and are passed by reference to the static
// InternalGetChars* helpers of UTF8Encoding.
1061 private class UTF8Decoder : Decoder
1063 // internal decoder state
1064 private uint leftBytes;
1065 private uint leftBits;
1066 private uint procBytes;
// Creates a decoder that uses the given fallback.
1069 public UTF8Decoder (DecoderFallback fallback)
1071 Fallback = fallback;
1077 // Override inherited methods.
// Counts the chars that decoding 'count' bytes would produce.
// NOTE(review): the decoder's own state fields are passed by 'ref', so this
// counting call can modify the pending-sequence state -- confirm that this
// is intended.
1078 public override int GetCharCount (byte[] bytes, int index, int count, bool flush)
1080 int bytesProcessed, charsProcessed;
1081 InternalGetCharsCount (
1082 bytes, index, count,
1083 this.FallbackBuffer,
1084 out bytesProcessed, out charsProcessed,
1085 ref leftBytes, ref leftBits, ref procBytes,
1087 return charsProcessed;
// Overload without a flush argument: forwards with flush = true.
1090 [ComVisibleAttribute(false)]
1091 public override int GetCharCount (byte[] bytes, int index, int count)
1093 return GetCharCount (bytes, index, count, true);
// Pointer overload of GetCharCount.
// NOTE(review): as in the array overload, the decoder's state fields are
// passed by 'ref' while counting -- confirm that this is intended.
1096 [ComVisibleAttribute(false)]
1097 public unsafe override int GetCharCount (byte* bytes, int count, bool flush)
1099 int bytesProcessed, charsProcessed;
1100 InternalGetCharsCount (
1102 this.FallbackBuffer,
1103 out bytesProcessed, out charsProcessed,
1104 ref leftBytes, ref leftBits, ref procBytes,
1106 return charsProcessed;
// Pointer overload of GetChars: decodes using the decoder's persistent
// state and fallback buffer, honoring 'flush', and returns the number of
// chars written.
1109 [ComVisibleAttribute(false)]
1110 public unsafe override int GetChars (byte* bytes, int byteCount,
1111 char* chars, int charCount, bool flush)
1113 int bytesProcessed, charsProcessed;
1114 if (InternalGetCharsDecode (
1117 this.FallbackBuffer,
1118 out bytesProcessed, out charsProcessed,
1119 ref leftBytes, ref leftBits, ref procBytes,
1120 flush) == DecoderStatus.InsufficientSpace)
// NOTE(review): the buffer that ran out of room is 'chars', yet the
// exception names "bytes" -- confirm.
1121 throw new ArgumentException ("Insufficient Space", "bytes");
1122 return charsProcessed;
// Array overload of GetChars: decodes using the decoder's persistent state
// and fallback buffer, honoring 'flush', and returns the number of chars
// written.
1125 public override int GetChars (byte[] bytes, int byteIndex,
1126 int byteCount, char[] chars, int charIndex, bool flush)
1128 int bytesProcessed, charsProcessed;
1129 if (InternalGetCharsDecode (
1130 bytes, byteIndex, byteCount,
1132 this.FallbackBuffer,
1133 out bytesProcessed, out charsProcessed,
1134 ref leftBytes, ref leftBits, ref procBytes,
1135 flush) == DecoderStatus.InsufficientSpace)
// NOTE(review): the buffer that ran out of room is 'chars', yet the
// exception names "bytes" -- confirm.
1136 throw new ArgumentException ("Insufficient Space", "bytes");
1137 return charsProcessed;
// Overload without a flush argument: forwards with flush = true.
1140 public override int GetChars (byte[] bytes, int byteIndex,
1141 int byteCount, char[] chars, int charIndex)
1143 return GetChars (bytes, byteIndex, byteCount, chars, charIndex, true);
// Resets the decoder; presumably clears the pending partial-sequence state
// (leftBytes, leftBits, procBytes) -- confirm.
1146 public override void Reset ()
// Decodes as much as possible and reports progress through the out
// parameters instead of throwing; the status returned by
// InternalGetCharsDecode is not inspected here.
1154 public unsafe override void Convert (
1155 byte* bytes, int byteCount,
1156 char* chars, int charCount, bool flush,
1157 out int bytesUsed, out int charsUsed, out bool completed)
1159 InternalGetCharsDecode (
1162 this.FallbackBuffer,
1163 out bytesUsed, out charsUsed,
1164 ref leftBytes, ref leftBits, ref procBytes,
1166 // only completed if all bytes have been processed and
1167 // successfully converted to chars!!
1168 completed = (byteCount == bytesUsed);
1170 } // class UTF8Decoder
1172 // UTF-8 encoder implementation.
// Stateful encoder: 'leftChar' keeps a pending surrogate between calls and
// 'emittedIdentifier' records whether the identifier step has already run.
1174 private class UTF8Encoder : Encoder
1176 private bool emitIdentifier;
1178 // internal encoder state
1179 private uint leftChar;
1180 private bool emittedIdentifier;
// Creates an encoder that uses the given fallback.
1183 public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
1185 this.Fallback = fallback;
// NOTE(review): the constructor argument is ignored and the field is forced
// to false (the original assignment is left commented out), so every
// 'emitIdentifier && !emittedIdentifier' branch in this class is currently
// never taken -- confirm this is intended.
1187 this.emitIdentifier = false; //emitIdentifier;
1188 this.emittedIdentifier = false;
1191 // Override inherited methods.
// Pointer overload: returns the bytes needed for 'count' chars plus
// 'preambleSize'.
// NOTE(review): the identifier branch sets emittedIdentifier while merely
// counting -- confirm that counting is meant to consume the identifier.
1192 [ComVisibleAttribute(false)]
1193 public unsafe override int GetByteCount (char* chars, int count, bool flush)
1195 int charsProcessed, bytesProcessed, preambleSize = 0;
1196 if (emitIdentifier && !emittedIdentifier) {
1198 emittedIdentifier = true;
1200 InternalGetBytesCount (
1202 this.FallbackBuffer,
1203 out charsProcessed, out bytesProcessed,
1206 return bytesProcessed + preambleSize;
// Array overload: returns the bytes needed for the given char range plus
// 'preambleSize'.
// NOTE(review): the identifier branch sets emittedIdentifier while merely
// counting -- confirm that counting is meant to consume the identifier.
1209 public override int GetByteCount (char[] chars, int index,
1210 int count, bool flush)
1212 int charsProcessed, bytesProcessed, preambleSize = 0;
1213 if (emitIdentifier && !emittedIdentifier) {
1215 emittedIdentifier = true;
1217 InternalGetBytesCount (
1218 chars, index, count,
1219 this.FallbackBuffer,
1220 out charsProcessed, out bytesProcessed,
1223 return bytesProcessed + preambleSize;
// Pointer overload of GetBytes: handles the one-time identifier step, then
// encodes using the encoder's fallback buffer. Returns the number of
// encoded bytes plus 'preambleSize'; throws ArgumentException when the
// destination is too small for the identifier or for the encoded data.
1226 [ComVisibleAttribute(false)]
1227 public unsafe override int GetBytes (char* chars, int charCount,
1228 byte* bytes, int byteCount, bool flush)
1230 int charsProcessed, bytesProcessed, preambleSize = 0;
1231 if (emitIdentifier && !emittedIdentifier) {
1233 throw new ArgumentException ("Insufficient Space", "UTF8 preamble");
1238 emittedIdentifier = true;
1241 if (InternalGetBytesEncode (
1244 this.FallbackBuffer,
1245 out charsProcessed, out bytesProcessed,
1247 flush) == EncoderStatus.InsufficientSpace)
1248 throw new ArgumentException ("Insufficient Space", "bytes");
1249 return bytesProcessed + preambleSize;
// Array overload of GetBytes: when the identifier step is due, writes
// 0xEF 0xBB 0xBF at byteIndex (advancing it past them), then encodes the
// char range. Returns the number of encoded bytes plus 'preambleSize';
// throws ArgumentException when the destination is too small.
1252 public override int GetBytes (char[] chars, int charIndex,
1253 int charCount, byte[] bytes,
1254 int byteIndex, bool flush)
1256 int charsProcessed, bytesProcessed, preambleSize = 0;
1257 if (emitIdentifier && !emittedIdentifier) {
// NOTE(review): 'bytes' and 'byteIndex' are used here before
// InternalGetBytesEncode gets to validate them -- confirm.
1258 if (bytes.Length - byteIndex < 3)
1259 throw new ArgumentException ("Insufficient Space", "UTF8 preamble");
1260 bytes[byteIndex++] = 0xEF;
1261 bytes[byteIndex++] = 0xBB;
1262 bytes[byteIndex++] = 0xBF;
1264 emittedIdentifier = true;
1266 if (InternalGetBytesEncode (
1267 chars, charIndex, charCount,
1269 this.FallbackBuffer,
1270 out charsProcessed, out bytesProcessed,
1272 flush) == EncoderStatus.InsufficientSpace)
1273 throw new ArgumentException ("Insufficient Space", "bytes");
1274 return bytesProcessed + preambleSize;
// Resets the encoder. Clearing emittedIdentifier lets the identifier step
// run again on the next call.
1277 public override void Reset ()
1281 this.emittedIdentifier = false;
// Encodes as much as possible and reports progress through the out
// parameters instead of throwing; the status returned by
// InternalGetBytesEncode is not inspected here. 'bytesUsed' includes
// 'preambleSize'.
1284 public unsafe override void Convert (
1285 char* chars, int charCount,
1286 byte* bytes, int byteCount, bool flush,
1287 out int charsUsed, out int bytesUsed, out bool completed)
1289 int preambleSize = 0;
1290 if (emitIdentifier && !emittedIdentifier) {
// The identifier is only attempted when there is a destination with room
// for at least three bytes.
1291 if (bytes != null && byteCount >= 3)
1297 emittedIdentifier = true;
1301 InternalGetBytesEncode (
1304 this.FallbackBuffer,
1305 out charsUsed, out bytesUsed,
1308 // only completed if all chars have been processed and
1309 // successfully converted to bytes!!
1310 completed = (charCount == charsUsed);
1311 bytesUsed += preambleSize;
1313 } // class UTF8Encoder
1315 }; // class UTF8Encoding
1317 }; // namespace System.Text