2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
30 using System.Runtime.InteropServices;
// UTF-8 text encoding, derived from Encoding. The attribute below records that
// the serialization format of this class is not compatible with .NET.
33 [MonoLimitation ("Serialization format not compatible with .NET")]
35 public class UTF8Encoding : Encoding
37 // Magic number used by Windows for UTF-8.
38 internal const int UTF8_CODE_PAGE = 65001;
// Assigned by the constructor from 'encoderShouldEmitUTF8Identifier'; handed to
// UTF8Encoder by GetEncoder() and compared by Equals().
41 private bool emitIdentifier;
// Default constructor: same as passing (false, false) to the two-argument form.
44 public UTF8Encoding () : this (false, false) {}
// Chooses whether the UTF-8 identifier is emitted; throwOnInvalidBytes is false.
45 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
46 : this (encoderShouldEmitUTF8Identifier, false) {}
// Main constructor. Registers the instance under UTF8_CODE_PAGE, stores the
// identifier preference, installs the encoder/decoder fallbacks and fills in
// the descriptive fields inherited from Encoding.
48 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
49 : base (UTF8_CODE_PAGE)
51 emitIdentifier = encoderShouldEmitUTF8Identifier;
// throwOnInvalidBytes selects the exception fallbacks for both directions.
52 if (throwOnInvalidBytes)
53 SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
// NOTE(review): the line between these two calls is elided in this listing;
// presumably it is an 'else', making the safe fallbacks the non-throwing default - confirm.
55 SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
// Names reported for this encoding.
57 web_name = body_name = header_name = "utf-8";
58 encoding_name = "Unicode (UTF-8)";
// All browser / mail-news capability flags are switched on.
59 is_browser_save = true;
60 is_browser_display = true;
61 is_mail_news_display = true;
62 is_mail_news_save = true;
// The Windows code page is taken from UnicodeEncoding's constant, not from UTF8_CODE_PAGE.
63 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
66 ///////////////////////////////////////////////////////////////////////
67 // INTERNAL DECODING FUNCTION (UTF8 -> CHAR/UTF16)
68 ///////////////////////////////////////////////////////////////////////
// Result codes returned by InternalGetChar / InternalGetChars. The member list
// is not visible in this listing; the decoding code in this file refers to Ok,
// InsufficientSpace, Overlong, InvalidSequence, InvalidStart, InvalidChar,
// SurrogateFound and InputRunOut.
70 internal enum DecoderStatus {
81 // The following method decodes one UTF-8 character from a byte buffer.
82 // NOTE: If 'chars' is null, this function only counts bytes and chars
83 // without writing anything.
84 // NOTE: BOM (0xEF 0xBB 0xBF) is not yet supported.
85 // See http://www.cl.cam.ac.uk/~mgk25/unicode.html
//
// Parameters:
//   bytes, byteCount   - input bytes and how many of them are available.
//   chars, charCount   - output buffer and its capacity (chars may be null, see above).
//   bytesProcessed,
//   charsProcessed     - out counters for this single call; the caller adds them
//                        to its running totals.
//   leftBytes          - decoder state: the low four bits hold the number of
//                        continuation bytes still expected; bit 0x10 carries the
//                        pending overlong check when the input runs out mid-sequence.
//   leftBits           - decoder state: code point bits accumulated so far.
//   procBytes          - decoder state: continuation bytes read so far, packed
//                        eight bits each (newest byte in the low bits).
// Returns a DecoderStatus describing how decoding of this character ended.
// Several lines of the body (local declarations, counter updates, closing
// braces) are elided in this listing.
86 private unsafe static DecoderStatus InternalGetChar (
87 byte* bytes, int byteCount,
88 char* chars, int charCount,
89 out int bytesProcessed, out int charsProcessed,
90 ref uint leftBytes, ref uint leftBits, ref uint procBytes)
99 // Fetch the start character from the byte buffer.
// leftBytes == 0 means no multi-byte sequence is in progress, so a new lead byte is read.
100 if (leftBytes == 0) {
102 return DecoderStatus.InputRunOut;
103 ch = (uint) (*bytes++);
107 if (ch < (uint) 0x0080) {
108 // Single-byte UTF-8 character.
// Lead bytes 0xC0 and 0xC1 are rejected outright as InvalidChar.
111 } else if (ch == (uint) 0xc0 || ch == (uint) 0xc1) {
113 return DecoderStatus.InvalidChar;
114 } else if ((ch & (uint) 0xE0) == (uint) 0xC0) {
115 // Double-byte UTF-8 character.
// The payload bits of the lead byte are placed above the bits that the
// continuation bytes will contribute (6 bits each).
116 leftBits = ((ch & (uint) 0x1F) << 6*1);
118 } else if ((ch & (uint) 0xF0) == (uint) 0xE0) {
119 // Three-byte UTF-8 character.
120 leftBits = ((ch & (uint) 0x0F) << 6*2);
122 } else if ((ch & (uint) 0xF8) == (uint) 0xF0) {
123 // Four-byte UTF-8 character.
124 leftBits = ((ch & (uint) 0x07) << 6*3);
126 // extra check for detecting as soon as
127 // possible too big four-byte utf chars
128 if (leftBits >= (uint) 0x110000)
129 return DecoderStatus.InvalidChar;
131 // Invalid five-or-six-byte or start char
132 // NOTE: I keep here the code for 5/6 bytes if
133 // needed, but technically these combinations
134 // are invalid in UTF-8 sequences.
135 // (ch & (uint) 0xFC) == (uint) 0xF8 =>
136 // leftBits = ch & (uint) 0x03;
138 // (ch & (uint) 0xFE) == (uint) 0xFC =>
139 // leftBits = ch & (uint) 0x01;
141 leftBits = leftBytes = 0;
142 return DecoderStatus.InvalidStart;
// The overlong check on the next byte is armed when continuation bytes are
// pending but the lead byte contributed no payload bits.
144 checkByte = (leftBytes > 0 && leftBits == 0);
// NOTE(review): the branch line before the next two statements is elided;
// presumably this is the 'else' path taken when a sequence is being resumed.
// It restores the overlong-check flag saved in bit 0x10 and keeps only the
// pending byte count in the low four bits.
147 checkByte = (leftBytes >> 4) != 0;
148 leftBytes &= (uint) 0x0f;
151 // process the required bytes...
152 for (; leftBytes > 0; leftBytes--) {
153 if (byteCount == 0) {
// Input exhausted mid-sequence: stash the overlong-check flag in bit 0x10 so
// the next call can resume where this one stopped.
154 leftBytes = ((uint) (checkByte ? 0x10 : 0x00)) | leftBytes;
155 return DecoderStatus.InputRunOut;
157 ch = (uint) (*bytes++);
// Continuation bytes must have the bit pattern 10xxxxxx.
158 if ((ch & (uint) 0xC0) != (uint) 0x80) {
159 // Invalid UTF-8 sequence: clear and restart.
160 // NOTE: we return before counting the
161 // processed bytes for restarting
162 // decoding later at this point
163 return DecoderStatus.InvalidSequence;
// Remember the raw byte so the caller can hand the offending bytes to the fallback.
167 procBytes = (procBytes << 8) | ch;
168 if (checkByte && ((~((uint) 0x1f >> (int) leftBytes - 2)) & ch) == 0x80) {
169 // detected an overlong sequence :(
170 return DecoderStatus.Overlong;
// Merge the six payload bits of this continuation byte at their final position.
173 leftBits = leftBits | ((ch & (uint) 0x3F) << (6*(int) (leftBytes - 1)));
174 if (leftBits >= (uint) 0x110000) {
175 // this UTF-8 is too big ...
176 return DecoderStatus.InvalidChar;
// (leftBits & 0xF800) == 0xD800 matches the range 0xD800-0xDFFF.
178 if ((leftBits & 0xF800) == 0xD800) {
179 // UTF-8 doesn't use surrogate characters
180 return DecoderStatus.SurrogateFound;
184 // convert this character to UTF-16
// Values below 0x10000 fit in a single UTF-16 code unit.
185 if (leftBits < (uint) 0x10000) {
188 return DecoderStatus.InsufficientSpace;
189 *chars = (char) leftBits;
195 return DecoderStatus.InsufficientSpace;
// Larger values are written as a high/low surrogate pair.
196 leftBits -= (uint) 0x10000;
197 *chars++ = (char) ((leftBits >> 10) + (uint) 0xD800);
198 *chars++ = (char) ((leftBits & (uint) 0x3FF) + (uint) 0xDC00);
203 // we've read a complete char... reset decoder status and finish
204 leftBytes = leftBits = procBytes = 0;
205 return DecoderStatus.Ok;
// Decodes a run of UTF-8 bytes by calling InternalGetChar repeatedly.
// Invalid input is routed through 'fallbackBuffer'; the replacement characters
// are either written to 'chars' or, when 'chars' is null, only counted.
//   bytesProcessed / charsProcessed - running totals reported to the caller.
//   leftBytes / leftBits / procBytes - decoder state carried between calls.
// Throws ArgumentNullException / ArgumentOutOfRangeException from the parameter
// validation and ArgumentException ("Insufficient Space") when output does not fit.
// Returns InputRunOut when the input ends inside a sequence, otherwise Ok.
// Several lines of the body are elided in this listing.
208 internal unsafe static DecoderStatus InternalGetChars (
209 byte* bytes, int byteCount,
210 char* chars, int charCount,
211 DecoderFallbackBuffer fallbackBuffer,
212 out int bytesProcessed, out int charsProcessed,
213 ref uint leftBytes, ref uint leftBits, ref uint procBytes)
216 int t_bytesProcessed, t_charsProcessed;
218 // Validate parameters
220 throw new ArgumentNullException ("bytes");
222 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
224 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
230 // byte processing loop
// Runs while input remains and, unless only counting (chars == null), while
// there is still room in the output buffer.
231 while(byteCount - bytesProcessed > 0 && (chars == null || charCount - charsProcessed > 0)) {
232 // fetch a char from the input byte array
// NOTE(review): the two argument lists below look like the two arms of a
// conditional on 'chars' (write mode vs. count-only mode); the selecting
// lines are elided - confirm.
235 bytes + bytesProcessed, byteCount - bytesProcessed,
236 chars + charsProcessed, charCount - charsProcessed,
237 out t_bytesProcessed, out t_charsProcessed,
238 ref leftBytes, ref leftBits, ref procBytes)
240 bytes + bytesProcessed, byteCount - bytesProcessed,
242 out t_bytesProcessed, out t_charsProcessed,
243 ref leftBytes, ref leftBits, ref procBytes);
// Fold the per-character counters into the running totals.
246 charsProcessed += t_charsProcessed;
247 bytesProcessed += t_bytesProcessed;
250 case DecoderStatus.Ok:
251 break; // everything OK :D
253 case DecoderStatus.InsufficientSpace:
254 throw new ArgumentException ("Insufficient Space", "chars");
256 case DecoderStatus.Overlong:
257 case DecoderStatus.InvalidSequence:
258 case DecoderStatus.InvalidStart:
259 case DecoderStatus.InvalidChar:
260 case DecoderStatus.SurrogateFound:
261 // Invalid UTF-8 characters and sequences...
262 // now we build a 'bytesUnknown' array with the
263 // stored bytes in 'procBytes'.
// 'extra' counts how many bytes are packed in procBytes (the loop body that
// advances 't' is elided in this listing).
265 for (uint t = procBytes; t != 0; extra++)
267 byte [] bytesUnknown = new byte [extra];
// Unpack procBytes so that bytesUnknown keeps the bytes in the order they were read.
268 for (int i = extra; i > 0; i--)
269 bytesUnknown [i - 1] = (byte) ((procBytes >> (8 * (extra - i))) & 0xff);
270 // partial reset: this condition avoids
272 if (s == DecoderStatus.InvalidSequence)
274 // call the fallback and cross fingers
275 fallbackBuffer.Fallback (bytesUnknown, bytesProcessed - extra);
// Write mode: copy every replacement char, failing if the output buffer is full.
277 while (fallbackBuffer.Remaining > 0) {
278 if (charsProcessed >= charCount)
279 throw new ArgumentException ("Insufficient Space", "chars/fallback");
280 chars [charsProcessed++] = fallbackBuffer.GetNextChar ();
// Count-only mode: the replacement chars are counted, not copied.
283 charsProcessed += fallbackBuffer.Remaining;
284 fallbackBuffer.Reset ();
285 // recovery was successful, reset decoder state
286 leftBits = leftBytes = procBytes = 0;
289 case DecoderStatus.InputRunOut:
290 return DecoderStatus.InputRunOut;
293 return DecoderStatus.Ok;
296 // Get the characters that result from decoding a byte buffer.
// Array front end for the pointer-based InternalGetChars: validates the array
// arguments, pins both arrays and forwards to the pointer overload.
// 'chars' may be null, in which case a null pointer and a capacity of 0 are
// passed on (count-only mode).
297 internal unsafe static DecoderStatus InternalGetChars (
298 byte[] bytes, int byteIndex, int byteCount,
299 char[] chars, int charIndex,
300 DecoderFallbackBuffer fallbackBuffer,
301 out int bytesProcessed, out int charsProcessed,
302 ref uint leftBytes, ref uint leftBits, ref uint procBytes)
304 // Validate the parameters.
306 throw new ArgumentNullException ("bytes");
// NOTE(review): '>=' rejects byteIndex == bytes.Length, so an empty array with
// byteIndex 0 and byteCount 0 is reported as out of range - confirm this is intended.
307 if (byteIndex < 0 || byteIndex >= bytes.Length)
308 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
309 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
310 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
// The upper bound is the last valid index, or 0 when 'chars' is null or empty.
311 if (charIndex < 0 || charIndex > (chars != null && chars.Length > 0 ? chars.Length - 1 : 0))
312 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
314 fixed (char* cptr = chars) {
315 fixed (byte* bptr = bytes) {
316 return InternalGetChars (
317 bptr + byteIndex, byteCount,
// Everything from charIndex to the end of 'chars' is offered as output space.
318 chars != null ? cptr + charIndex : null,
319 chars != null ? chars.Length - charIndex : 0,
321 out bytesProcessed, out charsProcessed,
322 ref leftBytes, ref leftBits, ref procBytes);
327 ///////////////////////////////////////////////////////////////////////
328 // INTERNAL ENCODING FUNCTION (CHAR/UTF16 -> UTF8)
329 ///////////////////////////////////////////////////////////////////////
// Result codes returned by InternalGetByte / InternalGetBytes. The member list
// is not visible in this listing; the encoding code in this file refers to Ok,
// InputRunOut, InsufficientSpace, InvalidChar and InvalidSurrogate.
331 internal enum EncoderStatus {
339 // The following method encodes one UTF-8 character into a byte buffer.
340 // NOTE: If 'bytes' is null, this function only counts bytes and chars
341 // without writing anything.
342 // NOTE: BOM (0xEF 0xBB 0xBF) is not yet supported.
343 // See http://www.cl.cam.ac.uk/~mgk25/unicode.html
//
// Parameters:
//   chars, charCount   - input UTF-16 code units and how many are available.
//   bytes, byteCount   - output buffer and its capacity (bytes may be null, see above).
//   charsProcessed,
//   bytesProcessed     - out counters for this single call.
//   leftChar           - encoder state: the pending leading surrogate, used when
//                        the trailing surrogate is combined into one code point.
// Returns an EncoderStatus describing how encoding of this character ended.
// Several lines of the body (declarations, space checks, counter updates,
// closing braces) are elided in this listing.
344 private unsafe static EncoderStatus InternalGetByte (
345 char* chars, int charCount,
346 byte* bytes, int byteCount,
347 out int charsProcessed, out int bytesProcessed, ref uint leftChar)
355 // process one char (this block executes twice if a surrogate is found)
358 return EncoderStatus.InputRunOut;
363 // char counting is inside if for reason discussed in else
// Below 0x80: one byte, copied as is.
366 if (ch < (uint) 0x80) {
369 return EncoderStatus.InsufficientSpace;
370 *bytes++ = (byte) ch;
// Below 0x800: two bytes. '&' binds tighter than '|', so the mask is applied
// before the marker bits are OR-ed in.
374 } else if (ch < (uint) 0x0800) {
377 return EncoderStatus.InsufficientSpace;
378 *bytes++ = (byte) ((uint) 0xC0 | (ch >> 6) & 0x3f);
379 *bytes++ = (byte) ((uint) 0x80 | ch & 0x3f);
// Any other non-surrogate code unit: three bytes.
383 } else if (ch < (uint) 0xD800 || ch > (uint) 0xDFFF) {
386 return EncoderStatus.InsufficientSpace;
387 *bytes++ = (byte) ((uint) 0xE0 | (ch >> 12));
388 *bytes++ = (byte) ((uint) 0x80 | ((ch >> 6) & 0x3F));
389 *bytes++ = (byte) ((uint) 0x80 | (ch & 0x3F));
// Remaining values are surrogates; up to 0xDBFF they are leading surrogates.
393 } else if (ch <= (uint) 0xDBFF) {
394 // This is a surrogate char, repeat please
398 // We have a surrogate tail without
399 // leading surrogate.
400 return EncoderStatus.InvalidChar;
// A leading surrogate is pending: the current unit must be a trailing surrogate.
403 if (ch >= (uint) 0xDC00 && ch <= (uint) 0xDFFF) {
404 // We have a correct surrogate pair.
// Combine both halves into the code point: 0x10000 + low 10 bits + (high 10 bits << 10).
405 ch = 0x10000 + (uint) ch - (uint) 0xDC00
406 + ((leftChar - (uint) 0xD800) << 10);
409 return EncoderStatus.InsufficientSpace;
410 *bytes++ = (byte) (0xF0 | (ch >> 18));
411 *bytes++ = (byte) (0x80 | ((ch >> 12) & 0x3F));
412 *bytes++ = (byte) (0x80 | ((ch >> 6) & 0x3F));
413 *bytes++ = (byte) (0x80 | (ch & 0x3F));
418 // We have a surrogate start followed by a
419 // regular character. Technically, this is
420 // invalid, so we fail :(
421 return EncoderStatus.InvalidSurrogate;
423 // increment counters; this is done after processing
424 // the surrogate: in case of a bad surrogate the
425 // encoding should restart on the faulty char (maybe
426 // the correct surrogate has been lost, and in this
427 // case the best option is to restart processing on the
428 // erroneous char to avoid losing more chars during the
434 return EncoderStatus.Ok;
// Encodes a run of UTF-16 code units by calling InternalGetByte repeatedly.
// Invalid chars/surrogates are routed through 'fallbackBuffer'; the replacement
// chars are then encoded by a nested call to this method that passes a null
// fallback buffer. When 'bytes' is null the bytes are only counted.
//   charsProcessed / bytesProcessed - running totals reported to the caller.
// The end of the parameter list is elided in this listing; the body passes a
// 'leftChar' state value by reference, so it is presumably declared there.
// Throws ArgumentException ("Insufficient Space") when output does not fit and
// ArgumentException ("Fallback chars are pure evil.") when the replacement
// chars cannot themselves be encoded.
437 internal unsafe static EncoderStatus InternalGetBytes (
438 char* chars, int charCount,
439 byte* bytes, int byteCount,
440 EncoderFallbackBuffer fallbackBuffer,
441 out int charsProcessed, out int bytesProcessed,
445 int t_charsProcessed, t_bytesProcessed;
447 // Validate the parameters
// NOTE(review): the guard for the next throw is elided. Since bytes == null is
// used further down as count-only mode, confirm which argument is checked here
// and that the reported parameter name matches it.
449 throw new ArgumentNullException ("bytes");
451 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
453 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
459 // char processing loop
460 while (charCount - charsProcessed > 0) {
// NOTE(review): the two argument lists below look like the two arms of a
// conditional on 'bytes' (write mode vs. count-only mode); the selecting
// lines are elided - confirm.
463 chars + charsProcessed, charCount - charsProcessed,
464 bytes + bytesProcessed, byteCount - bytesProcessed,
465 out t_charsProcessed, out t_bytesProcessed, ref leftChar)
467 chars + charsProcessed, charCount - charsProcessed,
469 out t_charsProcessed, out t_bytesProcessed, ref leftChar);
// Fold the per-character counters into the running totals.
471 charsProcessed += t_charsProcessed;
472 bytesProcessed += t_bytesProcessed;
475 case EncoderStatus.Ok:
476 break; // everything OK :D
478 case EncoderStatus.InsufficientSpace:
479 throw new ArgumentException ("Insufficient Space", "bytes");
481 case EncoderStatus.InputRunOut:
482 return EncoderStatus.InputRunOut;
484 case EncoderStatus.InvalidChar:
485 case EncoderStatus.InvalidSurrogate:
486 // we've found an invalid char or surrogate
487 if (fallbackBuffer == null) {
488 // without a fallbackBuffer abort
489 // returning 'InvalidChar' or
490 // 'InvalidSurrogate'
493 if(t_charsProcessed >= 1) {
494 // one-char invalid UTF-16 or an
496 fallbackBuffer.Fallback (
497 chars [charsProcessed - 1],
500 // we've read a two-char invalid UTF-16
501 // but in this buffer we have only the
502 // invalid surrogate tail
503 fallbackBuffer.Fallback (
507 // if we've arrived here we are working in
508 // replacement mode: build a replacement
509 // fallback_chars buffer
// Drain the fallback buffer into a temporary array sized by its Remaining count.
510 char[] fallback_chars = new char [fallbackBuffer.Remaining];
511 for (int i = 0; i < fallback_chars.Length; i++)
512 fallback_chars [i] = fallbackBuffer.GetNextChar ();
513 fallbackBuffer.Reset ();
514 // and encode it into UTF8 bytes...
515 fixed (char *fb_chars = fallback_chars) {
// Nested call with a null fallback buffer: an invalid replacement char comes
// back as a status instead of triggering another fallback.
517 switch (bytes != null
518 ? InternalGetBytes (fb_chars, fallback_chars.Length,
519 bytes + bytesProcessed, byteCount - bytesProcessed,
520 null, out t_charsProcessed, out t_bytesProcessed,
522 : InternalGetBytes (fb_chars, fallback_chars.Length,
524 null, out t_charsProcessed, out t_bytesProcessed,
526 case EncoderStatus.Ok:
// Only the byte total grows here; the replacement chars are not input chars.
528 bytesProcessed += t_bytesProcessed;
530 case EncoderStatus.InsufficientSpace:
531 throw new ArgumentException ("Insufficient Space", "fallback buffer bytes");
532 case EncoderStatus.InputRunOut:
533 case EncoderStatus.InvalidChar:
534 case EncoderStatus.InvalidSurrogate:
535 throw new ArgumentException ("Fallback chars are pure evil.", "fallback buffer bytes");
538 // partial reset of encoder state
543 return EncoderStatus.Ok;
// Array front end for the pointer-based InternalGetBytes: validates the array
// arguments, pins both arrays and forwards to the pointer overload.
// 'bytes' may be null, in which case a null pointer and a capacity of 0 are
// passed on (count-only mode). The end of the parameter list and of the
// forwarded argument list is elided in this listing.
546 internal unsafe static EncoderStatus InternalGetBytes (
547 char[] chars, int charIndex, int charCount,
548 byte[] bytes, int byteIndex,
549 EncoderFallbackBuffer fallbackBuffer,
550 out int charsProcessed, out int bytesProcessed,
554 throw new ArgumentNullException ("chars");
// NOTE(review): '>=' rejects charIndex == chars.Length, so an empty array with
// charIndex 0 and charCount 0 is reported as out of range - confirm this is intended.
555 if (charIndex < 0 || charIndex >= chars.Length)
556 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
557 if (charCount < 0 || charCount > (chars.Length - charIndex))
558 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
// The upper bound is the last valid index, or 0 when 'bytes' is null or empty.
559 if (byteIndex < 0 || byteIndex > (bytes != null && bytes.Length > 0 ? bytes.Length - 1 : 0))
560 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
563 fixed (char *cptr = chars) {
564 fixed (byte *bptr = bytes) {
565 return InternalGetBytes (
566 cptr + charIndex, charCount,
// Everything from byteIndex to the end of 'bytes' is offered as output space.
567 bytes != null ? bptr + byteIndex : null,
568 bytes != null ? bytes.Length - byteIndex : 0,
570 out charsProcessed, out bytesProcessed,
577 #region GetByteCount()
579 // Get the number of bytes needed to encode a character buffer.
// Delegates to the array-based InternalGetBytes with a fresh fallback buffer
// created from EncoderFallback and returns the byte total it reports.
// Part of the argument list is elided in this listing.
580 public override int GetByteCount (char[] chars, int index, int count)
583 int charsProcessed, bytesProcessed;
584 InternalGetBytes (chars, index, count,
586 EncoderFallback.CreateFallbackBuffer (),
587 out charsProcessed, out bytesProcessed,
589 return bytesProcessed;
// Pointer variant of GetByteCount: after the argument checks it delegates to the
// pointer-based InternalGetBytes with a fresh fallback buffer and returns the
// byte total. Throws ArgumentNullException for 'chars' and
// ArgumentOutOfRangeException for 'count' (the guard conditions are elided here).
593 [CLSCompliant (false)]
595 public unsafe override int GetByteCount (char* chars, int count)
597 int charsProcessed, bytesProcessed;
600 throw new ArgumentNullException ("chars");
602 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
603 InternalGetBytes (chars, count,
605 EncoderFallback.CreateFallbackBuffer (),
606 out charsProcessed, out bytesProcessed,
608 return bytesProcessed;
615 // Get the bytes that result from encoding a character buffer.
// Throws ArgumentNullException for 'bytes' (guard elided here), then delegates
// to the array-based InternalGetBytes with a fresh fallback buffer and returns
// the number of bytes it reports. Remaining argument validation happens inside
// InternalGetBytes.
616 public override int GetBytes (char[] chars, int charIndex, int charCount,
617 byte[] bytes, int byteIndex)
619 int charsProcessed, bytesProcessed;
622 throw new ArgumentNullException ("bytes");
625 InternalGetBytes (chars, charIndex, charCount,
627 EncoderFallback.CreateFallbackBuffer (),
628 out charsProcessed, out bytesProcessed,
630 return bytesProcessed;
633 // Convenience wrappers for "GetBytes".
// String variant: validates the arguments, pins the string and the output array
// and forwards to the pointer-based InternalGetBytes. Returns the number of
// bytes written.
634 public unsafe override int GetBytes (String s, int charIndex, int charCount,
635 byte[] bytes, int byteIndex)
637 int charsProcessed, bytesProcessed;
640 throw new ArgumentNullException ("s");
642 throw new ArgumentNullException ("bytes");
// NOTE(review): '>=' rejects charIndex == s.Length, so an empty string with
// charIndex 0 and charCount 0 is reported as out of range - confirm this is intended.
643 if (charIndex < 0 || charIndex >= s.Length)
644 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
645 if (charCount < 0 || charCount > (s.Length - charIndex))
646 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
// The upper bound is the last valid index, or 0 for an empty output array.
647 if (byteIndex < 0 || byteIndex > (bytes.Length > 0 ? bytes.Length - 1 : 0))
648 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
650 fixed (char *cptr = s) {
651 fixed (byte *bptr = bytes) {
653 cptr + charIndex, charCount,
654 bptr + byteIndex, bytes.Length - byteIndex,
655 EncoderFallback.CreateFallbackBuffer (),
656 out charsProcessed, out bytesProcessed,
661 return bytesProcessed;
// Pointer variant of GetBytes: after the argument checks it forwards to the
// pointer-based InternalGetBytes with a fresh fallback buffer and returns the
// number of bytes it reports. The guard conditions are elided in this listing.
664 [CLSCompliant (false)]
666 public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
668 int charsProcessed, bytesProcessed;
671 throw new ArgumentNullException ("chars");
673 throw new IndexOutOfRangeException ("charCount");
675 throw new ArgumentNullException ("bytes");
// NOTE(review): this follows the 'bytes' null check, so it presumably guards
// byteCount, yet the message names "charCount" - confirm and correct the name.
677 throw new IndexOutOfRangeException ("charCount");
679 chars, charCount, bytes, byteCount,
680 EncoderFallback.CreateFallbackBuffer (),
681 out charsProcessed, out bytesProcessed,
683 return bytesProcessed;
688 #region GetCharCount()
690 // Get the number of characters needed to decode a byte buffer.
// Uses throw-away decoder state (all zero) and a fresh fallback buffer created
// from DecoderFallback, so every call starts from a clean state. Returns the
// character total reported by InternalGetChars. The start of the call is
// elided in this listing.
691 public override int GetCharCount (byte[] bytes, int index, int count)
693 int bytesProcessed, charsProcessed;
694 uint leftBytes = 0, leftBits = 0, procBytes = 0;
698 DecoderFallback.CreateFallbackBuffer(),
699 out bytesProcessed, out charsProcessed,
700 ref leftBytes, ref leftBits, ref procBytes);
701 return charsProcessed;
// Pointer variant of GetCharCount: same approach as the array variant, with
// local zero-initialised decoder state and a fresh fallback buffer. Returns the
// character total. The start of the call is elided in this listing.
704 [CLSCompliant (false)]
706 public unsafe override int GetCharCount (byte* bytes, int count)
708 int bytesProcessed, charsProcessed;
709 uint leftBytes = 0, leftBits = 0, procBytes = 0;
713 DecoderFallback.CreateFallbackBuffer(),
714 out bytesProcessed, out charsProcessed,
715 ref leftBytes, ref leftBits, ref procBytes);
716 return charsProcessed;
721 // Get the characters that result from decoding a byte buffer.
// Stateless decode: starts from zeroed decoder state, uses a fresh fallback
// buffer and returns the number of chars reported by InternalGetChars. Part of
// the call is elided in this listing.
722 public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
723 char[] chars, int charIndex)
725 int bytesProcessed, charsProcessed;
726 uint leftBytes = 0, leftBits = 0, procBytes = 0;
728 bytes, byteIndex, byteCount,
730 DecoderFallback.CreateFallbackBuffer(),
731 out bytesProcessed, out charsProcessed,
732 ref leftBytes, ref leftBits, ref procBytes);
733 return charsProcessed;
// Pointer variant of GetChars: stateless decode with zeroed local state and a
// fresh fallback buffer; returns the number of chars reported. The start of the
// call is elided in this listing.
736 [CLSCompliant (false)]
738 public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
740 int bytesProcessed, charsProcessed;
741 uint leftBytes = 0, leftBits = 0, procBytes = 0;
745 DecoderFallback.CreateFallbackBuffer(),
746 out bytesProcessed, out charsProcessed,
747 ref leftBytes, ref leftBits, ref procBytes);
748 return charsProcessed;
// Get the maximum number of bytes needed to encode a
// specified number of characters.
//
// The budget is four bytes per char. Throws ArgumentOutOfRangeException
// when charCount is negative, or when charCount * 4 would not fit in an
// int (the multiplication used to wrap around silently and could return
// a negative or too-small size).
public override int GetMaxByteCount (int charCount)
{
	if (charCount < 0)
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
	// Guard the multiplication below against integer overflow.
	if (charCount > int.MaxValue / 4)
		throw new ArgumentOutOfRangeException ("charCount");
	return charCount * 4;
}
760 // Get the maximum number of characters needed to decode a
761 // specified number of bytes.
// Throws ArgumentOutOfRangeException for an invalid byteCount (the guard and the
// return statement are elided in this listing; the message says the value must
// be non-negative).
762 public override int GetMaxCharCount (int byteCount)
765 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
// Create a stateful UTF-8 decoder that uses this encoding's
// decoder fallback.
public override Decoder GetDecoder ()
{
	Decoder decoder = new UTF8Decoder (DecoderFallback);
	return decoder;
}
// Create a stateful UTF-8 encoder that uses this encoding's encoder
// fallback and its identifier (preamble) setting.
public override Encoder GetEncoder ()
{
	Encoder encoder = new UTF8Encoder (EncoderFallback, emitIdentifier);
	return encoder;
}
// Get the UTF8 preamble: the three identifier bytes EF BB BF when this
// encoding was created with the identifier enabled, otherwise the
// shared empty byte array.
// XXX: it is still an open question why the result depends on the
// 'emitIdentifier' attribute.
public override byte[] GetPreamble ()
{
	return emitIdentifier
		? new byte [] { 0xEF, 0xBB, 0xBF }
		: EmptyArray<byte>.Value;
}
// Determine if this object is equal to another: the other object must
// be a UTF8Encoding with the same code page, the same identifier
// setting and equal decoder and encoder fallbacks.
public override bool Equals (Object value)
{
	UTF8Encoding other = value as UTF8Encoding;
	if (other == null)
		return false;

	if (codePage != other.codePage)
		return false;
	if (emitIdentifier != other.emitIdentifier)
		return false;
	if (!DecoderFallback.Equals (other.DecoderFallback))
		return false;
	return EncoderFallback.Equals (other.EncoderFallback);
}
// Get the hash code for this object; the value comes straight from
// the base class implementation.
public override int GetHashCode ()
{
	int hash = base.GetHashCode ();
	return hash;
}
// Byte count for a whole string; simply forwards to the base class.
public override int GetByteCount (string chars)
{
	// hmm, does this override make any sense?
	int byteCount = base.GetByteCount (chars);
	return byteCount;
}
// Decode a byte range into a string; simply forwards to the base class.
public override string GetString (byte [] bytes, int index, int count)
{
	// hmm, does this override make any sense?
	string decoded = base.GetString (bytes, index, count);
	return decoded;
}
825 // UTF-8 decoder implementation.
// Stateful decoder: the three fields below are passed by reference to
// UTF8Encoding.InternalGetChars so that a sequence split across calls can be
// resumed. Many lines of this class are elided in this listing.
827 private class UTF8Decoder : Decoder
829 // internal decoder state
830 private uint leftBytes;
831 private uint leftBits;
832 private uint procBytes;
// Constructor; its body is elided in this listing.
835 public UTF8Decoder (DecoderFallback fallback)
843 // Override inherited methods.
// Counts the chars the given bytes would decode to.
// NOTE(review): the state passed by reference appears to be the instance
// fields, so counting would also advance the decoder state - confirm.
844 public override int GetCharCount (byte[] bytes, int index, int count)
846 int bytesProcessed, charsProcessed;
851 out bytesProcessed, out charsProcessed,
852 ref leftBytes, ref leftBits, ref procBytes);
853 return charsProcessed;
// Same count with an explicit flush flag; the state reset below is presumably
// guarded by 'flush' (the guard line is elided).
856 [ComVisibleAttribute(false)]
857 public override int GetCharCount (byte[] bytes, int index, int count, bool flush)
859 int r = GetCharCount (bytes, index, count);
861 leftBytes = leftBits = procBytes = 0;
// Pointer variant of the count with flush flag.
865 [ComVisibleAttribute(false)]
866 public unsafe override int GetCharCount (byte* bytes, int count, bool flush)
868 int bytesProcessed, charsProcessed;
873 out bytesProcessed, out charsProcessed,
874 ref leftBytes, ref leftBits, ref procBytes);
876 leftBytes = leftBits = procBytes = 0;
877 return charsProcessed;
// Pointer variant of the decode with flush flag; returns the chars produced.
880 [ComVisibleAttribute(false)]
881 public unsafe override int GetChars (byte* bytes, int byteCount,
882 char* chars, int charCount, bool flush)
884 int bytesProcessed, charsProcessed;
889 out bytesProcessed, out charsProcessed,
890 ref leftBytes, ref leftBits, ref procBytes);
892 leftBytes = leftBits = procBytes = 0;
893 return charsProcessed;
// Array decode that keeps the decoder state for the next call.
896 public override int GetChars (byte[] bytes, int byteIndex,
897 int byteCount, char[] chars, int charIndex)
899 int bytesProcessed, charsProcessed;
901 bytes, byteIndex, byteCount,
904 out bytesProcessed, out charsProcessed,
905 ref leftBytes, ref leftBits, ref procBytes);
906 return charsProcessed;
// Array decode with flush flag: delegates to the overload above, then clears
// the state (presumably only when 'flush' is set; the guard line is elided).
909 public override int GetChars (byte[] bytes, int byteIndex,
910 int byteCount, char[] chars, int charIndex, bool flush)
912 int r = GetChars (bytes, byteIndex, byteCount, chars, charIndex);
914 leftBytes = leftBits = procBytes = 0;
// Reset; its body is elided in this listing.
918 public override void Reset ()
// Converts bytes to chars and reports how much input was consumed, how much
// output was produced and whether all input bytes were used.
926 public unsafe override void Convert (
927 byte* bytes, int byteCount,
928 char* chars, int charCount, bool flush,
929 out int bytesUsed, out int charsUsed, out bool completed)
// NOTE(review): the guard conditions are elided; "charCount" is reported twice
// below, so the second range check presumably concerns byteCount and carries
// the wrong name - confirm.
932 throw new ArgumentNullException ("chars");
934 throw new IndexOutOfRangeException ("charCount");
936 throw new ArgumentNullException ("bytes");
938 throw new IndexOutOfRangeException ("charCount");
939 UTF8Encoding.InternalGetChars (
943 out bytesUsed, out charsUsed,
944 ref leftBytes, ref leftBits, ref procBytes);
945 // only completed if all bytes have been processed and
946 // successfully converted to chars!!
947 completed = (byteCount == bytesUsed);
950 leftBytes = leftBits = procBytes = 0;
952 } // class UTF8Decoder
954 // UTF-8 encoder implementation.
// Stateful encoder: besides the pending-surrogate state it tracks whether the
// UTF-8 identifier (EF BB BF) still has to be emitted. Many lines of this class
// are elided in this listing.
956 private class UTF8Encoder : Encoder
// Whether this encoder should emit the identifier at all (set by the constructor).
958 private bool emitIdentifier;
960 // internal encoder state
961 private uint leftChar;
// Becomes true once the identifier has been accounted for; cleared again by Reset().
962 private bool emittedIdentifier;
// Stores the fallback and the identifier preference; the identifier starts as not emitted.
965 public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
967 this.Fallback = fallback;
969 this.emitIdentifier = emitIdentifier;
970 this.emittedIdentifier = false;
973 // Override inherited methods.
// Counts the bytes needed for the given chars plus, the first time, the preamble.
// NOTE(review): this count-only method also sets emittedIdentifier, so a later
// GetBytes on the same instance will not write the identifier - confirm this is intended.
974 [ComVisibleAttribute(false)]
975 public unsafe override int GetByteCount (char* chars, int count, bool flush)
977 int charsProcessed, bytesProcessed, preambleSize = 0;
978 if (emitIdentifier && !emittedIdentifier) {
980 emittedIdentifier = true;
982 InternalGetBytes (chars, count,
985 out charsProcessed, out bytesProcessed,
989 return bytesProcessed + preambleSize;
// Array variant of the count above; the same note about emittedIdentifier applies.
992 public override int GetByteCount (char[] chars, int index,
993 int count, bool flush)
995 int charsProcessed, bytesProcessed, preambleSize = 0;
996 if (emitIdentifier && !emittedIdentifier) {
998 emittedIdentifier = true;
1000 InternalGetBytes (chars, index, count,
1002 this.FallbackBuffer,
1003 out charsProcessed, out bytesProcessed,
1007 return bytesProcessed + preambleSize;
// Pointer encode: handles the preamble first (throwing ArgumentException when it
// does not fit; the size check and the writes are elided here), then encodes
// the chars. Returns encoded bytes plus preamble size.
1010 [ComVisibleAttribute(false)]
1011 public unsafe override int GetBytes (char* chars, int charCount,
1012 byte* bytes, int byteCount, bool flush)
1014 int charsProcessed, bytesProcessed, preambleSize = 0;
1015 if (emitIdentifier && !emittedIdentifier) {
1017 throw new ArgumentException ("Insufficient Space", "UTF8 preamble");
1022 emittedIdentifier = true;
1025 InternalGetBytes (chars, charCount,
1027 this.FallbackBuffer,
1028 out charsProcessed, out bytesProcessed,
1032 return bytesProcessed + preambleSize;
// Array encode: writes the three identifier bytes at byteIndex the first time
// (byteIndex is advanced past them), then encodes the chars after the preamble.
1035 public override int GetBytes (char[] chars, int charIndex,
1036 int charCount, byte[] bytes, int byteIndex, bool flush)
1038 int charsProcessed, bytesProcessed, preambleSize = 0;
1039 if (emitIdentifier && !emittedIdentifier) {
// The identifier needs three free bytes starting at byteIndex.
1040 if (bytes.Length - byteIndex < 3)
1041 throw new ArgumentException ("Insufficient Space", "UTF8 preamble");
1042 bytes[byteIndex++] = 0xEF;
1043 bytes[byteIndex++] = 0xBB;
1044 bytes[byteIndex++] = 0xBF;
1046 emittedIdentifier = true;
1048 InternalGetBytes (chars, charIndex, charCount,
1050 this.FallbackBuffer,
1051 out charsProcessed, out bytesProcessed,
1055 return bytesProcessed + preambleSize;
// Reset: the identifier will be emitted again on the next encode (other
// statements of this method are elided in this listing).
1058 public override void Reset ()
1062 this.emittedIdentifier = false;
// Converts chars to bytes and reports how much input was consumed, how much
// output was produced (including the preamble) and whether all chars were used.
1065 public unsafe override void Convert (
1066 char* chars, int charCount,
1067 byte* bytes, int byteCount, bool flush,
1068 out int charsUsed, out int bytesUsed, out bool completed)
1070 int preambleSize = 0;
// NOTE(review): the guard conditions are elided; "charCount" is reported twice
// below, so one of the two range checks presumably concerns byteCount and
// carries the wrong name - confirm.
1072 throw new ArgumentNullException ("bytes");
1074 throw new IndexOutOfRangeException ("charCount");
1076 throw new ArgumentNullException ("chars");
1078 throw new IndexOutOfRangeException ("charCount");
1079 if (emitIdentifier && !emittedIdentifier) {
1081 throw new ArgumentException ("Insufficient Space", "UTF8 preamble");
1086 emittedIdentifier = true;
1092 this.FallbackBuffer,
1093 out charsUsed, out bytesUsed,
1095 // only completed if all chars have been processed and
1096 // successfully converted to bytes!!
1097 completed = (charCount == charsUsed);
// The preamble bytes count towards the output total.
1098 bytesUsed += preambleSize;
1102 } // class UTF8Encoder
1104 }; // class UTF8Encoding
1106 }; // namespace System.Text