3 // Copyright (c) Microsoft Corporation. All rights reserved.
7 // The worker functions in this file were optimized for performance. If you make changes
8 // you should use care to consider all of the interesting cases.
10 // The code of all worker functions in this file is written twice: once as a slow loop, and the
11 // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
12 // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
13 // processing multiple characters at a time, and falling back to the slow loop for all special cases.
15 // This define can be used to turn off the fast loops. Useful for finding whether
16 // the problem is fastloop-specific.
22 using System.Globalization;
23 using System.Runtime.Serialization;
24 using System.Security.Permissions;
25 using System.Diagnostics.Contracts;
27 // Encodes text into and out of UTF-8. UTF-8 is a way of writing
28 // Unicode characters with variable numbers of bytes per character,
29 // optimized for the lower 127 ASCII characters. It's an efficient way
30 // of encoding US English in an internationalizable way.
32 // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
34 // The UTF-8 byte order mark is simply the Unicode byte order mark
35 // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
36 // used mostly to distinguish UTF-8 text from other encodings, and doesn't
37 // switch the byte orderings.
40 [System.Runtime.InteropServices.ComVisible(true)]
41 public class UTF8Encoding : Encoding
44 bytes bits UTF-8 representation
45 ----- ---- -----------------------------------
47 2 11 110vvvvv 10vvvvvv
48 3 16 1110vvvv 10vvvvvv 10vvvvvv
49 4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
50 ----- ---- -----------------------------------
53 Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
56 private const int UTF8_CODEPAGE=65001;
58 // Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into the standard.
60 private bool emitUTF8Identifier = false;
62 private bool isThrowException = false;
// Default constructor: both the UTF-8 identifier flag and the throw-on-invalid flag are off.
// Same result as UTF8Encoding(false), which itself forwards to UTF8Encoding(false, false).
public UTF8Encoding()
    : this(false, false)
{
}
// Creates an instance with the given UTF-8 identifier setting; throwOnInvalidBytes is
// passed as false to the two-argument constructor.
public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
    : this(encoderShouldEmitUTF8Identifier, false)
{
}
// Creates an instance for code page UTF8_CODEPAGE and records the two option flags.
//   encoderShouldEmitUTF8Identifier - stored in emitUTF8Identifier.
//   throwOnInvalidBytes             - stored in isThrowException; when true the default
//                                     fallbacks are re-applied so they become the exception
//                                     fallbacks.
public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
    : base(UTF8_CODEPAGE)
{
    emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
    isThrowException = throwOnInvalidBytes;

    // Encoding's constructor already set the fallbacks, but it did so before
    // isThrowException was assigned, so they are wrong when we are throwing.
    if (throwOnInvalidBytes)
    {
        SetDefaultFallbacks();
    }
}
// Installs the default encoder and decoder fallbacks for this instance:
// the exception fallbacks when isThrowException is set, otherwise replacement
// fallbacks constructed with "\xFFFD".
internal override void SetDefaultFallbacks()
{
    if (!isThrowException)
    {
        encoderFallback = new EncoderReplacementFallback("\xFFFD");
        decoderFallback = new DecoderReplacementFallback("\xFFFD");
        return;
    }

    encoderFallback = EncoderFallback.ExceptionFallback;
    decoderFallback = DecoderFallback.ExceptionFallback;
}
104 // WARNING: GetByteCount(string chars)
105 // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
106 // WARNING: otherwise it'll break VB's way of declaring these.
108 // The following methods are copied from EncodingNLS.cs.
109 // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimplement them here.
110 // These should be kept in sync for the following classes:
111 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
114 // Returns the number of bytes required to encode a range of characters in
115 // a character array.
117 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
118 // So if you fix this, fix the others. Currently those include:
119 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
120 // parent method is safe
// Counts the bytes needed to encode `count` chars of `chars`, starting at `index`.
// Throws ArgumentNullException when chars is null, and ArgumentOutOfRangeException when
// index or count is negative or when the range extends past the end of the array.
// Returns 0 for an empty array; otherwise forwards to the pointer-based overload.
[System.Security.SecuritySafeCritical]  // auto-generated
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
    if (chars == null)
    {
        throw new ArgumentNullException("chars",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (index < 0)
    {
        throw new ArgumentOutOfRangeException("index",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException("count",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (count > chars.Length - index)
    {
        throw new ArgumentOutOfRangeException("chars",
            Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
    }
    Contract.EndContractBlock();

    // Nothing to count, and this keeps the fixed statement away from an empty array.
    if (chars.Length == 0)
    {
        return 0;
    }

    fixed (char* pinnedChars = chars)
    {
        return GetByteCount(pinnedChars + index, count, null);
    }
}
148 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
149 // So if you fix this, fix the others. Currently those include:
150 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
151 // parent method is safe
// Counts the bytes needed to encode every char of the string `chars`.
// Throws ArgumentNullException (ParamName "chars") when chars is null.
// Forwards to the pointer-based overload with no encoder.
[System.Security.SecuritySafeCritical]  // auto-generated
public override unsafe int GetByteCount(String chars)
{
    // The name given to the exception must match this method's parameter ("chars")
    // so that ArgumentNullException.ParamName is accurate for callers.
    if (chars == null)
        throw new ArgumentNullException("chars");
    Contract.EndContractBlock();

    fixed (char* pChars = chars)
        return GetByteCount(pChars, chars.Length, null);
}
165 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
166 // So if you fix this, fix the others. Currently those include:
167 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Counts the bytes needed to encode `count` chars starting at the pointer `chars`.
// Throws ArgumentNullException when chars is null and ArgumentOutOfRangeException when
// count is negative. Forwards to the internal overload with a null encoder.
[System.Security.SecurityCritical]  // auto-generated
[CLSCompliant(false)]
[System.Runtime.InteropServices.ComVisible(false)]
public override unsafe int GetByteCount(char* chars, int count)
{
    if (chars == null)
    {
        throw new ArgumentNullException("chars",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException("count",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }
    Contract.EndContractBlock();

    // No encoder state is involved for this overload.
    int total = GetByteCount(chars, count, null);
    return total;
}
188 // Parent method is safe.
189 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
190 // So if you fix this, fix the others. Currently those include:
191 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes `charCount` chars of the string `s`, starting at `charIndex`, into `bytes`
// beginning at `byteIndex`. The space offered to the encoder is everything from byteIndex
// to the end of `bytes`.
// Throws ArgumentNullException when s or bytes is null (s is reported first), and
// ArgumentOutOfRangeException for a negative charIndex/charCount, a char range that runs
// past the end of s, or a byteIndex outside [0, bytes.Length].
[System.Security.SecuritySafeCritical]  // auto-generated
public override unsafe int GetBytes(String s, int charIndex, int charCount,
                                      byte[] bytes, int byteIndex)
{
    if (s == null)
    {
        throw new ArgumentNullException("s",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (bytes == null)
    {
        throw new ArgumentNullException("bytes",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException("charIndex",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException("charCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (charCount > s.Length - charIndex)
    {
        throw new ArgumentOutOfRangeException("s",
            Environment.GetResourceString("ArgumentOutOfRange_IndexCount"));
    }

    if (byteIndex < 0 || byteIndex > bytes.Length)
    {
        throw new ArgumentOutOfRangeException("byteIndex",
            Environment.GetResourceString("ArgumentOutOfRange_Index"));
    }
    Contract.EndContractBlock();

    // Room left in the destination, measured before bytes may be swapped below.
    int byteCount = bytes.Length - byteIndex;

    // fixed doesn't like 0 length arrays, so pin a one-element stand-in instead;
    // byteCount is still 0 in that case.
    if (bytes.Length == 0)
    {
        bytes = new byte[1];
    }

    fixed (char* pinnedChars = s)
    fixed (byte* pinnedBytes = bytes)
    {
        return GetBytes(pinnedChars + charIndex, charCount,
                        pinnedBytes + byteIndex, byteCount, null);
    }
}
226 // Encodes a range of characters in a character array into a range of bytes
227 // in a byte array. An exception occurs if the byte array is not large
228 // enough to hold the complete encoding of the characters. The
229 // GetByteCount method can be used to determine the exact number of
230 // bytes that will be produced for a given range of characters.
231 // Alternatively, the GetMaxByteCount method can be used to
232 // determine the maximum number of bytes that will be produced for a given
233 // number of characters, regardless of the actual character values.
235 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
236 // So if you fix this, fix the others. Currently those include:
237 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
238 // parent method is safe
// Encodes `charCount` chars of `chars`, starting at `charIndex`, into `bytes` beginning at
// `byteIndex`. The space offered to the encoder is everything from byteIndex to the end
// of `bytes`. Returns 0 immediately when `chars` is an empty array.
// Throws ArgumentNullException when chars or bytes is null (chars is reported first), and
// ArgumentOutOfRangeException for a negative charIndex/charCount, a char range that runs
// past the end of chars, or a byteIndex outside [0, bytes.Length].
[System.Security.SecuritySafeCritical]  // auto-generated
public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                       byte[] bytes, int byteIndex)
{
    if (chars == null)
    {
        throw new ArgumentNullException("chars",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (bytes == null)
    {
        throw new ArgumentNullException("bytes",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException("charIndex",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException("charCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (charCount > chars.Length - charIndex)
    {
        throw new ArgumentOutOfRangeException("chars",
            Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
    }

    if (byteIndex < 0 || byteIndex > bytes.Length)
    {
        throw new ArgumentOutOfRangeException("byteIndex",
            Environment.GetResourceString("ArgumentOutOfRange_Index"));
    }
    Contract.EndContractBlock();

    // Nothing to encode, and this keeps the fixed statement away from an empty array.
    if (chars.Length == 0)
    {
        return 0;
    }

    // byteCount is the room available from byteIndex onward, not the size of the array.
    int byteCount = bytes.Length - byteIndex;

    // fixed doesn't like 0 length arrays, so pin a one-element stand-in instead;
    // byteCount is still 0 in that case.
    if (bytes.Length == 0)
    {
        bytes = new byte[1];
    }

    fixed (char* pinnedChars = chars)
    fixed (byte* pinnedBytes = bytes)
    {
        return GetBytes(pinnedChars + charIndex, charCount,
                        pinnedBytes + byteIndex, byteCount, null);
    }
}
280 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
281 // So if you fix this, fix the others. Currently those include:
282 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes `charCount` chars starting at `chars` into at most `byteCount` bytes at `bytes`.
// Throws ArgumentNullException when bytes or chars is null (bytes is reported first) and
// ArgumentOutOfRangeException when charCount or byteCount is negative (charCount is
// reported first). Forwards to the internal overload with a null encoder.
[System.Security.SecurityCritical]  // auto-generated
[CLSCompliant(false)]
[System.Runtime.InteropServices.ComVisible(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
    if (bytes == null)
    {
        throw new ArgumentNullException("bytes",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (chars == null)
    {
        throw new ArgumentNullException("chars",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException("charCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException("byteCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }
    Contract.EndContractBlock();

    int written = GetBytes(chars, charCount, bytes, byteCount, null);
    return written;
}
302 // Returns the number of characters produced by decoding a range of bytes in a byte array.
305 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
306 // So if you fix this, fix the others. Currently those include:
307 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
308 // parent method is safe
// Counts the chars produced by decoding `count` bytes of `bytes`, starting at `index`.
// Throws ArgumentNullException when bytes is null, and ArgumentOutOfRangeException when
// index or count is negative or when the range extends past the end of the array.
// Returns 0 for an empty array; otherwise forwards to the pointer-based overload.
[System.Security.SecuritySafeCritical]  // auto-generated
public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException("bytes",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (index < 0)
    {
        throw new ArgumentOutOfRangeException("index",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException("count",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (count > bytes.Length - index)
    {
        throw new ArgumentOutOfRangeException("bytes",
            Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
    }
    Contract.EndContractBlock();

    // No input means no chars; fixed doesn't like 0 length arrays either.
    if (bytes.Length == 0)
    {
        return 0;
    }

    fixed (byte* pinnedBytes = bytes)
    {
        return GetCharCount(pinnedBytes + index, count, null);
    }
}
336 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
337 // So if you fix this, fix the others. Currently those include:
338 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Counts the chars produced by decoding `count` bytes starting at the pointer `bytes`.
// Throws ArgumentNullException when bytes is null and ArgumentOutOfRangeException when
// count is negative. Forwards to the internal overload with a null decoder.
[System.Security.SecurityCritical]  // auto-generated
[CLSCompliant(false)]
[System.Runtime.InteropServices.ComVisible(false)]
public override unsafe int GetCharCount(byte* bytes, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException("bytes",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException("count",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }
    Contract.EndContractBlock();

    int total = GetCharCount(bytes, count, null);
    return total;
}
358 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
359 // So if you fix this, fix the others. Currently those include:
360 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
361 // parent method is safe
// Decodes `byteCount` bytes of `bytes`, starting at `byteIndex`, into `chars` beginning at
// `charIndex`. The space offered to the decoder is everything from charIndex to the end
// of `chars`. Returns 0 immediately when `bytes` is an empty array.
// Throws ArgumentNullException when bytes or chars is null (bytes is reported first), and
// ArgumentOutOfRangeException for a negative byteIndex/byteCount, a byte range that runs
// past the end of bytes, or a charIndex outside [0, chars.Length].
[System.Security.SecuritySafeCritical]  // auto-generated
public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                      char[] chars, int charIndex)
{
    if (bytes == null)
    {
        throw new ArgumentNullException("bytes",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (chars == null)
    {
        throw new ArgumentNullException("chars",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (byteIndex < 0)
    {
        throw new ArgumentOutOfRangeException("byteIndex",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException("byteCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (byteCount > bytes.Length - byteIndex)
    {
        throw new ArgumentOutOfRangeException("bytes",
            Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
    }

    if (charIndex < 0 || charIndex > chars.Length)
    {
        throw new ArgumentOutOfRangeException("charIndex",
            Environment.GetResourceString("ArgumentOutOfRange_Index"));
    }
    Contract.EndContractBlock();

    // Nothing to decode, and this keeps the fixed statement away from an empty array.
    if (bytes.Length == 0)
    {
        return 0;
    }

    // charCount is the room available from charIndex onward, not the size of the array.
    int charCount = chars.Length - charIndex;

    // fixed doesn't like 0 length arrays, so pin a one-element stand-in instead;
    // charCount is still 0 in that case.
    if (chars.Length == 0)
    {
        chars = new char[1];
    }

    fixed (byte* pinnedBytes = bytes)
    fixed (char* pinnedChars = chars)
    {
        return GetChars(pinnedBytes + byteIndex, byteCount,
                        pinnedChars + charIndex, charCount, null);
    }
}
403 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
404 // So if you fix this, fix the others. Currently those include:
405 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Decodes `byteCount` bytes starting at `bytes` into at most `charCount` chars at `chars`.
// Throws ArgumentNullException when bytes or chars is null (bytes is reported first) and
// ArgumentOutOfRangeException when charCount or byteCount is negative (charCount is
// reported first). Forwards to the internal overload with a null decoder.
[System.Security.SecurityCritical]  // auto-generated
[CLSCompliant(false)]
[System.Runtime.InteropServices.ComVisible(false)]
public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
    if (bytes == null)
    {
        throw new ArgumentNullException("bytes",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (chars == null)
    {
        throw new ArgumentNullException("chars",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException("charCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException("byteCount",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }
    Contract.EndContractBlock();

    int produced = GetChars(bytes, byteCount, chars, charCount, null);
    return produced;
}
425 // Returns a string containing the decoded representation of a range of
426 // bytes in a byte array.
428 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
429 // So if you fix this, fix the others. Currently those include:
430 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
431 // parent method is safe
// Decodes `count` bytes of `bytes`, starting at `index`, and returns them as a string.
// Throws ArgumentNullException when bytes is null, and ArgumentOutOfRangeException when
// index or count is negative or when the range extends past the end of the array.
// Returns String.Empty for an empty array.
[System.Security.SecuritySafeCritical]  // auto-generated
[System.Runtime.InteropServices.ComVisible(false)]
public override unsafe String GetString(byte[] bytes, int index, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException("bytes",
            Environment.GetResourceString("ArgumentNull_Array"));
    }

    if (index < 0)
    {
        throw new ArgumentOutOfRangeException("index",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException("count",
            Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }

    if (count > bytes.Length - index)
    {
        throw new ArgumentOutOfRangeException("bytes",
            Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
    }
    Contract.EndContractBlock();

    // Avoid problems with pinning an empty input buffer.
    if (bytes.Length == 0)
    {
        return String.Empty;
    }

    fixed (byte* pinnedBytes = bytes)
    {
        return String.CreateStringFromEncoding(pinnedBytes + index, count, this);
    }
}
460 // End of standard methods copied from EncodingNLS.cs
463 // To simplify maintenance, the structure of GetByteCount and GetBytes should be
464 // kept the same as much as possible
// Counts the bytes needed to encode `count` chars starting at `chars`.
// The running total (byteCount) starts at `count`, i.e. one byte per char, and is adjusted
// from there as non-ASCII chars and surrogates are seen.
// When baseEncoder is non-null it is cast to UTF8Encoder: its surrogateChar seeds `ch`, and
// its fallback buffer (if it already has one) is reused.
// Throws ArgumentException (Argument_EncoderFallbackNotEmpty) when that fallback buffer still
// has chars remaining, and ArgumentException (Argument_ConversionOverflow) from the overflow
// check near the end of the method.
465 [System.Security.SecurityCritical] // auto-generated
466 internal override unsafe int GetByteCount(char *chars, int count, EncoderNLS baseEncoder)
468 // For fallback we may need a fallback buffer.
469 // We wait to initialize it though in case we don't have any broken input unicode
470 EncoderFallbackBuffer fallbackBuffer = null;
472 char *pEnd = pSrc+count;
474 // Start by assuming we have as many as count
475 int byteCount = count;
479 if (baseEncoder != null) {
480 UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
481 ch = encoder.surrogateChar;
483 // We mustn't have left over fallback data when counting
484 if (encoder.InternalHasFallbackBuffer)
486 fallbackBuffer = encoder.FallbackBuffer;
487 if (fallbackBuffer.Remaining > 0)
488 throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty",
489 this.EncodingName, encoder.Fallback.GetType()));
491 // Set our internal fallback interesting things.
492 fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
497 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
501 // Unroll any fallback that happens at the end
502 ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
508 // Case of surrogates in the fallback.
509 if (fallbackBuffer != null && fallbackBuffer.bFallingBack) {
510 Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
511 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
513 ch = fallbackBuffer.InternalGetNextChar();
516 if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
523 byteCount--; // ignore last one.
532 if (baseEncoder != null && !baseEncoder.MustFlush) {
536 // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
542 Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
543 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
545 // use separate helper variables for local contexts so that the jit optimizations
546 // won't get confused about the variable lifetimes
549 // count the pending surrogate
552 // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
553 // if (IsLowSurrogate(cha)) {
554 if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
555 // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
557 // ch = cha + (ch << 10) +
559 // - CharUnicodeInfo.LOW_SURROGATE_START
560 // - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
562 // Use this next char
565 // else ch is still high surrogate and encoding will fail (so don't add count)
567 // attempt to encode the surrogate or partial surrogate
571 // If we've used a fallback, then we have to check for it
572 if (fallbackBuffer != null)
574 ch = fallbackBuffer.InternalGetNextChar();
577 // We have an extra byte we weren't expecting.
583 // read next char. The JIT optimization seems to be getting confused when
584 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
589 // if (IsHighSurrogate(ch)) {
590 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) {
591 // we will count this surrogate next time around
595 // either good char or partial surrogate
598 // throw exception on partial surrogate if necessary
599 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
600 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
602 // Lone surrogates aren't allowed
603 // Have to make a fallback buffer if we don't have one
604 if (fallbackBuffer == null)
606 // wait on fallbacks if we can
607 // For fallback we may need a fallback buffer
608 if (baseEncoder == null)
609 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
611 fallbackBuffer = baseEncoder.FallbackBuffer;
613 // Set our internal fallback interesting things.
614 fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
617 // Do our fallback. Actually we already know its a mixed up surrogate,
618 // so the ref pSrc isn't gonna do anything.
619 fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrc);
621 // Ignore it if we don't throw (we had preallocated this ch)
630 // the extra surrogate byte was compensated by the second surrogate character
631 // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
638 // check for overflow
645 // If still have fallback don't do fast loop
646 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
648 // We're reserving 1 byte for each char by default
653 int availableChars = PtrDiff(pEnd, pSrc);
655 // don't fall into the fast decoding loop if we don't have enough characters
656 if (availableChars <= 13) {
657 // try to get over the remainder of the ascii characters fast though
658 char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
659 while (pSrc < pLocalEnd) {
671 // make sure that we won't get a silent overflow inside the fast loop
672 // (Fall out to slow loop if we have this many characters)
673 availableChars &= 0x0FFFFFFF;
676 // To compute the upper bound, assume that all characters are ASCII characters at this point,
677 // the boundary will be decreased for every non-ASCII character we encounter
678 // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
679 char *pStop = pSrc + availableChars - (3 + 4);
681 while (pSrc < pStop) {
685 if (ch > 0x7F) // Not ASCII
687 if (ch > 0x7FF) // Not 2 Byte
689 if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
// Bit 1 of the address is set, so pSrc is not on a 4-byte boundary.
// NOTE(review): presumably one char is handled here so the int-sized reads below start aligned - confirm.
697 if ((unchecked((int)pSrc) & 0x2) != 0) {
700 if (ch > 0x7F) // Not ASCII
702 if (ch > 0x7FF) // Not 2 Byte
704 if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
712 // Run 2 * 4 characters at a time!
713 while (pSrc < pStop) {
// Reads the two chars that start two positions past pSrc as a single int.
715 int chc = *(int*)(pSrc+2);
// 0xFF80FF80 covers every bit above the low 7 in each 16-bit half, so a non-zero result
// means at least one packed char is > 0x7F (assuming ch was loaded the same way as chc).
716 if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
718 if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
720 goto LongCodeWithMask;
724 if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits)
726 if ((ch & unchecked((int)0xFF80)) != 0)
728 if ((chc & unchecked((int)0xFF800000)) != 0)
730 if ((chc & unchecked((int)0xFF80)) != 0)
736 chc = *(int*)(pSrc+2);
737 if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
739 if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
741 goto LongCodeWithMask;
744 if ((ch & unchecked((int)0xFF800000)) != 0)
746 if ((ch & unchecked((int)0xFF80)) != 0)
748 if ((chc & unchecked((int)0xFF800000)) != 0)
750 if ((chc & unchecked((int)0xFF80)) != 0)
758 if (!BitConverter.IsLittleEndian) {
759 // be careful about the sign extension
760 ch = (int)(((uint)ch) >> 16);
771 // use separate helper variables for slow and fast loop so that the jit optimizations
772 // won't get confused about the variable lifetimes
774 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
775 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
776 // 4 byte encoding - high surrogate + low surrogate
780 // !IsHighSurrogate(ch) // low without high -> bad
781 ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
782 // !IsLowSurrogate(chd) // high not followed by low -> bad
783 !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END) )
785 // Back up and drop out to slow loop to figure out error
791 // byteCount - this byte is compensated by the second surrogate character
797 // byteCount - the last byte is already included
801 // no pending char at this point
806 // check for overflow
808 throw new ArgumentException(
809 Environment.GetResourceString("Argument_ConversionOverflow"));
813 Contract.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
814 "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
// Returns the distance from b to a, measured in chars.
// The subtraction is done on the raw byte addresses and the result is treated as
// unsigned before being halved. Unsigned arithmetic is good enough for the callers
// and tends to generate better code than the signed arithmetic produced by default.
[System.Security.SecurityCritical]  // auto-generated
private static unsafe int PtrDiff(char *a, char* b)
{
    uint byteDistance = (uint)((byte*)a - (byte*)b);
    return (int)(byteDistance >> 1);
}
828 // byte* flavor just for parity
// Byte-pointer counterpart of PtrDiff(char*, char*); takes two byte pointers and returns an int.
// NOTE(review): the body is not shown here; presumably it returns the byte distance a - b - confirm.
829 [System.Security.SecurityCritical] // auto-generated
830 unsafe private static int PtrDiff(byte* a, byte* b)
// Returns true when start <= ch <= end (both bounds inclusive), using one comparison.
// Both sides are converted to uint, so a ch below start wraps to a large value and
// fails the comparison just like a ch above end.
private static bool InRange(int ch, int start, int end)
{
    uint offsetFromStart = (uint)(ch - start);
    uint rangeWidth = (uint)(end - start);
    return offsetFromStart <= rangeWidth;
}
841 // Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw
842 [System.Security.SecurityCritical] // auto-generated
843 internal override unsafe int GetBytes(char* chars, int charCount,
844 byte* bytes, int byteCount, EncoderNLS baseEncoder)
846 Contract.Assert(chars!=null, "[UTF8Encoding.GetBytes]chars!=null");
847 Contract.Assert(byteCount >=0, "[UTF8Encoding.GetBytes]byteCount >=0");
848 Contract.Assert(charCount >=0, "[UTF8Encoding.GetBytes]charCount >=0");
849 Contract.Assert(bytes!=null, "[UTF8Encoding.GetBytes]bytes!=null");
851 UTF8Encoder encoder = null;
853 // For fallback we may need a fallback buffer.
854 // We wait to initialize it though in case we don't have any broken input unicode
855 EncoderFallbackBuffer fallbackBuffer = null;
857 byte *pTarget = bytes;
859 char *pEnd = pSrc+charCount;
860 byte *pAllocatedBufferEnd = pTarget+byteCount;
864 // assume that JIT will enregister pSrc, pTarget and ch
866 if (baseEncoder != null) {
867 encoder = (UTF8Encoder)baseEncoder;
868 ch = encoder.surrogateChar;
870 // We mustn't have left over fallback data when counting
871 if (encoder.InternalHasFallbackBuffer)
873 // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
874 fallbackBuffer = encoder.FallbackBuffer;
875 if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
876 throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty",
877 this.EncodingName, encoder.Fallback.GetType()));
879 // Set our internal fallback interesting things.
880 fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
885 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
890 // Check if there's anything left to get out of the fallback buffer
891 ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
896 // Case of leftover surrogates in the fallback buffer
897 if (fallbackBuffer != null && fallbackBuffer.bFallingBack) {
898 Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
899 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
903 ch = fallbackBuffer.InternalGetNextChar();
905 if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
906 ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
916 // attempt to encode the partial surrogate (will fail or ignore)
917 if (ch > 0 && (encoder == null || encoder.MustFlush))
925 // We have a high surrogate left over from a previous loop.
926 Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
927 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
929 // use separate helper variables for local contexts so that the jit optimizations
930 // won't get confused about the variable lifetimes
933 // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
934 // if (IsLowSurrogate(cha)) {
935 if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
936 ch = cha + (ch << 10) +
938 - CharUnicodeInfo.LOW_SURROGATE_START
939 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
943 // else ch is still high surrogate and encoding will fail
945 // attempt to encode the surrogate or partial surrogate
949 // If we've used a fallback, then we have to check for it
950 if (fallbackBuffer != null)
952 ch = fallbackBuffer.InternalGetNextChar();
953 if (ch > 0) goto ProcessChar;
956 // read next char. The JIT optimization seems to be getting confused when
957 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
962 // if (IsHighSurrogate(ch)) {
963 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) {
966 // either good char or partial surrogate
969 // throw exception on partial surrogate if necessary
970 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
971 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
973 // Lone surrogates aren't allowed, we have to do fallback for them
974 // Have to make a fallback buffer if we don't have one
975 if (fallbackBuffer == null)
977 // wait on fallbacks if we can
978 // For fallback we may need a fallback buffer
979 if (baseEncoder == null)
980 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
982 fallbackBuffer = baseEncoder.FallbackBuffer;
984 // Set our internal fallback interesting things.
985 fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
988 // Do our fallback. Actually we already know its a mixed up surrogate,
989 // so the ref pSrc isn't gonna do anything.
990 fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrc);
992 // Ignore it if we don't throw
997 // Count bytes needed
1002 bytesNeeded++; // 4 bytes (surrogate pair)
1004 bytesNeeded++; // 3 bytes (800-FFFF)
1006 bytesNeeded++; // 2 bytes (80-7FF)
1009 if (pTarget > pAllocatedBufferEnd - bytesNeeded) {
1010 // Left over surrogate from last time will cause pSrc == chars, so we'll throw
1011 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1013 fallbackBuffer.MovePrevious(); // Didn't use this fallback char
1015 fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either
1019 pSrc--; // Didn't use this char
1021 pSrc--; // Was surrogate, didn't use 2nd part either
1023 Contract.Assert(pSrc >= chars || pTarget == bytes,
1024 "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
1025 ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must
1026 ch = 0; // Nothing left over (we backed up to start of pair if supplimentary)
1031 *pTarget = (byte)ch;
1034 // use separate helper variables for local contexts so that the jit optimizations
1035 // won't get confused about the variable lifetimes
1039 chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
1044 chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
1048 *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1051 chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1053 *pTarget = (byte)chb;
1056 chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1058 *pTarget = (byte)chb;
1061 *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1067 // If still have fallback don't do fast loop
1068 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
1071 int availableChars = PtrDiff(pEnd, pSrc);
1072 int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
1074 // don't fall into the fast decoding loop if we don't have enough characters
1075 // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
1076 if (availableChars <= 13) {
1077 // we are hoping for 1 byte per char
1078 if (availableBytes < availableChars) {
1079 // not enough output room. no pending bits at this point
1084 // try to get over the remainder of the ascii characters fast though
1085 char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1086 while (pSrc < pLocalEnd) {
1090 // Not ASCII, need more than 1 byte per char
1094 *pTarget = (byte)ch;
1097 // we are done, let ch be 0 to clear encoder
1102 // we need at least 1 byte per character, but Convert might allow us to convert
1103 // only part of the input, so try as much as we can. Reduce charCount if necessary
1104 if (availableBytes < availableChars)
1106 availableChars = availableBytes;
1110 // - optimistic range checks
1111 // - fallbacks to the slow loop for all special cases, exception throwing, etc.
1113 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1114 // the boundary will be decreased for every non-ASCII character we encounter
1115 // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
1116 // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
1117 char *pStop = pSrc + availableChars - 5;
1119 while (pSrc < pStop) {
1126 *pTarget = (byte)ch;
1130 if ((unchecked((int)pSrc) & 0x2) != 0) {
1136 *pTarget = (byte)ch;
1140 // Run 4 characters at a time!
1141 while (pSrc < pStop) {
1143 int chc = *(int*)(pSrc+2);
1144 if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) {
1145 goto LongCodeWithMask;
1148 // Unfortunately, this is endianess sensitive
1149 if (!BitConverter.IsLittleEndian) {
1150 *pTarget = (byte)(ch>>16);
1151 *(pTarget+1) = (byte)ch;
1153 *(pTarget+2) = (byte)(chc>>16);
1154 *(pTarget+3) = (byte)chc;
1157 *pTarget = (byte)ch;
1158 *(pTarget+1) = (byte)(ch>>16);
1160 *(pTarget+2) = (byte)chc;
1161 *(pTarget+3) = (byte)(chc>>16);
1168 if (!BitConverter.IsLittleEndian) {
1169 // be careful about the sign extension
1170 ch = (int)(((uint)ch) >> 16);
1179 *pTarget = (byte)ch;
1184 // use separate helper variables for slow and fast loop so that the jit optimizations
1185 // won't get confused about the variable lifetimes
1189 chd = unchecked((sbyte)0xC0) | (ch >> 6);
1192 // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
1193 if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
1195 chd = unchecked((sbyte)0xE0) | (ch >> 12);
1199 // 4 byte encoding - high surrogate + low surrogate
1200 // if (!IsHighSurrogate(ch))
1201 if (ch > CharUnicodeInfo.HIGH_SURROGATE_END) {
1202 // low without high -> bad, try again in slow loop
1210 // if (!IsLowSurrogate(chd)) {
1211 if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
1212 // high not followed by low -> bad, try again in slow loop
1217 ch = chd + (ch << 10) +
1219 - CharUnicodeInfo.LOW_SURROGATE_START
1220 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
1222 *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1223 // pStop - this byte is compensated by the second surrogate character
1224 // 2 input chars require 4 output bytes. 2 have been anticipated already
1225 // and 2 more will be accounted for by the 2 pStop-- calls below.
1228 chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1230 *pTarget = (byte)chd;
1231 pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too.
1234 chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1236 *pTarget = (byte)chd;
1237 pStop--; // 2 byte sequence for 1 char so need pStop--.
1240 *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1241 // pStop - this byte is already included
1245 Contract.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
1249 // no pending char at this point
1253 // Do we have to set the encoder bytes?
1254 if (encoder != null)
1256 Contract.Assert(!encoder.MustFlush || ch == 0,
1257 "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
1259 encoder.surrogateChar = ch;
1260 encoder.m_charsUsed = (int)(pSrc - chars);
1263 Contract.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1264 baseEncoder == null || !baseEncoder.m_throwOnOverflow,
1265 "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
1267 return (int)(pTarget - bytes);
1271 // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
1272 // while the actual character is being built in the lower bits. They are shifted together
1273 // with the actual bits of the character (each trail byte is folded in with "ch << 6").
// The lead-byte handling ORs the flags in pre-shifted to the right (e.g. FinalByte >> 3*6 for a
// 4 byte sequence) so that they reach the bit positions below as trail bytes are folded in.
1275 // bits 30 & 31 are used for pending bits fixup (the char-count adjustment read back as "ch >> 30")
1276 private const int FinalByte = 1 << 29; // reaches bit 29 once the last expected trail byte has been folded in
1277 private const int SupplimentarySeq = 1 << 28; // marks a 4 byte (supplementary, surrogate pair) sequence in progress
1278 private const int ThreeByteSeq = 1 << 27; // marks a 3 byte sequence in progress
// GetCharCount: unsafe worker that works out how many chars the 'count' bytes at 'bytes' decode to.
//   bytes       - start of the input; asserted non-null.
//   count       - number of input bytes; asserted non-negative.
//   baseDecoder - optional decoder (cast to UTF8Decoder) that carries state between calls; when it is
//                 null, or when its MustFlush is set, leftover partial-sequence bits are sent to the fallback.
// charCount starts out as one char per input byte and is then adjusted as multi-byte sequences and
// invalid sequences (via FallbackInvalidByteSequence) are encountered.
// NOTE(review): this listing omits several lines of the method (braces, some statements, the final
// return); comments added here describe only what the visible lines establish.
1280 // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
1281 // If exceptions aren't turned on, then we drop all non-shortest & individual surrogates.
1283 // To simplify maintenance, the structure of GetCharCount and GetChars should be
1284 // kept the same as much as possible
1285 [System.Security.SecurityCritical] // auto-generated
1286 internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1288 Contract.Assert(count >=0, "[UTF8Encoding.GetCharCount]count >=0");
1289 Contract.Assert(bytes!=null, "[UTF8Encoding.GetCharCount]bytes!=null");
1293 byte *pEnd = pSrc+count;
1295 // Start by assuming we have as many as count, charCount always includes the adjustment
1296 // for the character being decoded
1297 int charCount = count;
1299 DecoderFallbackBuffer fallback = null;
1301 if (baseDecoder != null) {
1302 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1304 charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars (bits 30 & 31 of the state hold the fixup).
1306 // Shouldn't have anything in fallback buffer for GetCharCount
1307 // (don't have to check m_throwOnOverflow for count)
1308 Contract.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1309 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
1314 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1325 // read next byte. The JIT optimization seems to be getting confused when
1326 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1330 // we are expecting to see trailing bytes like 10vvvvvv
1331 if ((cha & unchecked((sbyte)0xC0)) != 0x80) {
1332 // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1333 // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1335 charCount += (ch >> 30);
1336 goto InvalidByteSequence;
1339 // fold in the new byte (the state flags in the high bits shift up by 6 together with the data bits)
1340 ch = (ch << 6) | (cha & 0x3F);
1342 if ((ch & FinalByte) == 0) {
// NOTE(review): the assert message below names GetChars although this is GetCharCount.
1343 Contract.Assert( (ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1344 "[UTF8Encoding.GetChars]Invariant volation");
1346 if ((ch & SupplimentarySeq) != 0) {
1347 if ((ch & (FinalByte >> 6)) != 0) {
1348 // this is 3rd byte (of 4 byte supplementary) - nothing to do
1352 // 2nd byte, check for non-shortest form of supplementary char and the valid
1353 // supplementary characters in range 0x010000 - 0x10FFFF at the same time
1354 if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
1355 goto InvalidByteSequence;
1359 // Must be 2nd byte of a 3-byte sequence
1360 // check for non-shortest form of 3 byte seq
1361 if ((ch & (0x1F << 5)) == 0 || // non-shortest form
1362 (ch & (0xF800 >> 6) ) == (0xD800 >> 6)) // illegal individually encoded surrogate
1364 goto InvalidByteSequence;
1372 // adjust for surrogates in non-shortest form
1373 if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) {
1378 InvalidByteSequence:
1379 // this code fragment should be close to the gotos referencing it
1380 // Have to do fallback for invalid bytes
// The fallback buffer is created lazily: from this encoding's decoderFallback when there is no
// decoder, otherwise the decoder's own FallbackBuffer is used.
1381 if (fallback == null)
1383 if (baseDecoder == null)
1384 fallback = this.decoderFallback.CreateFallbackBuffer();
1386 fallback = baseDecoder.FallbackBuffer;
1387 fallback.InternalInitialize(bytes, null);
1389 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1400 // If its > 0x7F, its start of a new multi-byte sequence
1402 // Long sequence, so unreserve our char.
1405 // bit 6 has to be non-zero for start of multibyte chars.
1406 if ((ch & 0x40) == 0) {
1407 // Unexpected trail byte
1408 goto InvalidByteSequence;
1411 // start a new long code
1412 if ((ch & 0x20) != 0) {
1413 if ((ch & 0x10) != 0) {
1414 // 4 byte encoding - supplementary character (2 surrogates)
1418 // check that bit 4 is zero and the valid supplementary character
1419 // range 0x000000 - 0x10FFFF at the same time
1422 goto InvalidByteSequence;
1425 // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1426 // Final byte flag, count fix if we don't make final byte & supplementary sequence flag.
1427 ch |= (FinalByte >> 3*6) | // Final byte is 3 more bytes from now
1428 (1 << 30) | // If it dies on next byte we'll need an extra char
1429 (3 << (30-2*6)) | // If it dies on last byte we'll need to subtract a char
1430 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1431 (SupplimentarySeq >> 2*6) | (SupplimentarySeq >> 3*6);
1433 // Our character count will be 2 characters for these 4 bytes, so subtract another char
1438 // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1439 ch = (ch & 0x0F) | ( (FinalByte >> 2*6) | (1 << 30) |
1440 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2*6) );
1442 // We'll expect 1 character for these 3 bytes, so subtract another char.
1451 // check for non-shortest form
1454 goto InvalidByteSequence;
1457 // Add bit flags so we'll be flagged correctly
1458 ch |= (FinalByte >> 6);
1466 int availableBytes = PtrDiff(pEnd, pSrc);
1468 // don't fall into the fast decoding loop if we don't have enough bytes
1469 if (availableBytes <= 13) {
1470 // try to get over the remainder of the ascii characters fast though
1471 byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1472 while (pSrc < pLocalEnd) {
1484 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1485 // the boundary will be decreased for every non-ASCII character we encounter
1486 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
1487 byte *pStop = pSrc + availableBytes - 7;
1489 while (pSrc < pStop) {
1497 // get pSrc 2-byte aligned
1498 if ((unchecked((int)pSrc) & 0x1) != 0) {
1506 // get pSrc 4-byte aligned
1507 if ((unchecked((int)pSrc) & 0x2) != 0) {
1508 ch = *(ushort*)pSrc;
1509 if ((ch & 0x8080) != 0) {
1510 goto LongCodeWithMask16;
1515 // Run 8 + 8 characters at a time!
1516 while (pSrc < pStop) {
1518 int chb = *(int*)(pSrc+4);
// 0x80808080 tests the high bit of all four bytes in each int: any non-ASCII byte leaves the fast path
1519 if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
1520 goto LongCodeWithMask32;
1524 // This is a really small loop - unroll it
1529 chb = *(int*)(pSrc+4);
1530 if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
1531 goto LongCodeWithMask32;
1538 // be careful about the sign extension
1539 if (!BitConverter.IsLittleEndian) {
1540 ch = (int)(((uint)ch) >> 16);
1545 if (!BitConverter.IsLittleEndian) {
1546 ch = (int)(((uint)ch) >> 8);
1560 // bit 6 has to be zero
1562 // we are expecting to see trailing bytes like 10vvvvvv
1563 (chc & unchecked((sbyte)0xC0)) != 0x80)
1570 // start a new long code
1571 if ((ch & 0x20) != 0) {
1573 // fold the first two bytes together
1574 chc |= (ch & 0x0F) << 6;
1576 if ((ch & 0x10) != 0) {
1577 // 4 byte encoding - surrogate
1580 // check that bit 4 is zero, the non-shortest form of surrogate
1581 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
1582 !InRange(chc >> 4, 0x01, 0x10) ||
1583 // we are expecting to see trailing bytes like 10vvvvvv
1584 (ch & unchecked((sbyte)0xC0)) != 0x80 )
1589 chc = (chc << 6) | (ch & 0x3F);
1592 // we are expecting to see trailing bytes like 10vvvvvv
1593 if ((ch & unchecked((sbyte)0xC0)) != 0x80) {
1605 // check for non-shortest form of 3 byte seq
1606 (chc & (0x1F << 5)) == 0 ||
1607 // Can't have surrogates here.
1608 (chc & (0xF800 >> 6) ) == (0xD800 >> 6) ||
1609 // we are expecting to see trailing bytes like 10vvvvvv
1610 (ch & unchecked((sbyte)0xC0)) != 0x80 )
1623 // check for non-shortest form
1624 if ((ch & 0x1E) == 0) {
1634 // no pending bits at this point
1644 // May have a problem if we have to flush
1647 // We were already adjusting for these, so need to unadjust
1648 charCount += (ch >> 30);
// With no decoder, or a decoder that must flush, the leftover partial sequence is counted via the fallback
1649 if (baseDecoder == null || baseDecoder.MustFlush)
1651 // Have to do fallback for invalid bytes
1652 if (fallback == null)
1654 if (baseDecoder == null)
1655 fallback = this.decoderFallback.CreateFallbackBuffer();
1657 fallback = baseDecoder.FallbackBuffer;
1658 fallback.InternalInitialize(bytes, null);
1660 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1664 // Shouldn't have anything in fallback buffer for GetCharCount
1665 // (don't have to check m_throwOnOverflow for count)
1666 Contract.Assert(fallback == null || fallback.Remaining == 0,
1667 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
// GetChars: unsafe worker that decodes 'byteCount' UTF-8 bytes at 'bytes' into the char buffer at 'chars'.
//   bytes / byteCount - input buffer and its length; asserted non-null / non-negative.
//   chars / charCount - output buffer and its capacity in chars; asserted non-null / non-negative.
//   baseDecoder       - optional decoder (cast to UTF8Decoder) carrying state between calls; when it is
//                       non-null, m_bytesUsed is set to the number of input bytes consumed.
// Returns PtrDiff(pTarget, chars), i.e. the distance the output pointer advanced.
// When the output buffer is too small the code backs pSrc up to the start of the current sequence and
// calls ThrowCharsOverflow(baseDecoder, pTarget == chars).
// NOTE(review): this listing omits several lines of the method (braces, some statements); comments
// added here describe only what the visible lines establish.
1672 // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
1673 // So if we're really broken, then that could also throw an error... recursively.
1674 // So try to make sure GetChars can at least process all uses by
1675 // System.Resources.ResourceReader!
1677 // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
1678 // If exceptions aren't turned on, then we drop all non-shortest & individual surrogates.
1680 // To simplify maintenance, the structure of GetCharCount and GetChars should be
1681 // kept the same as much as possible
1682 [System.Security.SecurityCritical] // auto-generated
1683 internal override unsafe int GetChars(byte* bytes, int byteCount,
1684 char* chars, int charCount, DecoderNLS baseDecoder)
1686 Contract.Assert(chars!=null, "[UTF8Encoding.GetChars]chars!=null");
1687 Contract.Assert(byteCount >=0, "[UTF8Encoding.GetChars]count >=0");
1688 Contract.Assert(charCount >=0, "[UTF8Encoding.GetChars]charCount >=0");
1689 Contract.Assert(bytes!=null, "[UTF8Encoding.GetChars]bytes!=null");
1692 char *pTarget = chars;
1694 byte *pEnd = pSrc+byteCount;
1695 char *pAllocatedBufferEnd = pTarget+charCount;
1699 DecoderFallbackBuffer fallback = null;
1700 if (baseDecoder != null) {
1701 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1704 // Shouldn't have anything in fallback buffer for GetChars
1705 // (don't have to check m_throwOnOverflow for chars, we always use all or none so always should be empty)
1706 Contract.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1707 "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
1712 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1723 // read next byte. The JIT optimization seems to be getting confused when
1724 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1728 // we are expecting to see trailing bytes like 10vvvvvv
1729 if ((cha & unchecked((sbyte)0xC0)) != 0x80) {
1730 // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1731 // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1733 goto InvalidByteSequence;
1736 // fold in the new byte (the state flags in the high bits shift up by 6 together with the data bits)
1737 ch = (ch << 6) | (cha & 0x3F);
1739 if ((ch & FinalByte) == 0) {
1740 // Not at last byte yet
1741 Contract.Assert( (ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1742 "[UTF8Encoding.GetChars]Invariant volation");
1744 if ((ch & SupplimentarySeq) != 0) {
1745 // Its a 4-byte supplementary sequence
1746 if ((ch & (FinalByte >> 6)) != 0) {
1747 // this is 3rd byte of 4 byte sequence - nothing to do
1751 // 2nd byte of 4 bytes
1752 // check for non-shortest form of surrogate and the valid surrogate
1753 // range 0x000000 - 0x10FFFF at the same time
1754 if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
1755 goto InvalidByteSequence;
1759 // Must be 2nd byte of a 3-byte sequence
1760 // check for non-shortest form of 3 byte seq
1761 if ((ch & (0x1F << 5)) == 0 || // non-shortest form
1762 (ch & (0xF800 >> 6) ) == (0xD800 >> 6)) // illegal individually encoded surrogate
1764 goto InvalidByteSequence;
1772 // surrogate in shortest form?
1773 // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1774 if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) {
1775 // let the range check for the second char throw the exception
1776 if (pTarget < pAllocatedBufferEnd) {
// high surrogate: top bits of the code point, rebased by HIGH_SURROGATE_START - (0x10000 >> 10)
1777 *pTarget = (char)( ((ch >> 10) & 0x7FF) +
1778 unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))) );
1782 unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
1788 InvalidByteSequence:
1789 // this code fragment should be close to the gotos referencing it
1790 // Have to do fallback for invalid bytes
// The fallback buffer is created lazily: from this encoding's decoderFallback when there is no
// decoder, otherwise the decoder's own FallbackBuffer is used.
1791 if (fallback == null)
1793 if (baseDecoder == null)
1794 fallback = this.decoderFallback.CreateFallbackBuffer();
1796 fallback = baseDecoder.FallbackBuffer;
1797 fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
1799 // This'll back us up the appropriate # of bytes if we didn't get anywhere
1800 if (!FallbackInvalidByteSequence(ref pSrc, ch, fallback, ref pTarget))
1802 // Ran out of buffer space
1803 // Need to throw an exception?
1804 Contract.Assert(pSrc >= bytes || pTarget == chars,
1805 "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1806 fallback.InternalReset();
1807 ThrowCharsOverflow(baseDecoder, pTarget == chars);
1811 Contract.Assert(pSrc >= bytes,
1812 "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1822 // If its > 0x7F, its start of a new multi-byte sequence
1824 // bit 6 has to be non-zero
1825 if ((ch & 0x40) == 0) {
1826 goto InvalidByteSequence;
1829 // start a new long code
1830 if ((ch & 0x20) != 0) {
1831 if ((ch & 0x10) != 0) {
1832 // 4 byte encoding - supplementary character (2 surrogates)
1836 // check that bit 4 is zero and the valid supplementary character
1837 // range 0x000000 - 0x10FFFF at the same time
1840 goto InvalidByteSequence;
// Same state flags as in GetCharCount: final-byte marker 3 trail bytes away, count fixups in
// bits 30/31, and the supplementary-sequence marker at every shift position.
1843 ch |= (FinalByte >> 3*6) | (1 << 30) | (3 << (30-2*6)) |
1844 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1845 (SupplimentarySeq >> 2*6) | (SupplimentarySeq >> 3*6);
1849 ch = (ch & 0x0F) | ( (FinalByte >> 2*6) | (1 << 30) |
1850 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2*6) );
1858 // check for non-shortest form
1861 goto InvalidByteSequence;
1864 ch |= (FinalByte >> 6);
1870 // write the pending character
1871 if (pTarget >= pAllocatedBufferEnd)
1873 // Fix chars so we make sure to throw if we didn't output anything
1879 if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
1880 ch <= CharUnicodeInfo.LOW_SURROGATE_END)
1882 pSrc--; // It was 4 bytes
1883 pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
1885 else if (ch > 0xffff)
1887 pSrc--; // It was 4 bytes, nothing was stored
1889 pSrc--; // It was at least 3 bytes
1891 pSrc--; // It was at least 2 bytes
1895 // Throw that we don't have enough room (pSrc could be < chars if we had started to process
1896 // a 4 byte sequence already)
1897 Contract.Assert(pSrc >= bytes || pTarget == chars,
1898 "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1899 ThrowCharsOverflow(baseDecoder, pTarget == chars);
1901 // Don't store ch in decoder, we already backed up to its start
1904 // Didn't throw, just use this buffer size.
1907 *pTarget = (char)ch;
1911 int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1912 int availableBytes = PtrDiff(pEnd, pSrc);
1914 // don't fall into the fast decoding loop if we don't have enough bytes
1915 // Test for availableChars is done because pStop would be <= pTarget.
1916 if (availableBytes <= 13) {
1917 // we may need as many as 1 character per byte
1918 if (availableChars < availableBytes) {
1919 // not enough output room. no pending bits at this point
1924 // try to get over the remainder of the ascii characters fast though
1925 byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1926 while (pSrc < pLocalEnd) {
1933 *pTarget = (char)ch;
1941 // we may need as many as 1 character per byte, so reduce the byte count if necessary.
1942 // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
1943 if (availableChars < availableBytes) {
1944 availableBytes = availableChars;
1947 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1948 // the boundary will be decreased for every non-ASCII character we encounter
1949 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
// Note that pStop bounds the *output* pointer here (the loops below test pTarget, not pSrc).
1950 char *pStop = pTarget + availableBytes - 7;
1952 while (pTarget < pStop) {
1959 *pTarget = (char)ch;
1962 // get pSrc to be 2-byte aligned
1963 if ((unchecked((int)pSrc) & 0x1) != 0) {
1969 *pTarget = (char)ch;
1973 // get pSrc to be 4-byte aligned
1974 if ((unchecked((int)pSrc) & 0x2) != 0) {
1975 ch = *(ushort*)pSrc;
1976 if ((ch & 0x8080) != 0) {
1977 goto LongCodeWithMask16;
1980 // Unfortunately, this is endianness sensitive
1981 if (!BitConverter.IsLittleEndian) {
1982 *pTarget = (char)((ch >> 8) & 0x7F);
1984 *(pTarget+1) = (char)(ch & 0x7F);
1987 *pTarget = (char)(ch & 0x7F);
1989 *(pTarget+1) = (char)((ch >> 8) & 0x7F);
1994 // Run 8 characters at a time!
1995 while (pTarget < pStop) {
1997 int chb = *(int*)(pSrc+4);
// 0x80808080 tests the high bit of all four bytes in each int: any non-ASCII byte leaves the fast path
1998 if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
1999 goto LongCodeWithMask32;
2002 // Unfortunately, this is endianness sensitive
2003 if (!BitConverter.IsLittleEndian) {
2004 *pTarget = (char)((ch >> 24) & 0x7F);
2005 *(pTarget+1) = (char)((ch >> 16) & 0x7F);
2006 *(pTarget+2) = (char)((ch >> 8) & 0x7F);
2007 *(pTarget+3) = (char)(ch & 0x7F);
2009 *(pTarget+4) = (char)((chb >> 24) & 0x7F);
2010 *(pTarget+5) = (char)((chb >> 16) & 0x7F);
2011 *(pTarget+6) = (char)((chb >> 8) & 0x7F);
2012 *(pTarget+7) = (char)(chb & 0x7F);
2015 *pTarget = (char)(ch & 0x7F);
2016 *(pTarget+1) = (char)((ch >> 8) & 0x7F);
2017 *(pTarget+2) = (char)((ch >> 16) & 0x7F);
2018 *(pTarget+3) = (char)((ch >> 24) & 0x7F);
2020 *(pTarget+4) = (char)(chb & 0x7F);
2021 *(pTarget+5) = (char)((chb >> 8) & 0x7F);
2022 *(pTarget+6) = (char)((chb >> 16) & 0x7F);
2023 *(pTarget+7) = (char)((chb >> 24) & 0x7F);
2030 if (!BitConverter.IsLittleEndian) {
2031 // be careful about the sign extension
2032 ch = (int)(((uint)ch) >> 16);
2037 if (!BitConverter.IsLittleEndian) {
2038 ch = (int)(((uint)ch) >> 8);
2044 *pTarget = (char)ch;
2054 // bit 6 has to be zero
2056 // we are expecting to see trailing bytes like 10vvvvvv
2057 (chc & unchecked((sbyte)0xC0)) != 0x80)
2064 // start a new long code
2065 if ((ch & 0x20) != 0) {
2067 // fold the first two bytes together
2068 chc |= (ch & 0x0F) << 6;
2070 if ((ch & 0x10) != 0) {
2071 // 4 byte encoding - surrogate
2074 // check that bit 4 is zero, the non-shortest form of surrogate
2075 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2076 !InRange(chc >> 4, 0x01, 0x10) ||
2077 // we are expecting to see trailing bytes like 10vvvvvv
2078 (ch & unchecked((sbyte)0xC0)) != 0x80 )
2083 chc = (chc << 6) | (ch & 0x3F);
2086 // we are expecting to see trailing bytes like 10vvvvvv
2087 if ((ch & unchecked((sbyte)0xC0)) != 0x80) {
2092 ch = (chc << 6) | (ch & 0x3F);
2094 *pTarget = (char)( ((ch >> 10) & 0x7FF) +
2095 unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))) );
2099 unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
2101 // extra byte, we're already planning 2 chars for 2 of these bytes,
2102 // but the big loop is testing the target against pStop, so we need
2103 // to subtract 2 more or we risk overrunning the input. Subtract
2104 // one here and one below.
2111 // check for non-shortest form of 3 byte seq
2112 (chc & (0x1F << 5)) == 0 ||
2113 // Can't have surrogates here.
2114 (chc & (0xF800 >> 6) ) == (0xD800 >> 6) ||
2115 // we are expecting to see trailing bytes like 10vvvvvv
2116 (ch & unchecked((sbyte)0xC0)) != 0x80 )
2122 ch = (chc << 6) | (ch & 0x3F);
2124 // extra byte, we're only expecting 1 char for each of these 3 bytes,
2125 // but the loop is testing the target (not source) against pStop, so
2126 // we need to subtract 2 more or we risk overrunning the input.
2127 // Subtract 1 here and one more below
2136 // check for non-shortest form
2140 ch = (ch << 6) | chc;
2143 *pTarget = (char)ch;
2146 // extra byte, we're only expecting 1 char for each of these 2 bytes,
2147 // but the loop is testing the target (not source) against pStop.
2148 // subtract an extra count from pStop so that we don't overrun the input.
2153 Contract.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2155 // no pending bits at this point
// Leftover partial-sequence bits are only sent to the fallback when there is no decoder or it must flush
2165 if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
2167 // Have to do fallback for invalid bytes
2168 if (fallback == null)
2170 if (baseDecoder == null)
2171 fallback = this.decoderFallback.CreateFallbackBuffer();
2173 fallback = baseDecoder.FallbackBuffer;
2174 fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
2177 // This'll back us up the appropriate # of bytes if we didn't get anywhere
2178 if (!FallbackInvalidByteSequence(ref pSrc, ch, fallback, ref pTarget))
2180 Contract.Assert(pSrc >= bytes || pTarget == chars,
2181 "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2183 // Ran out of buffer space
2184 // Need to throw an exception?
2185 fallback.InternalReset();
2186 ThrowCharsOverflow(baseDecoder, pTarget == chars);
2188 Contract.Assert(pSrc >= bytes,
2189 "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2193 if (baseDecoder != null)
2195 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
2197 // If we're storing flush data we expect all bits to be used or else
2198 // we're stuck in the middle of a conversion
2199 Contract.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder.m_throwOnOverflow,
2200 "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
2202 // Remember our leftover bits.
// Report how many input bytes were consumed by this call
2205 baseDecoder.m_bytesUsed = (int)(pSrc - bytes);
2208 // Shouldn't have anything in fallback buffer for GetChars
2209 // (don't have to check m_throwOnOverflow for chars)
2210 Contract.Assert(fallback == null || fallback.Remaining == 0,
2211 "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2213 return PtrDiff(pTarget, chars);
2216 // During GetChars we had an invalid byte sequence
2217 // pSrc is backed up to the start of the bad sequence if we didn't have room to
2218 // fall it back. Otherwise pSrc remains where it is.
//   pSrc     - current input position (by ref so that it can be backed up on failure).
//   ch       - decoder state word describing the partial/invalid sequence; passed to GetBytesUnknown.
//   fallback - fallback buffer that receives the unknown bytes.
//   pTarget  - current output position (by ref; handed to fallback.InternalFallback).
// GetChars treats a false return as "ran out of output space" and calls ThrowCharsOverflow.
2219 [System.Security.SecurityCritical] // auto-generated
2220 private unsafe bool FallbackInvalidByteSequence(
2221 ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
// Work on a copy so that pSrc itself is only moved when the fallback fails
2224 byte *pStart = pSrc;
2225 byte[] bytesUnknown = GetBytesUnknown(ref pStart, ch);
2227 // Do the actual fallback
2228 if (!fallback.InternalFallback(bytesUnknown, pSrc, ref pTarget))
2230 // Oops, it failed, back up to pStart
2239 // During GetCharCount we had an invalid byte sequence
2240 // pSrc is used to find the index that points to the invalid bytes,
2241 // however the byte[] contains the fallback bytes (in case the index is -1)
//   pSrc     - current input position (by value; the caller's pointer is not changed).
//   ch       - decoder state word describing the partial/invalid sequence; passed to GetBytesUnknown.
//   fallback - fallback buffer that is asked how many chars the unknown bytes fall back to.
// GetCharCount adds the value returned by this method to its running char count.
2242 [System.Security.SecurityCritical] // auto-generated
2243 private unsafe int FallbackInvalidByteSequence(
2244 byte* pSrc, int ch, DecoderFallbackBuffer fallback)
2247 byte[] bytesUnknown = GetBytesUnknown(ref pSrc, ch);
2249 // Do the actual fallback
2250 int count = fallback.InternalFallback(bytesUnknown, pSrc);
2252 // # of fallback chars expected.
2253 // Note that we only get here for "long" sequences, and have already unreserved
2254 // the count that we prereserved for the input bytes
// GetBytesUnknown: rebuilds, from the decoder state word 'ch', the bytes of the invalid or
// incomplete sequence so that they can be handed to the decoder fallback.
2258 // Note that some of these bytes may have come from a previous fallback, so we cannot
2259 // just decrement the pointer and use the values we read. In those cases we have
2260 // to regenerate the original values.
//   pSrc - input position, passed by ref (NOTE(review): the statements that adjust it are not
//          part of this listing - presumably it is backed up by the number of bytes regenerated; confirm).
//   ch   - state word: data bits in the low bits, FinalByte / SupplimentarySeq / ThreeByteSeq flags above.
// Returns the regenerated byte sequence (1 to 3 bytes in the visible branches).
2261 [System.Security.SecurityCritical] // auto-generated
2262 private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
2265 byte[] bytesUnknown = null;
2267 // See if it was a plain char
2268 // (have to check >= 0 because we have all sorts of weird bit flags)
2269 if (ch < 0x100 && ch >= 0)
2272 bytesUnknown = new byte[] { unchecked((byte)ch) };
2274 // See if its an unfinished 2 byte sequence
2275 else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
// Rebuild the 110vvvvv lead byte from the 5 data bits
2278 bytesUnknown = new byte[] { unchecked((byte)((ch & 0x1F )| 0xc0)) };
2280 // So now we're either 2nd byte of 3 or 4 byte sequence or
2281 // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
2282 // 1st check if its a 4 byte sequence
2283 else if ((ch & SupplimentarySeq) != 0)
2285 // 3rd byte of 4 byte sequence?
2286 if ((ch & (FinalByte >> 6)) != 0)
2288 // 3 bytes of a 4 byte sequence were consumed: rebuild lead byte + 2 trail bytes
2290 bytesUnknown = new byte[] {
2291 unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
2292 unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
2293 unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2295 else if ((ch & (FinalByte >> 12)) != 0)
2297 // 2 bytes of a 4 byte sequence were consumed: rebuild lead byte + 1 trail byte
2299 bytesUnknown = new byte[] {
2300 unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
2301 unchecked((byte)(((ch) & 0x3F) | 0x80)) };
2305 // Only the lead byte (11110vvv) of a 4 byte sequence was consumed
2307 bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0))};
2312 // 2nd byte of 3 byte sequence?
2313 if ((ch & (FinalByte >> 6)) != 0)
2315 // 2 bytes of a 3 byte sequence were consumed: rebuild lead byte + 1 trail byte
2317 bytesUnknown = new byte[] {
2318 unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)), unchecked ((byte)(((ch) & 0x3F) | 0x80)) };
2322 // Only the lead byte (1110vvvv) of a 3 byte sequence was consumed
2324 bytesUnknown = new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0))};
2328 return bytesUnknown;
// Returns a new UTF8Decoder constructed over this encoding instance.
2332 public override Decoder GetDecoder() {
2333 return new UTF8Decoder(this);
// Returns a new UTF8Encoder constructed over this encoding instance.
2337 public override Encoder GetEncoder() {
2338 return new UTF8Encoder(this);
// Returns an upper bound on the number of bytes needed to encode 'charCount' chars.
// Throws ArgumentOutOfRangeException("charCount") when the argument check fails
// (NOTE(review): the guard condition is not part of this listing - presumably charCount < 0,
// judging by the NeedNonNegNum resource name) and when the computed bound exceeds 0x7fffffff.
2342 public override int GetMaxByteCount(int charCount)
2345 throw new ArgumentOutOfRangeException("charCount",
2346 Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
2347 Contract.EndContractBlock();
2349 // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
// Computed as a long so that the overflow check below can see values beyond the int range
2350 long byteCount = (long)charCount + 1;
2352 if (EncoderFallback.MaxCharCount > 1)
2353 byteCount *= EncoderFallback.MaxCharCount;
2355 // Max 3 bytes per char. (4 bytes per 2 chars for surrogates)
// NOTE(review): the statement applying this factor is not part of this listing.
2358 if (byteCount > 0x7fffffff)
2359 throw new ArgumentOutOfRangeException("charCount", Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow"));
2361 return (int)byteCount;
// Returns an upper bound on the number of chars produced by decoding 'byteCount' bytes.
// Throws ArgumentOutOfRangeException("byteCount") when the argument check fails
// (NOTE(review): the guard condition is not part of this listing - presumably byteCount < 0,
// judging by the NeedNonNegNum resource name) and when the computed bound exceeds 0x7fffffff.
2365 public override int GetMaxCharCount(int byteCount)
2368 throw new ArgumentOutOfRangeException("byteCount",
2369 Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
2370 Contract.EndContractBlock();
2372 // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
// Computed as a long so that the overflow check below can see values beyond the int range
2373 long charCount = ((long)byteCount + 1);
2375 // Non-shortest form would fall back, so get max count from fallback.
2376 // So would 11... followed by 11..., so you could fall back every byte
2377 if (DecoderFallback.MaxCharCount > 1)
2379 charCount *= DecoderFallback.MaxCharCount;
2382 if (charCount > 0x7fffffff)
2383 throw new ArgumentOutOfRangeException("byteCount", Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow"));
2385 return (int)charCount;
// Returns the UTF-8 identifier bytes (0xEF 0xBB 0xBF) when this encoding is
// configured to emit them, and the shared empty byte array otherwise.
public override byte[] GetPreamble()
{
    if (!emitUTF8Identifier)
        return EmptyArray<Byte>.Value;

    // A new array is allocated on every call so that callers cannot modify
    // a shared copy of the identifier.
    byte[] preamble = new byte[3] { 0xEF, 0xBB, 0xBF };
    return preamble;
}
// Value equality for UTF8Encoding: two instances are equal when they agree on
// whether the UTF-8 identifier is emitted and have equal encoder and decoder fallbacks.
//
// value: the object to compare with; null or any object that is not a
//        UTF8Encoding compares unequal.
// Returns true when value is an equivalent UTF8Encoding, false otherwise.
public override bool Equals(Object value)
{
    UTF8Encoding that = value as UTF8Encoding;

    // The cast yields null for a null argument or a different type; neither can be equal.
    if (that == null)
        return false;

    // Exception-throwing behavior needs no separate comparison: it is the same
    // as the encoder/decoder fallbacks being exception fallbacks.
    return (emitUTF8Identifier == that.emitUTF8Identifier) &&
           (EncoderFallback.Equals(that.EncoderFallback)) &&
           (DecoderFallback.Equals(that.DecoderFallback));
}
// Hash code built from the same members Equals compares: both fallbacks and the
// identifier flag, offset by the UTF-8 code page number.
public override int GetHashCode()
{
    // Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
    int fallbackHash = this.EncoderFallback.GetHashCode() + this.DecoderFallback.GetHashCode();
    int identifierBit = emitUTF8Identifier ? 1 : 0;
    return fallbackHash + UTF8_CODEPAGE + identifierBit;
}
// Encoder handed out by UTF8Encoding.GetEncoder(). The only state it adds on top of
// EncoderNLS is a high surrogate held over from a previous call.
[Serializable]
internal class UTF8Encoder : EncoderNLS, ISerializable
{
    // We must save a high surrogate value until the next call, looking
    // for a low surrogate value. Zero means nothing is pending.
    internal int surrogateChar;

    public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
    {
        // Nothing further to set up: surrogateChar defaults to 0 (no pending surrogate).
    }

    // Constructor called by serialization, have to handle deserializing from Everett
    internal UTF8Encoder(SerializationInfo info, StreamingContext context)
    {
        // Any info?
        if (info==null) throw new ArgumentNullException("info");
        Contract.EndContractBlock();

        // Get common info
        this.m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));

        // SurrogateChar happens to mean the same thing
        this.surrogateChar = (int)info.GetValue("surrogateChar", typeof(int));

        // The fallback entry may be absent from the serialized data; in that case
        // leave the fallback unset instead of failing deserialization.
        try
        {
            this.m_fallback = (EncoderFallback) info.GetValue("m_fallback", typeof(EncoderFallback));
        }
        catch (SerializationException)
        {
            this.m_fallback = null;
        }
    }

#if FEATURE_SERIALIZATION
    // ISerializable implementation, get data for this object
    [System.Security.SecurityCritical]  // auto-generated_required
    void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
    {
        // Any info?
        if (info==null) throw new ArgumentNullException("info");
        Contract.EndContractBlock();

        // Save Whidbey data
        // Just need Everett maxCharSize (BaseCodePageEncoding) or m_maxByteSize (MLangBaseCodePageEncoding)
        info.AddValue("encoding", this.m_encoding);
        info.AddValue("surrogateChar", this.surrogateChar);

        info.AddValue("m_fallback", this.m_fallback);

        // Extra stuff for Everett that Whidbey doesn't use
        info.AddValue("storedSurrogate", this.surrogateChar > 0);
        info.AddValue("mustFlush", false);  // Everett doesn't actually use this either, but it accidentally serialized it!
    }
#endif

    // Discards any pending high surrogate and resets the fallback buffer if one exists.
    public override void Reset()
    {
        this.surrogateChar = 0;
        if (m_fallbackBuffer != null)
            m_fallbackBuffer.Reset();
    }

    // Anything left in our encoder?
    internal override bool HasState
    {
        get
        {
            return (this.surrogateChar != 0);
        }
    }
}
// Decoder handed out by UTF8Encoding.GetDecoder(). The only state it adds on top of
// DecoderNLS is the bits field carried between calls.
[Serializable]
internal class UTF8Decoder : DecoderNLS, ISerializable
{
    // We'll need to remember the previous information. See the comments around definition
    // of FinalByte for details. Zero means no partial sequence is pending.
    internal int bits;

    public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
    {
        // Nothing further to set up: bits defaults to 0 (no pending state).
    }

    // Constructor called by serialization, have to handle deserializing from Everett
    internal UTF8Decoder(SerializationInfo info, StreamingContext context)
    {
        // Any info?
        if (info==null) throw new ArgumentNullException("info");
        Contract.EndContractBlock();

        // Get common info
        this.m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));

        try
        {
            // Get whidbey version of bits
            this.bits = (int)info.GetValue("wbits", typeof(int));
            this.m_fallback = (DecoderFallback) info.GetValue("m_fallback", typeof(DecoderFallback));
        }
        catch (SerializationException)
        {
            // Everett calls bits bits instead of wbits, so this is Everett.
            // Start from a clean state with no pending bits and no fallback.
            this.bits = 0;
            this.m_fallback = null;
        }
    }

#if FEATURE_SERIALIZATION
    // ISerializable implementation, get data for this object
    [System.Security.SecurityCritical]  // auto-generated_required
    void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
    {
        // Any info?
        if (info==null) throw new ArgumentNullException("info");
        Contract.EndContractBlock();

        // Save new Whidbey data
        info.AddValue("encoding", this.m_encoding);
        info.AddValue("wbits", this.bits);          // Special whidbey bits name
        info.AddValue("m_fallback", this.m_fallback);

        // Everett has extra stuff, we set it all to 0 in case this deserializes in Everett
        info.AddValue("bits", (int)0);
        info.AddValue("trailCount", (int)0);
        info.AddValue("isSurrogate", false);
        info.AddValue("byteSequence", (int)0);
    }
#endif

    // Discards any pending bits and resets the fallback buffer if one exists, so that
    // HasState reports false afterwards.
    public override void Reset()
    {
        this.bits = 0;
        if (m_fallbackBuffer != null)
            m_fallbackBuffer.Reset();
    }

    // Anything left in our decoder?
    internal override bool HasState
    {
        get
        {
            return (this.bits != 0);
        }
    }
}