mcs/class/referencesource/mscorlib/system/text/utf8encoding.cs

   1 // ==++==
   2 //
   3 //   Copyright (c) Microsoft Corporation.  All rights reserved.
   4 //
   5 // ==--==
   6
   7 // The worker functions in this file were optimized for performance. If you make changes,
   8 // take care to consider all of the interesting cases.
   9
  10 // The code of all worker functions in this file is written twice: once as a slow loop, and a
  11 // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
  12 // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
  13 // processing multiple characters at a time, and falling back to the slow loop for all special cases.
  14
  15 // This define can be used to turn off the fast loops. Useful for finding whether
  16 // the problem is fastloop-specific.
  17 #define FASTLOOP
  18
  19 namespace System.Text
  20 {
  21     using System;
  22     using System.Globalization;
  23     using System.Runtime.Serialization;
  24     using System.Security.Permissions;
  25     using System.Diagnostics.Contracts;
  26
  27     // Encodes text into and out of UTF-8.  UTF-8 is a way of writing
  28     // Unicode characters with variable numbers of bytes per character,
  29     // optimized for the 128 ASCII characters (one byte each).  It's an efficient way
  30     // of encoding US English in an internationalizable way.
  31     //
  32     // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  33     //
  34     // The UTF-8 byte order mark is simply the Unicode byte order mark
  35     // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF).  The byte order mark is
  36     // used mostly to distinguish UTF-8 text from other encodings, and doesn't
  37     // switch the byte orderings.
  38
  39     [Serializable]
  40 [System.Runtime.InteropServices.ComVisible(true)]
  41     public class UTF8Encoding : Encoding
  42     {
  43         /*
  44             bytes   bits    UTF-8 representation
  45             -----   ----    -----------------------------------
  46             1        7      0vvvvvvv
  47             2       11      110vvvvv 10vvvvvv
  48             3       16      1110vvvv 10vvvvvv 10vvvvvv
  49             4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  50             -----   ----    -----------------------------------
  51
  52             Surrogate:
  53             Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
  54          */
  55
  56         private const int UTF8_CODEPAGE=65001;
  57
  58         // Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into
  59         // the standard.
  60         private bool emitUTF8Identifier = false;
  61
  62         private bool isThrowException = false;
  63
  64
  65         public UTF8Encoding()
                 : this(false)   // encoderShouldEmitUTF8Identifier = false
  66         {
                 // Intentionally empty: all initialization is done by the constructor chained to above.
  67         }
  68
  69
  70         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
  71             : this(encoderShouldEmitUTF8Identifier, false)   // throwOnInvalidBytes = false
  72         {
                 // Intentionally empty: the two-argument constructor does the work.
  73         }
  74
  75
  76         public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
  77             : base(UTF8_CODEPAGE)
  78         {
  79             // Remember both options for the lifetime of this instance.
  80             emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
  81             isThrowException = throwOnInvalidBytes;
  82
  83             // Encoding's constructor has already chosen the fallbacks, but that happened before
  84             // isThrowException was assigned above, so redo it when exceptions were requested.
  85             if (throwOnInvalidBytes)
                 {
                     SetDefaultFallbacks();
                 }
             }
  86
  87         internal override void SetDefaultFallbacks()
  88         {
  89             // Default mode: data that cannot be converted is replaced with U+FFFD.
  90             if (!isThrowException)
  91             {
  92                 encoderFallback = new EncoderReplacementFallback("\xFFFD");
  93                 decoderFallback = new DecoderReplacementFallback("\xFFFD");
  94                 return;
  95             }
  96
  97             // Throwing mode: use the exception fallbacks for both encoding and decoding.
  98             encoderFallback = EncoderFallback.ExceptionFallback;
  99             decoderFallback = DecoderFallback.ExceptionFallback;
 100         }
 101
 102
 103         //
 104         // WARNING: GetByteCount(string chars)
 105         // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
 106         // WARNING: otherwise it'll break VB's way of declaring these.
 107         //
 108         // The following methods are copied from EncodingNLS.cs.
 109         // Unfortunately EncodingNLS.cs is internal and we're public, so we have to reimplement them here.
 110         // These should be kept in sync for the following classes:
 111         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 112         //
 113
 114         // Returns the number of bytes required to encode a range of characters in
 115         // a character array.
             //
             // chars is the source array; index and count select the range to measure.
             // Throws ArgumentNullException when chars is null, and ArgumentOutOfRangeException
             // when index or count is negative (index is reported first) or when the range
             // does not fit inside chars.
 116         //
 117         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 118         // So if you fix this, fix the others.  Currently those include:
 119         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 120         // parent method is safe
 121
 122         [System.Security.SecuritySafeCritical]  // auto-generated
 123         public override unsafe int GetByteCount(char[] chars, int index, int count)
 124         {
 125             // Validate input parameters
 126             if (chars == null)
 127                 throw new ArgumentNullException("chars",
 128                       Environment.GetResourceString("ArgumentNull_Array"));
 129
 130             if (index < 0 || count < 0)
 131                 throw new ArgumentOutOfRangeException((index<0 ? "index" : "count"),
 132                       Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 133
                 // Written as a subtraction so that index + count can never overflow;
                 // both values are already known to be non-negative here.
 134             if (chars.Length - index < count)
 135                 throw new ArgumentOutOfRangeException("chars",
 136                       Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
 137             Contract.EndContractBlock();
 138
 139             // If no input, return 0, avoid fixed empty array problem
                 // (fixed on a zero-length array produces a null pointer).
 140             if (chars.Length == 0)
 141                 return 0;
 142
 143             // Just call the pointer version
                 // The null encoder means no state is carried over from a previous call.
 144             fixed (char* pChars = chars)
 145                 return GetByteCount(pChars + index, count, null);
 146         }
 147
 148         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 149         // So if you fix this, fix the others.  Currently those include:
 150         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 151         // parent method is safe
             //
             // Returns the number of bytes required to encode every character of the string chars.
             // Throws ArgumentNullException (with ParamName "chars") when chars is null.
 152
 153         [System.Security.SecuritySafeCritical]  // auto-generated
 154         public override unsafe int GetByteCount(String chars)
 155         {
 156             // Validate input
                 // The exception has to name the real parameter, which is "chars" in this class.
 157             if (chars==null)
 158                 throw new ArgumentNullException("chars");
 159             Contract.EndContractBlock();
 160
                 // Pin the string and count through the pointer-based worker.  The null encoder
                 // means no state is carried over from a previous call.
 161             fixed (char* pChars = chars)
 162                 return GetByteCount(pChars, chars.Length, null);
 163         }
 164
 165         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 166         // So if you fix this, fix the others.  Currently those include:
 167         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
             //
             // Pointer overload: returns the number of bytes needed to encode count characters
             // starting at chars.  Throws ArgumentNullException when chars is null and
             // ArgumentOutOfRangeException when count is negative.
 168
 169         [System.Security.SecurityCritical]  // auto-generated
 170         [CLSCompliant(false)]
 171         [System.Runtime.InteropServices.ComVisible(false)]
 172         public override unsafe int GetByteCount(char* chars, int count)
 173         {
 174             // Validate Parameters
 175             if (chars == null)
 176                 throw new ArgumentNullException("chars",
 177                     Environment.GetResourceString("ArgumentNull_Array"));
 178
 179             if (count < 0)
 180                 throw new ArgumentOutOfRangeException("count",
 181                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 182             Contract.EndContractBlock();
 183
 184             // Call the worker with no encoder (null), so no state from an earlier call is used
 185             return GetByteCount(chars, count, null);
 186         }
 187
 188         // Parent method is safe.
 189         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 190         // So if you fix this, fix the others.  Currently those include:
 191         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
             //
             // Encodes charCount characters of s, starting at charIndex, into bytes starting at
             // byteIndex, and returns the result of the pointer-based worker.
             // Throws ArgumentNullException when s or bytes is null (s is reported first), and
             // ArgumentOutOfRangeException when charIndex or charCount is negative, when the
             // character range does not fit inside s, or when byteIndex lies outside bytes.
 192
 193         [System.Security.SecuritySafeCritical]  // auto-generated
 194         public override unsafe int GetBytes(String s, int charIndex, int charCount,
 195                                               byte[] bytes, int byteIndex)
 196         {
 197             if (s == null || bytes == null)
 198                 throw new ArgumentNullException((s == null ? "s" : "bytes"),
 199                       Environment.GetResourceString("ArgumentNull_Array"));
 200
 201             if (charIndex < 0 || charCount < 0)
 202                 throw new ArgumentOutOfRangeException((charIndex<0 ? "charIndex" : "charCount"),
 203                       Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 204
                 // Subtraction form: cannot overflow because both values are non-negative here.
 205             if (s.Length - charIndex < charCount)
 206                 throw new ArgumentOutOfRangeException("s",
 207                       Environment.GetResourceString("ArgumentOutOfRange_IndexCount"));
 208
                 // byteIndex == bytes.Length is allowed; it leaves zero bytes of room.
 209             if (byteIndex < 0 || byteIndex > bytes.Length)
 210                 throw new ArgumentOutOfRangeException("byteIndex",
 211                     Environment.GetResourceString("ArgumentOutOfRange_Index"));
 212             Contract.EndContractBlock();
 213
                 // Room left in the caller's array.  Computed before bytes may be swapped for a
                 // dummy array below, so an empty destination still reports 0 bytes available.
 214             int byteCount = bytes.Length - byteIndex;
 215
 216             // Fixed doesn't like 0 length arrays.
                 // Substitute a one-element array so that the pinned pointer is not null.
 217             if (bytes.Length == 0)
 218                 bytes = new byte[1];
 219
 220             fixed (char* pChars = s)
 221                 fixed ( byte* pBytes = bytes)
 222                     return GetBytes(pChars + charIndex, charCount,
 223                                     pBytes + byteIndex, byteCount, null);
 224         }
 225
 226         // Encodes a range of characters in a character array into a range of bytes
 227         // in a byte array. An exception occurs if the byte array is not large
 228         // enough to hold the complete encoding of the characters. The
 229         // GetByteCount method can be used to determine the exact number of
 230         // bytes that will be produced for a given range of characters.
 231         // Alternatively, the GetMaxByteCount method can be used to
 232         // determine the maximum number of bytes that will be produced for a given
 233         // number of characters, regardless of the actual character values.
 234         //
             // Argument checks performed here: ArgumentNullException when chars or bytes is null
             // (chars is reported first); ArgumentOutOfRangeException when charIndex or charCount
             // is negative, when the character range does not fit inside chars, or when byteIndex
             // lies outside bytes.
             //
 235         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 236         // So if you fix this, fix the others.  Currently those include:
 237         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 238         // parent method is safe
 239
 240         [System.Security.SecuritySafeCritical]  // auto-generated
 241         public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
 242                                                byte[] bytes, int byteIndex)
 243         {
 244             // Validate parameters
 245             if (chars == null || bytes == null)
 246                 throw new ArgumentNullException((chars == null ? "chars" : "bytes"),
 247                       Environment.GetResourceString("ArgumentNull_Array"));
 248
 249             if (charIndex < 0 || charCount < 0)
 250                 throw new ArgumentOutOfRangeException((charIndex<0 ? "charIndex" : "charCount"),
 251                       Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 252
                 // Subtraction form: cannot overflow because both values are non-negative here.
 253             if (chars.Length - charIndex < charCount)
 254                 throw new ArgumentOutOfRangeException("chars",
 255                       Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
 256
                 // byteIndex == bytes.Length is allowed; it leaves zero bytes of room.
 257             if (byteIndex < 0 || byteIndex > bytes.Length)
 258                 throw new ArgumentOutOfRangeException("byteIndex",
 259                      Environment.GetResourceString("ArgumentOutOfRange_Index"));
 260             Contract.EndContractBlock();
 261
 262             // If nothing to encode return 0, avoid fixed problem
                 // (fixed on a zero-length array produces a null pointer).
 263             if (chars.Length == 0)
 264                 return 0;
 265
 266             // Just call pointer version
                 // byteCount is computed before bytes may be replaced by the dummy array below.
 267             int byteCount = bytes.Length - byteIndex;
 268
 269             // Fixed doesn't like 0 length arrays.
                 // Substitute a one-element array so that the pinned pointer is not null.
 270             if (bytes.Length == 0)
 271                 bytes = new byte[1];
 272
 273             fixed (char* pChars = chars)
 274                 fixed (byte* pBytes = bytes)
 275                     // Remember that byteCount is the room available from byteIndex on, not the size of the array.
 276                     return GetBytes(pChars + charIndex, charCount,
 277                                     pBytes + byteIndex, byteCount, null);
 278         }
 279
 280         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 281         // So if you fix this, fix the others.  Currently those include:
 282         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
             //
             // Pointer overload: encodes charCount characters at chars into the buffer at bytes,
             // which has room for byteCount bytes, and returns the result of the worker.
             // Throws ArgumentNullException when bytes or chars is null (bytes is reported first)
             // and ArgumentOutOfRangeException when charCount or byteCount is negative
             // (charCount is reported first).
 283
 284         [System.Security.SecurityCritical]  // auto-generated
 285         [CLSCompliant(false)]
 286         [System.Runtime.InteropServices.ComVisible(false)]
 287         public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
 288         {
 289             // Validate Parameters
 290             if (bytes == null || chars == null)
 291                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
 292                     Environment.GetResourceString("ArgumentNull_Array"));
 293
 294             if (charCount < 0 || byteCount < 0)
 295                 throw new ArgumentOutOfRangeException((charCount<0 ? "charCount" : "byteCount"),
 296                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 297             Contract.EndContractBlock();
 298
                 // Forward to the worker with no encoder (null).
 299             return GetBytes(chars, charCount, bytes, byteCount, null);
 300         }
 301
 302         // Returns the number of characters produced by decoding a range of bytes
 303         // in a byte array.
             //
             // bytes is the source array; index and count select the range to measure.
             // Throws ArgumentNullException when bytes is null, and ArgumentOutOfRangeException
             // when index or count is negative (index is reported first) or when the range
             // does not fit inside bytes.
 304         //
 305         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 306         // So if you fix this, fix the others.  Currently those include:
 307         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 308         // parent method is safe
 309
 310         [System.Security.SecuritySafeCritical]  // auto-generated
 311         public override unsafe int GetCharCount(byte[] bytes, int index, int count)
 312         {
 313             // Validate Parameters
 314             if (bytes == null)
 315                 throw new ArgumentNullException("bytes",
 316                     Environment.GetResourceString("ArgumentNull_Array"));
 317
 318             if (index < 0 || count < 0)
 319                 throw new ArgumentOutOfRangeException((index<0 ? "index" : "count"),
 320                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 321
                 // Subtraction form: cannot overflow because both values are non-negative here.
 322             if (bytes.Length - index < count)
 323                 throw new ArgumentOutOfRangeException("bytes",
 324                     Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
 325             Contract.EndContractBlock();
 326
 327             // If no input just return 0, fixed doesn't like 0 length arrays.
 328             if (bytes.Length == 0)
 329                 return 0;
 330
 331             // Just call pointer version
                 // The decoder argument is null for this stateless call.
 332             fixed (byte* pBytes = bytes)
 333                 return GetCharCount(pBytes + index, count, null);
 334         }
 335
 336         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 337         // So if you fix this, fix the others.  Currently those include:
 338         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
             //
             // Pointer overload: returns the number of characters produced by decoding count bytes
             // starting at bytes.  Throws ArgumentNullException when bytes is null and
             // ArgumentOutOfRangeException when count is negative.
 339
 340         [System.Security.SecurityCritical]  // auto-generated
 341         [CLSCompliant(false)]
 342         [System.Runtime.InteropServices.ComVisible(false)]
 343         public override unsafe int GetCharCount(byte* bytes, int count)
 344         {
 345             // Validate Parameters
 346             if (bytes == null)
 347                 throw new ArgumentNullException("bytes",
 348                     Environment.GetResourceString("ArgumentNull_Array"));
 349
 350             if (count < 0)
 351                 throw new ArgumentOutOfRangeException("count",
 352                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 353             Contract.EndContractBlock();
 354
                 // Forward to the worker; the decoder argument is null for this stateless call.
 355             return GetCharCount(bytes, count, null);
 356         }
 357
 358         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 359         // So if you fix this, fix the others.  Currently those include:
 360         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 361         // parent method is safe
             //
             // Decodes byteCount bytes of bytes, starting at byteIndex, into chars starting at
             // charIndex, and returns the result of the pointer-based worker.
             // Throws ArgumentNullException when bytes or chars is null (bytes is reported first),
             // and ArgumentOutOfRangeException when byteIndex or byteCount is negative, when the
             // byte range does not fit inside bytes, or when charIndex lies outside chars.
 362
 363         [System.Security.SecuritySafeCritical]  // auto-generated
 364         public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
 365                                               char[] chars, int charIndex)
 366         {
 367             // Validate Parameters
 368             if (bytes == null || chars == null)
 369                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
 370                     Environment.GetResourceString("ArgumentNull_Array"));
 371
 372             if (byteIndex < 0 || byteCount < 0)
 373                 throw new ArgumentOutOfRangeException((byteIndex<0 ? "byteIndex" : "byteCount"),
 374                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 375
                 // Subtraction form: cannot overflow because both values are non-negative here.
 376             if ( bytes.Length - byteIndex < byteCount)
 377                 throw new ArgumentOutOfRangeException("bytes",
 378                     Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
 379
                 // charIndex == chars.Length is allowed; it leaves zero chars of room.
 380             if (charIndex < 0 || charIndex > chars.Length)
 381                 throw new ArgumentOutOfRangeException("charIndex",
 382                     Environment.GetResourceString("ArgumentOutOfRange_Index"));
 383             Contract.EndContractBlock();
 384
 385             // If no input, return 0 & avoid fixed problem
                 // (fixed on a zero-length array produces a null pointer).
 386             if (bytes.Length == 0)
 387                 return 0;
 388
 389             // Just call pointer version
                 // charCount is computed before chars may be replaced by the dummy array below.
 390             int charCount = chars.Length - charIndex;
 391
 392             // Fixed doesn't like 0 length arrays.
                 // Substitute a one-element array so that the pinned pointer is not null.
 393             if (chars.Length == 0)
 394                 chars = new char[1];
 395
 396             fixed (byte* pBytes = bytes)
 397                 fixed (char* pChars = chars)
 398                     // Remember that charCount is the room available from charIndex on, not the size of the array
 399                     return GetChars(pBytes + byteIndex, byteCount,
 400                                     pChars + charIndex, charCount, null);
 401         }
 402
 403         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 404         // So if you fix this, fix the others.  Currently those include:
 405         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
             //
             // Pointer overload: decodes byteCount bytes at bytes into the buffer at chars, which
             // has room for charCount characters, and returns the result of the worker.
             // Throws ArgumentNullException when bytes or chars is null (bytes is reported first)
             // and ArgumentOutOfRangeException when charCount or byteCount is negative
             // (charCount is reported first).
 406
 407         [System.Security.SecurityCritical]  // auto-generated
 408         [CLSCompliant(false)]
 409         [System.Runtime.InteropServices.ComVisible(false)]
 410         public unsafe override int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
 411         {
 412             // Validate Parameters
 413             if (bytes == null || chars == null)
 414                 throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
 415                     Environment.GetResourceString("ArgumentNull_Array"));
 416
 417             if (charCount < 0 || byteCount < 0)
 418                 throw new ArgumentOutOfRangeException((charCount<0 ? "charCount" : "byteCount"),
 419                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 420             Contract.EndContractBlock();
 421
                 // Forward to the worker; the decoder argument is null for this stateless call.
 422             return GetChars(bytes, byteCount, chars, charCount, null);
 423         }
 424
 425         // Returns a string containing the decoded representation of a range of
 426         // bytes in a byte array.
             //
             // bytes is the source array; index and count select the range to decode.
             // Throws ArgumentNullException when bytes is null, and ArgumentOutOfRangeException
             // when index or count is negative (index is reported first) or when the range
             // does not fit inside bytes.
 427         //
 428         // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
 429         // So if you fix this, fix the others.  Currently those include:
 430         // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
 431         // parent method is safe
 432
 433         [System.Security.SecuritySafeCritical]  // auto-generated
 434         [System.Runtime.InteropServices.ComVisible(false)]
 435         public override unsafe String GetString(byte[] bytes, int index, int count)
 436         {
 437             // Validate Parameters
 438             if (bytes == null)
 439                 throw new ArgumentNullException("bytes",
 440                     Environment.GetResourceString("ArgumentNull_Array"));
 441
 442             if (index < 0 || count < 0)
 443                 throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"),
 444                     Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
 445
                 // Subtraction form: cannot overflow because both values are non-negative here.
 446             if (bytes.Length - index < count)
 447                 throw new ArgumentOutOfRangeException("bytes",
 448                     Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
 449             Contract.EndContractBlock();
 450
 451             // Avoid problems with empty input buffer
 452             if (bytes.Length == 0) return String.Empty;
 453
                 // Pin the array and let String build the result from the selected range,
                 // passing this encoding as the one to use.
 454             fixed (byte* pBytes = bytes)
 455                 return String.CreateStringFromEncoding(
 456                     pBytes + index, count, this);
 457         }
 458
 459         //
 460         // End of standard methods copied from EncodingNLS.cs
 461         //
 462
 463         // To simplify maintenance, the structure of GetByteCount and GetBytes should be
 464         // kept the same as much as possible
 465         [System.Security.SecurityCritical]  // auto-generated
 466         internal override unsafe int GetByteCount(char *chars, int count, EncoderNLS baseEncoder)
 467         {
 468             // For fallback we may need a fallback buffer.
 469             // We wait to initialize it though in case we don't have any broken input unicode
 470             EncoderFallbackBuffer fallbackBuffer = null;
 471             char *pSrc = chars;
 472             char *pEnd = pSrc+count;
 473
 474             // Start by assuming we have as many as count
 475             int byteCount = count;
 476
 477             int ch = 0;
 478
 479             if (baseEncoder != null) {
 480                 UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
 481                 ch = encoder.surrogateChar;
 482
 483                 // We mustn't have left over fallback data when counting
 484                 if (encoder.InternalHasFallbackBuffer)
 485                 {
 486                     fallbackBuffer = encoder.FallbackBuffer;
 487                     if (fallbackBuffer.Remaining > 0)
 488                         throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty",
 489                         this.EncodingName, encoder.Fallback.GetType()));
 490
 491                     // Set our internal fallback interesting things.
 492                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
 493                 }
 494             }
 495
 496             for (;;) {
 497                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
 498                 if (pSrc >= pEnd) {
 499
 500                     if (ch == 0) {
 501                         // Unroll any fallback that happens at the end
 502                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
 503                         if (ch > 0) {
 504                             byteCount++;
 505                             goto ProcessChar;
 506                         }
 507                     } else {
 508                         // Case of surrogates in the fallback.
 509                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack) {
 510                             Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 511                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 512
 513                             ch = fallbackBuffer.InternalGetNextChar();
 514                             byteCount++;
 515
 516                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
 517                                 ch = 0xfffd;
 518                                 byteCount++;
 519                                 goto EncodeChar;
 520                             } else if (ch > 0){
 521                                 goto ProcessChar;
 522                             } else {
 523                                 byteCount--; // ignore last one.
 524                                 break;
 525                             }
 526                         }
 527                     }
 528
 529                     if (ch <= 0) {
 530                         break;
 531                     }
 532                     if (baseEncoder != null && !baseEncoder.MustFlush) {
 533                         break;
 534                     }
 535
 536                     // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
 537                     byteCount++;
 538                     goto EncodeChar;
 539                 }
 540
 541                 if (ch > 0) {
 542                     Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 543                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 544
 545                     // use separate helper variables for local contexts so that the jit optimizations
 546                     // won't get confused about the variable lifetimes
 547                     int cha = *pSrc;
 548
 549                     // count the pending surrogate
 550                     byteCount++;
 551
 552                     // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
 553                     // if (IsLowSurrogate(cha)) {
 554                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
 555                         // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
 556                         ch = 0xfffd;
 557 //                        ch = cha + (ch << 10) +
 558 //                            (0x10000
 559 //                            - CharUnicodeInfo.LOW_SURROGATE_START
 560 //                            - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
 561
 562                         // Use this next char
 563                         pSrc++;
 564                     }
 565                     // else ch is still high surrogate and encoding will fail (so don't add count)
 566
 567                     // attempt to encode the surrogate or partial surrogate
 568                     goto EncodeChar;
 569                 }
 570
 571                 // If we've used a fallback, then we have to check for it
 572                 if (fallbackBuffer != null)
 573                 {
 574                     ch = fallbackBuffer.InternalGetNextChar();
 575                     if (ch > 0)
 576                     {
 577                         // We have an extra byte we weren't expecting.
 578                         byteCount++;
 579                         goto ProcessChar;
 580                     }
 581                 }
 582
 583                 // read next char. The JIT optimization seems to be getting confused when
 584                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
 585                 ch = *pSrc;
 586                 pSrc++;
 587
 588             ProcessChar:
 589                 // if (IsHighSurrogate(ch)) {
 590                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) {
 591                     // we will count this surrogate next time around
 592                     byteCount--;
 593                     continue;
 594                 }
 595                 // either good char or partial surrogate
 596
 597             EncodeChar:
 598                 // throw exception on partial surrogate if necessary
 599                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 600                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 601                 {
 602                     // Lone surrogates aren't allowed
 603                     // Have to make a fallback buffer if we don't have one
 604                     if (fallbackBuffer == null)
 605                     {
 606                         // wait on fallbacks if we can
 607                         // For fallback we may need a fallback buffer
 608                         if (baseEncoder == null)
 609                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
 610                         else
 611                             fallbackBuffer = baseEncoder.FallbackBuffer;
 612
 613                         // Set our internal fallback interesting things.
 614                         fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
 615                     }
 616
 617                     // Do our fallback.  Actually we already know its a mixed up surrogate,
 618                     // so the ref pSrc isn't gonna do anything.
 619                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrc);
 620
 621                     // Ignore it if we don't throw (we had preallocated this ch)
 622                     byteCount--;
 623                     ch = 0;
 624                     continue;
 625                 }
 626
 627                 // Count them
 628                 if (ch > 0x7F) {
 629                     if (ch > 0x7FF) {
 630                         // the extra surrogate byte was compensated by the second surrogate character
 631                         // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
 632                         byteCount++;
 633                     }
 634                     byteCount++;
 635                 }
 636
 637 #if WIN64
 638                 // check for overflow
 639                 if (byteCount < 0) {
 640                     break;
 641                 }
 642 #endif
 643
 644 #if FASTLOOP
 645                 // If still have fallback don't do fast loop
 646                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
 647                 {
 648                     // We're reserving 1 byte for each char by default
 649                     byteCount++;
 650                     goto ProcessChar;
 651                 }
 652
 653                 int availableChars = PtrDiff(pEnd, pSrc);
 654
 655                 // don't fall into the fast decoding loop if we don't have enough characters
 656                 if (availableChars <= 13) {
 657                     // try to get over the remainder of the ascii characters fast though
 658                     char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
 659                     while (pSrc < pLocalEnd) {
 660                         ch = *pSrc;
 661                         pSrc++;
 662                         if (ch > 0x7F)
 663                             goto ProcessChar;
 664                     }
 665
 666                     // we are done
 667                     break;
 668                 }
 669
 670 #if WIN64
 671                 // make sure that we won't get a silent overflow inside the fast loop
 672                 // (Fall out to slow loop if we have this many characters)
 673                 availableChars &= 0x0FFFFFFF;
 674 #endif
 675
 676                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
 677                 //  the boundary will be decreased for every non-ASCII character we encounter
 678                 // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
 679                 char *pStop = pSrc + availableChars - (3 + 4);
 680
 681                 while (pSrc < pStop) {
 682                     ch = *pSrc;
 683                     pSrc++;
 684
 685                     if (ch > 0x7F)                                                  // Not ASCII
 686                     {
 687                         if (ch > 0x7FF)                                             // Not 2 Byte
 688                         {
 689                             if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
 690                                 goto LongCode;
 691                             byteCount++;
 692                         }
 693                         byteCount ++;
 694                     }
 695
 696                     // get pSrc aligned
 697                     if ((unchecked((int)pSrc) & 0x2) != 0) {
 698                         ch = *pSrc;
 699                         pSrc++;
 700                         if (ch > 0x7F)                                              // Not ASCII
 701                         {
 702                             if (ch > 0x7FF)                                         // Not 2 Byte
 703                             {
 704                                 if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
 705                                     goto LongCode;
 706                                 byteCount++;
 707                             }
 708                             byteCount ++;
 709                         }
 710                     }
 711
 712                     // Run 2 * 4 characters at a time!
 713                     while (pSrc < pStop) {
 714                         ch = *(int*)pSrc;
 715                         int chc = *(int*)(pSrc+2);
 716                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
 717                         {
 718                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
 719                             {
 720                                 goto LongCodeWithMask;
 721                             }
 722
 723
 724                             if ((ch & unchecked((int)0xFF800000)) != 0)             // Actually 0x07800780 is all we care about (4 bits)
 725                                 byteCount++;
 726                             if ((ch & unchecked((int)0xFF80)) != 0)
 727                                 byteCount++;
 728                             if ((chc & unchecked((int)0xFF800000)) != 0)
 729                                 byteCount++;
 730                             if ((chc & unchecked((int)0xFF80)) != 0)
 731                                 byteCount++;
 732                         }
 733                         pSrc += 4;
 734
 735                         ch = *(int*)pSrc;
 736                         chc = *(int*)(pSrc+2);
 737                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)         // See if not ASCII
 738                         {
 739                             if (((ch | chc) & unchecked((int)0xF800F800)) != 0)     // See if not 2 Byte
 740                             {
 741                                 goto LongCodeWithMask;
 742                             }
 743
 744                             if ((ch & unchecked((int)0xFF800000)) != 0)
 745                                 byteCount++;
 746                             if ((ch & unchecked((int)0xFF80)) != 0)
 747                                 byteCount++;
 748                             if ((chc & unchecked((int)0xFF800000)) != 0)
 749                                 byteCount++;
 750                             if ((chc & unchecked((int)0xFF80)) != 0)
 751                                 byteCount++;
 752                         }
 753                         pSrc += 4;
 754                     }
 755                     break;
 756
 757                 LongCodeWithMask:
 758 if (!BitConverter.IsLittleEndian) {
 759                     // be careful about the sign extension
 760                     ch = (int)(((uint)ch) >> 16);
 761 } else {
 762                     ch = (char)ch;
 763 }
 764                     pSrc++;
 765
 766                     if (ch <= 0x7F) {
 767                         continue;
 768                     }
 769
 770                 LongCode:
 771                     // use separate helper variables for slow and fast loop so that the jit optimizations
 772                     // won't get confused about the variable lifetimes
 773                     if (ch > 0x7FF) {
 774                         // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 775                         if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
 776                             // 4 byte encoding - high surrogate + low surrogate
 777
 778                             int chd = *pSrc;
 779                             if (
 780                                 // !IsHighSurrogate(ch) // low without high -> bad
 781                                 ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
 782                                 // !IsLowSurrogate(chd) // high not followed by low -> bad
 783                                 !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END) )
 784                             {
 785                                 // Back up and drop out to slow loop to figure out error
 786                                 pSrc--;
 787                                 break;
 788                             }
 789                             pSrc++;
 790
 791                             // byteCount - this byte is compensated by the second surrogate character
 792                         }
 793                         byteCount++;
 794                     }
 795                     byteCount++;
 796
 797                     // byteCount - the last byte is already included
 798                 }
 799 #endif // FASTLOOP
 800
 801                 // no pending char at this point
 802                 ch = 0;
 803             }
 804
 805 #if WIN64
 806             // check for overflow
 807             if (byteCount < 0) {
 808                 throw new ArgumentException(
 809                         Environment.GetResourceString("Argument_ConversionOverflow"));
 810             }
 811 #endif
 812
 813             Contract.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
 814                 "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
 815
 816             return byteCount;
 817         }
 818
 819         // Returns how far pointer a lies beyond pointer b, measured in chars. The distance is
 820         // computed with unsigned arithmetic, which is good enough for us and tends to generate
 821         // better code than the signed arithmetic generated by default.
 822         [System.Security.SecurityCritical]  // auto-generated
 823         unsafe private static int PtrDiff(char *a, char* b)
 824         {
                 // Measure in bytes first, keeping only the low 32 bits of the difference...
 825             uint distanceInBytes = (uint)((byte*)a - (byte*)b);
                 // ...then halve it, since every char occupies two bytes.
                 return (int)(distanceInBytes >> 1);
 826         }
 827
 828         // byte* flavor, provided just for parity with the char* overload: returns how far
             // pointer a lies beyond pointer b, measured in bytes and truncated to int.
 829         [System.Security.SecurityCritical]  // auto-generated
 830         unsafe private static int PtrDiff(byte* a, byte* b)
 831         {
 832             long distanceInBytes = a - b;
                 return (int)distanceInBytes;
 833         }
 834
 835         private static bool InRange(int ch, int start, int end)
 836         {
                 // Tests start <= ch <= end with a single comparison: when ch is below start the
                 // subtraction goes negative, and the unsigned reinterpretation turns that into a
                 // value larger than any valid range width.
 837             uint offsetFromStart = (uint)(ch - start);
                 uint rangeWidth = (uint)(end - start);
                 return offsetFromStart <= rangeWidth;
 838         }
 839
 840         // Our workhorse
 841         // Note:  We ignore mismatched surrogates, unless the exception flag is set in which case we throw
             //
             // Encodes the UTF-16 chars in [chars, chars + charCount) as UTF-8 bytes written to
             // [bytes, bytes + byteCount) and returns the number of bytes written (pTarget - bytes).
             //
             //   chars, charCount - source buffer and its length in chars
             //   bytes, byteCount - destination buffer and its capacity in bytes
             //   baseEncoder      - optional encoder state (cast to UTF8Encoder). When supplied, its
             //                      surrogateChar seeds the pending high surrogate on entry, and its
             //                      surrogateChar / m_charsUsed are updated before returning.
             //
             // The body is an outer "slow" loop that handles every special case (pending surrogates,
             // fallback characters, running out of output room) plus, under FASTLOOP, an inner "fast"
             // loop that copies ASCII up to four chars at a time and drops back to the slow loop
             // for anything it cannot finish itself.
 842         [System.Security.SecurityCritical]  // auto-generated
 843         internal override unsafe int GetBytes(char* chars, int charCount,
 844                                                 byte* bytes, int byteCount, EncoderNLS baseEncoder)
 845         {
 846             Contract.Assert(chars!=null, "[UTF8Encoding.GetBytes]chars!=null");
 847             Contract.Assert(byteCount >=0, "[UTF8Encoding.GetBytes]byteCount >=0");
 848             Contract.Assert(charCount >=0, "[UTF8Encoding.GetBytes]charCount >=0");
 849             Contract.Assert(bytes!=null, "[UTF8Encoding.GetBytes]bytes!=null");
 850
 851             UTF8Encoder encoder = null;
 852
 853             // For fallback we may need a fallback buffer.
 854             // We wait to initialize it though in case we don't have any broken input unicode
 855             EncoderFallbackBuffer fallbackBuffer = null;
 856             char *pSrc = chars;
 857             byte *pTarget = bytes;
 858
 859             char *pEnd = pSrc+charCount;
 860             byte *pAllocatedBufferEnd = pTarget+byteCount;
 861
                 // ch is the char (or combined surrogate pair) currently being processed. At the top of
                 // the slow loop a non-zero ch is a pending high surrogate (see the asserts below) and
                 // 0 means nothing is pending.
 862             int ch = 0;
 863
 864             // assume that JIT will enregister pSrc, pTarget and ch
 865
 866             if (baseEncoder != null) {
 867                 encoder = (UTF8Encoder)baseEncoder;
                     // Resume with whatever the previous call stored as its leftover char
 868                 ch = encoder.surrogateChar;
 869
 870                 // We mustn't have left over fallback data when counting
 871                 if (encoder.InternalHasFallbackBuffer)
 872                 {
 873                     // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
 874                     fallbackBuffer = encoder.FallbackBuffer;
 875                     if (fallbackBuffer.Remaining > 0 && encoder.m_throwOnOverflow)
 876                         throw new ArgumentException(Environment.GetResourceString("Argument_EncoderFallbackNotEmpty",
 877                         this.EncodingName, encoder.Fallback.GetType()));
 878
 879                     // Set our internal fallback interesting things.
 880                     fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
 881                 }
 882             }
 883
 884             for (;;) {
 885                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
 886
                     // Input exhausted: drain the fallback buffer and/or deal with a pending high
                     // surrogate before finishing.
 887                 if (pSrc >= pEnd) {
 888
 889                     if (ch == 0) {
 890                         // Check if there's anything left to get out of the fallback buffer
 891                         ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
 892                         if (ch > 0) {
 893                             goto ProcessChar;
 894                         }
 895                     } else {
 896                         // Case of leftover surrogates in the fallback buffer
 897                         if (fallbackBuffer != null && fallbackBuffer.bFallingBack) {
 898                             Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 899                                 "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 900
 901                             int cha = ch;
 902
 903                             ch = fallbackBuffer.InternalGetNextChar();
 904
 905                             if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
                                     // Combine the pair into one scalar value; algebraically this is
                                     // ((cha - HIGH_SURROGATE_START) << 10) + (ch - LOW_SURROGATE_START) + 0x10000
 906                                 ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
 907                                 goto EncodeChar;
 908                             } else if (ch > 0){
 909                                 goto ProcessChar;
 910                             } else {
 911                                 break;
 912                             }
 913                         }
 914                     }
 915
 916                     // attempt to encode the partial surrogate (will fail or ignore)
 917                     if (ch > 0 && (encoder == null || encoder.MustFlush))
 918                         goto EncodeChar;
 919
 920                     // We're done
 921                     break;
 922                 }
 923
 924                 if (ch > 0) {
 925                     // We have a high surrogate left over from a previous loop.
 926                     Contract.Assert(ch >= 0xD800 && ch <= 0xDBFF,
 927                         "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
 928
 929                     // use separate helper variables for local contexts so that the jit optimizations
 930                     // won't get confused about the variable lifetimes
 931                     int cha = *pSrc;
 932
 933                     // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
 934                     // if (IsLowSurrogate(cha)) {
 935                     if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
                             // Same pair-combining formula as above, with ch as the high and cha as the
                             // low surrogate. The low surrogate is consumed only when the pair is valid.
 936                         ch = cha + (ch << 10) +
 937                             (0x10000
 938                             - CharUnicodeInfo.LOW_SURROGATE_START
 939                             - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
 940
 941                         pSrc++;
 942                     }
 943                     // else ch is still high surrogate and encoding will fail
 944
 945                     // attempt to encode the surrogate or partial surrogate
 946                     goto EncodeChar;
 947                 }
 948
 949                 // If we've used a fallback, then we have to check for it
 950                 if (fallbackBuffer != null)
 951                 {
 952                     ch = fallbackBuffer.InternalGetNextChar();
 953                     if (ch > 0) goto ProcessChar;
 954                 }
 955
 956                 // read next char. The JIT optimization seems to be getting confused when
 957                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
 958                 ch = *pSrc;
 959                 pSrc++;
 960
 961             ProcessChar:
                     // A high surrogate is not encoded on its own: keep it in ch and go around the
                     // slow loop again so it can be paired with the char that follows.
 962                 // if (IsHighSurrogate(ch)) {
 963                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END)) {
 964                     continue;
 965                 }
 966                 // either good char or partial surrogate
 967
 968             EncodeChar:
 969                 // throw exception on partial surrogate if necessary
 970                 // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
 971                 if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
 972                 {
 973                     // Lone surrogates aren't allowed, we have to do fallback for them
 974                     // Have to make a fallback buffer if we don't have one
 975                     if (fallbackBuffer == null)
 976                     {
 977                         // wait on fallbacks if we can
 978                         // For fallback we may need a fallback buffer
 979                         if (baseEncoder == null)
 980                             fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
 981                         else
 982                             fallbackBuffer = baseEncoder.FallbackBuffer;
 983
 984                         // Set our internal fallback interesting things.
 985                         fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
 986                     }
 987
 988                     // Do our fallback.  Actually we already know its a mixed up surrogate,
 989                     // so the ref pSrc isn't gonna do anything.
 990                     fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrc);
 991
 992                     // Ignore it if we don't throw
 993                     ch = 0;
 994                     continue;
 995                 }
 996
 997                 // Count bytes needed
 998                 int bytesNeeded = 1;
 999                 if (ch > 0x7F) {
1000                     if (ch > 0x7FF) {
1001                         if (ch > 0xFFFF) {
1002                             bytesNeeded++;  // 4 bytes (surrogate pair)
1003                         }
1004                         bytesNeeded++;      // 3 bytes (800-FFFF)
1005                     }
1006                     bytesNeeded++;          // 2 bytes (80-7FF)
1007                 }
1008
                     // Not enough room left for this character: un-consume it (from the fallback buffer
                     // if we are falling back, otherwise from the input) and stop.
1009                 if (pTarget > pAllocatedBufferEnd - bytesNeeded) {
1010                     // Left over surrogate from last time will cause pSrc == chars, so we'll throw
1011                     if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
1012                     {
1013                         fallbackBuffer.MovePrevious();              // Didn't use this fallback char
1014                         if (ch > 0xFFFF)
1015                             fallbackBuffer.MovePrevious();          // Was surrogate, didn't use 2nd part either
1016                     }
1017                     else
1018                     {
1019                         pSrc--;                                     // Didn't use this char
1020                         if (ch > 0xFFFF)
1021                             pSrc--;                                 // Was surrogate, didn't use 2nd part either
1022                     }
1023                     Contract.Assert(pSrc >= chars || pTarget == bytes,
1024                         "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
                         // NOTE(review): ThrowBytesOverflow is defined elsewhere; the statements after
                         // it show it can return without throwing - confirm its exact conditions there.
1025                     ThrowBytesOverflow(encoder, pTarget == bytes);  // Throw if we must
1026                     ch = 0;                                         // Nothing left over (we backed up to start of pair if supplementary)
1027                     break;
1028                 }
1029
                     // Emit the UTF-8 sequence. The markers are written as unchecked((sbyte)0xNN): the
                     // negative sbyte sign-extends when OR-ed into the int, and the final (byte) cast
                     // keeps only the low 8 bits. Note that '&' binds tighter than '|', so
                     // "marker | x & 0x3F" means "marker | (x & 0x3F)".
1030                 if (ch <= 0x7F) {
1031                     *pTarget = (byte)ch;
1032                 }
1033                 else {
1034                     // use separate helper variables for local contexts so that the jit optimizations
1035                     // won't get confused about the variable lifetimes
1036                     int chb;
1037                     if (ch <= 0x7FF) {
1038                         // 2 byte encoding
1039                         chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
1040                     }
1041                     else
1042                     {
1043                         if (ch <= 0xFFFF) {
                                 // 3 byte encoding
1044                             chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
1045                         }
1046                         else
1047                         {
                                 // 4 byte encoding: write the lead byte now, then fall into the shared
                                 // trail-byte code below
1048                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1049                             pTarget++;
1050
1051                             chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1052                         }
1053                         *pTarget = (byte)chb;
1054                         pTarget++;
1055
1056                         chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1057                     }
1058                     *pTarget = (byte)chb;
1059                     pTarget++;
1060
1061                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1062                 }
1063                 pTarget++;
1064
1065
1066 #if FASTLOOP
1067                 // If still have fallback don't do fast loop
1068                 if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
1069                     goto ProcessChar;
1070
1071                 int availableChars = PtrDiff(pEnd, pSrc);
1072                 int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
1073
1074                 // don't fall into the fast decoding loop if we don't have enough characters
1075                 // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
1076                 if (availableChars <= 13) {
1077                     // we are hoping for 1 byte per char
1078                     if (availableBytes < availableChars) {
1079                         // not enough output room.  no pending bits at this point
1080                         ch = 0;
1081                         continue;
1082                     }
1083
1084                     // try to get over the remainder of the ascii characters fast though
1085                     char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1086                     while (pSrc < pLocalEnd) {
1087                         ch = *pSrc;
1088                         pSrc++;
1089
1090                         // Not ASCII, need more than 1 byte per char
1091                         if (ch > 0x7F)
1092                             goto ProcessChar;
1093
1094                         *pTarget = (byte)ch;
1095                         pTarget++;
1096                     }
1097                     // we are done, let ch be 0 to clear encoder
1098                     ch = 0;
1099                     break;
1100                 }
1101
1102                 // we need at least 1 byte per character, but Convert might allow us to convert
1103                 // only part of the input, so try as much as we can.  Reduce charCount if necessary
1104                 if (availableBytes < availableChars)
1105                 {
1106                     availableChars = availableBytes;
1107                 }
1108
1109                 // FASTLOOP:
1110                 // - optimistic range checks
1111                 // - fallbacks to the slow loop for all special cases, exception throwing, etc.
1112
1113                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1114                 //  the boundary will be decreased for every non-ASCII character we encounter
1115                 // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
1116                 // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
1117                 char *pStop = pSrc + availableChars - 5;
1118
1119                 while (pSrc < pStop) {
1120                     ch = *pSrc;
1121                     pSrc++;
1122
1123                     if (ch > 0x7F) {
1124                         goto LongCode;
1125                     }
1126                     *pTarget = (byte)ch;
1127                     pTarget++;
1128
1129                     // get pSrc aligned
                         // (consume one more char when the address has bit 1 set, so that the int-sized
                         // reads below start on a 4-byte boundary)
1130                     if ((unchecked((int)pSrc) & 0x2) != 0) {
1131                         ch = *pSrc;
1132                         pSrc++;
1133                         if (ch > 0x7F) {
1134                             goto LongCode;
1135                         }
1136                         *pTarget = (byte)ch;
1137                         pTarget++;
1138                     }
1139
1140                     // Run 4 characters at a time!
1141                     while (pSrc < pStop) {
                             // Each int read holds two chars; any bit outside 0x007F in either half of
                             // either int means a non-ASCII char is present.
1142                         ch = *(int*)pSrc;
1143                         int chc = *(int*)(pSrc+2);
1144                         if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) {
1145                             goto LongCodeWithMask;
1146                         }
1147
1148                         // Unfortunately, this is endianness sensitive
1149 if (!BitConverter.IsLittleEndian) {
1150                         *pTarget = (byte)(ch>>16);
1151                         *(pTarget+1) = (byte)ch;
1152                         pSrc += 4;
1153                         *(pTarget+2) = (byte)(chc>>16);
1154                         *(pTarget+3) = (byte)chc;
1155                         pTarget += 4;
1156 } else {
1157                         *pTarget = (byte)ch;
1158                         *(pTarget+1) = (byte)(ch>>16);
1159                         pSrc += 4;
1160                         *(pTarget+2) = (byte)chc;
1161                         *(pTarget+3) = (byte)(chc>>16);
1162                         pTarget += 4;
1163 }
1164                     }
1165                     continue;
1166
1167                 LongCodeWithMask:
                         // Extract only the first char of the int just read (upper half on big-endian,
                         // lower half on little-endian) and consume just that one char; the chars after
                         // it are picked up again by the loop.
1168 if (!BitConverter.IsLittleEndian) {
1169                     // be careful about the sign extension
1170                     ch = (int)(((uint)ch) >> 16);
1171 } else {
1172                     ch = (char)ch;
1173 }
1174                     pSrc++;
1175
1176                     if (ch > 0x7F) {
1177                         goto LongCode;
1178                     }
1179                     *pTarget = (byte)ch;
1180                     pTarget++;
1181                     continue;
1182
1183                 LongCode:
1184                     // use separate helper variables for slow and fast loop so that the jit optimizations
1185                     // won't get confused about the variable lifetimes
1186                     int chd;
1187                     if (ch <= 0x7FF) {
1188                         // 2 byte encoding
1189                         chd = unchecked((sbyte)0xC0) | (ch >> 6);
1190                     }
1191                     else {
1192                         // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
1193                         if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
1194                             // 3 byte encoding
1195                             chd = unchecked((sbyte)0xE0) | (ch >> 12);
1196                         }
1197                         else
1198                         {
1199                             // 4 byte encoding - high surrogate + low surrogate
1200                             // if (!IsHighSurrogate(ch))
1201                             if (ch > CharUnicodeInfo.HIGH_SURROGATE_END) {
1202                                 // low without high -> bad, try again in slow loop
1203                                 pSrc -= 1;
1204                                 break;
1205                             }
1206
1207                             chd = *pSrc;
1208                             pSrc++;
1209
1210                             // if (!IsLowSurrogate(chd)) {
1211                             if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END)) {
1212                                 // high not followed by low -> bad, try again in slow loop
                                     // (back up over both chars so the slow loop sees the high surrogate again)
1213                                 pSrc -= 2;
1214                                 break;
1215                             }
1216
1217                             ch = chd + (ch << 10) +
1218                                 (0x10000
1219                                 - CharUnicodeInfo.LOW_SURROGATE_START
1220                                 - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
1221
1222                             *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
1223                             // pStop - this byte is compensated by the second surrogate character
1224                             // 2 input chars require 4 output bytes.  2 have been anticipated already
1225                             // and 2 more will be accounted for by the 2 pStop-- calls below.
1226                             pTarget++;
1227
1228                             chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
1229                         }
1230                         *pTarget = (byte)chd;
1231                         pStop--;                    // 3 byte sequence for 1 char, so need pStop-- and the one below too.
1232                         pTarget++;
1233
1234                         chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
1235                     }
1236                     *pTarget = (byte)chd;
1237                     pStop--;                        // 2 byte sequence for 1 char so need pStop--.
1238                     pTarget++;
1239
1240                     *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
1241                     // pStop - this byte is already included
1242                     pTarget++;
1243                 }
1244
1245                 Contract.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
1246
1247 #endif // FASTLOOP
1248
1249                 // no pending char at this point
1250                 ch = 0;
1251             }
1252
1253             // Do we have to set the encoder bytes?
                 // (store the leftover char, if any, and the number of input chars consumed)
1254             if (encoder != null)
1255             {
1256                 Contract.Assert(!encoder.MustFlush || ch == 0,
1257                     "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
1258
1259                 encoder.surrogateChar = ch;
1260                 encoder.m_charsUsed = (int)(pSrc - chars);
1261             }
1262
1263             Contract.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1264                 baseEncoder == null || !baseEncoder.m_throwOnOverflow,
1265                 "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
1266
                 // Number of bytes written to the output buffer
1267             return (int)(pTarget - bytes);
1268         }
1269
1270
1271         // State flags that the decoder keeps next to the character it is assembling. The flags sit
1272         // in the higher bits while the character itself is built up in the lower bits, and both are
1273         // shifted together with the actual bits of the character.
1274
1275         // bits 30 & 31 are used for pending bits fixup
1276         private const int FinalByte         = 0x20000000;   // bit 29
1277         private const int SupplimentarySeq  = 0x10000000;   // bit 28
1278         private const int ThreeByteSeq      = 0x08000000;   // bit 27
1279
1280         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1281         //        If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
1282         //
1283         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1284         // kept the same as much as possible
             //
             // Returns the number of chars that decoding `count` bytes starting at `bytes` produces.
             //
             // Approach: charCount starts at `count` (one char per byte) and is decremented as
             // multi-byte sequences are recognized; for invalid sequences the value returned by
             // FallbackInvalidByteSequence is added instead.
             //
             // bytes       - start of the input; asserted non-null.
             // count       - number of input bytes; asserted >= 0.
             // baseDecoder - optional. When non-null it is cast to UTF8Decoder and its `bits` field seeds
             //               the pending-sequence state; `bits` is only read here, never written.
             //               When it is null, or its MustFlush is true, a sequence that is still pending
             //               at the end of the input is handed to the fallback as an invalid sequence.
1285         [System.Security.SecurityCritical]  // auto-generated
1286         internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
1287         {
1288             Contract.Assert(count >=0, "[UTF8Encoding.GetCharCount]count >=0");
1289             Contract.Assert(bytes!=null, "[UTF8Encoding.GetCharCount]bytes!=null");
1290
1291             // Set up the read cursor and the end-of-input sentinel
1292             byte *pSrc = bytes;
1293             byte *pEnd = pSrc+count;
1294
1295             // Start by assuming we have as many as count, charCount always includes the adjustment
1296             // for the character being decoded
1297             int charCount = count;
                 // ch is the sequence being assembled: payload in the low bits, state flags
                 // (FinalByte, SupplimentarySeq, ThreeByteSeq, count fix-up in bits 30/31) above them.
                 // ch == 0 means nothing is pending. The fast loop also reuses ch for raw input bytes.
1298             int ch = 0;
                 // Created lazily, only when the first invalid sequence is met
1299             DecoderFallbackBuffer fallback = null;
1300
1301             if (baseDecoder != null) {
1302                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1303                 ch = decoder.bits;
                     // ch >> 30 is an arithmetic shift on a signed int, so bits 30/31 read as a signed
                     // fix-up: +1, 0 or -1 for the states the lead-byte handlers below build.
1304                 charCount -= (ch >> 30);        // Adjust char count for # of expected bytes and expected output chars.
1305
1306                 // Shouldn't have anything in fallback buffer for GetCharCount
1307                 // (don't have to check m_throwOnOverflow for count)
1308                 Contract.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1309                     "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
1310             }
1311
1312             for (;;)
1313             {
1314                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1315
1316                 if (pSrc >= pEnd) {
1317                     break;
1318                 }
1319
1320                 if (ch == 0) {
1321                     // no pending bits
1322                     goto ReadChar;
1323                 }
1324
1325                 // read next byte. The JIT optimization seems to be getting confused when
1326                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1327                 int cha = *pSrc;
1328                 pSrc++;
1329
1330                 // we are expecting to see trailing bytes like 10vvvvvv
1331                 if ((cha & unchecked((sbyte)0xC0)) != 0x80) {
1332                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1333                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1334                     pSrc--;
                         // undo the fix-up that was pre-applied for trail bytes that never arrived
1335                     charCount += (ch >> 30);
1336                     goto InvalidByteSequence;
1337                 }
1338
1339                 // fold in the new byte (the state flags move up by 6 bits together with the payload)
1340                 ch = (ch << 6) | (cha & 0x3F);
1341
1342                 if ((ch & FinalByte) == 0) {
                         // Not at the last byte yet, so this must be a 3- or 4-byte sequence.
                         // NOTE(review): the assert message below names GetChars although this is GetCharCount.
1343                     Contract.Assert( (ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1344                         "[UTF8Encoding.GetChars]Invariant volation");
1345
1346                     if ((ch & SupplimentarySeq) != 0) {
1347                         if ((ch & (FinalByte >> 6)) != 0) {
1348                             // this is 3rd byte (of 4 byte supplementary) - nothing to do
1349                             continue;
1350                         }
1351
1352                         // 2nd byte, check for non-shortest form of supplementary char and the valid
1353                         // supplementary characters in range 0x010000 - 0x10FFFF at the same time.
                             // Bits 4-8 of ch hold code point bits 16-20 at this stage.
                             // InRange is defined outside this chunk - presumably an inclusive range test.
1354                         if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
1355                             goto InvalidByteSequence;
1356                         }
1357                     }
1358                     else {
1359                         // Must be 2nd byte of a 3-byte sequence
1360                         // check for non-shortest form of 3 byte seq
1361                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1362                             (ch & (0xF800 >> 6) ) == (0xD800 >> 6))     // illegal individually encoded surrogate
1363                         {
1364                             goto InvalidByteSequence;
1365                         }
1366                     }
1367                     continue;
1368                 }
1369
1370                 // ready to punch
1371
1372                 // adjust for surrogates in non-shortest form: the 4-byte flag is set but
                     // code point bits 16-20 are all zero, so count one char less
1373                 if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) {
1374                     charCount--;
1375                 }
1376                 goto EncodeChar;
1377
1378             InvalidByteSequence:
1379                 // this code fragment should be close to the gotos referencing it
1380                 // Have to do fallback for invalid bytes
1381                 if (fallback == null)
1382                 {
1383                     if (baseDecoder == null)
1384                         fallback = this.decoderFallback.CreateFallbackBuffer();
1385                     else
1386                         fallback = baseDecoder.FallbackBuffer;
1387                     fallback.InternalInitialize(bytes, null);
1388                 }
                     // FallbackInvalidByteSequence is defined outside this chunk; presumably it returns
                     // the number of chars the fallback produces for the bad bytes - confirm there.
1389                 charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1390
1391                 ch = 0;
1392                 continue;
1393
1394             ReadChar:
1395                 ch = *pSrc;
1396                 pSrc++;
1397
1398             ProcessChar:
1399                 if (ch > 0x7F) {
1400                     // If it's > 0x7F, it's the start of a new multi-byte sequence
1401
1402                     // Long sequence, so unreserve our char.
1403                     charCount--;
1404
1405                     // bit 6 has to be non-zero for start of multibyte chars.
1406                     if ((ch & 0x40) == 0) {
1407                         // Unexpected trail byte
1408                         goto InvalidByteSequence;
1409                     }
1410
1411                     // start a new long code
1412                     if ((ch & 0x20) != 0) {
1413                         if ((ch & 0x10) != 0) {
1414                             // 4 byte encoding - supplementary character (2 surrogates)
1415
1416                             ch &= 0x0F;
1417
1418                             // check that bit 4 is zero and the valid supplementary character
1419                             // range 0x000000 - 0x10FFFF at the same time
1420                             if (ch > 0x04) {
                                     // put the high nibble back so ch is the original lead byte again
1421                                 ch |= 0xf0;
1422                                 goto InvalidByteSequence;
1423                             }
1424
1425                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1426                             // Final byte flag, count fix if we don't make final byte & supplementary sequence flag.
1427                             ch |= (FinalByte >> 3*6) |  // Final byte is 3 more bytes from now
1428                                   (1 << 30) |           // If it dies on next byte we'll need an extra char
1429                                   (3 << (30-2*6)) |     // If it dies on last byte we'll need to subtract a char
1430                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1431                                 (SupplimentarySeq >> 2*6) | (SupplimentarySeq >> 3*6);
1432
1433                             // Our character count will be 2 characters for these 4 bytes, so subtract another char
1434                             charCount--;
1435                         }
1436                         else {
1437                             // 3 byte encoding
1438                             // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
1439                             ch = (ch & 0x0F) | ( (FinalByte >> 2*6) | (1 << 30) |
1440                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2*6) );
1441
1442                             // We'll expect 1 character for these 3 bytes, so subtract another char.
1443                             charCount--;
1444                         }
1445                     }
1446                     else {
1447                         // 2 byte encoding
1448
1449                         ch &= 0x1F;
1450
1451                         // check for non-shortest form (lead bytes 0xC0 / 0xC1)
1452                         if (ch <= 1) {
                                 // put the top bits back so ch is the original lead byte again
1453                             ch |= 0xc0;
1454                             goto InvalidByteSequence;
1455                         }
1456
1457                         // Add bit flags so we'll be flagged correctly
1458                         ch |= (FinalByte >> 6);
1459                     }
1460                     continue;
1461                 }
1462
1463             EncodeChar:
                     // Nothing is written when counting; a completed or ASCII char just falls into the fast loop.
1464
1465 #if FASTLOOP
                     // PtrDiff is defined outside this chunk; presumably the byte distance pEnd - pSrc.
1466                 int availableBytes = PtrDiff(pEnd, pSrc);
1467
1468                 // don't fall into the fast decoding loop if we don't have enough bytes
1469                 if (availableBytes <= 13) {
1470                     // try to get over the remainder of the ascii characters fast though
1471                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1472                     while (pSrc < pLocalEnd) {
1473                         ch = *pSrc;
1474                         pSrc++;
1475
                             // a non-ASCII byte goes back to the slow path, which does all the checks
1476                         if (ch > 0x7F)
1477                             goto ProcessChar;
1478                     }
1479                     // we are done
1480                     ch = 0;
1481                     break;
1482                 }
1483
1484                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1485                 //  the boundary will be decreased for every non-ASCII character we encounter
1486                 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
                     // NOTE(review): pStop is never reassigned in this method, so the "decreased" remark
                     // above does not describe GetCharCount; here the bound stays at pEnd - 7.
1487                 byte *pStop = pSrc + availableBytes - 7;
1488
1489                 while (pSrc < pStop) {
1490                     ch = *pSrc;
1491                     pSrc++;
1492
1493                     if (ch > 0x7F) {
1494                         goto LongCode;
1495                     }
1496
1497                     // get pSrc 2-byte aligned
1498                     if ((unchecked((int)pSrc) & 0x1) != 0) {
1499                         ch = *pSrc;
1500                         pSrc++;
1501                         if (ch > 0x7F) {
1502                             goto LongCode;
1503                         }
1504                     }
1505
1506                     // get pSrc 4-byte aligned
1507                     if ((unchecked((int)pSrc) & 0x2) != 0) {
1508                         ch = *(ushort*)pSrc;
1509                         if ((ch & 0x8080) != 0) {
1510                             goto LongCodeWithMask16;
1511                         }
1512                         pSrc += 2;
1513                     }
1514
1515                     // Run 8 + 8 characters at a time!
1516                     while (pSrc < pStop) {
1517                         ch = *(int*)pSrc;
1518                         int chb = *(int*)(pSrc+4);
                             // any high bit set in the 8 bytes means a non-ASCII byte is present
1519                         if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
1520                             goto LongCodeWithMask32;
1521                         }
1522                         pSrc += 8;
1523
1524                         // This is a really small loop - unroll it
1525                         if (pSrc >= pStop)
1526                             break;
1527
1528                         ch = *(int*)pSrc;
1529                         chb = *(int*)(pSrc+4);
1530                         if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
1531                             goto LongCodeWithMask32;
1532                         }
1533                         pSrc += 8;
1534                     }
                         // fast loop exhausted: leave the outer while and resume the slow loop
1535                     break;
1536
1537                 LongCodeWithMask32:
1538                     // be careful about the sign extension
                         // Both labels reduce ch to the byte at the lowest address, i.e. the next unread byte.
1539 if (!BitConverter.IsLittleEndian) {
1540                     ch = (int)(((uint)ch) >> 16);
1541 } else {
1542                     ch &= 0xFF;
1543 }
1544                 LongCodeWithMask16:
1545 if (!BitConverter.IsLittleEndian) {
1546                     ch = (int)(((uint)ch) >> 8);
1547 } else {
1548                     ch &= 0xFF;
1549 }
1550                     pSrc++;
                         // the offending byte was further along: consume this ASCII byte and rescan
1551                     if (ch <= 0x7F) {
1552                         continue;
1553                     }
1554
1555                 LongCode:
                         // ch is the lead byte; chc receives the first trail byte
1556                     int chc = *pSrc;
1557                     pSrc++;
1558
1559                     if (
1560                         // bit 6 has to be set in a lead byte (a clear bit 6 means a stray trail byte)
1561                         (ch & 0x40) == 0 ||
1562                         // we are expecting to see trailing bytes like 10vvvvvv
1563                         (chc & unchecked((sbyte)0xC0)) != 0x80)
1564                     {
1565                         goto BadLongCode;
1566                     }
1567
1568                     chc &= 0x3F;
1569
1570                     // start a new long code
1571                     if ((ch & 0x20) != 0) {
1572
1573                         // fold the first two bytes together
1574                         chc |= (ch & 0x0F) << 6;
1575
1576                         if ((ch & 0x10) != 0) {
1577                             // 4 byte encoding - surrogate
1578                             ch = *pSrc;
1579                             if (
1580                                 // check that bit 4 is zero, the non-shortest form of surrogate
1581                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
1582                                 !InRange(chc >> 4, 0x01, 0x10) ||
1583                                 // we are expecting to see trailing bytes like 10vvvvvv
1584                                 (ch & unchecked((sbyte)0xC0)) != 0x80 )
1585                             {
1586                                 goto BadLongCode;
1587                             }
1588
1589                             chc = (chc << 6) | (ch & 0x3F);
1590
1591                             ch = *(pSrc+1);
1592                             // we are expecting to see trailing bytes like 10vvvvvv
1593                             if ((ch & unchecked((sbyte)0xC0)) != 0x80) {
1594                                 goto BadLongCode;
1595                             }
1596                             pSrc += 2;
1597
1598                             // extra byte (together with the decrement at the end: 4 bytes count as 2 chars)
1599                             charCount--;
1600                         }
1601                         else {
1602                             // 3 byte encoding
1603                             ch = *pSrc;
1604                             if (
1605                                 // check for non-shortest form of 3 byte seq
1606                                 (chc & (0x1F << 5)) == 0 ||
1607                                 // Can't have surrogates here.
1608                                 (chc & (0xF800 >> 6) ) == (0xD800 >> 6) ||
1609                                 // we are expecting to see trailing bytes like 10vvvvvv
1610                                 (ch & unchecked((sbyte)0xC0)) != 0x80 )
1611                             {
1612                                 goto BadLongCode;
1613                             }
1614                             pSrc++;
1615
1616                             // extra byte (together with the decrement at the end: 3 bytes count as 1 char)
1617                             charCount--;
1618                         }
1619                     }
1620                     else {
1621                         // 2 byte encoding
1622
1623                         // check for non-shortest form
1624                         if ((ch & 0x1E) == 0) {
1625                             goto BadLongCode;
1626                         }
1627                     }
1628
1629                     // extra byte (applies to every multi-byte sequence accepted above)
1630                     charCount--;
1631                 }
1632 #endif // FASTLOOP
1633
1634                 // no pending bits at this point
1635                 ch = 0;
1636                 continue;
1637
1638             BadLongCode:
                     // Rewind over the lead byte and the first trail byte (the only two bytes consumed
                     // before any jump here) so the slow loop re-reads the sequence and reports it.
1639                 pSrc -= 2;
1640                 ch = 0;
1641                 continue;
1642             }
1643
1644             // May have a problem if we have to flush
1645             if (ch != 0)
1646             {
1647                 // We were already adjusting for these, so need to unadjust
1648                 charCount += (ch >> 30);
                     // Without a decoder, or when it must flush, the incomplete sequence is invalid input
1649                 if (baseDecoder == null || baseDecoder.MustFlush)
1650                 {
1651                     // Have to do fallback for invalid bytes
1652                     if (fallback == null)
1653                     {
1654                         if (baseDecoder == null)
1655                             fallback = this.decoderFallback.CreateFallbackBuffer();
1656                         else
1657                             fallback = baseDecoder.FallbackBuffer;
1658                         fallback.InternalInitialize(bytes, null);
1659                     }
1660                     charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
1661                 }
1662             }
1663
1664             // Shouldn't have anything in fallback buffer for GetCharCount
1665             // (don't have to check m_throwOnOverflow for count)
1666             Contract.Assert(fallback == null || fallback.Remaining == 0,
1667                 "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
1668
1669             return charCount;
1670         }
1671
1672         // WARNING:  If we throw an error, then System.Resources.ResourceReader calls this method.
1673         //           So if we're really broken, then that could also throw an error... recursively.
1674         //           So try to make sure GetChars can at least process all uses by
1675         //           System.Resources.ResourceReader!
1676         //
1677         // Note:  We throw exceptions on individually encoded surrogates and other non-shortest forms.
1678         //        If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
1679         //
1680         // To simplify maintenance, the structure of GetCharCount and GetChars should be
1681         // kept the same as much as possible
1682         [System.Security.SecurityCritical]  // auto-generated
1683         internal override unsafe int GetChars(byte* bytes, int byteCount,
1684                                                 char* chars, int charCount, DecoderNLS baseDecoder)
1685         {
1686             Contract.Assert(chars!=null, "[UTF8Encoding.GetChars]chars!=null");
1687             Contract.Assert(byteCount >=0, "[UTF8Encoding.GetChars]count >=0");
1688             Contract.Assert(charCount >=0, "[UTF8Encoding.GetChars]charCount >=0");
1689             Contract.Assert(bytes!=null, "[UTF8Encoding.GetChars]bytes!=null");
1690
1691             byte *pSrc = bytes;
1692             char *pTarget = chars;
1693
1694             byte *pEnd = pSrc+byteCount;
1695             char *pAllocatedBufferEnd = pTarget+charCount;
1696
1697             int ch = 0;
1698
1699             DecoderFallbackBuffer fallback = null;
1700             if (baseDecoder != null) {
1701                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
1702                 ch = decoder.bits;
1703
1704                 // Shouldn't have anything in fallback buffer for GetChars
1705                 // (don't have to check m_throwOnOverflow for chars, we always use all or none so always should be empty)
1706                 Contract.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1707                     "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
1708             }
1709
1710             for (;;)
1711             {
1712                 // SLOWLOOP: does all range checks, handles all special cases, but it is slow
1713
1714                 if (pSrc >= pEnd) {
1715                     break;
1716                 }
1717
1718                 if (ch == 0) {
1719                     // no pending bits
1720                     goto ReadChar;
1721                 }
1722
1723                 // read next byte. The JIT optimization seems to be getting confused when
1724                 // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
1725                 int cha = *pSrc;
1726                 pSrc++;
1727
1728                 // we are expecting to see trailing bytes like 10vvvvvv
1729                 if ((cha & unchecked((sbyte)0xC0)) != 0x80) {
1730                     // This can be a valid starting byte for another UTF8 byte sequence, so let's put
1731                     // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
1732                     pSrc--;
1733                     goto InvalidByteSequence;
1734                 }
1735
1736                 // fold in the new byte
1737                 ch = (ch << 6) | (cha & 0x3F);
1738
1739                 if ((ch & FinalByte) == 0) {
1740                     // Not at last byte yet
1741                     Contract.Assert( (ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
1742                         "[UTF8Encoding.GetChars]Invariant volation");
1743
1744                     if ((ch & SupplimentarySeq) != 0) {
1745                         // Its a 4-byte supplimentary sequence
1746                         if ((ch & (FinalByte >> 6)) != 0) {
1747                             // this is 3rd byte of 4 byte sequence - nothing to do
1748                             continue;
1749                         }
1750
1751                         // 2nd byte of 4 bytes
1752                         // check for non-shortest form of surrogate and the valid surrogate
1753                         // range 0x000000 - 0x10FFFF at the same time
1754                         if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
1755                             goto InvalidByteSequence;
1756                         }
1757                     }
1758                     else {
1759                         // Must be 2nd byte of a 3-byte sequence
1760                         // check for non-shortest form of 3 byte seq
1761                         if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
1762                             (ch & (0xF800 >> 6) ) == (0xD800 >> 6))     // illegal individually encoded surrogate
1763                         {
1764                             goto InvalidByteSequence;
1765                         }
1766                     }
1767                     continue;
1768                 }
1769
1770                 // ready to punch
1771
1772                 // surrogate in shortest form?
1773                 // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
1774                 if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) {
1775                     // let the range check for the second char throw the exception
1776                     if (pTarget < pAllocatedBufferEnd) {
1777                         *pTarget = (char)( ((ch >> 10) & 0x7FF) +
1778                             unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))) );
1779                         pTarget++;
1780
1781                         ch = (ch & 0x3FF) +
1782                             unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
1783                     }
1784                 }
1785
1786                 goto EncodeChar;
1787
1788             InvalidByteSequence:
1789                 // this code fragment should be close to the gotos referencing it
1790                 // Have to do fallback for invalid bytes
1791                 if (fallback == null)
1792                 {
1793                     if (baseDecoder == null)
1794                         fallback = this.decoderFallback.CreateFallbackBuffer();
1795                     else
1796                         fallback = baseDecoder.FallbackBuffer;
1797                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
1798                 }
1799                 // This'll back us up the appropriate # of bytes if we didn't get anywhere
1800                 if (!FallbackInvalidByteSequence(ref pSrc, ch, fallback, ref pTarget))
1801                 {
1802                     // Ran out of buffer space
1803                     // Need to throw an exception?
1804                     Contract.Assert(pSrc >= bytes || pTarget == chars,
1805                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
1806                     fallback.InternalReset();
1807                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1808                     ch = 0;
1809                     break;
1810                 }
1811                 Contract.Assert(pSrc >= bytes,
1812                     "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
1813                 ch = 0;
1814                 continue;
1815
1816             ReadChar:
1817                 ch = *pSrc;
1818                 pSrc++;
1819
1820             ProcessChar:
1821                 if (ch > 0x7F) {
1822                     // If its > 0x7F, its start of a new multi-byte sequence
1823
1824                     // bit 6 has to be non-zero
1825                     if ((ch & 0x40) == 0) {
1826                         goto InvalidByteSequence;
1827                     }
1828
1829                     // start a new long code
1830                     if ((ch & 0x20) != 0) {
1831                         if ((ch & 0x10) != 0) {
1832                             // 4 byte encoding - supplimentary character (2 surrogates)
1833
1834                             ch &= 0x0F;
1835
1836                             // check that bit 4 is zero and the valid supplimentary character
1837                             // range 0x000000 - 0x10FFFF at the same time
1838                             if (ch > 0x04) {
1839                                 ch |= 0xf0;
1840                                 goto InvalidByteSequence;
1841                             }
1842
1843                             ch |= (FinalByte >> 3*6) | (1 << 30) | (3 << (30-2*6)) |
1844                                 (SupplimentarySeq) | (SupplimentarySeq >> 6) |
1845                                 (SupplimentarySeq >> 2*6) | (SupplimentarySeq >> 3*6);
1846                         }
1847                         else {
1848                             // 3 byte encoding
1849                             ch = (ch & 0x0F) | ( (FinalByte >> 2*6) | (1 << 30) |
1850                                 (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2*6) );
1851                         }
1852                     }
1853                     else {
1854                         // 2 byte encoding
1855
1856                         ch &= 0x1F;
1857
1858                         // check for non-shortest form
1859                         if (ch <= 1) {
1860                             ch |= 0xc0;
1861                             goto InvalidByteSequence;
1862                         }
1863
1864                         ch |= (FinalByte >> 6);
1865                     }
1866                     continue;
1867                 }
1868
1869             EncodeChar:
1870                 // write the pending character
1871                 if (pTarget >= pAllocatedBufferEnd)
1872                 {
1873                     // Fix chars so we make sure to throw if we didn't output anything
1874                     ch &= 0x1fffff;
1875                     if (ch > 0x7f)
1876                     {
1877                         if (ch > 0x7ff)
1878                         {
1879                             if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
1880                                 ch <= CharUnicodeInfo.LOW_SURROGATE_END)
1881                             {
1882                                 pSrc--;     // It was 4 bytes
1883                                 pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
1884                             }
1885                             else if (ch > 0xffff)
1886                             {
1887                                 pSrc--;     // It was 4 bytes, nothing was stored
1888                             }
1889                             pSrc--;         // It was at least 3 bytes
1890                         }
1891                         pSrc--;             // It was at least 2 bytes
1892                     }
1893                     pSrc--;
1894
1895                     // Throw that we don't have enough room (pSrc could be < chars if we had started to process
1896                     // a 4 byte sequence alredy)
1897                     Contract.Assert(pSrc >= bytes || pTarget == chars,
1898                         "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
1899                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
1900
1901                     // Don't store ch in decoder, we already backed up to its start
1902                     ch = 0;
1903
1904                     // Didn't throw, just use this buffer size.
1905                     break;
1906                 }
1907                 *pTarget = (char)ch;
1908                 pTarget++;
1909
1910 #if FASTLOOP
1911                 int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
1912                 int availableBytes = PtrDiff(pEnd, pSrc);
1913
1914                 // don't fall into the fast decoding loop if we don't have enough bytes
1915                 // Test for availableChars is done because pStop would be <= pTarget.
1916                 if (availableBytes <= 13) {
1917                     // we may need as many as 1 character per byte
1918                     if (availableChars < availableBytes) {
1919                         // not enough output room.  no pending bits at this point
1920                         ch = 0;
1921                         continue;
1922                     }
1923
1924                     // try to get over the remainder of the ascii characters fast though
1925                     byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
1926                     while (pSrc < pLocalEnd) {
1927                         ch = *pSrc;
1928                         pSrc++;
1929
1930                         if (ch > 0x7F)
1931                             goto ProcessChar;
1932
1933                         *pTarget = (char)ch;
1934                         pTarget++;
1935                     }
1936                     // we are done
1937                     ch = 0;
1938                     break;
1939                 }
1940
1941                 // we may need as many as 1 character per byte, so reduce the byte count if necessary.
1942                 // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
1943                 if (availableChars < availableBytes) {
1944                     availableBytes = availableChars;
1945                 }
1946
1947                 // To compute the upper bound, assume that all characters are ASCII characters at this point,
1948                 //  the boundary will be decreased for every non-ASCII character we encounter
1949                 // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
1950                 char *pStop = pTarget + availableBytes - 7;
1951
1952                 while (pTarget < pStop) {
1953                     ch = *pSrc;
1954                     pSrc++;
1955
1956                     if (ch > 0x7F) {
1957                         goto LongCode;
1958                     }
1959                     *pTarget = (char)ch;
1960                     pTarget++;
1961
1962                     // get pSrc to be 2-byte aligned
1963                     if ((unchecked((int)pSrc) & 0x1) != 0) {
1964                         ch = *pSrc;
1965                         pSrc++;
1966                         if (ch > 0x7F) {
1967                             goto LongCode;
1968                         }
1969                         *pTarget = (char)ch;
1970                         pTarget++;
1971                     }
1972
1973                     // get pSrc to be 4-byte aligned
1974                     if ((unchecked((int)pSrc) & 0x2) != 0) {
1975                         ch = *(ushort*)pSrc;
1976                         if ((ch & 0x8080) != 0) {
1977                             goto LongCodeWithMask16;
1978                         }
1979
1980                         // Unfortunately, this is endianess sensitive
1981 if (!BitConverter.IsLittleEndian) {
1982                         *pTarget = (char)((ch >> 8) & 0x7F);
1983                         pSrc += 2;
1984                         *(pTarget+1) = (char)(ch & 0x7F);
1985                         pTarget += 2;
1986 } else {
1987                         *pTarget = (char)(ch & 0x7F);
1988                         pSrc += 2;
1989                         *(pTarget+1) = (char)((ch >> 8) & 0x7F);
1990                         pTarget += 2;
1991 }
1992                     }
1993
1994                     // Run 8 characters at a time!
1995                     while (pTarget < pStop) {
1996                         ch = *(int*)pSrc;
1997                         int chb = *(int*)(pSrc+4);
1998                         if (((ch | chb) & unchecked((int)0x80808080)) != 0) {
1999                             goto LongCodeWithMask32;
2000                         }
2001
2002                         // Unfortunately, this is endianess sensitive
2003 if (!BitConverter.IsLittleEndian) {
2004                         *pTarget = (char)((ch >> 24) & 0x7F);
2005                         *(pTarget+1) = (char)((ch >> 16) & 0x7F);
2006                         *(pTarget+2) = (char)((ch >> 8) & 0x7F);
2007                         *(pTarget+3) = (char)(ch & 0x7F);
2008                         pSrc += 8;
2009                         *(pTarget+4) = (char)((chb >> 24) & 0x7F);
2010                         *(pTarget+5) = (char)((chb >> 16) & 0x7F);
2011                         *(pTarget+6) = (char)((chb >> 8) & 0x7F);
2012                         *(pTarget+7) = (char)(chb & 0x7F);
2013                         pTarget += 8;
2014 } else {
2015                         *pTarget = (char)(ch & 0x7F);
2016                         *(pTarget+1) = (char)((ch >> 8) & 0x7F);
2017                         *(pTarget+2) = (char)((ch >> 16) & 0x7F);
2018                         *(pTarget+3) = (char)((ch >> 24) & 0x7F);
2019                         pSrc += 8;
2020                         *(pTarget+4) = (char)(chb & 0x7F);
2021                         *(pTarget+5) = (char)((chb >> 8) & 0x7F);
2022                         *(pTarget+6) = (char)((chb >> 16) & 0x7F);
2023                         *(pTarget+7) = (char)((chb >> 24) & 0x7F);
2024                         pTarget += 8;
2025 }
2026                     }
2027                     break;
2028
2029                 LongCodeWithMask32:
2030 if (!BitConverter.IsLittleEndian) {
2031                     // be careful about the sign extension
2032                     ch = (int)(((uint)ch) >> 16);
2033 } else {
2034                     ch &= 0xFF;
2035 }
2036                 LongCodeWithMask16:
2037 if (!BitConverter.IsLittleEndian) {
2038                     ch = (int)(((uint)ch) >> 8);
2039 } else {
2040                     ch &= 0xFF;
2041 }
2042                     pSrc++;
2043                     if (ch <= 0x7F) {
2044                         *pTarget = (char)ch;
2045                         pTarget++;
2046                         continue;
2047                     }
2048
2049                 LongCode:
2050                     int chc = *pSrc;
2051                     pSrc++;
2052
2053                     if (
2054                         // bit 6 has to be zero
2055                         (ch & 0x40) == 0 ||
2056                         // we are expecting to see trailing bytes like 10vvvvvv
2057                         (chc & unchecked((sbyte)0xC0)) != 0x80)
2058                     {
2059                         goto BadLongCode;
2060                     }
2061
2062                     chc &= 0x3F;
2063
2064                     // start a new long code
2065                     if ((ch & 0x20) != 0) {
2066
2067                         // fold the first two bytes together
2068                         chc |= (ch & 0x0F) << 6;
2069
2070                         if ((ch & 0x10) != 0) {
2071                             // 4 byte encoding - surrogate
2072                             ch = *pSrc;
2073                             if (
2074                                 // check that bit 4 is zero, the non-shortest form of surrogate
2075                                 // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
2076                                 !InRange(chc >> 4, 0x01, 0x10) ||
2077                                 // we are expecting to see trailing bytes like 10vvvvvv
2078                                 (ch & unchecked((sbyte)0xC0)) != 0x80 )
2079                             {
2080                                 goto BadLongCode;
2081                             }
2082
2083                             chc = (chc << 6) | (ch & 0x3F);
2084
2085                             ch = *(pSrc+1);
2086                             // we are expecting to see trailing bytes like 10vvvvvv
2087                             if ((ch & unchecked((sbyte)0xC0)) != 0x80) {
2088                                 goto BadLongCode;
2089                             }
2090                             pSrc += 2;
2091
2092                             ch = (chc << 6) | (ch & 0x3F);
2093
2094                             *pTarget = (char)( ((ch >> 10) & 0x7FF) +
2095                                 unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))) );
2096                             pTarget++;
2097
2098                             ch = (ch & 0x3FF) +
2099                                 unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
2100
2101                             // extra byte, we're already planning 2 chars for 2 of these bytes,
2102                             // but the big loop is testing the target against pStop, so we need
2103                             // to subtract 2 more or we risk overrunning the input.  Subtract
2104                             // one here and one below.
2105                             pStop--;
2106                         }
2107                         else {
2108                             // 3 byte encoding
2109                             ch = *pSrc;
2110                             if (
2111                                 // check for non-shortest form of 3 byte seq
2112                                 (chc & (0x1F << 5)) == 0 ||
2113                                 // Can't have surrogates here.
2114                                 (chc & (0xF800 >> 6) ) == (0xD800 >> 6) ||
2115                                 // we are expecting to see trailing bytes like 10vvvvvv
2116                                 (ch & unchecked((sbyte)0xC0)) != 0x80 )
2117                             {
2118                                 goto BadLongCode;
2119                             }
2120                             pSrc++;
2121
2122                             ch = (chc << 6) | (ch & 0x3F);
2123
2124                             // extra byte, we're only expecting 1 char for each of these 3 bytes,
2125                             // but the loop is testing the target (not source) against pStop, so
2126                             // we need to subtract 2 more or we risk overrunning the input.
2127                             // Subtract 1 here and one more below
2128                             pStop--;
2129                         }
2130                     }
2131                     else {
2132                         // 2 byte encoding
2133
2134                         ch &= 0x1F;
2135
2136                         // check for non-shortest form
2137                         if (ch <= 1) {
2138                             goto BadLongCode;
2139                         }
2140                         ch = (ch << 6) | chc;
2141                     }
2142
2143                     *pTarget = (char)ch;
2144                     pTarget++;
2145
2146                     // extra byte, we're only expecting 1 char for each of these 2 bytes,
2147                     // but the loop is testing the target (not source) against pStop.
2148                     // subtract an extra count from pStop so that we don't overrun the input.
2149                     pStop--;
2150                 }
2151 #endif // FASTLOOP
2152
2153                 Contract.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
2154
2155                 // no pending bits at this point
2156                 ch = 0;
2157                 continue;
2158
2159             BadLongCode:
2160                 pSrc -= 2;
2161                 ch = 0;
2162                 continue;
2163             }
2164
2165             if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
2166             {
2167                 // Have to do fallback for invalid bytes
2168                 if (fallback == null)
2169                 {
2170                     if (baseDecoder == null)
2171                         fallback = this.decoderFallback.CreateFallbackBuffer();
2172                     else
2173                         fallback = baseDecoder.FallbackBuffer;
2174                     fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
2175                 }
2176
2177                 // This'll back us up the appropriate # of bytes if we didn't get anywhere
2178                 if (!FallbackInvalidByteSequence(ref pSrc, ch, fallback, ref pTarget))
2179                 {
2180                     Contract.Assert(pSrc >= bytes || pTarget == chars,
2181                         "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
2182
2183                     // Ran out of buffer space
2184                     // Need to throw an exception?
2185                     fallback.InternalReset();
2186                     ThrowCharsOverflow(baseDecoder, pTarget == chars);
2187                 }
2188                 Contract.Assert(pSrc >= bytes,
2189                     "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
2190                 ch = 0;
2191             }
2192
2193             if (baseDecoder != null)
2194             {
2195                 UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
2196
2197                 // If we're storing flush data we expect all bits to be used or else
2198                 // we're stuck in the middle of a conversion
2199                 Contract.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder.m_throwOnOverflow,
2200                     "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
2201
2202                 // Remember our leftover bits.
2203                 decoder.bits = ch;
2204
2205                 baseDecoder.m_bytesUsed = (int)(pSrc - bytes);
2206             }
2207
2208             // Shouldn't have anything in fallback buffer for GetChars
2209             // (don't have to check m_throwOnOverflow for chars)
2210             Contract.Assert(fallback == null || fallback.Remaining == 0,
2211                 "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
2212
2213             return PtrDiff(pTarget, chars);
2214         }
2215
2216         // GetChars helper: hands an invalid byte sequence to the decoder fallback.
2217         // Returns true when InternalFallback succeeds; pSrc is then left as it was.
2218         // Returns false when InternalFallback fails; pSrc is then moved back to the
2219         // first byte of the bad sequence, as worked out by GetBytesUnknown.
2220         [System.Security.SecurityCritical]  // auto-generated
2221         private unsafe bool FallbackInvalidByteSequence(
2222             ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
2223         {
2224             // GetBytesUnknown rebuilds the offending bytes from ch and rewinds the pointer
2225             // it is given, so let it work on a copy instead of on pSrc itself.
2226             byte* pSequenceStart = pSrc;
2227             byte[] invalidBytes = GetBytesUnknown(ref pSequenceStart, ch);
2228
2229             if (fallback.InternalFallback(invalidBytes, pSrc, ref pTarget))
2230             {
2231                 return true;
2232             }
2233
2234             // The fallback did not go through: report the start of the sequence to the caller.
2235             pSrc = pSequenceStart;
2236             return false;
2237         }
2238
2239         // GetCharCount helper: runs the decoder fallback for an invalid byte sequence and
2240         // returns the value InternalFallback gives back for it.
2241         // pSrc is taken by value, so the caller's pointer is not changed; only the local copy
2242         // is rewound by GetBytesUnknown before it is handed to the fallback.
2243         // ch is the state GetBytesUnknown rebuilds the invalid bytes from.
2244         [System.Security.SecurityCritical]  // auto-generated
2245         private unsafe int FallbackInvalidByteSequence(
2246             byte* pSrc, int ch, DecoderFallbackBuffer fallback)
2247         {
2248             // Rebuild the bytes that could not be decoded; this also steps the local
2249             // pSrc back by as many bytes as were rebuilt.
2250             byte[] invalidBytes = GetBytesUnknown(ref pSrc, ch);
2251
2252             // # of fallback chars expected.  Only "long" sequences get here, and the count
2253             // that was prereserved for the input bytes has already been unreserved, so the
2254             // fallback's answer is returned without further adjustment.
2255             return fallback.InternalFallback(invalidBytes, pSrc);
2256         }
2257
2258         // Rebuilds the bytes of an invalid or unfinished sequence from the decoder state in ch,
2259         // and moves pSrc back by the number of bytes rebuilt (1, 2 or 3).
2260         // Returns the rebuilt bytes in stream order, lead byte first.
2261         // The bytes are regenerated from ch instead of being re-read behind the pointer
2262         // because some of them may have come from a previous fallback; in that case
2263         // stepping the pointer back and reading would not give the original values.
2264         [System.Security.SecurityCritical]  // auto-generated
2265         private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
2266         {
2267             // A plain single byte.  The >= 0 test matters because ch can also carry
2268             // all sorts of bit flags.
2269             if (ch >= 0 && ch < 0x100)
2270             {
2271                 pSrc--;
2272                 return new byte[] { unchecked((byte)ch) };
2273             }
2274
2275             // Which sequence-length markers does the state carry?
2276             bool hasSupplementaryFlag = (ch & SupplimentarySeq) != 0;
2277             bool hasThreeByteFlag = (ch & ThreeByteSeq) != 0;
2278
2279             // Neither marker: an unfinished 2 byte sequence, rebuilt as its lead byte only.
2280             if (!hasSupplementaryFlag && !hasThreeByteFlag)
2281             {
2282                 pSrc--;
2283                 return new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
2284             }
2285
2286             // From here on the tests against FinalByte >> 6 and FinalByte >> 12 select
2287             // how many bytes of the sequence get rebuilt from ch.
2288             // 4 byte sequences are checked first.
2289             if (hasSupplementaryFlag)
2290             {
2291                 // Three bytes of a 4 byte sequence: lead byte plus two trail bytes.
2292                 if ((ch & (FinalByte >> 6)) != 0)
2293                 {
2294                     pSrc -= 3;
2295                     return new byte[] {
2296                         unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
2297                         unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
2298                         unchecked((byte)((ch & 0x3F) | 0x80)) };
2299                 }
2300
2301                 // Two bytes of a 4 byte sequence: lead byte plus one trail byte.
2302                 if ((ch & (FinalByte >> 12)) != 0)
2303                 {
2304                     pSrc -= 2;
2305                     return new byte[] {
2306                         unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
2307                         unchecked((byte)((ch & 0x3F) | 0x80)) };
2308                 }
2309
2310                 // Only the lead byte of a 4 byte sequence: a single byte is rebuilt
2311                 // and pSrc steps back by one.
2312                 pSrc--;
2313                 return new byte[] { unchecked((byte)((ch & 0x07) | 0xF0)) };
2314             }
2315
2316             // Otherwise this is a 3 byte sequence.
2317             // Two bytes of it: lead byte plus one trail byte.
2318             if ((ch & (FinalByte >> 6)) != 0)
2319             {
2320                 pSrc -= 2;
2321                 return new byte[] {
2322                     unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)),
2323                     unchecked((byte)((ch & 0x3F) | 0x80)) };
2324             }
2325
2326             // Only the lead byte of a 3 byte sequence.
2327             pSrc--;
2328             return new byte[] { unchecked((byte)((ch & 0x0F) | 0xE0)) };
2329         }
2330
2331
2332         public override Decoder GetDecoder()
2333         {
2334             return new UTF8Decoder(this);   // a new decoder per call, built over this encoding
2335         }
2336
2337         public override Encoder GetEncoder()
2338         {
2339             return new UTF8Encoder(this);   // a new encoder per call, built over this encoding
2340         }
2341
2342         // Upper bound on the number of bytes needed to encode charCount chars.  Throws
2343         // ArgumentOutOfRangeException when charCount is negative or the bound does not fit in an Int32.
2344         public override int GetMaxByteCount(int charCount)
2345         {
2346             if (charCount < 0)
2347                 throw new ArgumentOutOfRangeException("charCount", Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
2348             Contract.EndContractBlock();
2349
2350             // One extra char in case of a left over high surrogate, then scaled by the max fallback size.
2351             long byteCount = (long)charCount + 1;
2352             if (EncoderFallback.MaxCharCount > 1)
2353                 byteCount *= EncoderFallback.MaxCharCount;
2354
2355             // Max 3 bytes per char (4 bytes per 2 chars for surrogates).  Multiply only while the count fits in an
2356             // Int32: with a huge MaxCharCount the tripled value could pass Int64.MaxValue, wrap negative and slip by the check below.
2357             if (byteCount <= 0x7fffffff)
2358                 byteCount *= 3;
2359             if (byteCount > 0x7fffffff)
2360                 throw new ArgumentOutOfRangeException("charCount", Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow"));
2361             return (int)byteCount;
2362         }
2363
2364
2365         // Upper bound on the number of chars that decoding byteCount bytes can produce.  Throws
2366         // ArgumentOutOfRangeException when byteCount is negative or the bound does not fit in an Int32.
2367         public override int GetMaxCharCount(int byteCount)
2368         {
2369             if (byteCount < 0)
2370             {
2371                 throw new ArgumentOutOfRangeException("byteCount", Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
2372             }
2373             Contract.EndContractBlock();
2374
2375             // 1 char per input byte, + 1 char if 1st byte is last byte of 4 byte surrogate pair
2376             long upperBound = 1L + byteCount;
2377
2378             // Non-shortest forms fall back, and so does 11... followed by 11..., so in the worst
2379             // case every single byte falls back: scale by the fallback's max char count.
2380             if (DecoderFallback.MaxCharCount > 1)
2381                 upperBound = upperBound * DecoderFallback.MaxCharCount;
2382
2383             if (upperBound <= 0x7fffffff)
2384                 return (int)upperBound;
2385             throw new ArgumentOutOfRangeException("byteCount", Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow"));
2386         }
2387
2388
2389         // Byte order mark for this encoding: EF BB BF when emitUTF8Identifier is set,
2390         // otherwise EmptyArray<Byte>.Value.
2391         public override byte[] GetPreamble()
2392         {
2393             if (!emitUTF8Identifier)
2394                 return EmptyArray<Byte>.Value;
2395             // Build a new array on every call so that callers cannot modify a shared one.
2396             return new byte[] { 0xEF, 0xBB, 0xBF };
2397         }
2398
2399
2400         // True only for another UTF8Encoding with the same emitUTF8Identifier setting and equal fallbacks.
2401         public override bool Equals(Object value)
2402         {
2403             UTF8Encoding other = value as UTF8Encoding;
2404             if (other == null)
2405                 return false;
2406             return emitUTF8Identifier == other.emitUTF8Identifier
2407                 && EncoderFallback.Equals(other.EncoderFallback)
2408                 && DecoderFallback.Equals(other.DecoderFallback);
2409         }
2410
2411
2412         public override int GetHashCode() {
2413             // Plain sum of the parts: weak distribution, but this is unlikely to be a hashtable key.
2414             int fallbackHash = EncoderFallback.GetHashCode() + DecoderFallback.GetHashCode();
2415             return fallbackHash + UTF8_CODEPAGE + (emitUTF8Identifier ? 1 : 0);
2416         }
2417
2418         // Encoder returned by GetEncoder(); it carries a pending high surrogate between calls.
2419         [Serializable]
2420         internal class UTF8Encoder : EncoderNLS, ISerializable
2421         {
2422             // High surrogate kept until the next call, which is expected to supply the
2423             // low surrogate.  Zero means nothing is pending (see Reset and HasState).
2424             internal int surrogateChar;
2425
2426             public UTF8Encoder(UTF8Encoding encoding) : base(encoding)
2427             {
2428                 // base calls reset
2429             }
2430
2431             // Deserialization constructor; also has to cope with data written by Everett.
2432             internal UTF8Encoder(SerializationInfo info, StreamingContext context)
2433             {
2434                 if (info == null)
2435                     throw new ArgumentNullException("info");
2436                 Contract.EndContractBlock();
2437
2438                 m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));
2439
2440                 // surrogateChar happens to mean the same thing in the serialized data
2441                 surrogateChar = (int)info.GetValue("surrogateChar", typeof(int));
2442
2443                 // A SerializationException while reading the fallback is treated as "no fallback".
2444                 try
2445                 {
2446                     m_fallback = (EncoderFallback)info.GetValue("m_fallback", typeof(EncoderFallback));
2447                 }
2448                 catch (SerializationException)
2449                 {
2450                     m_fallback = null;
2451                 }
2452             }
2453
2454 #if FEATURE_SERIALIZATION
2455             // ISerializable implementation: writes this encoder's state into info.
2456             [System.Security.SecurityCritical]  // auto-generated_required
2457             void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
2458             {
2459                 if (info == null)
2460                     throw new ArgumentNullException("info");
2461                 Contract.EndContractBlock();
2462
2463                 // Whidbey data
2464                 info.AddValue("encoding", m_encoding);
2465                 info.AddValue("surrogateChar", surrogateChar);
2466                 info.AddValue("m_fallback", m_fallback);
2467
2468                 // Extra entries for Everett that Whidbey doesn't use.  Everett doesn't actually
2469                 // use mustFlush either, but it accidentally serialized it, so it is still written.
2470                 bool hasStoredSurrogate = surrogateChar > 0;
2471                 info.AddValue("storedSurrogate", hasStoredSurrogate);
2472                 info.AddValue("mustFlush", false);
2473             }
2474 #endif
2475
2476             // Drops any pending surrogate and resets the fallback buffer if one exists.
2477             public override void Reset()
2478             {
2479                 surrogateChar = 0;
2480                 if (m_fallbackBuffer != null)
2481                 {
2482                     m_fallbackBuffer.Reset();
2483                 }
2484             }
2485
2486             // True while a surrogate is still held in surrogateChar.
2487             internal override bool HasState
2488             {
2489                 get { return surrogateChar != 0; }
2490             }
2491         }
2492
2493         // Decoder returned by GetDecoder(); it carries its state (bits) from one call to the next.
2494         [Serializable]
2495         internal class UTF8Decoder : DecoderNLS, ISerializable
2496         {
2497             // Information remembered from the previous call; see the comments around the
2498             // definition of FinalByte for details.  Zero means no state (see HasState).
2499             internal int bits;
2500
2501             public UTF8Decoder(UTF8Encoding encoding) : base(encoding)
2502             {
2503                 // base calls reset
2504             }
2505
2506             // Deserialization constructor; also has to cope with data written by Everett.
2507             internal UTF8Decoder(SerializationInfo info, StreamingContext context)
2508             {
2509                 if (info == null)
2510                     throw new ArgumentNullException("info");
2511                 Contract.EndContractBlock();
2512
2513                 m_encoding = (Encoding)info.GetValue("encoding", typeof(Encoding));
2514
2515                 try
2516                 {
2517                     // Whidbey stores the state under the name "wbits".
2518                     bits = (int)info.GetValue("wbits", typeof(int));
2519                     m_fallback = (DecoderFallback)info.GetValue("m_fallback", typeof(DecoderFallback));
2520                 }
2521                 catch (SerializationException)
2522                 {
2523                     // Everett calls bits bits instead of wbits, so this is Everett data:
2524                     // start with no remembered bits and no fallback.
2525                     bits = 0;
2526                     m_fallback = null;
2527                 }
2528             }
2529
2530 #if FEATURE_SERIALIZATION
2531             // ISerializable implementation: writes this decoder's state into info.
2532             [System.Security.SecurityCritical]  // auto-generated_required
2533             void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
2534             {
2535                 if (info == null)
2536                     throw new ArgumentNullException("info");
2537                 Contract.EndContractBlock();
2538
2539                 // New Whidbey data; the state goes under the special Whidbey name "wbits".
2540                 info.AddValue("encoding", m_encoding);
2541                 info.AddValue("wbits", bits);
2542                 info.AddValue("m_fallback", m_fallback);
2543
2544                 // Everett has extra entries; they are all written as 0 / false in case
2545                 // this gets deserialized in Everett.
2546                 info.AddValue("bits", 0);
2547                 info.AddValue("trailCount", 0);
2548                 info.AddValue("isSurrogate", false);
2549                 info.AddValue("byteSequence", 0);
2550             }
2551 #endif
2552
2553             // Clears the remembered bits and resets the fallback buffer if one exists.
2554             public override void Reset()
2555             {
2556                 bits = 0;
2557                 if (m_fallbackBuffer != null)
2558                     m_fallbackBuffer.Reset();
2559             }
2560
2561             // True while bits from an earlier call are still held.
2562             internal override bool HasState
2563             {
2564                 get { return bits != 0; }
2565             }
2566         }
2567     }
2568 }