Merge pull request #1304 from slluis/mac-proxy-autoconfig
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30 using System.Runtime.InteropServices;
31
32 [Serializable]
33 [MonoLimitation ("Serialization format not compatible with .NET")]
34 [ComVisible (true)]
35 public class UTF8Encoding : Encoding
36 {
// Code page number passed to the base Encoding constructor for UTF-8.
internal const int UTF8_CODE_PAGE = 65001;

// Holds the "encoderShouldEmitUTF8Identifier" value given to the constructor.
private bool emitIdentifier;

// Creates an encoding that neither emits the UTF-8 identifier nor
// throws on invalid data (both flags are false).
public UTF8Encoding ()
	: this (false, false)
{
}

// Creates an encoding with the requested identifier setting; the
// throw-on-invalid flag is false.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
	: this (encoderShouldEmitUTF8Identifier, false)
{
}

// Main constructor: records the identifier flag, selects the encoder and
// decoder fallbacks according to "throwOnInvalidBytes" and fills in the
// descriptive fields inherited from Encoding.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
	: base (UTF8_CODE_PAGE)
{
	emitIdentifier = encoderShouldEmitUTF8Identifier;

	// Exception fallbacks when the caller wants invalid data to throw,
	// the "standard safe" fallbacks otherwise.
	if (!throwOnInvalidBytes) {
		SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
	} else {
		SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
	}

	// All three protocol-facing names share the same value.
	header_name = "utf-8";
	body_name = header_name;
	web_name = body_name;
	encoding_name = "Unicode (UTF-8)";

	is_browser_display = true;
	is_browser_save = true;
	is_mail_news_display = true;
	is_mail_news_save = true;

	windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
65
66         #region GetByteCount()
67
// Array front end of the rolling-state byte counter.  Checks the
// arguments, handles the case where "index" is at the end of the array
// and otherwise pins the array and defers to the pointer overload.
//
// "leftOver" carries a pending surrogate start between calls; when
// "flush" is true and nothing remains to scan, a pending surrogate is
// counted as 3 bytes and cleared.
//
// Throws ArgumentNullException when "chars" is null and
// ArgumentOutOfRangeException when "index"/"count" fall outside the array.
private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");

	int available = chars.Length;
	if (index < 0 || index > available)
		throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
	if (count < 0 || count > available - index)
		throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

	if (index != available) {
		unsafe {
			fixed (char* first = chars) {
				return InternalGetByteCount (first + index, count, fallback, ref leftOver, flush);
			}
		}
	}

	// Nothing left to scan: only a pending surrogate start can
	// still contribute, and only when flushing.
	if (!flush || leftOver == '\0')
		return 0;

	leftOver = '\0';
	return 3;
}
98
// Pointer-based byte counter with rolling surrogate state.
//
// Walks "count" characters starting at "chars" and adds 1, 2 or 3 bytes
// per non-surrogate character and 4 bytes per complete surrogate pair.
// A surrogate tail without a leading surrogate, and a pending surrogate
// start that is not followed by a tail, are counted through the chars
// produced by "fallback".  A surrogate start at the very end of the
// input is stored in "leftOver"; when "flush" is true it is counted as
// 3 bytes and cleared.  Returns the accumulated byte count.
private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
{
	int total = 0;
	char* origin = chars;
	char* limit = chars + count;
	EncoderFallbackBuffer fbBuffer = null;

	while (chars < limit) {
		if (leftOver != 0) {
			// A surrogate start is pending from earlier input.
			char current = *chars;
			if (current >= '\uDC00' && current <= '\uDFFF') {
				// The pair is complete.
				total += 4;
				chars++;
			} else {
				// No tail follows: count the fallback output and
				// look at the current character again next round.
				char [] substitute = GetFallbackChars (chars, origin, fallback, ref fbBuffer);
				fixed (char *sub = substitute) {
					char ignored = '\0';
					total += InternalGetByteCount (sub, substitute.Length, fallback, ref ignored, true);
				}
			}
			leftOver = '\0';
			continue;
		}

		while (chars < limit) {
			char c = *chars;
			if (c < '\x80') {
				total++;
			} else if (c < '\x800') {
				total += 2;
			} else if (c < '\uD800' || c > '\uDFFF') {
				total += 3;
			} else if (c > '\uDBFF') {
				// Surrogate tail without a leading surrogate.
				char [] substitute = GetFallbackChars (chars, origin, fallback, ref fbBuffer);
				fixed (char *sub = substitute) {
					char ignored = '\0';
					total += InternalGetByteCount (sub, substitute.Length, fallback, ref ignored, true);
				}

				leftOver = '\0';
			} else {
				// Surrogate start: stay in this loop only when the
				// tail is available right behind it.
				bool paired = chars + 1 < limit && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF';
				if (!paired) {
					leftOver = c;
					chars++;
					break;
				}
				total += 4;
				// Skip the tail here; the shared increment below
				// skips the start.
				chars++;
			}
			chars++;
		}
	}

	// Flush the left-over surrogate pair start.
	if (flush && leftOver != '\0') {
		total += 3;
		leftOver = '\0';
	}
	return total;
}
166
// Runs the encoder fallback for the character at "chars" and returns
// everything the fallback buffer produced.
//
// "buffer" is created from "fallback" on first use and handed back to
// the caller through the ref parameter.  The index reported to the
// fallback is the distance of "chars" from "start".  The buffer is reset
// before returning.
unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
{
	if (null == buffer) {
		buffer = fallback.CreateFallbackBuffer ();
	}

	int offset = (int) (chars - start);
	buffer.Fallback (*chars, offset);

	// Drain exactly as many chars as the buffer announced.
	int pending = buffer.Remaining;
	char [] result = new char [pending];
	int filled = 0;
	while (filled < result.Length) {
		result [filled] = buffer.GetNextChar ();
		filled++;
	}

	buffer.Reset ();
	return result;
}
182
// Returns the number of bytes needed to encode "count" characters of
// "chars" starting at "index".  The count is taken with flushing
// enabled and no pending surrogate, using this encoding's EncoderFallback.
public override int GetByteCount (char[] chars, int index, int count)
{
	char pending = '\0';
	int needed = InternalGetByteCount (chars, index, count, EncoderFallback, ref pending, true);
	return needed;
}
189
190
// Returns the number of bytes needed to encode "count" characters
// starting at the pointer "chars", with flushing enabled and no pending
// surrogate, using this encoding's EncoderFallback.
//
// Throws ArgumentNullException when "chars" is null and
// ArgumentOutOfRangeException when "count" is negative.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetByteCount (char* chars, int count)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");
	// A negative count used to fall through to the counting loop, which
	// then scanned nothing and reported 0; reject it explicitly instead.
	if (count < 0)
		throw new ArgumentOutOfRangeException ("count");
	if (count == 0)
		return 0;
	char dummy = '\0';
	return InternalGetByteCount (chars, count, EncoderFallback, ref dummy, true);
}
202
203         #endregion
204
205         #region GetBytes()
206
// Array front end of the rolling-state encoder.  Checks the arguments,
// pins both arrays and defers to the pointer overload, which returns
// the number of bytes written.
//
// When "charIndex" is at the end of "chars" nothing is written and 0 is
// returned; with "flush" set, a pending surrogate start in "leftOver" is
// simply discarded in that case.
//
// Throws ArgumentNullException for a null array and
// ArgumentOutOfRangeException for an index or count outside its array.
private static int InternalGetBytes (char[] chars, int charIndex,
				     int charCount, byte[] bytes,
				     int byteIndex,
				     EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
				     ref char leftOver, bool flush)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");
	if (bytes == null)
		throw new ArgumentNullException ("bytes");

	int sourceLength = chars.Length;
	int targetLength = bytes.Length;
	if (charIndex < 0 || charIndex > sourceLength)
		throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
	if (charCount < 0 || charCount > sourceLength - charIndex)
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
	if (byteIndex < 0 || byteIndex > targetLength)
		throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

	if (charIndex == sourceLength) {
		// FIXME: use EncoderFallback.
		//
		// By default it is empty, so the pending surrogate is
		// dropped without producing output for now.
		if (flush)
			leftOver = '\0';
		return 0;
	}

	unsafe {
		fixed (char* source = chars) {
			char* first = source + charIndex;

			// No room at all: hand over a null output buffer of size 0.
			if (targetLength == byteIndex)
				return InternalGetBytes (first, charCount, null, 0, fallback, ref buffer, ref leftOver, flush);

			fixed (byte* target = bytes) {
				return InternalGetBytes (first, charCount, target + byteIndex, targetLength - byteIndex, fallback, ref buffer, ref leftOver, flush);
			}
		}
	}
}
258
// Pointer-based UTF-8 encoder with rolling surrogate state.
//
// Encodes "count" characters starting at "chars" into the "bcount" bytes
// starting at "bytes" and returns the number of bytes written.
//
//   chars/count   input characters
//   bytes/bcount  output buffer and its capacity
//   fallback      supplies replacement chars for unpaired surrogates
//   buffer        fallback buffer, created on demand by GetFallbackChars
//   leftOver      surrogate start pending from earlier input; updated
//                 when the input ends on a surrogate start
//   flush         when true, a surrogate start still pending at the end
//                 is written out as a three-byte sequence and cleared
//
// Throws ArgumentException ("Insufficient Space") when the output does
// not fit into the remaining bytes.
private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
{
	char* end = chars + count;
	char* start = chars;
	byte* start_bytes = bytes;
	byte* end_bytes = bytes + bcount;
	while (chars < end) {
		if (leftOver == 0) {
			for (; chars < end; chars++) {
				int ch = *chars;
				if (ch < '\x80') {
					// One byte.
					if (bytes >= end_bytes)
						goto fail_no_space;
					*bytes++ = (byte)ch;
				} else if (ch < '\x800') {
					// Two bytes.
					if (bytes + 1 >= end_bytes)
						goto fail_no_space;
					bytes [0] = (byte) (0xC0 | (ch >> 6));
					bytes [1] = (byte) (0x80 | (ch & 0x3F));
					bytes += 2;
				} else if (ch < '\uD800' || ch > '\uDFFF') {
					// Three bytes.
					if (bytes + 2 >= end_bytes)
						goto fail_no_space;
					bytes [0] = (byte) (0xE0 | (ch >> 12));
					bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
					bytes [2] = (byte) (0x80 | (ch & 0x3F));
					bytes += 3;
				} else if (ch <= '\uDBFF') {
					// This is a surrogate start char: remember it and
					// let the outer loop look at what follows.
					leftOver = *chars;
					chars++;
					break;
				} else {
					// We have a surrogate tail without
					// leading surrogate: encode the fallback chars.
					char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
					char dummy = '\0';
					if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
						goto fail_no_space;
					fixed (char *fb_chars = fallback_chars) {
						bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
					}

					leftOver = '\0';
				}
			}
		} else {
			if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
				// We have a correct surrogate pair: combine both
				// halves into one code point and write four bytes.
				int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
				if (bytes + 3 >= end_bytes)
					goto fail_no_space;
				bytes [0] = (byte) (0xF0 | (ch >> 18));
				bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
				bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
				bytes [3] = (byte) (0x80 | (ch & 0x3F));
				bytes += 4;
				chars++;
			} else {
				// We have a surrogate start followed by a
				// regular character.  Technically, this is
				// invalid, but we have to do something.
				// We write out the fallback chars and then
				// re-visit the current character again.
				//
				// NOTE(review): GetFallbackChars reads *chars, so the
				// fallback is told about the following character
				// rather than the pending surrogate start - confirm
				// whether that is intended.
				char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
				char dummy = '\0';
				if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
					goto fail_no_space;
				fixed (char *fb_chars = fallback_chars) {
					// Advance past the fallback bytes.  Without this the
					// next character overwrote them and the returned
					// length disagreed with InternalGetByteCount.
					bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
				}
			}
			leftOver = '\0';
		}
	}
	if (flush) {
		// Flush the left-over surrogate pair start.
		if (leftOver != '\0') {
			int ch = leftOver;
			if (bytes + 2 < end_bytes) {
				bytes [0] = (byte) (0xE0 | (ch >> 12));
				bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
				bytes [2] = (byte) (0x80 | (ch & 0x3F));
				bytes += 3;
			} else {
				goto fail_no_space;
			}
			leftOver = '\0';
		}
	}
	return (int) (bytes - start_bytes);
fail_no_space:
	throw new ArgumentException ("Insufficient Space", "bytes");
}
355
// Encodes "charCount" characters of "chars" starting at "charIndex" into
// "bytes" starting at "byteIndex" and returns the number of bytes
// written.  Encoding starts with no pending surrogate and no fallback
// buffer, and is performed with flushing enabled.
public override int GetBytes (char[] chars, int charIndex, int charCount,
			      byte[] bytes, int byteIndex)
{
	EncoderFallbackBuffer fbBuffer = null;
	char pending = '\0';
	int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, EncoderFallback, ref fbBuffer, ref pending, true);
	return written;
}
364
// String variant of "GetBytes": encodes "charCount" characters of "s"
// starting at "charIndex" into "bytes" starting at "byteIndex" and
// returns the number of bytes written (0 when "charIndex" is at the end
// of the string).
//
// Throws ArgumentNullException when "s" or "bytes" is null and
// ArgumentOutOfRangeException when an index or count is out of range.
public override int GetBytes (String s, int charIndex, int charCount,
			      byte[] bytes, int byteIndex)
{
	if (s == null)
		throw new ArgumentNullException ("s");
	if (bytes == null)
		throw new ArgumentNullException ("bytes");

	int textLength = s.Length;
	if (charIndex < 0 || charIndex > textLength)
		throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
	if (charCount < 0 || charCount > textLength - charIndex)
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
	if (byteIndex < 0 || byteIndex > bytes.Length)
		throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

	if (charIndex == textLength)
		return 0;

	unsafe {
		fixed (char* text = s) {
			EncoderFallbackBuffer fbBuffer = null;
			char pending = '\0';
			char* first = text + charIndex;
			int room = bytes.Length - byteIndex;

			// No room at all: hand over a null output buffer of size 0.
			if (room == 0)
				return InternalGetBytes (first, charCount, null, 0, EncoderFallback, ref fbBuffer, ref pending, true);

			fixed (byte* target = bytes) {
				return InternalGetBytes (first, charCount, target + byteIndex, room, EncoderFallback, ref fbBuffer, ref pending, true);
			}
		}
	}
}
407
// Pointer variant of "GetBytes": encodes "charCount" characters starting
// at "chars" into the "byteCount" bytes starting at "bytes" and returns
// the number of bytes written (0 when "charCount" is 0).  Encoding is
// performed with flushing enabled and no pending surrogate.
//
// Throws ArgumentNullException when "chars" or "bytes" is null and
// IndexOutOfRangeException when "charCount" or "byteCount" is negative.
// NOTE(review): the .NET documentation lists ArgumentOutOfRangeException
// for negative counts; the existing exception type is kept here so that
// current callers are not affected.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");
	if (charCount < 0)
		throw new IndexOutOfRangeException ("charCount");
	if (bytes == null)
		throw new ArgumentNullException ("bytes");
	// This check used to report "charCount", pointing at the wrong argument.
	if (byteCount < 0)
		throw new IndexOutOfRangeException ("byteCount");

	if (charCount == 0)
		return 0;

	char dummy = '\0';
	EncoderFallbackBuffer buffer = null;
	// With no room at all, pass a null output buffer of size 0.
	if (byteCount == 0)
		return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
	else
		return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
}
431
432         #endregion
433
// Array front end of the rolling-state character counter.  Checks the
// arguments, returns 0 for an empty range and otherwise pins the array
// and defers to the pointer overload.
//
// Throws ArgumentNullException when "bytes" is null and
// ArgumentOutOfRangeException when "index"/"count" fall outside the array.
private unsafe static int InternalGetCharCount (
	byte[] bytes, int index, int count, uint leftOverBits,
	uint leftOverCount, object provider,
	ref DecoderFallbackBuffer fallbackBuffer, bool flush)
{
	if (bytes == null)
		throw new ArgumentNullException ("bytes");

	int available = bytes.Length;
	if (index < 0 || index > available)
		throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
	if (count < 0 || count > available - index)
		throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

	if (count != 0) {
		fixed (byte* first = bytes) {
			return InternalGetCharCount (first + index, count,
				leftOverBits, leftOverCount, provider, ref fallbackBuffer, flush);
		}
	}
	return 0;
}
458
// Pointer-based character counter with rolling decoder state.
//
// Scans "byteCount" bytes starting at "bytes" and returns the number of
// chars they decode to.  "leftOverBits" holds the bits gathered so far
// for an unfinished sequence; "leftOverCount" packs the number of bytes
// seen so far (low nibble) and the expected sequence size (next nibble).
// Sequences that are rejected contribute whatever the Fallback helper
// returns for them.  When "flush" is true, a sequence still unfinished
// at the end of the input is also passed to the Fallback helper.
private unsafe static int InternalGetCharCount (
	byte* bytes, int byteCount, uint leftOverBits,
	uint leftOverCount, object provider,
	ref DecoderFallbackBuffer fallbackBuffer, bool flush)
{
	int pos = 0;
	int stop = byteCount;
	int total = 0;

	// Fast path: with no sequence pending, leading single-byte
	// characters count one char each.
	if (leftOverCount == 0) {
		while (pos < stop && bytes [pos] < 0x80) {
			total++;
			pos++;
		}
	}

	uint bits = leftOverBits;
	uint seen = (leftOverCount & (uint)0x0F);
	uint expected = ((leftOverCount >> 4) & (uint)0x0F);

	for (; pos < stop; pos++) {
		uint b = (uint)(bytes [pos]);

		if (expected == 0) {
			// Start of a character.
			if (b < (uint)0x0080) {
				// Single-byte UTF-8 character.
				++total;
			} else if ((b & (uint)0xE0) == (uint)0xC0) {
				// Double-byte UTF-8 character.
				bits = (b & (uint)0x1F);
				seen = 1;
				expected = 2;
			} else if ((b & (uint)0xF0) == (uint)0xE0) {
				// Three-byte UTF-8 character.
				bits = (b & (uint)0x0F);
				seen = 1;
				expected = 3;
			} else if ((b & (uint)0xF8) == (uint)0xF0) {
				// Four-byte UTF-8 character.
				bits = (b & (uint)0x07);
				seen = 1;
				expected = 4;
			} else if ((b & (uint)0xFC) == (uint)0xF8) {
				// Five-byte UTF-8 character.
				bits = (b & (uint)0x03);
				seen = 1;
				expected = 5;
			} else if ((b & (uint)0xFE) == (uint)0xFC) {
				// Six-byte UTF-8 character.
				bits = (b & (uint)0x03);
				seen = 1;
				expected = 6;
			} else {
				// Invalid UTF-8 start character.
				total += Fallback (provider, ref fallbackBuffer, bytes, pos, 1);
			}
			continue;
		}

		if ((b & (uint)0xC0) != (uint)0x80) {
			// Invalid UTF-8 sequence: clear and restart, looking at
			// the same byte again as a start byte.
			total += Fallback (provider, ref fallbackBuffer, bytes, pos - seen, seen);
			expected = 0;
			--pos;
			continue;
		}

		// Extra byte of a multi-byte sequence.
		bits = ((bits << 6) | (b & (uint)0x3F));
		if (++seen < expected)
			continue;

		// We have a complete character now.
		if (bits < (uint)0x10000) {
			// Is it an overlong form for its sequence size?
			bool overlong;
			if (expected == 2)
				overlong = (bits <= 0x7F);
			else if (expected == 3)
				overlong = (bits <= 0x07FF);
			else if (expected == 4)
				overlong = (bits <= 0xFFFF);
			else if (expected == 5)
				overlong = (bits <= 0x1FFFFF);
			else if (expected == 6)
				overlong = (bits <= 0x03FFFFFF);
			else
				overlong = false;

			if (overlong) {
				total += Fallback (provider, ref fallbackBuffer, bytes, pos - seen, seen);
				--pos; // process byte again
			} else if ((bits & 0xF800) == 0xD800) {
				// UTF-8 doesn't use surrogate characters.
				total += Fallback (provider, ref fallbackBuffer, bytes, pos - seen, seen);
			} else {
				++total;
			}
		} else if (bits < (uint)0x110000) {
			// Counted as two chars.
			total += 2;
		} else {
			total += Fallback (provider, ref fallbackBuffer, bytes, pos - seen, seen);
		}
		expected = 0;
	}

	if (flush && expected != 0) {
		// We had left-over bytes that didn't make up
		// a complete UTF-8 character sequence.
		total += Fallback (provider, ref fallbackBuffer, bytes, pos - seen, seen);
	}

	// Return the final length to the caller.
	return total;
}
582
583         // for GetCharCount()
// Counting variant of the fallback helper: reports a run of undecodable
// bytes to the decoder fallback and returns how many replacement characters
// the fallback has pending for that run. The characters themselves are
// discarded.
//   provider - a DecoderFallback, or otherwise an object that is cast to
//              Decoder; it is only consulted while `buffer` is still null.
//   buffer   - fallback buffer; obtained from `provider` on first use and
//              handed back to the caller through the ref parameter.
//   bytes    - base pointer of the input being decoded.
//   index    - offset from `bytes` of the first byte to report
//              (narrowed to int before use).
//   size     - number of bytes to report.
static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, byte* bytes, long index, uint size)
{
        if (buffer == null) {
                var fallback = provider as DecoderFallback;
                buffer = (fallback != null)
                        ? fallback.CreateFallbackBuffer ()
                        : ((Decoder) provider).FallbackBuffer;
        }

        // The fallback API takes a managed array, so copy the offending bytes out.
        var unknown = new byte [size];
        int start = (int) index;
        int copied = 0;
        while (copied < size) {
                unknown [copied] = bytes [start + copied];
                copied++;
        }

        buffer.Fallback (unknown, 0);
        int produced = buffer.Remaining;

        // Only the count is needed here; drop the pending replacement characters.
        buffer.Reset ();
        return produced;
}
605
606         // for GetChars()
// Writing variant of the fallback helper: reports a run of undecodable bytes
// to the decoder fallback and appends every replacement character it yields
// to `chars`, advancing `charIndex` past them.
//   provider  - a DecoderFallback, or otherwise an object that is cast to
//               Decoder; it is only consulted while `buffer` is still null.
//   buffer    - fallback buffer; obtained from `provider` on first use.
//   bytes     - base pointer of the input being decoded.
//   byteIndex - offset from `bytes` of the first byte to report.
//   size      - number of bytes to report.
//   chars     - output buffer; this helper is not told its capacity, so the
//               writes below are not bounds-checked here.
//   charIndex - next free position in `chars`; updated in place.
static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, byte* bytes, long byteIndex, uint size,
        char* chars, ref int charIndex)
{
        if (buffer == null) {
                var fallback = provider as DecoderFallback;
                buffer = (fallback != null)
                        ? fallback.CreateFallbackBuffer ()
                        : ((Decoder) provider).FallbackBuffer;
        }

        // The fallback API takes a managed array, so copy the offending bytes out.
        var unknown = new byte [size];
        byte* source = bytes + byteIndex;
        int copied = 0;
        while (copied < size) {
                unknown [copied] = source [copied];
                copied++;
        }

        buffer.Fallback (unknown, 0);

        // Drain whatever the fallback produced into the output.
        for (; buffer.Remaining > 0; )
                chars [charIndex++] = buffer.GetNextChar ();

        buffer.Reset ();
}
628
629         // Get the number of characters needed to decode a byte buffer.
// One-shot count over a managed array: starts with no pending partial
// sequence (both left-over arguments are 0), uses this encoding's
// DecoderFallback as the fallback provider, and asks for a final flush.
public override int GetCharCount (byte[] bytes, int index, int count)
{
        DecoderFallbackBuffer fallbackBuffer = null;
        int total = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, true);
        return total;
}
635
// Pointer-based one-shot count: starts with no pending partial sequence
// (both left-over arguments are 0), uses this encoding's DecoderFallback as
// the fallback provider, and asks for a final flush.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetCharCount (byte* bytes, int count)
{
        DecoderFallbackBuffer fallbackBuffer = null;
        int total = InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref fallbackBuffer, true);
        return total;
}
643
644         // Get the characters that result from decoding a byte buffer.
// Array-based front end for the pointer-based InternalGetChars overload.
// Validates the array arguments, pins the arrays, and forwards the decoding
// state (leftOverBits / leftOverCount), fallback provider, fallback buffer
// and flush flag unchanged.
//
// Throws ArgumentNullException when bytes or chars is null, and
// ArgumentOutOfRangeException when byteIndex, byteCount or charIndex does
// not describe a range inside its array.
// Returns whatever the pointer-based overload returns, or 0 when there is
// neither input to read nor room to write.
private unsafe static int InternalGetChars (
        byte[] bytes, int byteIndex, int byteCount, char[] chars,
        int charIndex, ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, bool flush)
{
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
        if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

        // Space left in the output array from charIndex onwards.
        int room = chars.Length - charIndex;
        if (room == 0 && byteCount == 0)
                return 0;

        fixed (char* output = chars) {
                char* destination = output + charIndex;

                // Nothing to read: call through with a null input pointer so the
                // input array does not have to be pinned.
                if (byteCount == 0 || byteIndex == bytes.Length)
                        return InternalGetChars (null, 0, destination, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, flush);

                fixed (byte* input = bytes)
                        return InternalGetChars (input + byteIndex, byteCount, destination, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, flush);
        }
}
679
680         private unsafe static int InternalGetChars (
681                 byte* bytes, int byteCount, char* chars, int charCount,
682                 ref uint leftOverBits, ref uint leftOverCount,
683                 object provider,
684                 ref DecoderFallbackBuffer fallbackBuffer, bool flush)
685         {
                    // Core UTF-8 decoder. Reads byteCount bytes from `bytes` and writes
                    // UTF-16 code units to `chars` (capacity charCount), resuming from
                    // the partial-sequence state in leftOverBits / leftOverCount and
                    // storing the updated state back before returning.
                    //
                    // leftOverBits  - payload bits accumulated so far for an unfinished
                    //                 multi-byte sequence.
                    // leftOverCount - packed state: low 4 bits = bytes of the sequence
                    //                 seen so far, next 4 bits = total sequence length
                    //                 (0 means no sequence is in progress).
                    // provider / fallbackBuffer - passed through to Fallback() whenever
                    //                 invalid input is encountered.
                    // flush         - when true, an unfinished sequence at the end of the
                    //                 input is reported to the fallback.
                    //
                    // Returns the number of chars written (posn - charIndex, with
                    // charIndex fixed at 0). Throws ArgumentException when a decoded
                    // character does not fit in `chars`.
                    // NOTE(review): characters emitted through Fallback() are written
                    // without a check against `length` in this method — confirm the
                    // helper/caller guarantees enough room.
686                 int charIndex = 0, byteIndex = 0;
687                 int length = charCount;
688                 int posn = charIndex;
689
                    // Fast path: with no sequence pending, copy the leading run of
                    // ASCII bytes straight across. byteCount is decremented alongside
                    // byteIndex so that byteIndex + byteCount below is still the end
                    // of the input.
690                 if (leftOverCount == 0) {
691                         int end = byteIndex + byteCount;
692                         for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
693                                 if (bytes [byteIndex] < 0x80) {
694                                         if (posn >= length) {
695                                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
696                                         }
697                                         chars [posn] = (char) bytes [byteIndex];
698                                 } else {
699                                         break;
700                                 }
701                         }
702                 }
703
704                 // Convert the bytes into the output buffer.
                    // Unpack the saved state: leftSoFar = bytes of the current sequence
                    // consumed so far, leftSize = total length of that sequence.
705                 uint ch;
706                 uint leftBits = leftOverBits;
707                 uint leftSoFar = (leftOverCount & (uint)0x0F);
708                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
709
710                 int byteEnd = byteIndex + byteCount;
711                 for(; byteIndex < byteEnd; byteIndex++) {
712                         // Fetch the next character from the byte buffer.
713                         ch = (uint)(bytes[byteIndex]);
714                         if (leftSize == 0) {
715                                 // Process a UTF-8 start character.
                                    // The lead byte's high bits select the sequence length;
                                    // its remaining low bits seed leftBits.
716                                 if (ch < (uint)0x0080) {
717                                         // Single-byte UTF-8 character.
718                                         if (posn >= length) {
719                                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
720                                         }
721                                         chars[posn++] = (char)ch;
722                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
723                                         // Double-byte UTF-8 character.
724                                         leftBits = (ch & (uint)0x1F);
725                                         leftSoFar = 1;
726                                         leftSize = 2;
727                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
728                                         // Three-byte UTF-8 character.
729                                         leftBits = (ch & (uint)0x0F);
730                                         leftSoFar = 1;
731                                         leftSize = 3;
732                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
733                                         // Four-byte UTF-8 character.
734                                         leftBits = (ch & (uint)0x07);
735                                         leftSoFar = 1;
736                                         leftSize = 4;
737                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
738                                         // Five-byte UTF-8 character.
739                                         leftBits = (ch & (uint)0x03);
740                                         leftSoFar = 1;
741                                         leftSize = 5;
742                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
743                                         // Six-byte UTF-8 character.
                                            // (Bit 1 of ch is already 0 here, so the 0x03 mask
                                            // keeps only the lowest bit.)
744                                         leftBits = (ch & (uint)0x03);
745                                         leftSoFar = 1;
746                                         leftSize = 6;
747                                 } else {
748                                         // Invalid UTF-8 start character.
                                            // Covers stray continuation bytes (10xxxxxx) as
                                            // well as 0xFE / 0xFF; only this one byte is reported.
749                                         Fallback (provider, ref fallbackBuffer, bytes, byteIndex, 1, chars, ref posn);
750                                 }
751                         } else {
752                                 // Process an extra byte in a multi-byte sequence.
753                                 if ((ch & (uint)0xC0) == (uint)0x80) {
                                            // Valid continuation byte: append its 6 payload bits.
754                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
755                                         if (++leftSoFar >= leftSize) {
756                                                 // We have a complete character now.
757                                                 if (leftBits < (uint)0x10000) {
758                                                         // is it an overlong ?
                                                            // A value is overlong when it would have fit
                                                            // in a shorter sequence. Inside this
                                                            // leftBits < 0x10000 branch the comparisons
                                                            // for sizes 4, 5 and 6 are always true.
759                                                         bool overlong = false;
760                                                         switch (leftSize) {
761                                                         case 2:
762                                                                 overlong = (leftBits <= 0x7F);
763                                                                 break;
764                                                         case 3:
765                                                                 overlong = (leftBits <= 0x07FF);
766                                                                 break;
767                                                         case 4:
768                                                                 overlong = (leftBits <= 0xFFFF);
769                                                                 break;
770                                                         case 5:
771                                                                 overlong = (leftBits <= 0x1FFFFF);
772                                                                 break;
773                                                         case 6:
774                                                                 overlong = (leftBits <= 0x03FFFFFF);
775                                                                 break;
776                                                         }
777                                                         if (overlong) {
                                                                    // NOTE(review): leftSoFar was already incremented
                                                                    // for the current byte, so byteIndex - leftSoFar is
                                                                    // one position before the sequence's lead byte (and
                                                                    // is -1 when the sequence starts at offset 0) —
                                                                    // confirm this offset is intended.
778                                                                 Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                                                    // Step back so the loop's byteIndex++ revisits this
                                                                    // byte; leftSize is cleared below, so it is then
                                                                    // examined as a start byte.
779                                                                 --byteIndex; //process byte again
780                                                         }
781                                                         else if ((leftBits & 0xF800) == 0xD800) {
782                                                                 // UTF-8 doesn't use surrogate characters
                                                                    // (the mask matches 0xD800..0xDFFF). The same
                                                                    // byteIndex - leftSoFar offset note as above applies.
783                                                                 Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
784                                                         }
785                                                         else {
                                                                    // Ordinary character below 0x10000: one UTF-16 code unit.
786                                                                 if (posn >= length) {
787                                                                         throw new ArgumentException
788                                                                                 (_("Arg_InsufficientSpace"), "chars");
789                                                                 }
790                                                                 chars[posn++] = (char)leftBits;
791                                                         }
792                                                 } else if (leftBits < (uint)0x110000) {
                                                            // 0x10000..0x10FFFF: emit as a surrogate pair
                                                            // (high = 0xD800 + top 10 bits, low = 0xDC00 +
                                                            // bottom 10 bits of the value minus 0x10000).
                                                            // NOTE(review): leftSize is not consulted here, so
                                                            // 5- and 6-byte forms of these values are accepted —
                                                            // confirm whether that is intended.
793                                                         if ((posn + 2) > length) {
794                                                                 throw new ArgumentException
795                                                                         (_("Arg_InsufficientSpace"), "chars");
796                                                         }
797                                                         leftBits -= (uint)0x10000;
798                                                         chars[posn++] = (char)((leftBits >> 10) +
799                                                                                                    (uint)0xD800);
800                                                         chars[posn++] =
801                                                                 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
802                                                 } else {
                                                            // Value is 0x110000 or above: hand the bytes to
                                                            // the fallback.
803                                                         Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
804                                                 }
                                                    // Sequence finished (successfully or not): back to
                                                    // expecting a start byte.
805                                                 leftSize = 0;
806                                         }
807                                 } else {
808                                         // Invalid UTF-8 sequence: clear and restart.
                                            // leftSoFar has not been incremented for this byte, so
                                            // byteIndex - leftSoFar .. byteIndex - 1 are the pending
                                            // bytes. The current byte is then re-read as a start byte.
                                            // NOTE(review): when the pending bytes came from a previous
                                            // call, this offset is negative relative to `bytes` — confirm.
809                                         Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
810                                         leftSize = 0;
811                                         --byteIndex;
812                                 }
813                         }
814                 }
815                 if (flush && leftSize != 0) {
816                         // We had left-over bytes that didn't make up
817                         // a complete UTF-8 character sequence.
                            // NOTE(review): leftSize is not cleared here, so the state
                            // written back below still describes the unfinished sequence.
818                         Fallback (provider, ref fallbackBuffer, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
819                 }
                    // Persist the decoder state for the next call, using the same
                    // packing that was unpacked above.
820                 leftOverBits = leftBits;
821                 leftOverCount = (leftSoFar | (leftSize << 4));
822
823                 // Return the final length to the caller.
824                 return posn - charIndex;
825         }
826
827         // Get the characters that result from decoding a byte buffer.
// One-shot decode of a managed byte range into a managed char array.
// Starts with no pending partial sequence (both left-over values are 0),
// uses this encoding's DecoderFallback as the fallback provider, and
// requests a final flush. Argument validation is performed by
// InternalGetChars. Returns the value InternalGetChars returns.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                              char[] chars, int charIndex)
{
        // Fresh decoding state: this call is independent of any other call.
        uint leftOverBits = 0;
        uint leftOverCount = 0;
        // Created lazily by the fallback helper on the first invalid sequence.
        DecoderFallbackBuffer buf = null;
        return InternalGetChars (bytes, byteIndex, byteCount, chars,
                        charIndex, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, true);
}
838
// Pointer-based one-shot decode. Starts with no pending partial sequence
// (both left-over values are 0), uses this encoding's DecoderFallback as the
// fallback provider, and requests a final flush.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
{
        uint pendingBits = 0, pendingCount = 0;
        DecoderFallbackBuffer fallbackBuffer = null;
        int written = InternalGetChars (bytes, byteCount, chars, charCount,
                        ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, true);
        return written;
}
849
850         // Get the maximum number of bytes needed to encode a
851         // specified number of characters.
// Upper bound on the number of bytes produced when encoding charCount
// characters.
//
// Throws ArgumentOutOfRangeException when charCount is negative, or when the
// bound does not fit in an int. The arithmetic is done in 64 bits so that
// large inputs are rejected instead of silently wrapping to a negative or
// too-small value.
public override int GetMaxByteCount (int charCount)
{
        if (charCount < 0) {
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
        }

        // Add 1 to charCount since there may be a lead surrogate left from the previous call to GetBytes/Encoder.Convert
        long maxBytes = (long) charCount + 1;

        // A fallback may expand each input character into several characters.
        int fallbackChars = EncoderFallback.MaxCharCount;
        if (fallbackChars > 1) {
                maxBytes *= fallbackChars;
        }
        // Checked before the next multiplication so the 64-bit value cannot wrap either.
        if (maxBytes > int.MaxValue) {
                throw new ArgumentOutOfRangeException ("charCount",
                        "Too many characters. The resulting number of bytes is larger than what can be returned as an int.");
        }

        // Each UTF-16 code unit is budgeted at 3 bytes.
        maxBytes *= 3;
        if (maxBytes > int.MaxValue) {
                throw new ArgumentOutOfRangeException ("charCount",
                        "Too many characters. The resulting number of bytes is larger than what can be returned as an int.");
        }

        return (int) maxBytes;
}
866
867         // Get the maximum number of characters needed to decode a
868         // specified number of bytes.
// Upper bound on the number of characters produced when decoding byteCount
// bytes.
//
// Throws ArgumentOutOfRangeException when byteCount is negative, or when the
// bound does not fit in an int. The arithmetic is done in 64 bits so that
// large inputs are rejected instead of silently wrapping to a negative or
// too-small value.
public override int GetMaxCharCount (int byteCount)
{
        if (byteCount < 0) {
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
        }

        // Add 1 to byteCount since there may be the bytes from part of a surrogate pair left from the previous call to GetChars/Decoder.Convert
        long maxCharCount = (long) byteCount + 1;

        // A fallback may replace each invalid byte with several characters.
        int fallbackChars = DecoderFallback.MaxCharCount;
        if (fallbackChars > 1) {
                maxCharCount *= fallbackChars;
        }

        if (maxCharCount > int.MaxValue) {
                throw new ArgumentOutOfRangeException ("byteCount",
                        "Too many bytes. The resulting number of chars is larger than what can be returned as an int.");
        }

        return (int) maxCharCount;
}
883
884         // Get a UTF8-specific decoder that is attached to this instance.
// Creates a new UTF8Decoder bound to this encoding instance. Every call
// returns a fresh decoder object.
public override Decoder GetDecoder ()
{
        Decoder decoder = new UTF8Decoder (this);
        return decoder;
}
889
890         // Get a UTF8-specific encoder that is attached to this instance.
// Creates a new UTF8Encoder bound to this encoding instance. Every call
// returns a fresh encoder object.
public override Encoder GetEncoder ()
{
        Encoder encoder = new UTF8Encoder (this);
        return encoder;
}
895
896         // Get the UTF8 preamble.
// Returns the three-byte sequence EF BB BF when this instance was configured
// to emit an identifier (emitIdentifier), and the shared empty byte array
// otherwise. A new array is allocated for each non-empty result.
public override byte[] GetPreamble ()
{
        if (!emitIdentifier)
                return EmptyArray<byte>.Value;

        byte[] preamble = new byte [3];
        preamble [0] = 0xEF;
        preamble [1] = 0xBB;
        preamble [2] = 0xBF;
        return preamble;
}
904
905         // Determine if this object is equal to another.
// Two UTF8Encoding instances are equal when they have the same code page,
// the same emitIdentifier setting, and equal decoder and encoder fallbacks.
// Anything that is not a UTF8Encoding (including null) is never equal.
public override bool Equals (Object value)
{
        var other = value as UTF8Encoding;
        if (other == null)
                return false;

        // Compared in the same order as before, stopping at the first mismatch.
        if (codePage != other.codePage)
                return false;
        if (emitIdentifier != other.emitIdentifier)
                return false;
        if (!DecoderFallback.Equals (other.DecoderFallback))
                return false;
        return EncoderFallback.Equals (other.EncoderFallback);
}
918
919         // Get the hash code for this object.
// Delegates to the base class hash code; declared here alongside the
// Equals override.
public override int GetHashCode ()
{
        int hash = base.GetHashCode ();
        return hash;
}
924
// Forwards to the base implementation without any UTF-8 specific handling.
// NOTE(review): the override adds no behavior of its own — presumably kept
// for API surface reasons; confirm before removing.
public override int GetByteCount (string chars)
{
        int byteCount = base.GetByteCount (chars);
        return byteCount;
}
930
// Forwards to the base implementation without any UTF-8 specific handling.
// NOTE(review): the override adds no behavior of its own — presumably kept
// for API surface reasons; confirm before removing.
[ComVisible (false)]
public override string GetString (byte [] bytes, int index, int count)
{
        string decoded = base.GetString (bytes, index, count);
        return decoded;
}
937
938         // UTF-8 decoder implementation.
// Stateful decoder: remembers an unfinished multi-byte sequence between
// calls so that input may be supplied in arbitrary chunks.
[Serializable]
private class UTF8Decoder : EncodingDecoder
{
        // Bits accumulated so far for an unfinished sequence.
        private uint leftOverBits;
        // Packed progress of that sequence, as produced by InternalGetChars.
        private uint leftOverCount;

        // Starts with no pending sequence.
        public UTF8Decoder (Encoding encoding)
                : base (encoding)
        {
                leftOverCount = 0;
                leftOverBits = 0;
        }

        // Counts the characters a matching GetChars call would produce.
        // The saved state is passed by value, so counting leaves it untouched.
        // This decoder is the fallback provider and no flush is requested.
        public override int GetCharCount (byte[] bytes, int index, int count)
        {
                DecoderFallbackBuffer fallbackBuffer = null;
                int total = InternalGetCharCount (bytes, index, count,
                        leftOverBits, leftOverCount, this, ref fallbackBuffer, false);
                return total;
        }

        // Decodes the given bytes, continuing from and then updating the saved
        // state (passed by reference). This decoder is the fallback provider and
        // no flush is requested.
        public override int GetChars (byte[] bytes, int byteIndex,
                                      int byteCount, char[] chars, int charIndex)
        {
                DecoderFallbackBuffer fallbackBuffer = null;
                int written = InternalGetChars (bytes, byteIndex, byteCount,
                        chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref fallbackBuffer, false);
                return written;
        }

} // class UTF8Decoder
969
970         // UTF-8 encoder implementation.
// Stateful encoder: keeps one left-over character between calls, tracked
// separately for counting and for conversion so that a GetByteCount call
// does not disturb a conversion in progress.
[Serializable]
private class UTF8Encoder : EncodingEncoder
{
        // Left-over character carried between GetByteCount calls.
        private char leftOverForCount;
        // Left-over character carried between GetBytes calls.
        private char leftOverForConv;

        // Starts with no left-over character on either path.
        public UTF8Encoder (UTF8Encoding encoding)
                : base (encoding)
        {
                leftOverForConv = '\0';
                leftOverForCount = '\0';
        }

        // Array-based count; uses and updates the counting left-over.
        public override int GetByteCount (char[] chars, int index,
                                          int count, bool flush)
        {
                int needed = InternalGetByteCount (chars, index, count, Fallback, ref leftOverForCount, flush);
                return needed;
        }

        // Array-based conversion; uses and updates the conversion left-over.
        // The fallback buffer starts out null on every call.
        public override int GetBytes (char[] chars, int charIndex,
                                      int charCount, byte[] bytes, int byteIndex, bool flush)
        {
                EncoderFallbackBuffer fallbackBuffer = null;
                return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
        }

        // Pointer-based count; uses and updates the counting left-over.
        public unsafe override int GetByteCount (char* chars, int count, bool flush)
        {
                int needed = InternalGetByteCount (chars, count, Fallback, ref leftOverForCount, flush);
                return needed;
        }

        // Pointer-based conversion; uses and updates the conversion left-over.
        // The fallback buffer starts out null on every call.
        public unsafe override int GetBytes (char* chars, int charCount,
                byte* bytes, int byteCount, bool flush)
        {
                EncoderFallbackBuffer fallbackBuffer = null;
                return InternalGetBytes (chars, charCount, bytes, byteCount, Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
        }
} // class UTF8Encoder
1017
1018 }; // class UTF8Encoding
1019
1020 }; // namespace System.Text