Merge pull request #778 from cmorris98/master
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30 using System.Runtime.InteropServices;
31
32 [Serializable]
33 [MonoLimitation ("Serialization format not compatible with .NET")]
34 [ComVisible (true)]
35 public class UTF8Encoding : Encoding
36 {
37         // Magic number used by Windows for UTF-8.
38         internal const int UTF8_CODE_PAGE = 65001;
39
40         // Internal state.
41         private bool emitIdentifier;
42
43         // Constructors.
44         public UTF8Encoding () : this (false, false) {}
45         public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
46                         : this (encoderShouldEmitUTF8Identifier, false) {}
47         
48         public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
49                 : base (UTF8_CODE_PAGE)
50         {
51                 emitIdentifier = encoderShouldEmitUTF8Identifier;
52                 if (throwOnInvalidBytes)
53                         SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
54                 else
55                         SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
56
57                 web_name = body_name = header_name = "utf-8";
58                 encoding_name = "Unicode (UTF-8)";
59                 is_browser_save = true;
60                 is_browser_display = true;
61                 is_mail_news_display = true;
62                 is_mail_news_save = true;
63                 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
64         }
65
66         #region GetByteCount()
67
68         // Internal version of "GetByteCount" which can handle a rolling
69         // state between multiple calls to this method.
70         private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
71         {
72                 // Validate the parameters.
73                 if (chars == null) {
74                         throw new ArgumentNullException ("chars");
75                 }
76                 if (index < 0 || index > chars.Length) {
77                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
78                 }
79                 if (count < 0 || count > (chars.Length - index)) {
80                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
81                 }
82
83                 if (index == chars.Length) {
84                         if (flush && leftOver != '\0') {
85                                 // Flush the left-over surrogate pair start.
86                                 leftOver = '\0';
87                                 return 3;
88                         }
89                         return 0;
90                 }
91
92                 unsafe {
93                         fixed (char* cptr = chars) {
94                                 return InternalGetByteCount (cptr + index, count, fallback, ref leftOver, flush);
95                         }
96                 }
97         }
98
99         private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
100         {
101                 int length = 0;
102                 char* end = chars + count;
103                 char* start = chars;
104                 EncoderFallbackBuffer buffer = null;
105                 while (chars < end) {
106                         if (leftOver == 0) {
107                                 for (; chars < end; chars++) {
108                                         if (*chars < '\x80') {
109                                                 ++length;
110                                         } else if (*chars < '\x800') {
111                                                 length += 2;
112                                         } else if (*chars < '\uD800' || *chars > '\uDFFF') {
113                                                 length += 3;
114                                         } else if (*chars <= '\uDBFF') {
115                                                 // This is a surrogate start char, exit the inner loop only
116                                                 // if we don't find the complete surrogate pair.
117                                                 if (chars + 1 < end && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
118                                                         length += 4;
119                                                         chars++;
120                                                         continue;
121                                                 }
122                                                 leftOver = *chars;
123                                                 chars++;
124                                                 break;
125                                         } else {
126                                                 // We have a surrogate tail without 
127                                                 // leading surrogate.
128                                                 char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
129                                                 fixed (char *fb_chars = fallback_chars) {
130                                                         char dummy = '\0';
131                                                         length += InternalGetByteCount (fb_chars, fallback_chars.Length, fallback, ref dummy, true);
132                                                 }
133
134                                                 leftOver = '\0';
135                                         }
136                                 }
137                         } else {
138                                 if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
139                                         // We have a correct surrogate pair.
140                                         length += 4;
141                                         chars++;
142                                 } else {
143                                         // We have a surrogate start followed by a
144                                         // regular character.  Technically, this is
145                                         // invalid, but we have to do something.
146                                         // We write out the surrogate start and then
147                                         // re-visit the current character again.
148                                         char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
149                                         fixed (char *fb_chars = fallback_chars) {
150                                                 char dummy = '\0';
151                                                 length += InternalGetByteCount (fb_chars, fallback_chars.Length, fallback, ref dummy, true);
152                                         }
153                                 }
154                                 leftOver = '\0';
155                         }
156                 }
157                 if (flush) {
158                         // Flush the left-over surrogate pair start.
159                         if (leftOver != '\0') {
160                                 length += 3;
161                                 leftOver = '\0';
162                         }
163                 }
164                 return length;
165         }
166
167         unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
168         {
169                 if (buffer == null)
170                         buffer = fallback.CreateFallbackBuffer ();
171
172                 buffer.Fallback (*chars, (int) (chars - start));
173
174                 char [] fallback_chars = new char [buffer.Remaining];
175                 for (int i = 0; i < fallback_chars.Length; i++)
176                         fallback_chars [i] = buffer.GetNextChar ();
177
178                 buffer.Reset ();
179
180                 return fallback_chars;
181         }
182
183         // Get the number of bytes needed to encode a character buffer.
184         public override int GetByteCount (char[] chars, int index, int count)
185         {
186                 char dummy = '\0';
187                 return InternalGetByteCount (chars, index, count, EncoderFallback, ref dummy, true);
188         }
189
190
191         [CLSCompliant (false)]
192         [ComVisible (false)]
193         public unsafe override int GetByteCount (char* chars, int count)
194         {
195                 if (chars == null)
196                         throw new ArgumentNullException ("chars");
197                 if (count == 0)
198                         return 0;
199                 char dummy = '\0';
200                 return InternalGetByteCount (chars, count, EncoderFallback, ref dummy, true);
201         }
202
203         #endregion
204
205         #region GetBytes()
206
207         // Internal version of "GetBytes" which can handle a rolling
208         // state between multiple calls to this method.
209         private static int InternalGetBytes (char[] chars, int charIndex,
210                                              int charCount, byte[] bytes,
211                                              int byteIndex,
212                                                  EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
213                                                  ref char leftOver, bool flush)
214         {
215                 // Validate the parameters.
216                 if (chars == null) {
217                         throw new ArgumentNullException ("chars");
218                 }
219                 if (bytes == null) {
220                         throw new ArgumentNullException ("bytes");
221                 }
222                 if (charIndex < 0 || charIndex > chars.Length) {
223                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
224                 }
225                 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
226                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
227                 }
228                 if (byteIndex < 0 || byteIndex > bytes.Length) {
229                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
230                 }
231
232                 if (charIndex == chars.Length) {
233                         if (flush && leftOver != '\0') {
234                                 // FIXME: use EncoderFallback.
235                                 //
236                                 // By default it is empty, so I do nothing for now.
237                                 leftOver = '\0';
238                         }
239                         return 0;
240                 }
241
242                 unsafe {
243                         fixed (char* cptr = chars) {
244                                 if (bytes.Length == byteIndex)
245                                         return InternalGetBytes (
246                                                 cptr + charIndex, charCount, 
247                                                 null, 0, fallback, ref buffer, ref leftOver, flush);
248                                 fixed (byte *bptr = bytes) {
249                                         return InternalGetBytes (
250                                                 cptr + charIndex, charCount,
251                                                 bptr + byteIndex, bytes.Length - byteIndex,
252                                                 fallback, ref buffer,
253                                                 ref leftOver, flush);
254                                 }
255                         }
256                 }
257         }
258
259         private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
260         {
261                 char* end = chars + count;
262                 char* start = chars;
263                 byte* start_bytes = bytes;
264                 byte* end_bytes = bytes + bcount;
265                 while (chars < end) {
266                         if (leftOver == 0) {
267                                 for (; chars < end; chars++) {
268                                         int ch = *chars;
269                                         if (ch < '\x80') {
270                                                 if (bytes >= end_bytes)
271                                                         goto fail_no_space;
272                                                 *bytes++ = (byte)ch;
273                                         } else if (ch < '\x800') {
274                                                 if (bytes + 1 >= end_bytes)
275                                                         goto fail_no_space;
276                                                 bytes [0] = (byte) (0xC0 | (ch >> 6));
277                                                 bytes [1] = (byte) (0x80 | (ch & 0x3F));
278                                                 bytes += 2;
279                                         } else if (ch < '\uD800' || ch > '\uDFFF') {
280                                                 if (bytes + 2 >= end_bytes)
281                                                         goto fail_no_space;
282                                                 bytes [0] = (byte) (0xE0 | (ch >> 12));
283                                                 bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
284                                                 bytes [2] = (byte) (0x80 | (ch & 0x3F));
285                                                 bytes += 3;
286                                         } else if (ch <= '\uDBFF') {
287                                                 // This is a surrogate char, exit the inner loop.
288                                                 leftOver = *chars;
289                                                 chars++;
290                                                 break;
291                                         } else {
292                                                 // We have a surrogate tail without 
293                                                 // leading surrogate.
294                                                 char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer); 
295                                                 char dummy = '\0';
296                                                 if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
297                                                         goto fail_no_space;
298                                                 fixed (char *fb_chars = fallback_chars) {
299                                                         bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
300                                                 }
301
302                                                 leftOver = '\0';
303                                         }
304                                 }
305                         } else {
306                                 if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
307                                         // We have a correct surrogate pair.
308                                         int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
309                                         if (bytes + 3 >= end_bytes)
310                                                 goto fail_no_space;
311                                         bytes [0] = (byte) (0xF0 | (ch >> 18));
312                                         bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
313                                         bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
314                                         bytes [3] = (byte) (0x80 | (ch & 0x3F));
315                                         bytes += 4;
316                                         chars++;
317                                 } else {
318                                         // We have a surrogate start followed by a
319                                         // regular character.  Technically, this is
320                                         // invalid, but we have to do something.
321                                         // We write out the surrogate start and then
322                                         // re-visit the current character again.
323                                         char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer); 
324                                         char dummy = '\0';
325                                         if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
326                                                 goto fail_no_space;
327                                         fixed (char *fb_chars = fallback_chars) {
328                                                 InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
329                                         }
330
331                                         leftOver = '\0';
332                                 }
333                                 leftOver = '\0';
334                         }
335                 }
336                 if (flush) {
337                         // Flush the left-over surrogate pair start.
338                         if (leftOver != '\0') {
339                                 int ch = leftOver;
340                                 if (bytes + 2 < end_bytes) {
341                                         bytes [0] = (byte) (0xE0 | (ch >> 12));
342                                         bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
343                                         bytes [2] = (byte) (0x80 | (ch & 0x3F));
344                                         bytes += 3;
345                                 } else {
346                                         goto fail_no_space;
347                                 }
348                                 leftOver = '\0';
349                         }
350                 }
351                 return (int)(bytes - (end_bytes - bcount));
352 fail_no_space:
353                 throw new ArgumentException ("Insufficient Space", "bytes");
354         }
355
356         // Get the bytes that result from encoding a character buffer.
357         public override int GetBytes (char[] chars, int charIndex, int charCount,
358                                                                  byte[] bytes, int byteIndex)
359         {
360                 char leftOver = '\0';
361                 EncoderFallbackBuffer buffer = null;
362                 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, EncoderFallback, ref buffer, ref leftOver, true);
363         }
364
365         // Convenience wrappers for "GetBytes".
366         public override int GetBytes (String s, int charIndex, int charCount,
367                                                                  byte[] bytes, int byteIndex)
368         {
369                 // Validate the parameters.
370                 if (s == null) {
371                         throw new ArgumentNullException ("s");
372                 }
373                 if (bytes == null) {
374                         throw new ArgumentNullException ("bytes");
375                 }
376                 if (charIndex < 0 || charIndex > s.Length) {
377                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
378                 }
379                 if (charCount < 0 || charCount > (s.Length - charIndex)) {
380                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
381                 }
382                 if (byteIndex < 0 || byteIndex > bytes.Length) {
383                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
384                 }
385
386                 if (charIndex == s.Length)
387                         return 0;
388
389                 unsafe {
390                         fixed (char* cptr = s) {
391                                 char dummy = '\0';
392                                 EncoderFallbackBuffer buffer = null;
393                                 if (bytes.Length == byteIndex)
394                                         return InternalGetBytes (
395                                                 cptr + charIndex, charCount,
396                                                 null, 0, EncoderFallback, ref buffer, ref dummy, true);
397                                 fixed (byte *bptr = bytes) {
398                                         return InternalGetBytes (
399                                                 cptr + charIndex, charCount,
400                                                 bptr + byteIndex, bytes.Length - byteIndex,
401                                                 EncoderFallback, ref buffer,
402                                                 ref dummy, true);
403                                 }
404                         }
405                 }
406         }
407
408         [CLSCompliant (false)]
409         [ComVisible (false)]
410         public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
411         {
412                 if (chars == null)
413                         throw new ArgumentNullException ("chars");
414                 if (charCount < 0)
415                         throw new IndexOutOfRangeException ("charCount");
416                 if (bytes == null)
417                         throw new ArgumentNullException ("bytes");
418                 if (byteCount < 0)
419                         throw new IndexOutOfRangeException ("charCount");
420
421                 if (charCount == 0)
422                         return 0;
423
424                 char dummy = '\0';
425                 EncoderFallbackBuffer buffer = null;
426                 if (byteCount == 0)
427                         return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
428                 else
429                         return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
430         }
431
432         #endregion
433
434         // Internal version of "GetCharCount" which can handle a rolling
435         // state between multiple calls to this method.
436         private unsafe static int InternalGetCharCount (
437                 byte[] bytes, int index, int count, uint leftOverBits,
438                 uint leftOverCount, object provider,
439                 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
440         {
441                 // Validate the parameters.
442                 if (bytes == null) {
443                         throw new ArgumentNullException ("bytes");
444                 }
445                 if (index < 0 || index > bytes.Length) {
446                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
447                 }
448                 if (count < 0 || count > (bytes.Length - index)) {
449                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
450                 }
451
452                 if (count == 0)
453                         return 0;
454                 fixed (byte *bptr = bytes)
455                         return InternalGetCharCount (bptr + index, count,
456                                 leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
457         }
458
459         private unsafe static int InternalGetCharCount (
460                 byte* bytes, int count, uint leftOverBits,
461                 uint leftOverCount, object provider,
462                 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
463         {
                    // Returns the number of UTF-16 code units that decoding the
                    // "count" bytes at "bytes" produces.
                    //
                    // "leftOverBits" and "leftOverCount" describe a multi-byte
                    // sequence begun by earlier input: "leftOverBits" holds the
                    // payload bits gathered so far, the low four bits of
                    // "leftOverCount" the number of bytes already seen and the
                    // next four bits the total size of the sequence.  Both are
                    // passed by value, so this method does not hand updated
                    // state back to the caller.
                    //
                    // Invalid input is counted through Fallback (); "provider",
                    // "fallbackBuffer" and "bufferArg" are only forwarded to it.
                    // With "flush" set, a sequence still incomplete at the end
                    // of the input is counted through Fallback () as well.

                    // Offset of the next byte to examine.
464                 int index = 0;
465
                    // Running count of code units.
466                 int length = 0;
467
                    // Fast path: with no sequence pending, count the leading run
                    // of bytes below 0x80 (one code unit each) and stop at the
                    // first byte that needs the state machine below.
468                 if (leftOverCount == 0) {
469                         int end = index + count;
470                         for (; index < end; index++, count--) {
471                                 if (bytes [index] < 0x80)
472                                         length++;
473                                 else
474                                         break;
475                         }
476                 }
477
478                 // Determine the number of characters that we have.
479                 uint ch;
                    // Payload bits of the sequence in progress.
480                 uint leftBits = leftOverBits;
                    // Bytes of the sequence in progress seen so far.
481                 uint leftSoFar = (leftOverCount & (uint)0x0F);
                    // Total size of the sequence in progress; zero when no
                    // sequence is in progress.
482                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
483                 while (count > 0) {
484                         ch = (uint)(bytes[index++]);
485                         --count;
486                         if (leftSize == 0) {
487                                 // Process a UTF-8 start character.
488                                 if (ch < (uint)0x0080) {
489                                         // Single-byte UTF-8 character.
490                                         ++length;
491                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
492                                         // Double-byte UTF-8 character.
493                                         leftBits = (ch & (uint)0x1F);
494                                         leftSoFar = 1;
495                                         leftSize = 2;
496                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
497                                         // Three-byte UTF-8 character.
498                                         leftBits = (ch & (uint)0x0F);
499                                         leftSoFar = 1;
500                                         leftSize = 3;
501                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
502                                         // Four-byte UTF-8 character.
503                                         leftBits = (ch & (uint)0x07);
504                                         leftSoFar = 1;
505                                         leftSize = 4;
506                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
507                                         // Five-byte UTF-8 character.
508                                         leftBits = (ch & (uint)0x03);
509                                         leftSoFar = 1;
510                                         leftSize = 5;
511                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
512                                         // Six-byte UTF-8 character.
513                                         leftBits = (ch & (uint)0x03);
514                                         leftSoFar = 1;
515                                         leftSize = 6;
516                                 } else {
517                                         // Invalid UTF-8 start character.
                                            // "index - 1" is the offset of the byte just read.
518                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
519                                 }
520                         } else {
521                                 // Process an extra byte in a multi-byte sequence.
522                                 if ((ch & (uint)0xC0) == (uint)0x80) {
                                            // Continuation byte: append its six payload bits.
523                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
524                                         if (++leftSoFar >= leftSize) {
525                                                 // We have a complete character now.
526                                                 if (leftBits < (uint)0x10000) {
527                                                         // is it an overlong ?
                                                            // A value is overlong when a shorter sequence
                                                            // could have carried it.  Inside this branch the
                                                            // value is below 0x10000, so the tests for sizes
                                                            // 4, 5 and 6 always succeed.
528                                                         bool overlong = false;
529                                                         switch (leftSize) {
530                                                         case 2:
531                                                                 overlong = (leftBits <= 0x7F);
532                                                                 break;
533                                                         case 3:
534                                                                 overlong = (leftBits <= 0x07FF);
535                                                                 break;
536                                                         case 4:
537                                                                 overlong = (leftBits <= 0xFFFF);
538                                                                 break;
539                                                         case 5:
540                                                                 overlong = (leftBits <= 0x1FFFFF);
541                                                                 break;
542                                                         case 6:
543                                                                 overlong = (leftBits <= 0x03FFFFFF);
544                                                                 break;
545                                                         }
546                                                         if (overlong) {
547                                                                 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
548                                                         }
549                                                         else if ((leftBits & 0xF800) == 0xD800) {
550                                                                 // UTF-8 doesn't use surrogate characters
551                                                                 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
552                                                         }
553                                                         else
554                                                                 ++length;
555                                                 } else if (leftBits < (uint)0x110000) {
                                                            // Value in 0x10000..0x10FFFF: counted as two
                                                            // code units, whatever the sequence size was.
556                                                         length += 2;
557                                                 } else {
                                                            // Value of 0x110000 or more: counted through the fallback.
558                                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
559                                                 }
560                                                 leftSize = 0;
561                                         }
562                                 } else {
563                                         // Invalid UTF-8 sequence: clear and restart.
                                            // The bytes gathered so far are counted through the
                                            // fallback; index and count are then stepped back so
                                            // that the offending byte is read again as a start byte.
                                            // NOTE(review): index already points past the offending
                                            // byte here, so "index - leftSoFar" is one byte after the
                                            // start of the sequence, unlike the other calls - confirm
                                            // against Fallback ().
564                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
565                                         leftSize = 0;
566                                         --index;
567                                         ++count;
568                                 }
569                         }
570                 }
571                 if (flush && leftSize != 0) {
572                         // We had left-over bytes that didn't make up
573                         // a complete UTF-8 character sequence.
                            // NOTE(review): when the sequence began in an earlier call,
                            // "index - leftSoFar" can lie before the start of "bytes" -
                            // confirm how Fallback () uses that offset.
574                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
575                 }
576
577                 // Return the final length to the caller.
578                 return length;
579         }
580
581         // for GetCharCount()
// Counting variant of the decoder fallback helper.
// Feeds `size` bytes, starting at bytes[index], to the fallback buffer one
// byte at a time and returns the total number of replacement chars the
// buffer reports for them.
//   provider  - either a DecoderFallback (a fresh buffer is created from it)
//               or a Decoder (its FallbackBuffer is used).
//   buffer    - fallback buffer, created on first use and handed back to the
//               caller through the ref parameter.
//   bufferArg - one-element scratch array, also created on first use.
static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
{
        if (buffer == null) {
                DecoderFallback fallback = provider as DecoderFallback;
                buffer = (fallback != null)
                        ? fallback.CreateFallbackBuffer ()
                        : ((Decoder) provider).FallbackBuffer;
        }
        if (bufferArg == null)
                bufferArg = new byte [1];

        int total = 0;
        int offset = 0;
        while (offset < size) {
                // Each invalid byte is reported on its own.
                bufferArg [0] = bytes [(int) index + offset];
                buffer.Fallback (bufferArg, 0);
                total += buffer.Remaining;
                // Drop the pending replacement chars; only their count matters here.
                buffer.Reset ();
                offset++;
        }
        return total;
}
602
603         // for GetChars()
// Writing variant of the decoder fallback helper.
// Feeds `size` bytes, starting at bytes[byteIndex], to the fallback buffer
// one byte at a time and copies every replacement char it yields to
// chars[charIndex], advancing charIndex past what was written.
//   provider  - either a DecoderFallback (a fresh buffer is created from it)
//               or a Decoder (its FallbackBuffer is used).
//   buffer    - fallback buffer, created on first use and handed back to the
//               caller through the ref parameter.
//   bufferArg - one-element scratch array, also created on first use.
// NOTE(review): no output capacity is passed in, so writes to `chars` are
// not bounds-checked here.
static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
        char* chars, ref int charIndex)
{
        if (buffer == null) {
                DecoderFallback fallback = provider as DecoderFallback;
                buffer = (fallback != null)
                        ? fallback.CreateFallbackBuffer ()
                        : ((Decoder) provider).FallbackBuffer;
        }
        if (bufferArg == null)
                bufferArg = new byte [1];

        for (uint done = 0; done < size; done++) {
                // Each invalid byte is reported on its own.
                bufferArg [0] = bytes [byteIndex + done];
                buffer.Fallback (bufferArg, 0);
                // Drain whatever the fallback produced for this byte.
                for (; buffer.Remaining > 0; )
                        chars [charIndex++] = buffer.GetNextChar ();
                buffer.Reset ();
        }
}
624
625         // Get the number of characters needed to decode a byte buffer.
// One-shot count over a slice of a byte array: starts with no left-over
// state (the two zero arguments), uses this encoding's DecoderFallback,
// and passes true for the flush argument.
public override int GetCharCount (byte[] bytes, int index, int count)
{
        DecoderFallbackBuffer fallbackBuffer = null;
        byte [] scratch = null;
        int charCount = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
        return charCount;
}
632
// Pointer overload of the one-shot count: starts with no left-over state
// (the two zero arguments), uses this encoding's DecoderFallback, and
// passes true for the flush argument.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetCharCount (byte* bytes, int count)
{
        DecoderFallbackBuffer fallbackBuffer = null;
        byte [] scratch = null;
        int charCount = InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
        return charCount;
}
641
642         // Get the characters that result from decoding a byte buffer.
// Array front end for decoding: validates the arguments, pins the arrays
// and forwards to the pointer-based InternalGetChars.
// Throws ArgumentNullException for a null `bytes` or `chars`, and
// ArgumentOutOfRangeException when byteIndex, byteCount or charIndex fall
// outside their array.  Returns whatever the pointer overload returns, or 0
// when there is neither input nor output room.
private unsafe static int InternalGetChars (
        byte[] bytes, int byteIndex, int byteCount, char[] chars,
        int charIndex, ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
        if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

        // Nothing to read and nowhere to write.
        if (charIndex == chars.Length && byteCount == 0)
                return 0;

        int charRoom = chars.Length - charIndex;
        fixed (char* charBase = chars) {
                char* dest = charBase + charIndex;

                // With no input the byte array is not pinned; the pointer
                // overload is called with a null source and a zero count.
                if (byteCount == 0 || byteIndex == bytes.Length)
                        return InternalGetChars (null, 0, dest, charRoom, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);

                fixed (byte* byteBase = bytes)
                        return InternalGetChars (byteBase + byteIndex, byteCount, dest, charRoom, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
        }
}
677
// Decodes `byteCount` UTF-8 bytes at `bytes` into UTF-16 code units at
// `chars` (capacity `charCount`) and returns the number of chars written.
//
// leftOverBits / leftOverCount carry a partially decoded sequence between
// calls: leftOverBits holds the payload bits gathered so far, and
// leftOverCount packs "bytes consumed so far" in bits 0-3 and "total
// sequence length" in bits 4-7.  Both are written back before returning.
//
// Malformed input (bad lead byte, bad continuation byte, overlong form,
// encoded surrogate, value >= 0x110000, or an unfinished sequence when
// `flush` is set) is handed to the decoder fallback reached through
// `provider`.  Directly decoded output throws ArgumentException when
// `chars` is full.
//
// NOTE(review): output written by the fallback helper is not checked
// against `charCount` here.
// NOTE(review): when a sequence began in an earlier call, the offsets
// handed to Fallback lie before `bytes`, because the earlier bytes are not
// retained - confirm how callers are expected to cope.
private unsafe static int InternalGetChars (
        byte* bytes, int byteCount, char* chars, int charCount,
        ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        int charIndex = 0, byteIndex = 0;
        int length = charCount;
        int posn = charIndex;

        // Fast path: with no pending sequence, copy the leading run of
        // single-byte (< 0x80) characters straight across.
        if (leftOverCount == 0) {
                int end = byteIndex + byteCount;
                for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
                        if (bytes [byteIndex] < 0x80) {
                                if (posn >= length) {
                                        throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                                }
                                chars [posn] = (char) bytes [byteIndex];
                        } else {
                                break;
                        }
                }
        }

        // Convert the bytes into the output buffer.
        uint ch;
        uint leftBits = leftOverBits;
        uint leftSoFar = (leftOverCount & (uint)0x0F);
        uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

        int byteEnd = byteIndex + byteCount;
        for(; byteIndex < byteEnd; byteIndex++) {
                // Fetch the next character from the byte buffer.
                ch = (uint)(bytes[byteIndex]);
                if (leftSize == 0) {
                        // Process a UTF-8 start character.
                        if (ch < (uint)0x0080) {
                                // Single-byte UTF-8 character.
                                if (posn >= length) {
                                        throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                                }
                                chars[posn++] = (char)ch;
                        } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                                // Double-byte UTF-8 character.
                                leftBits = (ch & (uint)0x1F);
                                leftSoFar = 1;
                                leftSize = 2;
                        } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                                // Three-byte UTF-8 character.
                                leftBits = (ch & (uint)0x0F);
                                leftSoFar = 1;
                                leftSize = 3;
                        } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                                // Four-byte UTF-8 character.
                                leftBits = (ch & (uint)0x07);
                                leftSoFar = 1;
                                leftSize = 4;
                        } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                                // Five-byte UTF-8 character.
                                leftBits = (ch & (uint)0x03);
                                leftSoFar = 1;
                                leftSize = 5;
                        } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                                // Six-byte UTF-8 character.
                                leftBits = (ch & (uint)0x03);
                                leftSoFar = 1;
                                leftSize = 6;
                        } else {
                                // Invalid UTF-8 start character.
                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
                        }
                } else {
                        // Process an extra byte in a multi-byte sequence.
                        if ((ch & (uint)0xC0) == (uint)0x80) {
                                leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                                if (++leftSoFar >= leftSize) {
                                        // We have a complete character now.
                                        // leftSoFar now includes the byte at byteIndex, so the
                                        // sequence spans [byteIndex - leftSoFar + 1, byteIndex].
                                        long seqStart = byteIndex - leftSoFar + 1;
                                        if (leftBits < (uint)0x10000) {
                                                // is it an overlong ?
                                                bool overlong = false;
                                                switch (leftSize) {
                                                case 2:
                                                        overlong = (leftBits <= 0x7F);
                                                        break;
                                                case 3:
                                                        overlong = (leftBits <= 0x07FF);
                                                        break;
                                                case 4:
                                                        overlong = (leftBits <= 0xFFFF);
                                                        break;
                                                case 5:
                                                        overlong = (leftBits <= 0x1FFFFF);
                                                        break;
                                                case 6:
                                                        overlong = (leftBits <= 0x03FFFFFF);
                                                        break;
                                                }
                                                if (overlong) {
                                                        Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, seqStart, leftSoFar, chars, ref posn);
                                                }
                                                else if ((leftBits & 0xF800) == 0xD800) {
                                                        // UTF-8 doesn't use surrogate characters
                                                        Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, seqStart, leftSoFar, chars, ref posn);
                                                }
                                                else {
                                                        if (posn >= length) {
                                                                throw new ArgumentException
                                                                        (_("Arg_InsufficientSpace"), "chars");
                                                        }
                                                        chars[posn++] = (char)leftBits;
                                                }
                                        } else if (leftBits < (uint)0x110000) {
                                                // Supplementary code point: emit a surrogate pair.
                                                if ((posn + 2) > length) {
                                                        throw new ArgumentException
                                                                (_("Arg_InsufficientSpace"), "chars");
                                                }
                                                leftBits -= (uint)0x10000;
                                                chars[posn++] = (char)((leftBits >> 10) +
                                                                                           (uint)0xD800);
                                                chars[posn++] =
                                                        (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                                        } else {
                                                // Value is 0x110000 or above.
                                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, seqStart, leftSoFar, chars, ref posn);
                                        }
                                        leftSize = 0;
                                }
                        } else {
                                // Invalid UTF-8 sequence: clear and restart.
                                // Here leftSoFar counts only the bytes before byteIndex;
                                // the offending byte is re-read as a start byte.
                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                leftSize = 0;
                                --byteIndex;
                        }
                }
        }
        if (flush && leftSize != 0) {
                // We had left-over bytes that didn't make up
                // a complete UTF-8 character sequence.
                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
        }
        leftOverBits = leftBits;
        leftOverCount = (leftSoFar | (leftSize << 4));

        // Return the final length to the caller.
        return posn - charIndex;
}
823
824         // Get the characters that result from decoding a byte buffer.
// One-shot decode between arrays: starts with no left-over state, uses
// this encoding's DecoderFallback and passes true for flush.  The left-over
// values written back by InternalGetChars are discarded.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                              char[] chars, int charIndex)
{
        uint pendingBits = 0;
        uint pendingCount = 0;
        DecoderFallbackBuffer fallbackBuffer = null;
        byte [] scratch = null;
        int written = InternalGetChars (bytes, byteIndex, byteCount, chars,
                        charIndex, ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, ref scratch, true);
        return written;
}
835
// Pointer overload of the one-shot decode: starts with no left-over state,
// uses this encoding's DecoderFallback and passes true for flush.  The
// left-over values written back by InternalGetChars are discarded.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
{
        uint pendingBits = 0;
        uint pendingCount = 0;
        DecoderFallbackBuffer fallbackBuffer = null;
        byte [] scratch = null;
        int written = InternalGetChars (bytes, byteCount, chars,
                        charCount, ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, ref scratch, true);
        return written;
}
847
848         // Get the maximum number of bytes needed to encode a
849         // specified number of characters.
// Returns (charCount + 1) * 3, additionally multiplied by the encoder
// fallback's MaxCharCount when that is greater than one.
// Throws ArgumentOutOfRangeException when charCount is negative or when
// the result does not fit in an Int32 (previously the 32-bit arithmetic
// wrapped around silently).
public override int GetMaxByteCount (int charCount)
{
        if (charCount < 0) {
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
        }

        // Add 1 to charCount since there may be a lead surrogate left from the previous call to GetBytes/Encoder.Convert.
        // Work in 64 bits so neither the +1 nor the multiplication can wrap.
        long maxChars = (long) charCount + 1;
        int fallbackChars = EncoderFallback.MaxCharCount;
        if (fallbackChars > 1) {
                maxChars *= fallbackChars;
        }

        // Check before the final "* 3" so that the product below is known to fit.
        if (maxChars > Int32.MaxValue / 3) {
                throw new ArgumentOutOfRangeException ("charCount", "Too many characters. The resulting number of bytes is larger than what can be returned as an int.");
        }

        return (int) (maxChars * 3);
}
864
865         // Get the maximum number of characters needed to decode a
866         // specified number of bytes.
// Returns byteCount + 1, additionally multiplied by the decoder fallback's
// MaxCharCount when that is greater than one.
// Throws ArgumentOutOfRangeException when byteCount is negative or when
// the result does not fit in an Int32 (previously the 32-bit arithmetic
// wrapped around silently, e.g. for byteCount == Int32.MaxValue).
public override int GetMaxCharCount (int byteCount)
{
        if (byteCount < 0) {
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
        }

        // Add 1 to byteCount since there may be the bytes from part of a surrogate pair left from the previous call to GetChars/Decoder.Convert.
        // Work in 64 bits so neither the +1 nor the multiplication can wrap.
        long maxCharCount = (long) byteCount + 1;
        int fallbackChars = DecoderFallback.MaxCharCount;
        if (fallbackChars > 1) {
                maxCharCount *= fallbackChars;
        }

        if (maxCharCount > Int32.MaxValue) {
                throw new ArgumentOutOfRangeException ("byteCount", "Too many bytes. The resulting number of chars is larger than what can be returned as an int.");
        }

        return (int) maxCharCount;
}
881
882         // Get a UTF8-specific decoder that is attached to this instance.
// Creates a new UTF8Decoder configured with this encoding's DecoderFallback.
public override Decoder GetDecoder ()
{
        Decoder decoder = new UTF8Decoder (DecoderFallback);
        return decoder;
}
887
888         // Get a UTF8-specific encoder that is attached to this instance.
// Creates a new UTF8Encoder configured with this encoding's EncoderFallback
// and its emitIdentifier setting.
public override Encoder GetEncoder ()
{
        Encoder encoder = new UTF8Encoder (EncoderFallback, emitIdentifier);
        return encoder;
}
893
894         // Get the UTF8 preamble.
// Returns the three bytes EF BB BF when emitIdentifier is set, otherwise
// the shared empty byte array.
public override byte[] GetPreamble ()
{
        if (!emitIdentifier)
                return EmptyArray<byte>.Value;

        return new byte [] { 0xEF, 0xBB, 0xBF };
}
902
903         // Determine if this object is equal to another.
// Two UTF8Encoding instances are equal when their code page, their
// emitIdentifier flag and both fallbacks are equal.  Any other object
// (including null) compares unequal.
public override bool Equals (Object value)
{
        UTF8Encoding other = value as UTF8Encoding;
        if (other == null)
                return false;

        if (codePage != other.codePage)
                return false;
        if (emitIdentifier != other.emitIdentifier)
                return false;
        if (!DecoderFallback.Equals (other.DecoderFallback))
                return false;
        return EncoderFallback.Equals (other.EncoderFallback);
}
916
917         // Get the hash code for this object.
// Defers to the base class hash code.
public override int GetHashCode ()
{
        int hash = base.GetHashCode ();
        return hash;
}
922
// Pass-through override: the work is done by the base class implementation.
public override int GetByteCount (string chars)
{
        int byteCount = base.GetByteCount (chars);
        return byteCount;
}
928
// Pass-through override: the work is done by the base class implementation.
[ComVisible (false)]
public override string GetString (byte [] bytes, int index, int count)
{
        string decoded = base.GetString (bytes, index, count);
        return decoded;
}
935
936         // UTF-8 decoder implementation.
// Stateful decoder: remembers an unfinished multi-byte sequence between
// calls so that input may be supplied in arbitrary chunks.
[Serializable]
private class UTF8Decoder : Decoder
{
        // Payload bits of the sequence currently being assembled.
        private uint leftOverBits;
        // Packed progress counters for that sequence, in the format
        // InternalGetChars reads and writes.
        private uint leftOverCount;

        // Constructor.
        public UTF8Decoder (DecoderFallback fallback)
        {
                Fallback = fallback;
                leftOverBits = 0;
                leftOverCount = 0;
        }

        // Override inherited methods.

        // Counts the chars the given bytes would decode to, starting from
        // the current left-over state.  The state is passed by value, so it
        // is not modified; flush is false.
        public override int GetCharCount (byte[] bytes, int index, int count)
        {
                DecoderFallbackBuffer buf = null;
                byte [] bufferArg = null;
                return InternalGetCharCount (bytes, index, count,
                        leftOverBits, leftOverCount, this, ref buf, ref bufferArg, false);
        }

        // Decodes the given bytes, resuming from and updating the left-over
        // state; flush is false, so an unfinished trailing sequence is kept
        // for the next call.
        public override int GetChars (byte[] bytes, int byteIndex,
                                      int byteCount, char[] chars, int charIndex)
        {
                DecoderFallbackBuffer buf = null;
                byte [] bufferArg = null;
                return InternalGetChars (bytes, byteIndex, byteCount,
                        chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref buf, ref bufferArg, false);
        }

        // Returns the decoder to its freshly constructed state.  Without
        // this, a partial sequence from one stream would be continued by
        // the first bytes of the next stream after a Reset.
        public override void Reset ()
        {
                base.Reset ();
                leftOverBits = 0;
                leftOverCount = 0;
        }

} // class UTF8Decoder
969
970         // UTF-8 encoder implementation.
// Stateful encoder: remembers a pending char between calls so that input
// may be supplied in arbitrary chunks.
[Serializable]
private class UTF8Encoder : Encoder
{
        // Pending char carried between GetByteCount calls (presumably an
        // unpaired lead surrogate - confirm against InternalGetByteCount).
        private char leftOverForCount;
        // Pending char carried between GetBytes calls; kept separately so
        // that counting and converting do not disturb each other.
        private char leftOverForConv;

        // Constructor.  emitIdentifier is accepted for the caller but is
        // not used by the encoder.
        public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
        {
                Fallback = fallback;
                leftOverForCount = '\0';
                leftOverForConv = '\0';
        }

        // Override inherited methods.
        public override int GetByteCount (char[] chars, int index,
                                          int count, bool flush)
        {
                return InternalGetByteCount (chars, index, count, Fallback, ref leftOverForCount, flush);
        }

        public override int GetBytes (char[] chars, int charIndex,
                                      int charCount, byte[] bytes, int byteIndex, bool flush)
        {
                // A fresh fallback buffer is created on demand for each call.
                EncoderFallbackBuffer buffer = null;
                return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, Fallback, ref buffer, ref leftOverForConv, flush);
        }

        public unsafe override int GetByteCount (char* chars, int count, bool flush)
        {
                return InternalGetByteCount (chars, count, Fallback, ref leftOverForCount, flush);
        }

        public unsafe override int GetBytes (char* chars, int charCount,
                byte* bytes, int byteCount, bool flush)
        {
                // A fresh fallback buffer is created on demand for each call.
                EncoderFallbackBuffer buffer = null;
                return InternalGetBytes (chars, charCount, bytes, byteCount, Fallback, ref buffer, ref leftOverForConv, flush);
        }

        // Returns the encoder to its freshly constructed state.  Without
        // this, a char left pending by one stream would be combined with
        // the first char of the next stream after a Reset.
        public override void Reset ()
        {
                base.Reset ();
                leftOverForCount = '\0';
                leftOverForConv = '\0';
        }
} // class UTF8Encoder
1018
1019 }; // class UTF8Encoding
1020
1021 }; // namespace System.Text