System.Drawing: added email to icon and test file headers
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30 using System.Runtime.InteropServices;
31
32 [Serializable]
33 [MonoLimitation ("Serialization format not compatible with .NET")]
34 [ComVisible (true)]
35 public class UTF8Encoding : Encoding
36 {
// Code page number that Windows assigns to UTF-8.
internal const int UTF8_CODE_PAGE = 65001;

// Copy of the encoderShouldEmitUTF8Identifier constructor argument.
private bool emitIdentifier;

// Parameterless form: both flags default to false.
public UTF8Encoding ()
        : this (false, false)
{
}

// Single-flag form: throwOnInvalidBytes defaults to false.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
        : this (encoderShouldEmitUTF8Identifier, false)
{
}

// Full form.  Registers the code page with the base class, records the
// identifier flag, selects the fallback pair and fills in the
// descriptive fields inherited from Encoding.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
        : base (UTF8_CODE_PAGE)
{
        emitIdentifier = encoderShouldEmitUTF8Identifier;

        // Exception fallbacks when the caller asked for strictness,
        // the standard "safe" fallbacks otherwise.
        if (!throwOnInvalidBytes) {
                SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
        } else {
                SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
        }

        // Names reported for this encoding.
        header_name = "utf-8";
        body_name = "utf-8";
        web_name = "utf-8";
        encoding_name = "Unicode (UTF-8)";

        // Capability flags.
        is_browser_display = true;
        is_browser_save = true;
        is_mail_news_display = true;
        is_mail_news_save = true;

        windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
65
66         #region GetByteCount()
67
// Array-based front end for the byte counter.  Checks the arguments,
// handles the "nothing left to scan" case itself and otherwise pins the
// array and hands the slice to the pointer-based overload.  leftOver
// carries a pending surrogate start between calls.
private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (index < 0 || index > chars.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
        if (count < 0 || count > (chars.Length - index))
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        if (index != chars.Length) {
                unsafe {
                        fixed (char* first = chars) {
                                return InternalGetByteCount (first + index, count, fallback, ref leftOver, flush);
                        }
                }
        }

        // The slice starts at the very end of the array, so there is
        // nothing to scan.  Only a flush with a pending surrogate start
        // contributes anything: it is counted as three bytes.
        if (!flush || leftOver == '\0')
                return 0;

        leftOver = '\0';
        return 3;
}
98
// Pointer-based byte counter.  Walks count UTF-16 code units starting at
// chars and returns how many UTF-8 bytes they need.  leftOver holds a
// surrogate start that is still waiting for its partner (possibly from a
// previous call); when flush is true a surrogate start that is still
// pending at the end is counted as three bytes and cleared.
private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
{
        char* origin = chars;
        char* limit = chars + count;
        EncoderFallbackBuffer fallbackBuffer = null;
        int total = 0;

        while (chars < limit) {
                if (leftOver != 0) {
                        // A surrogate start is pending: the current
                        // unit decides what happens to it.
                        char current = *chars;
                        if (current >= '\uDC00' && current <= '\uDFFF') {
                                // Completed pair: four bytes, partner consumed.
                                total += 4;
                                chars++;
                        } else {
                                // No partner.  Count whatever the fallback
                                // produces; the current unit is not consumed
                                // and is examined again on the next pass.
                                char [] replacement = GetFallbackChars (chars, origin, fallback, ref fallbackBuffer);
                                fixed (char* rep = replacement) {
                                        char ignored = '\0';
                                        total += InternalGetByteCount (rep, replacement.Length, fallback, ref ignored, true);
                                }
                        }
                        leftOver = '\0';
                        continue;
                }

                // Fast path: nothing pending, classify units one by one.
                while (chars < limit) {
                        char current = *chars;
                        if (current < '\x80') {
                                total++;
                        } else if (current < '\x800') {
                                total += 2;
                        } else if (current < '\uD800' || current > '\uDFFF') {
                                total += 3;
                        } else if (current <= '\uDBFF') {
                                // Surrogate start.  If its partner is the very
                                // next unit, count the pair right here.
                                if (chars + 1 < limit && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
                                        total += 4;
                                        chars += 2;
                                        continue;
                                }
                                // Otherwise remember it and let the outer
                                // loop deal with it.
                                leftOver = current;
                                chars++;
                                break;
                        } else {
                                // Surrogate tail with no start in front of
                                // it: count the fallback's replacement.
                                char [] replacement = GetFallbackChars (chars, origin, fallback, ref fallbackBuffer);
                                fixed (char* rep = replacement) {
                                        char ignored = '\0';
                                        total += InternalGetByteCount (rep, replacement.Length, fallback, ref ignored, true);
                                }

                                leftOver = '\0';
                        }
                        chars++;
                }
        }

        // A surrogate start still pending at the end costs three bytes
        // when the caller asked for a flush.
        if (flush && leftOver != '\0') {
                total += 3;
                leftOver = '\0';
        }
        return total;
}
168
// Runs the encoder fallback for the code unit at chars and returns the
// replacement characters it produces.  The fallback buffer is created on
// first use (and handed back through the ref parameter) and is reset
// before returning.  The index passed to the fallback is the distance
// of chars from start.
unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
{
        if (buffer == null) {
                buffer = fallback.CreateFallbackBuffer ();
        }

        int offset = (int) (chars - start);
        buffer.Fallback (*chars, offset);

        // Drain everything the fallback queued up.
        char [] replacement = new char [buffer.Remaining];
        int filled = 0;
        while (filled < replacement.Length) {
                replacement [filled] = buffer.GetNextChar ();
                filled++;
        }

        buffer.Reset ();
        return replacement;
}
184
// Returns the number of bytes needed to encode count characters of
// chars starting at index.  This is a one-shot count: it starts with no
// pending surrogate and always flushes.
public override int GetByteCount (char[] chars, int index, int count)
{
        char pendingSurrogate = '\0';
        int needed = InternalGetByteCount (chars, index, count, EncoderFallback, ref pendingSurrogate, true);
        return needed;
}
191
192
// Pointer overload of GetByteCount.  A null pointer is rejected, a zero
// count short-circuits to 0, and anything else is counted in one shot
// (no pending surrogate on entry, flush at the end).
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetByteCount (char* chars, int count)
{
        if (chars == null) {
                throw new ArgumentNullException ("chars");
        }

        if (count != 0) {
                char pendingSurrogate = '\0';
                return InternalGetByteCount (chars, count, EncoderFallback, ref pendingSurrogate, true);
        }
        return 0;
}
204
205         #endregion
206
207         #region GetBytes()
208
// Array-based front end for the encoder.  Checks the arguments, then
// pins both arrays and forwards the slices to the pointer-based
// overload.  leftOver and buffer carry encoder state between calls.
private static int InternalGetBytes (char[] chars, int charIndex,
                                     int charCount, byte[] bytes,
                                     int byteIndex,
                                     EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
                                     ref char leftOver, bool flush)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
        if (charCount < 0 || charCount > (chars.Length - charIndex))
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

        if (charIndex == chars.Length) {
                // FIXME: use EncoderFallback.
                //
                // Nothing to encode.  On flush a pending surrogate start
                // is simply discarded; no bytes are produced for it.
                if (flush && leftOver != '\0')
                        leftOver = '\0';
                return 0;
        }

        // Space available in the output array from byteIndex onwards.
        int room = bytes.Length - byteIndex;

        unsafe {
                fixed (char* source = chars) {
                        // With no room at all the pointer overload is given
                        // a null output pointer and a zero byte budget.
                        if (room == 0)
                                return InternalGetBytes (
                                        source + charIndex, charCount,
                                        null, 0, fallback, ref buffer, ref leftOver, flush);

                        fixed (byte* target = bytes) {
                                return InternalGetBytes (
                                        source + charIndex, charCount,
                                        target + byteIndex, room,
                                        fallback, ref buffer,
                                        ref leftOver, flush);
                        }
                }
        }
}
260
// Pointer-based encoder.  Encodes count UTF-16 code units starting at
// chars as UTF-8 into the bcount bytes starting at bytes and returns the
// number of bytes written.
//
//   leftOver  surrogate start still waiting for its partner; may be set
//             on entry (from a previous call) and on exit (when flush is
//             false and the input ends on a surrogate start).
//   buffer    encoder fallback buffer, created on demand by
//             GetFallbackChars and kept for later calls.
//   flush     when true, a surrogate start still pending at the end is
//             written out as a three-byte sequence and cleared.
//
// Throws ArgumentException ("Insufficient Space") as soon as the next
// sequence would not fit into the output.
private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
{
        char* end = chars + count;
        char* start = chars;
        byte* start_bytes = bytes;
        byte* end_bytes = bytes + bcount;
        while (chars < end) {
                if (leftOver == 0) {
                        for (; chars < end; chars++) {
                                int ch = *chars;
                                if (ch < '\x80') {
                                        // One byte.
                                        if (bytes >= end_bytes)
                                                goto fail_no_space;
                                        *bytes++ = (byte)ch;
                                } else if (ch < '\x800') {
                                        // Two bytes.
                                        if (bytes + 1 >= end_bytes)
                                                goto fail_no_space;
                                        bytes [0] = (byte) (0xC0 | (ch >> 6));
                                        bytes [1] = (byte) (0x80 | (ch & 0x3F));
                                        bytes += 2;
                                } else if (ch < '\uD800' || ch > '\uDFFF') {
                                        // Three bytes (non-surrogate).
                                        if (bytes + 2 >= end_bytes)
                                                goto fail_no_space;
                                        bytes [0] = (byte) (0xE0 | (ch >> 12));
                                        bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                                        bytes [2] = (byte) (0x80 | (ch & 0x3F));
                                        bytes += 3;
                                } else if (ch <= '\uDBFF') {
                                        // Surrogate start: remember it and let the
                                        // outer loop pair it with the next unit.
                                        leftOver = *chars;
                                        chars++;
                                        break;
                                } else {
                                        // Surrogate tail without a leading surrogate:
                                        // emit whatever the fallback supplies, after
                                        // checking that it fits.
                                        char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                                        char dummy = '\0';
                                        if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                                                goto fail_no_space;
                                        fixed (char *fb_chars = fallback_chars) {
                                                bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                                        }

                                        leftOver = '\0';
                                }
                        }
                } else {
                        if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
                                // Correct surrogate pair: combine both halves
                                // into one code point and write four bytes.
                                int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
                                if (bytes + 3 >= end_bytes)
                                        goto fail_no_space;
                                bytes [0] = (byte) (0xF0 | (ch >> 18));
                                bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
                                bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                                bytes [3] = (byte) (0x80 | (ch & 0x3F));
                                bytes += 4;
                                chars++;
                        } else {
                                // Surrogate start followed by something that is
                                // not a surrogate tail.  Emit the fallback's
                                // replacement and revisit the current unit on
                                // the next pass (chars is not advanced).
                                // NOTE(review): the fallback is invoked with the
                                // current unit and its index, not with the
                                // pending surrogate itself - confirm intended.
                                char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                                char dummy = '\0';
                                if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                                        goto fail_no_space;
                                fixed (char *fb_chars = fallback_chars) {
                                        // The output pointer must move past the
                                        // replacement bytes; otherwise the next
                                        // unit overwrites them and the returned
                                        // length disagrees with InternalGetByteCount.
                                        bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                                }
                        }
                        leftOver = '\0';
                }
        }
        if (flush) {
                // Flush the left-over surrogate pair start as a plain
                // three-byte sequence.
                if (leftOver != '\0') {
                        int ch = leftOver;
                        if (bytes + 2 < end_bytes) {
                                bytes [0] = (byte) (0xE0 | (ch >> 12));
                                bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                                bytes [2] = (byte) (0x80 | (ch & 0x3F));
                                bytes += 3;
                        } else {
                                goto fail_no_space;
                        }
                        leftOver = '\0';
                }
        }
        return (int) (bytes - start_bytes);
fail_no_space:
        throw new ArgumentException ("Insufficient Space", "bytes");
}
359
// Encodes charCount characters of chars, starting at charIndex, into
// bytes starting at byteIndex and returns the number of bytes written.
// One-shot operation: no pending surrogate and no fallback buffer on
// entry, flush at the end.
public override int GetBytes (char[] chars, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        EncoderFallbackBuffer fallbackBuffer = null;
        char pendingSurrogate = '\0';
        int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);
        return written;
}
368
// String flavour of GetBytes.  Checks the arguments, then pins the
// string and the output array and encodes the requested range in one
// shot (no pending surrogate on entry, flush at the end).
public override int GetBytes (String s, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        if (s == null)
                throw new ArgumentNullException ("s");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (charIndex < 0 || charIndex > s.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
        if (charCount < 0 || charCount > (s.Length - charIndex))
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

        // Range starts at the end of the string: nothing to encode.
        if (charIndex == s.Length)
                return 0;

        // Space available in the output array from byteIndex onwards.
        int room = bytes.Length - byteIndex;
        char pendingSurrogate = '\0';
        EncoderFallbackBuffer fallbackBuffer = null;

        unsafe {
                fixed (char* text = s) {
                        // No room at all: pass a null output pointer and a
                        // zero byte budget.
                        if (room == 0)
                                return InternalGetBytes (
                                        text + charIndex, charCount,
                                        null, 0, EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);

                        fixed (byte* target = bytes) {
                                return InternalGetBytes (
                                        text + charIndex, charCount,
                                        target + byteIndex, room,
                                        EncoderFallback, ref fallbackBuffer,
                                        ref pendingSurrogate, true);
                        }
                }
        }
}
411
// Pointer overload of GetBytes.  Encodes charCount UTF-16 code units at
// chars into at most byteCount bytes at bytes and returns the number of
// bytes written.  One-shot operation: no pending surrogate on entry,
// flush at the end.
//
// Throws ArgumentNullException for a null chars or bytes pointer and
// IndexOutOfRangeException for a negative charCount or byteCount.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (charCount < 0)
                throw new IndexOutOfRangeException ("charCount");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        // Report the parameter that is actually at fault.
        if (byteCount < 0)
                throw new IndexOutOfRangeException ("byteCount");

        if (charCount == 0)
                return 0;

        char dummy = '\0';
        EncoderFallbackBuffer buffer = null;
        // With a zero byte budget the internal encoder is given a null
        // output pointer; it fails as soon as it needs to write anything.
        if (byteCount == 0)
                return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
        else
                return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
}
435
436         #endregion
437
// Array-based front end for the character counter.  Checks the
// arguments, returns 0 for an empty range and otherwise pins the array
// and forwards the slice, together with the rolling decoder state
// (leftOverBits / leftOverCount), to the pointer-based overload.
private unsafe static int InternalGetCharCount (
        byte[] bytes, int index, int count, uint leftOverBits,
        uint leftOverCount, object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (index < 0 || index > bytes.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
        if (count < 0 || count > (bytes.Length - index))
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        // Empty range: nothing is counted, whatever the rolling state.
        if (count == 0)
                return 0;

        fixed (byte* first = bytes) {
                byte* slice = first + index;
                return InternalGetCharCount (slice, count,
                        leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
        }
}
462
463         private unsafe static int InternalGetCharCount (
464                 byte* bytes, int count, uint leftOverBits,
465                 uint leftOverCount, object provider,
466                 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
467         {
468                 int index = 0;
469
470                 int length = 0;
471
472                 if (leftOverCount == 0) {
473                         int end = index + count;
474                         for (; index < end; index++, count--) {
475                                 if (bytes [index] < 0x80)
476                                         length++;
477                                 else
478                                         break;
479                         }
480                 }
481
482                 // Determine the number of characters that we have.
483                 uint ch;
484                 uint leftBits = leftOverBits;
485                 uint leftSoFar = (leftOverCount & (uint)0x0F);
486                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
487                 while (count > 0) {
488                         ch = (uint)(bytes[index++]);
489                         --count;
490                         if (leftSize == 0) {
491                                 // Process a UTF-8 start character.
492                                 if (ch < (uint)0x0080) {
493                                         // Single-byte UTF-8 character.
494                                         ++length;
495                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
496                                         // Double-byte UTF-8 character.
497                                         leftBits = (ch & (uint)0x1F);
498                                         leftSoFar = 1;
499                                         leftSize = 2;
500                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
501                                         // Three-byte UTF-8 character.
502                                         leftBits = (ch & (uint)0x0F);
503                                         leftSoFar = 1;
504                                         leftSize = 3;
505                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
506                                         // Four-byte UTF-8 character.
507                                         leftBits = (ch & (uint)0x07);
508                                         leftSoFar = 1;
509                                         leftSize = 4;
510                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
511                                         // Five-byte UTF-8 character.
512                                         leftBits = (ch & (uint)0x03);
513                                         leftSoFar = 1;
514                                         leftSize = 5;
515                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
516                                         // Six-byte UTF-8 character.
517                                         leftBits = (ch & (uint)0x03);
518                                         leftSoFar = 1;
519                                         leftSize = 6;
520                                 } else {
521                                         // Invalid UTF-8 start character.
522                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
523                                 }
524                         } else {
525                                 // Process an extra byte in a multi-byte sequence.
526                                 if ((ch & (uint)0xC0) == (uint)0x80) {
527                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
528                                         if (++leftSoFar >= leftSize) {
529                                                 // We have a complete character now.
530                                                 if (leftBits < (uint)0x10000) {
531                                                         // is it an overlong ?
532                                                         bool overlong = false;
533                                                         switch (leftSize) {
534                                                         case 2:
535                                                                 overlong = (leftBits <= 0x7F);
536                                                                 break;
537                                                         case 3:
538                                                                 overlong = (leftBits <= 0x07FF);
539                                                                 break;
540                                                         case 4:
541                                                                 overlong = (leftBits <= 0xFFFF);
542                                                                 break;
543                                                         case 5:
544                                                                 overlong = (leftBits <= 0x1FFFFF);
545                                                                 break;
546                                                         case 6:
547                                                                 overlong = (leftBits <= 0x03FFFFFF);
548                                                                 break;
549                                                         }
550                                                         if (overlong) {
551                                                                 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
552                                                         }
553                                                         else if ((leftBits & 0xF800) == 0xD800) {
554                                                                 // UTF-8 doesn't use surrogate characters
555                                                                 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
556                                                         }
557                                                         else
558                                                                 ++length;
559                                                 } else if (leftBits < (uint)0x110000) {
560                                                         length += 2;
561                                                 } else {
562                                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
563                                                 }
564                                                 leftSize = 0;
565                                         }
566                                 } else {
567                                         // Invalid UTF-8 sequence: clear and restart.
568                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
569                                         leftSize = 0;
570                                         --index;
571                                         ++count;
572                                 }
573                         }
574                 }
575                 if (flush && leftSize != 0) {
576                         // We had left-over bytes that didn't make up
577                         // a complete UTF-8 character sequence.
578                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
579                 }
580
581                 // Return the final length to the caller.
582                 return length;
583         }
584
585         // for GetCharCount()
// Counts the characters the decoder fallback produces for `size` invalid
// bytes starting at bytes [index]. The bytes are handed to the fallback
// buffer one at a time and the buffer is reset after each of them.
//
// provider  - either a DecoderFallback (a new buffer is created from it) or
//             a Decoder (its FallbackBuffer is used).
// buffer    - fallback buffer cache; filled in on first use.
// bufferArg - one-byte scratch array cache; allocated on first use.
// Returns the sum of buffer.Remaining over all the bytes.
static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
{
        if (buffer == null) {
                DecoderFallback fallback = provider as DecoderFallback;
                buffer = (fallback == null)
                        ? ((Decoder) provider).FallbackBuffer
                        : fallback.CreateFallbackBuffer ();
        }
        if (bufferArg == null)
                bufferArg = new byte [1];

        int total = 0;
        int consumed = 0;
        while (consumed < size) {
                bufferArg [0] = bytes [(int) index + consumed];
                // The fallback is always told position 0 of the scratch array.
                buffer.Fallback (bufferArg, 0);
                total += buffer.Remaining;
                buffer.Reset ();
                consumed++;
        }
        return total;
}
606
607         // for GetChars()
// Runs the decoder fallback over `size` invalid bytes starting at
// bytes [byteIndex] and appends whatever it produces to `chars`, advancing
// charIndex past every character written.
//
// provider  - either a DecoderFallback (a new buffer is created from it) or
//             a Decoder (its FallbackBuffer is used).
// buffer    - fallback buffer cache; filled in on first use.
// bufferArg - one-byte scratch array cache; allocated on first use.
//
// NOTE: this helper has no knowledge of the capacity of `chars`; the writes
// below are not bounds-checked.
static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
        char* chars, ref int charIndex)
{
        if (buffer == null) {
                DecoderFallback fallback = provider as DecoderFallback;
                buffer = (fallback == null)
                        ? ((Decoder) provider).FallbackBuffer
                        : fallback.CreateFallbackBuffer ();
        }
        if (bufferArg == null)
                bufferArg = new byte [1];

        int consumed = 0;
        while (consumed < size) {
                bufferArg [0] = bytes [byteIndex + consumed];
                // The fallback is always told position 0 of the scratch array.
                buffer.Fallback (bufferArg, 0);
                // Drain everything the fallback produced for this byte.
                for (;;) {
                        if (buffer.Remaining <= 0)
                                break;
                        chars [charIndex++] = buffer.GetNextChar ();
                }
                buffer.Reset ();
                consumed++;
        }
}
628
629         // Get the number of characters needed to decode a byte buffer.
// Stateless count: starts with no left-over bits and asks for a flush, so
// the whole range is treated as a complete input. The fallback buffer and
// scratch array are created on demand by the helper.
public override int GetCharCount (byte[] bytes, int index, int count)
{
        DecoderFallbackBuffer fallbackBuffer = null;
        byte [] scratch = null;
        int charCount = InternalGetCharCount (
                bytes, index, count,
                0, 0,
                DecoderFallback,
                ref fallbackBuffer, ref scratch,
                true);
        return charCount;
}
636
// Pointer-based variant of the stateless count: no left-over bits, flush
// requested, fallback buffer and scratch array created on demand.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetCharCount (byte* bytes, int count)
{
        DecoderFallbackBuffer fallbackBuffer = null;
        byte [] scratch = null;
        int charCount = InternalGetCharCount (
                bytes, count,
                0, 0,
                DecoderFallback,
                ref fallbackBuffer, ref scratch,
                true);
        return charCount;
}
645
646         // Get the characters that result from decoding a byte buffer.
// Array front end for the pointer-based decoder: validates the arguments,
// pins the arrays and forwards to the byte*/char* overload.
//
// Throws ArgumentNullException when bytes or chars is null and
// ArgumentOutOfRangeException when byteIndex, byteCount or charIndex fall
// outside their arrays. When charIndex == chars.Length there is no room at
// all and 0 is returned without looking at the input.
private unsafe static int InternalGetChars (
        byte[] bytes, int byteIndex, int byteCount, char[] chars,
        int charIndex, ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
        if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

        int room = chars.Length - charIndex;
        if (room == 0)
                return 0;

        fixed (char* charBase = chars) {
                char* dest = charBase + charIndex;

                // Nothing to read: call the worker with a null input pointer
                // so that pending state and the flush request are still handled.
                bool noInput = (byteCount == 0 || byteIndex == bytes.Length);
                if (noInput)
                        return InternalGetChars (null, 0, dest, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);

                fixed (byte* byteBase = bytes)
                        return InternalGetChars (byteBase + byteIndex, byteCount, dest, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
        }
}
681
// Decodes byteCount UTF-8 bytes starting at `bytes` into UTF-16 code units
// written to `chars`, which has room for charCount of them.
//
// leftOverBits  - in/out: payload bits of a multi-byte sequence that was
//                 still incomplete when the previous call ended.
// leftOverCount - in/out: packed state of that sequence; the low 4 bits hold
//                 the number of bytes seen so far and the next 4 bits hold
//                 the total sequence size (0 means "no sequence pending").
// provider      - DecoderFallback or Decoder handed to Fallback () when
//                 invalid input is found.
// fallbackBuffer, bufferArg - caches filled in lazily by Fallback ().
// flush         - when true, an incomplete trailing sequence is passed to
//                 the fallback instead of only being kept as state.
//
// Returns the number of chars written. Throws ArgumentException
// ("Arg_InsufficientSpace") when a decoded character does not fit.
//
// NOTE(review): output produced by Fallback () is written without a capacity
// check, because that helper does not receive the buffer size.
// NOTE(review): when a sequence is carried over from a previous call and is
// then found to be invalid (or is flushed), `byteIndex - leftSoFar` can be
// negative, i.e. it refers to bytes that are not part of this buffer -
// confirm and consider keeping the pending bytes in the decoder state.
private unsafe static int InternalGetChars (
        byte* bytes, int byteCount, char* chars, int charCount,
        ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        int byteIndex = 0;
        int length = charCount;
        int posn = 0;

        // Fast path: with no pending sequence, copy the leading run of
        // ASCII bytes directly. The capacity check mirrors the one in the
        // general loop below so a short output buffer raises an exception
        // instead of being overrun.
        if (leftOverCount == 0) {
                int end = byteIndex + byteCount;
                for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
                        if (bytes [byteIndex] >= 0x80)
                                break;
                        if (posn >= length) {
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                        }
                        chars [posn] = (char) bytes [byteIndex];
                }
        }

        // Convert the bytes into the output buffer.
        uint ch;
        uint leftBits = leftOverBits;
        uint leftSoFar = (leftOverCount & (uint)0x0F);
        uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

        int byteEnd = byteIndex + byteCount;
        for(; byteIndex < byteEnd; byteIndex++) {
                // Fetch the next character from the byte buffer.
                ch = (uint)(bytes[byteIndex]);
                if (leftSize == 0) {
                        // Process a UTF-8 start character.
                        if (ch < (uint)0x0080) {
                                // Single-byte UTF-8 character.
                                if (posn >= length) {
                                        throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                                }
                                chars[posn++] = (char)ch;
                        } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                                // Double-byte UTF-8 character.
                                leftBits = (ch & (uint)0x1F);
                                leftSoFar = 1;
                                leftSize = 2;
                        } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                                // Three-byte UTF-8 character.
                                leftBits = (ch & (uint)0x0F);
                                leftSoFar = 1;
                                leftSize = 3;
                        } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                                // Four-byte UTF-8 character.
                                leftBits = (ch & (uint)0x07);
                                leftSoFar = 1;
                                leftSize = 4;
                        } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                                // Five-byte UTF-8 character.
                                leftBits = (ch & (uint)0x03);
                                leftSoFar = 1;
                                leftSize = 5;
                        } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                                // Six-byte UTF-8 character. Bit 1 is always
                                // clear here, so this mask keeps one payload bit.
                                leftBits = (ch & (uint)0x03);
                                leftSoFar = 1;
                                leftSize = 6;
                        } else {
                                // Invalid UTF-8 start character.
                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
                        }
                } else {
                        // Process an extra byte in a multi-byte sequence.
                        if ((ch & (uint)0xC0) == (uint)0x80) {
                                leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                                if (++leftSoFar >= leftSize) {
                                        // We have a complete character now.
                                        if (leftBits < (uint)0x10000) {
                                                // Reject encodings that use more bytes
                                                // than the value needs (overlong forms).
                                                bool overlong = false;
                                                switch (leftSize) {
                                                case 2:
                                                        overlong = (leftBits <= 0x7F);
                                                        break;
                                                case 3:
                                                        overlong = (leftBits <= 0x07FF);
                                                        break;
                                                case 4:
                                                        overlong = (leftBits <= 0xFFFF);
                                                        break;
                                                case 5:
                                                        overlong = (leftBits <= 0x1FFFFF);
                                                        break;
                                                case 6:
                                                        overlong = (leftBits <= 0x03FFFFFF);
                                                        break;
                                                }
                                                if (overlong) {
                                                        Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                                }
                                                else if ((leftBits & 0xF800) == 0xD800) {
                                                        // UTF-8 doesn't use surrogate characters
                                                        Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                                }
                                                else {
                                                        if (posn >= length) {
                                                                throw new ArgumentException
                                                                        (_("Arg_InsufficientSpace"), "chars");
                                                        }
                                                        chars[posn++] = (char)leftBits;
                                                }
                                        } else if (leftBits < (uint)0x110000) {
                                                // Supplementary character: emit a
                                                // high/low surrogate pair.
                                                if ((posn + 2) > length) {
                                                        throw new ArgumentException
                                                                (_("Arg_InsufficientSpace"), "chars");
                                                }
                                                leftBits -= (uint)0x10000;
                                                chars[posn++] = (char)((leftBits >> 10) +
                                                                       (uint)0xD800);
                                                chars[posn++] =
                                                        (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                                        } else {
                                                // Value beyond U+10FFFF.
                                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                        }
                                        leftSize = 0;
                                }
                        } else {
                                // Invalid UTF-8 sequence: clear and restart.
                                // Stepping back makes the loop re-read this
                                // byte as a potential start byte.
                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                leftSize = 0;
                                --byteIndex;
                        }
                }
        }
        if (flush && leftSize != 0) {
                // We had left-over bytes that didn't make up
                // a complete UTF-8 character sequence.
                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
        }

        // Hand the (possibly pending) sequence state back to the caller.
        leftOverBits = leftBits;
        leftOverCount = (leftSoFar | (leftSize << 4));

        // Return the final length to the caller.
        return posn;
}
823
824         // Get the characters that result from decoding a byte buffer.
// Stateless decode: starts without any pending sequence and requests a
// flush, so an incomplete trailing sequence goes through the fallback. The
// left-over state written back by the helper is discarded.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                              char[] chars, int charIndex)
{
        DecoderFallbackBuffer fallbackBuffer = null;
        byte [] scratch = null;
        uint pendingBits = 0;
        uint pendingCount = 0;
        int written = InternalGetChars (
                bytes, byteIndex, byteCount,
                chars, charIndex,
                ref pendingBits, ref pendingCount,
                DecoderFallback,
                ref fallbackBuffer, ref scratch,
                true);
        return written;
}
835
// Pointer-based stateless decode: no pending sequence on entry, flush
// requested, and the left-over state written back by the helper is discarded.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
{
        uint pendingBits = 0;
        uint pendingCount = 0;
        DecoderFallbackBuffer fallbackBuffer = null;
        byte [] scratch = null;
        int written = InternalGetChars (
                bytes, byteCount,
                chars, charCount,
                ref pendingBits, ref pendingCount,
                DecoderFallback,
                ref fallbackBuffer, ref scratch,
                true);
        return written;
}
847
848         // Get the maximum number of bytes needed to encode a
849         // specified number of characters.
// Returns an upper bound of four bytes per input char.
//
// Throws ArgumentOutOfRangeException when charCount is negative, or when
// the bound does not fit in an Int32 (previously the multiplication
// silently wrapped around and could yield a negative or too-small size).
public override int GetMaxByteCount (int charCount)
{
        if (charCount < 0) {
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
        }
        // Widen before multiplying so the overflow can be detected.
        long maxBytes = (long) charCount * 4;
        if (maxBytes > Int32.MaxValue) {
                throw new ArgumentOutOfRangeException ("charCount");
        }
        return (int) maxBytes;
}
857
858         // Get the maximum number of characters needed to decode a
859         // specified number of bytes.
// Returns an upper bound on the chars produced by decoding byteCount bytes.
//
// Valid input never yields more than one char per byte. Invalid input is
// handed to the decoder fallback one byte at a time, so each byte can
// expand to DecoderFallback.MaxCharCount chars; the bound is scaled
// accordingly when that value exceeds one.
//
// Throws ArgumentOutOfRangeException when byteCount is negative or when
// the bound does not fit in an Int32.
public override int GetMaxCharCount (int byteCount)
{
        if (byteCount < 0) {
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
        }
        long maxChars = byteCount;
        int charsPerFallback = DecoderFallback.MaxCharCount;
        if (charsPerFallback > 1) {
                maxChars *= charsPerFallback;
        }
        if (maxChars > Int32.MaxValue) {
                throw new ArgumentOutOfRangeException ("byteCount");
        }
        return (int) maxChars;
}
867
868         // Get a UTF8-specific decoder that is attached to this instance.
// Each call creates a fresh UTF8Decoder configured with this encoding's
// DecoderFallback.
public override Decoder GetDecoder ()
{
        Decoder decoder = new UTF8Decoder (DecoderFallback);
        return decoder;
}
873
874         // Get a UTF8-specific encoder that is attached to this instance.
// Each call creates a fresh UTF8Encoder configured with this encoding's
// EncoderFallback and its emitIdentifier setting.
public override Encoder GetEncoder ()
{
        Encoder encoder = new UTF8Encoder (EncoderFallback, emitIdentifier);
        return encoder;
}
879
880         // Get the UTF8 preamble.
// Returns the three bytes EF BB BF when emitIdentifier is set, otherwise an
// empty array. A new array is allocated on every call.
public override byte[] GetPreamble ()
{
        byte [] preamble;
        if (!emitIdentifier)
                preamble = new byte [0];
        else
                preamble = new byte [] { 0xEF, 0xBB, 0xBF };
        return preamble;
}
888
889         // Determine if this object is equal to another.
// Two UTF8Encoding instances are equal when they agree on code page,
// emitIdentifier, decoder fallback and encoder fallback. Anything that is
// not a UTF8Encoding (including null) compares unequal.
public override bool Equals (Object value)
{
        UTF8Encoding other = value as UTF8Encoding;
        if (other == null)
                return false;

        if (codePage != other.codePage)
                return false;
        if (emitIdentifier != other.emitIdentifier)
                return false;
        if (!DecoderFallback.Equals (other.DecoderFallback))
                return false;
        return EncoderFallback.Equals (other.EncoderFallback);
}
902
903         // Get the hash code for this object.
// Defers entirely to the base class implementation; present alongside the
// Equals override.
public override int GetHashCode ()
{
        int hash = base.GetHashCode ();
        return hash;
}
908
// Pure pass-through to the base class implementation.
public override int GetByteCount (string chars)
{
        // The original author already questioned whether this override
        // is needed; it adds no behavior of its own.
        int byteCount = base.GetByteCount (chars);
        return byteCount;
}
914
// Pure pass-through to the base class implementation.
[ComVisible (false)]
public override string GetString (byte [] bytes, int index, int count)
{
        // The original author already questioned whether this override
        // is needed; it adds no behavior of its own.
        string decoded = base.GetString (bytes, index, count);
        return decoded;
}
921
922         // UTF-8 decoder implementation.
// Stateful decoder handed out by GetDecoder (). It passes itself as the
// fallback provider and never requests a flush.
[Serializable]
private class UTF8Decoder : Decoder
{
        // State of a multi-byte sequence left unfinished by the previous
        // GetChars call: the payload bits gathered so far and the packed
        // "bytes seen / sequence size" counters.
        private uint leftOverBits;
        private uint leftOverCount;

        // Starts with no pending sequence and installs the given fallback.
        public UTF8Decoder (DecoderFallback fallback)
        {
                Fallback = fallback;
                leftOverBits = 0;
                leftOverCount = 0;
        }

        // Counts the chars the given bytes decode to, starting from the
        // current pending state. The state is passed by value here, so
        // counting does not modify the decoder.
        public override int GetCharCount (byte[] bytes, int index, int count)
        {
                DecoderFallbackBuffer fallbackBuffer = null;
                byte [] scratch = null;
                int charCount = InternalGetCharCount (
                        bytes, index, count,
                        leftOverBits, leftOverCount,
                        this,
                        ref fallbackBuffer, ref scratch,
                        false);
                return charCount;
        }

        // Decodes the given bytes; the pending state is passed by reference
        // and therefore updated for the next call.
        public override int GetChars (byte[] bytes, int byteIndex,
                                      int byteCount, char[] chars, int charIndex)
        {
                DecoderFallbackBuffer fallbackBuffer = null;
                byte [] scratch = null;
                int written = InternalGetChars (
                        bytes, byteIndex, byteCount,
                        chars, charIndex,
                        ref leftOverBits, ref leftOverCount,
                        this,
                        ref fallbackBuffer, ref scratch,
                        false);
                return written;
        }

} // class UTF8Decoder
955
956         // UTF-8 encoder implementation.
// Stateful encoder handed out by GetEncoder ().
[Serializable]
private class UTF8Encoder : Encoder
{
        // Per-operation carry-over chars passed by reference to the
        // counting and converting helpers respectively - presumably a
        // pending surrogate kept between calls; confirm against
        // InternalGetByteCount / InternalGetBytes.
        private char leftOverForCount;
        private char leftOverForConv;

        // Installs the fallback and clears both carry-over chars.
        // emitIdentifier is accepted but not stored or used by this class.
        public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
        {
                Fallback = fallback;
                leftOverForCount = '\0';
                leftOverForConv = '\0';
        }

        // Counts the bytes needed for a range of a char array.
        public override int GetByteCount (char[] chars, int index,
                                          int count, bool flush)
        {
                int byteCount = InternalGetByteCount (chars, index, count, Fallback, ref leftOverForCount, flush);
                return byteCount;
        }

        // Encodes a range of a char array into a byte array.
        public override int GetBytes (char[] chars, int charIndex,
                                      int charCount, byte[] bytes, int byteIndex, bool flush)
        {
                // The fallback buffer is not kept between calls.
                EncoderFallbackBuffer fallbackBuffer = null;
                return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
        }

        // Counts the bytes needed for `count` chars at a raw pointer.
        public unsafe override int GetByteCount (char* chars, int count, bool flush)
        {
                int byteCount = InternalGetByteCount (chars, count, Fallback, ref leftOverForCount, flush);
                return byteCount;
        }

        // Encodes charCount chars at a raw pointer into a raw byte buffer.
        public unsafe override int GetBytes (char* chars, int charCount,
                byte* bytes, int byteCount, bool flush)
        {
                // The fallback buffer is not kept between calls.
                EncoderFallbackBuffer fallbackBuffer = null;
                return InternalGetBytes (chars, charCount, bytes, byteCount, Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
        }
} // class UTF8Encoder
1004
1005 }; // class UTF8Encoding
1006
1007 }; // namespace System.Text