New test.
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30 using System.Runtime.InteropServices;
31
32 [Serializable]
33 [MonoTODO ("Fix serialization compatibility with MS.NET")]
34 #if NET_2_0
35 [MonoTODO ("EncoderFallback is not handled")]
36 [ComVisible (true)]
37 #endif
38 public class UTF8Encoding : Encoding
39 {
40         // Magic number used by Windows for UTF-8.
41         internal const int UTF8_CODE_PAGE = 65001;
42
43         // Internal state.
44         private bool emitIdentifier;
45 #if !NET_2_0
46         private bool throwOnInvalid;
47 #endif
48
49         // Constructors.
// Default constructor: forwards (false, false) to the two-argument
// constructor.
public UTF8Encoding ()
        : this (false, false)
{
}

// Forwards the identifier flag with throwOnInvalidBytes = false.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
        : this (encoderShouldEmitUTF8Identifier, false)
{
}

// Main constructor.  Records the identifier flag, configures how invalid
// bytes are treated for the active profile, and fills in the descriptive
// fields of the encoding.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
        : base (UTF8_CODE_PAGE)
{
        emitIdentifier = encoderShouldEmitUTF8Identifier;

#if NET_2_0
        // Pick the decoder fallback: an exception fallback when the caller
        // asked for strictness, otherwise a replacement fallback built
        // from the empty string.
        DecoderFallback decoderFallback;
        if (throwOnInvalidBytes)
                decoderFallback = new DecoderExceptionFallback ();
        else
                decoderFallback = new DecoderReplacementFallback (String.Empty);
        SetFallbackInternal (null, decoderFallback);
#else
        throwOnInvalid = throwOnInvalidBytes;
#endif

        // Same assignment order as the chained form: header, body, web.
        header_name = "utf-8";
        body_name = "utf-8";
        web_name = "utf-8";
        encoding_name = "Unicode (UTF-8)";

        is_browser_save = true;
        is_browser_display = true;
        is_mail_news_display = true;

        windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
74
75         #region GetByteCount()
76
77         // Internal version of "GetByteCount" which can handle a rolling
78         // state between multiple calls to this method.
// Array front end: checks the arguments, handles the "nothing left to
// scan" case, then pins the array and delegates to the pointer overload.
//
// chars/index/count - the slice of characters to measure.
// leftOver          - pending character carried between calls; updated.
// flush             - whether a pending character must be accounted for now.
// Returns the byte count.
private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (index < 0 || index > chars.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
        if (count < 0 || count > (chars.Length - index))
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        // Positioned at the very end of the array: the only thing that can
        // contribute is a pending character that has to be flushed.
        if (index == chars.Length) {
                if (!flush || leftOver == '\0')
                        return 0;
                leftOver = '\0';
                return 3;
        }

        unsafe {
                fixed (char* start = chars) {
                        return InternalGetByteCount (start + index, count, ref leftOver, flush);
                }
        }
}
107
108
// Counts the bytes produced for "count" UTF-16 code units starting at
// "chars".
//
// leftOver - pending character from a previous call (or '\0'); any
//            non-zero value is treated as a pending surrogate start.
//            On return it holds the new pending character, or '\0'
//            when flush is true.
// flush    - when true, a still-pending character is counted as a
//            three-byte sequence instead of being carried over.
// Returns the number of bytes counted.
private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
{
        int index = 0;
        int length = 0;
        char pair = leftOver;

        while (count > 0) {
                char ch = chars [index];
                if (pair == 0) {
                        if (ch < '\u0080') {
                                // Fast path: consume the whole run of ASCII
                                // characters, one byte each.
                                int end = index + count;
                                for (; index < end; index++, count--) {
                                        if (chars [index] < '\x80')
                                                ++length;
                                        else
                                                break;
                                }
                                continue;
                        } else if (ch < '\u0800') {
                                length += 2;
                        } else if (ch >= '\uD800' && ch <= '\uDBFF') {
                                // Start of a surrogate pair: remember it and
                                // count it once we see what follows.
                                pair = ch;
                        } else {
                                // Any other character, including a surrogate
                                // tail with no leading surrogate, is counted
                                // as three bytes.
                                length += 3;
                        }
                } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
                        // "pair" is known to be non-zero here, so this
                        // completes a surrogate pair.
                        length += 4;
                        pair = '\0';
                } else {
                        // A pending surrogate start followed by something
                        // that is not a tail.  Count the pending character
                        // on its own and re-visit the current character.
                        length += 3;
                        pair = '\0';
                        continue;
                }
                ++index;
                --count;
        }

        if (flush) {
                // Count a dangling surrogate start and clear the state.
                if (pair != '\0')
                        length += 3;
                leftOver = '\0';
        } else {
                leftOver = pair;
        }

        return length;
}
177
178         // Get the number of bytes needed to encode a character buffer.
// One-shot count over an array slice: starts with no pending character
// and asks the internal routine to flush at the end.
public override int GetByteCount (char[] chars, int index, int count)
{
        char noPending = '\0';
        int total = InternalGetByteCount (chars, index, count, ref noPending, true);
        return total;
}
184
185 #if !NET_2_0
186         // Convenience wrappers for "GetByteCount".
// Counts the bytes needed for a whole string.  Throws
// ArgumentNullException when "s" is null.
public override int GetByteCount (String s)
{
        if (s == null)
                throw new ArgumentNullException ("s");

        // No state is carried in or out of a one-shot count.
        char noPending = '\0';
        unsafe {
                fixed (char* text = s) {
                        return InternalGetByteCount (text, s.Length, ref noPending, true);
                }
        }
}
201 #endif
202
203 #if NET_2_0
// Counts the bytes needed for "count" characters starting at "chars".
// Throws ArgumentNullException when "chars" is null and
// ArgumentOutOfRangeException when "count" is negative.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetByteCount (char* chars, int count)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        // A negative count used to fall through and report 0 bytes; reject
        // it explicitly, as the array-based overload does.
        if (count < 0)
                throw new ArgumentOutOfRangeException ("count");
        if (count == 0)
                return 0;

        // One-shot count: no pending character, flush at the end.
        char dummy = '\0';
        return InternalGetByteCount (chars, count, ref dummy, true);
}
215 #endif
216
217         #endregion
218
219         #region GetBytes()
220
221         // Internal version of "GetBytes" which can handle a rolling
222         // state between multiple calls to this method.
// Array front end: validates the arguments, handles the case where the
// character position is already at the end of the array, then pins both
// arrays and delegates to the pointer overload.
//
// leftOver - pending character carried between calls; updated.
// flush    - whether a pending character must be dealt with now.
// Returns the number of bytes written.
private static int InternalGetBytes (char[] chars, int charIndex,
                                     int charCount, byte[] bytes,
                                     int byteIndex, ref char leftOver,
                                     bool flush)
{
        // Validate the parameters.
        if (chars == null) {
                throw new ArgumentNullException ("chars");
        }
        if (bytes == null) {
                throw new ArgumentNullException ("bytes");
        }
        if (charIndex < 0 || charIndex > chars.Length) {
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
        }
        if (charCount < 0 || charCount > (chars.Length - charIndex)) {
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
        }
        if (byteIndex < 0 || byteIndex > bytes.Length) {
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        }

        if (charIndex == chars.Length) {
                if (flush && leftOver != '\0') {
#if NET_2_0
                        // FIXME: use EncoderFallback.
                        //
                        // By default it is empty, so I do nothing for now.
                        leftOver = '\0';
#else
                        // Flush the left-over surrogate pair start.  Exactly
                        // three bytes are needed, so a buffer with three
                        // bytes of room must be accepted.
                        if (byteIndex > bytes.Length - 3)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        // Encode the pending character itself as a
                        // three-byte sequence, matching what the pointer
                        // overload writes when it flushes.
                        bytes [byteIndex++] = (byte) (0xE0 | (leftOver >> 12));
                        bytes [byteIndex++] = (byte) (0x80 | ((leftOver >> 6) & 0x3F));
                        bytes [byteIndex++] = (byte) (0x80 | (leftOver & 0x3F));
                        leftOver = '\0';
                        return 3;
#endif
                }
                return 0;
        }

        unsafe {
                fixed (char* cptr = chars) {
                        // No room at all in the output: pass a null buffer of
                        // length zero rather than pinning the byte array.
                        if (bytes.Length == byteIndex)
                                return InternalGetBytes (
                                        cptr + charIndex, charCount,
                                        null, 0, ref leftOver, flush);
                        fixed (byte *bptr = bytes) {
                                return InternalGetBytes (
                                        cptr + charIndex, charCount,
                                        bptr + byteIndex, bytes.Length - byteIndex,
                                        ref leftOver, flush);
                        }
                }
        }
}
281
// Encodes "charCount" UTF-16 code units starting at "chars" into the
// buffer at "bytes", which has room for "byteCount" bytes.
//
// leftOver - pending surrogate start from a previous call (or '\0');
//            on return it holds the new pending character, or '\0'
//            when flush is true.
// flush    - when true, a still-pending surrogate start is written out
//            as a three-byte sequence instead of being carried over.
// Returns the number of bytes written.
// Throws ArgumentException when the output buffer is too small.
private unsafe static int InternalGetBytes (char* chars, int charCount,
                                     byte* bytes, int byteCount,
                                     ref char leftOver, bool flush)
{
        int charIndex = 0;

        // Convert the characters into bytes.
        char ch;
        int length = byteCount;
        char pair = leftOver;
        int posn = 0;
        int code = 0;

        while (charCount > 0) {
                // Fetch the next UTF-16 character pair value.
                ch = chars [charIndex];
                if (pair == '\0') {
                        if (ch < '\uD800' || ch >= '\uE000') {
                                if (ch < '\x80') { // fast path optimization
                                        int end = charIndex + charCount;
                                        for (; charIndex < end; posn++, charIndex++, charCount--) {
                                                if (chars [charIndex] >= '\x80')
                                                        break;
                                                // The fast path must honour the
                                                // buffer size like every other
                                                // branch; "bytes" may even be
                                                // null when byteCount is zero.
                                                if (posn >= length)
                                                        throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                                                bytes [posn] = (byte) chars [charIndex];
                                        }
                                        continue;
                                }
                                code = ch;
                        }
                        else if (ch < '\uDC00') {
                                // surrogate start
                                pair = ch;
                                ++charIndex;
                                --charCount;
                                continue;
                        } else { // ch <= '\uDFFF'
                                // We have a surrogate tail without leading
                                // surrogate. In NET_2_0 it uses fallback.
                                // In NET_1_1 we output wrong surrogate.
                                if (posn > length - 3) {
                                        throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                                }
                                bytes [posn++] = (byte) (0xE0 | (ch >> 12));
                                bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                                bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
                                ++charIndex;
                                --charCount;
                                continue;
                        }
                } else {
                        if ('\uDC00' <= ch && ch <= '\uDFFF')
                                // Combine the pending lead with this tail.
                                code =  0x10000 + (int) ch - 0xDC00 +
                                        (((int) pair - 0xD800) << 10);
                        else {
                                // We have a surrogate start followed by a
                                // regular character.  Technically, this is
                                // invalid, but we have to do something.
                                // We write out the surrogate start and then
                                // re-visit the current character again.
                                if (posn > length - 3) {
                                        throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                                }
                                bytes [posn++] = (byte) (0xE0 | (pair >> 12));
                                bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
                                bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                                pair = '\0';
                                continue;
                        }
                        pair = '\0';
                }
                ++charIndex;
                --charCount;

                // Encode the character pair value.
                if (code < 0x0080) {
                        if (posn >= length)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        bytes [posn++] = (byte)code;
                } else if (code < 0x0800) {
                        if ((posn + 2) > length)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        bytes [posn++] = (byte) (0xC0 | (code >> 6));
                        bytes [posn++] = (byte) (0x80 | (code & 0x3F));
                } else if (code < 0x10000) {
                        if (posn > length - 3)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        bytes [posn++] = (byte) (0xE0 | (code >> 12));
                        bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                        bytes [posn++] = (byte) (0x80 | (code & 0x3F));
                } else {
                        if (posn > length - 4)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        bytes [posn++] = (byte) (0xF0 | (code >> 18));
                        bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
                        bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                        bytes [posn++] = (byte) (0x80 | (code & 0x3F));
                }
        }

        if (flush) {
                if (pair != '\0') {
                        // Flush the left-over incomplete surrogate.
                        if (posn > length - 3) {
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        }
                        bytes [posn++] = (byte) (0xE0 | (pair >> 12));
                        bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
                        bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                }
                leftOver = '\0';
        }
        else
                leftOver = pair;

        // Output always starts at offset zero of "bytes", so the write
        // position is the number of bytes produced.
        return posn;
}
403
// Stub: no fallback handling exists for this signature yet, so every
// call throws NotImplementedException.
private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
{
        // None of the arguments are inspected.
        throw new NotImplementedException ();
}
408
409         // Get the bytes that result from encoding a character buffer.
// One-shot encode of an array slice: starts with no pending character
// and asks the internal routine to flush at the end.
public override int GetBytes (char[] chars, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        char noPending = '\0';
        int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref noPending, true);
        return written;
}
416
417         // Convenience wrappers for "GetBytes".
// Encodes "charCount" characters of "s", starting at "charIndex", into
// "bytes" at "byteIndex".  Returns the number of bytes written.
// Throws ArgumentNullException for a null string or buffer and
// ArgumentOutOfRangeException for an index or count outside its range.
public override int GetBytes (String s, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        if (s == null)
                throw new ArgumentNullException ("s");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (charIndex < 0 || charIndex > s.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
        if (charCount < 0 || charCount > (s.Length - charIndex))
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

        // Starting at the end of the string: nothing to encode.
        if (charIndex == s.Length)
                return 0;

        char noPending = '\0';
        int room = bytes.Length - byteIndex;

        unsafe {
                fixed (char* text = s) {
                        char* start = text + charIndex;

                        // No room at all in the output: pass a null buffer of
                        // length zero rather than pinning the byte array.
                        if (room == 0)
                                return InternalGetBytes (start, charCount, null, 0, ref noPending, true);

                        fixed (byte* output = bytes) {
                                return InternalGetBytes (start, charCount, output + byteIndex, room, ref noPending, true);
                        }
                }
        }
}
457
458 #if NET_2_0
// Encodes "charCount" characters at "chars" into the "byteCount"-byte
// buffer at "bytes".  Returns the number of bytes written.
// Throws ArgumentNullException for a null pointer and
// IndexOutOfRangeException for a negative count.
// NOTE(review): the .NET documentation for this overload appears to list
// ArgumentOutOfRangeException for negative counts; the exception type is
// left unchanged here so existing callers keep working - confirm before
// changing it.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (charCount < 0)
                throw new IndexOutOfRangeException ("charCount");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        // Report the argument that is actually out of range.
        if (byteCount < 0)
                throw new IndexOutOfRangeException ("byteCount");

        if (charCount == 0)
                return 0;

        // One-shot encode: no pending character, flush at the end.
        char dummy = '\0';
        if (byteCount == 0)
                return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
        else
                return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
}
481 #endif
482
483         #endregion
484
485         // Internal version of "GetCharCount" which can handle a rolling
486         // state between multiple calls to this method.
// Array front end: validates the arguments, returns 0 for an empty slice,
// otherwise pins the array and delegates to the pointer overload with
// the same state and fallback arguments.
#if NET_2_0
private unsafe static int InternalGetCharCount (
        byte[] bytes, int index, int count, uint leftOverBits,
        uint leftOverCount, object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
#else
private unsafe static int InternalGetCharCount (
        byte[] bytes, int index, int count, uint leftOverBits,
        uint leftOverCount, bool throwOnInvalid, bool flush)
#endif
{
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (index < 0 || index > bytes.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
        if (count < 0 || count > (bytes.Length - index))
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        // An empty slice yields no characters.
        if (count == 0)
                return 0;

        fixed (byte* start = bytes) {
#if NET_2_0
                return InternalGetCharCount (start + index, count,
                        leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
#else
                return InternalGetCharCount (start + index, count,
                        leftOverBits, leftOverCount, throwOnInvalid, flush);
#endif
        }
}
520
521 #if NET_2_0
522         private unsafe static int InternalGetCharCount (
523                 byte* bytes, int count, uint leftOverBits,
524                 uint leftOverCount, object provider,
525                 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
526 #else
527         private unsafe static int InternalGetCharCount (
528                 byte* bytes, int count, uint leftOverBits,
529                 uint leftOverCount, bool throwOnInvalid, bool flush)
530 #endif
531         {
532                 int index = 0;
533
534                 int length = 0;
535
536                 if (leftOverCount == 0) {
537                         int end = index + count;
538                         for (; index < end; index++, count--) {
539                                 if (bytes [index] < 0x80)
540                                         length++;
541                                 else
542                                         break;
543                         }
544                 }
545
546                 // Determine the number of characters that we have.
547                 uint ch;
548                 uint leftBits = leftOverBits;
549                 uint leftSoFar = (leftOverCount & (uint)0x0F);
550                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
551                 while (count > 0) {
552                         ch = (uint)(bytes[index++]);
553                         --count;
554                         if (leftSize == 0) {
555                                 // Process a UTF-8 start character.
556                                 if (ch < (uint)0x0080) {
557                                         // Single-byte UTF-8 character.
558                                         ++length;
559                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
560                                         // Double-byte UTF-8 character.
561                                         leftBits = (ch & (uint)0x1F);
562                                         leftSoFar = 1;
563                                         leftSize = 2;
564                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
565                                         // Three-byte UTF-8 character.
566                                         leftBits = (ch & (uint)0x0F);
567                                         leftSoFar = 1;
568                                         leftSize = 3;
569                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
570                                         // Four-byte UTF-8 character.
571                                         leftBits = (ch & (uint)0x07);
572                                         leftSoFar = 1;
573                                         leftSize = 4;
574                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
575                                         // Five-byte UTF-8 character.
576                                         leftBits = (ch & (uint)0x03);
577                                         leftSoFar = 1;
578                                         leftSize = 5;
579                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
580                                         // Six-byte UTF-8 character.
581                                         leftBits = (ch & (uint)0x03);
582                                         leftSoFar = 1;
583                                         leftSize = 6;
584                                 } else {
585                                         // Invalid UTF-8 start character.
586 #if NET_2_0
587                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
588 #else
589                                         if (throwOnInvalid)
590                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
591 #endif
592                                 }
593                         } else {
594                                 // Process an extra byte in a multi-byte sequence.
595                                 if ((ch & (uint)0xC0) == (uint)0x80) {
596                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
597                                         if (++leftSoFar >= leftSize) {
598                                                 // We have a complete character now.
599                                                 if (leftBits < (uint)0x10000) {
600                                                         // is it an overlong ?
601                                                         bool overlong = false;
602                                                         switch (leftSize) {
603                                                         case 2:
604                                                                 overlong = (leftBits <= 0x7F);
605                                                                 break;
606                                                         case 3:
607                                                                 overlong = (leftBits <= 0x07FF);
608                                                                 break;
609                                                         case 4:
610                                                                 overlong = (leftBits <= 0xFFFF);
611                                                                 break;
612                                                         case 5:
613                                                                 overlong = (leftBits <= 0x1FFFFF);
614                                                                 break;
615                                                         case 6:
616                                                                 overlong = (leftBits <= 0x03FFFFFF);
617                                                                 break;
618                                                         }
619                                                         if (overlong) {
620 #if NET_2_0
621                                                                 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
622 #else
623                                                                 if (throwOnInvalid)
624                                                                         throw new ArgumentException (_("Overlong"), leftBits.ToString ());
625 #endif
626                                                         }
627                                                         else
628                                                                 ++length;
629                                                 } else if (leftBits < (uint)0x110000) {
630                                                         length += 2;
631                                                 } else {
632 #if NET_2_0
633                                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
634 #else
635                                                         if (throwOnInvalid)
636                                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
637 #endif
638                                                 }
639                                                 leftSize = 0;
640                                         }
641                                 } else {
642                                         // Invalid UTF-8 sequence: clear and restart.
643 #if NET_2_0
644                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
645 #else
646                                         if (throwOnInvalid)
647                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
648 #endif
649                                         leftSize = 0;
650                                         --index;
651                                         ++count;
652                                 }
653                         }
654                 }
655                 if (flush && leftSize != 0) {
656                         // We had left-over bytes that didn't make up
657                         // a complete UTF-8 character sequence.
658 #if NET_2_0
659                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index);
660 #else
661                         if (throwOnInvalid)
662                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
663 #endif
664                 }
665
666                 // Return the final length to the caller.
667                 return length;
668         }
669
670 #if NET_2_0
671         // Fallback helper for GetCharCount(): runs the decoder fallback on the
            // single byte at bytes [index] and returns the number of replacement
            // characters the fallback buffer reports for it.
            //
            // provider  - either a DecoderFallback or a Decoder; it is only used to
            //             obtain a fallback buffer when "buffer" is still null.
            // buffer    - fallback buffer, created lazily here and handed back to
            //             the caller through the ref parameter for reuse.
            // bufferArg - one-element scratch array, created lazily and reused.
672         static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int index)
673         {
674                 if (buffer == null) {
675                         DecoderFallback fb = provider as DecoderFallback;
676                         if (fb != null)
677                                 buffer = fb.CreateFallbackBuffer ();
678                         else
679                                 buffer = ((Decoder) provider).FallbackBuffer;
680                 }
681                 if (bufferArg == null)
682                         bufferArg = new byte [1];
683                 bufferArg [0] = bytes [index];
684                 buffer.Fallback (bufferArg, 0);
                    // Counting never reads the replacement characters, so the
                    // buffer must be reset explicitly; otherwise they would still
                    // be pending when the next invalid byte is handed to Fallback().
                    int produced = buffer.Remaining;
                    buffer.Reset ();
685                 return produced;
686         }
687
688         // Fallback helper for GetChars(): runs the decoder fallback on the single
            // byte at bytes [byteIndex] and copies every replacement character it
            // yields into chars, advancing charIndex past them.
            //
            // provider is either a DecoderFallback or a Decoder and is only consulted
            // when "buffer" is still null. buffer and bufferArg are created lazily
            // and handed back through their ref parameters for reuse.
            // NOTE(review): no capacity is passed in, so the writes to chars are
            // not bounds-checked here.
689         static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int byteIndex,
690                 char* chars, ref int charIndex)
691         {
692                 if (buffer == null) {
693                         DecoderFallback fallback = provider as DecoderFallback;
694                         buffer = (fallback != null)
695                                 ? fallback.CreateFallbackBuffer ()
696                                 : ((Decoder) provider).FallbackBuffer;
697                 }
698                 if (bufferArg == null)
699                         bufferArg = new byte [1];
700                 bufferArg [0] = bytes [byteIndex];
701                 buffer.Fallback (bufferArg, 0);
702                 // Drain everything the fallback produced into the output.
703                 for (int pending = buffer.Remaining; pending > 0; pending = buffer.Remaining)
704                         chars [charIndex++] = buffer.GetNextChar ();
705         }
706 #endif
707
708         // Count the characters produced by decoding "count" bytes of "bytes",
            // starting at "index". The call is stateless: it starts with no
            // left-over bits and asks InternalGetCharCount to flush, so an
            // incomplete trailing sequence is handled as invalid input.
709         public override int GetCharCount (byte[] bytes, int index, int count)
710         {
                    int charCount;
711 #if NET_2_0
712                 DecoderFallbackBuffer fallbackBuffer = null;
713                 byte [] scratch = null;
714                 charCount = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
715 #else
716                 charCount = InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
717 #endif
                    return charCount;
718         }
719
720 #if NET_2_0
721         [CLSCompliant (false)]
722         [ComVisible (false)]
            // Pointer variant of GetCharCount: counts the characters produced by
            // decoding "count" bytes at "bytes", starting with no left-over state
            // and with flush set.
723         public unsafe override int GetCharCount (byte* bytes, int count)
724         {
725                 DecoderFallbackBuffer fallbackBuffer = null;
726                 byte [] scratch = null;
                    int charCount = InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
727                 return charCount;
728         }
729 #endif
730
731         // Array front end for decoding: validates the arguments, pins the arrays
            // and forwards to the pointer-based InternalGetChars overload below.
            //
            // bytes/byteIndex/byteCount  - source range to decode.
            // chars/charIndex            - destination array and first slot to write.
            // leftOverBits/leftOverCount - partial-sequence state, passed through
            //                              by reference to the pointer overload.
            // flush                      - passed through to the pointer overload.
            // Returns the value of the pointer overload (characters written).
            // Throws ArgumentNullException for null arrays and
            // ArgumentOutOfRangeException for ranges outside the arrays.
732 #if NET_2_0
733         private unsafe static int InternalGetChars (
734                 byte[] bytes, int byteIndex, int byteCount, char[] chars,
735                 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
736                 object provider,
737                 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
738 #else
739         private unsafe static int InternalGetChars (
740                 byte[] bytes, int byteIndex, int byteCount, char[] chars,
741                 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
742                 bool throwOnInvalid, bool flush)
743 #endif
744         {
745                 // Validate the parameters.
746                 if (bytes == null) {
747                         throw new ArgumentNullException ("bytes");
748                 }
749                 if (chars == null) {
750                         throw new ArgumentNullException ("chars");
751                 }
752                 if (byteIndex < 0 || byteIndex > bytes.Length) {
753                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
754                 }
755                 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
756                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
757                 }
758                 if (charIndex < 0 || charIndex > chars.Length) {
759                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
760                 }
761
                    // A full output array is only a trivial case when there is also
                    // nothing to decode. With input pending, the pointer overload must
                    // run so that insufficient space is reported and the left-over
                    // state is updated instead of silently returning 0.
762                 if (charIndex == chars.Length && byteCount == 0)
763                         return 0;
764
765                 fixed (char* cptr = chars) {
766 #if NET_2_0
767                         if (byteCount == 0 || byteIndex == bytes.Length)
768                                 return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
769                         // otherwise...
770                         fixed (byte* bptr = bytes)
771                                 return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
772 #else
773                         if (byteCount == 0 || byteIndex == bytes.Length)
774                                 return InternalGetChars (null, 0, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
775                         // otherwise...
776                         fixed (byte* bptr = bytes)
777                                 return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, chars.Length - charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
778 #endif
779                 }
780         }
781
            // Core decoder: converts byteCount bytes at "bytes" into UTF-16 code
            // units at "chars" (capacity charCount) and returns the number of
            // chars written.
            //
            // leftOverBits holds the bits gathered so far for an unfinished
            // multi-byte sequence. leftOverCount packs two 4-bit fields: the low
            // nibble is the number of bytes seen so far, the next nibble is the
            // total length announced by the start byte (0 = no pending sequence).
            // Both are updated on return so a later call can continue.
            //
            // Invalid input is routed to the decoder fallback (NET_2_0), or throws
            // ArgumentException when throwOnInvalid is set and is skipped
            // otherwise. When flush is true an unfinished trailing sequence is
            // treated as invalid. ArgumentException is thrown when a decoded
            // character does not fit in the output.
782 #if NET_2_0
783         private unsafe static int InternalGetChars (
784                 byte* bytes, int byteCount, char* chars, int charCount,
785                 ref uint leftOverBits, ref uint leftOverCount,
786                 object provider,
787                 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
788 #else
789         private unsafe static int InternalGetChars (
790                 byte* bytes, int byteCount, char* chars, int charCount,
791                 ref uint leftOverBits, ref uint leftOverCount,
792                 bool throwOnInvalid, bool flush)
793 #endif
794         {
795                 int charIndex = 0, byteIndex = 0;
796                 int length = charCount;
797                 int posn = charIndex;
798
                    // Fast path: with no pending sequence, copy the leading run of
                    // ASCII bytes directly. The capacity check mirrors the one in the
                    // main loop so this path can never write past the output buffer.
799                 if (leftOverCount == 0) {
800                         int end = byteIndex + byteCount;
801                         for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
802                                 if (bytes [byteIndex] < 0x80) {
                                            if (posn >= length) {
                                                    throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                                            }
803                                         chars [posn] = (char) bytes [byteIndex];
804                                 } else
805                                         break;
806                         }
807                 }
808
809                 // Convert the bytes into the output buffer.
810                 uint ch;
811                 uint leftBits = leftOverBits;
812                 uint leftSoFar = (leftOverCount & (uint)0x0F);
813                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
814
815                 int byteEnd = byteIndex + byteCount;
816                 for(; byteIndex < byteEnd; byteIndex++) {
817                         // Fetch the next character from the byte buffer.
818                         ch = (uint)(bytes[byteIndex]);
819                         if (leftSize == 0) {
820                                 // Process a UTF-8 start character.
821                                 if (ch < (uint)0x0080) {
822                                         // Single-byte UTF-8 character.
823                                         if (posn >= length) {
824                                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
825                                         }
826                                         chars[posn++] = (char)ch;
827                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
828                                         // Double-byte UTF-8 character.
829                                         leftBits = (ch & (uint)0x1F);
830                                         leftSoFar = 1;
831                                         leftSize = 2;
832                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
833                                         // Three-byte UTF-8 character.
834                                         leftBits = (ch & (uint)0x0F);
835                                         leftSoFar = 1;
836                                         leftSize = 3;
837                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
838                                         // Four-byte UTF-8 character.
839                                         leftBits = (ch & (uint)0x07);
840                                         leftSoFar = 1;
841                                         leftSize = 4;
842                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
843                                         // Five-byte UTF-8 character.
844                                         leftBits = (ch & (uint)0x03);
845                                         leftSoFar = 1;
846                                         leftSize = 5;
847                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
848                                         // Six-byte UTF-8 character.
849                                         leftBits = (ch & (uint)0x03);
850                                         leftSoFar = 1;
851                                         leftSize = 6;
852                                 } else {
853                                         // Invalid UTF-8 start character.
854 #if NET_2_0
855                                         Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
856 #else
857                                         if (throwOnInvalid)
858                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
859 #endif
860                                 }
861                         } else {
862                                 // Process an extra byte in a multi-byte sequence.
863                                 if ((ch & (uint)0xC0) == (uint)0x80) {
864                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
865                                         if (++leftSoFar >= leftSize) {
866                                                 // We have a complete character now.
867                                                 if (leftBits < (uint)0x10000) {
868                                                         // is it an overlong ?
869                                                         bool overlong = false;
870                                                         switch (leftSize) {
871                                                         case 2:
872                                                                 overlong = (leftBits <= 0x7F);
873                                                                 break;
874                                                         case 3:
875                                                                 overlong = (leftBits <= 0x07FF);
876                                                                 break;
877                                                         case 4:
878                                                                 overlong = (leftBits <= 0xFFFF);
879                                                                 break;
880                                                         case 5:
881                                                                 overlong = (leftBits <= 0x1FFFFF);
882                                                                 break;
883                                                         case 6:
884                                                                 overlong = (leftBits <= 0x03FFFFFF);
885                                                                 break;
886                                                         }
887                                                         if (overlong) {
888 #if NET_2_0
889                                                                 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
890 #else
891                                                                 if (throwOnInvalid)
892                                                                         throw new ArgumentException (_("Overlong"), leftBits.ToString ());
893 #endif
894                                                         }
895                                                         else if ((leftBits & 0xF800) == 0xD800) {
896                                                                 // UTF-8 doesn't use surrogate characters
897 #if NET_2_0
898                                                                 Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
899 #else
900                                                                 if (throwOnInvalid)
901                                                                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
902 #endif
903                                                         }
904                                                         else {
905                                                                 if (posn >= length) {
906                                                                         throw new ArgumentException
907                                                                                 (_("Arg_InsufficientSpace"), "chars");
908                                                                 }
909                                                                 chars[posn++] = (char)leftBits;
910                                                         }
911                                                 } else if (leftBits < (uint)0x110000) {
                                                            // Supplementary code point: emit a surrogate pair.
912                                                         if ((posn + 2) > length) {
913                                                                 throw new ArgumentException
914                                                                         (_("Arg_InsufficientSpace"), "chars");
915                                                         }
916                                                         leftBits -= (uint)0x10000;
917                                                         chars[posn++] = (char)((leftBits >> 10) +
918                                                                                                    (uint)0xD800);
919                                                         chars[posn++] =
920                                                                 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
921                                                 } else {
922 #if NET_2_0
923                                                         Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
924 #else
925                                                         if (throwOnInvalid)
926                                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
927 #endif
928                                                 }
929                                                 leftSize = 0;
930                                         }
931                                 } else {
932                                         // Invalid UTF-8 sequence: clear and restart.
933 #if NET_2_0
934                                         Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
935 #else
936                                         if (throwOnInvalid)
937                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
938 #endif
939                                         leftSize = 0;
                                            // Step back so the loop increment re-reads this
                                            // byte as a potential start byte.
940                                         --byteIndex;
941                                 }
942                         }
943                 }
944                 if (flush && leftSize != 0) {
945                         // We had left-over bytes that didn't make up
946                         // a complete UTF-8 character sequence.
947 #if NET_2_0
                            // After the loop byteIndex is one past the input, so report
                            // the last byte that was actually consumed rather than
                            // reading beyond the buffer.
                            // NOTE(review): index 0 is kept when no byte was consumed in
                            // this call, which matches the previous behaviour for that case.
                            int lastByte = (byteIndex > 0) ? byteIndex - 1 : 0;
948                         Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, lastByte, chars, ref posn);
949 #else
950                         if (throwOnInvalid)
951                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
952 #endif
953                 }
                    // Hand the (possibly unfinished) sequence state back to the caller.
954                 leftOverBits = leftBits;
955                 leftOverCount = (leftSoFar | (leftSize << 4));
956
957                 // Return the final length to the caller.
958                 return posn - charIndex;
959         }
960
961         // Decode byteCount bytes of "bytes" (from byteIndex) into "chars" starting
            // at charIndex and return the number of characters written. The call is
            // stateless: it starts with no pending sequence and flushes at the end.
962         public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
963                                                                  char[] chars, int charIndex)
964         {
965                 uint pendingBits = 0;
966                 uint pendingCount = 0;
                    int written;
967 #if NET_2_0
968                 DecoderFallbackBuffer fallbackBuffer = null;
969                 byte [] scratch = null;
970                 written = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
971                                 ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, ref scratch, true);
972 #else
973                 written = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
974                                 ref pendingBits, ref pendingCount, throwOnInvalid, true);
975 #endif
                    return written;
976         }
977
978 #if NET_2_0
979         [CLSCompliant (false)]
980         [ComVisible (false)]
            // Pointer variant of GetChars: decodes byteCount bytes at "bytes" into
            // the charCount-sized buffer at "chars" and returns the number of
            // characters written. Starts with no pending sequence and flushes.
981         public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
982         {
983                 uint pendingBits = 0;
984                 uint pendingCount = 0;
985                 DecoderFallbackBuffer fallbackBuffer = null;
986                 byte [] scratch = null;
987                 int written = InternalGetChars (bytes, byteCount, chars, charCount,
988                                 ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, ref scratch, true);
                    return written;
989         }
990 #endif
991
992         // Get the maximum number of bytes needed to encode a
993         // specified number of characters (four bytes per character).
            //
            // Throws ArgumentOutOfRangeException when charCount is negative or
            // when the result would not fit in an Int32.
994         public override int GetMaxByteCount (int charCount)
995         {
996                 if (charCount < 0) {
997                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
998                 }
                    // charCount * 4 wraps around silently in unchecked arithmetic;
                    // reject inputs whose product cannot be represented.
                    if (charCount > Int32.MaxValue / 4) {
                            throw new ArgumentOutOfRangeException ("charCount");
                    }
999                 return charCount * 4;
1000         }
1001
1002         // Upper bound on the number of characters that decoding byteCount
1003         // bytes can produce: one character per byte.
             // Throws ArgumentOutOfRangeException when byteCount is negative.
1004         public override int GetMaxCharCount (int byteCount)
1005         {
1006                 if (byteCount >= 0)
1007                         return byteCount;
1008                 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
1010         }
1011
1012         // Create a new stateful UTF-8 decoder configured with this encoding's
             // invalid-input policy (decoder fallback, or throwOnInvalid before 2.0).
1013         public override Decoder GetDecoder ()
1014         {
1015 #if NET_2_0
1016                 UTF8Decoder decoder = new UTF8Decoder (DecoderFallback);
1017 #else
1018                 UTF8Decoder decoder = new UTF8Decoder (throwOnInvalid);
1019 #endif
                     return decoder;
1020         }
1021
1022         // Create a new stateful UTF-8 encoder that carries this encoding's
             // emitIdentifier setting.
1023         public override Encoder GetEncoder ()
1024         {
1025                 UTF8Encoder encoder = new UTF8Encoder (emitIdentifier);
                     return encoder;
1026         }
1027
1028         // Return the UTF-8 preamble (EF BB BF) when emitIdentifier is set,
             // or an empty array otherwise. A fresh array is returned on every call.
1029         public override byte[] GetPreamble ()
1030         {
1031                 if (!emitIdentifier)
1032                         return new byte [0];
1033                 return new byte [] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
1040         }
1041
1042         // Determine if this object is equal to another.
             //
             // Two UTF8Encoding instances are equal when they have the same code
             // page, the same emitIdentifier setting and the same invalid-input
             // policy (equal fallbacks on 2.0, the same throwOnInvalid flag before).
             // Any other value, including null, compares unequal.
1043         public override bool Equals (Object value)
1044         {
1045                 UTF8Encoding enc = (value as UTF8Encoding);
1046                 if (enc != null) {
1047 #if NET_2_0
                             // Compare the fallbacks by value: "==" on these reference
                             // types only tests object identity, so two encodings set up
                             // with equivalent but distinct fallback objects would never
                             // be considered equal. Object.Equals also tolerates nulls.
1048                         return (codePage == enc.codePage &&
1049                                         emitIdentifier == enc.emitIdentifier &&
1050                                         Object.Equals (DecoderFallback, enc.DecoderFallback) &&
1051                                         Object.Equals (EncoderFallback, enc.EncoderFallback));
1052 #else
1053                         return (codePage == enc.codePage &&
1054                                         emitIdentifier == enc.emitIdentifier &&
1055                                         throwOnInvalid == enc.throwOnInvalid);
1056 #endif
1057                 } else {
1058                         return false;
1059                 }
1060         }
1061
1062         // Hash code for this encoding; simply the value computed by the base class.
1063         public override int GetHashCode ()
1064         {
1065                 int hash = base.GetHashCode ();
                     return hash;
1066         }
1067
1068 #if NET_2_0
1069         [MonoTODO]
             // Count the bytes needed to encode the string s; currently this just
             // defers to the base class implementation.
1070         public override int GetByteCount (string s)
1071         {
1072                 // hmm, does this override make any sense?
1073                 int byteCount = base.GetByteCount (s);
                     return byteCount;
1074         }
1075
1076         [MonoTODO]
1077         [ComVisible (false)]
             // Decode "count" bytes of "bytes" starting at "index" into a string;
             // currently this just defers to the base class implementation.
1078         public override string GetString (byte [] bytes, int index, int count)
1079         {
1080                 // hmm, does this override make any sense?
1081                 string decoded = base.GetString (bytes, index, count);
                     return decoded;
1082         }
1083 #endif
1084
1085 #if !NET_2_0
1086         public override byte [] GetBytes (String s)
1087         {
                     // Encode the whole string into a freshly allocated array that is
                     // sized by GetByteCount. Throws ArgumentNullException for null.
1088                 if (s == null)
1089                         throw new ArgumentNullException ("s");
1090
1091                 byte [] encoded = new byte [GetByteCount (s)];
1093                 GetBytes (s, 0, s.Length, encoded, 0);
1094                 return encoded;
1095         }
1096 #endif
1097
1098         // Stateful UTF-8 decoder. It remembers an unfinished multi-byte
             // sequence (leftOverBits / leftOverCount) between calls and therefore
             // never asks the shared decoding routines to flush.
1099         [Serializable]
1100         private class UTF8Decoder : Decoder
1101         {
1102 #if !NET_2_0
1103                 private bool throwOnInvalid;
1104 #endif
1105                 private uint leftOverBits;
1106                 private uint leftOverCount;
1107
1108                 // Start with no pending sequence and remember the invalid-input policy.
1109 #if NET_2_0
1110                 public UTF8Decoder (DecoderFallback fallback)
1111 #else
1112                 public UTF8Decoder (bool throwOnInvalid)
1113 #endif
1114                 {
1115                         leftOverBits = 0;
1116                         leftOverCount = 0;
1117 #if NET_2_0
1118                         Fallback = fallback;
1119 #else
1120                         this.throwOnInvalid = throwOnInvalid;
1121 #endif
1122                 }
1123
1124                 // Count the characters the given bytes would decode to, taking the
                     // pending sequence into account. The state is passed by value, so
                     // counting does not change it.
1125                 public override int GetCharCount (byte[] bytes, int index, int count)
1126                 {
1127 #if NET_2_0
1128                         DecoderFallbackBuffer fallbackBuffer = null;
1129                         byte [] scratch = null;
1130                         return InternalGetCharCount (bytes, index, count, leftOverBits, leftOverCount,
1131                                 this, ref fallbackBuffer, ref scratch, false);
1132 #else
1133                         return InternalGetCharCount (bytes, index, count, leftOverBits, leftOverCount,
1134                                 throwOnInvalid, false);
1135 #endif
1136                 }

                     // Decode the given bytes into chars, continuing any pending
                     // sequence and storing the new pending state in this decoder.
1137                 public override int GetChars (byte[] bytes, int byteIndex,
1138                                                  int byteCount, char[] chars, int charIndex)
1139                 {
1140 #if NET_2_0
1141                         DecoderFallbackBuffer fallbackBuffer = null;
1142                         byte [] scratch = null;
1143                         return InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
1144                                 ref leftOverBits, ref leftOverCount, this, ref fallbackBuffer, ref scratch, false);
1145 #else
1146                         return InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
1147                                 ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
1148 #endif
1149                 }
1150
1151         } // class UTF8Decoder
1152
1153         // Stateful UTF-8 encoder. It keeps one left-over char for counting and
             // a separate one for converting; each is handed by reference to the
             // shared InternalGetByteCount / InternalGetBytes routines.
1154         [Serializable]
1155         private class UTF8Encoder : Encoder
1156         {
1157                 private bool emitIdentifier;
1158                 private char leftOverForCount;
1159                 private char leftOverForConv;
1160
1161                 // Remember the identifier setting and start with no left-over chars.
1162                 public UTF8Encoder (bool emitIdentifier)
1163                 {
1164                         leftOverForCount = '\0';
1165                         leftOverForConv = '\0';
1166                         this.emitIdentifier = emitIdentifier;
1167                 }
1168
1169                 // Count the bytes needed for the given chars, using the counting state.
1170                 public override int GetByteCount (char[] chars, int index,
1171                                          int count, bool flush)
1172                 {
1173                         int byteCount = InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
                             return byteCount;
1174                 }

                     // Encode the given chars into bytes, using the conversion state.
                     // emitIdentifier is cleared after every call.
1175                 public override int GetBytes (char[] chars, int charIndex,
1176                                          int charCount, byte[] bytes, int byteIndex, bool flush)
1177                 {
1178                         int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
1180                         emitIdentifier = false;
1181                         return written;
1182                 }
1183
1184 #if NET_2_0
                     // Pointer variant of GetByteCount.
1185                 public unsafe override int GetByteCount (char* chars, int count, bool flush)
1186                 {
1187                         int byteCount = InternalGetByteCount (chars, count, ref leftOverForCount, flush);
                             return byteCount;
1188                 }
1189
                     // Pointer variant of GetBytes; also clears emitIdentifier.
1190                 public unsafe override int GetBytes (char* chars, int charCount,
1191                         byte* bytes, int byteCount, bool flush)
1192                 {
1193                         int written = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
1195                         emitIdentifier = false;
1196                         return written;
1197                 }
1198 #endif
1199
1200         } // class UTF8Encoder
1201
1202 }; // class UTF8Encoding
1203
1204 }; // namespace System.Text