2006-02-02 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30
31 [Serializable]
32 [MonoTODO ("Fix serialization compatibility with MS.NET")]
33 #if NET_2_0
34 [MonoTODO ("EncoderFallback is not handled")]
35 #endif
36 public class UTF8Encoding : Encoding
37 {
38         // Magic number used by Windows for UTF-8.
39         internal const int UTF8_CODE_PAGE = 65001;
40
41         // Internal state.
42         private bool emitIdentifier;
43 #if !NET_2_0
44         private bool throwOnInvalid;
45 #endif
46
// Constructors.

// Default constructor: forwards to the two-argument constructor with
// both flags set to false.
public UTF8Encoding ()
        : this (false, false)
{
}

// Forwards the identifier flag to the two-argument constructor and
// passes false for "throwOnInvalidBytes".
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
        : this (encoderShouldEmitUTF8Identifier, false)
{
}
51         
// Primary constructor.  Stores the identifier flag, selects how invalid
// bytes are treated, and fills in the descriptive fields inherited
// from the base class.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
        : base (UTF8_CODE_PAGE)
{
        emitIdentifier = encoderShouldEmitUTF8Identifier;
#if NET_2_0
        // The first argument is passed as null; the decoder fallback is
        // either the exception fallback or a replacement fallback built
        // from the empty string.
        if (!throwOnInvalidBytes) {
                SetFallbackInternal (null, new DecoderReplacementFallback (String.Empty));
        } else {
                SetFallbackInternal (null, new DecoderExceptionFallback ());
        }
#else
        throwOnInvalid = throwOnInvalidBytes;
#endif

        // Names.
        header_name = "utf-8";
        body_name = "utf-8";
        web_name = "utf-8";
        encoding_name = "Unicode (UTF-8)";

        // Capability flags and the associated Windows code page.
        is_browser_save = true;
        is_browser_display = true;
        is_mail_news_display = true;
        windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
72
73         #region GetByteCount()
74
// Array-based front end of the byte counter.  Checks the arguments,
// answers the "positioned at the end of the array" case itself and
// otherwise pins the array and hands off to the pointer-based overload.
// "leftOver" carries a pending surrogate start between calls.
// Throws ArgumentNullException for a null array and
// ArgumentOutOfRangeException for an index/count outside the array.
private static int InternalGetByteCount (char[] chars, int index, int count, ref uint leftOver, bool flush)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (index < 0 || index > chars.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));

        int available = chars.Length - index;
        if (count < 0 || count > available)
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        if (available == 0) {
                // Nothing left to read: the only possible output is a
                // pending surrogate start being flushed (3 bytes).
                if (!flush || leftOver == 0)
                        return 0;
                leftOver = 0;
                return 3;
        }

        unsafe {
                fixed (char* start = chars) {
                        return InternalGetByteCount (start + index, count, ref leftOver, flush);
                }
        }
}
105
106
// Pointer-based byte counter.  Walks "count" UTF-16 code units and adds
// up the UTF-8 bytes each needs: 1 below U+0080, 2 below U+0800,
// 3 otherwise, and 4 for a high/low surrogate pair.  A high surrogate
// still waiting for its partner comes in through "leftOver" and is
// written back to it on exit.  With "flush" set, a pending high
// surrogate adds 3 bytes to the result (the pending value itself is
// still stored in "leftOver").
private unsafe static int InternalGetByteCount (char* chars, int count, ref uint leftOver, bool flush)
{
        int total = 0;
        int pos = 0;
        uint pending = leftOver;

        for (int remaining = count; remaining > 0; ) {
                char c = chars [pos];

                if (pending != 0) {
                        bool completesPair = (c >= '\uDC00' && c <= '\uDFFF');
                        pending = 0;
                        if (!completesPair) {
                                // Unpaired surrogate start: count it on its
                                // own and look at the current char again.
                                total += 3;
                                continue;
                        }
                        total += 4;
                } else if (c < '\u0080') {
                        total += 1;
                } else if (c < '\u0800') {
                        total += 2;
                } else if (c >= '\uD800' && c <= '\uDBFF') {
                        // Start of a surrogate pair: wait for the next char.
                        pending = (uint) c;
                } else {
                        total += 3;
                }

                ++pos;
                --remaining;
        }

        if (flush && pending != 0)
                total += 3;

        leftOver = pending;
        return total;
}
155
// Number of bytes needed to encode "count" chars of "chars" starting
// at "index".  Counting starts with no pending surrogate and is always
// flushed; argument validation is done by InternalGetByteCount.
public override int GetByteCount (char[] chars, int index, int count)
{
        uint pendingSurrogate = 0;
        int byteTotal = InternalGetByteCount (chars, index, count, ref pendingSurrogate, true);
        return byteTotal;
}
162
// String flavour of GetByteCount: pins the string and counts the bytes
// for all of its characters, flushing at the end.
// Throws ArgumentNullException when "s" is null.
public override int GetByteCount (String s)
{
        if (s == null)
                throw new ArgumentNullException ("s");

        uint pendingSurrogate = 0;
        int charTotal = s.Length;

        unsafe {
                fixed (char* start = s) {
                        return InternalGetByteCount (start, charTotal, ref pendingSurrogate, true);
                }
        }
}
178
179         #endregion
180
181         #region GetBytes()
182
// Array-based front end of the encoder, able to carry a rolling state
// ("leftOver") between calls.  Validates the arguments, handles the
// "positioned at the end of the char array" case itself, and otherwise
// pins both arrays and hands off to the pointer-based overload.
//
// Returns the number of bytes written at "byteIndex".
// Throws ArgumentNullException for a null array,
// ArgumentOutOfRangeException for an index/count outside its array, and
// ArgumentException when "bytes" has too little room.
private static int InternalGetBytes (char[] chars, int charIndex,
                                     int charCount, byte[] bytes,
                                     int byteIndex, ref uint leftOver,
                                     bool flush)
{
        // Validate the parameters.
        if (chars == null) {
                throw new ArgumentNullException ("chars");
        }
        if (bytes == null) {
                throw new ArgumentNullException ("bytes");
        }
        if (charIndex < 0 || charIndex > chars.Length) {
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
        }
        if (charCount < 0 || charCount > (chars.Length - charIndex)) {
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
        }
        if (byteIndex < 0 || byteIndex > bytes.Length) {
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        }

        if (charIndex == chars.Length) {
                if (flush && leftOver != 0) {
                        // Three bytes are about to be stored; report a short
                        // buffer the same way the rest of the encoder does
                        // instead of running off the end of the array.
                        if ((bytes.Length - byteIndex) < 3) {
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        }

                        // Flush the left-over surrogate pair start.
                        // NOTE(review): EF BB BF is the UTF-8 form of U+FEFF,
                        // whereas the pointer-based overload flushes the
                        // surrogate value itself - confirm which is intended.
                        bytes [byteIndex++] = 0xEF;
                        bytes [byteIndex++] = 0xBB;
                        bytes [byteIndex++] = 0xBF;
                        leftOver = 0;
                        return 3;
                }
                return 0;
        }

        unsafe {
                fixed (char* cptr = chars) {
                        fixed (byte *bptr = bytes) {
                                return InternalGetBytes (
                                        cptr + charIndex, charCount,
                                        bptr + byteIndex, bytes.Length - byteIndex,
                                        ref leftOver, flush);
                        }
                }
        }
}
230
// Pointer-based encoder.  Converts "charCount" UTF-16 code units at
// "chars" into UTF-8 at "bytes", which has room for "byteCount" bytes,
// and returns the number of bytes written.
//
// "leftOver" is the rolling state: on entry it holds a high surrogate
// left pending by the previous call (or 0), on exit the high surrogate
// still waiting for its partner (or 0).  The state machine mirrors
// InternalGetByteCount so that both agree on the output size:
//   - high surrogate followed by a low surrogate -> one 4-byte sequence;
//   - high surrogate followed by anything else   -> the surrogate is
//     written as a 3-byte sequence and the following char is then
//     processed normally;
//   - high surrogate at the very end             -> kept pending, and
//     written as a 3-byte sequence only when "flush" is set.
//
// Throws ArgumentException when the output does not fit in "byteCount".
private unsafe static int InternalGetBytes (char* chars, int charCount,
                                     byte* bytes, int byteCount,
                                     ref uint leftOver, bool flush)
{
        int charIndex = 0;
        int posn = 0;
        uint pair = leftOver;

        // Convert the characters into bytes.
        while (charCount > 0) {
                char ch = chars [charIndex];
                uint code;

                if (pair == 0) {
                        ++charIndex;
                        --charCount;
                        if (ch >= '\uD800' && ch <= '\uDBFF') {
                                // Start of a surrogate pair: hold it until
                                // the next char has been seen.
                                pair = (uint) ch;
                                continue;
                        }
                        code = (uint) ch;
                } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
                        // Complete surrogate pair: combine into one code point.
                        code = (uint) ch - 0xDC00 +
                                ((pair - 0xD800) << 10) +
                                0x10000;
                        pair = 0;
                        ++charIndex;
                        --charCount;
                } else {
                        // Surrogate start followed by a regular character:
                        // write the surrogate start on its own; the current
                        // char is not consumed and is revisited next round.
                        code = pair;
                        pair = 0;
                }

                // Encode the code point.
                if (code < 0x0080) {
                        if (posn >= byteCount)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        bytes [posn++] = (byte) code;
                } else if (code < 0x0800) {
                        if ((posn + 2) > byteCount)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        bytes [posn++] = (byte) (0xC0 | (code >> 6));
                        bytes [posn++] = (byte) (0x80 | (code & 0x3F));
                } else if (code < 0x10000) {
                        if ((posn + 3) > byteCount)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        bytes [posn++] = (byte) (0xE0 | (code >> 12));
                        bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                        bytes [posn++] = (byte) (0x80 | (code & 0x3F));
                } else {
                        if ((posn + 4) > byteCount)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        bytes [posn++] = (byte) (0xF0 | (code >> 18));
                        bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
                        bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
                        bytes [posn++] = (byte) (0x80 | (code & 0x3F));
                }
        }

        if (flush && pair != 0) {
                // Flush the left-over surrogate pair start.
                if ((posn + 3) > byteCount) {
                        throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                }
                bytes [posn++] = (byte) (0xE0 | (pair >> 12));
                bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
                bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                pair = 0;
        }

        // Only a still-pending surrogate start (or 0) is handed back.
        leftOver = pair;

        // Return the final count to the caller.
        return posn;
}
307
// Encodes "charCount" chars of "chars" starting at "charIndex" into
// "bytes" starting at "byteIndex" and returns the number of bytes
// written.  Encoding starts with no pending surrogate and is always
// flushed; argument validation is done by InternalGetBytes.
public override int GetBytes (char[] chars, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        uint pendingSurrogate = 0;
        int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pendingSurrogate, true);
        return written;
}
315
// String flavour of GetBytes: encodes "charCount" chars of "s" starting
// at "charIndex" into "bytes" starting at "byteIndex" and returns the
// number of bytes written.
// Throws ArgumentNullException for a null string or array and
// ArgumentOutOfRangeException for an index/count outside its range.
public override int GetBytes (String s, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        if (s == null)
                throw new ArgumentNullException ("s");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (charIndex < 0 || charIndex > s.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));

        int charsAvailable = s.Length - charIndex;
        if (charCount < 0 || charCount > charsAvailable)
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

        // Positioned at the end of the string: nothing to encode.
        if (charsAvailable == 0)
                return 0;

        int room = bytes.Length - byteIndex;
        uint pendingSurrogate = 0;

        unsafe {
                fixed (char* src = s) {
                        fixed (byte* dst = bytes) {
                                return InternalGetBytes (
                                        src + charIndex, charCount,
                                        dst + byteIndex, room,
                                        ref pendingSurrogate, true);
                        }
                }
        }
}
352
353         #endregion
354
355         // Counts the chars that decoding "count" bytes of "bytes", starting at "index",
356         // yields; leftOverBits/leftOverCount describe a sequence begun by an earlier call.
357 #if NET_2_0
358         // NET_2_0 variant: invalid input is handed to Fallback (), and the number of
359         // chars its buffer reports as remaining is added to the count.
360         private static int InternalGetCharCount (
361                 byte[] bytes, int index, int count, uint leftOverBits,
362                 uint leftOverCount, object provider,
363                 ref DecoderFallbackBuffer fallbackBuffer, bool flush)
364 #else
365         private static int InternalGetCharCount (
366                 byte[] bytes, int index, int count, uint leftOverBits,
367                 uint leftOverCount, bool throwOnInvalid, bool flush)
368 #endif
369         {
370                 // Validate the parameters.
371                 if (bytes == null) {
372                         throw new ArgumentNullException ("bytes");
373                 }
374                 if (index < 0 || index > bytes.Length) {
375                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
376                 }
377                 if (count < 0 || count > (bytes.Length - index)) {
378                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
379                 }
380
381                 int length = 0;
382
383                 if (leftOverCount == 0) { // no sequence pending: count the leading run of bytes below 0x80 directly
384                         int end = index + count;
385                         for (; index < end; index++, count--) {
386                                 if (bytes [index] < 0x80)
387                                         length++;
388                                 else
389                                         break;
390                         }
391                 }
392
393                 // Determine the number of characters that we have.
394                 uint ch;
395                 uint leftBits = leftOverBits;
396                 uint leftSoFar = (leftOverCount & (uint)0x0F); // low nibble: bytes of the sequence seen so far
397                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F); // next nibble: total sequence length, 0 when none is pending
398                 while (count > 0) {
399                         ch = (uint)(bytes[index++]);
400                         --count;
401                         if (leftSize == 0) {
402                                 // Process a UTF-8 start character.
403                                 if (ch < (uint)0x0080) {
404                                         // Single-byte UTF-8 character.
405                                         ++length;
406                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
407                                         // Double-byte UTF-8 character.
408                                         leftBits = (ch & (uint)0x1F);
409                                         leftSoFar = 1;
410                                         leftSize = 2;
411                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
412                                         // Three-byte UTF-8 character.
413                                         leftBits = (ch & (uint)0x0F);
414                                         leftSoFar = 1;
415                                         leftSize = 3;
416                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
417                                         // Four-byte UTF-8 character.
418                                         leftBits = (ch & (uint)0x07);
419                                         leftSoFar = 1;
420                                         leftSize = 4;
421                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
422                                         // Five-byte UTF-8 character.
423                                         leftBits = (ch & (uint)0x03);
424                                         leftSoFar = 1;
425                                         leftSize = 5;
426                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
427                                         // Six-byte UTF-8 character.
428                                         leftBits = (ch & (uint)0x03);
429                                         leftSoFar = 1;
430                                         leftSize = 6;
431                                 } else {
432                                         // Invalid UTF-8 start character: falls back (NET_2_0) or throws when throwOnInvalid is set; otherwise it adds nothing.
433 #if NET_2_0
434                                         length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
435 #else
436                                         if (throwOnInvalid)
437                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
438 #endif
439                                 }
440                         } else {
441                                 // Process an extra byte in a multi-byte sequence.
442                                 if ((ch & (uint)0xC0) == (uint)0x80) {
443                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
444                                         if (++leftSoFar >= leftSize) {
445                                                 // We have a complete character now.
446                                                 if (leftBits < (uint)0x10000) {
447                                                         // Is it overlong, i.e. a value that a shorter sequence could have encoded?
448                                                         bool overlong = false;
449                                                         switch (leftSize) {
450                                                         case 2:
451                                                                 overlong = (leftBits <= 0x7F);
452                                                                 break;
453                                                         case 3:
454                                                                 overlong = (leftBits <= 0x07FF);
455                                                                 break;
456                                                         case 4:
457                                                                 overlong = (leftBits <= 0xFFFF);
458                                                                 break;
459                                                         case 5:
460                                                                 overlong = (leftBits <= 0x1FFFFF);
461                                                                 break;
462                                                         case 6:
463                                                                 overlong = (leftBits <= 0x03FFFFFF);
464                                                                 break;
465                                                         }
466                                                         if (overlong) {
467 #if NET_2_0
468                                                                 length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
469 #else
470                                                                 if (throwOnInvalid)
471                                                                         throw new ArgumentException (_("Overlong"), leftBits.ToString ());
472 #endif
473                                                         }
474                                                         else
475                                                                 ++length;
476                                                 } else if (leftBits < (uint)0x110000) {
477                                                         length += 2; // values from 0x10000 up to 0x10FFFF count as two chars
478                                                 } else {
479 #if NET_2_0
480                                                         length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
481 #else
482                                                         if (throwOnInvalid)
483                                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
484 #endif
485                                                 }
486                                                 leftSize = 0;
487                                         }
488                                 } else {
489                                         // Invalid UTF-8 sequence: clear and restart.
490 #if NET_2_0
491                                         length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
492 #else
493                                         if (throwOnInvalid)
494                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
495 #endif
496                                         leftSize = 0;
497                                         --index; // step back so the offending byte is examined again as a start byte
498                                         ++count;
499                                 }
500                         }
501                 }
502                 if (flush && leftSize != 0) {
503                         // We had left-over bytes that didn't make up
504                         // a complete UTF-8 character sequence.
505 #if NET_2_0
506                         length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
507 #else
508                         if (throwOnInvalid)
509                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
510 #endif
511                 }
512
513                 // Return the final length to the caller.
514                 return length;
515         }
516
517 #if NET_2_0
// for GetCharCount()
// Reports the invalid byte at position "index" of "bytes" to the
// decoder fallback buffer and returns the number of chars the buffer
// then has remaining.  When "buffer" is still null it is obtained from
// "provider": created from it when it is a DecoderFallback, otherwise
// taken from it as a Decoder (its FallbackBuffer).
static int Fallback (object provider, ref DecoderFallbackBuffer buffer, byte [] bytes, int index)
{
        if (buffer == null) {
                DecoderFallback fb = provider as DecoderFallback;
                if (fb != null)
                        buffer = fb.CreateFallbackBuffer ();
                else
                        buffer = ((Decoder) provider).FallbackBuffer;
        }
        // Callers already pass the position of the offending byte, so it
        // is forwarded unchanged; adjusting it again would report the
        // byte before it (or -1 for the first byte).
        buffer.Fallback (bytes, index);
        return buffer.Remaining;
}
531
// for GetChars()
// Reports an invalid byte to the decoder fallback buffer and copies
// every char the buffer then supplies into "chars", advancing
// "charIndex" past them.  When "buffer" is still null it is obtained
// from "provider": created from it when it is a DecoderFallback,
// otherwise taken from it as a Decoder (its FallbackBuffer).
// Throws ArgumentException when "chars" has no room for a fallback char.
static void Fallback (object provider, ref DecoderFallbackBuffer buffer, byte [] bytes, int byteIndex,
        char [] chars, ref int charIndex)
{
        if (buffer == null) {
                DecoderFallback fb = provider as DecoderFallback;
                if (fb != null)
                        buffer = fb.CreateFallbackBuffer ();
                else
                        buffer = ((Decoder) provider).FallbackBuffer;
        }
        // NOTE(review): assumes callers pass the position just past the
        // offending byte - the call sites are not visible here; confirm.
        buffer.Fallback (bytes, byteIndex - 1);
        while (buffer.Remaining > 0) {
                // Report a full output array the same way the decoder
                // does for regular chars instead of indexing past its end.
                if (charIndex >= chars.Length)
                        throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                chars [charIndex++] = buffer.GetNextChar ();
        }
}
547 #endif
548
// Number of chars produced by decoding "count" bytes of "bytes"
// starting at "index".  Decoding starts with no left-over state and is
// always flushed; argument validation is done by InternalGetCharCount.
public override int GetCharCount (byte[] bytes, int index, int count)
{
        int charTotal;
#if NET_2_0
        DecoderFallbackBuffer fallbackBuffer = null;
        charTotal = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, true);
#else
        charTotal = InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
#endif
        return charTotal;
}
559
560         // Get the characters that result from decoding a byte buffer.
561 #if NET_2_0
562         private static int InternalGetChars (
563                 byte[] bytes, int byteIndex, int byteCount, char[] chars,
564                 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
565                 object provider,
566                 ref DecoderFallbackBuffer fallbackBuffer, bool flush)
567 #else
568         private static int InternalGetChars (
569                 byte[] bytes, int byteIndex, int byteCount, char[] chars,
570                 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
571                 bool throwOnInvalid, bool flush)
572 #endif
573         {
574                 // Validate the parameters.
575                 if (bytes == null) {
576                         throw new ArgumentNullException ("bytes");
577                 }
578                 if (chars == null) {
579                         throw new ArgumentNullException ("chars");
580                 }
581                 if (byteIndex < 0 || byteIndex > bytes.Length) {
582                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
583                 }
584                 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
585                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
586                 }
587                 if (charIndex < 0 || charIndex > chars.Length) {
588                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
589                 }
590
591                 if (charIndex == chars.Length)
592                         return 0;
593
594                 int posn = charIndex;
595
596                 if (leftOverCount == 0) {
597                         int end = byteIndex + byteCount;
598                         for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
599                                 if (bytes [byteIndex] < 0x80)
600                                         chars [posn] = (char) bytes [byteIndex];
601                                 else
602                                         break;
603                         }
604                 }
605
606                 // Convert the bytes into the output buffer.
607                 uint ch;
608                 int length = chars.Length;
609                 uint leftBits = leftOverBits;
610                 uint leftSoFar = (leftOverCount & (uint)0x0F);
611                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
612
613                 int byteEnd = byteIndex + byteCount;
614                 if (byteEnd < 0 || byteEnd > bytes.Length)
615                         throw new SystemException (String.Format ("INTERNAL ERROR: should not happen: {0} {1} {2}", byteIndex, byteCount, byteEnd));
616
617                 for(; byteIndex < byteEnd; byteIndex++) {
618                         // Fetch the next character from the byte buffer.
619                         ch = (uint)(bytes[byteIndex]);
620                         if (leftSize == 0) {
621                                 // Process a UTF-8 start character.
622                                 if (ch < (uint)0x0080) {
623                                         // Single-byte UTF-8 character.
624                                         if (posn >= length) {
625                                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
626                                         }
627                                         chars[posn++] = (char)ch;
628                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
629                                         // Double-byte UTF-8 character.
630                                         leftBits = (ch & (uint)0x1F);
631                                         leftSoFar = 1;
632                                         leftSize = 2;
633                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
634                                         // Three-byte UTF-8 character.
635                                         leftBits = (ch & (uint)0x0F);
636                                         leftSoFar = 1;
637                                         leftSize = 3;
638                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
639                                         // Four-byte UTF-8 character.
640                                         leftBits = (ch & (uint)0x07);
641                                         leftSoFar = 1;
642                                         leftSize = 4;
643                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
644                                         // Five-byte UTF-8 character.
645                                         leftBits = (ch & (uint)0x03);
646                                         leftSoFar = 1;
647                                         leftSize = 5;
648                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
649                                         // Six-byte UTF-8 character.
650                                         leftBits = (ch & (uint)0x03);
651                                         leftSoFar = 1;
652                                         leftSize = 6;
653                                 } else {
654                                         // Invalid UTF-8 start character.
655 #if NET_2_0
656                                         Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
657 #else
658                                         if (throwOnInvalid)
659                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
660 #endif
661                                 }
662                         } else {
663                                 // Process an extra byte in a multi-byte sequence.
664                                 if ((ch & (uint)0xC0) == (uint)0x80) {
665                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
666                                         if (++leftSoFar >= leftSize) {
667                                                 // We have a complete character now.
668                                                 if (leftBits < (uint)0x10000) {
669                                                         // is it an overlong ?
670                                                         bool overlong = false;
671                                                         switch (leftSize) {
672                                                         case 2:
673                                                                 overlong = (leftBits <= 0x7F);
674                                                                 break;
675                                                         case 3:
676                                                                 overlong = (leftBits <= 0x07FF);
677                                                                 break;
678                                                         case 4:
679                                                                 overlong = (leftBits <= 0xFFFF);
680                                                                 break;
681                                                         case 5:
682                                                                 overlong = (leftBits <= 0x1FFFFF);
683                                                                 break;
684                                                         case 6:
685                                                                 overlong = (leftBits <= 0x03FFFFFF);
686                                                                 break;
687                                                         }
688                                                         if (overlong) {
689 #if NET_2_0
690                                                                 Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
691 #else
692                                                                 if (throwOnInvalid)
693                                                                         throw new ArgumentException (_("Overlong"), leftBits.ToString ());
694 #endif
695                                                         }
696                                                         else if ((leftBits & 0xF800) == 0xD800) {
697                                                                 // UTF-8 doesn't use surrogate characters
698 #if NET_2_0
699                                                                 Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
700 #else
701                                                                 if (throwOnInvalid)
702                                                                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
703 #endif
704                                                         }
705                                                         else {
706                                                                 if (posn >= length) {
707                                                                         throw new ArgumentException
708                                                                                 (_("Arg_InsufficientSpace"), "chars");
709                                                                 }
710                                                                 chars[posn++] = (char)leftBits;
711                                                         }
712                                                 } else if (leftBits < (uint)0x110000) {
713                                                         if ((posn + 2) > length) {
714                                                                 throw new ArgumentException
715                                                                         (_("Arg_InsufficientSpace"), "chars");
716                                                         }
717                                                         leftBits -= (uint)0x10000;
718                                                         chars[posn++] = (char)((leftBits >> 10) +
719                                                                                                    (uint)0xD800);
720                                                         chars[posn++] =
721                                                                 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
722                                                 } else {
723 #if NET_2_0
724                                                         Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
725 #else
726                                                         if (throwOnInvalid)
727                                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
728 #endif
729                                                 }
730                                                 leftSize = 0;
731                                         }
732                                 } else {
733                                         // Invalid UTF-8 sequence: clear and restart.
734 #if NET_2_0
735                                         Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
736 #else
737                                         if (throwOnInvalid)
738                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
739 #endif
740                                         leftSize = 0;
741                                         --byteIndex;
742                                 }
743                         }
744                 }
745                 if (flush && leftSize != 0) {
746                         // We had left-over bytes that didn't make up
747                         // a complete UTF-8 character sequence.
748 #if NET_2_0
749                         Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
750 #else
751                         if (throwOnInvalid)
752                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
753 #endif
754                 }
755                 leftOverBits = leftBits;
756                 leftOverCount = (leftSoFar | (leftSize << 4));
757
758                 // Return the final length to the caller.
759                 return posn - charIndex;
760         }
761
762         // Get the characters that result from decoding a byte buffer.
763         public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
764                                                                  char[] chars, int charIndex)
765         {
766                 uint leftOverBits = 0;
767                 uint leftOverCount = 0;
768 #if NET_2_0
769                 DecoderFallbackBuffer buf = null;
770                 return InternalGetChars (bytes, byteIndex, byteCount, chars, 
771                                 charIndex, ref leftOverBits, ref leftOverCount, DecoderFallback, ref buf, true);
772 #else
773                 return InternalGetChars (bytes, byteIndex, byteCount, chars, 
774                                 charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
775 #endif
776         }
777
778         // Get the maximum number of bytes needed to encode a
779         // specified number of characters.
780         public override int GetMaxByteCount (int charCount)
781         {
782                 if (charCount < 0) {
783                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
784                 }
785                 return charCount * 4;
786         }
787
788         // Get the maximum number of characters needed to decode a
789         // specified number of bytes.
790         public override int GetMaxCharCount (int byteCount)
791         {
792                 if (byteCount < 0) {
793                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
794                 }
795                 return byteCount;
796         }
797
798         // Get a UTF8-specific decoder that is attached to this instance.
799         public override Decoder GetDecoder ()
800         {
801 #if NET_2_0
802                 return new UTF8Decoder (DecoderFallback);
803 #else
804                 return new UTF8Decoder (throwOnInvalid);
805 #endif
806         }
807
808         // Get a UTF8-specific encoder that is attached to this instance.
809         public override Encoder GetEncoder ()
810         {
811                 return new UTF8Encoder (emitIdentifier);
812         }
813
814         // Get the UTF8 preamble.
815         public override byte[] GetPreamble ()
816         {
817                 if (emitIdentifier) {
818                         byte[] pre = new byte [3];
819                         pre[0] = (byte)0xEF;
820                         pre[1] = (byte)0xBB;
821                         pre[2] = (byte)0xBF;
822                         return pre;
823                 } else {
824                         return new byte [0];
825                 }
826         }
827
828         // Determine if this object is equal to another.
829         public override bool Equals (Object value)
830         {
831                 UTF8Encoding enc = (value as UTF8Encoding);
832                 if (enc != null) {
833 #if NET_2_0
834                         return (codePage == enc.codePage &&
835                                         emitIdentifier == enc.emitIdentifier &&
836                                         DecoderFallback == enc.DecoderFallback &&
837                                         EncoderFallback == enc.EncoderFallback);
838 #else
839                         return (codePage == enc.codePage &&
840                                         emitIdentifier == enc.emitIdentifier &&
841                                         throwOnInvalid == enc.throwOnInvalid);
842 #endif
843                 } else {
844                         return false;
845                 }
846         }
847
848         // Get the hash code for this object.
849         public override int GetHashCode ()
850         {
851                 return base.GetHashCode ();
852         }
853         
854         public override byte [] GetBytes (String s)
855         {
856                 if (s == null)
857                         throw new ArgumentNullException ("s");
858                 
859                 int length = GetByteCount (s);
860                 byte [] bytes = new byte [length];
861                 GetBytes (s, 0, s.Length, bytes, 0);
862                 return bytes;
863         }
864
865         // UTF-8 decoder implementation.
866         [Serializable]
867         private class UTF8Decoder : Decoder
868         {
869 #if !NET_2_0
870                 private bool throwOnInvalid;
871 #endif
872                 private uint leftOverBits;
873                 private uint leftOverCount;
874
875                 // Constructor.
876 #if NET_2_0
877                 public UTF8Decoder (DecoderFallback fallback)
878 #else
879                 public UTF8Decoder (bool throwOnInvalid)
880 #endif
881                 {
882 #if NET_2_0
883                         Fallback = fallback;
884 #else
885                         this.throwOnInvalid = throwOnInvalid;
886 #endif
887                         leftOverBits = 0;
888                         leftOverCount = 0;
889                 }
890
891                 // Override inherited methods.
892                 public override int GetCharCount (byte[] bytes, int index, int count)
893                 {
894 #if NET_2_0
895                         DecoderFallbackBuffer buf = null;
896                         return InternalGetCharCount (bytes, index, count,
897                                 leftOverBits, leftOverCount, this, ref buf, false);
898 #else
899                         return InternalGetCharCount (bytes, index, count,
900                                         leftOverBits, leftOverCount, throwOnInvalid, false);
901 #endif
902                 }
903                 public override int GetChars (byte[] bytes, int byteIndex,
904                                                  int byteCount, char[] chars, int charIndex)
905                 {
906 #if NET_2_0
907                         DecoderFallbackBuffer buf = null;
908                         return InternalGetChars (bytes, byteIndex, byteCount,
909                                 chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref buf, false);
910 #else
911                         return InternalGetChars (bytes, byteIndex, byteCount,
912                                 chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
913 #endif
914                 }
915
916         } // class UTF8Decoder
917
918         // UTF-8 encoder implementation.
919         [Serializable]
920         private class UTF8Encoder : Encoder
921         {
922                 private bool emitIdentifier;
923                 private uint leftOverForCount;
924                 private uint leftOverForConv;
925
926                 // Constructor.
927                 public UTF8Encoder (bool emitIdentifier)
928                 {
929                         this.emitIdentifier = emitIdentifier;
930                         leftOverForCount = 0;
931                         leftOverForConv = 0;
932                 }
933
934                 // Override inherited methods.
935                 public override int GetByteCount (char[] chars, int index,
936                                          int count, bool flush)
937                 {
938                         return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
939                 }
940                 public override int GetBytes (char[] chars, int charIndex,
941                                          int charCount, byte[] bytes, int byteIndex, bool flush)
942                 {
943                         int result;
944                         result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
945                         emitIdentifier = false;
946                         return result;
947                 }
948
949 #if NET_2_0
950                 public unsafe override int GetByteCount (char* chars, int count, bool flush)
951                 {
952                         return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
953                 }
954
955                 public unsafe override int GetBytes (char* chars, int charCount,
956                         byte* bytes, int byteCount, bool flush)
957                 {
958                         int result;
959                         result = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
960                         emitIdentifier = false;
961                         return result;
962                 }
963 #endif
964
965         } // class UTF8Encoder
966
967 }; // class UTF8Encoding
968
969 }; // namespace System.Text