2c00c72eeaf43682588adfc19c1a40b9b3c86417
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30
31 [Serializable]
32 [MonoTODO ("Fix serialization compatibility with MS.NET")]
33 #if NET_2_0
34 [MonoTODO ("EncoderFallback is not handled")]
35 #endif
36 public class UTF8Encoding : Encoding
37 {
        // Magic number used by Windows for UTF-8.  The constructor hands
        // this value to the Encoding base class as the code page.
        internal const int UTF8_CODE_PAGE = 65001;

        // Internal state.

        // Copied from the constructor's "encoderShouldEmitUTF8Identifier"
        // argument.
        private bool emitIdentifier;
#if !NET_2_0
        // Copied from the constructor's "throwOnInvalidBytes" argument and
        // forwarded to the decoding helpers.  Only compiled when NET_2_0 is
        // not defined; NET_2_0 builds install a decoder fallback instead.
        private bool throwOnInvalid;
#endif
46
47         // Constructors.
48         public UTF8Encoding () : this (false, false) {}
49         public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
50                         : this (encoderShouldEmitUTF8Identifier, false) {}
51         
52         public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
53                 : base (UTF8_CODE_PAGE)
54         {
55                 emitIdentifier = encoderShouldEmitUTF8Identifier;
56 #if NET_2_0
57                 if (throwOnInvalidBytes)
58                         SetFallbackInternal (null, new DecoderExceptionFallback ());
59                 else
60                         SetFallbackInternal (null, new DecoderReplacementFallback (String.Empty));
61 #else
62                 throwOnInvalid = throwOnInvalidBytes;
63 #endif
64
65                 web_name = body_name = header_name = "utf-8";
66                 encoding_name = "Unicode (UTF-8)";
67                 is_browser_save = true;
68                 is_browser_display = true;
69                 is_mail_news_display = true;
70                 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
71         }
72
73         // Internal version of "GetByteCount" which can handle a rolling
74         // state between multiple calls to this method.
75         private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
76         {
77                 // Validate the parameters.
78                 if (chars == null) {
79                         throw new ArgumentNullException ("chars");
80                 }
81                 if (index < 0 || index > chars.Length) {
82                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
83                 }
84                 if (count < 0 || count > (chars.Length - index)) {
85                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
86                 }
87
88                 // Determine the lengths of all characters.
89                 char ch;
90                 int length = 0;
91                 uint pair = leftOver;
92                 while (count > 0) {
93                         ch = chars[index];
94                         if (pair == 0) {
95                                 if (ch < '\u0080') {
96                                         ++length;
97                                 } else if (ch < '\u0800') {
98                                         length += 2;
99                                 } else if (ch >= '\uD800' && ch <= '\uDBFF') {
100                                         // This is the start of a surrogate pair.
101                                         pair = (uint)ch;
102                                 } else {
103                                         length += 3;
104                                 }
105                         } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
106                                 // We have a surrogate pair.
107                                 length += 4;
108                                 pair = 0;
109                         } else {
110                                 // We have a surrogate start followed by a
111                                 // regular character.  Technically, this is
112                                 // invalid, but we have to do something.
113                                 // We write out the surrogate start and then
114                                 // re-visit the current character again.
115                                 length += 3;
116                                 pair = 0;
117                                 continue;
118                         }
119                         ++index;
120                         --count;
121                 }
122                 if (flush && pair != 0) {
123                         // Flush the left-over surrogate pair start.
124                         length += 3;
125                 }
126
127                 // Return the final length to the caller.
128                 return length;
129         }
130
131         // Get the number of bytes needed to encode a character buffer.
132         public override int GetByteCount (char[] chars, int index, int count)
133         {
134                 return InternalGetByteCount (chars, index, count, 0, true);
135         }
136
137         // Convenience wrappers for "GetByteCount".
138         public override int GetByteCount (String s)
139         {
140                 // Validate the parameters.
141                 if (s == null) {
142                         throw new ArgumentNullException ("s");
143                 }
144
145                 // Determine the lengths of all characters.
146                 char ch;
147                 int index = 0;
148                 int count = s.Length;
149                 int length = 0;
150                 uint pair;
151                 while (count > 0) {
152                         ch = s[index++];
153                         if (ch < '\u0080') {
154                                 ++length;
155                         } else if (ch < '\u0800') {
156                                 length += 2;
157                         } else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) {
158                                 // This may be the start of a surrogate pair.
159                                 pair = (uint)(s[index]);
160                                 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
161                                         length += 4;
162                                         ++index;
163                                         --count;
164                                 } else {
165                                         length += 3;
166                                 }
167                         } else {
168                                 length += 3;
169                         }
170                         --count;
171                 }
172
173                 // Return the final length to the caller.
174                 return length;
175         }
176
177         // Internal version of "GetBytes" which can handle a rolling
178         // state between multiple calls to this method.
179         private static int InternalGetBytes (char[] chars, int charIndex,
180                                              int charCount, byte[] bytes,
181                                              int byteIndex, ref uint leftOver,
182                                              bool flush)
183         {
184                 // Validate the parameters.
185                 if (chars == null) {
186                         throw new ArgumentNullException ("chars");
187                 }
188                 if (bytes == null) {
189                         throw new ArgumentNullException ("bytes");
190                 }
191                 if (charIndex < 0 || charIndex > chars.Length) {
192                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
193                 }
194                 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
195                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
196                 }
197                 if (byteIndex < 0 || byteIndex > bytes.Length) {
198                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
199                 }
200
201                 // Convert the characters into bytes.
202                 char ch;
203                 int length = bytes.Length;
204                 uint pair;
205                 uint left = leftOver;
206                 int posn = byteIndex;
207                 while (charCount > 0) {
208                         // Fetch the next UTF-16 character pair value.
209                         ch = chars[charIndex++];
210                         --charCount;
211                         if (left == 0) {
212                                 if (ch >= '\uD800' && ch <= '\uDBFF') {
213                                         // This is the start of a surrogate pair.
214                                         left = (uint)ch;
215                                         continue;
216                                 } else {
217                                         // This is a regular character.
218                                         pair = (uint)ch;
219                                 }
220                         } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
221                                 // We have a surrogate pair.
222                                 pair = ((left - (uint)0xD800) << 10) +
223                                            (((uint)ch) - (uint)0xDC00) +
224                                            (uint)0x10000;
225                                 left = 0;
226                         } else {
227                                 // We have a surrogate start followed by a
228                                 // regular character.  Technically, this is
229                                 // invalid, but we have to do something.
230                                 // We write out the surrogate start and then
231                                 // re-visit the current character again.
232                                 pair = (uint)left;
233                                 left = 0;
234                                 --charIndex;
235                                 ++charCount;
236                         }
237
238                         // Encode the character pair value.
239                         if (pair < (uint)0x0080) {
240                                 if (posn >= length) {
241                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
242                                 }
243                                 bytes[posn++] = (byte)pair;
244                         } else if (pair < (uint)0x0800) {
245                                 if ((posn + 2) > length) {
246                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
247                                 }
248                                 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
249                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
250                         } else if (pair < (uint)0x10000) {
251                                 if ((posn + 3) > length) {
252                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
253                                 }
254                                 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
255                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
256                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
257                         } else {
258                                 if ((posn + 4) > length) {
259                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
260                                 }
261                                 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
262                                 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
263                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
264                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
265                         }
266                 }
267                 if (flush && left != 0) {
268                         // Flush the left-over surrogate pair start.
269                         if ((posn + 3) > length) {
270                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
271                         }
272                         bytes[posn++] = (byte)(0xE0 | (left >> 12));
273                         bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
274                         bytes[posn++] = (byte)(0x80 | (left & 0x3F));
275                         left = 0;
276                 }
277                 leftOver = left;
278
279                 // Return the final count to the caller.
280                 return posn - byteIndex;
281         }
282
283         // Get the bytes that result from encoding a character buffer.
284         public override int GetBytes (char[] chars, int charIndex, int charCount,
285                                                                  byte[] bytes, int byteIndex)
286         {
287                 uint leftOver = 0;
288                 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
289         }
290
291         // Convenience wrappers for "GetBytes".
292         public override int GetBytes (String s, int charIndex, int charCount,
293                                                                  byte[] bytes, int byteIndex)
294         {
295                 // Validate the parameters.
296                 if (s == null) {
297                         throw new ArgumentNullException ("s");
298                 }
299                 if (bytes == null) {
300                         throw new ArgumentNullException ("bytes");
301                 }
302                 if (charIndex < 0 || charIndex > s.Length) {
303                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
304                 }
305                 if (charCount < 0 || charCount > (s.Length - charIndex)) {
306                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
307                 }
308                 if (byteIndex < 0 || byteIndex > bytes.Length) {
309                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
310                 }
311
312                 // Convert the characters into bytes.
313                 char ch;
314                 int length = bytes.Length;
315                 uint pair;
316                 int posn = byteIndex;
317                 while (charCount > 0) {
318                         // Fetch the next UTF-16 character pair value.
319                         ch = s[charIndex++];
320                         if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
321                                 // This may be the start of a surrogate pair.
322                                 pair = (uint)(s[charIndex]);
323                                 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
324                                         pair = (pair - (uint)0xDC00) +
325                                                    ((((uint)ch) - (uint)0xD800) << 10) +
326                                                    (uint)0x10000;
327                                         ++charIndex;
328                                         --charCount;
329                                 } else {
330                                         pair = (uint)ch;
331                                 }
332                         } else {
333                                 pair = (uint)ch;
334                         }
335                         --charCount;
336
337                         // Encode the character pair value.
338                         if (pair < (uint)0x0080) {
339                                 if (posn >= length) {
340                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
341                                 }
342                                 bytes[posn++] = (byte)pair;
343                         } else if (pair < (uint)0x0800) {
344                                 if ((posn + 2) > length) {
345                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
346                                 }
347                                 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
348                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
349                         } else if (pair < (uint)0x10000) {
350                                 if ((posn + 3) > length) {
351                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
352                                 }
353                                 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
354                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
355                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
356                         } else {
357                                 if ((posn + 4) > length) {
358                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
359                                 }
360                                 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
361                                 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
362                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
363                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
364                         }
365                 }
366
367                 // Return the final count to the caller.
368                 return posn - byteIndex;
369         }
370
371         // Internal version of "GetCharCount" which can handle a rolling
372         // state between multiple calls to this method.
373 #if NET_2_0
374         // Internal version of "GetCharCount" which can handle a rolling
375         // state between multiple calls to this method.
376         private static int InternalGetCharCount (
377                 byte[] bytes, int index, int count, uint leftOverBits,
378                 uint leftOverCount, DecoderFallbackBuffer fallbackBuffer, bool flush)
379 #else
380         private static int InternalGetCharCount (
381                 byte[] bytes, int index, int count, uint leftOverBits,
382                 uint leftOverCount, bool throwOnInvalid, bool flush)
383 #endif
384         {
385                 // Validate the parameters.
386                 if (bytes == null) {
387                         throw new ArgumentNullException ("bytes");
388                 }
389                 if (index < 0 || index > bytes.Length) {
390                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
391                 }
392                 if (count < 0 || count > (bytes.Length - index)) {
393                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
394                 }
395
396                 // Determine the number of characters that we have.
397                 uint ch;
398                 int length = 0;
399                 uint leftBits = leftOverBits;
400                 uint leftSoFar = (leftOverCount & (uint)0x0F);
401                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
402                 while (count > 0) {
403                         ch = (uint)(bytes[index++]);
404                         --count;
405                         if (leftSize == 0) {
406                                 // Process a UTF-8 start character.
407                                 if (ch < (uint)0x0080) {
408                                         // Single-byte UTF-8 character.
409                                         ++length;
410                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
411                                         // Double-byte UTF-8 character.
412                                         leftBits = (ch & (uint)0x1F);
413                                         leftSoFar = 1;
414                                         leftSize = 2;
415                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
416                                         // Three-byte UTF-8 character.
417                                         leftBits = (ch & (uint)0x0F);
418                                         leftSoFar = 1;
419                                         leftSize = 3;
420                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
421                                         // Four-byte UTF-8 character.
422                                         leftBits = (ch & (uint)0x07);
423                                         leftSoFar = 1;
424                                         leftSize = 4;
425                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
426                                         // Five-byte UTF-8 character.
427                                         leftBits = (ch & (uint)0x03);
428                                         leftSoFar = 1;
429                                         leftSize = 5;
430                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
431                                         // Six-byte UTF-8 character.
432                                         leftBits = (ch & (uint)0x03);
433                                         leftSoFar = 1;
434                                         leftSize = 6;
435                                 } else {
436                                         // Invalid UTF-8 start character.
437 #if NET_2_0
438                                         length += Fallback (fallbackBuffer, bytes, index - 1);
439 #else
440                                         if (throwOnInvalid)
441                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
442 #endif
443                                 }
444                         } else {
445                                 // Process an extra byte in a multi-byte sequence.
446                                 if ((ch & (uint)0xC0) == (uint)0x80) {
447                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
448                                         if (++leftSoFar >= leftSize) {
449                                                 // We have a complete character now.
450                                                 if (leftBits < (uint)0x10000) {
451                                                         // is it an overlong ?
452                                                         bool overlong = false;
453                                                         switch (leftSize) {
454                                                         case 2:
455                                                                 overlong = (leftBits <= 0x7F);
456                                                                 break;
457                                                         case 3:
458                                                                 overlong = (leftBits <= 0x07FF);
459                                                                 break;
460                                                         case 4:
461                                                                 overlong = (leftBits <= 0xFFFF);
462                                                                 break;
463                                                         case 5:
464                                                                 overlong = (leftBits <= 0x1FFFFF);
465                                                                 break;
466                                                         case 6:
467                                                                 overlong = (leftBits <= 0x03FFFFFF);
468                                                                 break;
469                                                         }
470                                                         if (overlong) {
471 #if NET_2_0
472                                                                 length += Fallback (fallbackBuffer, bytes, index - 1);
473 #else
474                                                                 if (throwOnInvalid)
475                                                                         throw new ArgumentException (_("Overlong"), leftBits.ToString ());
476 #endif
477                                                         }
478                                                         else
479                                                                 ++length;
480                                                 } else if (leftBits < (uint)0x110000) {
481                                                         length += 2;
482                                                 } else {
483 #if NET_2_0
484                                                         length += Fallback (fallbackBuffer, bytes, index - 1);
485 #else
486                                                         if (throwOnInvalid)
487                                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
488 #endif
489                                                 }
490                                                 leftSize = 0;
491                                         }
492                                 } else {
493                                         // Invalid UTF-8 sequence: clear and restart.
494 #if NET_2_0
495                                         length += Fallback (fallbackBuffer, bytes, index - 1);
496 #else
497                                         if (throwOnInvalid)
498                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
499 #endif
500                                         leftSize = 0;
501                                         --index;
502                                         ++count;
503                                 }
504                         }
505                 }
506                 if (flush && leftSize != 0) {
507                         // We had left-over bytes that didn't make up
508                         // a complete UTF-8 character sequence.
509 #if NET_2_0
510                         length += Fallback (fallbackBuffer, bytes, index - 1);
511 #else
512                         if (throwOnInvalid)
513                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
514 #endif
515                 }
516
517                 // Return the final length to the caller.
518                 return length;
519         }
520
521 #if NET_2_0
522         // for GetCharCount()
523         static int Fallback (DecoderFallbackBuffer buffer, byte [] bytes, int index)
524         {
525                 buffer.Fallback (bytes, index - 1);
526                 return buffer.Remaining;
527         }
528
529         // for GetChars()
530         static void Fallback (DecoderFallbackBuffer buffer, byte [] bytes, int byteIndex,
531                 char [] chars, ref int charIndex)
532         {
533                 buffer.Fallback (bytes, byteIndex - 1);
534                 while (buffer.Remaining > 0)
535                         chars [charIndex++] = buffer.GetNextChar ();
536         }
537 #endif
538
539         // Get the number of characters needed to decode a byte buffer.
540         public override int GetCharCount (byte[] bytes, int index, int count)
541         {
542 #if NET_2_0
543                 return InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback.CreateFallbackBuffer (), true);
544 #else
545                 return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
546 #endif
547         }
548
549         // Get the characters that result from decoding a byte buffer.
550 #if NET_2_0
551         private static int InternalGetChars (
552                 byte[] bytes, int byteIndex, int byteCount, char[] chars,
553                 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
554                 DecoderFallbackBuffer fallbackBuffer, bool flush)
555 #else
556         private static int InternalGetChars (
557                 byte[] bytes, int byteIndex, int byteCount, char[] chars,
558                 int charIndex, ref uint leftOverBits, ref uint leftOverCount,
559                 bool throwOnInvalid, bool flush)
560 #endif
561         {
562                 // Validate the parameters.
563                 if (bytes == null) {
564                         throw new ArgumentNullException ("bytes");
565                 }
566                 if (chars == null) {
567                         throw new ArgumentNullException ("chars");
568                 }
569                 if (byteIndex < 0 || byteIndex > bytes.Length) {
570                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
571                 }
572                 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
573                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
574                 }
575                 if (charIndex < 0 || charIndex > chars.Length) {
576                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
577                 }
578
579                 if (charIndex == chars.Length)
580                         return 0;
581
582                 // Convert the bytes into the output buffer.
583                 uint ch;
584                 int length = chars.Length;
585                 int posn = charIndex;
586                 uint leftBits = leftOverBits;
587                 uint leftSoFar = (leftOverCount & (uint)0x0F);
588                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
589                 while (byteCount > 0) {
590                         // Fetch the next character from the byte buffer.
591                         ch = (uint)(bytes[byteIndex++]);
592                         --byteCount;
593                         if (leftSize == 0) {
594                                 // Process a UTF-8 start character.
595                                 if (ch < (uint)0x0080) {
596                                         // Single-byte UTF-8 character.
597                                         if (posn >= length) {
598                                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
599                                         }
600                                         chars[posn++] = (char)ch;
601                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
602                                         // Double-byte UTF-8 character.
603                                         leftBits = (ch & (uint)0x1F);
604                                         leftSoFar = 1;
605                                         leftSize = 2;
606                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
607                                         // Three-byte UTF-8 character.
608                                         leftBits = (ch & (uint)0x0F);
609                                         leftSoFar = 1;
610                                         leftSize = 3;
611                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
612                                         // Four-byte UTF-8 character.
613                                         leftBits = (ch & (uint)0x07);
614                                         leftSoFar = 1;
615                                         leftSize = 4;
616                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
617                                         // Five-byte UTF-8 character.
618                                         leftBits = (ch & (uint)0x03);
619                                         leftSoFar = 1;
620                                         leftSize = 5;
621                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
622                                         // Six-byte UTF-8 character.
623                                         leftBits = (ch & (uint)0x03);
624                                         leftSoFar = 1;
625                                         leftSize = 6;
626                                 } else {
627                                         // Invalid UTF-8 start character.
628 #if NET_2_0
629                                         Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
630 #else
631                                         if (throwOnInvalid)
632                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
633 #endif
634                                 }
635                         } else {
636                                 // Process an extra byte in a multi-byte sequence.
637                                 if ((ch & (uint)0xC0) == (uint)0x80) {
638                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
639                                         if (++leftSoFar >= leftSize) {
640                                                 // We have a complete character now.
641                                                 if (leftBits < (uint)0x10000) {
642                                                         // is it an overlong ?
643                                                         bool overlong = false;
644                                                         switch (leftSize) {
645                                                         case 2:
646                                                                 overlong = (leftBits <= 0x7F);
647                                                                 break;
648                                                         case 3:
649                                                                 overlong = (leftBits <= 0x07FF);
650                                                                 break;
651                                                         case 4:
652                                                                 overlong = (leftBits <= 0xFFFF);
653                                                                 break;
654                                                         case 5:
655                                                                 overlong = (leftBits <= 0x1FFFFF);
656                                                                 break;
657                                                         case 6:
658                                                                 overlong = (leftBits <= 0x03FFFFFF);
659                                                                 break;
660                                                         }
661                                                         if (overlong) {
662 #if NET_2_0
663                                                                 Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
664 #else
665                                                                 if (throwOnInvalid)
666                                                                         throw new ArgumentException (_("Overlong"), leftBits.ToString ());
667 #endif
668                                                         }
669                                                         else {
670                                                                 if (posn >= length) {
671                                                                         throw new ArgumentException
672                                                                                 (_("Arg_InsufficientSpace"), "chars");
673                                                                 }
674                                                                 chars[posn++] = (char)leftBits;
675                                                         }
676                                                 } else if (leftBits < (uint)0x110000) {
677                                                         if ((posn + 2) > length) {
678                                                                 throw new ArgumentException
679                                                                         (_("Arg_InsufficientSpace"), "chars");
680                                                         }
681                                                         leftBits -= (uint)0x10000;
682                                                         chars[posn++] = (char)((leftBits >> 10) +
683                                                                                                    (uint)0xD800);
684                                                         chars[posn++] =
685                                                                 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
686                                                 } else {
687 #if NET_2_0
688                                                         Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
689 #else
690                                                         if (throwOnInvalid)
691                                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
692 #endif
693                                                 }
694                                                 leftSize = 0;
695                                         }
696                                 } else {
697                                         // Invalid UTF-8 sequence: clear and restart.
698 #if NET_2_0
699                                         Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
700 #else
701                                         if (throwOnInvalid)
702                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
703 #endif
704                                         leftSize = 0;
705                                         --byteIndex;
706                                         ++byteCount;
707                                 }
708                         }
709                 }
710                 if (flush && leftSize != 0) {
711                         // We had left-over bytes that didn't make up
712                         // a complete UTF-8 character sequence.
713 #if NET_2_0
714                         Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
715 #else
716                         if (throwOnInvalid)
717                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
718 #endif
719                 }
720                 leftOverBits = leftBits;
721                 leftOverCount = (leftSoFar | (leftSize << 4));
722
723                 // Return the final length to the caller.
724                 return posn - charIndex;
725         }
726
727         // Get the characters that result from decoding a byte buffer.
728         public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
729                                                                  char[] chars, int charIndex)
730         {
731                 uint leftOverBits = 0;
732                 uint leftOverCount = 0;
733 #if NET_2_0
734                 return InternalGetChars (bytes, byteIndex, byteCount, chars, 
735                                 charIndex, ref leftOverBits, ref leftOverCount, DecoderFallback.CreateFallbackBuffer (), true);
736 #else
737                 return InternalGetChars (bytes, byteIndex, byteCount, chars, 
738                                 charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
739 #endif
740         }
741
742         // Get the maximum number of bytes needed to encode a
743         // specified number of characters.
744         public override int GetMaxByteCount (int charCount)
745         {
746                 if (charCount < 0) {
747                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
748                 }
749                 return charCount * 4;
750         }
751
752         // Get the maximum number of characters needed to decode a
753         // specified number of bytes.
754         public override int GetMaxCharCount (int byteCount)
755         {
756                 if (byteCount < 0) {
757                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
758                 }
759                 return byteCount;
760         }
761
762         // Get a UTF8-specific decoder that is attached to this instance.
763         public override Decoder GetDecoder ()
764         {
765 #if NET_2_0
766                 return new UTF8Decoder (DecoderFallback);
767 #else
768                 return new UTF8Decoder (throwOnInvalid);
769 #endif
770         }
771
772         // Get a UTF8-specific encoder that is attached to this instance.
773         public override Encoder GetEncoder ()
774         {
775                 return new UTF8Encoder (emitIdentifier);
776         }
777
778         // Get the UTF8 preamble.
779         public override byte[] GetPreamble ()
780         {
781                 if (emitIdentifier) {
782                         byte[] pre = new byte [3];
783                         pre[0] = (byte)0xEF;
784                         pre[1] = (byte)0xBB;
785                         pre[2] = (byte)0xBF;
786                         return pre;
787                 } else {
788                         return new byte [0];
789                 }
790         }
791
792         // Determine if this object is equal to another.
793         public override bool Equals (Object value)
794         {
795                 UTF8Encoding enc = (value as UTF8Encoding);
796                 if (enc != null) {
797 #if NET_2_0
798                         return (codePage == enc.codePage &&
799                                         emitIdentifier == enc.emitIdentifier &&
800                                         DecoderFallback == enc.DecoderFallback &&
801                                         EncoderFallback == enc.EncoderFallback);
802 #else
803                         return (codePage == enc.codePage &&
804                                         emitIdentifier == enc.emitIdentifier &&
805                                         throwOnInvalid == enc.throwOnInvalid);
806 #endif
807                 } else {
808                         return false;
809                 }
810         }
811
812         // Get the hash code for this object.
813         public override int GetHashCode ()
814         {
815                 return base.GetHashCode ();
816         }
817         
818         public override byte [] GetBytes (String s)
819         {
820                 if (s == null)
821                         throw new ArgumentNullException ("s");
822                 
823                 int length = GetByteCount (s);
824                 byte [] bytes = new byte [length];
825                 GetBytes (s, 0, s.Length, bytes, 0);
826                 return bytes;
827         }
828
829         // UTF-8 decoder implementation.
830         [Serializable]
831         private class UTF8Decoder : Decoder
832         {
833 #if !NET_2_0
834                 private bool throwOnInvalid;
835 #endif
836                 private uint leftOverBits;
837                 private uint leftOverCount;
838
839                 // Constructor.
840 #if NET_2_0
841                 public UTF8Decoder (DecoderFallback fallback)
842 #else
843                 public UTF8Decoder (bool throwOnInvalid)
844 #endif
845                 {
846 #if NET_2_0
847                         Fallback = fallback;
848 #else
849                         this.throwOnInvalid = throwOnInvalid;
850 #endif
851                         leftOverBits = 0;
852                         leftOverCount = 0;
853                 }
854
855                 // Override inherited methods.
856                 public override int GetCharCount (byte[] bytes, int index, int count)
857                 {
858 #if NET_2_0
859                         return InternalGetCharCount (bytes, index, count,
860                                 leftOverBits, leftOverCount, FallbackBuffer, false);
861 #else
862                         return InternalGetCharCount (bytes, index, count,
863                                         leftOverBits, leftOverCount, throwOnInvalid, false);
864 #endif
865                 }
866                 public override int GetChars (byte[] bytes, int byteIndex,
867                                                  int byteCount, char[] chars, int charIndex)
868                 {
869 #if NET_2_0
870                         return InternalGetChars (bytes, byteIndex, byteCount,
871                                 chars, charIndex, ref leftOverBits, ref leftOverCount, FallbackBuffer, false);
872 #else
873                         return InternalGetChars (bytes, byteIndex, byteCount,
874                                 chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
875 #endif
876                 }
877
878         } // class UTF8Decoder
879
880         // UTF-8 encoder implementation.
881         [Serializable]
882         private class UTF8Encoder : Encoder
883         {
884                 private bool emitIdentifier;
885                 private uint leftOver;
886
887                 // Constructor.
888                 public UTF8Encoder (bool emitIdentifier)
889                 {
890                         this.emitIdentifier = emitIdentifier;
891                         leftOver = 0;
892                 }
893
894                 // Override inherited methods.
895                 public override int GetByteCount (char[] chars, int index,
896                                          int count, bool flush)
897                 {
898                         return InternalGetByteCount (chars, index, count, leftOver, flush);
899                 }
900                 public override int GetBytes (char[] chars, int charIndex,
901                                          int charCount, byte[] bytes, int byteCount, bool flush)
902                 {
903                         int result;
904                         result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
905                         emitIdentifier = false;
906                         return result;
907                 }
908
909         } // class UTF8Encoder
910
911 }; // class UTF8Encoding
912
913 }; // namespace System.Text