updating to the latest module.
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30
31 [Serializable]
32 [MonoTODO ("Fix serialization compatibility with MS.NET")]
33 public class UTF8Encoding : Encoding
34 {
35         // Magic number used by Windows for UTF-8.
36         internal const int UTF8_CODE_PAGE = 65001;
37
38         // Internal state.
39         private bool emitIdentifier;
40         private bool throwOnInvalid;
41
42         // Constructors.
43         public UTF8Encoding () : this (false, false) {}
44         public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
45                         : this (encoderShouldEmitUTF8Identifier, false) {}
46         
47         public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
48                 : base (UTF8_CODE_PAGE)
49         {
50                 emitIdentifier = encoderShouldEmitUTF8Identifier;
51                 throwOnInvalid = throwOnInvalidBytes;
52
53                 web_name = body_name = header_name = "utf-8";
54                 encoding_name = "Unicode (UTF-8)";
55                 is_browser_save = true;
56                 is_browser_display = true;
57                 is_mail_news_display = true;
58                 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
59         }
60
61         // Internal version of "GetByteCount" which can handle a rolling
62         // state between multiple calls to this method.
63         private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
64         {
65                 // Validate the parameters.
66                 if (chars == null) {
67                         throw new ArgumentNullException ("chars");
68                 }
69                 if (index < 0 || index > chars.Length) {
70                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
71                 }
72                 if (count < 0 || count > (chars.Length - index)) {
73                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
74                 }
75
76                 // Determine the lengths of all characters.
77                 char ch;
78                 int length = 0;
79                 uint pair = leftOver;
80                 while (count > 0) {
81                         ch = chars[index];
82                         if (pair == 0) {
83                                 if (ch < '\u0080') {
84                                         ++length;
85                                 } else if (ch < '\u0800') {
86                                         length += 2;
87                                 } else if (ch >= '\uD800' && ch <= '\uDBFF') {
88                                         // This is the start of a surrogate pair.
89                                         pair = (uint)ch;
90                                 } else {
91                                         length += 3;
92                                 }
93                         } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
94                                 // We have a surrogate pair.
95                                 length += 4;
96                                 pair = 0;
97                         } else {
98                                 // We have a surrogate start followed by a
99                                 // regular character.  Technically, this is
100                                 // invalid, but we have to do something.
101                                 // We write out the surrogate start and then
102                                 // re-visit the current character again.
103                                 length += 3;
104                                 pair = 0;
105                                 continue;
106                         }
107                         ++index;
108                         --count;
109                 }
110                 if (flush && pair != 0) {
111                         // Flush the left-over surrogate pair start.
112                         length += 3;
113                 }
114
115                 // Return the final length to the caller.
116                 return length;
117         }
118
119         // Get the number of bytes needed to encode a character buffer.
120         public override int GetByteCount (char[] chars, int index, int count)
121         {
122                 return InternalGetByteCount (chars, index, count, 0, true);
123         }
124
125         // Convenience wrappers for "GetByteCount".
126         public override int GetByteCount (String s)
127         {
128                 // Validate the parameters.
129                 if (s == null) {
130                         throw new ArgumentNullException ("s");
131                 }
132
133                 // Determine the lengths of all characters.
134                 char ch;
135                 int index = 0;
136                 int count = s.Length;
137                 int length = 0;
138                 uint pair;
139                 while (count > 0) {
140                         ch = s[index++];
141                         if (ch < '\u0080') {
142                                 ++length;
143                         } else if (ch < '\u0800') {
144                                 length += 2;
145                         } else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) {
146                                 // This may be the start of a surrogate pair.
147                                 pair = (uint)(s[index]);
148                                 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
149                                         length += 4;
150                                         ++index;
151                                         --count;
152                                 } else {
153                                         length += 3;
154                                 }
155                         } else {
156                                 length += 3;
157                         }
158                         --count;
159                 }
160
161                 // Return the final length to the caller.
162                 return length;
163         }
164
165         // Internal version of "GetBytes" which can handle a rolling
166         // state between multiple calls to this method.
167         private static int InternalGetBytes (char[] chars, int charIndex,
168                                              int charCount, byte[] bytes,
169                                              int byteIndex, ref uint leftOver,
170                                              bool flush)
171         {
172                 // Validate the parameters.
173                 if (chars == null) {
174                         throw new ArgumentNullException ("chars");
175                 }
176                 if (bytes == null) {
177                         throw new ArgumentNullException ("bytes");
178                 }
179                 if (charIndex < 0 || charIndex > chars.Length) {
180                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
181                 }
182                 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
183                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
184                 }
185                 if (byteIndex < 0 || byteIndex > bytes.Length) {
186                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
187                 }
188
189                 // Convert the characters into bytes.
190                 char ch;
191                 int length = bytes.Length;
192                 uint pair;
193                 uint left = leftOver;
194                 int posn = byteIndex;
195                 while (charCount > 0) {
196                         // Fetch the next UTF-16 character pair value.
197                         ch = chars[charIndex++];
198                         --charCount;
199                         if (left == 0) {
200                                 if (ch >= '\uD800' && ch <= '\uDBFF') {
201                                         // This is the start of a surrogate pair.
202                                         left = (uint)ch;
203                                         continue;
204                                 } else {
205                                         // This is a regular character.
206                                         pair = (uint)ch;
207                                 }
208                         } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
209                                 // We have a surrogate pair.
210                                 pair = ((left - (uint)0xD800) << 10) +
211                                            (((uint)ch) - (uint)0xDC00) +
212                                            (uint)0x10000;
213                                 left = 0;
214                         } else {
215                                 // We have a surrogate start followed by a
216                                 // regular character.  Technically, this is
217                                 // invalid, but we have to do something.
218                                 // We write out the surrogate start and then
219                                 // re-visit the current character again.
220                                 pair = (uint)left;
221                                 left = 0;
222                                 --charIndex;
223                                 ++charCount;
224                         }
225
226                         // Encode the character pair value.
227                         if (pair < (uint)0x0080) {
228                                 if (posn >= length) {
229                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
230                                 }
231                                 bytes[posn++] = (byte)pair;
232                         } else if (pair < (uint)0x0800) {
233                                 if ((posn + 2) > length) {
234                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
235                                 }
236                                 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
237                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
238                         } else if (pair < (uint)0x10000) {
239                                 if ((posn + 3) > length) {
240                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
241                                 }
242                                 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
243                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
244                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
245                         } else {
246                                 if ((posn + 4) > length) {
247                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
248                                 }
249                                 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
250                                 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
251                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
252                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
253                         }
254                 }
255                 if (flush && left != 0) {
256                         // Flush the left-over surrogate pair start.
257                         if ((posn + 3) > length) {
258                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
259                         }
260                         bytes[posn++] = (byte)(0xE0 | (left >> 12));
261                         bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
262                         bytes[posn++] = (byte)(0x80 | (left & 0x3F));
263                         left = 0;
264                 }
265                 leftOver = left;
266
267                 // Return the final count to the caller.
268                 return posn - byteIndex;
269         }
270
271         // Get the bytes that result from encoding a character buffer.
272         public override int GetBytes (char[] chars, int charIndex, int charCount,
273                                                                  byte[] bytes, int byteIndex)
274         {
275                 uint leftOver = 0;
276                 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
277         }
278
279         // Convenience wrappers for "GetBytes".
280         public override int GetBytes (String s, int charIndex, int charCount,
281                                                                  byte[] bytes, int byteIndex)
282         {
283                 // Validate the parameters.
284                 if (s == null) {
285                         throw new ArgumentNullException ("s");
286                 }
287                 if (bytes == null) {
288                         throw new ArgumentNullException ("bytes");
289                 }
290                 if (charIndex < 0 || charIndex > s.Length) {
291                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
292                 }
293                 if (charCount < 0 || charCount > (s.Length - charIndex)) {
294                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
295                 }
296                 if (byteIndex < 0 || byteIndex > bytes.Length) {
297                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
298                 }
299
300                 // Convert the characters into bytes.
301                 char ch;
302                 int length = bytes.Length;
303                 uint pair;
304                 int posn = byteIndex;
305                 while (charCount > 0) {
306                         // Fetch the next UTF-16 character pair value.
307                         ch = s[charIndex++];
308                         if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
309                                 // This may be the start of a surrogate pair.
310                                 pair = (uint)(s[charIndex]);
311                                 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
312                                         pair = (pair - (uint)0xDC00) +
313                                                    ((((uint)ch) - (uint)0xD800) << 10) +
314                                                    (uint)0x10000;
315                                         ++charIndex;
316                                         --charCount;
317                                 } else {
318                                         pair = (uint)ch;
319                                 }
320                         } else {
321                                 pair = (uint)ch;
322                         }
323                         --charCount;
324
325                         // Encode the character pair value.
326                         if (pair < (uint)0x0080) {
327                                 if (posn >= length) {
328                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
329                                 }
330                                 bytes[posn++] = (byte)pair;
331                         } else if (pair < (uint)0x0800) {
332                                 if ((posn + 2) > length) {
333                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
334                                 }
335                                 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
336                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
337                         } else if (pair < (uint)0x10000) {
338                                 if ((posn + 3) > length) {
339                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
340                                 }
341                                 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
342                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
343                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
344                         } else {
345                                 if ((posn + 4) > length) {
346                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
347                                 }
348                                 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
349                                 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
350                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
351                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
352                         }
353                 }
354
355                 // Return the final count to the caller.
356                 return posn - byteIndex;
357         }
358
359         // Internal version of "GetCharCount" which can handle a rolling
360         // state between multiple calls to this method.
361         private static int InternalGetCharCount (byte[] bytes, int index, int count,
362                                                                                    uint leftOverBits,
363                                                                                    uint leftOverCount,
364                                                                                    bool throwOnInvalid, bool flush)
365         {
366                 // Validate the parameters.
367                 if (bytes == null) {
368                         throw new ArgumentNullException ("bytes");
369                 }
370                 if (index < 0 || index > bytes.Length) {
371                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
372                 }
373                 if (count < 0 || count > (bytes.Length - index)) {
374                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
375                 }
376
377                 // Determine the number of characters that we have.
378                 uint ch;
379                 int length = 0;
380                 uint leftBits = leftOverBits;
381                 uint leftSoFar = (leftOverCount & (uint)0x0F);
382                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
383                 while (count > 0) {
384                         ch = (uint)(bytes[index++]);
385                         --count;
386                         if (leftSize == 0) {
387                                 // Process a UTF-8 start character.
388                                 if (ch < (uint)0x0080) {
389                                         // Single-byte UTF-8 character.
390                                         ++length;
391                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
392                                         // Double-byte UTF-8 character.
393                                         leftBits = (ch & (uint)0x1F);
394                                         leftSoFar = 1;
395                                         leftSize = 2;
396                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
397                                         // Three-byte UTF-8 character.
398                                         leftBits = (ch & (uint)0x0F);
399                                         leftSoFar = 1;
400                                         leftSize = 3;
401                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
402                                         // Four-byte UTF-8 character.
403                                         leftBits = (ch & (uint)0x07);
404                                         leftSoFar = 1;
405                                         leftSize = 4;
406                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
407                                         // Five-byte UTF-8 character.
408                                         leftBits = (ch & (uint)0x03);
409                                         leftSoFar = 1;
410                                         leftSize = 5;
411                                 } else if ((ch & (uint)0xFC) == (uint)0xFC) {
412                                         // Six-byte UTF-8 character.
413                                         leftBits = (ch & (uint)0x03);
414                                         leftSoFar = 1;
415                                         leftSize = 6;
416                                 } else {
417                                         // Invalid UTF-8 start character.
418                                         if (throwOnInvalid) {
419                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
420                                         }
421                                 }
422                         } else {
423                                 // Process an extra byte in a multi-byte sequence.
424                                 if ((ch & (uint)0xC0) == (uint)0x80) {
425                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
426                                         if (++leftSoFar >= leftSize) {
427                                                 // We have a complete character now.
428                                                 if (leftBits < (uint)0x10000) {
429                                                         if (leftBits != (uint)0xFEFF) {
430                                                                 // is it an overlong ?
431                                                                 bool overlong = false;
432                                                                 switch (leftSize) {
433                                                                 case 2:
434                                                                         overlong = (leftBits <= 0x7F);
435                                                                         break;
436                                                                 case 3:
437                                                                         overlong = (leftBits <= 0x07FF);
438                                                                         break;
439                                                                 case 4:
440                                                                         overlong = (leftBits <= 0xFFFF);
441                                                                         break;
442                                                                 case 5:
443                                                                         overlong = (leftBits <= 0x1FFFFF);
444                                                                         break;
445                                                                 case 6:
446                                                                         overlong = (leftBits <= 0x03FFFFFF);
447                                                                         break;
448                                                                 }
449                                                                 if (overlong) {
450                                                                         if (throwOnInvalid)
451                                                                                 throw new ArgumentException (_("Overlong"), leftBits.ToString ());
452                                                                 }
453                                                                 else
454                                                                         ++length;
455                                                         }
456                                                 } else if (leftBits < (uint)0x110000) {
457                                                         length += 2;
458                                                 } else if (throwOnInvalid) {
459                                                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
460                                                 }
461                                                 leftSize = 0;
462                                         }
463                                 } else {
464                                         // Invalid UTF-8 sequence: clear and restart.
465                                         if (throwOnInvalid) {
466                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
467                                         }
468                                         leftSize = 0;
469                                         --index;
470                                         ++count;
471                                 }
472                         }
473                 }
474                 if (flush && leftSize != 0 && throwOnInvalid) {
475                         // We had left-over bytes that didn't make up
476                         // a complete UTF-8 character sequence.
477                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
478                 }
479
480                 // Return the final length to the caller.
481                 return length;
482         }
483
484         // Get the number of characters needed to decode a byte buffer.
485         public override int GetCharCount (byte[] bytes, int index, int count)
486         {
487                 return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
488         }
489
490         // Get the characters that result from decoding a byte buffer.
491         private static int InternalGetChars (byte[] bytes, int byteIndex,
492                                                                            int byteCount, char[] chars,
493                                                                            int charIndex, ref uint leftOverBits,
494                                                                            ref uint leftOverCount,
495                                                                            bool throwOnInvalid, bool flush)
496         {
497                 // Validate the parameters.
498                 if (bytes == null) {
499                         throw new ArgumentNullException ("bytes");
500                 }
501                 if (chars == null) {
502                         throw new ArgumentNullException ("chars");
503                 }
504                 if (byteIndex < 0 || byteIndex > bytes.Length) {
505                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
506                 }
507                 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
508                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
509                 }
510                 if (charIndex < 0 || charIndex > chars.Length) {
511                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
512                 }
513
514                 if (charIndex == chars.Length)
515                         return 0;
516
517                 // Convert the bytes into the output buffer.
518                 uint ch;
519                 int length = chars.Length;
520                 int posn = charIndex;
521                 uint leftBits = leftOverBits;
522                 uint leftSoFar = (leftOverCount & (uint)0x0F);
523                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
524                 while (byteCount > 0) {
525                         // Fetch the next character from the byte buffer.
526                         ch = (uint)(bytes[byteIndex++]);
527                         --byteCount;
528                         if (leftSize == 0) {
529                                 // Process a UTF-8 start character.
530                                 if (ch < (uint)0x0080) {
531                                         // Single-byte UTF-8 character.
532                                         if (posn >= length) {
533                                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
534                                         }
535                                         chars[posn++] = (char)ch;
536                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
537                                         // Double-byte UTF-8 character.
538                                         leftBits = (ch & (uint)0x1F);
539                                         leftSoFar = 1;
540                                         leftSize = 2;
541                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
542                                         // Three-byte UTF-8 character.
543                                         leftBits = (ch & (uint)0x0F);
544                                         leftSoFar = 1;
545                                         leftSize = 3;
546                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
547                                         // Four-byte UTF-8 character.
548                                         leftBits = (ch & (uint)0x07);
549                                         leftSoFar = 1;
550                                         leftSize = 4;
551                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
552                                         // Five-byte UTF-8 character.
553                                         leftBits = (ch & (uint)0x03);
554                                         leftSoFar = 1;
555                                         leftSize = 5;
556                                 } else if ((ch & (uint)0xFC) == (uint)0xFC) {
557                                         // Six-byte UTF-8 character.
558                                         leftBits = (ch & (uint)0x03);
559                                         leftSoFar = 1;
560                                         leftSize = 6;
561                                 } else {
562                                         // Invalid UTF-8 start character.
563                                         if (throwOnInvalid) {
564                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
565                                         }
566                                 }
567                         } else {
568                                 // Process an extra byte in a multi-byte sequence.
569                                 if ((ch & (uint)0xC0) == (uint)0x80) {
570                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
571                                         if (++leftSoFar >= leftSize) {
572                                                 // We have a complete character now.
573                                                 if (leftBits < (uint)0x10000) {
574                                                         if (leftBits != (uint)0xFEFF) {
575                                                                 // is it an overlong ?
576                                                                 bool overlong = false;
577                                                                 switch (leftSize) {
578                                                                 case 2:
579                                                                         overlong = (leftBits <= 0x7F);
580                                                                         break;
581                                                                 case 3:
582                                                                         overlong = (leftBits <= 0x07FF);
583                                                                         break;
584                                                                 case 4:
585                                                                         overlong = (leftBits <= 0xFFFF);
586                                                                         break;
587                                                                 case 5:
588                                                                         overlong = (leftBits <= 0x1FFFFF);
589                                                                         break;
590                                                                 case 6:
591                                                                         overlong = (leftBits <= 0x03FFFFFF);
592                                                                         break;
593                                                                 }
594                                                                 if (overlong) {
595                                                                         if (throwOnInvalid)
596                                                                                 throw new ArgumentException (_("Overlong"), leftBits.ToString ());
597                                                                 }
598                                                                 else {
599                                                                         if (posn >= length) {
600                                                                                 throw new ArgumentException
601                                                                                         (_("Arg_InsufficientSpace"), "chars");
602                                                                         }
603                                                                         chars[posn++] = (char)leftBits;
604                                                                 }
605                                                         }
606                                                 } else if (leftBits < (uint)0x110000) {
607                                                         if ((posn + 2) > length) {
608                                                                 throw new ArgumentException
609                                                                         (_("Arg_InsufficientSpace"), "chars");
610                                                         }
611                                                         leftBits -= (uint)0x10000;
612                                                         chars[posn++] = (char)((leftBits >> 10) +
613                                                                                                    (uint)0xD800);
614                                                         chars[posn++] =
615                                                                 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
616                                                 } else if (throwOnInvalid) {
617                                                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
618                                                 }
619                                                 leftSize = 0;
620                                         }
621                                 } else {
622                                         // Invalid UTF-8 sequence: clear and restart.
623                                         if (throwOnInvalid) {
624                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
625                                         }
626                                         leftSize = 0;
627                                         --byteIndex;
628                                         ++byteCount;
629                                 }
630                         }
631                 }
632                 if (flush && leftSize != 0 && throwOnInvalid) {
633                         // We had left-over bytes that didn't make up
634                         // a complete UTF-8 character sequence.
635                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
636                 }
637                 leftOverBits = leftBits;
638                 leftOverCount = (leftSoFar | (leftSize << 4));
639
640                 // Return the final length to the caller.
641                 return posn - charIndex;
642         }
643
644         // Get the characters that result from decoding a byte buffer.
645         public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
646                                                                  char[] chars, int charIndex)
647         {
648                 uint leftOverBits = 0;
649                 uint leftOverCount = 0;
650                 return InternalGetChars (bytes, byteIndex, byteCount, chars, 
651                                 charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
652         }
653
654         // Get the maximum number of bytes needed to encode a
655         // specified number of characters.
656         public override int GetMaxByteCount (int charCount)
657         {
658                 if (charCount < 0) {
659                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
660                 }
661                 return charCount * 4;
662         }
663
664         // Get the maximum number of characters needed to decode a
665         // specified number of bytes.
666         public override int GetMaxCharCount (int byteCount)
667         {
668                 if (byteCount < 0) {
669                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
670                 }
671                 return byteCount;
672         }
673
674         // Get a UTF8-specific decoder that is attached to this instance.
675         public override Decoder GetDecoder ()
676         {
677                 return new UTF8Decoder (throwOnInvalid);
678         }
679
680         // Get a UTF8-specific encoder that is attached to this instance.
681         public override Encoder GetEncoder ()
682         {
683                 return new UTF8Encoder (emitIdentifier);
684         }
685
686         // Get the UTF8 preamble.
687         public override byte[] GetPreamble ()
688         {
689                 if (emitIdentifier) {
690                         byte[] pre = new byte [3];
691                         pre[0] = (byte)0xEF;
692                         pre[1] = (byte)0xBB;
693                         pre[2] = (byte)0xBF;
694                         return pre;
695                 } else {
696                         return new byte [0];
697                 }
698         }
699
700         // Determine if this object is equal to another.
701         public override bool Equals (Object value)
702         {
703                 UTF8Encoding enc = (value as UTF8Encoding);
704                 if (enc != null) {
705                         return (codePage == enc.codePage &&
706                                         emitIdentifier == enc.emitIdentifier &&
707                                         throwOnInvalid == enc.throwOnInvalid);
708                 } else {
709                         return false;
710                 }
711         }
712
713         // Get the hash code for this object.
714         public override int GetHashCode ()
715         {
716                 return base.GetHashCode ();
717         }
718         
719         public override byte [] GetBytes (String s)
720         {
721                 if (s == null)
722                         throw new ArgumentNullException ("s");
723                 
724                 int length = GetByteCount (s);
725                 byte [] bytes = new byte [length];
726                 GetBytes (s, 0, s.Length, bytes, 0);
727                 return bytes;
728         }
729
730         // UTF-8 decoder implementation.
731         [Serializable]
732         private class UTF8Decoder : Decoder
733         {
734                 private bool throwOnInvalid;
735                 private uint leftOverBits;
736                 private uint leftOverCount;
737
738                 // Constructor.
739                 public UTF8Decoder (bool throwOnInvalid)
740                 {
741                         this.throwOnInvalid = throwOnInvalid;
742                         leftOverBits = 0;
743                         leftOverCount = 0;
744                 }
745
746                 // Override inherited methods.
747                 public override int GetCharCount (byte[] bytes, int index, int count)
748                 {
749                         return InternalGetCharCount (bytes, index, count,
750                                         leftOverBits, leftOverCount, throwOnInvalid, false);
751                 }
752                 public override int GetChars (byte[] bytes, int byteIndex,
753                                                  int byteCount, char[] chars, int charIndex)
754                 {
755                         return InternalGetChars (bytes, byteIndex, byteCount,
756                                 chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
757                 }
758
759         } // class UTF8Decoder
760
761         // UTF-8 encoder implementation.
762         [Serializable]
763         private class UTF8Encoder : Encoder
764         {
765                 private bool emitIdentifier;
766                 private uint leftOver;
767
768                 // Constructor.
769                 public UTF8Encoder (bool emitIdentifier)
770                 {
771                         this.emitIdentifier = emitIdentifier;
772                         leftOver = 0;
773                 }
774
775                 // Override inherited methods.
776                 public override int GetByteCount (char[] chars, int index,
777                                          int count, bool flush)
778                 {
779                         return InternalGetByteCount (chars, index, count, leftOver, flush);
780                 }
781                 public override int GetBytes (char[] chars, int charIndex,
782                                          int charCount, byte[] bytes, int byteCount, bool flush)
783                 {
784                         int result;
785                         result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
786                         emitIdentifier = false;
787                         return result;
788                 }
789
790         } // class UTF8Encoder
791
792 }; // class UTF8Encoding
793
794 }; // namespace System.Text