2003-03-18 Atsushi Enomoto <ginga@kit.hi-ho.ne.jp>
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included
14  * in all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  */
24
25 namespace System.Text
26 {
27
28 using System;
29
30 [Serializable]
31 public class UTF8Encoding : Encoding
32 {
33         // Magic number used by Windows for UTF-8.
34         internal const int UTF8_CODE_PAGE = 65001;
35
36         // Internal state.
37         private bool emitIdentifier;
38         private bool throwOnInvalid;
39
40         // Constructors.
41         public UTF8Encoding () : this (false, false) {}
42         public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
43                         : this (encoderShouldEmitUTF8Identifier, false) {}
44         public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
45                 : base (UTF8_CODE_PAGE)
46         {
47                 emitIdentifier = encoderShouldEmitUTF8Identifier;
48                 throwOnInvalid = throwOnInvalidBytes;
49         }
50
51         // Internal version of "GetByteCount" which can handle a rolling
52         // state between multiple calls to this method.
53         private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
54         {
55                 // Validate the parameters.
56                 if (chars == null) {
57                         throw new ArgumentNullException ("chars");
58                 }
59                 if (index < 0 || index > chars.Length) {
60                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
61                 }
62                 if (count < 0 || count > (chars.Length - index)) {
63                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
64                 }
65
66                 // Determine the lengths of all characters.
67                 char ch;
68                 int length = 0;
69                 uint pair = leftOver;
70                 while (count > 0) {
71                         ch = chars[index];
72                         if (pair == 0) {
73                                 if (ch < '\u0080') {
74                                         ++length;
75                                 } else if (ch < '\u0800') {
76                                         length += 2;
77                                 } else if (ch >= '\uD800' && ch <= '\uDBFF') {
78                                         // This is the start of a surrogate pair.
79                                         pair = (uint)ch;
80                                 } else {
81                                         length += 3;
82                                 }
83                         } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
84                                 // We have a surrogate pair.
85                                 length += 4;
86                                 pair = 0;
87                         } else {
88                                 // We have a surrogate start followed by a
89                                 // regular character.  Technically, this is
90                                 // invalid, but we have to do something.
91                                 // We write out the surrogate start and then
92                                 // re-visit the current character again.
93                                 length += 3;
94                                 pair = 0;
95                                 continue;
96                         }
97                         ++index;
98                         --count;
99                 }
100                 if (flush && pair != 0) {
101                         // Flush the left-over surrogate pair start.
102                         length += 3;
103                 }
104
105                 // Return the final length to the caller.
106                 return length;
107         }
108
109         // Get the number of bytes needed to encode a character buffer.
110         public override int GetByteCount (char[] chars, int index, int count)
111         {
112                 return InternalGetByteCount (chars, index, count, 0, true);
113         }
114
115         // Convenience wrappers for "GetByteCount".
116         public override int GetByteCount (String s)
117         {
118                 // Validate the parameters.
119                 if (s == null) {
120                         throw new ArgumentNullException ("s");
121                 }
122
123                 // Determine the lengths of all characters.
124                 char ch;
125                 int index = 0;
126                 int count = s.Length;
127                 int length = 0;
128                 uint pair;
129                 while (count > 0) {
130                         ch = s[index++];
131                         if (ch < '\u0080') {
132                                 ++length;
133                         } else if (ch < '\u0800') {
134                                 length += 2;
135                         } else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) {
136                                 // This may be the start of a surrogate pair.
137                                 pair = (uint)(s[index]);
138                                 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
139                                         length += 4;
140                                         ++index;
141                                         --count;
142                                 } else {
143                                         length += 3;
144                                 }
145                         } else {
146                                 length += 3;
147                         }
148                         --count;
149                 }
150
151                 // Return the final length to the caller.
152                 return length;
153         }
154
155         // Internal version of "GetBytes" which can handle a rolling
156         // state between multiple calls to this method.
157         private static int InternalGetBytes (char[] chars, int charIndex,
158                                              int charCount, byte[] bytes,
159                                              int byteIndex, ref uint leftOver,
160                                              bool flush)
161         {
162                 // Validate the parameters.
163                 if (chars == null) {
164                         throw new ArgumentNullException ("chars");
165                 }
166                 if (bytes == null) {
167                         throw new ArgumentNullException ("bytes");
168                 }
169                 if (charIndex < 0 || charIndex > chars.Length) {
170                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
171                 }
172                 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
173                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
174                 }
175                 if (byteIndex < 0 || byteIndex > bytes.Length) {
176                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
177                 }
178
179                 // Convert the characters into bytes.
180                 char ch;
181                 int length = bytes.Length;
182                 uint pair;
183                 uint left = leftOver;
184                 int posn = byteIndex;
185                 while (charCount > 0) {
186                         // Fetch the next UTF-16 character pair value.
187                         ch = chars[charIndex++];
188                         --charCount;
189                         if (left == 0) {
190                                 if (ch >= '\uD800' && ch <= '\uDBFF') {
191                                         // This is the start of a surrogate pair.
192                                         left = (uint)ch;
193                                         continue;
194                                 } else {
195                                         // This is a regular character.
196                                         pair = (uint)ch;
197                                 }
198                         } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
199                                 // We have a surrogate pair.
200                                 pair = ((left - (uint)0xD800) << 10) +
201                                            (((uint)ch) - (uint)0xDC00) +
202                                            (uint)0x10000;
203                                 left = 0;
204                         } else {
205                                 // We have a surrogate start followed by a
206                                 // regular character.  Technically, this is
207                                 // invalid, but we have to do something.
208                                 // We write out the surrogate start and then
209                                 // re-visit the current character again.
210                                 pair = (uint)left;
211                                 left = 0;
212                                 --charIndex;
213                                 ++charCount;
214                         }
215
216                         // Encode the character pair value.
217                         if (pair < (uint)0x0080) {
218                                 if (posn >= length) {
219                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
220                                 }
221                                 bytes[posn++] = (byte)pair;
222                         } else if (pair < (uint)0x0800) {
223                                 if ((posn + 2) > length) {
224                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
225                                 }
226                                 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
227                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
228                         } else if (pair < (uint)0x10000) {
229                                 if ((posn + 3) > length) {
230                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
231                                 }
232                                 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
233                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
234                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
235                         } else {
236                                 if ((posn + 4) > length) {
237                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
238                                 }
239                                 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
240                                 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
241                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
242                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
243                         }
244                 }
245                 if (flush && left != 0) {
246                         // Flush the left-over surrogate pair start.
247                         if ((posn + 3) > length) {
248                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
249                         }
250                         bytes[posn++] = (byte)(0xE0 | (left >> 12));
251                         bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
252                         bytes[posn++] = (byte)(0x80 | (left & 0x3F));
253                         left = 0;
254                 }
255                 leftOver = left;
256
257                 // Return the final count to the caller.
258                 return posn - byteIndex;
259         }
260
261         // Get the bytes that result from encoding a character buffer.
262         public override int GetBytes (char[] chars, int charIndex, int charCount,
263                                                                  byte[] bytes, int byteIndex)
264         {
265                 uint leftOver = 0;
266                 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
267         }
268
269         // Convenience wrappers for "GetBytes".
270         public override int GetBytes (String s, int charIndex, int charCount,
271                                                                  byte[] bytes, int byteIndex)
272         {
273                 // Validate the parameters.
274                 if (s == null) {
275                         throw new ArgumentNullException ("s");
276                 }
277                 if (bytes == null) {
278                         throw new ArgumentNullException ("bytes");
279                 }
280                 if (charIndex < 0 || charIndex > s.Length) {
281                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
282                 }
283                 if (charCount < 0 || charCount > (s.Length - charIndex)) {
284                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
285                 }
286                 if (byteIndex < 0 || byteIndex > bytes.Length) {
287                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
288                 }
289
290                 // Convert the characters into bytes.
291                 char ch;
292                 int length = bytes.Length;
293                 uint pair;
294                 int posn = byteIndex;
295                 while (charCount > 0) {
296                         // Fetch the next UTF-16 character pair value.
297                         ch = s[charIndex++];
298                         --charCount;
299                         if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
300                                 // This may be the start of a surrogate pair.
301                                 pair = (uint)(s[charIndex]);
302                                 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
303                                         pair = (pair - (uint)0xDC00) +
304                                                    ((((uint)ch) - (uint)0xD800) << 10) +
305                                                    (uint)0x10000;
306                                         ++charIndex;
307                                         --charCount;
308                                 } else {
309                                         pair = (uint)ch;
310                                 }
311                         } else {
312                                 pair = (uint)ch;
313                         }
314
315                         // Encode the character pair value.
316                         if (pair < (uint)0x0080) {
317                                 if (posn >= length) {
318                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
319                                 }
320                                 bytes[posn++] = (byte)pair;
321                         } else if (pair < (uint)0x0800) {
322                                 if ((posn + 2) > length) {
323                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
324                                 }
325                                 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
326                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
327                         } else if (pair < (uint)0x10000) {
328                                 if ((posn + 3) > length) {
329                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
330                                 }
331                                 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
332                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
333                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
334                         } else {
335                                 if ((posn + 4) > length) {
336                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
337                                 }
338                                 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
339                                 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
340                                 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
341                                 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
342                         }
343                 }
344
345                 // Return the final count to the caller.
346                 return posn - byteIndex;
347         }
348
349         // Internal version of "GetCharCount" which can handle a rolling
350         // state between multiple calls to this method.
351         private static int InternalGetCharCount (byte[] bytes, int index, int count,
352                                                                                    uint leftOverBits,
353                                                                                    uint leftOverCount,
354                                                                                    bool throwOnInvalid, bool flush)
355         {
356                 // Validate the parameters.
357                 if (bytes == null) {
358                         throw new ArgumentNullException ("bytes");
359                 }
360                 if (index < 0 || index > bytes.Length) {
361                         throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
362                 }
363                 if (count < 0 || count > (bytes.Length - index)) {
364                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
365                 }
366
367                 // Determine the number of characters that we have.
368                 uint ch;
369                 int length = 0;
370                 uint leftBits = leftOverBits;
371                 uint leftSoFar = (leftOverCount & (uint)0x0F);
372                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
373                 while (count > 0) {
374                         ch = (uint)(bytes[index++]);
375                         --count;
376                         if (leftSize == 0) {
377                                 // Process a UTF-8 start character.
378                                 if (ch < (uint)0x0080) {
379                                         // Single-byte UTF-8 character.
380                                         ++length;
381                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
382                                         // Double-byte UTF-8 character.
383                                         leftBits = (ch & (uint)0x1F);
384                                         leftSoFar = 1;
385                                         leftSize = 2;
386                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
387                                         // Three-byte UTF-8 character.
388                                         leftBits = (ch & (uint)0x0F);
389                                         leftSoFar = 1;
390                                         leftSize = 3;
391                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
392                                         // Four-byte UTF-8 character.
393                                         leftBits = (ch & (uint)0x07);
394                                         leftSoFar = 1;
395                                         leftSize = 4;
396                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
397                                         // Five-byte UTF-8 character.
398                                         leftBits = (ch & (uint)0x03);
399                                         leftSoFar = 1;
400                                         leftSize = 5;
401                                 } else if ((ch & (uint)0xFC) == (uint)0xFC) {
402                                         // Six-byte UTF-8 character.
403                                         leftBits = (ch & (uint)0x03);
404                                         leftSoFar = 1;
405                                         leftSize = 6;
406                                 } else {
407                                         // Invalid UTF-8 start character.
408                                         if (throwOnInvalid) {
409                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
410                                         }
411                                 }
412                         } else {
413                                 // Process an extra byte in a multi-byte sequence.
414                                 if ((ch & (uint)0xC0) == (uint)0x80) {
415                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
416                                         if (++leftSoFar >= leftSize) {
417                                                 // We have a complete character now.
418                                                 if (leftBits < (uint)0x10000) {
419                                                         if (leftBits != (uint)0xFEFF) {
420                                                                 ++length;
421                                                         }
422                                                 } else if (leftBits < (uint)0x110000) {
423                                                         length += 2;
424                                                 } else if (throwOnInvalid) {
425                                                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
426                                                 }
427                                                 leftSize = 0;
428                                         }
429                                 } else {
430                                         // Invalid UTF-8 sequence: clear and restart.
431                                         if (throwOnInvalid) {
432                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
433                                         }
434                                         leftSize = 0;
435                                         --index;
436                                         ++count;
437                                 }
438                         }
439                 }
440                 if (flush && leftSize != 0 && throwOnInvalid) {
441                         // We had left-over bytes that didn't make up
442                         // a complete UTF-8 character sequence.
443                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
444                 }
445
446                 // Return the final length to the caller.
447                 return length;
448         }
449
450         // Get the number of characters needed to decode a byte buffer.
451         public override int GetCharCount (byte[] bytes, int index, int count)
452         {
453                 return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
454         }
455
456         // Get the characters that result from decoding a byte buffer.
457         private static int InternalGetChars (byte[] bytes, int byteIndex,
458                                                                            int byteCount, char[] chars,
459                                                                            int charIndex, ref uint leftOverBits,
460                                                                            ref uint leftOverCount,
461                                                                            bool throwOnInvalid, bool flush)
462         {
463                 // Validate the parameters.
464                 if (bytes == null) {
465                         throw new ArgumentNullException ("bytes");
466                 }
467                 if (chars == null) {
468                         throw new ArgumentNullException ("chars");
469                 }
470                 if (byteIndex < 0 || byteIndex > bytes.Length) {
471                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
472                 }
473                 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
474                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
475                 }
476                 if (charIndex < 0 || charIndex > chars.Length) {
477                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
478                 }
479
480                 // Convert the bytes into the output buffer.
481                 uint ch;
482                 int length = chars.Length;
483                 int posn = charIndex;
484                 uint leftBits = leftOverBits;
485                 uint leftSoFar = (leftOverCount & (uint)0x0F);
486                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
487                 while (byteCount > 0) {
488                         // Fetch the next character from the byte buffer.
489                         ch = (uint)(bytes[byteIndex++]);
490                         --byteCount;
491                         if (leftSize == 0) {
492                                 // Process a UTF-8 start character.
493                                 if (ch < (uint)0x0080) {
494                                         // Single-byte UTF-8 character.
495                                         if (posn >= length) {
496                                                 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
497                                         }
498                                         chars[posn++] = (char)ch;
499                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
500                                         // Double-byte UTF-8 character.
501                                         leftBits = (ch & (uint)0x1F);
502                                         leftSoFar = 1;
503                                         leftSize = 2;
504                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
505                                         // Three-byte UTF-8 character.
506                                         leftBits = (ch & (uint)0x0F);
507                                         leftSoFar = 1;
508                                         leftSize = 3;
509                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
510                                         // Four-byte UTF-8 character.
511                                         leftBits = (ch & (uint)0x07);
512                                         leftSoFar = 1;
513                                         leftSize = 4;
514                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
515                                         // Five-byte UTF-8 character.
516                                         leftBits = (ch & (uint)0x03);
517                                         leftSoFar = 1;
518                                         leftSize = 5;
519                                 } else if ((ch & (uint)0xFC) == (uint)0xFC) {
520                                         // Six-byte UTF-8 character.
521                                         leftBits = (ch & (uint)0x03);
522                                         leftSoFar = 1;
523                                         leftSize = 6;
524                                 } else {
525                                         // Invalid UTF-8 start character.
526                                         if (throwOnInvalid) {
527                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
528                                         }
529                                 }
530                         } else {
531                                 // Process an extra byte in a multi-byte sequence.
532                                 if ((ch & (uint)0xC0) == (uint)0x80) {
533                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
534                                         if (++leftSoFar >= leftSize) {
535                                                 // We have a complete character now.
536                                                 if (leftBits < (uint)0x10000) {
537                                                         if (leftBits != (uint)0xFEFF) {
538                                                                 if (posn >= length) {
539                                                                         throw new ArgumentException
540                                                                                 (_("Arg_InsufficientSpace"), "chars");
541                                                                 }
542                                                                 chars[posn++] = (char)leftBits;
543                                                         }
544                                                 } else if (leftBits < (uint)0x110000) {
545                                                         if ((posn + 2) > length) {
546                                                                 throw new ArgumentException
547                                                                         (_("Arg_InsufficientSpace"), "chars");
548                                                         }
549                                                         leftBits -= (uint)0x10000;
550                                                         chars[posn++] = (char)((leftBits >> 10) +
551                                                                                                    (uint)0xD800);
552                                                         chars[posn++] =
553                                                                 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
554                                                 } else if (throwOnInvalid) {
555                                                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
556                                                 }
557                                                 leftSize = 0;
558                                         }
559                                 } else {
560                                         // Invalid UTF-8 sequence: clear and restart.
561                                         if (throwOnInvalid) {
562                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
563                                         }
564                                         leftSize = 0;
565                                         --byteIndex;
566                                         ++byteCount;
567                                 }
568                         }
569                 }
570                 if (flush && leftSize != 0 && throwOnInvalid) {
571                         // We had left-over bytes that didn't make up
572                         // a complete UTF-8 character sequence.
573                         throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
574                 }
575                 leftOverBits = leftBits;
576                 leftOverCount = (leftSoFar | (leftSize << 4));
577
578                 // Return the final length to the caller.
579                 return posn - charIndex;
580         }
581
582         // Get the characters that result from decoding a byte buffer.
583         public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
584                                                                  char[] chars, int charIndex)
585         {
586                 uint leftOverBits = 0;
587                 uint leftOverCount = 0;
588                 return InternalGetChars (bytes, byteIndex, byteCount, chars, 
589                                 charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
590         }
591
592         // Get the maximum number of bytes needed to encode a
593         // specified number of characters.
594         public override int GetMaxByteCount (int charCount)
595         {
596                 if (charCount < 0) {
597                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
598                 }
599                 return charCount * 4;
600         }
601
602         // Get the maximum number of characters needed to decode a
603         // specified number of bytes.
604         public override int GetMaxCharCount (int byteCount)
605         {
606                 if (byteCount < 0) {
607                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
608                 }
609                 return byteCount;
610         }
611
612         // Get a UTF8-specific decoder that is attached to this instance.
613         public override Decoder GetDecoder ()
614         {
615                 return new UTF8Decoder (throwOnInvalid);
616         }
617
618         // Get a UTF8-specific encoder that is attached to this instance.
619         public override Encoder GetEncoder ()
620         {
621                 return new UTF8Encoder (emitIdentifier);
622         }
623
624         // Get the UTF8 preamble.
625         public override byte[] GetPreamble ()
626         {
627                 if (emitIdentifier) {
628                         byte[] pre = new byte [3];
629                         pre[0] = (byte)0xEF;
630                         pre[1] = (byte)0xBB;
631                         pre[2] = (byte)0xBF;
632                         return pre;
633                 } else {
634                         return new byte [0];
635                 }
636         }
637
638         // Determine if this object is equal to another.
639         public override bool Equals (Object value)
640         {
641                 UTF8Encoding enc = (value as UTF8Encoding);
642                 if (enc != null) {
643                         return (codePage == enc.codePage &&
644                                         emitIdentifier == enc.emitIdentifier &&
645                                         throwOnInvalid == enc.throwOnInvalid);
646                 } else {
647                         return false;
648                 }
649         }
650
651         // Get the hash code for this object.
652         public override int GetHashCode ()
653         {
654                 return base.GetHashCode ();
655         }
656
657 #if !ECMA_COMPAT
658
659         // Get the mail body name for this encoding.
660         public override String BodyName
661         {
662                 get {
663                         return "utf-8";
664                 }
665         }
666
667         // Get the human-readable name for this encoding.
668         public override String EncodingName
669         {
670                 get {
671                         return "Unicode (UTF-8)";
672                 }
673         }
674
675         // Get the mail agent header name for this encoding.
676         public override String HeaderName
677         {
678                 get {
679                         return "utf-8";
680                 }
681         }
682
683         // Determine if this encoding can be displayed in a Web browser.
684         public override bool IsBrowserDisplay
685         {
686                 get {
687                         return true;
688                 }
689         }
690
691         // Determine if this encoding can be saved from a Web browser.
692         public override bool IsBrowserSave
693         {
694                 get {
695                         return true;
696                 }
697         }
698
699         // Determine if this encoding can be displayed in a mail/news agent.
700         public override bool IsMailNewsDisplay
701         {
702                 get {
703                         return true;
704                 }
705         }
706
707         // Determine if this encoding can be saved from a mail/news agent.
708         public override bool IsMailNewsSave
709         {
710                 get {
711                         return true;
712                 }
713         }
714
715         // Get the IANA-preferred Web name for this encoding.
716         public override String WebName
717         {
718                 get {
719                         return "utf-8";
720                 }
721         }
722
723         // Get the Windows code page represented by this object.
724         public override int WindowsCodePage
725         {
726                 get {
727                         return UnicodeEncoding.UNICODE_CODE_PAGE;
728                 }
729         }
730
731 #endif // !ECMA_COMPAT
732
733         // UTF-8 decoder implementation.
734         [Serializable]
735         private sealed class UTF8Decoder : Decoder
736         {
737                 private bool throwOnInvalid;
738                 private uint leftOverBits;
739                 private uint leftOverCount;
740
741                 // Constructor.
742                 public UTF8Decoder (bool throwOnInvalid)
743                 {
744                         this.throwOnInvalid = throwOnInvalid;
745                         leftOverBits = 0;
746                         leftOverCount = 0;
747                 }
748
749                 // Override inherited methods.
750                 public override int GetCharCount (byte[] bytes, int index, int count)
751                 {
752                         return InternalGetCharCount (bytes, index, count,
753                                         leftOverBits, leftOverCount, throwOnInvalid, false);
754                 }
755                 public override int GetChars (byte[] bytes, int byteIndex,
756                                                  int byteCount, char[] chars, int charIndex)
757                 {
758                         return InternalGetChars (bytes, byteIndex, byteCount,
759                                 chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
760                 }
761
762         } // class UTF8Decoder
763
764         // UTF-8 encoder implementation.
765         [Serializable]
766         private sealed class UTF8Encoder : Encoder
767         {
768                 private bool emitIdentifier;
769                 private uint leftOver;
770
771                 // Constructor.
772                 public UTF8Encoder (bool emitIdentifier)
773                 {
774                         this.emitIdentifier = emitIdentifier;
775                         leftOver = 0;
776                 }
777
778                 // Override inherited methods.
779                 public override int GetByteCount (char[] chars, int index,
780                                          int count, bool flush)
781                 {
782                         return InternalGetByteCount (chars, index, count, leftOver, flush);
783                 }
784                 public override int GetBytes (char[] chars, int charIndex,
785                                          int charCount, byte[] bytes, int byteCount, bool flush)
786                 {
787                         int result;
788                         result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
789                         emitIdentifier = false;
790                         return result;
791                 }
792
793         } // class UTF8Encoder
794
795 }; // class UTF8Encoding
796
797 }; // namespace System.Text