1944d98b2c081d34ec5f4410521f07f4fc144f63
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30 using System.Runtime.InteropServices;
31
32 [Serializable]
33 [MonoLimitation ("Serialization format not compatible with .NET")]
34 [ComVisible (true)]
35 public class UTF8Encoding : Encoding
36 {
// Code page number that Windows assigns to UTF-8.
internal const int UTF8_CODE_PAGE = 65001;

// Copy of the 'encoderShouldEmitUTF8Identifier' constructor argument.
private bool emitIdentifier;

// Creates an instance with both constructor flags set to false.
public UTF8Encoding ()
        : this (false, false)
{
}

// Creates an instance with 'throwOnInvalidBytes' set to false.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
        : this (encoderShouldEmitUTF8Identifier, false)
{
}

// Main constructor: records the identifier flag, installs the encoder and
// decoder fallbacks and fills in the descriptive members of the base class.
//
// encoderShouldEmitUTF8Identifier: stored as-is in 'emitIdentifier'.
// throwOnInvalidBytes: when true the exception fallbacks are installed,
//                      otherwise the standard safe fallbacks are used.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
        : base (UTF8_CODE_PAGE)
{
        emitIdentifier = encoderShouldEmitUTF8Identifier;

        if (!throwOnInvalidBytes) {
                SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
        } else {
                SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
        }

        // Every protocol-level name of this encoding is "utf-8".
        header_name = "utf-8";
        body_name = "utf-8";
        web_name = "utf-8";
        encoding_name = "Unicode (UTF-8)";

        // Usage flags: all four are enabled.
        is_browser_save = true;
        is_browser_display = true;
        is_mail_news_display = true;
        is_mail_news_save = true;

        windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
65
66         ///////////////////////////////////////////////////////////////////////
67         // INTERNAL DECODING FUNCTION (UTF8 -> CHAR/UTF16)
68         ///////////////////////////////////////////////////////////////////////
69
// Result codes reported by the internal UTF-8 decoding helpers.
internal enum DecoderStatus {
        // A complete character was decoded.
        Ok = 0,
        // The output buffer cannot hold the decoded character.
        InsufficientSpace = 1,
        // Lead byte 0xC0/0xC1, or a value of 0x110000 and above.
        InvalidChar = 2,
        // A continuation byte was expected but another byte was found.
        InvalidSequence = 3,
        // The byte cannot start a UTF-8 sequence.
        InvalidStart = 4,
        // The input ended before a complete character was read.
        InputRunOut = 5,
        // The sequence encodes a value in the surrogate range.
        SurrogateFound = 6,
        // The sequence is an overlong encoding.
        Overlong = 7,
}
80
// Decodes at most one UTF-8 encoded character from a byte buffer and
// converts it to UTF-16.
//
//   bytes, byteCount   input buffer and number of bytes available in it.
//   chars, charCount   output buffer and its capacity.  If 'chars' is null
//                      this function only counts bytes and chars without
//                      writing anything.
//   bytesProcessed     number of input bytes consumed by this call.
//   charsProcessed     number of UTF-16 chars produced by this call (one,
//                      or two when a surrogate pair is generated).
//   leftBytes          continuation bytes still expected.  When the call
//                      returns InputRunOut in the middle of a sequence,
//                      bit 4 additionally stores the pending overlong check.
//   leftBits           code point bits accumulated so far.
//   procBytes          raw bytes of the current sequence, first byte in the
//                      most significant position.
//
// Returns Ok after a complete character; any other status leaves the state
// variables as they were at the point of failure so the caller can build
// its fallback input from 'procBytes' and decide how to resume.
//
// NOTE: BOM (0xEF 0xBB 0xBF) is not yet supported.
//       See http://www.cl.cam.ac.uk/~mgk25/unicode.html
private unsafe static DecoderStatus InternalGetChar (
        byte* bytes, int byteCount,
        char* chars, int charCount,
        out int bytesProcessed, out int charsProcessed,
        ref uint leftBytes, ref uint leftBits, ref uint procBytes)
{
        uint ch;
        bool checkByte;

        // reset counters
        bytesProcessed = 0;
        charsProcessed = 0;

        if (leftBytes == 0) {
                // No sequence in progress: fetch the start byte.
                if (byteCount == 0)
                        return DecoderStatus.InputRunOut;
                ch = (uint) (*bytes++);
                bytesProcessed++;
                byteCount--;
                procBytes = ch;
                if (ch < (uint) 0x0080) {
                        // Single-byte UTF-8 character.
                        leftBits = ch;
                        leftBytes = 0;
                } else if (ch == (uint) 0xc0 || ch == (uint) 0xc1) {
                        // These lead bytes can only start overlong
                        // two-byte sequences.
                        return DecoderStatus.InvalidChar;
                } else if ((ch & (uint) 0xE0) == (uint) 0xC0) {
                        // Double-byte UTF-8 character.
                        leftBits = ((ch & (uint) 0x1F) << 6*1);
                        leftBytes = 1;
                } else if ((ch & (uint) 0xF0) == (uint) 0xE0) {
                        // Three-byte UTF-8 character.
                        leftBits = ((ch & (uint) 0x0F) << 6*2);
                        leftBytes = 2;
                } else if ((ch & (uint) 0xF8) == (uint) 0xF0) {
                        // Four-byte UTF-8 character.
                        leftBits = ((ch & (uint) 0x07) << 6*3);
                        leftBytes = 3;
                        // Detect as soon as possible lead bytes that
                        // can only produce values above 0x10FFFF.
                        if (leftBits >= (uint) 0x110000)
                                return DecoderStatus.InvalidChar;
                } else {
                        // Stray continuation byte, or a five/six-byte
                        // lead, which is not valid UTF-8.
                        leftBits = leftBytes = 0;
                        return DecoderStatus.InvalidStart;
                }
                // A multi-byte lead whose payload bits are all zero may
                // start an overlong form: inspect the next byte.
                checkByte = (leftBytes > 0 && leftBits == 0);
        } else {
                // Resume a sequence suspended by InputRunOut: bit 4 holds
                // the pending overlong check, the low bits the byte count.
                checkByte = (leftBytes >> 4) != 0;
                leftBytes &= (uint) 0x0f;
        }

        // process the required continuation bytes...
        for (; leftBytes > 0; leftBytes--) {
                if (byteCount == 0) {
                        // Suspend: fold the overlong flag into leftBytes.
                        leftBytes = ((uint) (checkByte ? 0x10 : 0x00)) | leftBytes;
                        return DecoderStatus.InputRunOut;
                }
                ch = (uint) (*bytes++);
                if ((ch & (uint) 0xC0) != (uint) 0x80) {
                        // Not a continuation byte.  We return before
                        // counting it so decoding can restart on it.
                        return DecoderStatus.InvalidSequence;
                }
                bytesProcessed++;
                byteCount--;
                procBytes = (procBytes << 8) | ch;
                if (checkByte && ((~((uint) 0x1f >> ((int) leftBytes - 2))) & ch) == 0x80) {
                        // The first continuation byte adds no significant
                        // bits either: overlong sequence.
                        return DecoderStatus.Overlong;
                }
                checkByte = false;
                leftBits = leftBits | ((ch & (uint) 0x3F) << (6*(int) (leftBytes - 1)));
                if (leftBits >= (uint) 0x110000) {
                        // this UTF-8 is too big ...
                        return DecoderStatus.InvalidChar;
                }
                // UTF-8 must not encode surrogate code points.  The whole
                // accumulated value is compared: masking only the low 16
                // bits would also reject valid four-byte characters such
                // as U+1D800 (F0 9D A0 80).
                if (leftBits >= (uint) 0xD800 && leftBits <= (uint) 0xDFFF) {
                        return DecoderStatus.SurrogateFound;
                }
        }

        // convert this character to UTF-16
        if (leftBits < (uint) 0x10000) {
                if (chars != null) {
                        if (charCount < 1)
                                return DecoderStatus.InsufficientSpace;
                        *chars = (char) leftBits;
                }
                charsProcessed++;
        } else {
                if (chars != null) {
                        if (charCount < 2)
                                return DecoderStatus.InsufficientSpace;
                        leftBits -= (uint) 0x10000;
                        *chars++ = (char) ((leftBits >> 10) + (uint) 0xD800);
                        *chars++ = (char) ((leftBits & (uint) 0x3FF) + (uint) 0xDC00);
                }
                charsProcessed += 2;
        }

        // we've read a complete char... reset decoder status and finish
        leftBytes = leftBits = procBytes = 0;
        return DecoderStatus.Ok;
}
207
// Decodes a run of UTF-8 bytes into UTF-16 chars, one character at a time,
// routing every invalid sequence through 'fallbackBuffer'.
//
// When 'chars' is null nothing is written and 'charsProcessed' only counts
// the chars that would be produced.  Decoding stops when the input is
// consumed or, in writing mode, when the output buffer is full.
//
// Returns InputRunOut when the input ends inside a sequence (the state is
// kept in leftBytes/leftBits/procBytes), Ok otherwise.
// Throws ArgumentNullException, ArgumentOutOfRangeException for bad
// arguments and ArgumentException when the output buffer is too small.
internal unsafe static DecoderStatus InternalGetChars (
        byte* bytes, int byteCount,
        char* chars, int charCount,
        DecoderFallbackBuffer fallbackBuffer,
        out int bytesProcessed, out int charsProcessed,
        ref uint leftBytes, ref uint leftBits, ref uint procBytes)
{
        // Argument checks.
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (byteCount < 0)
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
        if (charCount < 0)
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));

        charsProcessed = 0;
        bytesProcessed = 0;

        bool countOnly = (chars == null);

        while (bytesProcessed < byteCount && (countOnly || charsProcessed < charCount)) {
                int stepBytes, stepChars;

                // Decode the next character from the unread input.
                DecoderStatus status = InternalGetChar (
                        bytes + bytesProcessed, byteCount - bytesProcessed,
                        countOnly ? null : chars + charsProcessed,
                        countOnly ? 0 : charCount - charsProcessed,
                        out stepBytes, out stepChars,
                        ref leftBytes, ref leftBits, ref procBytes);

                charsProcessed += stepChars;
                bytesProcessed += stepBytes;

                if (status == DecoderStatus.InsufficientSpace)
                        throw new ArgumentException ("Insufficient Space", "chars");
                if (status == DecoderStatus.InputRunOut)
                        return DecoderStatus.InputRunOut;

                bool invalid =
                        status == DecoderStatus.Overlong ||
                        status == DecoderStatus.InvalidSequence ||
                        status == DecoderStatus.InvalidStart ||
                        status == DecoderStatus.InvalidChar ||
                        status == DecoderStatus.SurrogateFound;
                if (!invalid)
                        continue;

                // Unpack the offending bytes stored in 'procBytes' (first
                // byte in the most significant position) into an array.
                int pending = 0;
                uint probe = procBytes;
                while (probe != 0) {
                        probe >>= 8;
                        pending++;
                }
                byte [] bytesUnknown = new byte [pending];
                uint packed = procBytes;
                for (int slot = pending - 1; slot >= 0; slot--) {
                        bytesUnknown [slot] = (byte) (packed & 0xff);
                        packed >>= 8;
                }

                // The byte that broke the sequence was not consumed; drop
                // the pending count so it is decoded as a fresh start and
                // the loop cannot spin on it.
                if (status == DecoderStatus.InvalidSequence)
                        leftBytes = 0;

                // Let the fallback produce the replacement chars.
                fallbackBuffer.Fallback (bytesUnknown, bytesProcessed - pending);
                if (countOnly) {
                        charsProcessed += fallbackBuffer.Remaining;
                } else {
                        while (fallbackBuffer.Remaining > 0) {
                                if (charsProcessed >= charCount)
                                        throw new ArgumentException ("Insufficient Space", "chars/fallback");
                                chars [charsProcessed++] = fallbackBuffer.GetNextChar ();
                        }
                }
                fallbackBuffer.Reset ();

                // Recovered: start the next character from a clean state.
                procBytes = 0;
                leftBytes = 0;
                leftBits = 0;
        }

        return DecoderStatus.Ok;
}
295
// Get the characters that result from decoding a byte buffer.
//
// Array front end of the pointer based InternalGetChars: validates the
// ranges, pins both arrays and decodes 'byteCount' bytes starting at
// 'byteIndex' into 'chars' starting at 'charIndex'.  A null 'chars' asks
// for counting only.
//
// An index equal to the array length is accepted (it designates an empty
// range), so empty arrays and zero-length requests at the end of a buffer
// are valid.
//
// Throws ArgumentNullException when 'bytes' is null and
// ArgumentOutOfRangeException when an index or count is outside its array.
internal unsafe static DecoderStatus InternalGetChars (
        byte[] bytes, int byteIndex, int byteCount,
        char[] chars, int charIndex,
        DecoderFallbackBuffer fallbackBuffer,
        out int bytesProcessed, out int charsProcessed,
        ref uint leftBytes, ref uint leftBits, ref uint procBytes)
{
        // Validate the parameters.
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
        if (charIndex < 0 || charIndex > (chars != null ? chars.Length : 0))
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

        // Nothing to decode.  This also covers an empty 'bytes' array,
        // which pins to a null pointer the pointer overload would reject.
        if (byteCount == 0) {
                bytesProcessed = 0;
                charsProcessed = 0;
                return DecoderStatus.Ok;
        }

        // Stand-in target for an empty (but non-null) 'chars' array: such
        // an array pins to a null pointer, which the pointer overload
        // would mistake for a counting request.  It is never written to
        // because the capacity passed along with it is zero.
        char scratch = '\0';

        fixed (char* cptr = chars) {
                fixed (byte* bptr = bytes) {
                        char* dest = null;
                        int destCount = 0;
                        if (chars != null) {
                                dest = cptr != null ? cptr + charIndex : &scratch;
                                destCount = chars.Length - charIndex;
                        }
                        return InternalGetChars (
                                        bptr + byteIndex, byteCount,
                                        dest, destCount,
                                        fallbackBuffer,
                                        out bytesProcessed, out charsProcessed,
                                        ref leftBytes, ref leftBits, ref procBytes);
                }
        }
}
326
327         ///////////////////////////////////////////////////////////////////////
328         // INTERNAL ENCODING FUNCTION (CHAR/UTF16 -> UTF8)
329         ///////////////////////////////////////////////////////////////////////
330
// Result codes reported by the internal UTF-8 encoding helpers.
internal enum EncoderStatus {
        // The character (or surrogate pair) was encoded.
        Ok = 0,
        // The input ended before a character could be completed.
        InputRunOut = 1,
        // The output buffer cannot hold the encoded bytes.
        InsufficientSpace = 2,
        // A trailing surrogate was found without a leading one.
        InvalidChar = 3,
        // A leading surrogate was followed by a non-trailing char.
        InvalidSurrogate = 4,
}
338
// Encodes one UTF-16 character (a single char or a surrogate pair) as
// UTF-8.
//
//   chars, charCount   input buffer and number of chars available in it.
//   bytes, byteCount   output buffer and its capacity.  If 'bytes' is null
//                      this function only counts bytes and chars without
//                      writing anything.
//   charsProcessed     number of input chars consumed by this call.
//   bytesProcessed     number of bytes produced by this call.
//   leftChar           leading surrogate waiting for its trailing half,
//                      or zero when none is pending.
//
// NOTE: BOM (0xEF 0xBB 0xBF) is not yet supported.
//       See http://www.cl.cam.ac.uk/~mgk25/unicode.html
private unsafe static EncoderStatus InternalGetByte (
        char* chars, int charCount,
        byte* bytes, int byteCount,
        out int charsProcessed, out int bytesProcessed, ref uint leftChar)
{
        bool writing = (bytes != null);

        charsProcessed = 0;
        bytesProcessed = 0;

        // The loop body runs a second time only after a leading surrogate
        // has been stored in 'leftChar'.
        for (;;) {
                if (charCount < 1)
                        return EncoderStatus.InputRunOut;

                uint unit = *chars++;

                if (leftChar != 0) {
                        // A leading surrogate is pending: this char has
                        // to be its trailing half.
                        if (unit < (uint) 0xDC00 || unit > (uint) 0xDFFF) {
                                // The char is NOT counted, so encoding
                                // restarts on it and no more input is lost.
                                return EncoderStatus.InvalidSurrogate;
                        }

                        uint scalar = 0x10000 + unit - (uint) 0xDC00
                                    + ((leftChar - (uint) 0xD800) << 10);
                        if (writing) {
                                if (byteCount < 4)
                                        return EncoderStatus.InsufficientSpace;
                                bytes [0] = (byte) (0xF0 | (scalar >> 18));
                                bytes [1] = (byte) (0x80 | ((scalar >> 12) & 0x3F));
                                bytes [2] = (byte) (0x80 | ((scalar >> 6) & 0x3F));
                                bytes [3] = (byte) (0x80 | (scalar & 0x3F));
                        }
                        bytesProcessed += 4;

                        // The trailing half is counted only now that the
                        // pair is known to be good.
                        charsProcessed++;
                        leftChar = 0;
                        return EncoderStatus.Ok;
                }

                // No pending surrogate: the char is consumed right away.
                charsProcessed++;
                charCount--;

                if (unit >= (uint) 0xD800 && unit <= (uint) 0xDBFF) {
                        // Leading surrogate: remember it and fetch the
                        // next char.
                        leftChar = unit;
                        continue;
                }
                if (unit >= (uint) 0xDC00 && unit <= (uint) 0xDFFF) {
                        // Trailing surrogate without a leading one.
                        return EncoderStatus.InvalidChar;
                }

                int needed;
                if (unit < (uint) 0x80)
                        needed = 1;
                else if (unit < (uint) 0x0800)
                        needed = 2;
                else
                        needed = 3;

                if (writing) {
                        if (byteCount < needed)
                                return EncoderStatus.InsufficientSpace;
                        if (needed == 1) {
                                bytes [0] = (byte) unit;
                        } else if (needed == 2) {
                                bytes [0] = (byte) (0xC0 | ((unit >> 6) & 0x3F));
                                bytes [1] = (byte) (0x80 | (unit & 0x3F));
                        } else {
                                bytes [0] = (byte) (0xE0 | (unit >> 12));
                                bytes [1] = (byte) (0x80 | ((unit >> 6) & 0x3F));
                                bytes [2] = (byte) (0x80 | (unit & 0x3F));
                        }
                }
                bytesProcessed += needed;
                return EncoderStatus.Ok;
        }
}
436
// Encodes a run of UTF-16 chars as UTF-8, one character at a time.
//
// When 'bytes' is null nothing is written and 'bytesProcessed' only counts
// the bytes that would be produced.  Invalid chars and surrogates are
// handed to 'fallbackBuffer' and the replacement chars it yields are
// encoded in their place; when 'fallbackBuffer' is null the status
// (InvalidChar or InvalidSurrogate) is returned to the caller instead.
//
// Returns InputRunOut when the input ends after a leading surrogate (kept
// in 'leftChar'), Ok when all the input was encoded.
// Throws ArgumentNullException when 'chars' is null,
// ArgumentOutOfRangeException for negative counts and ArgumentException
// when the output buffer is too small or the replacement chars cannot be
// encoded themselves.
internal unsafe static EncoderStatus InternalGetBytes (
        char* chars, int charCount,
        byte* bytes, int byteCount,
        EncoderFallbackBuffer fallbackBuffer,
        out int charsProcessed, out int bytesProcessed,
        ref uint leftChar)
{
        EncoderStatus s;
        int t_charsProcessed, t_bytesProcessed;

        // Validate the parameters
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (charCount < 0)
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
        if (byteCount < 0)
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));

        // reset counters
        charsProcessed = 0;
        bytesProcessed = 0;

        // char processing loop
        while (charCount - charsProcessed > 0) {
                s = InternalGetByte (
                        chars + charsProcessed, charCount - charsProcessed,
                        bytes != null ? bytes + bytesProcessed : null,
                        bytes != null ? byteCount - bytesProcessed : 0,
                        out t_charsProcessed, out t_bytesProcessed, ref leftChar);

                charsProcessed += t_charsProcessed;
                bytesProcessed += t_bytesProcessed;

                switch (s) {
                case EncoderStatus.Ok:
                        break;  // everything OK :D

                case EncoderStatus.InsufficientSpace:
                        throw new ArgumentException ("Insufficient Space", "bytes");

                case EncoderStatus.InputRunOut:
                        return EncoderStatus.InputRunOut;

                case EncoderStatus.InvalidChar:
                case EncoderStatus.InvalidSurrogate:
                        // we've found an invalid char or surrogate
                        if (fallbackBuffer == null) {
                                // without a fallbackBuffer abort
                                // returning 'InvalidChar' or
                                // 'InvalidSurrogate'
                                return s;
                        }
                        if (t_charsProcessed >= 1) {
                                // the offending char was consumed from
                                // this buffer: it is the last one counted
                                fallbackBuffer.Fallback (
                                        chars [charsProcessed - 1],
                                        charsProcessed - 1);
                        } else {
                                // the offending leading surrogate came
                                // from a previous buffer and only lives
                                // in 'leftChar'
                                fallbackBuffer.Fallback (
                                        (char) leftChar,
                                        -1);
                        }
                        // if we've arrived here we are working in
                        // replacement mode: collect the replacement chars
                        char[] fallback_chars = new char [fallbackBuffer.Remaining];
                        for (int i = 0; i < fallback_chars.Length; i++)
                                fallback_chars [i] = fallbackBuffer.GetNextChar ();
                        fallbackBuffer.Reset ();
                        // and encode them into UTF8 bytes.  An empty
                        // replacement produces no output; it must be
                        // skipped because an empty array pins to a null
                        // pointer, which the recursive call rejects.
                        if (fallback_chars.Length > 0) {
                                fixed (char *fb_chars = fallback_chars) {
                                        leftChar = 0;
                                        EncoderStatus fb_status = InternalGetBytes (
                                                fb_chars, fallback_chars.Length,
                                                bytes != null ? bytes + bytesProcessed : null,
                                                bytes != null ? byteCount - bytesProcessed : 0,
                                                null, out t_charsProcessed, out t_bytesProcessed,
                                                ref leftChar);
                                        switch (fb_status) {
                                        case EncoderStatus.Ok:
                                                // everything OK :D
                                                bytesProcessed += t_bytesProcessed;
                                                break;
                                        case EncoderStatus.InsufficientSpace:
                                                throw new ArgumentException ("Insufficient Space", "fallback buffer bytes");
                                        case EncoderStatus.InputRunOut:
                                        case EncoderStatus.InvalidChar:
                                        case EncoderStatus.InvalidSurrogate:
                                                throw new ArgumentException ("Fallback chars are pure evil.", "fallback buffer bytes");
                                        }
                                }
                        }
                        // partial reset of encoder state
                        leftChar = 0;
                        break;
                }
        }
        return EncoderStatus.Ok;
}
545
// Array front end of the pointer based InternalGetBytes: validates the
// ranges, pins both arrays and encodes 'charCount' chars starting at
// 'charIndex' into 'bytes' starting at 'byteIndex'.  A null 'bytes' asks
// for counting only.
//
// An index equal to the array length is accepted (it designates an empty
// range), so empty arrays and zero-length requests at the end of a buffer
// are valid.
//
// Throws ArgumentNullException when 'chars' is null and
// ArgumentOutOfRangeException when an index or count is outside its array.
internal unsafe static EncoderStatus InternalGetBytes (
        char[] chars, int charIndex, int charCount,
        byte[] bytes, int byteIndex,
        EncoderFallbackBuffer fallbackBuffer,
        out int charsProcessed, out int bytesProcessed,
        ref uint leftChar)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
        if (charCount < 0 || charCount > (chars.Length - charIndex))
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
        if (byteIndex < 0 || byteIndex > (bytes != null ? bytes.Length : 0))
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

        // Nothing to encode.  This also covers an empty 'chars' array,
        // which pins to a null pointer the pointer overload would reject.
        if (charCount == 0) {
                charsProcessed = 0;
                bytesProcessed = 0;
                return EncoderStatus.Ok;
        }

        // Stand-in target for an empty (but non-null) 'bytes' array: such
        // an array pins to a null pointer, which the pointer overload
        // would mistake for a counting request.  It is never written to
        // because the capacity passed along with it is zero.
        byte scratch = 0;

        fixed (char *cptr = chars) {
                fixed (byte *bptr = bytes) {
                        byte* dest = null;
                        int destCount = 0;
                        if (bytes != null) {
                                dest = bptr != null ? bptr + byteIndex : &scratch;
                                destCount = bytes.Length - byteIndex;
                        }
                        return InternalGetBytes (
                                cptr + charIndex, charCount,
                                dest, destCount,
                                fallbackBuffer,
                                out charsProcessed, out bytesProcessed,
                                ref leftChar);
                }
        }
}
576
577         #region GetByteCount()
578
579         // Get the number of bytes needed to encode a character buffer.
580         public override int GetByteCount (char[] chars, int index, int count)
581         {
582                 uint leftChar = 0;
583                 int charsProcessed, bytesProcessed;
584                 InternalGetBytes (chars, index, count,
585                                   null, 0,
586                                   EncoderFallback.CreateFallbackBuffer (),
587                                   out charsProcessed, out bytesProcessed,
588                                   ref leftChar);
589                 return bytesProcessed;
590         }
591
592
593         [CLSCompliant (false)]
594         [ComVisible (false)]
595         public unsafe override int GetByteCount (char* chars, int count)
596         {
597                 int charsProcessed, bytesProcessed;
598                 uint leftChar = 0;
599                 if (chars == null)
600                         throw new ArgumentNullException ("chars");
601                 if (count < 0)
602                         throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
603                 InternalGetBytes (chars, count,
604                                   null, 0,
605                                   EncoderFallback.CreateFallbackBuffer (),
606                                   out charsProcessed, out bytesProcessed,
607                                   ref leftChar);
608                 return bytesProcessed;
609         }
610
611         #endregion
612
613         #region GetBytes()
614
615         // Get the bytes that result from encoding a character buffer.
616         public override int GetBytes (char[] chars, int charIndex, int charCount,
617                                       byte[] bytes, int byteIndex)
618         {
619                 int charsProcessed, bytesProcessed;
620                 uint leftChar = 0;
621                 if (bytes == null) {
622                         throw new ArgumentNullException ("bytes");
623                 }
624
625                 InternalGetBytes (chars, charIndex, charCount,
626                                   bytes, byteIndex,
627                                   EncoderFallback.CreateFallbackBuffer (),
628                                   out charsProcessed, out bytesProcessed,
629                                   ref leftChar);
630                 return bytesProcessed;
631         }
632
633         // Convenience wrappers for "GetBytes".
634         public unsafe override int GetBytes (String s, int charIndex, int charCount,
635                                       byte[] bytes, int byteIndex)
636         {
637                 int charsProcessed, bytesProcessed;
638                 uint leftChar = 0;
639                 if (s == null)
640                         throw new ArgumentNullException ("s");
641                 if (bytes == null)
642                         throw new ArgumentNullException ("bytes");
643                 if (charIndex < 0 || charIndex >= s.Length)
644                         throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
645                 if (charCount < 0 || charCount > (s.Length - charIndex))
646                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
647                 if (byteIndex < 0 || byteIndex > (bytes.Length > 0 ? bytes.Length - 1 : 0))
648                         throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
649                 unsafe {
650                         fixed (char *cptr = s) {
651                                 fixed (byte *bptr = bytes) {
652                                         InternalGetBytes (
653                                                 cptr + charIndex, charCount,
654                                                 bptr + byteIndex, bytes.Length - byteIndex,
655                                                 EncoderFallback.CreateFallbackBuffer (),
656                                                 out charsProcessed, out bytesProcessed,
657                                                 ref leftChar);
658                                 }
659                         }
660                 }
661                 return bytesProcessed;
662         }
663
664         [CLSCompliant (false)]
665         [ComVisible (false)]
666         public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
667         {
668                 int charsProcessed, bytesProcessed;
669                 uint leftChar = 0;
670                 if (chars == null)
671                         throw new ArgumentNullException ("chars");
672                 if (charCount < 0)
673                         throw new IndexOutOfRangeException ("charCount");
674                 if (bytes == null)
675                         throw new ArgumentNullException ("bytes");
676                 if (byteCount < 0)
677                         throw new IndexOutOfRangeException ("charCount");
678                 InternalGetBytes (
679                                 chars, charCount, bytes, byteCount,
680                                 EncoderFallback.CreateFallbackBuffer (),
681                                 out charsProcessed, out bytesProcessed,
682                                 ref leftChar);
683                 return bytesProcessed;
684         }
685
686         #endregion
687
688         #region GetCharCount()
689
690         // Get the number of characters needed to decode a byte buffer.
691         public override int GetCharCount (byte[] bytes, int index, int count)
692         {
693                 int bytesProcessed, charsProcessed;
694                 uint leftBytes = 0, leftBits = 0, procBytes = 0;
695                 InternalGetChars (
696                         bytes, index, count,
697                         null, 0,
698                         DecoderFallback.CreateFallbackBuffer(),
699                         out bytesProcessed, out charsProcessed,
700                         ref leftBytes, ref leftBits, ref procBytes);
701                 return charsProcessed;
702         }
703
704         [CLSCompliant (false)]
705         [ComVisible (false)]
706         public unsafe override int GetCharCount (byte* bytes, int count)
707         {
708                 int bytesProcessed, charsProcessed;
709                 uint leftBytes = 0, leftBits = 0, procBytes = 0;
710                 InternalGetChars (
711                         bytes, count,
712                         null, 0,
713                         DecoderFallback.CreateFallbackBuffer(),
714                         out bytesProcessed, out charsProcessed,
715                         ref leftBytes, ref leftBits, ref procBytes);
716                 return charsProcessed;
717         }
718
719         #endregion
720
721         // Get the characters that result from decoding a byte buffer.
722         public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
723                                       char[] chars, int charIndex)
724         {
725                 int bytesProcessed, charsProcessed;
726                 uint leftBytes = 0, leftBits = 0, procBytes = 0;
727                 InternalGetChars (
728                         bytes, byteIndex, byteCount,
729                         chars, charIndex,
730                         DecoderFallback.CreateFallbackBuffer(),
731                         out bytesProcessed, out charsProcessed,
732                         ref leftBytes, ref leftBits, ref procBytes);
733                 return charsProcessed;
734         }
735
736         [CLSCompliant (false)]
737         [ComVisible (false)]
738         public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
739         {
740                 int bytesProcessed, charsProcessed;
741                 uint leftBytes = 0, leftBits = 0, procBytes = 0;
742                 InternalGetChars (
743                         bytes, byteCount,
744                         chars, charCount,
745                         DecoderFallback.CreateFallbackBuffer(),
746                         out bytesProcessed, out charsProcessed,
747                         ref leftBytes, ref leftBits, ref procBytes);
748                 return charsProcessed;
749         }
750
751         // Get the maximum number of bytes needed to encode a
752         // specified number of characters.
753         public override int GetMaxByteCount (int charCount)
754         {
755                 if (charCount < 0)
756                         throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
757                 return charCount * 4;
758         }
759
760         // Get the maximum number of characters needed to decode a
761         // specified number of bytes.
762         public override int GetMaxCharCount (int byteCount)
763         {
764                 if (byteCount < 0)
765                         throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
766                 return byteCount;
767         }
768
769         // Get a UTF8-specific decoder that is attached to this instance.
770         public override Decoder GetDecoder ()
771         {
772                 return new UTF8Decoder (DecoderFallback);
773         }
774
775         // Get a UTF8-specific encoder that is attached to this instance.
776         public override Encoder GetEncoder ()
777         {
778                 return new UTF8Encoder (EncoderFallback, emitIdentifier);
779         }
780
781         // Get the UTF8 preamble.
782         // XXX: why does this method return a preamble or void array depending
783         //      on 'emitIdentifier' attribute?
784         public override byte[] GetPreamble ()
785         {
786                 if (emitIdentifier)
787                         return new byte [] { 0xEF, 0xBB, 0xBF };
788
789                 return EmptyArray<byte>.Value;
790         }
791
792         // Determine if this object is equal to another.
793         public override bool Equals (Object value)
794         {
795                 UTF8Encoding enc = (value as UTF8Encoding);
796                 if (enc != null) {
797                         return (codePage == enc.codePage &&
798                                 emitIdentifier == enc.emitIdentifier &&
799                                 DecoderFallback.Equals (enc.DecoderFallback) &&
800                                 EncoderFallback.Equals (enc.EncoderFallback));
801                 } else {
802                         return false;
803                 }
804         }
805
806         // Get the hash code for this object.
807         public override int GetHashCode ()
808         {
809                 return base.GetHashCode ();
810         }
811
812         public override int GetByteCount (string chars)
813         {
814                 // hmm, does this override make any sense?
815                 return base.GetByteCount (chars);
816         }
817
818         [ComVisible (false)]
819         public override string GetString (byte [] bytes, int index, int count)
820         {
821                 // hmm, does this override make any sense?
822                 return base.GetString (bytes, index, count);
823         }
824
825         // UTF-8 decoder implementation.
826         [Serializable]
827         private class UTF8Decoder : Decoder
828         {
829                 // internal encoder state
830                 private uint leftBytes;
831                 private uint leftBits;
832                 private uint procBytes;
833
834                 // Constructor.
835                 public UTF8Decoder (DecoderFallback fallback)
836                 {
837                         Fallback = fallback;
838                         leftBytes = 0;
839                         leftBits = 0;
840                         procBytes = 0;
841                 }
842
843                 // Override inherited methods.
844                 public override int GetCharCount (byte[] bytes, int index, int count)
845                 {
846                         int bytesProcessed, charsProcessed;
847                         InternalGetChars (
848                                 bytes, index, count,
849                                 null, 0,
850                                 this.FallbackBuffer,
851                                 out bytesProcessed, out charsProcessed,
852                                 ref leftBytes, ref leftBits, ref procBytes);
853                         return charsProcessed;
854                 }
855
856                 [ComVisibleAttribute(false)]
857                 public override int GetCharCount (byte[] bytes, int index, int count, bool flush)
858                 {
859                         int r = GetCharCount (bytes, index, count);
860                         if (flush)
861                                 leftBytes = leftBits = procBytes = 0;
862                         return r;
863                 }
864
865                 [ComVisibleAttribute(false)] 
866                 public unsafe override int GetCharCount (byte* bytes, int count, bool flush)
867                 {
868                         int bytesProcessed, charsProcessed;
869                         InternalGetChars (
870                                 bytes, count,
871                                 null, 0,
872                                 this.FallbackBuffer,
873                                 out bytesProcessed, out charsProcessed,
874                                 ref leftBytes, ref leftBits, ref procBytes);
875                         if (flush)
876                                 leftBytes = leftBits = procBytes = 0;
877                         return charsProcessed;
878                 }
879
880                 [ComVisibleAttribute(false)]
881                 public unsafe override int GetChars (byte* bytes, int byteCount,
882                                                 char* chars, int charCount, bool flush)
883                 {
884                         int bytesProcessed, charsProcessed;
885                         InternalGetChars (
886                                 bytes, byteCount,
887                                 chars, charCount,
888                                 this.FallbackBuffer,
889                                 out bytesProcessed, out charsProcessed,
890                                 ref leftBytes, ref leftBits, ref procBytes);
891                         if (flush)
892                                 leftBytes = leftBits = procBytes = 0;
893                         return charsProcessed;
894                 }
895
896                 public override int GetChars (byte[] bytes, int byteIndex,
897                                                  int byteCount, char[] chars, int charIndex)
898                 {
899                         int bytesProcessed, charsProcessed;
900                         InternalGetChars (
901                                 bytes, byteIndex, byteCount,
902                                 chars, charIndex,
903                                 this.FallbackBuffer,
904                                 out bytesProcessed, out charsProcessed,
905                                 ref leftBytes, ref leftBits, ref procBytes);
906                         return charsProcessed;
907                 }
908
909                 public override int GetChars (byte[] bytes, int byteIndex,
910                                                  int byteCount, char[] chars, int charIndex, bool flush)
911                 {
912                         int r = GetChars (bytes, byteIndex, byteCount, chars, charIndex);
913                         if (flush)
914                                 leftBytes = leftBits = procBytes = 0;
915                         return r;
916                 }
917
918                 public override void Reset ()
919                 {
920                         base.Reset();
921                         leftBytes = 0;
922                         leftBits = 0;
923                         procBytes = 0;
924                 }
925
926                 public unsafe override void Convert (
927                         byte* bytes, int byteCount,
928                         char* chars, int charCount, bool flush,
929                         out int bytesUsed, out int charsUsed, out bool completed)
930                 {
931                         if (chars == null)
932                                 throw new ArgumentNullException ("chars");
933                         if (charCount < 0)
934                                 throw new IndexOutOfRangeException ("charCount");
935                         if (bytes == null)
936                                 throw new ArgumentNullException ("bytes");
937                         if (byteCount < 0)
938                                 throw new IndexOutOfRangeException ("charCount");
939                         UTF8Encoding.InternalGetChars (
940                                         bytes, byteCount,
941                                         chars, charCount,
942                                         this.FallbackBuffer,
943                                         out bytesUsed, out charsUsed,
944                                         ref leftBytes, ref leftBits, ref procBytes);
945                         // only completed if all bytes have been processed and
946                         // succesful converted to chars!!
947                         completed = (byteCount == bytesUsed);
948                         // flush state
949                         if (flush)
950                                 leftBytes = leftBits = procBytes = 0;
951                 }
952         } // class UTF8Decoder
953
954         // UTF-8 encoder implementation.
955         [Serializable]
956         private class UTF8Encoder : Encoder
957         {
958                 private bool emitIdentifier;
959
960                 // internal encoder state
961                 private uint leftChar;
962                 private bool emittedIdentifier;
963
964                 // Constructor.
965                 public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
966                 {
967                         this.Fallback = fallback;
968                         this.leftChar = 0;
969                         this.emitIdentifier = emitIdentifier;
970                         this.emittedIdentifier = false;
971                 }
972
973                 // Override inherited methods.
974                 [ComVisibleAttribute(false)]
975                 public unsafe override int GetByteCount (char* chars, int count, bool flush)
976                 {
977                         int charsProcessed, bytesProcessed, preambleSize = 0;
978                         if (emitIdentifier && !emittedIdentifier) {
979                                 preambleSize = 3;
980                                 emittedIdentifier = true;
981                         }
982                         InternalGetBytes (chars, count,
983                                           null, 0,
984                                           this.FallbackBuffer,
985                                           out charsProcessed, out bytesProcessed,
986                                           ref leftChar);
987                         if (flush)
988                                 leftChar = 0;
989                         return bytesProcessed + preambleSize;
990                 }
991
992                 public override int GetByteCount (char[] chars, int index,
993                                          int count, bool flush)
994                 {
995                         int charsProcessed, bytesProcessed, preambleSize = 0;
996                         if (emitIdentifier && !emittedIdentifier) {
997                                 preambleSize = 3;
998                                 emittedIdentifier = true;
999                         }
1000                         InternalGetBytes (chars, index, count,
1001                                           null, 0,
1002                                           this.FallbackBuffer,
1003                                           out charsProcessed, out bytesProcessed,
1004                                           ref leftChar);
1005                         if (flush)
1006                                 leftChar = 0;
1007                         return bytesProcessed + preambleSize;
1008                 }
1009
1010                 [ComVisibleAttribute(false)]
1011                 public unsafe override int GetBytes (char* chars, int charCount,
1012                         byte* bytes, int byteCount, bool flush)
1013                 {
1014                         int charsProcessed, bytesProcessed, preambleSize = 0;
1015                         if (emitIdentifier && !emittedIdentifier) {
1016                                 if (byteCount < 3)
1017                                         throw new ArgumentException ("Insufficient Space", "UTF8 preamble");
1018                                 *bytes++ = 0xEF;
1019                                 *bytes++ = 0xBB;
1020                                 *bytes++ = 0xBF;
1021                                 preambleSize = 3;
1022                                 emittedIdentifier = true;
1023                                 byteCount -= 3;
1024                         }
1025                         InternalGetBytes (chars, charCount,
1026                                           bytes, byteCount,
1027                                           this.FallbackBuffer,
1028                                           out charsProcessed, out bytesProcessed,
1029                                           ref leftChar);
1030                         if (flush)
1031                                 leftChar = 0;
1032                         return bytesProcessed + preambleSize;
1033                 }
1034
1035                 public override int GetBytes (char[] chars, int charIndex,
1036                                          int charCount, byte[] bytes, int byteIndex, bool flush)
1037                 {
1038                         int charsProcessed, bytesProcessed, preambleSize = 0;
1039                         if (emitIdentifier && !emittedIdentifier) {
1040                                 if (bytes.Length - byteIndex < 3)
1041                                         throw new ArgumentException ("Insufficient Space", "UTF8 preamble");
1042                                 bytes[byteIndex++] = 0xEF;
1043                                 bytes[byteIndex++] = 0xBB;
1044                                 bytes[byteIndex++] = 0xBF;
1045                                 preambleSize = 3;
1046                                 emittedIdentifier = true;
1047                         }
1048                         InternalGetBytes (chars, charIndex, charCount,
1049                                           bytes, byteIndex,
1050                                           this.FallbackBuffer,
1051                                           out charsProcessed, out bytesProcessed,
1052                                           ref leftChar);
1053                         if (flush)
1054                                 leftChar = 0;
1055                         return bytesProcessed + preambleSize;
1056                 }
1057
1058                 public override void Reset ()
1059                 {
1060                         base.Reset();
1061                         this.leftChar = 0;
1062                         this.emittedIdentifier = false;
1063                 }
1064
1065                 public unsafe override void Convert (
1066                         char* chars, int charCount,
1067                         byte* bytes, int byteCount, bool flush,
1068                         out int charsUsed, out int bytesUsed, out bool completed)
1069                 {
1070                         int preambleSize = 0;
1071                         if (bytes == null)
1072                                 throw new ArgumentNullException ("bytes");
1073                         if (byteCount < 0)
1074                                 throw new IndexOutOfRangeException ("charCount");
1075                         if (chars == null)
1076                                 throw new ArgumentNullException ("chars");
1077                         if (charCount < 0)
1078                                 throw new IndexOutOfRangeException ("charCount");
1079                         if (emitIdentifier && !emittedIdentifier) {
1080                                 if (byteCount < 3)
1081                                         throw new ArgumentException ("Insufficient Space", "UTF8 preamble");
1082                                 *bytes++ = 0xEF;
1083                                 *bytes++ = 0xBB;
1084                                 *bytes++ = 0xBF;
1085                                 preambleSize = 3;
1086                                 emittedIdentifier = true;
1087                                 byteCount -= 3;
1088                         }
1089                         InternalGetBytes (
1090                                         chars, charCount,
1091                                         bytes, byteCount,
1092                                         this.FallbackBuffer,
1093                                         out charsUsed, out bytesUsed,
1094                                         ref leftChar);
1095                         // only completed if all chars have been processed and
1096                         // succesful converted to chars!!
1097                         completed = (charCount == charsUsed);
1098                         bytesUsed += preambleSize;
1099                         if (flush)
1100                                 leftChar = 0;
1101                 }
1102         } // class UTF8Encoder
1103
1104 }; // class UTF8Encoding
1105
1106 }; // namespace System.Text