Fix the 1.1 build.
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30 using System.Runtime.InteropServices;
31
32 [Serializable]
33 [MonoTODO ("Serialization format not compatible with .NET")]
34 #if NET_2_0
35 [MonoTODO ("EncoderFallback is not handled")]
36 [ComVisible (true)]
37 #endif
38 public class UTF8Encoding : Encoding
39 {
40         // Magic number used by Windows for UTF-8.
41         internal const int UTF8_CODE_PAGE = 65001;
42
43         // Internal state.
44         private bool emitIdentifier;
45 #if !NET_2_0
46         private bool throwOnInvalid;
47 #endif
48
// Constructors.
//
// The parameterless and single-argument forms simply chain to the
// two-argument constructor with the missing options set to false.
public UTF8Encoding ()
        : this (false, false)
{
}

public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
        : this (encoderShouldEmitUTF8Identifier, false)
{
}

// "encoderShouldEmitUTF8Identifier" is remembered in "emitIdentifier".
// "throwOnInvalidBytes" selects how invalid input is treated when decoding:
// under NET_2_0 it chooses between an exception fallback and a U+FFFD
// replacement fallback, otherwise it is stored in "throwOnInvalid".
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
        : base (UTF8_CODE_PAGE)
{
        emitIdentifier = encoderShouldEmitUTF8Identifier;

#if NET_2_0
        if (!throwOnInvalidBytes)
                SetFallbackInternal (null, new DecoderReplacementFallback ("\uFFFD"));
        else
                SetFallbackInternal (null, new DecoderExceptionFallback ());
#else
        throwOnInvalid = throwOnInvalidBytes;
#endif

        // Names reported for this encoding.
        header_name = "utf-8";
        body_name = "utf-8";
        web_name = "utf-8";
        encoding_name = "Unicode (UTF-8)";

        // Usage flags.
        is_browser_save = true;
        is_browser_display = true;
        is_mail_news_display = true;
        is_mail_news_save = true;

        windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
75
76         #region GetByteCount()
77
// Array-based version of "GetByteCount" that can carry state across
// several calls.
//
// After validating "chars", "index" and "count" it counts the UTF-8
// bytes needed for the requested range.  "leftOver" holds a surrogate
// start that is still waiting for its partner; when "flush" is true a
// pending start is counted as three bytes and cleared.
private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (index < 0 || index > chars.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
        if (count < 0 || count > (chars.Length - index))
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        if (index != chars.Length) {
                // Common case: hand the pinned range to the pointer overload.
                unsafe {
                        fixed (char* start = chars) {
                                return InternalGetByteCount (start + index, count, ref leftOver, flush);
                        }
                }
        }

        // The range starts at the very end of the array, so there is
        // nothing to scan; only a pending surrogate start can contribute.
        if (!flush || leftOver == '\0')
                return 0;

        leftOver = '\0';
        return 3;
}
108
// Pointer-based byte counter.
//
// Walks "count" characters starting at "chars" and returns the number
// of UTF-8 bytes needed for them.  "leftOver" is a surrogate start
// carried over from a previous call (or '\0'); on return it holds a
// surrogate start that was the last thing seen, unless "flush" is true,
// in which case that start is counted as three bytes and cleared.
private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
{
        int total = 0;
        char* stop = chars + count;

        while (chars < stop) {
                char current = *chars;

                if (leftOver != '\0') {
                        // A surrogate start is pending.
                        if (current >= '\uDC00' && current <= '\uDFFF') {
                                // It is completed by this character.
                                total += 4;
                                chars++;
                        } else {
                                // Not a valid pair: count the pending start on
                                // its own and look at this character again.
                                total += 3;
                        }
                        leftOver = '\0';
                        continue;
                }

                if (current < '\x80') {
                        total += 1;
                } else if (current < '\x800') {
                        total += 2;
                } else if (current >= '\uD800' && current <= '\uDBFF') {
                        // Surrogate start: use the partner right away when it
                        // is available, otherwise remember the start.
                        if (chars + 1 < stop && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
                                total += 4;
                                chars += 2;
                        } else {
                                leftOver = current;
                                chars++;
                        }
                        continue;
                } else {
                        // Ordinary three-byte character, or a surrogate tail
                        // without a leading surrogate (counted as-is).
                        total += 3;
                }
                chars++;
        }

        // Flush the left-over surrogate pair start.
        if (flush && leftOver != '\0') {
                total += 3;
                leftOver = '\0';
        }
        return total;
}
167
// Get the number of bytes needed to encode a character buffer.
// Every call starts without a pending surrogate and flushes at the end.
public override int GetByteCount (char[] chars, int index, int count)
{
        char pending = '\0';
        int needed = InternalGetByteCount (chars, index, count, ref pending, true);
        return needed;
}
174
#if !NET_2_0
// String overload of "GetByteCount": counts the bytes needed for the
// whole string, flushing any trailing surrogate start.
public override int GetByteCount (String chars)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");

        char pending = '\0';
        unsafe {
                fixed (char* text = chars) {
                        return InternalGetByteCount (text, chars.Length, ref pending, true);
                }
        }
}
#endif
192
#if NET_2_0
// Get the number of bytes needed to encode "count" characters starting
// at "chars".
//
// Throws ArgumentNullException when "chars" is null and
// ArgumentOutOfRangeException when "count" is negative, matching the
// validation done by the array-based overloads.  A negative count used
// to fall through and silently report zero bytes.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetByteCount (char* chars, int count)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (count < 0)
                throw new ArgumentOutOfRangeException ("count");
        if (count == 0)
                return 0;
        char dummy = '\0';
        return InternalGetByteCount (chars, count, ref dummy, true);
}
#endif
206
207         #endregion
208
209         #region GetBytes()
210
// Internal version of "GetBytes" which can handle a rolling
// state between multiple calls to this method.
//
// Validates the arguments, then encodes "charCount" characters of
// "chars" starting at "charIndex" into "bytes" starting at "byteIndex"
// and returns the number of bytes written.  "leftOver" carries a
// surrogate start between calls; "flush" asks for a pending start to be
// dealt with at the end of this call.
//
// Throws ArgumentNullException for a null array,
// ArgumentOutOfRangeException for an index or count outside its array,
// and ArgumentException when "bytes" has too little room.
private static int InternalGetBytes (char[] chars, int charIndex,
                                     int charCount, byte[] bytes,
                                     int byteIndex, ref char leftOver,
                                     bool flush)
{
        // Validate the parameters.
        if (chars == null) {
                throw new ArgumentNullException ("chars");
        }
        if (bytes == null) {
                throw new ArgumentNullException ("bytes");
        }
        if (charIndex < 0 || charIndex > chars.Length) {
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
        }
        if (charCount < 0 || charCount > (chars.Length - charIndex)) {
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
        }
        if (byteIndex < 0 || byteIndex > bytes.Length) {
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        }

        // The range starts at the very end of the array: there is nothing
        // to pin, so a pending surrogate start is handled right here.
        if (charIndex == chars.Length) {
                if (flush && leftOver != '\0') {
#if NET_2_0
                        // FIXME: use EncoderFallback.
                        //
                        // By default it is empty, so I do nothing for now.
                        leftOver = '\0';
#else
                        // Flush the left-over surrogate pair start as a
                        // three-byte sequence, exactly as the pointer-based
                        // overload does.  Three free bytes are enough, so
                        // only fail when fewer than three remain.
                        if (bytes.Length - byteIndex < 3)
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                        int ch = leftOver;
                        bytes [byteIndex++] = (byte) (0xE0 | (ch >> 12));
                        bytes [byteIndex++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                        bytes [byteIndex++] = (byte) (0x80 | (ch & 0x3F));
                        leftOver = '\0';
                        return 3;
#endif
                }
                return 0;
        }

        unsafe {
                fixed (char* cptr = chars) {
                        // No room at all in "bytes": pass a null, zero-length
                        // output range instead of pinning it.
                        if (bytes.Length == byteIndex)
                                return InternalGetBytes (
                                        cptr + charIndex, charCount,
                                        null, 0, ref leftOver, flush);
                        fixed (byte *bptr = bytes) {
                                return InternalGetBytes (
                                        cptr + charIndex, charCount,
                                        bptr + byteIndex, bytes.Length - byteIndex,
                                        ref leftOver, flush);
                        }
                }
        }
}
271
// Pointer-based encoder.
//
// Encodes "count" characters starting at "chars" into the "bcount"
// bytes available at "bytes" and returns the number of bytes written.
// "leftOver" is a surrogate start carried over from a previous call (or
// '\0'); a start seen last is stored back into it, unless "flush" is
// true, in which case it is written out as a three-byte sequence.
// Throws ArgumentException when the output range is too small.
private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, ref char leftOver, bool flush)
{
        byte* first_byte = bytes;
        byte* limit = bytes + bcount;
        char* stop = chars + count;

        while (chars < stop) {
                int code = *chars;

                if (leftOver != '\0') {
                        // A surrogate start is pending.
                        if (code >= 0xDC00 && code <= 0xDFFF) {
                                // We have a correct surrogate pair: combine both
                                // halves and write four bytes.
                                int scalar = 0x10000 + code - 0xDC00 + (((int) leftOver - 0xD800) << 10);
                                if (bytes + 3 >= limit)
                                        goto out_of_room;
                                bytes [0] = (byte) (0xF0 | (scalar >> 18));
                                bytes [1] = (byte) (0x80 | ((scalar >> 12) & 0x3F));
                                bytes [2] = (byte) (0x80 | ((scalar >> 6) & 0x3F));
                                bytes [3] = (byte) (0x80 | (scalar & 0x3F));
                                bytes += 4;
                                chars++;
                        } else {
                                // The start is followed by something else.  Write
                                // the start on its own and look at the current
                                // character again on the next iteration.
                                int lone = leftOver;
                                if (bytes + 2 >= limit)
                                        goto out_of_room;
                                bytes [0] = (byte) (0xE0 | (lone >> 12));
                                bytes [1] = (byte) (0x80 | ((lone >> 6) & 0x3F));
                                bytes [2] = (byte) (0x80 | (lone & 0x3F));
                                bytes += 3;
                        }
                        leftOver = '\0';
                        continue;
                }

                if (code < 0x80) {
                        if (bytes >= limit)
                                goto out_of_room;
                        *bytes++ = (byte) code;
                } else if (code < 0x800) {
                        if (bytes + 1 >= limit)
                                goto out_of_room;
                        bytes [0] = (byte) (0xC0 | (code >> 6));
                        bytes [1] = (byte) (0x80 | (code & 0x3F));
                        bytes += 2;
                } else if (code >= 0xD800 && code <= 0xDBFF) {
                        // Surrogate start: remember it and decide what to do
                        // once the following character is known.
                        leftOver = *chars;
                        chars++;
                        continue;
                } else {
                        // Ordinary three-byte character, or a surrogate tail
                        // without a leading surrogate (written as-is).
                        if (bytes + 2 >= limit)
                                goto out_of_room;
                        bytes [0] = (byte) (0xE0 | (code >> 12));
                        bytes [1] = (byte) (0x80 | ((code >> 6) & 0x3F));
                        bytes [2] = (byte) (0x80 | (code & 0x3F));
                        bytes += 3;
                }
                chars++;
        }

        // Flush the left-over surrogate pair start.
        if (flush && leftOver != '\0') {
                int tail = leftOver;
                if (bytes + 2 >= limit)
                        goto out_of_room;
                bytes [0] = (byte) (0xE0 | (tail >> 12));
                bytes [1] = (byte) (0x80 | ((tail >> 6) & 0x3F));
                bytes [2] = (byte) (0x80 | (tail & 0x3F));
                bytes += 3;
                leftOver = '\0';
        }
        return (int) (bytes - first_byte);

out_of_room:
        throw new ArgumentException ("Insufficient Space", "bytes");
}
364
// Get the bytes that result from encoding a character buffer.
// Every call starts without a pending surrogate and flushes at the end.
public override int GetBytes (char[] chars, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        char pending = '\0';
        int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pending, true);
        return written;
}
372
// String overload of "GetBytes".
//
// Encodes "charCount" characters of "s" starting at "charIndex" into
// "bytes" starting at "byteIndex" and returns the number of bytes
// written.  A trailing surrogate start is always flushed.
public override int GetBytes (String s, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        if (s == null)
                throw new ArgumentNullException ("s");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (charIndex < 0 || charIndex > s.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
        if (charCount < 0 || charCount > (s.Length - charIndex))
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

        // Starting at the very end of the string: nothing to encode.
        if (charIndex == s.Length)
                return 0;

        char pending = '\0';
        unsafe {
                fixed (char* text = s) {
                        char* first = text + charIndex;
                        if (bytes.Length != byteIndex) {
                                fixed (byte* output = bytes) {
                                        return InternalGetBytes (
                                                first, charCount,
                                                output + byteIndex, bytes.Length - byteIndex,
                                                ref pending, true);
                                }
                        }
                        // No room at all in "bytes": pass a null, zero-length
                        // output range instead of pinning it.
                        return InternalGetBytes (first, charCount, null, 0, ref pending, true);
                }
        }
}
413
#if NET_2_0
// Encode "charCount" characters starting at "chars" into the
// "byteCount" bytes available at "bytes"; returns the number of bytes
// written.
//
// Throws ArgumentNullException when either pointer is null and
// ArgumentOutOfRangeException when either count is negative.  The
// negative-count checks previously threw IndexOutOfRangeException, and
// the "byteCount" check reported "charCount".
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (charCount < 0)
                throw new ArgumentOutOfRangeException ("charCount");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (byteCount < 0)
                throw new ArgumentOutOfRangeException ("byteCount");

        if (charCount == 0)
                return 0;

        char dummy = '\0';
        // With no output room, pass a null, zero-length range so that any
        // attempt to write is reported as insufficient space.
        if (byteCount == 0)
                return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
        else
                return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
}
#endif
438
439         #endregion
440
// Array-based version of "GetCharCount" that can carry state across
// several calls.
//
// Validates "bytes", "index" and "count", then pins the array and lets
// the pointer-based overload do the counting, passing every other
// argument through unchanged.  An empty range always reports zero.
#if NET_2_0
private unsafe static int InternalGetCharCount (
        byte[] bytes, int index, int count, uint leftOverBits,
        uint leftOverCount, object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
#else
private unsafe static int InternalGetCharCount (
        byte[] bytes, int index, int count, uint leftOverBits,
        uint leftOverCount, bool throwOnInvalid, bool flush)
#endif
{
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (index < 0 || index > bytes.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
        if (count < 0 || count > (bytes.Length - index))
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        // Nothing to look at.
        if (count == 0)
                return 0;

        fixed (byte* pinned = bytes) {
                byte* first = pinned + index;
#if NET_2_0
                return InternalGetCharCount (first, count,
                        leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
#else
                return InternalGetCharCount (first, count,
                        leftOverBits, leftOverCount, throwOnInvalid, flush);
#endif
        }
}
476
477 #if NET_2_0
478         private unsafe static int InternalGetCharCount (
479                 byte* bytes, int count, uint leftOverBits,
480                 uint leftOverCount, object provider,
481                 ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
482 #else
483         private unsafe static int InternalGetCharCount (
484                 byte* bytes, int count, uint leftOverBits,
485                 uint leftOverCount, bool throwOnInvalid, bool flush)
486 #endif
487         {
488                 int index = 0;
489
490                 int length = 0;
491
492                 if (leftOverCount == 0) {
493                         int end = index + count;
494                         for (; index < end; index++, count--) {
495                                 if (bytes [index] < 0x80)
496                                         length++;
497                                 else
498                                         break;
499                         }
500                 }
501
502                 // Determine the number of characters that we have.
503                 uint ch;
504                 uint leftBits = leftOverBits;
505                 uint leftSoFar = (leftOverCount & (uint)0x0F);
506                 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
507                 while (count > 0) {
508                         ch = (uint)(bytes[index++]);
509                         --count;
510                         if (leftSize == 0) {
511                                 // Process a UTF-8 start character.
512                                 if (ch < (uint)0x0080) {
513                                         // Single-byte UTF-8 character.
514                                         ++length;
515                                 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
516                                         // Double-byte UTF-8 character.
517                                         leftBits = (ch & (uint)0x1F);
518                                         leftSoFar = 1;
519                                         leftSize = 2;
520                                 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
521                                         // Three-byte UTF-8 character.
522                                         leftBits = (ch & (uint)0x0F);
523                                         leftSoFar = 1;
524                                         leftSize = 3;
525                                 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
526                                         // Four-byte UTF-8 character.
527                                         leftBits = (ch & (uint)0x07);
528                                         leftSoFar = 1;
529                                         leftSize = 4;
530                                 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
531                                         // Five-byte UTF-8 character.
532                                         leftBits = (ch & (uint)0x03);
533                                         leftSoFar = 1;
534                                         leftSize = 5;
535                                 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
536                                         // Six-byte UTF-8 character.
537                                         leftBits = (ch & (uint)0x03);
538                                         leftSoFar = 1;
539                                         leftSize = 6;
540                                 } else {
541                                         // Invalid UTF-8 start character.
542 #if NET_2_0
543                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
544 #else
545                                         if (throwOnInvalid)
546                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
547 #endif
548                                 }
549                         } else {
550                                 // Process an extra byte in a multi-byte sequence.
551                                 if ((ch & (uint)0xC0) == (uint)0x80) {
552                                         leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
553                                         if (++leftSoFar >= leftSize) {
554                                                 // We have a complete character now.
555                                                 if (leftBits < (uint)0x10000) {
556                                                         // is it an overlong ?
557                                                         bool overlong = false;
558                                                         switch (leftSize) {
559                                                         case 2:
560                                                                 overlong = (leftBits <= 0x7F);
561                                                                 break;
562                                                         case 3:
563                                                                 overlong = (leftBits <= 0x07FF);
564                                                                 break;
565                                                         case 4:
566                                                                 overlong = (leftBits <= 0xFFFF);
567                                                                 break;
568                                                         case 5:
569                                                                 overlong = (leftBits <= 0x1FFFFF);
570                                                                 break;
571                                                         case 6:
572                                                                 overlong = (leftBits <= 0x03FFFFFF);
573                                                                 break;
574                                                         }
575                                                         if (overlong) {
576 #if NET_2_0
577                                                                 length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
578 #else
579                                                                 if (throwOnInvalid)
580                                                                         throw new ArgumentException (_("Overlong"), leftBits.ToString ());
581 #endif
582                                                         }
583                                                         else
584                                                                 ++length;
585                                                 } else if (leftBits < (uint)0x110000) {
586                                                         length += 2;
587                                                 } else {
588 #if NET_2_0
589                                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
590 #else
591                                                         if (throwOnInvalid)
592                                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
593 #endif
594                                                 }
595                                                 leftSize = 0;
596                                         }
597                                 } else {
598                                         // Invalid UTF-8 sequence: clear and restart.
599 #if NET_2_0
600                                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
601 #else
602                                         if (throwOnInvalid)
603                                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
604 #endif
605                                         leftSize = 0;
606                                         --index;
607                                         ++count;
608                                 }
609                         }
610                 }
611                 if (flush && leftSize != 0) {
612                         // We had left-over bytes that didn't make up
613                         // a complete UTF-8 character sequence.
614 #if NET_2_0
615                         length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
616 #else
617                         if (throwOnInvalid)
618                                 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
619 #endif
620                 }
621
622                 // Return the final length to the caller.
623                 return length;
624         }
625
626 #if NET_2_0
627         // for GetCharCount()
// Counting overload (used by the char-count path): feeds `size` bytes,
// starting at bytes[index], through the decoder fallback one byte at a
// time and returns the total number of chars the fallback reported.
// `provider` is either a DecoderFallback or a Decoder; the fallback
// buffer and the one-byte scratch array are created lazily and handed
// back to the caller through the ref parameters.
static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
{
        if (buffer == null) {
                DecoderFallback fallback = provider as DecoderFallback;
                buffer = (fallback != null)
                        ? fallback.CreateFallbackBuffer ()
                        : ((Decoder) provider).FallbackBuffer;
        }
        if (bufferArg == null)
                bufferArg = new byte [1];

        int produced = 0;
        byte* cursor = bytes + (int) index;
        for (uint left = size; left > 0; left--, cursor++) {
                // The fallback API takes an array, so each byte is
                // copied into the scratch array before being reported.
                bufferArg [0] = *cursor;
                buffer.Fallback (bufferArg, 0);
                produced += buffer.Remaining;
                // Only the count is needed here; discard the chars.
                buffer.Reset ();
        }
        return produced;
}
648
649         // for GetChars()
// Converting overload (used by the char-output path): feeds `size`
// bytes, starting at bytes[byteIndex], through the decoder fallback one
// byte at a time and appends every char the fallback yields to `chars`,
// advancing `charIndex` past them.
// NOTE(review): the capacity of `chars` is not known here, so nothing in
// this method guards against writing past the end of the output.
static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
        char* chars, ref int charIndex)
{
        if (buffer == null) {
                DecoderFallback fallback = provider as DecoderFallback;
                buffer = (fallback != null)
                        ? fallback.CreateFallbackBuffer ()
                        : ((Decoder) provider).FallbackBuffer;
        }
        if (bufferArg == null)
                bufferArg = new byte [1];

        byte* cursor = bytes + byteIndex;
        for (uint left = size; left > 0; left--, cursor++) {
                // The fallback API takes an array, so each byte is
                // copied into the scratch array before being reported.
                bufferArg [0] = *cursor;
                buffer.Fallback (bufferArg, 0);
                // Drain everything the fallback produced for this byte.
                while (buffer.Remaining > 0)
                        chars [charIndex++] = buffer.GetNextChar ();
                buffer.Reset ();
        }
}
670 #endif
671
672         // Get the number of characters needed to decode a byte buffer.
public override int GetCharCount (byte[] bytes, int index, int count)
{
        // One-shot count: start with no left-over bits/count and flush
        // at the end, so a truncated trailing sequence is reported.
#if NET_2_0
        byte [] scratch = null;
        DecoderFallbackBuffer fallbackBuffer = null;
        return InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
#else
        return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
#endif
}
683
684 #if NET_2_0
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetCharCount (byte* bytes, int count)
{
        // Pointer flavour of the one-shot count: no carried-over state,
        // flush requested, fallback state created on demand.
        byte [] scratch = null;
        DecoderFallbackBuffer fallbackBuffer = null;
        return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
}
693 #endif
694
695         // Get the characters that result from decoding a byte buffer.
#if NET_2_0
private unsafe static int InternalGetChars (
        byte[] bytes, int byteIndex, int byteCount, char[] chars,
        int charIndex, ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
#else
private unsafe static int InternalGetChars (
        byte[] bytes, int byteIndex, int byteCount, char[] chars,
        int charIndex, ref uint leftOverBits, ref uint leftOverCount,
        bool throwOnInvalid, bool flush)
#endif
{
        // Array front end: validates the ranges, pins the arrays and
        // delegates to the pointer-based overload.
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
        if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

        // No room at all in the destination: report zero chars written.
        int room = chars.Length - charIndex;
        if (room == 0)
                return 0;

        // With no input bytes the pointer overload is still invoked (with
        // a null source) so that flush handling of left-over state runs.
        bool nothingToDecode = (byteCount == 0 || byteIndex == bytes.Length);

        fixed (char* charBase = chars) {
                char* dest = charBase + charIndex;
#if NET_2_0
                if (nothingToDecode)
                        return InternalGetChars (null, 0, dest, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
                fixed (byte* byteBase = bytes)
                        return InternalGetChars (byteBase + byteIndex, byteCount, dest, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
#else
                if (nothingToDecode)
                        return InternalGetChars (null, 0, dest, room, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
                fixed (byte* byteBase = bytes)
                        return InternalGetChars (byteBase + byteIndex, byteCount, dest, room, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
#endif
        }
}
745
// Core UTF-8 decoder working on raw pointers.
//
// Decodes `byteCount` bytes from `bytes` into `chars`, which has room for
// `charCount` chars, and returns the number of chars written.
//
// leftOverBits / leftOverCount carry a partially decoded sequence between
// calls: leftOverBits holds the payload bits gathered so far, the low
// nibble of leftOverCount is the number of bytes already consumed and the
// next nibble is the total length of the sequence.  Both are updated on
// return.
//
// Invalid input is handed to the decoder fallback (NET_2_0) or, in the
// 1.1 profile, raises ArgumentException when throwOnInvalid is set and is
// otherwise skipped.  When `flush` is true an incomplete trailing sequence
// is treated as invalid input as well.
//
// Throws ArgumentException ("chars") when the output buffer is too small.
#if NET_2_0
private unsafe static int InternalGetChars (
        byte* bytes, int byteCount, char* chars, int charCount,
        ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
#else
private unsafe static int InternalGetChars (
        byte* bytes, int byteCount, char* chars, int charCount,
        ref uint leftOverBits, ref uint leftOverCount,
        bool throwOnInvalid, bool flush)
#endif
{
        int charIndex = 0, byteIndex = 0;
        int length = charCount;
        int posn = charIndex;

        // Fast path: with no pending sequence, copy the leading run of
        // ASCII bytes directly.  The capacity check is required here too,
        // otherwise a long ASCII run would be written past the end of
        // the caller's buffer.
        if (leftOverCount == 0) {
                int end = byteIndex + byteCount;
                for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
                        if (bytes [byteIndex] >= 0x80)
                                break;
                        if (posn >= length) {
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                        }
                        chars [posn] = (char) bytes [byteIndex];
                }
        }

        // Convert the bytes into the output buffer.
        uint ch;
        uint leftBits = leftOverBits;
        uint leftSoFar = (leftOverCount & (uint)0x0F);
        uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

        // byteCount was decremented by the fast path, so this is still
        // the end of the original input range.
        int byteEnd = byteIndex + byteCount;
        for(; byteIndex < byteEnd; byteIndex++) {
                // Fetch the next character from the byte buffer.
                ch = (uint)(bytes[byteIndex]);
                if (leftSize == 0) {
                        // Process a UTF-8 start character.
                        if (ch < (uint)0x0080) {
                                // Single-byte UTF-8 character.
                                if (posn >= length) {
                                        throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                                }
                                chars[posn++] = (char)ch;
                        } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                                // Double-byte UTF-8 character.
                                leftBits = (ch & (uint)0x1F);
                                leftSoFar = 1;
                                leftSize = 2;
                        } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                                // Three-byte UTF-8 character.
                                leftBits = (ch & (uint)0x0F);
                                leftSoFar = 1;
                                leftSize = 3;
                        } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                                // Four-byte UTF-8 character.
                                leftBits = (ch & (uint)0x07);
                                leftSoFar = 1;
                                leftSize = 4;
                        } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                                // Five-byte UTF-8 character.
                                leftBits = (ch & (uint)0x03);
                                leftSoFar = 1;
                                leftSize = 5;
                        } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                                // Six-byte UTF-8 character.
                                leftBits = (ch & (uint)0x03);
                                leftSoFar = 1;
                                leftSize = 6;
                        } else {
                                // Invalid UTF-8 start character.
#if NET_2_0
                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
#else
                                if (throwOnInvalid)
                                        throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                        }
                } else {
                        // Process an extra byte in a multi-byte sequence.
                        if ((ch & (uint)0xC0) == (uint)0x80) {
                                leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                                if (++leftSoFar >= leftSize) {
                                        // We have a complete character now.
                                        // leftSoFar already counts the current
                                        // byte, so the sequence starts at
                                        // byteIndex - leftSoFar + 1.
                                        if (leftBits < (uint)0x10000) {
                                                // is it an overlong ?
                                                bool overlong = false;
                                                switch (leftSize) {
                                                case 2:
                                                        overlong = (leftBits <= 0x7F);
                                                        break;
                                                case 3:
                                                        overlong = (leftBits <= 0x07FF);
                                                        break;
                                                case 4:
                                                        overlong = (leftBits <= 0xFFFF);
                                                        break;
                                                case 5:
                                                        overlong = (leftBits <= 0x1FFFFF);
                                                        break;
                                                case 6:
                                                        overlong = (leftBits <= 0x03FFFFFF);
                                                        break;
                                                }
                                                if (overlong) {
#if NET_2_0
                                                        Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar + 1, leftSoFar, chars, ref posn);
#else
                                                        if (throwOnInvalid)
                                                                throw new ArgumentException (_("Overlong"), leftBits.ToString ());
#endif
                                                }
                                                else if ((leftBits & 0xF800) == 0xD800) {
                                                        // UTF-8 doesn't use surrogate characters
#if NET_2_0
                                                        Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar + 1, leftSoFar, chars, ref posn);
#else
                                                        if (throwOnInvalid)
                                                                throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                                                }
                                                else {
                                                        if (posn >= length) {
                                                                throw new ArgumentException
                                                                        (_("Arg_InsufficientSpace"), "chars");
                                                        }
                                                        chars[posn++] = (char)leftBits;
                                                }
                                        } else if (leftBits < (uint)0x110000) {
                                                // Supplementary plane: emit a surrogate pair.
                                                if ((posn + 2) > length) {
                                                        throw new ArgumentException
                                                                (_("Arg_InsufficientSpace"), "chars");
                                                }
                                                leftBits -= (uint)0x10000;
                                                chars[posn++] = (char)((leftBits >> 10) +
                                                                       (uint)0xD800);
                                                chars[posn++] =
                                                        (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                                        } else {
                                                // Beyond U+10FFFF: not a valid code point.
#if NET_2_0
                                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar + 1, leftSoFar, chars, ref posn);
#else
                                                if (throwOnInvalid)
                                                        throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                                        }
                                        leftSize = 0;
                                }
                        } else {
                                // Invalid UTF-8 sequence: clear and restart.
                                // The current byte is not part of the broken
                                // sequence, so the sequence starts at
                                // byteIndex - leftSoFar and the current byte
                                // is re-examined as a start byte.
#if NET_2_0
                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
#else
                                if (throwOnInvalid)
                                        throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                                leftSize = 0;
                                --byteIndex;
                        }
                }
        }
        if (flush && leftSize != 0) {
                // We had left-over bytes that didn't make up
                // a complete UTF-8 character sequence.
#if NET_2_0
                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
#else
                if (throwOnInvalid)
                        throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
        }
        // Hand the pending-sequence state back to the caller.
        leftOverBits = leftBits;
        leftOverCount = (leftSoFar | (leftSize << 4));

        // Return the final length to the caller.
        return posn - charIndex;
}
924
925         // Get the characters that result from decoding a byte buffer.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                              char[] chars, int charIndex)
{
        // Stateless one-shot decode: the pending-sequence state lives in
        // locals and a flush is requested at the end of the input.
        uint pendingBits = 0;
        uint pendingCount = 0;
#if NET_2_0
        byte [] scratch = null;
        DecoderFallbackBuffer fallbackBuffer = null;
        return InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, ref scratch, true);
#else
        return InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                ref pendingBits, ref pendingCount, throwOnInvalid, true);
#endif
}
941
942 #if NET_2_0
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
{
        // Pointer flavour of the stateless one-shot decode: local
        // pending-sequence state, flush requested.
        uint pendingBits = 0;
        uint pendingCount = 0;
        byte [] scratch = null;
        DecoderFallbackBuffer fallbackBuffer = null;
        return InternalGetChars (bytes, byteCount, chars, charCount,
                ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, ref scratch, true);
}
954 #endif
955
956         // Get the maximum number of bytes needed to encode a
957         // specified number of characters.
// Returns an upper bound (four bytes per char) on the number of bytes
// needed to encode `charCount` characters.
// Throws ArgumentOutOfRangeException when charCount is negative or when
// the bound does not fit in an Int32.
public override int GetMaxByteCount (int charCount)
{
        if (charCount < 0) {
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
        }
        // charCount * 4 would silently wrap around in an unchecked
        // context; reject counts whose bound cannot be represented.
        if (charCount > Int32.MaxValue / 4) {
                throw new ArgumentOutOfRangeException ("charCount");
        }
        return charCount * 4;
}
965
966         // Get the maximum number of characters needed to decode a
967         // specified number of bytes.
public override int GetMaxCharCount (int byteCount)
{
        // The bound returned is one char per input byte.
        if (byteCount >= 0)
                return byteCount;

        throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
}
975
976         // Get a UTF8-specific decoder that is attached to this instance.
public override Decoder GetDecoder ()
{
        // The decoder inherits this encoding's invalid-input policy:
        // the decoder fallback on 2.0, the throw flag on 1.1.
        Decoder decoder;
#if NET_2_0
        decoder = new UTF8Decoder (DecoderFallback);
#else
        decoder = new UTF8Decoder (throwOnInvalid);
#endif
        return decoder;
}
985
986         // Get a UTF8-specific encoder that is attached to this instance.
public override Encoder GetEncoder ()
{
        // A fresh encoder per call; it is handed this encoding's
        // emitIdentifier setting.
        Encoder encoder = new UTF8Encoder (emitIdentifier);
        return encoder;
}
991
992         // Get the UTF8 preamble.
public override byte[] GetPreamble ()
{
        // Without the identifier there is no preamble at all.
        if (!emitIdentifier)
                return new byte [0];

        // The three-byte UTF-8 signature EF BB BF; a new array is
        // returned on every call.
        return new byte [] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
}
1005
1006         // Determine if this object is equal to another.
// Two UTF8Encoding instances are equal when they share the code page and
// the identifier setting and, depending on the profile, have equal
// encoder/decoder fallbacks (2.0) or the same throwOnInvalid flag (1.1).
// Anything that is not a UTF8Encoding (including null) is unequal.
public override bool Equals (Object value)
{
        UTF8Encoding enc = (value as UTF8Encoding);
        if (enc == null)
                return false;

        if (codePage != enc.codePage || emitIdentifier != enc.emitIdentifier)
                return false;

#if NET_2_0
        // Compare the fallbacks by value rather than by reference so that
        // encodings configured with equivalent but distinct fallback
        // instances are still equal.  The static Object.Equals also
        // tolerates a null fallback on either side.
        return Object.Equals (DecoderFallback, enc.DecoderFallback) &&
                Object.Equals (EncoderFallback, enc.EncoderFallback);
#else
        return throwOnInvalid == enc.throwOnInvalid;
#endif
}
1025
1026         // Get the hash code for this object.
public override int GetHashCode ()
{
        // Defers entirely to the base class implementation.
        int hash = base.GetHashCode ();
        return hash;
}
1031
1032 #if NET_2_0
public override int GetByteCount (string chars)
{
        // Pure pass-through to the base class implementation.
        // hmm, does this override make any sense?
        int byteCount = base.GetByteCount (chars);
        return byteCount;
}
1038
[ComVisible (false)]
public override string GetString (byte [] bytes, int index, int count)
{
        // Pure pass-through to the base class implementation.
        // hmm, does this override make any sense?
        string decoded = base.GetString (bytes, index, count);
        return decoded;
}
1045 #endif
1046
1047 #if !NET_2_0
public override byte [] GetBytes (String s)
{
        if (s == null)
                throw new ArgumentNullException ("s");

        // Size the result exactly with a counting pass, then encode the
        // whole string into it.
        byte [] encoded = new byte [GetByteCount (s)];
        GetBytes (s, 0, s.Length, encoded, 0);
        return encoded;
}
1058 #endif
1059
1060         // UTF-8 decoder implementation.
[Serializable]
private class UTF8Decoder : Decoder
{
#if !NET_2_0
        private bool throwOnInvalid;
#endif
        // State of a multi-byte sequence left unfinished by the previous
        // GetChars call; see InternalGetChars for the packing used.
        private uint leftOverBits;
        private uint leftOverCount;

        // Constructor: starts with no pending sequence and records the
        // invalid-input policy (fallback on 2.0, throw flag on 1.1).
#if NET_2_0
        public UTF8Decoder (DecoderFallback fallback)
        {
                Fallback = fallback;
                leftOverBits = 0;
                leftOverCount = 0;
        }
#else
        public UTF8Decoder (bool throwOnInvalid)
        {
                this.throwOnInvalid = throwOnInvalid;
                leftOverBits = 0;
                leftOverCount = 0;
        }
#endif

        // Counts the chars the given bytes would decode to, taking the
        // pending state into account without modifying it (the state is
        // passed by value) and without flushing.
        public override int GetCharCount (byte[] bytes, int index, int count)
        {
#if NET_2_0
                byte [] scratch = null;
                DecoderFallbackBuffer fallbackBuffer = null;
                return InternalGetCharCount (bytes, index, count,
                        leftOverBits, leftOverCount, this, ref fallbackBuffer, ref scratch, false);
#else
                return InternalGetCharCount (bytes, index, count,
                        leftOverBits, leftOverCount, throwOnInvalid, false);
#endif
        }

        // Decodes the given bytes, continuing any pending sequence and
        // saving a new unfinished one for the next call (no flush).
        public override int GetChars (byte[] bytes, int byteIndex,
                                      int byteCount, char[] chars, int charIndex)
        {
#if NET_2_0
                byte [] scratch = null;
                DecoderFallbackBuffer fallbackBuffer = null;
                return InternalGetChars (bytes, byteIndex, byteCount,
                        chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref fallbackBuffer, ref scratch, false);
#else
                return InternalGetChars (bytes, byteIndex, byteCount,
                        chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
#endif
        }

} // class UTF8Decoder
1114
1115         // UTF-8 encoder implementation.
[Serializable]
private class UTF8Encoder : Encoder
{
        // Pending char carried between calls; counting and conversion
        // each keep their own copy.
        private char leftOverForCount;
        private char leftOverForConv;

        // Constructor.  The emitIdentifier argument is accepted but not
        // stored or used by the encoder.
        public UTF8Encoder (bool emitIdentifier)
        {
                leftOverForConv = '\0';
                leftOverForCount = '\0';
        }

        // Counts the bytes needed for the given chars, threading the
        // counting left-over char through the shared implementation.
        public override int GetByteCount (char[] chars, int index,
                                          int count, bool flush)
        {
                return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
        }

        // Encodes the given chars, threading the conversion left-over
        // char through the shared implementation.
        public override int GetBytes (char[] chars, int charIndex,
                                      int charCount, byte[] bytes, int byteIndex, bool flush)
        {
                return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
        }

#if NET_2_0
        // Pointer flavour of GetByteCount.
        public unsafe override int GetByteCount (char* chars, int count, bool flush)
        {
                return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
        }

        // Pointer flavour of GetBytes.
        public unsafe override int GetBytes (char* chars, int charCount,
                byte* bytes, int byteCount, bool flush)
        {
                return InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
        }
#endif

} // class UTF8Encoder
1163
1164 }; // class UTF8Encoding
1165
1166 }; // namespace System.Text