Update mcs/class/Commons.Xml.Relaxng/Commons.Xml.Relaxng/RelaxngPattern.cs
[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
1 /*
2  * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
3  *
4  * Copyright (c) 2001, 2002  Southern Storm Software, Pty Ltd
5  * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  * OTHER DEALINGS IN THE SOFTWARE.
24  */
25
26 namespace System.Text
27 {
28
29 using System;
30 using System.Runtime.InteropServices;
31
32 [Serializable]
33 [MonoLimitation ("Serialization format not compatible with .NET")]
34 [ComVisible (true)]
35 public class UTF8Encoding : Encoding
36 {
37         // Magic number used by Windows for UTF-8.
38         internal const int UTF8_CODE_PAGE = 65001;
39
40         // Internal state.
41         private bool emitIdentifier;
42
43         // Constructors.
// Parameterless constructor: forwards to the two-argument constructor
// with both encoderShouldEmitUTF8Identifier and throwOnInvalidBytes
// set to false.
public UTF8Encoding ()
        : this (false, false)
{
}
// Forwards to the two-argument constructor, passing the caller's
// encoderShouldEmitUTF8Identifier and false for throwOnInvalidBytes.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
        : this (encoderShouldEmitUTF8Identifier, false)
{
}
47         
// Primary constructor; the other overloads delegate here.
//
// encoderShouldEmitUTF8Identifier is stored in emitIdentifier.
// throwOnInvalidBytes selects the fallback pair handed to
// SetFallbackInternal: the exception fallbacks when true, the
// standard safe fallbacks when false.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
        : base (UTF8_CODE_PAGE)
{
        emitIdentifier = encoderShouldEmitUTF8Identifier;

        // Pick the encoder/decoder fallback pair first, then install it
        // with a single call.
        EncoderFallback encFallback;
        DecoderFallback decFallback;
        if (!throwOnInvalidBytes) {
                encFallback = EncoderFallback.StandardSafeFallback;
                decFallback = DecoderFallback.StandardSafeFallback;
        } else {
                encFallback = EncoderFallback.ExceptionFallback;
                decFallback = DecoderFallback.ExceptionFallback;
        }
        SetFallbackInternal (encFallback, decFallback);

        // Descriptive names and usage flags for this encoding.
        header_name = "utf-8";
        body_name = "utf-8";
        web_name = "utf-8";
        encoding_name = "Unicode (UTF-8)";
        is_browser_save = true;
        is_browser_display = true;
        is_mail_news_display = true;
        is_mail_news_save = true;
        windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
65
66         #region GetByteCount()
67
// Array-based front end of the byte counter.  Validates the arguments,
// pins the array and defers to the pointer overload.  "leftOver" carries
// a pending surrogate start between calls; "flush" tells whether such a
// pending char must be accounted for now.
//
// Throws ArgumentNullException when chars is null and
// ArgumentOutOfRangeException when index/count fall outside the array.
private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (index < 0 || index > chars.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
        if (count < 0 || count > (chars.Length - index))
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        if (index < chars.Length) {
                unsafe {
                        fixed (char* first = chars) {
                                return InternalGetByteCount (first + index, count, fallback, ref leftOver, flush);
                        }
                }
        }

        // index == chars.Length: there is nothing to scan, only a
        // possibly pending surrogate start to flush (three bytes).
        if (!flush || leftOver == '\0')
                return 0;
        leftOver = '\0';
        return 3;
}
98
// Counts the bytes needed to encode "count" UTF-16 code units starting
// at "chars".
//
// "leftOver" holds a surrogate start (U+D800..U+DBFF) left pending by an
// earlier call, or '\0'.  On return it holds the surrogate start that is
// still pending, unless "flush" is true, in which case a pending one is
// counted as three bytes and cleared.  A surrogate tail without a start,
// and a pending start followed by something other than a tail, are
// counted through the characters produced by "fallback".
private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
{
        int total = 0;
        char* origin = chars;
        char* limit = chars + count;
        EncoderFallbackBuffer fallbackBuffer = null;

        while (chars < limit) {
                if (leftOver != '\0') {
                        // A surrogate start is pending: the current char
                        // decides whether the pair is complete.
                        bool completesPair = *chars >= '\uDC00' && *chars <= '\uDFFF';
                        if (completesPair) {
                                total += 4;
                                chars++;
                        } else {
                                // Count the fallback output; the current
                                // char is not consumed and is re-examined
                                // on the next pass.
                                char [] substitute = GetFallbackChars (chars, origin, fallback, ref fallbackBuffer);
                                fixed (char* sub = substitute) {
                                        char ignored = '\0';
                                        total += InternalGetByteCount (sub, substitute.Length, fallback, ref ignored, true);
                                }
                        }
                        leftOver = '\0';
                        continue;
                }

                // No pending surrogate: scan until the input ends or an
                // unpaired surrogate start has to be remembered.
                while (chars < limit) {
                        char c = *chars;
                        if (c < '\u0080') {
                                total += 1;
                        } else if (c < '\u0800') {
                                total += 2;
                        } else if (c < '\uD800' || c > '\uDFFF') {
                                total += 3;
                        } else if (c > '\uDBFF') {
                                // Surrogate tail without a leading surrogate.
                                char [] substitute = GetFallbackChars (chars, origin, fallback, ref fallbackBuffer);
                                fixed (char* sub = substitute) {
                                        char ignored = '\0';
                                        total += InternalGetByteCount (sub, substitute.Length, fallback, ref ignored, true);
                                }
                                leftOver = '\0';
                        } else {
                                // Surrogate start: consume the whole pair when
                                // the tail is already available.
                                if (chars + 1 < limit && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
                                        total += 4;
                                        chars += 2;
                                        continue;
                                }
                                // Otherwise remember it and leave the scan loop.
                                leftOver = c;
                                chars++;
                                break;
                        }
                        chars++;
                }
        }

        // A surrogate start still pending at flush time counts as three bytes.
        if (flush && leftOver != '\0') {
                total += 3;
                leftOver = '\0';
        }
        return total;
}
166
// Runs the encoder fallback for the char at "chars" and returns the
// characters it produced.  "start" is the beginning of the input and is
// used only to compute the index reported to the fallback.  "buffer" is
// created from "fallback" on first use and is reset before returning so
// that it can be reused by the caller.
unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
{
        if (buffer == null)
                buffer = fallback.CreateFallbackBuffer ();

        int position = (int) (chars - start);
        buffer.Fallback (*chars, position);

        // Drain exactly the number of chars the buffer reports.
        int pending = buffer.Remaining;
        char [] replacement = new char [pending];
        int filled = 0;
        while (filled < pending) {
                replacement [filled] = buffer.GetNextChar ();
                filled++;
        }

        buffer.Reset ();
        return replacement;
}
182
// Returns the number of bytes needed to encode "count" chars of "chars"
// starting at "index".  No state is carried between calls: the count
// starts with no pending surrogate and is always flushed.
public override int GetByteCount (char[] chars, int index, int count)
{
        char pendingSurrogate = '\0';
        int byteCount = InternalGetByteCount (chars, index, count, EncoderFallback, ref pendingSurrogate, true);
        return byteCount;
}
189
190
// Pointer overload: returns the number of bytes needed to encode "count"
// chars starting at "chars".  The count starts with no pending surrogate
// and is always flushed.
//
// Throws ArgumentNullException when chars is null and
// ArgumentOutOfRangeException when count is negative.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetByteCount (char* chars, int count)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        // A negative count used to fall through and silently yield 0,
        // because the scan loop never ran; reject it explicitly.
        if (count < 0)
                throw new ArgumentOutOfRangeException ("count");
        if (count == 0)
                return 0;
        char dummy = '\0';
        return InternalGetByteCount (chars, count, EncoderFallback, ref dummy, true);
}
202
203         #endregion
204
205         #region GetBytes()
206
// Array-based front end of the encoder.  Validates the arguments, pins
// both arrays and defers to the pointer overload.  "leftOver" carries a
// pending surrogate start between calls; "buffer" is the fallback buffer
// shared with the caller.
//
// Throws ArgumentNullException for null arrays and
// ArgumentOutOfRangeException for indexes/counts outside the arrays.
private static int InternalGetBytes (char[] chars, int charIndex,
                                     int charCount, byte[] bytes,
                                     int byteIndex,
                                     EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
                                     ref char leftOver, bool flush)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
        if (charCount < 0 || charCount > (chars.Length - charIndex))
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

        if (charIndex == chars.Length) {
                // FIXME: use EncoderFallback.
                //
                // Nothing is written for a pending surrogate start here;
                // on flush it is simply dropped.
                if (flush)
                        leftOver = '\0';
                return 0;
        }

        int room = bytes.Length - byteIndex;
        unsafe {
                fixed (char* input = chars) {
                        // With no room left the byte array is not pinned;
                        // a null destination of size zero is passed instead.
                        if (room == 0)
                                return InternalGetBytes (
                                        input + charIndex, charCount,
                                        null, 0, fallback, ref buffer, ref leftOver, flush);
                        fixed (byte* output = bytes) {
                                return InternalGetBytes (
                                        input + charIndex, charCount,
                                        output + byteIndex, room,
                                        fallback, ref buffer,
                                        ref leftOver, flush);
                        }
                }
        }
}
258
// Encodes "count" UTF-16 code units starting at "chars" into at most
// "bcount" bytes starting at "bytes" and returns the number of bytes
// written.
//
// "leftOver" holds a surrogate start (U+D800..U+DBFF) left pending by an
// earlier call, or '\0'.  On return it holds the surrogate start that is
// still pending, unless "flush" is true, in which case a pending one is
// written as a three-byte sequence and cleared.
//
// A surrogate tail without a start, and a pending start followed by
// something other than a tail, are replaced by the characters produced
// by "fallback" (through "buffer", created on demand by
// GetFallbackChars).
//
// Throws ArgumentException ("bytes") when the output does not fit.
private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
{
        char* end = chars + count;
        char* start = chars;
        byte* start_bytes = bytes;
        byte* end_bytes = bytes + bcount;
        while (chars < end) {
                if (leftOver == 0) {
                        for (; chars < end; chars++) {
                                int ch = *chars;
                                if (ch < '\x80') {
                                        // One byte.
                                        if (bytes >= end_bytes)
                                                goto fail_no_space;
                                        *bytes++ = (byte)ch;
                                } else if (ch < '\x800') {
                                        // Two bytes.
                                        if (bytes + 1 >= end_bytes)
                                                goto fail_no_space;
                                        bytes [0] = (byte) (0xC0 | (ch >> 6));
                                        bytes [1] = (byte) (0x80 | (ch & 0x3F));
                                        bytes += 2;
                                } else if (ch < '\uD800' || ch > '\uDFFF') {
                                        // Three bytes (non-surrogate).
                                        if (bytes + 2 >= end_bytes)
                                                goto fail_no_space;
                                        bytes [0] = (byte) (0xE0 | (ch >> 12));
                                        bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                                        bytes [2] = (byte) (0x80 | (ch & 0x3F));
                                        bytes += 3;
                                } else if (ch <= '\uDBFF') {
                                        // Surrogate start: remember it and let the
                                        // outer loop pair it with the next char.
                                        leftOver = *chars;
                                        chars++;
                                        break;
                                } else {
                                        // We have a surrogate tail without
                                        // leading surrogate: emit the fallback
                                        // characters in its place.
                                        char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                                        char dummy = '\0';
                                        if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                                                goto fail_no_space;
                                        fixed (char *fb_chars = fallback_chars) {
                                                bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                                        }

                                        leftOver = '\0';
                                }
                        }
                } else {
                        if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
                                // We have a correct surrogate pair: four bytes.
                                int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
                                if (bytes + 3 >= end_bytes)
                                        goto fail_no_space;
                                bytes [0] = (byte) (0xF0 | (ch >> 18));
                                bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
                                bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                                bytes [3] = (byte) (0x80 | (ch & 0x3F));
                                bytes += 4;
                                chars++;
                        } else {
                                // We have a surrogate start followed by a
                                // regular character.  Technically, this is
                                // invalid, but we have to do something.
                                // We write out the fallback characters and
                                // then re-visit the current character again
                                // (chars is deliberately not advanced).
                                // NOTE(review): GetFallbackChars reads the char
                                // it reports from the pointer, so the fallback
                                // is told about *chars rather than leftOver.
                                char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                                char dummy = '\0';
                                if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                                        goto fail_no_space;
                                fixed (char *fb_chars = fallback_chars) {
                                        // Advance past the fallback output.  Without
                                        // this the next character overwrote it and
                                        // the result disagreed with
                                        // InternalGetByteCount.
                                        bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                                }
                        }
                        leftOver = '\0';
                }
        }
        if (flush) {
                // Flush the left-over surrogate pair start as a
                // three-byte sequence.
                if (leftOver != '\0') {
                        int ch = leftOver;
                        if (bytes + 2 < end_bytes) {
                                bytes [0] = (byte) (0xE0 | (ch >> 12));
                                bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                                bytes [2] = (byte) (0x80 | (ch & 0x3F));
                                bytes += 3;
                        } else {
                                goto fail_no_space;
                        }
                        leftOver = '\0';
                }
        }
        return (int) (bytes - start_bytes);
fail_no_space:
        throw new ArgumentException ("Insufficient Space", "bytes");
}
355
// Encodes "charCount" chars of "chars" starting at "charIndex" into
// "bytes" starting at "byteIndex" and returns the number of bytes
// written.  The call starts with no pending surrogate and no fallback
// buffer, and always flushes.
public override int GetBytes (char[] chars, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        EncoderFallbackBuffer fallbackBuffer = null;
        char pendingSurrogate = '\0';
        int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);
        return written;
}
364
// String overload of "GetBytes": encodes "charCount" chars of "s"
// starting at "charIndex" into "bytes" starting at "byteIndex" and
// returns the number of bytes written.  Always flushes.
//
// Throws ArgumentNullException for a null string or array and
// ArgumentOutOfRangeException for indexes/counts outside their ranges.
public override int GetBytes (String s, int charIndex, int charCount,
                              byte[] bytes, int byteIndex)
{
        if (s == null)
                throw new ArgumentNullException ("s");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (charIndex < 0 || charIndex > s.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
        if (charCount < 0 || charCount > (s.Length - charIndex))
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

        // Starting at the very end of the string: nothing to encode.
        if (charIndex == s.Length)
                return 0;

        char pendingSurrogate = '\0';
        EncoderFallbackBuffer fallbackBuffer = null;
        int room = bytes.Length - byteIndex;

        unsafe {
                fixed (char* text = s) {
                        // With no room left the byte array is not pinned;
                        // a null destination of size zero is passed instead.
                        if (room == 0)
                                return InternalGetBytes (
                                        text + charIndex, charCount,
                                        null, 0, EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);
                        fixed (byte* output = bytes) {
                                return InternalGetBytes (
                                        text + charIndex, charCount,
                                        output + byteIndex, room,
                                        EncoderFallback, ref fallbackBuffer,
                                        ref pendingSurrogate, true);
                        }
                }
        }
}
407
// Pointer overload of "GetBytes": encodes "charCount" chars starting at
// "chars" into at most "byteCount" bytes starting at "bytes" and returns
// the number of bytes written.  Always flushes.
//
// Throws ArgumentNullException for null pointers and
// IndexOutOfRangeException for negative counts.
// NOTE(review): the .NET documentation specifies
// ArgumentOutOfRangeException for negative counts; the exception type is
// left unchanged here so existing callers keep working.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
{
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (charCount < 0)
                throw new IndexOutOfRangeException ("charCount");
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        // Report the argument that is actually out of range (this check
        // previously said "charCount").
        if (byteCount < 0)
                throw new IndexOutOfRangeException ("byteCount");

        if (charCount == 0)
                return 0;

        char dummy = '\0';
        EncoderFallbackBuffer buffer = null;
        // An empty destination is passed on as a null pointer of size zero.
        if (byteCount == 0)
                return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
        else
                return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
}
431
432         #endregion
433
// Array-based front end of the char counter.  Validates the arguments,
// pins the array and defers to the pointer overload.  "leftOverBits" and
// "leftOverCount" describe a partially decoded sequence carried over
// from an earlier call.
//
// Throws ArgumentNullException when bytes is null and
// ArgumentOutOfRangeException when index/count fall outside the array.
private unsafe static int InternalGetCharCount (
        byte[] bytes, int index, int count, uint leftOverBits,
        uint leftOverCount, object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (index < 0 || index > bytes.Length)
                throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
        if (count < 0 || count > (bytes.Length - index))
                throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

        if (count != 0) {
                fixed (byte* first = bytes) {
                        return InternalGetCharCount (first + index, count,
                                leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
                }
        }

        // An empty range yields no characters.
        return 0;
}
458
// Counts the UTF-16 chars produced by decoding "count" bytes starting at
// "bytes".
//
// "leftOverBits" holds the bits gathered so far for a sequence begun in
// an earlier call.  "leftOverCount" packs two nibbles: the low one is
// the number of bytes seen so far, the next one the total length of the
// sequence (zero when no sequence is in progress).
//
// Invalid lead bytes, broken, overlong or surrogate-valued sequences and
// values of 0x110000 and above are counted through Fallback, as is an
// incomplete trailing sequence when "flush" is true.
private unsafe static int InternalGetCharCount (
        byte* bytes, int count, uint leftOverBits,
        uint leftOverCount, object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        int pos = 0;
        int charCount = 0;

        // Fast path: with no carried-over sequence, a leading run of
        // single-byte characters maps one to one.
        if (leftOverCount == 0) {
                while (count > 0 && bytes [pos] < 0x80) {
                        charCount++;
                        pos++;
                        count--;
                }
        }

        uint bits = leftOverBits;
        uint seen = (leftOverCount & (uint)0x0F);
        uint expected = ((leftOverCount >> 4) & (uint)0x0F);

        while (count > 0) {
                uint b = (uint)(bytes [pos++]);
                --count;

                if (expected == 0) {
                        // Expecting the first byte of a character.
                        if (b < (uint)0x0080) {
                                ++charCount;
                                continue;
                        }

                        uint size;
                        uint mask;
                        if ((b & (uint)0xE0) == (uint)0xC0) {
                                size = 2;
                                mask = 0x1F;
                        } else if ((b & (uint)0xF0) == (uint)0xE0) {
                                size = 3;
                                mask = 0x0F;
                        } else if ((b & (uint)0xF8) == (uint)0xF0) {
                                size = 4;
                                mask = 0x07;
                        } else if ((b & (uint)0xFC) == (uint)0xF8) {
                                size = 5;
                                mask = 0x03;
                        } else if ((b & (uint)0xFE) == (uint)0xFC) {
                                size = 6;
                                mask = 0x03;
                        } else {
                                // Not a valid start byte.
                                charCount += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - 1, 1);
                                continue;
                        }

                        bits = (b & mask);
                        seen = 1;
                        expected = size;
                        continue;
                }

                if ((b & (uint)0xC0) != (uint)0x80) {
                        // Sequence interrupted: report what was gathered,
                        // drop the state and look at this byte again as a
                        // potential start byte.
                        charCount += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - seen, seen);
                        expected = 0;
                        --pos;
                        ++count;
                        continue;
                }

                // Continuation byte: append its six payload bits.
                bits = ((bits << 6) | (b & (uint)0x3F));
                if (++seen < expected)
                        continue;

                // The sequence is complete; classify the decoded value.
                if (bits >= (uint)0x110000) {
                        charCount += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - seen, seen);
                } else if (bits >= (uint)0x10000) {
                        // Counted as two chars.
                        charCount += 2;
                } else {
                        // Below 0x10000: reject encodings longer than needed.
                        bool overlong;
                        if (expected == 2)
                                overlong = (bits <= 0x7F);
                        else if (expected == 3)
                                overlong = (bits <= 0x07FF);
                        else if (expected == 4)
                                overlong = (bits <= 0xFFFF);
                        else if (expected == 5)
                                overlong = (bits <= 0x1FFFFF);
                        else if (expected == 6)
                                overlong = (bits <= 0x03FFFFFF);
                        else
                                overlong = false;

                        // UTF-8 doesn't use surrogate characters either.
                        if (overlong || (bits & 0xF800) == 0xD800)
                                charCount += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - seen, seen);
                        else
                                ++charCount;
                }
                expected = 0;
        }

        // Bytes left over that do not make up a complete sequence are
        // only reported when flushing.
        if (flush && expected != 0)
                charCount += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - seen, seen);

        return charCount;
}
580
// Helper for GetCharCount(): feeds each of the "size" bytes starting at
// bytes[index] to the decoder fallback, one byte per call, and returns
// the total number of replacement characters the fallback reports.
// The fallback buffer and the one-byte scratch array are created on
// first use and handed back to the caller through the ref parameters.
static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
{
        // provider is treated as a DecoderFallback when it is one;
        // anything else is cast to Decoder.
        if (buffer == null) {
                DecoderFallback fallback = provider as DecoderFallback;
                buffer = (fallback != null)
                        ? fallback.CreateFallbackBuffer ()
                        : ((Decoder) provider).FallbackBuffer;
        }
        if (bufferArg == null)
                bufferArg = new byte [1];

        int start = (int) index;
        int total = 0;
        int offset = 0;
        while (offset < size) {
                bufferArg [0] = bytes [start + offset];
                buffer.Fallback (bufferArg, 0);
                // Only the count is needed; the replacement characters
                // themselves are discarded by the Reset() below.
                total += buffer.Remaining;
                buffer.Reset ();
                offset++;
        }
        return total;
}
602
// Helper for GetChars(): feeds each of the "size" bytes starting at
// bytes[byteIndex] to the decoder fallback, one byte per call, and appends
// every replacement character it produces to chars, advancing charIndex
// past what was written.
// NOTE(review): nothing here checks charIndex against the capacity of
// chars (the signature carries no length), so the caller has to guarantee
// there is room for the replacement characters.
static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
        char* chars, ref int charIndex)
{
        // provider is treated as a DecoderFallback when it is one;
        // anything else is cast to Decoder.
        if (buffer == null) {
                DecoderFallback fallback = provider as DecoderFallback;
                buffer = (fallback != null)
                        ? fallback.CreateFallbackBuffer ()
                        : ((Decoder) provider).FallbackBuffer;
        }
        if (bufferArg == null)
                bufferArg = new byte [1];

        long stop = byteIndex + size;
        for (long src = byteIndex; src < stop; src++) {
                bufferArg [0] = bytes [src];
                buffer.Fallback (bufferArg, 0);
                // Drain everything the fallback produced for this byte.
                while (buffer.Remaining > 0)
                        chars [charIndex++] = buffer.GetNextChar ();
                buffer.Reset ();
        }
}
624
// Returns the number of characters needed to decode "count" bytes of
// "bytes" starting at "index". Each call starts from fresh state (zeros
// for the two state arguments, a new fallback buffer) and uses this
// encoding's DecoderFallback, with the trailing flag set to true.
public override int GetCharCount (byte[] bytes, int index, int count)
{
        byte [] scratch = null;
        DecoderFallbackBuffer fallbackBuffer = null;
        return InternalGetCharCount (bytes, index, count, 0, 0,
                DecoderFallback, ref fallbackBuffer, ref scratch, true);
}
632
// Pointer flavour of GetCharCount: returns the number of characters needed
// to decode "count" bytes starting at "bytes". Each call starts from fresh
// state (zeros for the two state arguments, a new fallback buffer) and uses
// this encoding's DecoderFallback, with the trailing flag set to true.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetCharCount (byte* bytes, int count)
{
        byte [] scratch = null;
        DecoderFallbackBuffer fallbackBuffer = null;
        return InternalGetCharCount (bytes, count, 0, 0,
                DecoderFallback, ref fallbackBuffer, ref scratch, true);
}
641
// Array front end for the pointer-based InternalGetChars: validates the
// arguments, pins the arrays and decodes byteCount bytes starting at
// bytes[byteIndex] into chars starting at chars[charIndex].
// Returns the number of characters written.
//
// Throws ArgumentNullException when bytes or chars is null and
// ArgumentOutOfRangeException when an index/count lies outside its array.
//
// NOTE(review): when chars has no room left (charIndex == chars.Length)
// this returns 0 even if byteCount > 0, and the left-over state is not
// touched - confirm that callers expect the input to be dropped silently.
private unsafe static int InternalGetChars (
        byte[] bytes, int byteIndex, int byteCount, char[] chars,
        int charIndex, ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        // Argument validation, in the same order as the parameters.
        if (bytes == null)
                throw new ArgumentNullException ("bytes");
        if (chars == null)
                throw new ArgumentNullException ("chars");
        if (byteIndex < 0 || byteIndex > bytes.Length)
                throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
        if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
                throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
        if (charIndex < 0 || charIndex > chars.Length)
                throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

        int room = chars.Length - charIndex;
        if (room == 0)
                return 0;

        fixed (char* cptr = chars) {
                char* dest = cptr + charIndex;

                // Nothing to read: skip pinning the byte array and pass a null
                // input pointer with a zero count.
                if (byteCount == 0 || byteIndex == bytes.Length)
                        return InternalGetChars (null, 0, dest, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);

                fixed (byte* bptr = bytes)
                        return InternalGetChars (bptr + byteIndex, byteCount, dest, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
        }
}
677
// Core UTF-8 decoder: converts byteCount bytes at "bytes" into UTF-16 code
// units stored at "chars" (capacity charCount) and returns the number of
// code units written.
//
// leftOverBits / leftOverCount carry a partially read multi-byte sequence
// between calls: leftOverBits holds the payload bits gathered so far, and
// leftOverCount packs the number of bytes consumed (low nibble) together
// with the total length of the sequence (next nibble). Both are written
// back before returning.
//
// Malformed input (invalid start byte, bad continuation byte, overlong
// form, encoded surrogate, value >= 0x110000) is handed byte by byte to the
// decoder fallback obtained from "provider". When "flush" is true a
// trailing incomplete sequence is handed to the fallback as well.
//
// Throws ArgumentException ("chars") when a decoded character does not fit
// into the output buffer. Characters produced by the fallback are written
// by the Fallback helper, which performs no capacity check of its own.
private unsafe static int InternalGetChars (
        byte* bytes, int byteCount, char* chars, int charCount,
        ref uint leftOverBits, ref uint leftOverCount,
        object provider,
        ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
        int charIndex = 0, byteIndex = 0;
        int length = charCount;
        int posn = charIndex;

        // Fast path: with no pending sequence, copy the leading run of
        // ASCII bytes straight across. byteCount is decremented alongside
        // so that the general loop below picks up exactly where this stops.
        if (leftOverCount == 0) {
                int end = byteIndex + byteCount;
                for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
                        if (bytes [byteIndex] >= 0x80)
                                break;
                        // Same capacity check as the general loop; without it an
                        // ASCII run longer than the output buffer would be written
                        // past the end of chars.
                        if (posn >= length) {
                                throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                        }
                        chars [posn] = (char) bytes [byteIndex];
                }
        }

        // Convert the bytes into the output buffer.
        uint ch;
        uint leftBits = leftOverBits;
        uint leftSoFar = (leftOverCount & (uint)0x0F);
        uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

        int byteEnd = byteIndex + byteCount;
        for(; byteIndex < byteEnd; byteIndex++) {
                // Fetch the next character from the byte buffer.
                ch = (uint)(bytes[byteIndex]);
                if (leftSize == 0) {
                        // Process a UTF-8 start character.
                        if (ch < (uint)0x0080) {
                                // Single-byte UTF-8 character.
                                if (posn >= length) {
                                        throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                                }
                                chars[posn++] = (char)ch;
                        } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                                // Double-byte UTF-8 character.
                                leftBits = (ch & (uint)0x1F);
                                leftSoFar = 1;
                                leftSize = 2;
                        } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                                // Three-byte UTF-8 character.
                                leftBits = (ch & (uint)0x0F);
                                leftSoFar = 1;
                                leftSize = 3;
                        } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                                // Four-byte UTF-8 character.
                                leftBits = (ch & (uint)0x07);
                                leftSoFar = 1;
                                leftSize = 4;
                        } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                                // Five-byte UTF-8 character.
                                leftBits = (ch & (uint)0x03);
                                leftSoFar = 1;
                                leftSize = 5;
                        } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                                // Six-byte UTF-8 character. The lead byte is 0xFC or
                                // 0xFD here, so masking with 0x03 keeps only its low bit.
                                leftBits = (ch & (uint)0x03);
                                leftSoFar = 1;
                                leftSize = 6;
                        } else {
                                // Invalid UTF-8 start character.
                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
                        }
                } else {
                        // Process an extra byte in a multi-byte sequence.
                        if ((ch & (uint)0xC0) == (uint)0x80) {
                                leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                                if (++leftSoFar >= leftSize) {
                                        // We have a complete character now.
                                        //
                                        // NOTE(review): in the Fallback calls of this branch
                                        // byteIndex is the last byte of the sequence and
                                        // leftSoFar already counts it, so byteIndex - leftSoFar
                                        // points one byte before the sequence start. Left as is
                                        // because InternalGetCharCount passes the same offset
                                        // (index - leftSoFar); the two need to change together.
                                        if (leftBits < (uint)0x10000) {
                                                // Is it an overlong form, i.e. a value that
                                                // would have fit into a shorter sequence?
                                                bool overlong = false;
                                                switch (leftSize) {
                                                case 2:
                                                        overlong = (leftBits <= 0x7F);
                                                        break;
                                                case 3:
                                                        overlong = (leftBits <= 0x07FF);
                                                        break;
                                                case 4:
                                                        overlong = (leftBits <= 0xFFFF);
                                                        break;
                                                case 5:
                                                        overlong = (leftBits <= 0x1FFFFF);
                                                        break;
                                                case 6:
                                                        overlong = (leftBits <= 0x03FFFFFF);
                                                        break;
                                                }
                                                if (overlong) {
                                                        Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                                }
                                                else if ((leftBits & 0xF800) == 0xD800) {
                                                        // UTF-8 doesn't use surrogate characters
                                                        Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                                }
                                                else {
                                                        if (posn >= length) {
                                                                throw new ArgumentException
                                                                        (_("Arg_InsufficientSpace"), "chars");
                                                        }
                                                        chars[posn++] = (char)leftBits;
                                                }
                                        } else if (leftBits < (uint)0x110000) {
                                                // Supplementary character: emit a surrogate pair.
                                                if ((posn + 2) > length) {
                                                        throw new ArgumentException
                                                                (_("Arg_InsufficientSpace"), "chars");
                                                }
                                                leftBits -= (uint)0x10000;
                                                chars[posn++] = (char)((leftBits >> 10) +
                                                                       (uint)0xD800);
                                                chars[posn++] =
                                                        (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                                        } else {
                                                // Value at or above 0x110000.
                                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                        }
                                        leftSize = 0;
                                }
                        } else {
                                // Invalid UTF-8 sequence: clear and restart.
                                // The bytes consumed so far go to the fallback; the
                                // current byte is then re-examined as a start byte,
                                // hence the step back.
                                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                                leftSize = 0;
                                --byteIndex;
                        }
                }
        }
        if (flush && leftSize != 0) {
                // We had left-over bytes that didn't make up
                // a complete UTF-8 character sequence.
                Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
        }

        // Hand the (possibly still pending) sequence state back to the caller.
        leftOverBits = leftBits;
        leftOverCount = (leftSoFar | (leftSize << 4));

        // Return the final length to the caller.
        return posn - charIndex;
}
819
// Decodes byteCount bytes of "bytes" starting at byteIndex into "chars"
// starting at charIndex and returns the number of characters written.
// Every call starts with no left-over state, uses this encoding's
// DecoderFallback and flushes at the end.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                              char[] chars, int charIndex)
{
        byte [] scratch = null;
        DecoderFallbackBuffer fallbackBuffer = null;
        uint pendingBits = 0;
        uint pendingCount = 0;
        return InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                ref pendingBits, ref pendingCount, DecoderFallback,
                ref fallbackBuffer, ref scratch, true);
}
831
// Pointer flavour of GetChars: decodes byteCount bytes at "bytes" into the
// buffer at "chars" (capacity charCount) and returns the number of
// characters written. Every call starts with no left-over state, uses this
// encoding's DecoderFallback and flushes at the end.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
{
        uint pendingBits = 0;
        uint pendingCount = 0;
        byte [] scratch = null;
        DecoderFallbackBuffer fallbackBuffer = null;
        return InternalGetChars (bytes, byteCount, chars, charCount,
                ref pendingBits, ref pendingCount, DecoderFallback,
                ref fallbackBuffer, ref scratch, true);
}
843
// Returns an upper bound on the number of bytes needed to encode charCount
// characters, computed as four bytes per character.
// Throws ArgumentOutOfRangeException when charCount is negative or when the
// bound does not fit into an int.
public override int GetMaxByteCount (int charCount)
{
        if (charCount < 0) {
                throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
        }

        // Multiply in 64 bits: in an unchecked context charCount * 4 wraps
        // around for large inputs and would hand callers a bogus (possibly
        // negative) buffer size.
        long maxBytes = (long) charCount * 4;
        if (maxBytes > int.MaxValue) {
                throw new ArgumentOutOfRangeException ("charCount");
        }
        return (int) maxBytes;
}
853
// Returns the maximum number of characters that decoding byteCount bytes
// is taken to produce, which this implementation reports as byteCount
// itself. Throws ArgumentOutOfRangeException for a negative byteCount.
public override int GetMaxCharCount (int byteCount)
{
        if (byteCount >= 0)
                return byteCount;

        throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
}
863
// Creates a new UTF8Decoder that uses this encoding's DecoderFallback.
public override Decoder GetDecoder ()
{
        Decoder decoder = new UTF8Decoder (DecoderFallback);
        return decoder;
}
869
// Creates a new UTF8Encoder from this encoding's EncoderFallback and its
// emitIdentifier setting.
public override Encoder GetEncoder ()
{
        Encoder encoder = new UTF8Encoder (EncoderFallback, emitIdentifier);
        return encoder;
}
875
// Returns the UTF-8 preamble: a fresh array holding EF BB BF when
// emitIdentifier is set, otherwise the "empty" array.
public override byte[] GetPreamble ()
{
        return emitIdentifier
                ? new byte [] { 0xEF, 0xBB, 0xBF }
                : empty;
}
884
// Two UTF8Encoding objects are equal when they agree on the code page, on
// emitIdentifier and on both the decoder and the encoder fallback.
// Anything that is not a UTF8Encoding (including null) is unequal.
public override bool Equals (Object value)
{
        UTF8Encoding other = value as UTF8Encoding;
        if (other == null)
                return false;

        if (codePage != other.codePage)
                return false;
        if (emitIdentifier != other.emitIdentifier)
                return false;
        if (!DecoderFallback.Equals (other.DecoderFallback))
                return false;
        return EncoderFallback.Equals (other.EncoderFallback);
}
898
// The hash code is whatever the base class computes; nothing specific to
// UTF8Encoding is mixed in.
public override int GetHashCode ()
{
        int hash = base.GetHashCode ();
        return hash;
}
904
// Plain pass-through to the base class implementation.
// (It is unclear whether this override serves any purpose.)
public override int GetByteCount (string chars)
{
        int byteCount = base.GetByteCount (chars);
        return byteCount;
}
910
// Plain pass-through to the base class implementation.
// (It is unclear whether this override serves any purpose.)
[ComVisible (false)]
public override string GetString (byte [] bytes, int index, int count)
{
        string decoded = base.GetString (bytes, index, count);
        return decoded;
}
917
// Stateful UTF-8 decoder handed out by GetDecoder(). It never asks for a
// flush, so an incomplete multi-byte sequence at the end of one GetChars()
// call is kept in the two fields below and continued by the next call.
[Serializable]
private class UTF8Decoder : Decoder
{
        // Pending-sequence state shared with InternalGetChars.
        private uint leftOverBits;
        private uint leftOverCount;

        // Starts with no pending sequence and the given fallback.
        public UTF8Decoder (DecoderFallback fallback)
        {
                leftOverCount = 0;
                leftOverBits = 0;
                Fallback = fallback;
        }

        // Counts the characters the given bytes would decode to, taking the
        // pending state into account. The state is passed by value, so
        // counting does not modify it.
        public override int GetCharCount (byte[] bytes, int index, int count)
        {
                byte [] scratch = null;
                DecoderFallbackBuffer fallbackBuffer = null;
                return InternalGetCharCount (bytes, index, count,
                        leftOverBits, leftOverCount, this,
                        ref fallbackBuffer, ref scratch, false);
        }

        // Decodes the given bytes into chars and updates the pending state.
        // The decoder itself is passed as the fallback provider.
        public override int GetChars (byte[] bytes, int byteIndex,
                                      int byteCount, char[] chars, int charIndex)
        {
                byte [] scratch = null;
                DecoderFallbackBuffer fallbackBuffer = null;
                return InternalGetChars (bytes, byteIndex, byteCount,
                        chars, charIndex, ref leftOverBits, ref leftOverCount, this,
                        ref fallbackBuffer, ref scratch, false);
        }

} // class UTF8Decoder
951
// Stateful UTF-8 encoder handed out by GetEncoder(). Counting and
// converting each keep their own left-over character, which is passed by
// reference to the corresponding internal routine on every call
// (presumably a character that cannot be encoded until the next call
// arrives - confirm against InternalGetByteCount / InternalGetBytes).
[Serializable]
private class UTF8Encoder : Encoder
{
        private char leftOverForCount;
        private char leftOverForConv;

        // The emitIdentifier argument is accepted but not used here.
        public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
        {
                leftOverForConv = '\0';
                leftOverForCount = '\0';
                Fallback = fallback;
        }

        // Array-based byte count; uses the counting left-over state.
        public override int GetByteCount (char[] chars, int index,
                                          int count, bool flush)
        {
                return InternalGetByteCount (chars, index, count, Fallback, ref leftOverForCount, flush);
        }

        // Array-based conversion; uses the conversion left-over state and a
        // fallback buffer that starts out null on every call.
        public override int GetBytes (char[] chars, int charIndex,
                                      int charCount, byte[] bytes, int byteIndex, bool flush)
        {
                EncoderFallbackBuffer fallbackBuffer = null;
                return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
        }

        // Pointer-based byte count; uses the counting left-over state.
        public unsafe override int GetByteCount (char* chars, int count, bool flush)
        {
                return InternalGetByteCount (chars, count, Fallback, ref leftOverForCount, flush);
        }

        // Pointer-based conversion; uses the conversion left-over state and
        // a fallback buffer that starts out null on every call.
        public unsafe override int GetBytes (char* chars, int charCount,
                                             byte* bytes, int byteCount, bool flush)
        {
                EncoderFallbackBuffer fallbackBuffer = null;
                return InternalGetBytes (chars, charCount, bytes, byteCount, Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
        }
} // class UTF8Encoder
1000
1001 }; // class UTF8Encoding
1002
1003 }; // namespace System.Text