2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
// UTF-8 implementation of Encoding. Two flags are captured at construction
// time: whether the UTF-8 identifier should be emitted (read by GetPreamble
// and handed to UTF8Encoder) and whether invalid byte sequences should raise
// an exception (handed to the decoding helpers and to UTF8Decoder).
31 public class UTF8Encoding : Encoding
33 // Magic number used by Windows for UTF-8.
34 internal const int UTF8_CODE_PAGE = 65001;
// Assigned by the three-argument constructor from encoderShouldEmitUTF8Identifier.
37 private bool emitIdentifier;
// Assigned by the three-argument constructor from throwOnInvalidBytes.
38 private bool throwOnInvalid;
// Default construction: both flags false.
41 public UTF8Encoding () : this (false, false) {}
// Caller chooses the identifier flag; invalid bytes do not throw.
42 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
43 : this (encoderShouldEmitUTF8Identifier, false) {}
// Designated constructor: registers UTF8_CODE_PAGE with the base class and
// stores both flags.
44 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
45 : base (UTF8_CODE_PAGE)
47 emitIdentifier = encoderShouldEmitUTF8Identifier;
48 throwOnInvalid = throwOnInvalidBytes;
51 // Internal version of "GetByteCount" which can handle a rolling
52 // state between multiple calls to this method.
//
// Parameters:
//   chars    - source character buffer; must not be null.
//   index    - first character to examine; 0 .. chars.Length inclusive.
//   count    - number of characters to examine; must fit in chars after index.
//   leftOver - state carried in from a previous call (passed by value, so
//              this method cannot update the caller's copy).
//   flush    - when true, a still-pending surrogate start is accounted for
//              at the end instead of being left for a later call.
// Throws ArgumentNullException / ArgumentOutOfRangeException on bad arguments.
//
// NOTE(review): the embedded line numbers skip, so the loop header, the
// declarations of "ch"/"pair" and the length updates are not visible here;
// the comments below describe only the statements that are.
53 private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
55 // Validate the parameters.
57 throw new ArgumentNullException ("chars");
// index == chars.Length is accepted (an empty range at the end of the array).
59 if (index < 0 || index > chars.Length) {
60 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
// Written as a subtraction so that index + count cannot overflow.
62 if (count < 0 || count > (chars.Length - index)) {
63 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
66 // Determine the lengths of all characters.
// Characters below U+0800 (InternalGetBytes writes two bytes for this range).
75 } else if (ch < '\u0800') {
// U+D800..U+DBFF: high (leading) surrogate.
77 } else if (ch >= '\uD800' && ch <= '\uDBFF') {
78 // This is the start of a surrogate pair.
// U+DC00..U+DFFF: low (trailing) surrogate arriving after a pending start.
83 } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
84 // We have a surrogate pair.
88 // We have a surrogate start followed by a
89 // regular character. Technically, this is
90 // invalid, but we have to do something.
91 // We write out the surrogate start and then
92 // re-visit the current character again.
// Only a flushing call accounts for an unpaired surrogate start at the end.
100 if (flush && pair != 0) {
101 // Flush the left-over surrogate pair start.
105 // Return the final length to the caller.
109 // Get the number of bytes needed to encode a character buffer.
// Stateless entry point: delegates to InternalGetByteCount with no carried-in
// state (leftOver = 0) and flush = true. Argument validation and the
// resulting exceptions come from InternalGetByteCount.
110 public override int GetByteCount (char[] chars, int index, int count)
112 return InternalGetByteCount (chars, index, count, 0, true);
115 // Convenience wrappers for "GetByteCount".
// String overload: walks the whole string (count starts at s.Length) and
// sums the encoded length of each character. Throws ArgumentNullException
// for a null string.
//
// NOTE(review): the loop header, the declarations of "ch"/"pair"/"index" and
// the length updates are not visible in this listing.
116 public override int GetByteCount (String s)
118 // Validate the parameters.
120 throw new ArgumentNullException ("s");
123 // Determine the lengths of all characters.
126 int count = s.Length;
// Characters below U+0800.
133 } else if (ch < '\u0800') {
// A high surrogate is only treated as a possible pair start when at least
// one more character remains (count > 1), so the look-ahead below cannot
// read past the end of the string.
135 } else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) {
136 // This may be the start of a surrogate pair.
// Peek at the next character - presumably "index" has already been advanced
// past "ch" at this point; confirm against the full source.
137 pair = (uint)(s[index]);
// It is a real pair only if the following character is a low surrogate.
138 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
151 // Return the final length to the caller.
155 // Internal version of "GetBytes" which can handle a rolling
156 // state between multiple calls to this method.
//
// Encodes charCount characters of "chars", starting at charIndex, into
// "bytes" starting at byteIndex, and returns the number of bytes written
// (posn - byteIndex).
//
// Parameters:
//   chars, charIndex, charCount - source range; validated below.
//   bytes, byteIndex            - destination buffer and start offset.
//   leftOver (ref)              - state carried between calls; copied into
//                                 the local "left" on entry.
//   flush                       - used below to decide whether a pending
//                                 surrogate start is written out at the end
//                                 (its declaration is on a signature line
//                                 that is not visible in this listing).
// Throws ArgumentNullException, ArgumentOutOfRangeException, and
// ArgumentException ("Arg_InsufficientSpace") when "bytes" is too small.
//
// NOTE(review): the embedded line numbers skip; the declarations of
// "ch"/"pair", the assignments to "left" and the write-back to "leftOver"
// are not visible, so they are not described here.
157 private static int InternalGetBytes (char[] chars, int charIndex,
158 int charCount, byte[] bytes,
159 int byteIndex, ref uint leftOver,
162 // Validate the parameters.
164 throw new ArgumentNullException ("chars");
167 throw new ArgumentNullException ("bytes");
// charIndex == chars.Length is accepted (empty range at the end).
169 if (charIndex < 0 || charIndex > chars.Length) {
170 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// Subtraction form avoids overflow of charIndex + charCount.
172 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
173 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
175 if (byteIndex < 0 || byteIndex > bytes.Length) {
176 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
179 // Convert the characters into bytes.
// "length" is the exclusive upper bound for every write into "bytes".
181 int length = bytes.Length;
183 uint left = leftOver;
// "posn" is the next write position in "bytes".
184 int posn = byteIndex;
185 while (charCount > 0) {
186 // Fetch the next UTF-16 character pair value.
187 ch = chars[charIndex++];
// High surrogate (U+D800..U+DBFF).
190 if (ch >= '\uD800' && ch <= '\uDBFF') {
191 // This is the start of a surrogate pair.
195 // This is a regular character.
// Low surrogate (U+DC00..U+DFFF) following a pending start held in "left".
198 } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
199 // We have a surrogate pair.
// Combine the two halves: ten bits from the high surrogate, ten from the
// low one. The final term of this sum is on a line not visible here
// (presumably the 0x10000 supplementary-plane offset - confirm).
200 pair = ((left - (uint)0xD800) << 10) +
201 (((uint)ch) - (uint)0xDC00) +
205 // We have a surrogate start followed by a
206 // regular character. Technically, this is
207 // invalid, but we have to do something.
208 // We write out the surrogate start and then
209 // re-visit the current character again.
216 // Encode the character pair value.
// Below 0x80: one byte, value stored as-is.
217 if (pair < (uint)0x0080) {
218 if (posn >= length) {
219 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
221 bytes[posn++] = (byte)pair;
// Below 0x800: two bytes, 110xxxxx 10xxxxxx.
222 } else if (pair < (uint)0x0800) {
223 if ((posn + 2) > length) {
224 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
226 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
227 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Below 0x10000: three bytes, 1110xxxx 10xxxxxx 10xxxxxx.
228 } else if (pair < (uint)0x10000) {
229 if ((posn + 3) > length) {
230 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
232 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
233 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
234 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Remaining values: four bytes, 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
236 if ((posn + 4) > length) {
237 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
239 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
240 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
241 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
242 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// On a flushing call, a still-pending value in "left" is written out as a
// three-byte sequence.
// NOTE(review): if "left" holds an unpaired high surrogate, this emits an
// encoded surrogate code point, which strict UTF-8 consumers reject -
// confirm this is the intended behaviour.
245 if (flush && left != 0) {
246 // Flush the left-over surrogate pair start.
247 if ((posn + 3) > length) {
248 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
250 bytes[posn++] = (byte)(0xE0 | (left >> 12));
251 bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
252 bytes[posn++] = (byte)(0x80 | (left & 0x3F));
257 // Return the final count to the caller.
258 return posn - byteIndex;
261 // Get the bytes that result from encoding a character buffer.
// Stateless entry point: delegates to InternalGetBytes with flush = true and
// returns its result (the number of bytes written). Validation and the
// resulting exceptions come from InternalGetBytes.
// NOTE(review): the declaration of "leftOver" is not visible in this listing;
// presumably a local initialised to zero so no state survives the call.
262 public override int GetBytes (char[] chars, int charIndex, int charCount,
263 byte[] bytes, int byteIndex)
266 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
269 // Convenience wrappers for "GetBytes".
// String overload: encodes charCount characters of "s", starting at
// charIndex, into "bytes" starting at byteIndex, and returns the number of
// bytes written (posn - byteIndex).
// Throws ArgumentNullException, ArgumentOutOfRangeException, and
// ArgumentException ("Arg_InsufficientSpace") when "bytes" is too small.
//
// NOTE(review): the embedded line numbers skip; the fetch of "ch", the
// declarations of "ch"/"pair" and the non-surrogate path are not visible.
270 public override int GetBytes (String s, int charIndex, int charCount,
271 byte[] bytes, int byteIndex)
273 // Validate the parameters.
275 throw new ArgumentNullException ("s");
278 throw new ArgumentNullException ("bytes");
// charIndex == s.Length is accepted (empty range at the end of the string).
280 if (charIndex < 0 || charIndex > s.Length) {
281 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
// Subtraction form avoids overflow of charIndex + charCount.
283 if (charCount < 0 || charCount > (s.Length - charIndex)) {
284 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
286 if (byteIndex < 0 || byteIndex > bytes.Length) {
287 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
290 // Convert the characters into bytes.
// "length" is the exclusive upper bound for every write into "bytes".
292 int length = bytes.Length;
// "posn" is the next write position in "bytes".
294 int posn = byteIndex;
295 while (charCount > 0) {
296 // Fetch the next UTF-16 character pair value.
// A high surrogate is only examined as a pair start when another character
// remains in the requested range (charCount > 1).
299 if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
300 // This may be the start of a surrogate pair.
// Peek at the following character - presumably charIndex already points
// past "ch" here; confirm against the full source.
301 pair = (uint)(s[charIndex]);
// Only a low surrogate completes the pair.
302 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
// Combine low and high halves; the last term of the sum is on a line not
// visible here (presumably the 0x10000 offset - confirm).
303 pair = (pair - (uint)0xDC00) +
304 ((((uint)ch) - (uint)0xD800) << 10) +
315 // Encode the character pair value.
// Below 0x80: one byte.
316 if (pair < (uint)0x0080) {
317 if (posn >= length) {
318 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
320 bytes[posn++] = (byte)pair;
// Below 0x800: two bytes.
321 } else if (pair < (uint)0x0800) {
322 if ((posn + 2) > length) {
323 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
325 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
326 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Below 0x10000: three bytes.
327 } else if (pair < (uint)0x10000) {
328 if ((posn + 3) > length) {
329 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
331 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
332 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
333 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Remaining values: four bytes.
335 if ((posn + 4) > length) {
336 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
338 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
339 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
340 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
341 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
345 // Return the final count to the caller.
346 return posn - byteIndex;
349 // Internal version of "GetCharCount" which can handle a rolling
350 // state between multiple calls to this method.
//
// Counts the UTF-16 characters that decoding bytes[index .. index+count)
// would produce, without writing any output.
//
// The carried-in state is read from "leftOverBits" (payload bits gathered so
// far) and "leftOverCount" (low nibble: continuation bytes seen so far;
// next nibble: expected sequence size). Their declarations are on signature
// lines that are not visible in this listing.
// When "throwOnInvalid" is true, malformed input raises ArgumentException
// ("Arg_InvalidUTF8"); the non-throwing handling is not visible here.
//
// NOTE(review): the embedded line numbers skip; the loop header, the count
// updates and the assignments to leftSoFar/leftSize are not visible.
351 private static int InternalGetCharCount (byte[] bytes, int index, int count,
354 bool throwOnInvalid, bool flush)
356 // Validate the parameters.
358 throw new ArgumentNullException ("bytes");
// index == bytes.Length is accepted (empty range at the end).
360 if (index < 0 || index > bytes.Length) {
361 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
// Subtraction form avoids overflow of index + count.
363 if (count < 0 || count > (bytes.Length - index)) {
364 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
367 // Determine the number of characters that we have.
370 uint leftBits = leftOverBits;
// Unpack the two 4-bit counters from leftOverCount.
371 uint leftSoFar = (leftOverCount & (uint)0x0F);
372 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
374 ch = (uint)(bytes[index++]);
377 // Process a UTF-8 start character.
378 if (ch < (uint)0x0080) {
379 // Single-byte UTF-8 character.
// Lead byte 110xxxxx: five payload bits.
381 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
382 // Double-byte UTF-8 character.
383 leftBits = (ch & (uint)0x1F);
// Lead byte 1110xxxx: four payload bits.
386 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
387 // Three-byte UTF-8 character.
388 leftBits = (ch & (uint)0x0F);
// Lead byte 11110xxx: three payload bits.
391 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
392 // Four-byte UTF-8 character.
393 leftBits = (ch & (uint)0x07);
// Lead byte 111110xx: two payload bits (legacy five-byte form).
396 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
397 // Five-byte UTF-8 character.
398 leftBits = (ch & (uint)0x03);
// NOTE(review): this test matches 0xFC..0xFF, so 0xFE and 0xFF are treated
// as six-byte lead bytes; the legacy six-byte lead pattern is 1111110x,
// i.e. mask 0xFE with one payload bit (0x01). Confirm whether accepting
// 0xFE/0xFF here is intended.
401 } else if ((ch & (uint)0xFC) == (uint)0xFC) {
402 // Six-byte UTF-8 character.
403 leftBits = (ch & (uint)0x03);
407 // Invalid UTF-8 start character.
408 if (throwOnInvalid) {
409 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
413 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx; append their six payload bits.
414 if ((ch & (uint)0xC0) == (uint)0x80) {
415 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
416 if (++leftSoFar >= leftSize) {
417 // We have a complete character now.
// Value fits in a single UTF-16 code unit.
418 if (leftBits < (uint)0x10000) {
// U+FEFF is singled out here; the statement guarded by this test is not
// visible (InternalGetChars skips storing U+FEFF at the same spot).
419 if (leftBits != (uint)0xFEFF) {
// Supplementary-plane value (InternalGetChars emits two code units here).
422 } else if (leftBits < (uint)0x110000) {
// Values of 0x110000 and above are beyond the Unicode range.
424 } else if (throwOnInvalid) {
425 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
430 // Invalid UTF-8 sequence: clear and restart.
431 if (throwOnInvalid) {
432 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// A flushing call with an unfinished sequence is an error only in
// throwing mode.
440 if (flush && leftSize != 0 && throwOnInvalid) {
441 // We had left-over bytes that didn't make up
442 // a complete UTF-8 character sequence.
443 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
446 // Return the final length to the caller.
450 // Get the number of characters needed to decode a byte buffer.
// Stateless entry point: delegates to InternalGetCharCount with no carried-in
// state (both left-over arguments zero), this instance's throwOnInvalid
// setting, and flush = true. Validation and exceptions come from
// InternalGetCharCount.
451 public override int GetCharCount (byte[] bytes, int index, int count)
453 return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
456 // Internal version of "GetChars" which can handle a rolling state
// between multiple calls to this method.
//
// Decodes bytes[byteIndex .. byteIndex+byteCount) into "chars" starting at
// charIndex and returns the number of characters written (posn - charIndex).
//
// Parameters:
//   bytes, byteIndex, byteCount - source range; validated below.
//   chars, charIndex            - destination buffer and start offset.
//   leftOverBits (ref)          - payload bits of an unfinished sequence.
//   leftOverCount (ref)         - packed counters: low nibble = continuation
//                                 bytes seen so far, next nibble = expected
//                                 sequence size.
//   throwOnInvalid              - when true, malformed input raises
//                                 ArgumentException ("Arg_InvalidUTF8").
//   flush                       - when true, an unfinished sequence at the
//                                 end is reported (in throwing mode only).
// Also throws ArgumentNullException, ArgumentOutOfRangeException, and
// ArgumentException ("Arg_InsufficientSpace") when "chars" is too small.
//
// NOTE(review): the embedded line numbers skip; the declaration of "ch",
// the assignments to leftSoFar/leftSize and the non-throwing recovery path
// are not visible in this listing.
457 private static int InternalGetChars (byte[] bytes, int byteIndex,
458 int byteCount, char[] chars,
459 int charIndex, ref uint leftOverBits,
460 ref uint leftOverCount,
461 bool throwOnInvalid, bool flush)
463 // Validate the parameters.
465 throw new ArgumentNullException ("bytes");
468 throw new ArgumentNullException ("chars");
// byteIndex == bytes.Length is accepted (empty range at the end).
470 if (byteIndex < 0 || byteIndex > bytes.Length) {
471 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// Subtraction form avoids overflow of byteIndex + byteCount.
473 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
474 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
476 if (charIndex < 0 || charIndex > chars.Length) {
477 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
480 // Convert the bytes into the output buffer.
// "length" is the exclusive upper bound for every write into "chars".
482 int length = chars.Length;
// "posn" is the next write position in "chars".
483 int posn = charIndex;
484 uint leftBits = leftOverBits;
// Unpack the two 4-bit counters from leftOverCount.
485 uint leftSoFar = (leftOverCount & (uint)0x0F);
486 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
487 while (byteCount > 0) {
488 // Fetch the next character from the byte buffer.
489 ch = (uint)(bytes[byteIndex++]);
492 // Process a UTF-8 start character.
493 if (ch < (uint)0x0080) {
494 // Single-byte UTF-8 character.
495 if (posn >= length) {
496 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
498 chars[posn++] = (char)ch;
// Lead byte 110xxxxx: five payload bits.
499 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
500 // Double-byte UTF-8 character.
501 leftBits = (ch & (uint)0x1F);
// Lead byte 1110xxxx: four payload bits.
504 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
505 // Three-byte UTF-8 character.
506 leftBits = (ch & (uint)0x0F);
// Lead byte 11110xxx: three payload bits.
509 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
510 // Four-byte UTF-8 character.
511 leftBits = (ch & (uint)0x07);
// Lead byte 111110xx: two payload bits (legacy five-byte form).
514 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
515 // Five-byte UTF-8 character.
516 leftBits = (ch & (uint)0x03);
// NOTE(review): this test matches 0xFC..0xFF, so 0xFE and 0xFF are treated
// as six-byte lead bytes; the legacy six-byte lead pattern is 1111110x,
// i.e. mask 0xFE with one payload bit (0x01). Confirm whether accepting
// 0xFE/0xFF here is intended.
519 } else if ((ch & (uint)0xFC) == (uint)0xFC) {
520 // Six-byte UTF-8 character.
521 leftBits = (ch & (uint)0x03);
525 // Invalid UTF-8 start character.
526 if (throwOnInvalid) {
527 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
531 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx; append their six payload bits.
532 if ((ch & (uint)0xC0) == (uint)0x80) {
533 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
534 if (++leftSoFar >= leftSize) {
535 // We have a complete character now.
// Value fits in a single UTF-16 code unit.
536 if (leftBits < (uint)0x10000) {
// U+FEFF is not stored. NOTE(review): as far as the visible lines show,
// this applies wherever U+FEFF occurs in the input, not only at the start,
// so an embedded ZERO WIDTH NO-BREAK SPACE would be dropped - confirm.
537 if (leftBits != (uint)0xFEFF) {
538 if (posn >= length) {
539 throw new ArgumentException
540 (_("Arg_InsufficientSpace"), "chars");
542 chars[posn++] = (char)leftBits;
// Supplementary-plane value: needs room for two UTF-16 code units.
544 } else if (leftBits < (uint)0x110000) {
545 if ((posn + 2) > length) {
546 throw new ArgumentException
547 (_("Arg_InsufficientSpace"), "chars");
// Remove the 0x10000 offset, then split into a surrogate pair: the upper
// bits form the first unit (its base constant is on a line not visible
// here) and the low ten bits plus 0xDC00 form the second.
549 leftBits -= (uint)0x10000;
550 chars[posn++] = (char)((leftBits >> 10) +
553 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
// Values of 0x110000 and above are beyond the Unicode range.
554 } else if (throwOnInvalid) {
555 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
560 // Invalid UTF-8 sequence: clear and restart.
561 if (throwOnInvalid) {
562 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// A flushing call with an unfinished sequence is an error only in
// throwing mode.
570 if (flush && leftSize != 0 && throwOnInvalid) {
571 // We had left-over bytes that didn't make up
572 // a complete UTF-8 character sequence.
573 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// Hand the decoder state back to the caller, re-packing the two counters
// into one value (size in bits 4-7, progress in bits 0-3).
575 leftOverBits = leftBits;
576 leftOverCount = (leftSoFar | (leftSize << 4));
578 // Return the final length to the caller.
579 return posn - charIndex;
582 // Get the characters that result from decoding a byte buffer.
// Stateless entry point: starts from zeroed decoder state held in locals
// (so nothing carries over to a later call), then delegates to
// InternalGetChars with this instance's throwOnInvalid setting and
// flush = true. Returns the number of characters written; validation and
// exceptions come from InternalGetChars.
583 public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
584 char[] chars, int charIndex)
586 uint leftOverBits = 0;
587 uint leftOverCount = 0;
588 return InternalGetChars (bytes, byteIndex, byteCount, chars,
589 charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
592 // Get the maximum number of bytes needed to encode a
593 // specified number of characters.
// Returns four bytes per character as the upper bound. Throws
// ArgumentOutOfRangeException for "charCount" (the guarding condition is on
// a line not visible in this listing; the message key indicates a
// non-negative check).
// NOTE(review): charCount * 4 can exceed Int32.MaxValue for large inputs and
// no overflow check is visible here - confirm whether the multiplication
// runs in a checked context or needs an explicit guard.
594 public override int GetMaxByteCount (int charCount)
597 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
599 return charCount * 4;
602 // Get the maximum number of characters needed to decode a
603 // specified number of bytes.
// Throws ArgumentOutOfRangeException for "byteCount" (the guarding condition
// is on a line not visible in this listing; the message key indicates a
// non-negative check).
// NOTE(review): the return statement is not visible here, so the bound
// actually returned must be confirmed against the full source.
604 public override int GetMaxCharCount (int byteCount)
607 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
612 // Get a UTF8-specific decoder that is attached to this instance.
// Each call creates a new UTF8Decoder, so decoder state is per returned
// object. Only the throwOnInvalid setting is copied from this instance.
613 public override Decoder GetDecoder ()
615 return new UTF8Decoder (throwOnInvalid);
618 // Get a UTF8-specific encoder that is attached to this instance.
// Each call creates a new UTF8Encoder, so encoder state is per returned
// object. Only the emitIdentifier setting is copied from this instance.
619 public override Encoder GetEncoder ()
621 return new UTF8Encoder (emitIdentifier);
624 // Get the UTF8 preamble.
// When the instance was constructed with the identifier flag set, a new
// three-byte array is allocated for the preamble.
// NOTE(review): the statements that fill the array, and the value returned
// when emitIdentifier is false, are not visible in this listing.
625 public override byte[] GetPreamble ()
627 if (emitIdentifier) {
628 byte[] pre = new byte [3];
638 // Determine if this object is equal to another.
// Two UTF8Encoding objects compare equal when their code page and both
// construction flags match.
// NOTE(review): the guard on the result of the "as" cast is not visible in
// this listing; "enc" is null for a null or non-UTF8Encoding argument, so
// confirm that a null check precedes the comparison below.
639 public override bool Equals (Object value)
// "as" yields null instead of throwing when value is not a UTF8Encoding.
641 UTF8Encoding enc = (value as UTF8Encoding);
643 return (codePage == enc.codePage &&
644 emitIdentifier == enc.emitIdentifier &&
645 throwOnInvalid == enc.throwOnInvalid);
651 // Get the hash code for this object.
// Defers entirely to the base class; the two flags compared by Equals do not
// contribute to the hash.
// NOTE(review): this stays consistent with Equals only if the base hash is
// the same for instances that Equals treats as equal (presumably derived
// from the code page - confirm in Encoding.GetHashCode).
652 public override int GetHashCode ()
654 return base.GetHashCode ();
659 // Get the mail body name for this encoding.
// NOTE(review): the getter body is not visible in this listing; the value
// returned must be confirmed against the full source.
660 public override String BodyName
667 // Get the human-readable name for this encoding.
// Returns a fixed display string.
668 public override String EncodingName
671 return "Unicode (UTF-8)";
// NOTE(review): only the declarations of the following six properties are
// visible in this listing; their getter bodies (and therefore the values
// they return) must be confirmed against the full source.
675 // Get the mail agent header name for this encoding.
676 public override String HeaderName
683 // Determine if this encoding can be displayed in a Web browser.
684 public override bool IsBrowserDisplay
691 // Determine if this encoding can be saved from a Web browser.
692 public override bool IsBrowserSave
699 // Determine if this encoding can be displayed in a mail/news agent.
700 public override bool IsMailNewsDisplay
707 // Determine if this encoding can be saved from a mail/news agent.
708 public override bool IsMailNewsSave
715 // Get the IANA-preferred Web name for this encoding.
716 public override String WebName
723 // Get the Windows code page represented by this object.
// Returns the code page constant declared by UnicodeEncoding, not this
// class's own UTF8_CODE_PAGE.
// NOTE(review): presumably deliberate, to report the closest Windows code
// page the way the reference platform does - confirm before changing.
724 public override int WindowsCodePage
727 return UnicodeEncoding.UNICODE_CODE_PAGE;
// Closes a conditional-compilation region whose opening directive is not
// visible in this listing.
731 #endif // !ECMA_COMPAT
733 // UTF-8 decoder implementation.
// Stateful decoder returned by GetDecoder: it remembers an unfinished
// multi-byte sequence between calls so input may be supplied in pieces.
735 private sealed class UTF8Decoder : Decoder
// Whether malformed input raises an exception; fixed at construction.
737 private bool throwOnInvalid;
// Payload bits of the sequence in progress (see InternalGetChars).
738 private uint leftOverBits;
// Packed progress/size counters of the sequence in progress
// (see InternalGetChars).
739 private uint leftOverCount;
// NOTE(review): any explicit initialisation of the two left-over fields is
// on lines not visible in this listing.
742 public UTF8Decoder (bool throwOnInvalid)
744 this.throwOnInvalid = throwOnInvalid;
749 // Override inherited methods.
// Counting passes the current state by value, so it never modifies the
// decoder; flush is false because more input may follow.
750 public override int GetCharCount (byte[] bytes, int index, int count)
752 return InternalGetCharCount (bytes, index, count,
753 leftOverBits, leftOverCount, throwOnInvalid, false);
// Decoding passes the state by reference, so an unfinished sequence at the
// end of this buffer is carried into the next call; flush is false.
755 public override int GetChars (byte[] bytes, int byteIndex,
756 int byteCount, char[] chars, int charIndex)
758 return InternalGetChars (bytes, byteIndex, byteCount,
759 chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
762 } // class UTF8Decoder
764 // UTF-8 encoder implementation.
// Stateful encoder returned by GetEncoder: it carries encoding state between
// calls in "leftOver" so input may be supplied in pieces.
766 private sealed class UTF8Encoder : Encoder
// Copied from the owning encoding at construction; cleared by GetBytes.
768 private bool emitIdentifier;
// State handed to InternalGetByteCount / InternalGetBytes between calls.
769 private uint leftOver;
// NOTE(review): any explicit initialisation of "leftOver" is on lines not
// visible in this listing.
772 public UTF8Encoder (bool emitIdentifier)
774 this.emitIdentifier = emitIdentifier;
778 // Override inherited methods.
// Counting passes the current state by value, so it never modifies the
// encoder; the caller's flush flag is forwarded unchanged.
779 public override int GetByteCount (char[] chars, int index,
780 int count, bool flush)
782 return InternalGetByteCount (chars, index, count, leftOver, flush);
// NOTE(review): the parameter named "byteCount" is forwarded in the position
// that InternalGetBytes declares as "byteIndex", i.e. it is used as the
// start offset in "bytes", not as a length.
// NOTE(review): the declaration of "result" and the return statement are
// not visible in this listing.
784 public override int GetBytes (char[] chars, int charIndex,
785 int charCount, byte[] bytes, int byteCount, bool flush)
// State is passed by reference so it survives into the next call.
788 result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
// The identifier flag is cleared after every GetBytes call.
789 emitIdentifier = false;
793 } // class UTF8Encoder
795 }; // class UTF8Encoding
797 }; // namespace System.Text