2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
32 [MonoTODO ("Fix serialization compatibility with MS.NET")]
33 public class UTF8Encoding : Encoding
35 // Magic number used by Windows for UTF-8.
// This value is handed to the base Encoding constructor by the
// two-argument UTF8Encoding constructor.
36 internal const int UTF8_CODE_PAGE = 65001;
// Consulted by GetPreamble and passed to the UTF8Encoder that
// GetEncoder creates.
39 private bool emitIdentifier;
// Forwarded to the decoding helpers (InternalGetCharCount /
// InternalGetChars) and to UTF8Decoder; when set, those helpers throw
// ArgumentException on malformed input.
40 private bool throwOnInvalid;
// Convenience constructors. Both delegate to the two-argument
// constructor, passing false for every flag the caller did not supply.
43 public UTF8Encoding () : this (false, false) {}
44 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
45 : this (encoderShouldEmitUTF8Identifier, false) {}
// Primary constructor.
//   encoderShouldEmitUTF8Identifier - stored in emitIdentifier.
//   throwOnInvalidBytes             - stored in throwOnInvalid.
// The base class is constructed with UTF8_CODE_PAGE.
47 public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
48 : base (UTF8_CODE_PAGE)
50 emitIdentifier = encoderShouldEmitUTF8Identifier;
51 throwOnInvalid = throwOnInvalidBytes;
// Descriptive names and display flags. These members are not declared
// in this class; presumably they are inherited from Encoding - confirm.
53 web_name = body_name = header_name = "utf-8";
54 encoding_name = "Unicode (UTF-8)";
55 is_browser_save = true;
56 is_browser_display = true;
57 is_mail_news_display = true;
// NOTE(review): the Windows code page is taken from UnicodeEncoding
// rather than from UTF8_CODE_PAGE; looks deliberate - confirm.
58 windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
61 // Internal version of "GetByteCount" which can handle a rolling
62 // state between multiple calls to this method.
//
// Parameters:
//   chars, index, count - the character range to measure.
//   leftOver - pending state carried in from an earlier call;
//              GetByteCount passes 0, UTF8Encoder passes its own field.
//   flush    - when true, a surrogate start still pending after the
//              last character is handled as well (see the end of the
//              method).
// Throws ArgumentNullException ("chars") and
// ArgumentOutOfRangeException ("index" / "count") for bad arguments.
63 private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
65 // Validate the parameters.
// NOTE(review): the condition guarding this throw is not shown here;
// presumably a null check on chars - confirm.
67 throw new ArgumentNullException ("chars");
// index may equal chars.Length (an empty range at the very end).
69 if (index < 0 || index > chars.Length) {
70 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
// Written as a subtraction so that index + count cannot overflow.
72 if (count < 0 || count > (chars.Length - index)) {
73 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
76 // Determine the lengths of all characters.
// The branches below classify each UTF-16 code unit: below U+0800,
// high surrogate (U+D800..U+DBFF), low surrogate (U+DC00..U+DFFF).
// The statements that add to the running length are not shown here.
85 } else if (ch < '\u0800') {
87 } else if (ch >= '\uD800' && ch <= '\uDBFF') {
88 // This is the start of a surrogate pair.
93 } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
94 // We have a surrogate pair.
98 // We have a surrogate start followed by a
99 // regular character. Technically, this is
100 // invalid, but we have to do something.
101 // We write out the surrogate start and then
102 // re-visit the current character again.
110 if (flush && pair != 0) {
111 // Flush the left-over surrogate pair start.
115 // Return the final length to the caller.
119 // Get the number of bytes needed to encode a character buffer.
// Stateless wrapper: calls InternalGetByteCount with no carried-over
// state (leftOver = 0) and flush = true. Argument validation and the
// resulting exceptions come from InternalGetByteCount.
120 public override int GetByteCount (char[] chars, int index, int count)
122 return InternalGetByteCount (chars, index, count, 0, true);
125 // Convenience wrappers for "GetByteCount".
// Measures a whole string directly instead of going through
// InternalGetByteCount. Throws ArgumentNullException ("s").
126 public override int GetByteCount (String s)
128 // Validate the parameters.
// NOTE(review): the guard for this throw is not shown here;
// presumably a null check on s - confirm.
130 throw new ArgumentNullException ("s");
133 // Determine the lengths of all characters.
136 int count = s.Length;
143 } else if (ch < '\u0800') {
// A high surrogate is only examined as a pair start when count > 1,
// i.e. presumably when at least one more character remains.
145 } else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) {
146 // This may be the start of a surrogate pair.
// Peek at the following code unit; it completes the pair only when it
// lies in the low-surrogate range 0xDC00..0xDFFF.
147 pair = (uint)(s[index]);
148 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
161 // Return the final length to the caller.
165 // Internal version of "GetBytes" which can handle a rolling
166 // state between multiple calls to this method.
//
// Parameters:
//   chars, charIndex, charCount - the character range to encode.
//   bytes, byteIndex            - destination buffer and first write
//                                 position.
//   leftOver - by-ref surrogate state; read into "left" at the start.
//              NOTE(review): the write-back at the end is not shown
//              here - confirm.
//   The final parameter is not shown here; callers pass a boolean
//   flush as the seventh argument and the body reads "flush".
// Returns the number of bytes written (posn - byteIndex).
// Throws ArgumentNullException, ArgumentOutOfRangeException, and
// ArgumentException ("bytes") when the destination is too small.
167 private static int InternalGetBytes (char[] chars, int charIndex,
168 int charCount, byte[] bytes,
169 int byteIndex, ref uint leftOver,
172 // Validate the parameters.
// NOTE(review): the guards for the two null throws are not shown
// here; presumably null checks on chars and bytes - confirm.
174 throw new ArgumentNullException ("chars");
177 throw new ArgumentNullException ("bytes");
179 if (charIndex < 0 || charIndex > chars.Length) {
180 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
182 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
183 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
185 if (byteIndex < 0 || byteIndex > bytes.Length) {
186 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
189 // Convert the characters into bytes.
// "length" is the end of the whole destination array, so space checks
// are made against bytes.Length rather than a caller-supplied count.
191 int length = bytes.Length;
193 uint left = leftOver;
194 int posn = byteIndex;
195 while (charCount > 0) {
196 // Fetch the next UTF-16 character pair value.
197 ch = chars[charIndex++];
200 if (ch >= '\uD800' && ch <= '\uDBFF') {
201 // This is the start of a surrogate pair.
205 // This is a regular character.
208 } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
209 // We have a surrogate pair.
// Combine the pending high surrogate (left) with this low surrogate.
// The last term of the sum is not shown here; presumably the
// 0x10000 supplementary-plane offset - confirm.
210 pair = ((left - (uint)0xD800) << 10) +
211 (((uint)ch) - (uint)0xDC00) +
215 // We have a surrogate start followed by a
216 // regular character. Technically, this is
217 // invalid, but we have to do something.
218 // We write out the surrogate start and then
219 // re-visit the current character again.
226 // Encode the character pair value.
// One byte for values below 0x80.
227 if (pair < (uint)0x0080) {
228 if (posn >= length) {
229 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
231 bytes[posn++] = (byte)pair;
// Two bytes (110xxxxx 10xxxxxx) for values below 0x800.
232 } else if (pair < (uint)0x0800) {
233 if ((posn + 2) > length) {
234 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
236 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
237 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Three bytes (1110xxxx 10xxxxxx 10xxxxxx) for values below 0x10000.
238 } else if (pair < (uint)0x10000) {
239 if ((posn + 3) > length) {
240 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
242 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
243 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
244 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Otherwise four bytes (11110xxx followed by three 10xxxxxx bytes).
246 if ((posn + 4) > length) {
247 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
249 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
250 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
251 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
252 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// On flush, a still-pending surrogate start is written out on its own
// as a three-byte sequence.
255 if (flush && left != 0) {
256 // Flush the left-over surrogate pair start.
257 if ((posn + 3) > length) {
258 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
260 bytes[posn++] = (byte)(0xE0 | (left >> 12));
261 bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
262 bytes[posn++] = (byte)(0x80 | (left & 0x3F));
267 // Return the final count to the caller.
268 return posn - byteIndex;
271 // Get the bytes that result from encoding a character buffer.
// Stateless wrapper around InternalGetBytes with flush = true; returns
// the number of bytes written. Validation and exceptions come from
// InternalGetBytes.
272 public override int GetBytes (char[] chars, int charIndex, int charCount,
273 byte[] bytes, int byteIndex)
// NOTE(review): "leftOver" is not declared in the lines shown here;
// presumably a local initialised to 0 just before this call - confirm.
276 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
279 // Convenience wrappers for "GetBytes".
// Encodes a range of a string straight into "bytes" without going
// through InternalGetBytes. Returns the number of bytes written
// (posn - byteIndex).
// Throws ArgumentNullException, ArgumentOutOfRangeException, and
// ArgumentException ("bytes") when the destination is too small.
280 public override int GetBytes (String s, int charIndex, int charCount,
281 byte[] bytes, int byteIndex)
283 // Validate the parameters.
// NOTE(review): the guards for the two null throws are not shown
// here; presumably null checks on s and bytes - confirm.
285 throw new ArgumentNullException ("s");
288 throw new ArgumentNullException ("bytes");
290 if (charIndex < 0 || charIndex > s.Length) {
291 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
293 if (charCount < 0 || charCount > (s.Length - charIndex)) {
294 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
296 if (byteIndex < 0 || byteIndex > bytes.Length) {
297 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
300 // Convert the characters into bytes.
// Space checks below are made against the end of the whole array.
302 int length = bytes.Length;
304 int posn = byteIndex;
305 while (charCount > 0) {
306 // Fetch the next UTF-16 character pair value.
// A high surrogate is only examined as a pair start when charCount > 1,
// i.e. presumably when at least one more character remains.
308 if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
309 // This may be the start of a surrogate pair.
310 pair = (uint)(s[charIndex]);
311 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
// Combine low and high surrogate. The last term of the sum is not
// shown here; presumably the 0x10000 offset - confirm.
312 pair = (pair - (uint)0xDC00) +
313 ((((uint)ch) - (uint)0xD800) << 10) +
325 // Encode the character pair value.
// One byte for values below 0x80.
326 if (pair < (uint)0x0080) {
327 if (posn >= length) {
328 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
330 bytes[posn++] = (byte)pair;
// Two bytes for values below 0x800.
331 } else if (pair < (uint)0x0800) {
332 if ((posn + 2) > length) {
333 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
335 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
336 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Three bytes for values below 0x10000.
337 } else if (pair < (uint)0x10000) {
338 if ((posn + 3) > length) {
339 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
341 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
342 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
343 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Otherwise four bytes.
345 if ((posn + 4) > length) {
346 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
348 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
349 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
350 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
351 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
355 // Return the final count to the caller.
356 return posn - byteIndex;
359 // Internal version of "GetCharCount" which can handle a rolling
360 // state between multiple calls to this method.
//
// Parameters:
//   bytes, index, count - the byte range to examine.
//   Two state parameters are not shown in the signature here; callers
//   pass leftOverBits and leftOverCount in that position, and the body
//   reads both.
//   throwOnInvalid - when true, malformed input raises
//                    ArgumentException instead of being skipped.
//   flush          - when true, an unfinished trailing sequence is
//                    treated as an error (only if throwOnInvalid).
// Throws ArgumentNullException ("bytes") and
// ArgumentOutOfRangeException ("index" / "count") for bad arguments.
361 private static int InternalGetCharCount (byte[] bytes, int index, int count,
364 bool throwOnInvalid, bool flush)
366 // Validate the parameters.
// NOTE(review): the guard for this throw is not shown here;
// presumably a null check on bytes - confirm.
368 throw new ArgumentNullException ("bytes");
370 if (index < 0 || index > bytes.Length) {
371 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
373 if (count < 0 || count > (bytes.Length - index)) {
374 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
377 // Determine the number of characters that we have.
// Unpack the carried state: leftOverCount holds the number of bytes
// consumed so far in its low 4 bits and the expected sequence size in
// the next 4 bits.
380 uint leftBits = leftOverBits;
381 uint leftSoFar = (leftOverCount & (uint)0x0F);
382 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
384 ch = (uint)(bytes[index++]);
387 // Process a UTF-8 start character.
388 if (ch < (uint)0x0080) {
389 // Single-byte UTF-8 character.
// Lead byte 110xxxxx: keep the 5 payload bits.
391 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
392 // Double-byte UTF-8 character.
393 leftBits = (ch & (uint)0x1F);
// Lead byte 1110xxxx: keep the 4 payload bits.
396 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
397 // Three-byte UTF-8 character.
398 leftBits = (ch & (uint)0x0F);
// Lead byte 11110xxx: keep the 3 payload bits.
401 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
402 // Four-byte UTF-8 character.
403 leftBits = (ch & (uint)0x07);
// Lead byte 111110xx: keep the 2 payload bits.
406 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
407 // Five-byte UTF-8 character.
408 leftBits = (ch & (uint)0x03);
// NOTE(review): this test is true for every byte from 0xFC to 0xFF,
// so 0xFE and 0xFF are also taken as six-byte lead bytes, and the
// 0x03 mask keeps two bits. A 1111110x lead byte would correspond to
// mask 0xFE and payload mask 0x01 - confirm whether that was intended.
411 } else if ((ch & (uint)0xFC) == (uint)0xFC) {
412 // Six-byte UTF-8 character.
413 leftBits = (ch & (uint)0x03);
417 // Invalid UTF-8 start character.
418 if (throwOnInvalid) {
419 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
423 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx; append their 6 payload
// bits to the accumulated value.
424 if ((ch & (uint)0xC0) == (uint)0x80) {
425 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
426 if (++leftSoFar >= leftSize) {
427 // We have a complete character now.
428 if (leftBits < (uint)0x10000) {
// NOTE(review): 0xFEFF is singled out here; what happens in the other
// case is not shown - presumably BOM handling, confirm.
429 if (leftBits != (uint)0xFEFF) {
430 // is it an overlong ?
431 bool overlong = false;
// Each assignment below compares the decoded value with a per-length
// limit; the selector that picks the sequence length is not shown.
434 overlong = (leftBits <= 0x7F);
437 overlong = (leftBits <= 0x07FF);
440 overlong = (leftBits <= 0xFFFF);
443 overlong = (leftBits <= 0x1FFFFF);
446 overlong = (leftBits <= 0x03FFFFFF);
// NOTE(review): the second ArgumentException argument is the parameter
// name; here it carries the decoded value instead of "bytes". The
// guard for this throw is not shown here.
451 throw new ArgumentException (_("Overlong"), leftBits.ToString ());
// Values from 0x10000 up to 0x10FFFF; InternalGetChars emits two
// chars for this range, so presumably two are counted here - confirm.
456 } else if (leftBits < (uint)0x110000) {
458 } else if (throwOnInvalid) {
459 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
464 // Invalid UTF-8 sequence: clear and restart.
465 if (throwOnInvalid) {
466 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
474 if (flush && leftSize != 0 && throwOnInvalid) {
475 // We had left-over bytes that didn't make up
476 // a complete UTF-8 character sequence.
477 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
480 // Return the final length to the caller.
484 // Get the number of characters needed to decode a byte buffer.
// Stateless wrapper: calls InternalGetCharCount with no carried-over
// bits or count (0, 0), this instance's throwOnInvalid setting, and
// flush = true. Validation and exceptions come from
// InternalGetCharCount.
485 public override int GetCharCount (byte[] bytes, int index, int count)
487 return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
490 // Get the characters that result from decoding a byte buffer.
//
// Parameters:
//   bytes, byteIndex, byteCount - the byte range to decode.
//   chars, charIndex            - destination buffer and first write
//                                 position.
//   leftOverBits  - by-ref: payload bits accumulated so far for an
//                   unfinished sequence.
//   leftOverCount - by-ref: bytes consumed so far (low 4 bits) and
//                   expected sequence size (next 4 bits).
//   throwOnInvalid - when true, malformed input raises
//                    ArgumentException instead of being skipped.
//   flush          - when true, an unfinished trailing sequence is
//                    treated as an error (only if throwOnInvalid).
// Returns the number of chars written (posn - charIndex).
// Throws ArgumentNullException, ArgumentOutOfRangeException, and
// ArgumentException ("chars") when the destination is too small.
491 private static int InternalGetChars (byte[] bytes, int byteIndex,
492 int byteCount, char[] chars,
493 int charIndex, ref uint leftOverBits,
494 ref uint leftOverCount,
495 bool throwOnInvalid, bool flush)
497 // Validate the parameters.
// NOTE(review): the guards for the two null throws are not shown
// here; presumably null checks on bytes and chars - confirm.
499 throw new ArgumentNullException ("bytes");
502 throw new ArgumentNullException ("chars");
504 if (byteIndex < 0 || byteIndex > bytes.Length) {
505 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
507 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
508 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
510 if (charIndex < 0 || charIndex > chars.Length) {
511 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// NOTE(review): the statement controlled by this test is not shown
// here; presumably an early return when there is no room - confirm.
514 if (charIndex == chars.Length)
517 // Convert the bytes into the output buffer.
// Space checks below are made against the end of the whole array.
519 int length = chars.Length;
520 int posn = charIndex;
// Unpack the carried decoder state.
521 uint leftBits = leftOverBits;
522 uint leftSoFar = (leftOverCount & (uint)0x0F);
523 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
524 while (byteCount > 0) {
525 // Fetch the next character from the byte buffer.
526 ch = (uint)(bytes[byteIndex++]);
529 // Process a UTF-8 start character.
530 if (ch < (uint)0x0080) {
531 // Single-byte UTF-8 character.
532 if (posn >= length) {
533 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
535 chars[posn++] = (char)ch;
// Lead byte 110xxxxx: keep the 5 payload bits.
536 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
537 // Double-byte UTF-8 character.
538 leftBits = (ch & (uint)0x1F);
// Lead byte 1110xxxx: keep the 4 payload bits.
541 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
542 // Three-byte UTF-8 character.
543 leftBits = (ch & (uint)0x0F);
// Lead byte 11110xxx: keep the 3 payload bits.
546 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
547 // Four-byte UTF-8 character.
548 leftBits = (ch & (uint)0x07);
// Lead byte 111110xx: keep the 2 payload bits.
551 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
552 // Five-byte UTF-8 character.
553 leftBits = (ch & (uint)0x03);
// NOTE(review): this test is true for every byte from 0xFC to 0xFF,
// so 0xFE and 0xFF are also taken as six-byte lead bytes, and the
// 0x03 mask keeps two bits. A 1111110x lead byte would correspond to
// mask 0xFE and payload mask 0x01 - confirm whether that was intended.
556 } else if ((ch & (uint)0xFC) == (uint)0xFC) {
557 // Six-byte UTF-8 character.
558 leftBits = (ch & (uint)0x03);
562 // Invalid UTF-8 start character.
563 if (throwOnInvalid) {
564 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
568 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx; append their 6 payload
// bits to the accumulated value.
569 if ((ch & (uint)0xC0) == (uint)0x80) {
570 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
571 if (++leftSoFar >= leftSize) {
572 // We have a complete character now.
573 if (leftBits < (uint)0x10000) {
// NOTE(review): 0xFEFF is singled out here; what happens in the other
// case is not shown - presumably BOM handling, confirm.
574 if (leftBits != (uint)0xFEFF) {
575 // is it an overlong ?
576 bool overlong = false;
// Each assignment below compares the decoded value with a per-length
// limit; the selector that picks the sequence length is not shown.
579 overlong = (leftBits <= 0x7F);
582 overlong = (leftBits <= 0x07FF);
585 overlong = (leftBits <= 0xFFFF);
588 overlong = (leftBits <= 0x1FFFFF);
591 overlong = (leftBits <= 0x03FFFFFF);
// NOTE(review): the second ArgumentException argument is the parameter
// name; here it carries the decoded value instead of "bytes". The
// guard for this throw is not shown here.
596 throw new ArgumentException (_("Overlong"), leftBits.ToString ());
// Values below 0x10000 are stored as a single char.
599 if (posn >= length) {
600 throw new ArgumentException
601 (_("Arg_InsufficientSpace"), "chars");
603 chars[posn++] = (char)leftBits;
// Values from 0x10000 up to 0x10FFFF need two chars: subtract 0x10000,
// then the upper bits form the first char and the low 10 bits plus
// 0xDC00 form the second.
606 } else if (leftBits < (uint)0x110000) {
607 if ((posn + 2) > length) {
608 throw new ArgumentException
609 (_("Arg_InsufficientSpace"), "chars");
611 leftBits -= (uint)0x10000;
612 chars[posn++] = (char)((leftBits >> 10) +
615 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
616 } else if (throwOnInvalid) {
617 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
622 // Invalid UTF-8 sequence: clear and restart.
623 if (throwOnInvalid) {
624 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
632 if (flush && leftSize != 0 && throwOnInvalid) {
633 // We had left-over bytes that didn't make up
634 // a complete UTF-8 character sequence.
635 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// Hand the decoder state back to the caller, repacked in the same
// layout it was unpacked from.
637 leftOverBits = leftBits;
638 leftOverCount = (leftSoFar | (leftSize << 4));
640 // Return the final length to the caller.
641 return posn - charIndex;
644 // Get the characters that result from decoding a byte buffer.
// Stateless wrapper around InternalGetChars with flush = true; returns
// the number of chars written. Validation and exceptions come from
// InternalGetChars.
645 public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
646 char[] chars, int charIndex)
// Fresh, zeroed decoder state for every call; whatever the helper
// writes back into these locals is discarded.
648 uint leftOverBits = 0;
649 uint leftOverCount = 0;
650 return InternalGetChars (bytes, byteIndex, byteCount, chars,
651 charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
654 // Get the maximum number of bytes needed to encode a
655 // specified number of characters.
// Returns charCount * 4. Throws ArgumentOutOfRangeException
// ("charCount").
656 public override int GetMaxByteCount (int charCount)
// NOTE(review): the guard for this throw is not shown here; the
// message key suggests a check for a negative charCount - confirm.
659 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
// NOTE(review): charCount * 4 exceeds int.MaxValue for large inputs;
// whether that wraps or throws depends on the checked/unchecked
// compilation context.
661 return charCount * 4;
664 // Get the maximum number of characters needed to decode a
665 // specified number of bytes.
// Throws ArgumentOutOfRangeException ("byteCount").
// NOTE(review): the return statement is not shown here.
666 public override int GetMaxCharCount (int byteCount)
// NOTE(review): the guard for this throw is not shown here; the
// message key suggests a check for a negative byteCount - confirm.
669 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
674 // Get a UTF8-specific decoder that is attached to this instance.
// Each call creates a new UTF8Decoder configured with this instance's
// throwOnInvalid setting.
675 public override Decoder GetDecoder ()
677 return new UTF8Decoder (throwOnInvalid);
680 // Get a UTF8-specific encoder that is attached to this instance.
// Each call creates a new UTF8Encoder configured with this instance's
// emitIdentifier setting.
681 public override Encoder GetEncoder ()
683 return new UTF8Encoder (emitIdentifier);
686 // Get the UTF8 preamble.
// When emitIdentifier is set, a three-byte array is allocated.
// NOTE(review): the byte values, the return statements, and the
// branch taken when emitIdentifier is false are not shown here.
687 public override byte[] GetPreamble ()
689 if (emitIdentifier) {
690 byte[] pre = new byte [3];
700 // Determine if this object is equal to another.
// Two UTF8Encoding objects compare equal when their code page,
// emitIdentifier flag, and throwOnInvalid flag all match.
701 public override bool Equals (Object value)
// "as" yields null when value is null or not a UTF8Encoding.
// NOTE(review): the null test on enc and the fallback return are not
// shown here - confirm.
703 UTF8Encoding enc = (value as UTF8Encoding);
705 return (codePage == enc.codePage &&
706 emitIdentifier == enc.emitIdentifier &&
707 throwOnInvalid == enc.throwOnInvalid);
713 // Get the hash code for this object.
// Delegates to the base class; the emitIdentifier and throwOnInvalid
// flags that Equals compares are not mixed into the hash here.
714 public override int GetHashCode ()
716 return base.GetHashCode ();
// Encodes an entire string into a newly allocated byte array.
// Throws ArgumentNullException ("s").
719 public override byte [] GetBytes (String s)
// NOTE(review): the guard for this throw is not shown here;
// presumably a null check on s - confirm.
722 throw new ArgumentNullException ("s");
// Size the array with GetByteCount, then fill it from offset 0.
724 int length = GetByteCount (s);
725 byte [] bytes = new byte [length];
726 GetBytes (s, 0, s.Length, bytes, 0);
// NOTE(review): the return statement is not shown here.
730 // UTF-8 decoder implementation.
// Keeps the state of an unfinished multi-byte sequence in fields, so
// input can be supplied over several calls. Both methods pass
// flush = false to the shared helpers.
732 private class UTF8Decoder : Decoder
734 private bool throwOnInvalid;
// Carried state in the form the Internal* helpers expect: accumulated
// payload bits, and a packed count of bytes seen / sequence size.
735 private uint leftOverBits;
736 private uint leftOverCount;
739 public UTF8Decoder (bool throwOnInvalid)
741 this.throwOnInvalid = throwOnInvalid;
746 // Override inherited methods.
// Counting passes the state by value, so the fields are not modified
// by this call.
747 public override int GetCharCount (byte[] bytes, int index, int count)
749 return InternalGetCharCount (bytes, index, count,
750 leftOverBits, leftOverCount, throwOnInvalid, false);
// Decoding passes the state by reference, so the helper updates the
// fields for the next call.
752 public override int GetChars (byte[] bytes, int byteIndex,
753 int byteCount, char[] chars, int charIndex)
755 return InternalGetChars (bytes, byteIndex, byteCount,
756 chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
759 } // class UTF8Decoder
761 // UTF-8 encoder implementation.
// Keeps pending surrogate state in a field, so input can be supplied
// over several calls.
763 private class UTF8Encoder : Encoder
765 private bool emitIdentifier;
// State handed to the Internal* helpers between calls.
766 private uint leftOver;
769 public UTF8Encoder (bool emitIdentifier)
771 this.emitIdentifier = emitIdentifier;
775 // Override inherited methods.
// Counting passes leftOver by value, so the field is not modified.
776 public override int GetByteCount (char[] chars, int index,
777 int count, bool flush)
779 return InternalGetByteCount (chars, index, count, leftOver, flush);
// NOTE(review): the fifth parameter is named byteCount but is passed
// to InternalGetBytes in the byteIndex position, i.e. it is used as
// the starting offset in bytes.
781 public override int GetBytes (char[] chars, int charIndex,
782 int charCount, byte[] bytes, int byteCount, bool flush)
// leftOver is passed by reference so the helper can update it.
// The declaration of "result" and the return are not shown here.
785 result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
// The flag is cleared after every GetBytes call.
786 emitIdentifier = false;
790 } // class UTF8Encoder
792 }; // class UTF8Encoding
794 }; // namespace System.Text