2 * Mono.Unix/UnixEncoding.cs
5 * Jonathan Pryor (jonpryor@vt.edu)
7 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
8 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
9 * Copyright (C) 2005 Jonathan Pryor
11 * Permission is hereby granted, free of charge, to any person obtaining
12 * a copy of this software and associated documentation files (the "Software"),
13 * to deal in the Software without restriction, including without limitation
14 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 * and/or sell copies of the Software, and to permit persons to whom the
16 * Software is furnished to do so, subject to the following conditions:
18 * The above copyright notice and this permission notice shall be included
19 * in all copies or substantial portions of the Software.
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
25 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
26 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 * OTHER DEALINGS IN THE SOFTWARE.
// UnixEncoding derives from Encoding. The members visible in this listing
// encode chars into UTF-8-style byte sequences and decode bytes back to chars.
// NOTE(review): the numbers embedded at the start of each line are not
// consecutive, so braces and some statements are absent from this listing.
37 public class UnixEncoding : Encoding
// Shared instance, created once by the static field initializer.
39 public static readonly Encoding Instance = new UnixEncoding ();
// Marker char U+0000. The decoding code in this class writes it directly in
// front of each raw byte it could not decode, and the encoding code tests
// for it so that the char following it is emitted as a single byte.
41 public static readonly char EscapeByte = '\u0000';
// Parameterless constructor; its body is not visible in this listing.
44 public UnixEncoding ()
// Counts the bytes needed to encode `count` chars of `chars` starting at
// `index`. `leftOver` is encoder state handed in by the caller; `flush`
// decides whether a surrogate start still pending at the end is flushed.
// Throws ArgumentNullException ("chars") and ArgumentOutOfRangeException
// ("index" / "count") from the validation shown here.
// NOTE(review): the guarding null test, the loop header and the per-branch
// length increments sit on lines that are not part of this listing.
48 // Internal version of "GetByteCount" which can handle a rolling
49 // state between multiple calls to this method.
50 private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
52 // Validate the parameters.
54 throw new ArgumentNullException ("chars");
// index == chars.Length is accepted (an empty range at the very end).
56 if (index < 0 || index > chars.Length) {
57 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
// Written as a subtraction so that index + count cannot overflow.
59 if (count < 0 || count > (chars.Length - index)) {
60 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
63 // Determine the lengths of all characters.
// The escape marker is only special-cased while count > 1; presumably
// count is the number of chars still unread at this point - TODO confirm.
70 if (ch == EscapeByte && count > 1) {
// Range tests: below U+0080, below U+0800, then the high-surrogate range.
74 } else if (ch < '\u0080') {
76 } else if (ch < '\u0800') {
78 } else if (ch >= '\uD800' && ch <= '\uDBFF') {
79 // This is the start of a surrogate pair.
// Low-surrogate range; reached while a surrogate start is pending.
84 } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
85 // We have a surrogate pair.
89 // We have a surrogate start followed by a
90 // regular character. Technically, this is
91 // invalid, but we have to do something.
92 // We write out the surrogate start and then
93 // re-visit the current character again.
// `pair` non-zero means a surrogate start is still waiting for its partner.
101 if (flush && pair != 0) {
102 // Flush the left-over surrogate pair start.
106 // Return the final length to the caller.
// One-shot count: delegates to InternalGetByteCount with no carried-over
// state (leftOver = 0) and flush = true. Argument validation and its
// exceptions come from InternalGetByteCount.
110 // Get the number of bytes needed to encode a character buffer.
111 public override int GetByteCount (char[] chars, int index, int count)
113 return InternalGetByteCount (chars, index, count, 0, true);
// Counts the bytes needed to encode the whole string `s`.
// Throws ArgumentNullException ("s") from the validation shown here.
// Surrogate pairs are detected by peeking at the next char of `s`.
// NOTE(review): the loop header, the length increments and the index/count
// updates are on lines that are not part of this listing.
116 // Convenience wrappers for "GetByteCount".
117 public override int GetByteCount (String s)
119 // Validate the parameters.
121 throw new ArgumentNullException ("s");
124 // Determine the lengths of all characters.
127 int count = s.Length;
// The escape marker is only special-cased when count > 1; presumably count
// tracks the chars still unread - TODO confirm.
132 if (ch == EscapeByte && count > 1) {
136 } else if (ch < '\u0080') {
138 } else if (ch < '\u0800') {
// A high surrogate is only examined as a pair when count > 1.
140 } else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) {
141 // This may be the start of a surrogate pair.
142 pair = (uint)(s[index]);
// Low-surrogate range test on the peeked char.
143 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
156 // Return the final length to the caller.
// Encodes `charCount` chars of `chars` (from `charIndex`) into `bytes`
// starting at `byteIndex` and returns the number of bytes written
// (posn - byteIndex). `leftOver` is passed by ref and copied into `left`
// on entry, where it serves as the pending surrogate start.
// Throws ArgumentNullException / ArgumentOutOfRangeException from the
// validation shown here, and ArgumentException ("bytes") whenever the next
// sequence would not fit into `bytes`.
// NOTE(review): the last signature line, the null tests, the charCount
// decrements and the write-back of `leftOver` are on lines not shown here.
160 // Internal version of "GetBytes" which can handle a rolling
161 // state between multiple calls to this method.
162 private static int InternalGetBytes (char[] chars, int charIndex,
163 int charCount, byte[] bytes,
164 int byteIndex, ref uint leftOver,
167 // Validate the parameters.
169 throw new ArgumentNullException ("chars");
172 throw new ArgumentNullException ("bytes");
174 if (charIndex < 0 || charIndex > chars.Length) {
175 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// Written as a subtraction so that charIndex + charCount cannot overflow.
177 if (charCount < 0 || charCount > (chars.Length - charIndex)) {
178 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
179 if (byteIndex < 0 || byteIndex > bytes.Length) {
181 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
184 // Convert the characters into bytes.
// `length` is the exclusive upper bound for `posn`, the write cursor.
186 int length = bytes.Length;
188 uint left = leftOver;
189 int posn = byteIndex;
190 while (charCount > 0) {
191 // Fetch the next UTF-16 character pair value.
192 ch = chars[charIndex++];
195 if (ch >= '\uD800' && ch <= '\uDBFF') {
196 // This is the start of a surrogate pair.
// Escape marker: the char after it is narrowed to one byte and stored
// as-is, bypassing the UTF-8 encoding further down.
199 } else if (ch == EscapeByte) {
200 if (posn >= length) {
201 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
// NOTE(review): this test relies on charCount having already been
// decremented for the marker itself on a line not shown - confirm.
203 if (--charCount >= 0) {
204 bytes[posn++] = (byte) chars [charIndex++];
208 // This is a regular character.
211 } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
212 // We have a surrogate pair.
// Combine the pending high surrogate (left) with the low surrogate (ch);
// the end of this expression is on a line not shown.
213 pair = ((left - (uint)0xD800) << 10) +
214 (((uint)ch) - (uint)0xDC00) +
218 // We have a surrogate start followed by a
219 // regular character. Technically, this is
220 // invalid, but we have to do something.
221 // We write out the surrogate start and then
222 // re-visit the current character again.
229 // Encode the character pair value.
// One byte: 0xxxxxxx.
230 if (pair < (uint)0x0080) {
231 if (posn >= length) {
232 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
234 bytes[posn++] = (byte)pair;
// Two bytes: 110xxxxx 10xxxxxx.
235 } else if (pair < (uint)0x0800) {
236 if ((posn + 2) > length) {
237 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
239 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
240 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
241 } else if (pair < (uint)0x10000) {
242 if ((posn + 3) > length) {
243 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
245 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
246 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
247 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
249 if ((posn + 4) > length) {
250 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
252 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
253 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
254 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
255 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// When flushing, a surrogate start that never got its partner is written
// on its own as a three-byte sequence.
258 if (flush && left != 0) {
259 // Flush the left-over surrogate pair start.
260 if ((posn + 3) > length) {
261 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
263 bytes[posn++] = (byte)(0xE0 | (left >> 12));
264 bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
265 bytes[posn++] = (byte)(0x80 | (left & 0x3F));
270 // Return the final count to the caller.
271 return posn - byteIndex;
// One-shot encode of a char buffer: delegates to InternalGetBytes with
// flush = true and returns its byte count. Validation and exceptions come
// from InternalGetBytes.
// NOTE(review): `leftOver` is declared on a line not shown in this listing;
// presumably a zero-initialised local - confirm.
274 // Get the bytes that result from encoding a character buffer.
275 public override int GetBytes (char[] chars, int charIndex, int charCount,
276 byte[] bytes, int byteIndex)
279 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
// Encodes `charCount` chars of the string `s` (from `charIndex`) into
// `bytes` at `byteIndex` by pinning both and calling the pointer overload.
// Throws ArgumentNullException ("s" / "bytes") and
// ArgumentOutOfRangeException ("charIndex" / "charCount" / "byteIndex")
// from the validation shown here.
282 // Convenience wrappers for "GetBytes".
283 public override int GetBytes (String s, int charIndex, int charCount,
284 byte[] bytes, int byteIndex)
286 // Validate the parameters.
288 throw new ArgumentNullException ("s");
291 throw new ArgumentNullException ("bytes");
293 if (charIndex < 0 || charIndex > s.Length) {
294 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
// Written as a subtraction so that charIndex + charCount cannot overflow.
296 if (charCount < 0 || charCount > (s.Length - charIndex)) {
297 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
299 if (byteIndex < 0 || byteIndex > bytes.Length) {
300 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// Pin the string and the output array so raw pointers can be handed on.
// NOTE(review): C# `fixed` yields a null pointer for a zero-length array,
// and the pointer overload rejects a null `bytes` pointer; unless a line
// not shown here returns early, an empty `bytes` array would throw even
// when there is nothing to encode - confirm intended.
304 fixed (char* p = s) {
305 fixed (byte* b = bytes) {
// The last argument is the space remaining in `bytes` after byteIndex.
306 return GetBytes (p + charIndex, charCount, b + byteIndex, bytes.Length - byteIndex);
// Pointer-based encoder: reads up to `charCount` chars from `chars` and
// writes at most `byteCount` bytes to `bytes`.
// Throws ArgumentNullException when either pointer is null (naming "bytes"
// first), ArgumentOutOfRangeException when either count is negative
// (naming "charCount" first), and ArgumentException ("bytes") when the
// next sequence would not fit.
// NOTE(review): the declarations of charIndex/posn, the charCount updates
// and the return statement are on lines not part of this listing.
312 public unsafe override int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
314 if (bytes == null || chars == null)
315 throw new ArgumentNullException (bytes == null ? "bytes" : "chars");
317 if (charCount < 0 || byteCount < 0)
318 throw new ArgumentOutOfRangeException (charCount < 0 ? "charCount" : "byteCount");
320 // Convert the characters into bytes.
// `length` is the exclusive upper bound for the write cursor `posn`.
322 int length = byteCount;
326 while (charCount > 0) {
327 // Fetch the next UTF-16 character pair value.
328 ch = chars [charIndex++];
// A high surrogate is only examined as a pair when another char remains.
329 if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
330 // This may be the start of a surrogate pair.
// Peek at the next char without consuming it yet.
331 pair = (uint)(chars[charIndex]);
332 if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
// Combine low and high surrogate into one code point; the end of this
// expression is on a line not shown.
333 pair = (pair - (uint)0xDC00) +
334 ((((uint)ch) - (uint)0xD800) << 10) +
// Escape marker followed by at least one more char: that char is narrowed
// to one byte and stored as-is.
341 } else if (ch == EscapeByte && charCount > 1) {
342 if (posn >= length) {
343 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
// NOTE(review): charCount is presumably adjusted on the lines not shown
// just before this test - confirm.
346 if (charCount >= 0) {
347 bytes[posn++] = (byte)chars [charIndex++];
355 // Encode the character pair value.
// One byte: 0xxxxxxx.
356 if (pair < (uint)0x0080) {
357 if (posn >= length) {
358 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
360 bytes[posn++] = (byte)pair;
// Two bytes: 110xxxxx 10xxxxxx.
361 } else if (pair < (uint)0x0800) {
362 if ((posn + 2) > length) {
363 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
365 bytes[posn++] = (byte)(0xC0 | (pair >> 6));
366 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
367 } else if (pair < (uint)0x10000) {
368 if ((posn + 3) > length) {
369 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
371 bytes[posn++] = (byte)(0xE0 | (pair >> 12));
372 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
373 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
375 if ((posn + 4) > length) {
376 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
378 bytes[posn++] = (byte)(0xF0 | (pair >> 18));
379 bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
380 bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
381 bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
385 // Return the final count to the caller.
// Counts the chars that decoding `count` bytes of `bytes` (from `index`)
// would produce. Carried-over decoder state arrives as leftOverBits /
// leftOverCount. Every raw byte that cannot be decoded is counted as two
// chars (length += next_raw*2).
// Throws ArgumentNullException ("bytes") and ArgumentOutOfRangeException
// ("index" / "count") from the validation shown here. The throws for
// invalid UTF-8 are all commented out, so `throwOnInvalid` only selects
// branches here.
// NOTE(review): part of the signature, the loop header, the leftSize
// assignments and the selection between the overlong tests are on lines
// not part of this listing.
389 // Internal version of "GetCharCount" which can handle a rolling
390 // state between multiple calls to this method.
391 private static int InternalGetCharCount (byte[] bytes, int index, int count,
394 bool throwOnInvalid, bool flush)
396 // Validate the parameters.
398 throw new ArgumentNullException ("bytes");
400 if (index < 0 || index > bytes.Length) {
401 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
// Written as a subtraction so that index + count cannot overflow.
403 if (count < 0 || count > (bytes.Length - index)) {
404 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
407 // Determine the number of characters that we have.
// leftOverCount packs two 4-bit fields: the low nibble is the number of
// bytes seen so far, the next nibble is the expected sequence size.
411 uint leftBits = leftOverBits;
412 uint leftSoFar = (leftOverCount & (uint)0x0F);
413 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
415 ch = (uint)(bytes [index++]);
419 // Process a UTF-8 start character.
420 if (ch < (uint)0x0080) {
421 // Single-byte UTF-8 character.
// Each lead-byte branch keeps only the payload bits of the lead byte.
424 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
425 // Double-byte UTF-8 character.
426 leftBits = (ch & (uint)0x1F);
429 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
430 // Three-byte UTF-8 character.
431 leftBits = (ch & (uint)0x0F);
434 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
435 // Four-byte UTF-8 character.
436 leftBits = (ch & (uint)0x07);
439 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
440 // Five-byte UTF-8 character.
441 leftBits = (ch & (uint)0x03);
444 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
445 // Six-byte UTF-8 character.
// The branch condition only admits 0xFC and 0xFD, so bit 1 is always zero
// and the 0x03 mask gives the same result as 0x01 would.
446 leftBits = (ch & (uint)0x03);
450 // Invalid UTF-8 start character.
451 if (throwOnInvalid) {
452 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// Two chars are counted per buffered raw byte.
454 length += next_raw*2;
458 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx and contribute six bits each.
459 if ((ch & (uint)0xC0) == (uint)0x80) {
460 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
461 if (++leftSoFar >= leftSize) {
462 // We have a complete character now.
463 if (leftBits < (uint)0x10000) {
464 // is it an overlong ?
465 bool overlong = false;
// Each test compares the decoded value with the largest value that a
// shorter sequence could hold; presumably the test is chosen by the
// sequence size on lines not shown - TODO confirm.
468 overlong = (leftBits <= 0x7F);
471 overlong = (leftBits <= 0x07FF);
474 overlong = (leftBits <= 0xFFFF);
477 overlong = (leftBits <= 0x1FFFFF);
480 overlong = (leftBits <= 0x03FFFFFF);
484 // if (throwOnInvalid)
485 // throw new ArgumentException (_("Overlong"), leftBits.ToString ());
486 length += next_raw*2;
// Values from 0x10000 up to 0x10FFFF take this branch.
490 } else if (leftBits < (uint)0x110000) {
492 } else if (throwOnInvalid) {
494 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
495 length += next_raw*2;
501 // Invalid UTF-8 sequence: clear and restart.
502 if (throwOnInvalid) {
503 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
505 // don't escape the current byte, process it normally
506 if (ch < (uint)0x0080) {
511 length += next_raw*2;
// With the throw commented out, an incomplete trailing sequence is counted
// as escaped raw bytes instead.
517 if (flush && leftSize != 0 && throwOnInvalid) {
518 // We had left-over bytes that didn't make up
519 // a complete UTF-8 character sequence.
520 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
521 length += next_raw * 2;
524 // Return the final length to the caller.
// One-shot count: delegates to InternalGetCharCount with zeroed decoder
// state (leftOverBits = 0, leftOverCount = 0), throwOnInvalid = true and
// flush = true. Validation and its exceptions come from the callee.
528 // Get the number of characters needed to decode a byte buffer.
529 public override int GetCharCount (byte[] bytes, int index, int count)
531 return InternalGetCharCount (bytes, index, count, 0, 0, true, true);
// Decodes `byteCount` bytes of `bytes` (from `byteIndex`) into `chars`
// starting at `charIndex` and returns the number of chars written
// (posn - charIndex). Decoder state is read from leftOverBits /
// leftOverCount on entry and written back before returning. Bytes that do
// not form valid UTF-8 are emitted as EscapeByte followed by the raw byte.
// Throws ArgumentNullException ("bytes" / "chars"),
// ArgumentOutOfRangeException ("byteIndex" / "byteCount" / "charIndex")
// and ArgumentException ("chars") when the output does not fit. The throws
// for invalid UTF-8 are all commented out.
// NOTE(review): the null tests, the leftSize assignments, the selection
// between the overlong tests and several resets are on lines not shown.
534 // Get the characters that result from decoding a byte buffer.
535 private static int InternalGetChars (byte[] bytes, int byteIndex,
536 int byteCount, char[] chars,
537 int charIndex, ref uint leftOverBits,
538 ref uint leftOverCount,
539 bool throwOnInvalid, bool flush)
541 // Validate the parameters.
543 throw new ArgumentNullException ("bytes");
546 throw new ArgumentNullException ("chars");
548 if (byteIndex < 0 || byteIndex > bytes.Length) {
549 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// Written as a subtraction so that byteIndex + byteCount cannot overflow.
551 if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
552 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
554 if (charIndex < 0 || charIndex > chars.Length) {
555 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// No room at all in `chars`; the action taken is on a line not shown.
558 if (charIndex == chars.Length)
561 // Convert the bytes into the output buffer.
// `raw` buffers the bytes of the sequence being decoded so they can be
// replayed in escaped form if the sequence turns out to be invalid.
562 byte[] raw = new byte[6];
565 int length = chars.Length;
566 int posn = charIndex;
// leftOverCount packs two 4-bit fields: the low nibble is the number of
// bytes seen so far, the next nibble is the expected sequence size.
567 uint leftBits = leftOverBits;
568 uint leftSoFar = (leftOverCount & (uint)0x0F);
569 uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
570 while (byteCount > 0) {
571 // Fetch the next character from the byte buffer.
572 ch = (uint)(bytes[byteIndex++]);
573 raw [next_raw++] = (byte) ch;
576 // Process a UTF-8 start character.
577 if (ch < (uint)0x0080) {
578 // Single-byte UTF-8 character.
579 if (posn >= length) {
580 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
583 chars[posn++] = (char)ch;
// Each lead-byte branch keeps only the payload bits of the lead byte.
584 } else if ((ch & (uint)0xE0) == (uint)0xC0) {
585 // Double-byte UTF-8 character.
586 leftBits = (ch & (uint)0x1F);
589 } else if ((ch & (uint)0xF0) == (uint)0xE0) {
590 // Three-byte UTF-8 character.
591 leftBits = (ch & (uint)0x0F);
594 } else if ((ch & (uint)0xF8) == (uint)0xF0) {
595 // Four-byte UTF-8 character.
596 leftBits = (ch & (uint)0x07);
599 } else if ((ch & (uint)0xFC) == (uint)0xF8) {
600 // Five-byte UTF-8 character.
601 leftBits = (ch & (uint)0x03);
604 } else if ((ch & (uint)0xFE) == (uint)0xFC) {
605 // Six-byte UTF-8 character.
// The branch condition only admits 0xFC and 0xFD, so bit 1 is always zero
// and the 0x03 mask gives the same result as 0x01 would.
606 leftBits = (ch & (uint)0x03);
610 // Invalid UTF-8 start character.
611 if (throwOnInvalid) {
612 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// Emit the offending byte in escaped form: marker char, then the byte.
// NOTE(review): no capacity check is visible in front of these two
// writes - confirm one exists on the lines not shown.
615 chars[posn++] = EscapeByte;
616 chars[posn++] = (char) ch;
619 // Process an extra byte in a multi-byte sequence.
// Continuation bytes have the form 10xxxxxx and contribute six bits each.
620 if ((ch & (uint)0xC0) == (uint)0x80) {
621 leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
622 if (++leftSoFar >= leftSize) {
623 // We have a complete character now.
624 if (leftBits < (uint)0x10000) {
625 // is it an overlong ?
626 bool overlong = false;
// Each test compares the decoded value with the largest value that a
// shorter sequence could hold; presumably the test is chosen by the
// sequence size on lines not shown - TODO confirm.
629 overlong = (leftBits <= 0x7F);
632 overlong = (leftBits <= 0x07FF);
635 overlong = (leftBits <= 0xFFFF);
638 overlong = (leftBits <= 0x1FFFFF);
641 overlong = (leftBits <= 0x03FFFFFF);
645 // if (throwOnInvalid)
646 // throw new ArgumentException (_("Overlong"), leftBits.ToString ());
647 CopyRaw (raw, ref next_raw, chars, ref posn, length);
650 if (posn >= length) {
651 throw new ArgumentException
652 (_("Arg_InsufficientSpace"), "chars");
654 chars[posn++] = (char)leftBits;
// Values from 0x10000 up to 0x10FFFF become a UTF-16 surrogate pair.
656 } else if (leftBits < (uint)0x110000) {
657 if ((posn + 2) > length) {
658 throw new ArgumentException
659 (_("Arg_InsufficientSpace"), "chars");
// Subtract the 0x10000 offset, then split the value into a high part
// (upper bits) and a low part (lower ten bits, based at 0xDC00).
661 leftBits -= (uint)0x10000;
662 chars[posn++] = (char)((leftBits >> 10) +
665 (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
666 } else if (throwOnInvalid) {
668 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
669 CopyRaw (raw, ref next_raw, chars, ref posn, length);
675 // Invalid UTF-8 sequence: clear and restart.
676 if (throwOnInvalid) {
677 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
679 // don't escape the current byte, process it normally
680 if (ch < (uint)0x0080) {
685 CopyRaw (raw, ref next_raw, chars, ref posn, length);
// With the throw commented out, an incomplete trailing sequence is written
// out as escaped raw bytes instead.
691 if (flush && leftSize != 0 && throwOnInvalid) {
692 // We had left-over bytes that didn't make up
693 // a complete UTF-8 character sequence.
694 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
695 CopyRaw (raw, ref next_raw, chars, ref posn, length);
// Hand the decoder state back to the caller, re-packed as two nibbles.
697 leftOverBits = leftBits;
698 leftOverCount = (leftSoFar | (leftSize << 4));
700 // Return the final length to the caller.
701 return posn - charIndex;
// Writes the first `next_raw` bytes of `raw` into `chars` at `posn` in
// escaped form: each byte becomes EscapeByte followed by the byte value as
// a char, so two chars are needed per byte. `posn` is advanced through the
// ref parameter.
// Throws ArgumentException ("chars") when the escaped bytes do not fit
// below `length`.
// NOTE(review): `next_raw` is passed by ref, presumably so it can be reset
// on a line not shown in this listing - confirm.
704 private static void CopyRaw (byte[] raw, ref int next_raw, char[] chars, ref int posn, int length)
// Capacity is checked once up front for the whole run.
706 if (posn+(next_raw*2) > length)
707 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
709 for (int i = 0; i < next_raw; ++i) {
710 chars[posn++] = EscapeByte;
711 chars[posn++] = (char) raw [i];
// One-shot decode: starts from zeroed decoder state held in two locals and
// delegates to InternalGetChars with throwOnInvalid = true and flush = true.
// Returns the callee's char count; validation and exceptions come from
// InternalGetChars.
717 // Get the characters that result from decoding a byte buffer.
718 public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
719 char[] chars, int charIndex)
// The locals only exist to satisfy the ref parameters; their final values
// are discarded.
721 uint leftOverBits = 0;
722 uint leftOverCount = 0;
723 return InternalGetChars (bytes, byteIndex, byteCount, chars,
724 charIndex, ref leftOverBits, ref leftOverCount, true, true);
// Returns an upper bound of four bytes per char.
// Throws ArgumentOutOfRangeException ("charCount"); the guarding condition
// is on a line not shown, presumably charCount < 0 given the
// "ArgRange_NonNegative" message key.
727 // Get the maximum number of bytes needed to encode a
728 // specified number of characters.
729 public override int GetMaxByteCount (int charCount)
732 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
// NOTE(review): this multiplication can wrap around for very large
// charCount unless the code is compiled in a checked context.
734 return charCount * 4;
// Throws ArgumentOutOfRangeException ("byteCount"); the guarding condition
// is on a line not shown, presumably byteCount < 0 given the
// "ArgRange_NonNegative" message key.
// NOTE(review): the return expression is not visible in this listing.
737 // Get the maximum number of characters needed to decode a
738 // specified number of bytes.
739 public override int GetMaxCharCount (int byteCount)
742 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
// Returns a new UnixDecoder on every call; the decoder keeps its own
// left-over state in fields, so each caller gets independent state.
747 // Get a Unix-specific decoder that is attached to this instance.
748 public override Decoder GetDecoder ()
750 return new UnixDecoder ();
// Returns a new UnixEncoder on every call; the encoder keeps its own
// left-over state in a field, so each caller gets independent state.
753 // Get a Unix-specific encoder that is attached to this instance.
754 public override Encoder GetEncoder ()
756 return new UnixEncoder ();
// Override of Encoding.GetPreamble.
// NOTE(review): the body is not visible in this listing, so the returned
// bytes cannot be stated from here.
759 // Get the Unix preamble.
760 public override byte[] GetPreamble ()
// Equality override. `as` yields null when `value` is not a UnixEncoding.
// NOTE(review): the comparison and the return statements are on lines not
// part of this listing.
765 // Determine if this object is equal to another.
766 public override bool Equals (Object value)
768 UnixEncoding enc = (value as UnixEncoding);
// Defers to the base class hash code; overridden alongside Equals.
777 // Get the hash code for this object.
778 public override int GetHashCode ()
780 return base.GetHashCode ();
// Encodes the whole string `s` into a newly allocated array sized by
// GetByteCount (s), then fills it with the five-argument GetBytes overload.
// Throws ArgumentNullException ("s"); the guarding test is on a line not
// shown.
// NOTE(review): the return statement is not visible in this listing.
783 public override byte [] GetBytes (String s)
786 throw new ArgumentNullException ("s");
// First pass: exact size of the output.
788 int length = GetByteCount (s);
789 byte [] bytes = new byte [length];
// Second pass: the actual encoding; the returned count is not used here.
790 GetBytes (s, 0, s.Length, bytes, 0);
// Stateful decoder: keeps the partially decoded sequence between calls in
// two fields and forwards to the static Internal* helpers of the
// enclosing class with throwOnInvalid = true and flush = false.
794 // Unix decoder implementation.
796 private class UnixDecoder : Decoder
// Bits gathered so far for the sequence being decoded.
798 private uint leftOverBits;
// Packed progress of that sequence, as unpacked by the Internal* helpers.
799 private uint leftOverCount;
// Constructor; its body is not visible in this listing.
802 public UnixDecoder ()
808 // Override inherited methods.
// Counting passes the state by value, so the fields are not modified.
809 public override int GetCharCount (byte[] bytes, int index, int count)
811 return InternalGetCharCount (bytes, index, count,
812 leftOverBits, leftOverCount, true, false);
// Decoding passes the state by ref, so it carries over to the next call.
814 public override int GetChars (byte[] bytes, int byteIndex,
815 int byteCount, char[] chars, int charIndex)
817 return InternalGetChars (bytes, byteIndex, byteCount,
818 chars, charIndex, ref leftOverBits, ref leftOverCount, true, false);
// Stateful encoder: keeps a pending surrogate start between calls in one
// field and forwards to the static Internal* helpers of the enclosing
// class, passing the caller's `flush` through.
823 // Unix encoder implementation.
825 private class UnixEncoder : Encoder
// State handed to the Internal* helpers as `leftOver`.
827 private uint leftOver;
// Constructor; its body is not visible in this listing.
830 public UnixEncoder ()
835 // Override inherited methods.
// Counting passes the state by value, so the field is not modified.
836 public override int GetByteCount (char[] chars, int index,
837 int count, bool flush)
839 return InternalGetByteCount (chars, index, count, leftOver, flush);
// NOTE(review): the parameter is named byteCount but is passed in the
// position InternalGetBytes treats as byteIndex, i.e. it is used as the
// start offset into `bytes`.
841 public override int GetBytes (char[] chars, int charIndex,
842 int charCount, byte[] bytes, int byteCount, bool flush)
// `result` is declared, and presumably returned, on lines not shown.
845 result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
// Helper wrapped around every exception message key in this class, e.g.
// _("ArgRange_Array").
// NOTE(review): the body is not visible in this listing; presumably it
// maps the key to a message string - confirm.
850 private static string _ (string arg)