2006-02-02 Atsushi Enomoto <atsushi@ximian.com>

[mono.git] / mcs / class / corlib / System.Text / UTF8Encoding.cs
diff --git a/mcs/class/corlib/System.Text/UTF8Encoding.cs b/mcs/class/corlib/System.Text/UTF8Encoding.cs

index 1b8be11b4d7a88ef51cc9f972a31c5fbf170114d..80fc8808b7b80106a3c9bf930dd49bcf394f8a3e 100644 (file)
--- a/mcs/class/corlib/System.Text/UTF8Encoding.cs
+++ b/mcs/class/corlib/System.Text/UTF8Encoding.cs
@@ -74,7 +74,7 @@ public class UTF8Encoding : Encoding
  
         // Internal version of "GetByteCount" which can handle a rolling
         // state between multiple calls to this method.
-       private static int InternalGetByteCount (char[] chars, int index, int count, ref uint leftOver, bool flush)
+       private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
         {
                 // Validate the parameters.
                 if (chars == null) {
@@ -88,9 +88,9 @@ public class UTF8Encoding : Encoding
                 }
  
                 if (index == chars.Length) {
-                       if (flush && leftOver != 0) {
+                       if (flush && leftOver != '\0') {
                                 // Flush the left-over surrogate pair start.
-                               leftOver = 0;
+                               leftOver = '\0';
                                 return 3;
                         }
                         return 0;
@@ -104,14 +104,14 @@ public class UTF8Encoding : Encoding
         }
  
  
-       private unsafe static int InternalGetByteCount (char* chars, int count, ref uint leftOver, bool flush)
+       private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
         {
                 int index = 0;
  
                 // Determine the lengths of all characters.
                 char ch;
                 int length = 0;
-               uint pair = leftOver;
+               char pair = leftOver;
                 while (count > 0) {
                         ch = chars[index];
                         if (pair == 0) {
@@ -121,14 +121,23 @@ public class UTF8Encoding : Encoding
                                         length += 2;
                                 } else if (ch >= '\uD800' && ch <= '\uDBFF') {
                                         // This is the start of a surrogate pair.
-                                       pair = (uint)ch;
+                                       pair = ch;
                                 } else {
                                         length += 3;
                                 }
                         } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
-                               // We have a surrogate pair.
-                               length += 4;
-                               pair = 0;
+                               if (pair != 0) {
+                                       // We have a surrogate pair.
+                                       length += 4;
+                                       pair = '\0';
+                               } else {
+                                       // Surrogate tail without a leading surrogate
+                                       // (NET_2_0 uses the fallback; NET_1_1 counts 3 bytes).
+                                       // NOTE(review): this branch looks unreachable -- the
+                                       // enclosing else-if seems to require pair != 0; confirm.
+                                       length += 3;
+                                       pair = '\0';
+                               }
                         } else {
                                 // We have a surrogate start followed by a
                                 // regular character.  Technically, this is
@@ -136,18 +145,20 @@ public class UTF8Encoding : Encoding
                                 // We write out the surrogate start and then
                                 // re-visit the current character again.
                                 length += 3;
-                               pair = 0;
+                               pair = '\0';
                                 continue;
                         }
                         ++index;
                         --count;
                 }
-               if (flush && pair != 0) {
-                       // Flush the left-over surrogate pair start.
-                       length += 3;
+               if (flush) {
+                       if (pair != '\0')
+                               // Flush the left-over surrogate pair start.
+                               length += 3;
+                       leftOver = '\0';
                 }
-
-               leftOver = pair;
+               else
+                       leftOver = pair;
  
                 // Return the final length to the caller.
                 return length;
@@ -156,7 +167,7 @@ public class UTF8Encoding : Encoding
         // Get the number of bytes needed to encode a character buffer.
         public override int GetByteCount (char[] chars, int index, int count)
         {
-               uint dummy = 0;
+               char dummy = '\0';
                 return InternalGetByteCount (chars, index, count, ref dummy, true);
         }
  
@@ -170,7 +181,7 @@ public class UTF8Encoding : Encoding
  
                 unsafe {
                         fixed (char* cptr = s) {
-                               uint dummy = 0;
+                               char dummy = '\0';
                                 return InternalGetByteCount (cptr, s.Length, ref dummy, true);
                         }
                 }
@@ -184,7 +195,7 @@ public class UTF8Encoding : Encoding
         // state between multiple calls to this method.
         private static int InternalGetBytes (char[] chars, int charIndex,
                                              int charCount, byte[] bytes,
-                                            int byteIndex, ref uint leftOver,
+                                            int byteIndex, ref char leftOver,
                                              bool flush)
         {
                 // Validate the parameters.
@@ -210,7 +221,7 @@ public class UTF8Encoding : Encoding
                                 bytes [byteIndex++] = 0xEF;
                                 bytes [byteIndex++] = 0xBB;
                                 bytes [byteIndex++] = 0xBF;
-                               leftOver = 0;
+                               leftOver = '\0';
                                 return 3;
                         }
                         return 0;
@@ -230,7 +241,7 @@ public class UTF8Encoding : Encoding
  
         private unsafe static int InternalGetBytes (char* chars, int charCount,
                                              byte* bytes, int byteCount,
-                                            ref uint leftOver, bool flush)
+                                            ref char leftOver, bool flush)
         {
                 int charIndex = 0;
                 int byteIndex = 0;
@@ -239,77 +250,113 @@ public class UTF8Encoding : Encoding
                 // Convert the characters into bytes.
                 char ch;
                 int length = byteCount;
-               uint pair = leftOver;
+               char pair = leftOver;
                 int posn = byteIndex;
+               int code = 0;
  
                 while (charCount > 0) {
                         // Fetch the next UTF-16 character pair value.
-                       ch = chars [charIndex++];
-                       if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
-                               // This may be the start of a surrogate pair.
-                               pair = (uint) chars [charIndex];
-                               if (pair >= 0xDC00 && pair <= 0xDFFF) {
-                                       pair = pair - 0xDC00 +
-                                               (((uint) ch - 0xD800) << 10) +
-                                               0x10000;
+                       ch = chars [charIndex];
+                       if (pair == '\0') {
+                               if (ch < '\uD800' || ch >= '\uE000')
+                                       code = ch;
+                               else if (ch < '\uDC00') {
+                                       // surrogate start
+                                       pair = ch;
                                         ++charIndex;
                                         --charCount;
-                               } else {
-                                       pair = (uint) ch;
+                                       continue;
+                               } else { // ch <= '\uDFFF'
+                                       // We have a surrogate tail without a leading
+                                       // surrogate. In NET_2_0 this uses the fallback.
+                                       // In NET_1_1 we emit the lone surrogate as 3 bytes.
+                                       if ((posn + 3) > length) {
+                                               throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
+                                       }
+                                       bytes [posn++] = (byte) (0xE0 | (ch >> 12));
+                                       bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
+                                       bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
+                                       ++charIndex;
+                                       --charCount;
+                                       continue;
                                 }
                         } else {
-                               pair = (uint) ch;
+                               if ('\uDC00' <= ch && ch <= '\uDFFF')
+                                       code =  0x10000 + (int) ch - 0xDC00 +
+                                               (((int) pair - 0xD800) << 10);
+                               else {
+                                       // We have a surrogate start followed by a
+                                       // regular character.  Technically, this is
+                                       // invalid, but we have to do something.
+                                       // We write out the surrogate start and then
+                                       // re-visit the current character again.
+                                       if ((posn + 3) > length) {
+                                               throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
+                                       }
+                                       bytes [posn++] = (byte) (0xE0 | (pair >> 12));
+                                       bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
+                                       bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
+                                       pair = '\0';
+                                       continue;
+                               }
+                               pair = '\0';
                         }
+                       ++charIndex;
                         --charCount;
  
                         // Encode the character pair value.
-                       if (pair < 0x0080) {
+                       if (code < 0x0080) {
                                 if (posn >= length)
                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-                               bytes [posn++] = (byte)pair;
-                       } else if (pair < 0x0800) {
+                               bytes [posn++] = (byte)code;
+                       } else if (code < 0x0800) {
                                 if ((posn + 2) > length)
                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-                               bytes [posn++] = (byte) (0xC0 | (pair >> 6));
-                               bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
-                       } else if (pair < 0x10000) {
+                               bytes [posn++] = (byte) (0xC0 | (code >> 6));
+                               bytes [posn++] = (byte) (0x80 | (code & 0x3F));
+                       } else if (code < 0x10000) {
                                 if ((posn + 3) > length)
                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-                               bytes [posn++] = (byte) (0xE0 | (pair >> 12));
-                               bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
-                               bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
+                               bytes [posn++] = (byte) (0xE0 | (code >> 12));
+                               bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
+                               bytes [posn++] = (byte) (0x80 | (code & 0x3F));
                         } else {
                                 if ((posn + 4) > length)
                                         throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
-                               bytes [posn++] = (byte) (0xF0 | (pair >> 18));
-                               bytes [posn++] = (byte) (0x80 | ((pair >> 12) & 0x3F));
-                               bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
-                               bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
+                               bytes [posn++] = (byte) (0xF0 | (code >> 18));
+                               bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
+                               bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
+                               bytes [posn++] = (byte) (0x80 | (code & 0x3F));
                         }
                 }
  
-               if (flush && pair >= 0xD800 && pair < 0xDC00) {
-                       // Flush the left-over surrogate pair start.
-                       if ((posn + 3) > length) {
-                               throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
+               if (flush) {
+                       if (pair != '\0') {
+                               // Flush the left-over incomplete surrogate. NOTE(review): leftOver no longer seems to be saved from pair when !flush -- confirm.
+                               if ((posn + 3) > length) {
+                                       throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
+                               }
+                               bytes [posn++] = (byte) (0xE0 | (pair >> 12));
+                               bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
+                               bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                         }
-                       bytes [posn++] = (byte) (0xE0 | (pair >> 12));
-                       bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
-                       bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
-                       leftOver = 0;
+                       leftOver = '\0';
                 }
-               else
-                       leftOver = pair;
  
                 // Return the final count to the caller.
                 return posn - byteIndex;
         }
  
+       private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
+       {
+               throw new NotImplementedException ();
+       }
+
         // Get the bytes that result from encoding a character buffer.
         public override int GetBytes (char[] chars, int charIndex, int charCount,
                                                                  byte[] bytes, int byteIndex)
         {
-               uint leftOver = 0;
+               char leftOver = '\0';
                 return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
         }
  
@@ -340,7 +387,7 @@ public class UTF8Encoding : Encoding
                 unsafe {
                         fixed (char* cptr = s) {
                                 fixed (byte *bptr = bytes) {
-                                       uint dummy = 0;
+                                       char dummy = '\0';
                                         return InternalGetBytes (
                                                 cptr + charIndex, charCount,
                                                 bptr + byteIndex, bytes.Length - byteIndex,
@@ -920,15 +967,15 @@ public class UTF8Encoding : Encoding
         private class UTF8Encoder : Encoder
         {
                 private bool emitIdentifier;
-               private uint leftOverForCount;
-               private uint leftOverForConv;
+               private char leftOverForCount;
+               private char leftOverForConv;
  
                 // Constructor.
                 public UTF8Encoder (bool emitIdentifier)
                 {
                         this.emitIdentifier = emitIdentifier;
-                       leftOverForCount = 0;
-                       leftOverForConv = 0;
+                       leftOverForCount = '\0';
+                       leftOverForConv = '\0';
                 }
  
                 // Override inherited methods.