2007-10-19 Nagappan A <anagappan@novell.com>
[mono.git] / eglib / src / gutf8.c
index 66c2877d861b33b8d14a3128ec7c5605a1b4a6c4..dee632032e1385033fb1471f7e6ec3bf42c5f50a 100644 (file)
-/*\r
- * gutf8.c: UTF-8 conversion\r
- *\r
- * Author:\r
- *   Atsushi Enomoto  <atsushi@ximian.com>\r
- *\r
- * (C) 2006 Novell, Inc.\r
- */\r
-\r
-#include <stdio.h>\r
-#include <glib.h>\r
-\r
-gpointer error_quark = "ERROR";\r
-\r
-gpointer\r
-g_convert_error_quark ()\r
-{\r
-       return error_quark;\r
-}\r
-\r
-gunichar2*\r
-g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)\r
-{\r
-       /* The conversion logic is almost identical to UTF8Encoding.GetChars(),\r
-          but error check is always done at utf8_to_utf16_len() so that\r
-          the conversion core below simply resets erroreous bits */\r
-       glong utf16_len;\r
-       gunichar2 *ret;\r
-       gchar ch, mb_size, mb_remain;\r
-       guint32 codepoint;\r
-       glong in_pos, out_pos;\r
-\r
-       utf16_len = 0;\r
-       mb_size = 0;\r
-       mb_remain = 0;\r
-       in_pos = 0;\r
-       out_pos = 0;\r
-\r
-       if (error)\r
-               *error = NULL;\r
-\r
-       utf16_len = utf8_to_utf16_len (str, len, items_read, error);\r
-       if (error)\r
-               if (*error)\r
+/*
+ * gutf8.c: UTF-8 conversion
+ *
+ * Author:
+ *   Atsushi Enomoto  <atsushi@ximian.com>
+ *
+ * (C) 2006 Novell, Inc.
+ */
+
+#include <stdio.h>
+#include <glib.h>
+
+gpointer error_quark = "ERROR"; /* value handed back by g_convert_error_quark () */
+
+static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error); /* counting pass called by g_utf8_to_utf16 () */
+static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error); /* counting pass called by g_utf16_to_utf8 () */
+
+/*
+ * g_convert_error_quark:
+ *
+ * Returns the file-level error_quark pointer.
+ *
+ * NOTE(review): presumably this is what the G_CONVERT_ERROR domain used by
+ * the g_set_error () calls in this file resolves to; the macro is not
+ * defined here -- confirm in glib.h.
+ */
+gpointer
+g_convert_error_quark (void)
+{
+       return error_quark;
+}
+
+gunichar2*
+g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+       /* Converts the UTF-8 text in str into a newly allocated, 0-terminated
+          UTF-16 buffer.
+
+          The conversion logic is almost identical to UTF8Encoding.GetChars(),
+          but error check is always done at utf8_to_utf16_len() so that
+          the conversion core below simply resets erroneous bits.
+
+          str, len, items_read and error are forwarded unchanged to
+          utf8_to_utf16_len(), which sizes the output before anything is
+          allocated.  *error (when error is non-NULL) is cleared and
+          *items_written (when non-NULL) is zeroed before that call, so both
+          hold defined values on the early NULL returns taken when the
+          counting pass sets *error or yields a negative length. */
+       glong utf16_len;
+       gunichar2 *ret;
+       guchar ch, mb_size, mb_remain;
+       guint32 codepoint;
+       glong in_pos, out_pos;
+
+       utf16_len = 0;
+       mb_size = 0;
+       mb_remain = 0;
+       in_pos = 0;
+       out_pos = 0;
+
+       if (error)
+               *error = NULL;
+
+       if (items_written)
+               *items_written = 0;
+       utf16_len = utf8_to_utf16_len (str, len, items_read, error);
+       if (error)
+               if (*error)
                        return NULL;
        if (utf16_len < 0)
-               return NULL;\r
-\r
-       ret = g_malloc (utf16_len * sizeof (gunichar2));\r
-\r
-       for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {\r
-               ch = (guchar) str [in_pos];\r
-               if (mb_size == 0) {\r
-                       if (0 < ch)\r
-                               ret [out_pos++] = ch;\r
-                       else if ((ch & 0xE0) == 0xC0) {\r
-                               codepoint = ch & 0x1F;\r
-                               mb_remain = mb_size = 2;\r
-                       } else if ((ch & 0xF0) == 0xE0) {\r
-                               codepoint = ch & 0x0F;\r
-                               mb_remain = mb_size = 3;\r
-                       } else if ((ch & 0xF8) == 0xF0) {\r
-                               codepoint = ch & 7;\r
-                               mb_remain = mb_size = 4;\r
-                       } else if ((ch & 0xFC) == 0xF8) {\r
-                               codepoint = ch & 3;\r
-                               mb_remain = mb_size = 5;\r
-                       } else if ((ch & 0xFE) == 0xFC) {\r
-                               codepoint = ch & 3;\r
-                               mb_remain = mb_size = 6;\r
-                       } else {\r
-                               /* invalid utf-8 sequence */\r
-                               codepoint = 0;\r
-                               mb_remain = mb_size = 0;\r
-                       }\r
-               } else {\r
-                       if ((ch & 0xC0) == 0x80) {\r
-                               codepoint = (codepoint << 6) | (ch & 0x3F);\r
-                               if (--mb_remain == 0) {\r
-                                       /* multi byte character is fully consumed now. */\r
-                                       if (codepoint < 0x10000) {\r
-                                               ret [out_pos++] = codepoint;\r
-                                       } else if (codepoint < 0x110000) {\r
-                                               /* surrogate pair */\r
-                                               codepoint -= 0x10000;\r
-                                               ret [out_pos++] = (codepoint >> 10) + 0xD800;\r
-                                               ret [out_pos++] = (codepoint & 0x3FF) + 0xDC00;\r
-                                       } else {\r
-                                               /* invalid utf-8 sequence (excess) */\r
-                                               codepoint = 0;\r
-                                               mb_remain = mb_size = 0;\r
-                                       }\r
-                               }\r
-                       } else {\r
-                               /* invalid utf-8 sequence */\r
-                               codepoint = 0;\r
-                               mb_remain = mb_size = 0;\r
-                       }\r
-               }\r
-       }\r
-\r
-       if (items_written)\r
-               *items_written = out_pos;\r
-       return ret;\r
-}\r
-\r
-glong\r
-utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)\r
-{\r
-       /* It is almost identical to UTF8Encoding.GetCharCount() */\r
-       guchar ch, mb_size, mb_remain;\r
-       gboolean overlong;\r
-       guint32 codepoint;\r
-       glong in_pos, ret;\r
-\r
-       mb_size = 0;\r
-       mb_remain = 0;\r
-       overlong = 0;\r
-       in_pos = 0;\r
-       ret = 0;\r
-\r
-       for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {\r
-               ch = str [in_pos];\r
-               if (mb_size == 0) {\r
-                       if (ch < 0x80)\r
-                               ret++;\r
-                       else if ((ch & 0xE0) == 0xC0) {\r
-                               codepoint = ch & 0x1F;\r
-                               mb_remain = mb_size = 2;\r
-                       } else if ((ch & 0xF0) == 0xE0) {\r
-                               codepoint = ch & 0x0F;\r
-                               mb_remain = mb_size = 3;\r
-                       } else if ((ch & 0xF8) == 0xF0) {\r
-                               codepoint = ch & 7;\r
-                               mb_remain = mb_size = 4;\r
-                       } else if ((ch & 0xFC) == 0xF8) {\r
-                               codepoint = ch & 3;\r
-                               mb_remain = mb_size = 5;\r
-                       } else if ((ch & 0xFE) == 0xFC) {\r
-                               codepoint = ch & 3;\r
-                               mb_remain = mb_size = 6;\r
-                       } else {\r
-                               /* invalid utf-8 sequence */\r
-                               if (error) {\r
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);\r
-                                       if (items_read)\r
-                                               *items_read = in_pos;\r
-                                       return -1;\r
-                               } else {\r
-                                       codepoint = 0;\r
-                                       mb_remain = mb_size = 0;\r
-                               }\r
-                       }\r
-               } else {\r
-                       if ((ch & 0xC0) == 0x80) {\r
-                               codepoint = (codepoint << 6) | (ch & 0x3F);\r
-                               if (--mb_remain == 0) {\r
-                                       /* multi byte character is fully consumed now. */\r
-                                       if (codepoint < 0x10000) {\r
-                                               switch (mb_size) {\r
-                                               case 2:\r
-                                                       overlong = codepoint < 0x7F;\r
-                                                       break;\r
-                                               case 3:\r
-                                                       overlong = codepoint < 0x7FF;\r
-                                                       break;\r
-                                               case 4:\r
-                                                       overlong = codepoint < 0xFFFF;\r
-                                                       break;\r
-                                               case 5:\r
-                                                       overlong = codepoint < 0x1FFFFF;\r
-                                                       break;\r
-                                               case 6:\r
-                                                       overlong = codepoint < 0x03FFFFFF;\r
-                                                       break;\r
-                                               }\r
-                                               if (overlong) {\r
-                                                       /* invalid utf-8 sequence (overlong) */\r
-                                                       if (error) {\r
-                                                               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);\r
-                                                               if (items_read)\r
-                                                                       *items_read = in_pos;\r
-                                                               return -1;\r
-                                                       } else {\r
-                                                               codepoint = 0;\r
-                                                               mb_remain = mb_size = 0;\r
-                                                               overlong = FALSE;\r
-                                                       }\r
-                                               }\r
-                                               else\r
-                                                       ret++;\r
-                                       } else if (codepoint < 0x110000) {\r
-                                               /* surrogate pair */\r
-                                               ret += 2;\r
-                                       } else {\r
-                                               /* invalid utf-8 sequence (excess) */\r
-                                               if (error) {\r
-                                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);\r
-                                                       if (items_read)\r
-                                                               *items_read = in_pos;\r
-                                                       return -1;\r
-                                               } else {\r
-                                                       codepoint = 0;\r
-                                                       mb_remain = mb_size = 0;\r
-                                               }\r
-                                       }\r
-                               }\r
-                       } else {\r
-                               /* invalid utf-8 sequence */\r
-                               if (error) {\r
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);\r
-                                       if (items_read)\r
-                                               *items_read = in_pos;\r
-                                       return -1;\r
-                               } else {\r
-                                       codepoint = 0;\r
-                                       mb_remain = mb_size = 0;\r
-                               }\r
-                       }\r
-               }\r
-       }\r
-\r
-       if (items_read)\r
-               *items_read = in_pos;\r
-       return ret;\r
-}\r
-\r
-gchar*\r
-g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)\r
-{\r
-       /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),\r
-          but error check is always done at utf16_to_utf8_len() so that\r
-          the conversion core below simply resets erroreous bits */\r
-       glong utf8_len;\r
-       gchar *ret;\r
-       glong in_pos, out_pos;\r
-       gunichar2 ch;\r
-       guint32 codepoint;\r
-       gboolean surrogate;\r
-\r
-       in_pos = 0;\r
-       out_pos = 0;\r
-       surrogate = FALSE;\r
-\r
-       utf8_len = utf16_to_utf8_len (str, len, items_read, error);\r
-       if (error)\r
-               if (*error)\r
-                       return NULL;\r
+               return NULL;
+
+       /* One extra unit for the terminating 0 stored after the loop. */
+       ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
+
+       /* Second pass: decode and store.  A negative len means "stop at the
+          first 0 byte".  Strictly validated input never reaches the drop
+          branches below; with error == NULL the counting pass skips bad
+          sequences instead of failing, so this loop must never store a
+          sequence that pass may have left out of utf16_len.  It therefore
+          drops overlong encodings, 5/6-byte forms and out-of-range code
+          points instead of writing them past the end of ret. */
+       for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+               ch = (guchar) str [in_pos];
+               if (mb_size == 0) {
+                       /* Expecting a lead byte (or plain ASCII). */
+                       if (ch < 0x80)
+                               ret [out_pos++] = ch;
+                       else if ((ch & 0xE0) == 0xC0) {
+                               codepoint = ch & 0x1F;
+                               mb_size = 2;
+                       } else if ((ch & 0xF0) == 0xE0) {
+                               codepoint = ch & 0x0F;
+                               mb_size = 3;
+                       } else if ((ch & 0xF8) == 0xF0) {
+                               codepoint = ch & 7;
+                               mb_size = 4;
+                       } else if ((ch & 0xFC) == 0xF8) {
+                               codepoint = ch & 3;
+                               mb_size = 5;
+                       } else if ((ch & 0xFE) == 0xFC) {
+                               codepoint = ch & 3;
+                               mb_size = 6;
+                       } else {
+                               /* invalid utf-8 sequence: stray continuation byte, 0xFE or 0xFF */
+                               codepoint = 0;
+                               mb_remain = mb_size = 0;
+                       }
+                       if (mb_size > 1)
+                               mb_remain = mb_size - 1;
+               } else {
+                       if ((ch & 0xC0) == 0x80) {
+                               codepoint = (codepoint << 6) | (ch & 0x3F);
+                               if (--mb_remain == 0) {
+                                       /* multi byte character is fully consumed now. */
+                                       guint32 min_codepoint;
+
+                                       /* Smallest code point that really needs mb_size bytes. */
+                                       switch (mb_size) {
+                                       case 2:
+                                               min_codepoint = 0x80;
+                                               break;
+                                       case 3:
+                                               min_codepoint = 0x800;
+                                               break;
+                                       case 4:
+                                               min_codepoint = 0x10000;
+                                               break;
+                                       default:
+                                               /* 5- and 6-byte forms are either overlong or
+                                                  beyond U+10FFFF: always dropped. */
+                                               min_codepoint = 0x110000;
+                                               break;
+                                       }
+
+                                       if (codepoint < min_codepoint || codepoint >= 0x110000) {
+                                               /* invalid utf-8 sequence (overlong or excess): drop it */
+                                       } else if (codepoint < 0x10000) {
+                                               ret [out_pos++] = (gunichar2) codepoint;
+                                       } else {
+                                               /* surrogate pair */
+                                               codepoint -= 0x10000;
+                                               ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
+                                               ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
+                                       }
+                                       codepoint = 0;
+                                       mb_size = 0;
+                               }
+                       } else {
+                               /* invalid utf-8 sequence: continuation byte expected.
+                                  The offending byte is discarded together with the
+                                  partial sequence, exactly as the counting pass does. */
+                               codepoint = 0;
+                               mb_remain = mb_size = 0;
+                       }
+               }
+       }
+
+       ret [out_pos] = 0;
+       if (items_written)
+               *items_written = out_pos;
+       return ret;
+}
+
+/*
+ * utf8_to_utf16_len:
+ * @str: UTF-8 input.
+ * @len: number of bytes to scan, or a negative value to stop at the first
+ *       0 byte.
+ * @items_read: if non-NULL, receives the number of bytes scanned on
+ *       success, or the byte index at which a problem was detected.
+ * @error: if non-NULL the scan is strict: the first malformed sequence sets
+ *       *error (G_CONVERT_ERROR_ILLEGAL_SEQUENCE) and -1 is returned.  If
+ *       NULL the scan is lenient and malformed input is skipped.
+ *
+ * Returns the number of UTF-16 units to reserve for the converted text (the
+ * terminator is not included), or -1 on a strict-mode error.
+ *
+ * In lenient mode overlong sequences are still counted, so the result is an
+ * upper bound on what the conversion loop in g_utf8_to_utf16 () can store
+ * even if that loop does not filter them out.
+ *
+ * NOTE(review): a multi-byte sequence cut off by the end of the input is
+ * silently ignored in both modes; no error is raised for it.
+ */
+static glong
+utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
+{
+       /* It is almost identical to UTF8Encoding.GetCharCount() */
+       guchar ch, mb_size, mb_remain;
+       gboolean overlong;
+       guint32 codepoint;
+       glong in_pos, ret;
+
+       mb_size = 0;
+       mb_remain = 0;
+       overlong = FALSE;
+       codepoint = 0;
+       in_pos = 0;
+       ret = 0;
+
+       for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+               ch = (guchar) str [in_pos];
+               if (mb_size == 0) {
+                       /* Expecting a lead byte (or plain ASCII). */
+                       if (ch < 0x80)
+                               ret++;
+                       else if ((ch & 0xE0) == 0xC0) {
+                               codepoint = ch & 0x1F;
+                               mb_size = 2;
+                       } else if ((ch & 0xF0) == 0xE0) {
+                               codepoint = ch & 0x0F;
+                               mb_size = 3;
+                       } else if ((ch & 0xF8) == 0xF0) {
+                               codepoint = ch & 7;
+                               mb_size = 4;
+                       } else if ((ch & 0xFC) == 0xF8) {
+                               codepoint = ch & 3;
+                               mb_size = 5;
+                       } else if ((ch & 0xFE) == 0xFC) {
+                               codepoint = ch & 3;
+                               mb_size = 6;
+                       } else {
+                               /* invalid utf-8 sequence */
+                               if (error) {
+                                       /* in_pos is a glong, hence %ld */
+                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %ld (illegal first byte)", in_pos);
+                                       if (items_read)
+                                               *items_read = in_pos;
+                                       return -1;
+                               } else {
+                                       codepoint = 0;
+                                       mb_remain = mb_size = 0;
+                               }
+                       }
+                       if (mb_size > 1)
+                               mb_remain = mb_size - 1;
+               } else {
+                       if ((ch & 0xC0) == 0x80) {
+                               codepoint = (codepoint << 6) | (ch & 0x3F);
+                               if (--mb_remain == 0) {
+                                       /* multi byte character is fully consumed now. */
+
+                                       /* A sequence is overlong when its code point
+                                          would have fit in a shorter encoding. */
+                                       switch (mb_size) {
+                                       case 2:
+                                               overlong = codepoint < 0x80;
+                                               break;
+                                       case 3:
+                                               overlong = codepoint < 0x800;
+                                               break;
+                                       case 4:
+                                               overlong = codepoint < 0x10000;
+                                               break;
+                                       case 5:
+                                               overlong = codepoint < 0x200000;
+                                               break;
+                                       case 6:
+                                               overlong = codepoint < 0x4000000;
+                                               break;
+                                       default:
+                                               overlong = FALSE;
+                                               break;
+                                       }
+
+                                       if (codepoint >= 0x110000) {
+                                               /* invalid utf-8 sequence (excess) */
+                                               if (error) {
+                                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %ld (codepoint range excess)", in_pos);
+                                                       if (items_read)
+                                                               *items_read = in_pos;
+                                                       return -1;
+                                               }
+                                               /* lenient: nothing is stored for it */
+                                       } else {
+                                               if (overlong && error) {
+                                                       /* invalid utf-8 sequence (overlong) */
+                                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %ld (overlong)", in_pos);
+                                                       if (items_read)
+                                                               *items_read = in_pos;
+                                                       return -1;
+                                               }
+                                               /* Valid, or overlong in lenient mode: reserve
+                                                  one unit, or two for a surrogate pair, so the
+                                                  buffer is large enough whether or not the
+                                                  conversion loop stores the sequence. */
+                                               ret += codepoint < 0x10000 ? 1 : 2;
+                                       }
+                                       overlong = FALSE;
+                                       codepoint = 0;
+                                       mb_size = 0;
+                               }
+                       } else {
+                               /* invalid utf-8 sequence */
+                               if (error) {
+                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %ld (illegal following bytes)", in_pos);
+                                       if (items_read)
+                                               *items_read = in_pos;
+                                       return -1;
+                               } else {
+                                       codepoint = 0;
+                                       mb_remain = mb_size = 0;
+                               }
+                       }
+               }
+       }
+
+       if (items_read)
+               *items_read = in_pos;
+       return ret;
+}
+
+gchar*
+g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+       /* Converts the UTF-16 text in str into a newly allocated, 0-terminated
+          UTF-8 buffer.
+
+          The conversion logic is almost identical to UTF8Encoding.GetBytes(),
+          but error check is always done at utf16_to_utf8_len() so that
+          the conversion core below simply resets erroneous bits.
+
+          str, len, items_read and error are forwarded unchanged to
+          utf16_to_utf8_len(), which sizes the output before anything is
+          allocated.  *items_written (when non-NULL) is zeroed first so it
+          holds a defined value on the early NULL returns.
+
+          NOTE(review): unlike g_utf8_to_utf16 (), *error is not cleared
+          before the counting pass, so an error already stored in *error by
+          the caller also triggers the NULL return below. */
+       glong utf8_len;
+       gchar *ret;
+       glong in_pos, out_pos;
+       gunichar2 ch;
+       guint32 codepoint = 0;
+       gboolean surrogate;
+
+       in_pos = 0;
+       out_pos = 0;
+       surrogate = FALSE;
+
+       if (items_written)
+               *items_written = 0;
+       utf8_len = utf16_to_utf8_len (str, len, items_read, error);
+       if (error)
+               if (*error)
+                       return NULL;
        if (utf8_len < 0)
-               return NULL;\r
-\r
-       ret = g_malloc (utf8_len * sizeof (gchar));\r
-\r
-       while (len < 0 ? str [in_pos] : in_pos < len) {\r
-               ch = str [in_pos];\r
-               if (surrogate) {\r
-                       surrogate = 0;\r
-                       if (ch >= 0xDC00 && ch <= 0xDFFF)\r
-                               codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);\r
-                       else\r
-                               /* invalid surrogate pair */\r
-                               continue;\r
-               } else {\r
-                       /* fast path optimization */\r
-                       if (ch < 0x80) {\r
-                               for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {\r
-                                       if (str [in_pos] < 0x80)\r
-                                               ret [out_pos++] = str [in_pos];\r
-                                       else\r
-                                               break;\r
-                               }\r
-                               continue;\r
-                       }\r
-                       else if (ch >= 0xD800 && ch <= 0xDBFF)\r
-                               surrogate = ch;\r
-                       else if (ch >= 0xDC00 && ch <= 0xDFFF) {\r
-                               /* invalid surrogate pair */\r
-                               continue;\r
-                       }\r
-                       else\r
-                               codepoint = ch;\r
-               }\r
-               in_pos++;\r
-\r
-               if (codepoint < 0x80)\r
-                       ret [out_pos++] = (gchar) codepoint;\r
-               else if (codepoint < 0x0800) {\r
-                       ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));\r
-                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));\r
-               } else if (codepoint < 0x10000) {\r
-                       ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));\r
-                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));\r
-                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));\r
-               } else {\r
-                       ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));\r
-                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));\r
-                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));\r
-                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));\r
-               }\r
-       }\r
-\r
-       if (items_written)\r
-               *items_written = out_pos;\r
-       return ret;\r
-}\r
-\r
-glong\r
-utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)\r
-{\r
-       glong ret, in_pos;\r
-       gunichar2 ch;\r
-       gboolean surrogate;\r
-\r
-       ret = 0;\r
-       in_pos = 0;\r
-       surrogate = FALSE;\r
-\r
-       while (len < 0 ? str [in_pos] : in_pos < len) {\r
-               ch = str [in_pos];\r
-               if (surrogate) {\r
-                       if (ch >= 0xDC00 && ch <= 0xDFFF) {\r
-                               ret += 4;\r
-                       } else {\r
-                               /* invalid surrogate pair */\r
-                               if (error) {\r
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);\r
-                                       if (items_read)\r
-                                               *items_read = in_pos;\r
-                                       return -1;\r
-                               } /* otherwise just ignore. */\r
-                       }\r
-                       surrogate = FALSE;\r
-               } else {\r
-                       /* fast path optimization */\r
-                       if (ch < 0x80) {\r
-                               for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {\r
-                                       if (str [in_pos] < 0x80)\r
-                                               ++ret;\r
-                                       else\r
-                                               break;\r
-                               }\r
-                               continue;\r
-                       }\r
-                       else if (ch < 0x0800)\r
-                               ret += 2;\r
-                       else if (ch >= 0xD800 && ch <= 0xDBFF)\r
-                               surrogate = TRUE;\r
-                       else if (ch >= 0xDC00 && ch <= 0xDFFF) {\r
-                               /* invalid surrogate pair */\r
-                               if (error) {\r
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);\r
-                                       if (items_read)\r
-                                               *items_read = in_pos;\r
-                                       return -1;\r
-                               } /* otherwise just ignore. */\r
-                       }\r
-                       else\r
-                               ret += 3;\r
-               }\r
-               in_pos++;\r
-       }\r
-\r
-       if (items_read)\r
-               *items_read = in_pos;\r
-       return ret;\r
-}\r
+               return NULL;
+
+       /* One extra byte for the terminating 0 stored after the loop. */
+       ret = g_malloc ((1+utf8_len) * sizeof (gchar));
+
+       /* Second pass: encode.  A negative len means "stop at the first 0
+          unit".  Every iteration consumes at least one input unit, and the
+          bytes stored per unit match what utf16_to_utf8_len () counted:
+          1 for ASCII, 2 below U+0800, 3 for the rest of the BMP, 4 for a
+          surrogate pair, and nothing for unpaired surrogates. */
+       while (len < 0 ? str [in_pos] : in_pos < len) {
+               ch = str [in_pos++];
+
+               /* fast path optimization */
+               if (ch < 0x80) {
+                       ret [out_pos++] = (gchar) ch;
+                       continue;
+               }
+
+               surrogate = ch >= 0xD800 && ch <= 0xDBFF;
+               if (surrogate) {
+                       /* High surrogate: the unit that follows must be its tail. */
+                       if (!(len < 0 ? str [in_pos] : in_pos < len))
+                               break; /* input ends on an unpaired head: drop it */
+                       codepoint = ch;
+                       ch = str [in_pos++];
+                       if (ch < 0xDC00 || ch > 0xDFFF) {
+                               /* invalid surrogate pair: the counting pass skips
+                                  both units in lenient mode, so skip both here */
+                               continue;
+                       }
+                       codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (ch - 0xDC00);
+               } else if (ch >= 0xDC00 && ch <= 0xDFFF) {
+                       /* invalid surrogate pair: tail without a head, drop it */
+                       continue;
+               } else
+                       codepoint = ch;
+
+               /* codepoint is >= 0x80 here; ASCII was handled above. */
+               if (codepoint < 0x0800) {
+                       ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
+                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
+               } else if (codepoint < 0x10000) {
+                       ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
+                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
+                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
+               } else {
+                       ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
+                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
+                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
+                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
+               }
+       }
+       ret [out_pos] = 0;
+
+       if (items_written)
+               *items_written = out_pos;
+       return ret;
+}
+
+/*
+ * utf16_to_utf8_len:
+ * @str: UTF-16 input.
+ * @len: number of units to scan, or a negative value to stop at the first
+ *       0 unit.
+ * @items_read: if non-NULL, receives the number of units scanned on
+ *       success, or the index of the offending unit on failure.
+ * @error: if non-NULL the scan is strict: an unpaired surrogate sets *error
+ *       (G_CONVERT_ERROR_ILLEGAL_SEQUENCE) and -1 is returned.  If NULL,
+ *       unpaired surrogates are ignored.
+ *
+ * Returns the number of UTF-8 bytes needed for the text (the terminator is
+ * not included), or -1 on a strict-mode error.
+ *
+ * In lenient mode a high surrogate that is not followed by a low surrogate
+ * is dropped together with the unit that follows it.
+ *
+ * NOTE(review): a high surrogate that is the very last unit of the input is
+ * not reported, even in strict mode; it simply contributes nothing.
+ */
+static glong
+utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
+{
+       glong ret, in_pos;
+       gunichar2 ch;
+       gboolean surrogate;
+
+       ret = 0;
+       in_pos = 0;
+       surrogate = FALSE;
+
+       while (len < 0 ? str [in_pos] : in_pos < len) {
+               ch = str [in_pos];
+               if (surrogate) {
+                       /* Previous unit was a high surrogate: expect its tail. */
+                       if (ch >= 0xDC00 && ch <= 0xDFFF) {
+                               ret += 4;
+                       } else {
+                               /* invalid surrogate pair */
+                               if (error) {
+                                       /* in_pos is a glong, hence %ld */
+                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %ld (missing surrogate tail)", in_pos);
+                                       if (items_read)
+                                               *items_read = in_pos;
+                                       return -1;
+                               } /* otherwise just ignore. */
+                       }
+                       surrogate = FALSE;
+               } else {
+                       /* fast path optimization */
+                       if (ch < 0x80) {
+                               /* Count the whole ASCII run; in_pos is left on the
+                                  first non-ASCII unit (or the end), so skip the
+                                  increment at the bottom of the loop. */
+                               for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+                                       if (str [in_pos] < 0x80)
+                                               ++ret;
+                                       else
+                                               break;
+                               }
+                               continue;
+                       }
+                       else if (ch < 0x0800)
+                               ret += 2;
+                       else if (ch >= 0xD800 && ch <= 0xDBFF)
+                               surrogate = TRUE;
+                       else if (ch >= 0xDC00 && ch <= 0xDFFF) {
+                               /* invalid surrogate pair */
+                               if (error) {
+                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %ld (missing surrogate head)", in_pos);
+                                       if (items_read)
+                                               *items_read = in_pos;
+                                       return -1;
+                               } /* otherwise just ignore. */
+                       }
+                       else
+                               ret += 3;
+               }
+               in_pos++;
+       }
+
+       if (items_read)
+               *items_read = in_pos;
+       return ret;
+}