-/*\r
- * gutf8.c: UTF-8 conversion\r
- *\r
- * Author:\r
- * Atsushi Enomoto <atsushi@ximian.com>\r
- *\r
- * (C) 2006 Novell, Inc.\r
- */\r
-\r
-#include <stdio.h>\r
-#include <glib.h>\r
-\r
-gpointer error_quark = "ERROR";\r
-\r
-gpointer\r
-g_convert_error_quark ()\r
-{\r
- return error_quark;\r
-}\r
-\r
-gunichar2*\r
-g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)\r
-{\r
- /* The conversion logic is almost identical to UTF8Encoding.GetChars(),\r
- but error check is always done at utf8_to_utf16_len() so that\r
- the conversion core below simply resets erroreous bits */\r
- glong utf16_len;\r
- gunichar2 *ret;\r
- gchar ch, mb_size, mb_remain;\r
- guint32 codepoint;\r
- glong in_pos, out_pos;\r
-\r
- utf16_len = 0;\r
- mb_size = 0;\r
- mb_remain = 0;\r
- in_pos = 0;\r
- out_pos = 0;\r
-\r
- if (error)\r
- *error = NULL;\r
-\r
- utf16_len = utf8_to_utf16_len (str, len, items_read, error);\r
- if (error)\r
- if (*error)\r
+/*
+ * gutf8.c: UTF-8 conversion
+ *
+ * Author:
+ * Atsushi Enomoto <atsushi@ximian.com>
+ *
+ * (C) 2006 Novell, Inc.
+ */
+
+#include <stdio.h>
+#include <glib.h>
+
+gpointer error_quark = "ERROR";
+
+static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
+static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
+
+gpointer
+g_convert_error_quark ()
+{
+ return error_quark;
+}
+
+gunichar2*
+g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+ /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
+ but error check is always done at utf8_to_utf16_len() so that
+ the conversion core below simply resets erroreous bits */
+ glong utf16_len;
+ gunichar2 *ret;
+ guchar ch, mb_size, mb_remain;
+ guint32 codepoint;
+ glong in_pos, out_pos;
+
+ utf16_len = 0;
+ mb_size = 0;
+ mb_remain = 0;
+ in_pos = 0;
+ out_pos = 0;
+
+ if (error)
+ *error = NULL;
+
+ if (items_written)
+ *items_written = 0;
+ utf16_len = utf8_to_utf16_len (str, len, items_read, error);
+ if (error)
+ if (*error)
return NULL;
if (utf16_len < 0)
- return NULL;\r
-\r
- ret = g_malloc (utf16_len * sizeof (gunichar2));\r
-\r
- for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {\r
- ch = (guchar) str [in_pos];\r
- if (mb_size == 0) {\r
- if (0 < ch)\r
- ret [out_pos++] = ch;\r
- else if ((ch & 0xE0) == 0xC0) {\r
- codepoint = ch & 0x1F;\r
- mb_remain = mb_size = 2;\r
- } else if ((ch & 0xF0) == 0xE0) {\r
- codepoint = ch & 0x0F;\r
- mb_remain = mb_size = 3;\r
- } else if ((ch & 0xF8) == 0xF0) {\r
- codepoint = ch & 7;\r
- mb_remain = mb_size = 4;\r
- } else if ((ch & 0xFC) == 0xF8) {\r
- codepoint = ch & 3;\r
- mb_remain = mb_size = 5;\r
- } else if ((ch & 0xFE) == 0xFC) {\r
- codepoint = ch & 3;\r
- mb_remain = mb_size = 6;\r
- } else {\r
- /* invalid utf-8 sequence */\r
- codepoint = 0;\r
- mb_remain = mb_size = 0;\r
- }\r
- } else {\r
- if ((ch & 0xC0) == 0x80) {\r
- codepoint = (codepoint << 6) | (ch & 0x3F);\r
- if (--mb_remain == 0) {\r
- /* multi byte character is fully consumed now. */\r
- if (codepoint < 0x10000) {\r
- ret [out_pos++] = codepoint;\r
- } else if (codepoint < 0x110000) {\r
- /* surrogate pair */\r
- codepoint -= 0x10000;\r
- ret [out_pos++] = (codepoint >> 10) + 0xD800;\r
- ret [out_pos++] = (codepoint & 0x3FF) + 0xDC00;\r
- } else {\r
- /* invalid utf-8 sequence (excess) */\r
- codepoint = 0;\r
- mb_remain = mb_size = 0;\r
- }\r
- }\r
- } else {\r
- /* invalid utf-8 sequence */\r
- codepoint = 0;\r
- mb_remain = mb_size = 0;\r
- }\r
- }\r
- }\r
-\r
- if (items_written)\r
- *items_written = out_pos;\r
- return ret;\r
-}\r
-\r
-glong\r
-utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)\r
-{\r
- /* It is almost identical to UTF8Encoding.GetCharCount() */\r
- guchar ch, mb_size, mb_remain;\r
- gboolean overlong;\r
- guint32 codepoint;\r
- glong in_pos, ret;\r
-\r
- mb_size = 0;\r
- mb_remain = 0;\r
- overlong = 0;\r
- in_pos = 0;\r
- ret = 0;\r
-\r
- for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {\r
- ch = str [in_pos];\r
- if (mb_size == 0) {\r
- if (ch < 0x80)\r
- ret++;\r
- else if ((ch & 0xE0) == 0xC0) {\r
- codepoint = ch & 0x1F;\r
- mb_remain = mb_size = 2;\r
- } else if ((ch & 0xF0) == 0xE0) {\r
- codepoint = ch & 0x0F;\r
- mb_remain = mb_size = 3;\r
- } else if ((ch & 0xF8) == 0xF0) {\r
- codepoint = ch & 7;\r
- mb_remain = mb_size = 4;\r
- } else if ((ch & 0xFC) == 0xF8) {\r
- codepoint = ch & 3;\r
- mb_remain = mb_size = 5;\r
- } else if ((ch & 0xFE) == 0xFC) {\r
- codepoint = ch & 3;\r
- mb_remain = mb_size = 6;\r
- } else {\r
- /* invalid utf-8 sequence */\r
- if (error) {\r
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);\r
- if (items_read)\r
- *items_read = in_pos;\r
- return -1;\r
- } else {\r
- codepoint = 0;\r
- mb_remain = mb_size = 0;\r
- }\r
- }\r
- } else {\r
- if ((ch & 0xC0) == 0x80) {\r
- codepoint = (codepoint << 6) | (ch & 0x3F);\r
- if (--mb_remain == 0) {\r
- /* multi byte character is fully consumed now. */\r
- if (codepoint < 0x10000) {\r
- switch (mb_size) {\r
- case 2:\r
- overlong = codepoint < 0x7F;\r
- break;\r
- case 3:\r
- overlong = codepoint < 0x7FF;\r
- break;\r
- case 4:\r
- overlong = codepoint < 0xFFFF;\r
- break;\r
- case 5:\r
- overlong = codepoint < 0x1FFFFF;\r
- break;\r
- case 6:\r
- overlong = codepoint < 0x03FFFFFF;\r
- break;\r
- }\r
- if (overlong) {\r
- /* invalid utf-8 sequence (overlong) */\r
- if (error) {\r
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);\r
- if (items_read)\r
- *items_read = in_pos;\r
- return -1;\r
- } else {\r
- codepoint = 0;\r
- mb_remain = mb_size = 0;\r
- overlong = FALSE;\r
- }\r
- }\r
- else\r
- ret++;\r
- } else if (codepoint < 0x110000) {\r
- /* surrogate pair */\r
- ret += 2;\r
- } else {\r
- /* invalid utf-8 sequence (excess) */\r
- if (error) {\r
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);\r
- if (items_read)\r
- *items_read = in_pos;\r
- return -1;\r
- } else {\r
- codepoint = 0;\r
- mb_remain = mb_size = 0;\r
- }\r
- }\r
- }\r
- } else {\r
- /* invalid utf-8 sequence */\r
- if (error) {\r
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);\r
- if (items_read)\r
- *items_read = in_pos;\r
- return -1;\r
- } else {\r
- codepoint = 0;\r
- mb_remain = mb_size = 0;\r
- }\r
- }\r
- }\r
- }\r
-\r
- if (items_read)\r
- *items_read = in_pos;\r
- return ret;\r
-}\r
-\r
-gchar*\r
-g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)\r
-{\r
- /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),\r
- but error check is always done at utf16_to_utf8_len() so that\r
- the conversion core below simply resets erroreous bits */\r
- glong utf8_len;\r
- gchar *ret;\r
- glong in_pos, out_pos;\r
- gunichar2 ch;\r
- guint32 codepoint;\r
- gboolean surrogate;\r
-\r
- in_pos = 0;\r
- out_pos = 0;\r
- surrogate = FALSE;\r
-\r
- utf8_len = utf16_to_utf8_len (str, len, items_read, error);\r
- if (error)\r
- if (*error)\r
- return NULL;\r
+ return NULL;
+
+ ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
+
+ for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+ ch = (guchar) str [in_pos];
+ if (mb_size == 0) {
+ if (ch < 0x80)
+ ret [out_pos++] = ch;
+ else if ((ch & 0xE0) == 0xC0) {
+ codepoint = ch & 0x1F;
+ mb_size = 2;
+ } else if ((ch & 0xF0) == 0xE0) {
+ codepoint = ch & 0x0F;
+ mb_size = 3;
+ } else if ((ch & 0xF8) == 0xF0) {
+ codepoint = ch & 7;
+ mb_size = 4;
+ } else if ((ch & 0xFC) == 0xF8) {
+ codepoint = ch & 3;
+ mb_size = 5;
+ } else if ((ch & 0xFE) == 0xFC) {
+ codepoint = ch & 3;
+ mb_size = 6;
+ } else {
+ /* invalid utf-8 sequence */
+ codepoint = 0;
+ mb_remain = mb_size = 0;
+ }
+ if (mb_size > 1)
+ mb_remain = mb_size - 1;
+ } else {
+ if ((ch & 0xC0) == 0x80) {
+ codepoint = (codepoint << 6) | (ch & 0x3F);
+ if (--mb_remain == 0) {
+ /* multi byte character is fully consumed now. */
+ if (codepoint < 0x10000) {
+ ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
+ } else if (codepoint < 0x110000) {
+ /* surrogate pair */
+ codepoint -= 0x10000;
+ ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
+ ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
+ } else {
+ /* invalid utf-8 sequence (excess) */
+ codepoint = 0;
+ mb_remain = 0;
+ }
+ mb_size = 0;
+ }
+ } else {
+ /* invalid utf-8 sequence */
+ codepoint = 0;
+ mb_remain = mb_size = 0;
+ }
+ }
+ }
+
+ ret [out_pos] = 0;
+ if (items_written)
+ *items_written = out_pos;
+ return ret;
+}
+
+static glong
+utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
+{
+ /* It is almost identical to UTF8Encoding.GetCharCount() */
+ guchar ch, mb_size, mb_remain;
+ gboolean overlong;
+ guint32 codepoint;
+ glong in_pos, ret;
+
+ mb_size = 0;
+ mb_remain = 0;
+ overlong = 0;
+ in_pos = 0;
+ ret = 0;
+
+ for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+ ch = str [in_pos];
+ if (mb_size == 0) {
+ if (ch < 0x80)
+ ret++;
+ else if ((ch & 0xE0) == 0xC0) {
+ codepoint = ch & 0x1F;
+ mb_size = 2;
+ } else if ((ch & 0xF0) == 0xE0) {
+ codepoint = ch & 0x0F;
+ mb_size = 3;
+ } else if ((ch & 0xF8) == 0xF0) {
+ codepoint = ch & 7;
+ mb_size = 4;
+ } else if ((ch & 0xFC) == 0xF8) {
+ codepoint = ch & 3;
+ mb_size = 5;
+ } else if ((ch & 0xFE) == 0xFC) {
+ codepoint = ch & 3;
+ mb_size = 6;
+ } else {
+ /* invalid utf-8 sequence */
+ if (error) {
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
+ if (items_read)
+ *items_read = in_pos;
+ return -1;
+ } else {
+ codepoint = 0;
+ mb_remain = mb_size = 0;
+ }
+ }
+ if (mb_size > 1)
+ mb_remain = mb_size - 1;
+ } else {
+ if ((ch & 0xC0) == 0x80) {
+ codepoint = (codepoint << 6) | (ch & 0x3F);
+ if (--mb_remain == 0) {
+ /* multi byte character is fully consumed now. */
+ if (codepoint < 0x10000) {
+ switch (mb_size) {
+ case 2:
+ overlong = codepoint < 0x7F;
+ break;
+ case 3:
+ overlong = codepoint < 0x7FF;
+ break;
+ case 4:
+ overlong = codepoint < 0xFFFF;
+ break;
+ case 5:
+ overlong = codepoint < 0x1FFFFF;
+ break;
+ case 6:
+ overlong = codepoint < 0x03FFFFFF;
+ break;
+ }
+ if (overlong) {
+ /* invalid utf-8 sequence (overlong) */
+ if (error) {
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
+ if (items_read)
+ *items_read = in_pos;
+ return -1;
+ } else {
+ codepoint = 0;
+ mb_remain = 0;
+ overlong = FALSE;
+ }
+ }
+ else
+ ret++;
+ } else if (codepoint < 0x110000) {
+ /* surrogate pair */
+ ret += 2;
+ } else {
+ /* invalid utf-8 sequence (excess) */
+ if (error) {
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
+ if (items_read)
+ *items_read = in_pos;
+ return -1;
+ } else {
+ codepoint = 0;
+ mb_remain = 0;
+ }
+ }
+ mb_size = 0;
+ }
+ } else {
+ /* invalid utf-8 sequence */
+ if (error) {
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
+ if (items_read)
+ *items_read = in_pos;
+ return -1;
+ } else {
+ codepoint = 0;
+ mb_remain = mb_size = 0;
+ }
+ }
+ }
+ }
+
+ if (items_read)
+ *items_read = in_pos;
+ return ret;
+}
+
+gchar*
+g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+ /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
+ but error check is always done at utf16_to_utf8_len() so that
+ the conversion core below simply resets erroreous bits */
+ glong utf8_len;
+ gchar *ret;
+ glong in_pos, out_pos;
+ gunichar2 ch;
+ guint32 codepoint = 0;
+ gboolean surrogate;
+
+ in_pos = 0;
+ out_pos = 0;
+ surrogate = FALSE;
+
+ if (items_written)
+ *items_written = 0;
+ utf8_len = utf16_to_utf8_len (str, len, items_read, error);
+ if (error)
+ if (*error)
+ return NULL;
if (utf8_len < 0)
- return NULL;\r
-\r
- ret = g_malloc (utf8_len * sizeof (gchar));\r
-\r
- while (len < 0 ? str [in_pos] : in_pos < len) {\r
- ch = str [in_pos];\r
- if (surrogate) {\r
- surrogate = 0;\r
- if (ch >= 0xDC00 && ch <= 0xDFFF)\r
- codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);\r
- else\r
- /* invalid surrogate pair */\r
- continue;\r
- } else {\r
- /* fast path optimization */\r
- if (ch < 0x80) {\r
- for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {\r
- if (str [in_pos] < 0x80)\r
- ret [out_pos++] = str [in_pos];\r
- else\r
- break;\r
- }\r
- continue;\r
- }\r
- else if (ch >= 0xD800 && ch <= 0xDBFF)\r
- surrogate = ch;\r
- else if (ch >= 0xDC00 && ch <= 0xDFFF) {\r
- /* invalid surrogate pair */\r
- continue;\r
- }\r
- else\r
- codepoint = ch;\r
- }\r
- in_pos++;\r
-\r
- if (codepoint < 0x80)\r
- ret [out_pos++] = (gchar) codepoint;\r
- else if (codepoint < 0x0800) {\r
- ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));\r
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));\r
- } else if (codepoint < 0x10000) {\r
- ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));\r
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));\r
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));\r
- } else {\r
- ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));\r
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));\r
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));\r
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));\r
- }\r
- }\r
-\r
- if (items_written)\r
- *items_written = out_pos;\r
- return ret;\r
-}\r
-\r
-glong\r
-utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)\r
-{\r
- glong ret, in_pos;\r
- gunichar2 ch;\r
- gboolean surrogate;\r
-\r
- ret = 0;\r
- in_pos = 0;\r
- surrogate = FALSE;\r
-\r
- while (len < 0 ? str [in_pos] : in_pos < len) {\r
- ch = str [in_pos];\r
- if (surrogate) {\r
- if (ch >= 0xDC00 && ch <= 0xDFFF) {\r
- ret += 4;\r
- } else {\r
- /* invalid surrogate pair */\r
- if (error) {\r
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);\r
- if (items_read)\r
- *items_read = in_pos;\r
- return -1;\r
- } /* otherwise just ignore. */\r
- }\r
- surrogate = FALSE;\r
- } else {\r
- /* fast path optimization */\r
- if (ch < 0x80) {\r
- for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {\r
- if (str [in_pos] < 0x80)\r
- ++ret;\r
- else\r
- break;\r
- }\r
- continue;\r
- }\r
- else if (ch < 0x0800)\r
- ret += 2;\r
- else if (ch >= 0xD800 && ch <= 0xDBFF)\r
- surrogate = TRUE;\r
- else if (ch >= 0xDC00 && ch <= 0xDFFF) {\r
- /* invalid surrogate pair */\r
- if (error) {\r
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);\r
- if (items_read)\r
- *items_read = in_pos;\r
- return -1;\r
- } /* otherwise just ignore. */\r
- }\r
- else\r
- ret += 3;\r
- }\r
- in_pos++;\r
- }\r
-\r
- if (items_read)\r
- *items_read = in_pos;\r
- return ret;\r
-}\r
+ return NULL;
+
+ ret = g_malloc ((1+utf8_len) * sizeof (gchar));
+
+ while (len < 0 ? str [in_pos] : in_pos < len) {
+ ch = str [in_pos];
+ if (surrogate) {
+ surrogate = 0;
+ if (ch >= 0xDC00 && ch <= 0xDFFF)
+ codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
+ else
+ /* invalid surrogate pair */
+ continue;
+ } else {
+ /* fast path optimization */
+ if (ch < 0x80) {
+ for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+ if (str [in_pos] < 0x80)
+ ret [out_pos++] = (gchar)(str [in_pos]);
+ else
+ break;
+ }
+ continue;
+ }
+ else if (ch >= 0xD800 && ch <= 0xDBFF)
+ surrogate = ch;
+ else if (ch >= 0xDC00 && ch <= 0xDFFF) {
+ /* invalid surrogate pair */
+ continue;
+ }
+ else
+ codepoint = ch;
+ }
+ in_pos++;
+
+ if (codepoint < 0x80)
+ ret [out_pos++] = (gchar) codepoint;
+ else if (codepoint < 0x0800) {
+ ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
+ ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
+ } else if (codepoint < 0x10000) {
+ ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
+ ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
+ ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
+ } else {
+ ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
+ ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
+ ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
+ ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
+ }
+ }
+ ret [out_pos] = 0;
+
+ if (items_written)
+ *items_written = out_pos;
+ return ret;
+}
+
+static glong
+utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
+{
+ glong ret, in_pos;
+ gunichar2 ch;
+ gboolean surrogate;
+
+ ret = 0;
+ in_pos = 0;
+ surrogate = FALSE;
+
+ while (len < 0 ? str [in_pos] : in_pos < len) {
+ ch = str [in_pos];
+ if (surrogate) {
+ if (ch >= 0xDC00 && ch <= 0xDFFF) {
+ ret += 4;
+ } else {
+ /* invalid surrogate pair */
+ if (error) {
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
+ if (items_read)
+ *items_read = in_pos;
+ return -1;
+ } /* otherwise just ignore. */
+ }
+ surrogate = FALSE;
+ } else {
+ /* fast path optimization */
+ if (ch < 0x80) {
+ for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+ if (str [in_pos] < 0x80)
+ ++ret;
+ else
+ break;
+ }
+ continue;
+ }
+ else if (ch < 0x0800)
+ ret += 2;
+ else if (ch >= 0xD800 && ch <= 0xDBFF)
+ surrogate = TRUE;
+ else if (ch >= 0xDC00 && ch <= 0xDFFF) {
+ /* invalid surrogate pair */
+ if (error) {
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
+ if (items_read)
+ *items_read = in_pos;
+ return -1;
+ } /* otherwise just ignore. */
+ }
+ else
+ ret += 3;
+ }
+ in_pos++;
+ }
+
+ if (items_read)
+ *items_read = in_pos;
+ return ret;
+}