* Atsushi Enomoto <atsushi@ximian.com>
*
* (C) 2006 Novell, Inc.
+ * Copyright 2012 Xamarin Inc
*/
#include <stdio.h>
#include <glib.h>
-static gpointer error_quark = "ConvertError";
-
-static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
-static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
-
-gpointer
-g_convert_error_quark (void)
-{
- return error_quark;
-}
+/*
+ * Index into the table below with the first byte of a UTF-8 sequence to get
+ * the total length in bytes of that sequence, counting the first byte itself
+ * (ASCII bytes map to 1).
+ *
+ * Note that *legal* UTF-8 sequences are at most 4 bytes long. The 5 and
+ * 6-byte entries are left as-is for anyone who may want to do such
+ * conversion, which was allowed in earlier algorithms. Bytes that cannot
+ * start a sequence (0x80-0xBF, 0xFE and 0xFF) map to 1.
+*/
+const guchar g_utf8_jump_table[256] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
static gchar *
utf8_case_conv (const gchar *str, gssize len, gboolean upper)
glong i, ulen;
gchar *utf8;
- //ustr = g_utf8_to_ucs4 (str, (glong) len, NULL, &ulen, NULL);
ustr = g_utf8_to_ucs4_fast (str, (glong) len, &ulen);
for (i = 0; i < ulen; i++)
ustr[i] = upper ? g_unichar_toupper (ustr[i]) : g_unichar_tolower (ustr[i]);
return utf8_case_conv (str, len, FALSE);
}
-gunichar
-g_utf8_get_char_validated (const gchar *str, gssize max_len)
+/*
+ * utf8_validate:
+ * @inptr: first byte of a single UTF-8 sequence
+ * @len: number of bytes of the sequence that may be inspected
+ *
+ * Validates the first @len bytes of one UTF-8 sequence. Only the bytes
+ * inptr[0] .. inptr[len - 1] are read. @len may be shorter than the
+ * length announced by the lead byte; in that case only the visible
+ * prefix is checked. Any @len outside 1..4 is rejected.
+ *
+ * Returns TRUE if the inspected bytes are acceptable, FALSE otherwise.
+ */
+static gboolean
+utf8_validate (const unsigned char *inptr, size_t len)
 {
- gushort extra_bytes = 0;
-
- if (max_len == 0)
- return -2;
-
- extra_bytes = g_trailingBytesForUTF8 [(unsigned char) *str];
-
- if (max_len <= extra_bytes)
- return -2;
-
- if (g_utf8_validate (str, max_len, NULL))
- return g_utf8_get_char (str);
+ /* ptr starts one past the last byte and walks backwards */
+ const unsigned char *ptr = inptr + len;
+ unsigned char c;
- return -1;
-}
-
-static glong
-utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
-{
- /* It is almost identical to UTF8Encoding.GetCharCount() */
- guchar ch, mb_size, mb_remain;
- gboolean overlong;
- guint32 codepoint;
- glong in_pos, ret;
-
- if (len < 0)
- len = (glong) strlen (str);
-
- in_pos = 0;
- ret = 0;
-
- /* Common case */
- for (in_pos = 0; in_pos < len && (guchar) str [in_pos] < 0x80; in_pos++)
- ret ++;
-
- if (in_pos == len) {
- if (items_read)
- *items_read = in_pos;
- return ret;
- }
-
- mb_size = 0;
- mb_remain = 0;
- overlong = 0;
-
- for (; in_pos < len; in_pos++) {
- ch = str [in_pos];
- if (mb_size == 0) {
- if (ch < 0x80)
- ret++;
- else if ((ch & 0xE0) == 0xC0) {
- codepoint = ch & 0x1F;
- mb_size = 2;
- } else if ((ch & 0xF0) == 0xE0) {
- codepoint = ch & 0x0F;
- mb_size = 3;
- } else if ((ch & 0xF8) == 0xF0) {
- codepoint = ch & 7;
- mb_size = 4;
- } else if ((ch & 0xFC) == 0xF8) {
- codepoint = ch & 3;
- mb_size = 5;
- } else if ((ch & 0xFE) == 0xFC) {
- codepoint = ch & 3;
- mb_size = 6;
- } else {
- /* invalid utf-8 sequence */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } else {
- codepoint = 0;
- mb_remain = mb_size = 0;
- }
- }
- if (mb_size > 1)
- mb_remain = mb_size - 1;
- } else {
- if ((ch & 0xC0) == 0x80) {
- codepoint = (codepoint << 6) | (ch & 0x3F);
- if (--mb_remain == 0) {
- /* multi byte character is fully consumed now. */
- if (codepoint < 0x10000) {
- switch (mb_size) {
- case 2:
- overlong = codepoint < 0x7F;
- break;
- case 3:
- overlong = codepoint < 0x7FF;
- break;
- case 4:
- overlong = codepoint < 0xFFFF;
- break;
- case 5:
- overlong = codepoint < 0x1FFFFF;
- break;
- case 6:
- overlong = codepoint < 0x03FFFFFF;
- break;
- }
- if (overlong) {
- /* invalid utf-8 sequence (overlong) */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } else {
- codepoint = 0;
- mb_remain = 0;
- overlong = FALSE;
- }
- }
- else
- ret++;
- } else if (codepoint < 0x110000) {
- /* surrogate pair */
- ret += 2;
- } else {
- /* invalid utf-8 sequence (excess) */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } else {
- codepoint = 0;
- mb_remain = 0;
- }
- }
- mb_size = 0;
- }
- } else {
- /* invalid utf-8 sequence */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } else {
- codepoint = 0;
- mb_remain = mb_size = 0;
- }
- }
+ /* Everything falls through when TRUE... */
+ switch (len) {
+ default:
+ /* lengths other than 1..4 are never valid */
+ return FALSE;
+ case 4:
+ /* fourth byte must be a continuation byte (0x80..0xBF) */
+ if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+ return FALSE;
+
+ /* reject xx 8F/9F/AF/BF BF BE/BF, i.e. code points ending in FFFE or FFFF */
+ if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) {
+ if (ptr[-2] == 0x8F || ptr[-2] == 0x9F ||
+ ptr[-2] == 0xAF || ptr[-2] == 0xBF)
+ return FALSE;
 }
- }
-
- if (items_read)
- *items_read = in_pos;
- return ret;
-}
-
-gunichar2*
-g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
-{
- /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
- but error check is always done at utf8_to_utf16_len() so that
- the conversion core below simply resets erroreous bits */
- glong utf16_len;
- gunichar2 *ret;
- guchar ch, mb_size, mb_remain;
- guint32 codepoint;
- glong in_pos, out_pos;
-
- utf16_len = 0;
- mb_size = 0;
- mb_remain = 0;
- in_pos = 0;
- out_pos = 0;
-
- if (error)
- *error = NULL;
-
- if (len < 0)
- len = (glong) strlen (str);
-
- if (items_read)
- *items_read = 0;
- if (items_written)
- *items_written = 0;
- utf16_len = utf8_to_utf16_len (str, len, items_read, error);
- if (error)
- if (*error)
- return NULL;
- if (utf16_len < 0)
- return NULL;
-
- ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
-
- /* Common case */
- for (in_pos = 0; in_pos < len; in_pos++) {
- ch = (guchar) str [in_pos];
-
- if (ch >= 0x80)
+ /* fall through */
+ case 3:
+ /* third byte must be a continuation byte */
+ if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+ return FALSE;
+ /* fall through */
+ case 2:
+ /* second byte must be a continuation byte; c keeps its value below */
+ if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+ return FALSE;
+
+ /* no fall-through in this inner switch */
+ switch (*inptr) {
+ case 0xE0: if (c < 0xA0) return FALSE; break; /* overlong 3-byte form */
+ case 0xED: if (c > 0x9F) return FALSE; break; /* UTF-16 surrogate range */
+ case 0xEF:
+ /* Both checks read the third byte (ptr[1]); skip them when the
+ * caller exposed only 2 bytes so nothing past @len is read. */
+ if (len >= 3) {
+ if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE;
+ if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE;
+ }
 break;
- ret [out_pos++] = ch;
- }
-
- for (; in_pos < len; in_pos++) {
- ch = (guchar) str [in_pos];
- if (mb_size == 0) {
- if (ch < 0x80)
- ret [out_pos++] = ch;
- else if ((ch & 0xE0) == 0xC0) {
- codepoint = ch & 0x1F;
- mb_size = 2;
- } else if ((ch & 0xF0) == 0xE0) {
- codepoint = ch & 0x0F;
- mb_size = 3;
- } else if ((ch & 0xF8) == 0xF0) {
- codepoint = ch & 7;
- mb_size = 4;
- } else if ((ch & 0xFC) == 0xF8) {
- codepoint = ch & 3;
- mb_size = 5;
- } else if ((ch & 0xFE) == 0xFC) {
- codepoint = ch & 3;
- mb_size = 6;
- } else {
- /* invalid utf-8 sequence */
- codepoint = 0;
- mb_remain = mb_size = 0;
- }
- if (mb_size > 1)
- mb_remain = mb_size - 1;
- } else {
- if ((ch & 0xC0) == 0x80) {
- codepoint = (codepoint << 6) | (ch & 0x3F);
- if (--mb_remain == 0) {
- /* multi byte character is fully consumed now. */
- if (codepoint < 0x10000) {
- ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
- } else if (codepoint < 0x110000) {
- /* surrogate pair */
- codepoint -= 0x10000;
- ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
- ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
- } else {
- /* invalid utf-8 sequence (excess) */
- codepoint = 0;
- mb_remain = 0;
- }
- mb_size = 0;
- }
- } else {
- /* invalid utf-8 sequence */
- codepoint = 0;
- mb_remain = mb_size = 0;
- }
+ case 0xF0: if (c < 0x90) return FALSE; break; /* overlong 4-byte form */
+ case 0xF4: if (c > 0x8F) return FALSE; break; /* above U+10FFFF */
+ default: if (c < 0x80) return FALSE; break;
 }
+ /* fall through */
+ case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE; /* stray continuation byte or overlong lead */
 }
-
- ret [out_pos] = 0;
- if (items_written)
- *items_written = out_pos;
- return ret;
+
+ /* lead bytes above 0xF4 can only encode values beyond U+10FFFF */
+ if (*inptr > 0xF4)
+ return FALSE;
+
+ return TRUE;
 }
-gchar*
-g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
+/**
+ * g_utf8_validate:
+ * @str: a utf-8 encoded string
+ * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
+ * @end: output parameter to mark the end of the valid input
+ *
+ * Checks @str for being valid UTF-8. When @max_len is negative, @str
+ * is read up to its null terminator. Otherwise exactly @max_len bytes
+ * are expected, and a nul-byte found before @max_len is reached makes
+ * the input invalid. Every sequence is checked with utf8_validate();
+ * a sequence cut short by @max_len or by the terminator is invalid.
+ * A @max_len of 0 is reported as invalid.
+ *
+ * If @end is non-NULL it is set to the first byte that was not
+ * accepted, or to the end of the input when everything is valid.
+ *
+ * Return value: %TRUE if @str is valid or %FALSE otherwise.
+ **/
+gboolean
+g_utf8_validate (const gchar *str, gssize max_len, const gchar **end)
 {
- /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
- but error check is always done at utf16_to_utf8_len() so that
- the conversion core below simply resets erroreous bits */
- glong utf8_len;
- gchar *ret;
- glong in_pos, out_pos;
- gunichar2 ch;
- guint32 codepoint = 0;
- gboolean surrogate;
-
- in_pos = 0;
- out_pos = 0;
- surrogate = FALSE;
-
- if (items_read)
- *items_read = 0;
- if (items_written)
- *items_written = 0;
- utf8_len = utf16_to_utf8_len (str, len, items_read, error);
- if (error)
- if (*error)
- return NULL;
- if (utf8_len < 0)
- return NULL;
-
- ret = g_malloc ((1+utf8_len) * sizeof (gchar));
-
- while (len < 0 ? str [in_pos] : in_pos < len) {
- ch = str [in_pos];
- if (surrogate) {
- if (ch >= 0xDC00 && ch <= 0xDFFF) {
- codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
- surrogate = 0;
- } else {
- surrogate = 0;
- /* invalid surrogate pair */
- ++in_pos;
- continue;
- }
- } else {
- /* fast path optimization */
- if (ch < 0x80) {
- for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
- if (str [in_pos] < 0x80)
- ret [out_pos++] = (gchar)(str [in_pos]);
- else
- break;
- }
- continue;
- }
- else if (ch >= 0xD800 && ch <= 0xDBFF)
- surrogate = ch;
- else if (ch >= 0xDC00 && ch <= 0xDFFF) {
- ++in_pos;
- /* invalid surrogate pair */
- continue;
+ guchar *inptr = (guchar *) str;
+ gboolean valid = TRUE;
+ guint length, min, i;
+ gssize n = 0;
+
+ if (max_len == 0) {
+ /* keep the historical FALSE result, but never leave *end unset */
+ if (end != NULL)
+ *end = str;
+ return FALSE;
+ }
+
+ if (max_len < 0) {
+ while (*inptr != 0) {
+ length = g_utf8_jump_table[*inptr];
+
+ /* utf8_validate() reads from the end of the sequence backwards, so
+ * make sure the terminator is not inside it before calling */
+ for (i = 1; i < length && inptr[i] != 0; i++)
+ ;
+
+ if (i < length || !utf8_validate (inptr, length)) {
+ valid = FALSE;
+ break;
 }
- else
- codepoint = ch;
- }
- in_pos++;
-
- if (surrogate != 0)
- continue;
- if (codepoint < 0x80)
- ret [out_pos++] = (gchar) codepoint;
- else if (codepoint < 0x0800) {
- ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
- } else if (codepoint < 0x10000) {
- ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
- } else {
- ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
+
+ inptr += length;
 }
- }
- ret [out_pos] = 0;
-
- if (items_written)
- *items_written = out_pos;
- return ret;
-}
-
-static glong
-utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
-{
- glong ret, in_pos;
- gunichar2 ch;
- gboolean surrogate;
-
- ret = 0;
- in_pos = 0;
- surrogate = FALSE;
-
- while (len < 0 ? str [in_pos] : in_pos < len) {
- ch = str [in_pos];
- if (surrogate) {
- if (ch >= 0xDC00 && ch <= 0xDFFF) {
- ret += 4;
- } else {
- /* invalid surrogate pair */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } /* otherwise just ignore. */
+ } else {
+ while (n < max_len) {
+ if (*inptr == 0) {
+ /* Note: return FALSE if we encounter nul-byte
+ * before max_len is reached. */
+ valid = FALSE;
+ break;
 }
- surrogate = FALSE;
- } else {
- /* fast path optimization */
- if (ch < 0x80) {
- for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
- if (str [in_pos] < 0x80)
- ++ret;
- else
- break;
- }
- continue;
+
+ /* only look at the bytes that are still inside max_len */
+ length = g_utf8_jump_table[*inptr];
+ min = MIN (length, max_len - n);
+
+ if (!utf8_validate (inptr, min)) {
+ valid = FALSE;
+ break;
 }
- else if (ch < 0x0800)
- ret += 2;
- else if (ch >= 0xD800 && ch <= 0xDBFF)
- surrogate = TRUE;
- else if (ch >= 0xDC00 && ch <= 0xDFFF) {
- /* invalid surrogate pair */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } /* otherwise just ignore. */
+
+ /* a well-formed prefix that is cut off by max_len is still invalid */
+ if (min < length) {
+ valid = FALSE;
+ break;
 }
- else
- ret += 3;
+
+ inptr += length;
+ n += length;
 }
- in_pos++;
 }
-
- if (items_read)
- *items_read = in_pos;
- return ret;
+
+ if (end != NULL)
+ *end = (gchar *) inptr;
+
+ return valid;
 }
-gchar *
-g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
+/**
+ * g_utf8_get_char_validated:
+ * @str: pointer to the first byte of a UTF-8 sequence
+ * @max_len: number of bytes that may be read, or a negative value for no limit
+ *
+ * Decodes the first character of @str after checking it with
+ * utf8_validate().
+ *
+ * Return value: the decoded character; (gunichar) -1 if the sequence is
+ * invalid; (gunichar) -2 if @max_len is 0 or if the sequence is a
+ * well-formed prefix that is cut off by @max_len.
+ **/
+gunichar
+g_utf8_get_char_validated (const gchar *str, gssize max_len)
 {
- gchar *outbuf, *outptr;
- glong nwritten = 0;
- glong i;
- gint n;
+ unsigned char *inptr = (unsigned char *) str;
+ gunichar u;
+ int n, i;
- if (len == -1) {
- for (i = 0; str[i] != 0; i++) {
- if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- "Invalid sequence in conversion input");
-
- if (items_read)
- *items_read = i;
-
- return NULL;
- }
-
- nwritten += n;
- }
+ if (max_len == 0)
+ return -2;
+
+ /* read the lead byte only after we know at least one byte is available */
+ u = *inptr;
+
+ if (u < 0x80) {
+ /* simple ascii case */
+ return u;
+ } else if (u < 0xc2) {
+ /* continuation byte or overlong 2-byte lead */
+ return -1;
+ } else if (u < 0xe0) {
+ u &= 0x1f;
+ n = 2;
+ } else if (u < 0xf0) {
+ u &= 0x0f;
+ n = 3;
+ } else if (u < 0xf8) {
+ u &= 0x07;
+ n = 4;
+ } else if (u < 0xfc) {
+ u &= 0x03;
+ n = 5;
+ } else if (u < 0xfe) {
+ u &= 0x01;
+ n = 6;
 } else {
- for (i = 0; i < len; i++) {
- if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- "Invalid sequence in conversion input");
-
- if (items_read)
- *items_read = i;
-
- return NULL;
- }
-
- nwritten += n;
- }
+ return -1;
 }
- outptr = outbuf = g_malloc (nwritten + 1);
- if (len == -1) {
- for (i = 0; str[i] != 0; i++)
- outptr += g_unichar_to_utf8 (str[i], outptr);
+ if (max_len > 0) {
+ /* validate only the bytes that lie inside max_len */
+ if (!utf8_validate (inptr, MIN (max_len, n)))
+ return -1;
+
+ /* valid so far, but the sequence is incomplete */
+ if (max_len < n)
+ return -2;
 } else {
- for (i = 0; i < len; i++)
- outptr += g_unichar_to_utf8 (str[i], outptr);
+ if (!utf8_validate (inptr, n))
+ return -1;
 }
- *outptr = '\0';
-
- if (items_written)
- *items_written = nwritten;
- if (items_read != 0)
- *items_read = i;
+ /* fold the low six bits of each continuation byte into the result */
+ for (i = 1; i < n; i++)
+ u = (u << 6) | (*++inptr ^ 0x80);
- return outbuf;
+ return u;
 }
-static glong
-g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
+/**
+ * g_utf8_strlen:
+ * @str: a UTF-8 encoded string
+ * @max_len: maximum number of bytes to examine, or a negative value to
+ * read up to the null terminator
+ *
+ * Counts characters using the lead-byte length table; the sequences are
+ * not validated. A character whose announced length would cross
+ * @max_len is not counted. Scanning never steps over a nul-byte and
+ * never reads at or beyond offset @max_len.
+ *
+ * Return value: the number of characters counted.
+ **/
+glong
+g_utf8_strlen (const gchar *str, gssize max_len)
 {
- glong retlen = 0;
- glong errindex = 0;
- const gunichar *lstr = str;
-
- if (!str)
+ const guchar *inptr = (const guchar *) str;
+ glong clen = 0, len = 0, n, i;
+
+ if (max_len == 0)
 return 0;
-
- while (*lstr != '\0' && len--) {
- gunichar ch;
- ch = *lstr++;
- if (ch <= 0x0000FFFF) {
- if (ch >= 0xD800 && ch <= 0xDFFF) {
- errindex = (glong)(lstr - str)-1;
- if (error)
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- "Invalid sequence in conversion input");
- if (items_read)
- *items_read = errindex;
- return 0;
- } else {
- retlen++;
- }
- } else if (ch > 0x10FFFF) {
- errindex = (glong)(lstr - str)-1;
- if (error)
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- "Character out of range for UTF-16");
- if (items_read)
- *items_read = errindex;
- return 0;
-
- } else {
- retlen+=2;
- }
- }
-
- if (items_read)
- *items_read = (glong)(lstr - str);
- return retlen;
-}
-
-gunichar2*
-g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
-{
- glong allocsz;
- gunichar2 *retstr = 0;
- gunichar2 *retch = 0;
- glong nwritten = 0;
- GError *lerror =0 ;
-
- allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
-
- if (!lerror) {
- retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
- retstr[allocsz] = '\0';
-
- while (*str != '\0' && len--) {
- gunichar ch;
- ch = *str++;
- if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
- *retch++ = (gunichar2)ch;
- nwritten ++;
- } else {
- ch -= 0x0010000UL;
- *retch++ = (gunichar2)((ch >> 10) + 0xD800);
- *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
- nwritten +=2;
- }
+
+ if (max_len < 0) {
+ while (*inptr) {
+ n = g_utf8_jump_table[*inptr];
+
+ /* advance over the sequence, but stop at a terminator that cuts it short */
+ for (i = 1; i < n && inptr[i] != 0; i++)
+ ;
+
+ inptr += i;
+ len++;
 }
- }
-
- if (items_written)
- *items_written = nwritten;
- if (error)
- *error = lerror;
-
- return retstr;
-}
-
-static glong
-g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
-{
- glong retlen = 0;
- glong errindex = 0;
- const gunichar2 *lstr = str;
- gunichar2 ch,ch2;
-
- if (!str)
- return 0;
-
- while (*lstr != '\0' && len--) {
- ch = *lstr++;
- if (ch >= 0xD800 && ch <= 0xDBFF) {
- if (!len--) {
- lstr--;
+ } else {
+ /* bound the loop by bytes consumed so *inptr is never read at max_len */
+ while (clen < max_len && *inptr) {
+ n = g_utf8_jump_table[*inptr];
+ if ((clen + n) > max_len)
 break;
- }
- ch2 = *lstr;
- if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
- lstr++;
- } else {
- errindex = (glong)(lstr - str);
- if (error)
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- "Invalid sequence in conversion input");
- if (items_read)
- *items_read = errindex;
- return 0;
- }
- } else {
- if (ch >= 0xDC00 && ch <= 0xDFFF) {
- errindex = (glong)(lstr - str)-1;
- if (error)
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- "Invalid sequence in conversion input");
- if (items_read)
- *items_read = errindex;
- return 0;
- }
+
+ /* same terminator-aware advance as in the unbounded branch */
+ for (i = 1; i < n && inptr[i] != 0; i++)
+ ;
+
+ inptr += i;
+ clen += i;
+ len++;
 }
- retlen++;
 }
-
- if (items_read)
- *items_read = (glong)(lstr - str);
-
- return retlen;
+
+ return len;
 }
-gunichar*
-g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
+/**
+ * g_utf8_get_char:
+ * @src: pointer to the first byte of a UTF-8 sequence
+ *
+ * Decodes one character without validating it: the lead byte alone
+ * selects a length of 1 to 6 bytes, and that many bytes are read.
+ *
+ * Return value: the decoded character.
+ **/
+gunichar
+g_utf8_get_char (const gchar *src)
 {
- glong allocsz;
- gunichar *retstr = 0;
- gunichar *retch = 0;
- glong nwritten = 0;
- GError *lerror =0 ;
- gunichar ch,ch2;
-
- allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
-
- if (!lerror) {
- retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
- retstr[allocsz] = '\0';
- nwritten = allocsz;
-
- while (*str != '\0' && allocsz--) {
- ch = *str++;
- if (ch >= 0xD800 && ch <= 0xDBFF) {
- ch2 = *str++;
- ch = ((ch - (gunichar)0xD800) << 10)
- + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
- }
- *retch++ = ch;
- }
+ const unsigned char *bytes = (const unsigned char *) src;
+ gunichar ch = bytes[0];
+ int total, k;
+
+ /* plain ASCII needs no decoding */
+ if (ch < 0x80)
+ return ch;
+
+ /* pick the sequence length and keep only the payload bits of the lead byte */
+ if (ch >= 0xfc) {
+ total = 6;
+ ch &= 0x01;
+ } else if (ch >= 0xf8) {
+ total = 5;
+ ch &= 0x03;
+ } else if (ch >= 0xf0) {
+ total = 4;
+ ch &= 0x07;
+ } else if (ch >= 0xe0) {
+ total = 3;
+ ch &= 0x0f;
+ } else {
+ total = 2;
+ ch &= 0x1f;
 }
-
- if (items_written)
- *items_written = nwritten;
- if (error)
- *error = lerror;
-
- return retstr;
+
+ /* append the remaining bytes, six bits at a time */
+ for (k = 1; k < total; k++)
+ ch = (ch << 6) | (bytes[k] ^ 0x80);
+
+ return ch;
 }
gchar *
{
while (p > str) {
p--;
- if ((*p && 0xc0) != 0xb0)
+ if ((*p & 0xc0) != 0xb0)
return (gchar *)p;
}
return NULL;
return offset * sign;
}
-
-gunichar*
-g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
-{
- gunichar* ucs4;
- int ucs4_index;
- const char *p;
- int mb_size;
- gunichar codepoint;
-
- g_return_val_if_fail (str != NULL, NULL);
-
- if (len < 0) {
- /* we need to find the length of str, as len < 0 means it must be 0 terminated */
-
- len = 0;
- p = str;
- while (*p) {
- len ++;
- p = g_utf8_next_char(p);
- }
- }
-
- ucs4 = g_malloc (sizeof(gunichar)*len);
- if (items_written)
- *items_written = len;
-
- p = str;
- ucs4_index = 0;
- while (len) {
- guint8 c = *p++;
-
- if (c < 0x80) {
- mb_size = 1;
- }
- else if (c < 0xe0) {
- c &= 0x1f;
-
- mb_size = 2;
- }
- else if (c < 0xf0) {
- c &= 0x0f;
- mb_size = 3;
- }
- else if (c < 0xf8) {
- c &= 0x07;
- mb_size = 4;
- }
- else if (c < 0xfc) {
- c &= 0x03;
- mb_size = 5;
- }
- else if (c < 0xfe) {
- c &= 0x01;
- mb_size = 6;
- }
-
- codepoint = c;
- while (--mb_size) {
- codepoint = (codepoint << 6) | ((*p) & 0x3f);
- p++;
- }
-
- ucs4[ucs4_index++] = codepoint;
- len --;
- }
-
- return ucs4;
-}
-
-/**
- * from http://home.tiscali.nl/t876506/utf8tbl.html
- *
- * From Unicode UCS-4 to UTF-8:
- * Start with the Unicode number expressed as a decimal number and call this ud.
- *
- * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
- *
- * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
- * byte 1 = 192 + (ud div 64)
- * byte 2 = 128 + (ud mod 64)
- *
- * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
- * byte 1 = 224 + (ud div 4096)
- * byte 2 = 128 + ((ud div 64) mod 64)
- * byte 3 = 128 + (ud mod 64)
- *
- * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
- * byte 1 = 240 + (ud div 262144)
- * byte 2 = 128 + ((ud div 4096) mod 64)
- * byte 3 = 128 + ((ud div 64) mod 64)
- * byte 4 = 128 + (ud mod 64)
- *
- * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
- * byte 1 = 248 + (ud div 16777216)
- * byte 2 = 128 + ((ud div 262144) mod 64)
- * byte 3 = 128 + ((ud div 4096) mod 64)
- * byte 4 = 128 + ((ud div 64) mod 64)
- * byte 5 = 128 + (ud mod 64)
- *
- * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
- * byte 1 = 252 + (ud div 1073741824)
- * byte 2 = 128 + ((ud div 16777216) mod 64)
- * byte 3 = 128 + ((ud div 262144) mod 64)
- * byte 4 = 128 + ((ud div 4096) mod 64)
- * byte 5 = 128 + ((ud div 64) mod 64)
- * byte 6 = 128 + (ud mod 64)
- **/
-gint
-g_unichar_to_utf8 (gunichar c, gchar *outbuf)
-{
- size_t len, i;
- int base;
-
- if (c < 128UL) {
- base = 0;
- len = 1;
- } else if (c < 2048UL) {
- base = 192;
- len = 2;
- } else if (c < 65536UL) {
- base = 224;
- len = 3;
- } else if (c < 2097152UL) {
- base = 240;
- len = 4;
- } else if (c < 67108864UL) {
- base = 248;
- len = 5;
- } else if (c < 2147483648UL) {
- base = 252;
- len = 6;
- } else {
- return -1;
- }
-
- if (outbuf != NULL) {
- for (i = len - 1; i > 0; i--) {
- /* mask off 6 bits worth and add 128 */
- outbuf[i] = 128 + (c & 0x3f);
- c >>= 6;
- }
-
- /* first character has a different base */
- outbuf[0] = base + c;
- }
-
- return len;
-}