* Atsushi Enomoto <atsushi@ximian.com>
*
* (C) 2006 Novell, Inc.
+ * Copyright 2012 Xamarin Inc
*/
#include <stdio.h>
#include <glib.h>
-gpointer error_quark = "ERROR";
+/*
+ * Index into the table below with the first byte of a UTF-8 sequence to get
+ * the total length of that sequence in bytes, counting the first byte itself.
+ *
+ * Bytes 0x00-0xBF (ASCII and continuation bytes) and 0xFE-0xFF map to 1, so
+ * every entry is at least 1.
+ *
+ * Note that *legal* UTF-8 sequences can't be 5 or 6 bytes long. The entries
+ * for 0xF8-0xFD are left as-is for anyone who may want to do such conversion,
+ * which was allowed in earlier algorithms.
+*/
+const guchar g_utf8_jump_table[256] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x1F */
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x3F */
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x5F */
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x7F */
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x80-0x9F */
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xA0-0xBF */
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xDF */
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 /* 0xE0-0xFF */
+};
-gpointer
-g_convert_error_quark ()
+/*
+ * utf8_case_conv:
+ * @str: UTF-8 text to convert
+ * @len: length argument passed through to g_utf8_to_ucs4_fast()
+ * @upper: TRUE to map with g_unichar_toupper(), FALSE for g_unichar_tolower()
+ *
+ * Decodes @str to UCS-4, maps each code point individually and encodes the
+ * result back to UTF-8.
+ *
+ * Returns: the string produced by g_ucs4_to_utf8(); presumably newly
+ * allocated and owned by the caller -- TODO confirm.
+ */
+static gchar *
+utf8_case_conv (const gchar *str, gssize len, gboolean upper)
{
- return error_quark;
+ gunichar *ustr;
+ glong i, ulen;
+ gchar *utf8;
+
+ /* NOTE(review): the result is used without a NULL check -- confirm that
+  * g_utf8_to_ucs4_fast() cannot fail here. */
+ ustr = g_utf8_to_ucs4_fast (str, (glong) len, &ulen);
+ /* the mapping is done in place, one code point at a time */
+ for (i = 0; i < ulen; i++)
+ ustr[i] = upper ? g_unichar_toupper (ustr[i]) : g_unichar_tolower (ustr[i]);
+ utf8 = g_ucs4_to_utf8 (ustr, ulen, NULL, NULL, NULL);
+ /* the intermediate UCS-4 buffer is no longer needed */
+ g_free (ustr);
+
+ return utf8;
}
-gunichar2*
-g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
+/**
+ * g_utf8_strup:
+ * @str: UTF-8 text to convert
+ * @len: length argument handed to utf8_case_conv() unchanged
+ *
+ * Upper-case conversion; a thin wrapper that calls utf8_case_conv() with
+ * its upper flag set to TRUE.
+ *
+ * Returns: whatever utf8_case_conv() returns.
+ */
+gchar *
+g_utf8_strup (const gchar *str, gssize len)
{
- /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
- but error check is always done at utf8_to_utf16_len() so that
- the conversion core below simply resets erroreous bits */
- glong utf16_len;
- gunichar2 *ret;
- gchar ch, mb_size, mb_remain;
- guint32 codepoint;
- glong in_pos, out_pos;
-
- utf16_len = 0;
- mb_size = 0;
- mb_remain = 0;
- in_pos = 0;
- out_pos = 0;
-
- if (error)
- *error = NULL;
+ return utf8_case_conv (str, len, TRUE);
+}
- utf16_len = utf8_to_utf16_len (str, len, items_read, error);
- if (error)
- if (*error)
- return NULL;
- if (utf16_len < 0)
- return NULL;
+/**
+ * g_utf8_strdown:
+ * @str: UTF-8 text to convert
+ * @len: length argument handed to utf8_case_conv() unchanged
+ *
+ * Lower-case counterpart of g_utf8_strup().
+ *
+ * Returns: whatever utf8_case_conv() returns for a lower-case conversion.
+ */
+gchar *
+g_utf8_strdown (const gchar *str, gssize len)
+{
+	const gboolean to_upper = FALSE;
+
+	return utf8_case_conv (str, len, to_upper);
+}
- ret = g_malloc (utf16_len * sizeof (gunichar2));
+/*
+ * utf8_validate:
+ * @inptr: first byte of the sequence to check
+ * @len: number of bytes to check, 1 to 4; any other value is rejected
+ *
+ * Checks @len bytes starting at @inptr as a single UTF-8 sequence. The
+ * trailing bytes are examined from last to first, each case of the outer
+ * switch falling through to the next one, and the lead byte is checked
+ * last. @len may be shorter than the length announced by the lead byte; in
+ * that case only the bytes that are present are looked at.
+ *
+ * Besides malformed sequences this rejects overlong encodings, the UTF-16
+ * surrogate range, values above U+10FFFF and the noncharacters
+ * U+FDD0..U+FDEF and U+xxFFFE/U+xxFFFF.
+ *
+ * Returns: TRUE if no check failed, FALSE otherwise.
+ */
+static gboolean
+utf8_validate (const unsigned char *inptr, size_t len)
+{
+	const unsigned char *ptr = inptr + len;
+	unsigned char c;
+
+	/* Everything falls through when TRUE... */
+	switch (len) {
+	default:
+		return FALSE;
+	case 4:
+		if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+			return FALSE;
+
+		/* xx 8F/9F/AF/BF BF BE/BF encodes U+nFFFE or U+nFFFF */
+		if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) {
+			if (ptr[-2] == 0x8F || ptr[-2] == 0x9F ||
+			    ptr[-2] == 0xAF || ptr[-2] == 0xBF)
+				return FALSE;
+		}
+		/* fall through */
+	case 3:
+		if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+			return FALSE;
+		/* fall through */
+	case 2:
+		if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+			return FALSE;
+
+		/* c is now the second byte; its legal range depends on the lead byte */
+		/* no fall-through in this inner switch */
+		switch (*inptr) {
+		case 0xE0: if (c < 0xA0) return FALSE; break;	/* overlong 3-byte form */
+		case 0xED: if (c > 0x9F) return FALSE; break;	/* U+D800..U+DFFF */
+		case 0xEF:
+			/* ptr[1] is the third byte, which only exists when the
+			 * caller asked for at least three bytes */
+			if (len > 2) {
+				if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE;	/* U+FDD0..U+FDEF */
+				if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE;	/* U+FFFE, U+FFFF */
+			}
+			break;
+		case 0xF0: if (c < 0x90) return FALSE; break;	/* overlong 4-byte form */
+		case 0xF4: if (c > 0x8F) return FALSE; break;	/* above U+10FFFF */
+		default: if (c < 0x80) return FALSE; break;
+		}
+		/* fall through */
+	case 1:
+		/* 0x80-0xBF are continuation bytes, 0xC0/0xC1 only start overlong forms */
+		if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE;
+	}
+
+	/* lead bytes above 0xF4 can only encode values beyond U+10FFFF */
+	if (*inptr > 0xF4)
+		return FALSE;
+
+	return TRUE;
+}
- for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
- ch = (guchar) str [in_pos];
- if (mb_size == 0) {
- if (0 < ch)
- ret [out_pos++] = ch;
- else if ((ch & 0xE0) == 0xC0) {
- codepoint = ch & 0x1F;
- mb_remain = mb_size = 2;
- } else if ((ch & 0xF0) == 0xE0) {
- codepoint = ch & 0x0F;
- mb_remain = mb_size = 3;
- } else if ((ch & 0xF8) == 0xF0) {
- codepoint = ch & 7;
- mb_remain = mb_size = 4;
- } else if ((ch & 0xFC) == 0xF8) {
- codepoint = ch & 3;
- mb_remain = mb_size = 5;
- } else if ((ch & 0xFE) == 0xFC) {
- codepoint = ch & 3;
- mb_remain = mb_size = 6;
- } else {
- /* invalid utf-8 sequence */
- codepoint = 0;
- mb_remain = mb_size = 0;
+/**
+ * g_utf8_validate:
+ * @str: a utf-8 encoded string
+ * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
+ * @end: output parameter to mark the end of the valid input
+ *
+ * Checks @str for being valid UTF-8, one sequence at a time, using
+ * utf8_validate(). When @max_len is negative @str is scanned up to its
+ * nul terminator. Otherwise all @max_len bytes must be valid: a nul byte
+ * or a sequence cut short before @max_len is reached makes the result
+ * %FALSE.
+ *
+ * If @end is not %NULL it receives a pointer to the first byte that was
+ * not accepted (the end of the input when everything is valid). When
+ * @max_len is 0 the function returns %FALSE without touching @end.
+ *
+ * Return value: %TRUE if @str is valid or %FALSE otherwise.
+ **/
+gboolean
+g_utf8_validate (const gchar *str, gssize max_len, const gchar **end)
+{
+ guchar *inptr = (guchar *) str;
+ gboolean valid = TRUE;
+ guint length, min;
+ gssize n = 0;
+
+ if (max_len == 0)
+ return FALSE;
+
+ if (max_len < 0) {
+ while (*inptr != 0) {
+ length = g_utf8_jump_table[*inptr];
+
+ /* A nul byte inside the sequence means the terminator cut it
+ * short; detect that first so utf8_validate() never reads
+ * past the end of the string. */
+ for (min = 1; min < length && inptr[min] != 0; min++)
+ ;
+
+ if (min < length || !utf8_validate (inptr, length)) {
+ valid = FALSE;
+ break;
+ }
+
+ inptr += length;
+ }
+ } else {
+ while (n < max_len) {
+ if (*inptr == 0) {
+ /* Note: return FALSE if we encounter nul-byte
+ * before max_len is reached. */
+ valid = FALSE;
+ break;
+ }
+
+ length = g_utf8_jump_table[*inptr];
+ /* never look at more bytes than max_len allows */
+ min = MIN (length, max_len - n);
+
+ if (!utf8_validate (inptr, min)) {
+ valid = FALSE;
+ break;
}
- } else {
- if ((ch & 0xC0) == 0x80) {
- codepoint = (codepoint << 6) | (ch & 0x3F);
- if (--mb_remain == 0) {
- /* multi byte character is fully consumed now. */
- if (codepoint < 0x10000) {
- ret [out_pos++] = codepoint;
- } else if (codepoint < 0x110000) {
- /* surrogate pair */
- codepoint -= 0x10000;
- ret [out_pos++] = (codepoint >> 10) + 0xD800;
- ret [out_pos++] = (codepoint & 0x3FF) + 0xDC00;
- } else {
- /* invalid utf-8 sequence (excess) */
- codepoint = 0;
- mb_remain = mb_size = 0;
- }
- }
- } else {
- /* invalid utf-8 sequence */
- codepoint = 0;
- mb_remain = mb_size = 0;
+
+ /* the bytes present were fine, but max_len ends inside the sequence */
+ if (min < length) {
+ valid = FALSE;
+ break;
}
+
+ inptr += length;
+ n += length;
}
}
+
+ if (end != NULL)
+ *end = (gchar *) inptr;
+
+ return valid;
+}
- if (items_written)
- *items_written = out_pos;
- return ret;
+/**
+ * g_utf8_get_char_validated:
+ * @str: pointer to the first byte of a UTF-8 sequence
+ * @max_len: number of bytes that may be read from @str, or a negative
+ * value if @str is nul-terminated
+ *
+ * Decodes the character at @str after checking it with utf8_validate().
+ *
+ * Returns: the decoded character; (gunichar) -1 if the bytes do not form a
+ * valid sequence; (gunichar) -2 if @max_len is 0, or if the bytes within
+ * @max_len are valid so far but the sequence needs more than @max_len bytes.
+ */
+gunichar
+g_utf8_get_char_validated (const gchar *str, gssize max_len)
+{
+	const unsigned char *inptr = (const unsigned char *) str;
+	gunichar u;
+	int n, i;
+
+	/* nothing may be read at all, not even the lead byte */
+	if (max_len == 0)
+		return (gunichar) -2;
+
+	u = *inptr;
+
+	if (u < 0x80) {
+		/* simple ascii case */
+		return u;
+	} else if (u < 0xc2) {
+		/* continuation byte or overlong 2-byte lead */
+		return (gunichar) -1;
+	} else if (u < 0xe0) {
+		u &= 0x1f;
+		n = 2;
+	} else if (u < 0xf0) {
+		u &= 0x0f;
+		n = 3;
+	} else if (u < 0xf8) {
+		u &= 0x07;
+		n = 4;
+	} else if (u < 0xfc) {
+		u &= 0x03;
+		n = 5;
+	} else if (u < 0xfe) {
+		u &= 0x01;
+		n = 6;
+	} else {
+		return (gunichar) -1;
+	}
+
+	if (max_len > 0) {
+		if (!utf8_validate (inptr, MIN (max_len, n)))
+			return (gunichar) -1;
+
+		if (max_len < n)
+			return (gunichar) -2;
+	} else {
+		/* nul-terminated input: a nul inside the sequence makes it
+		 * invalid; catch it here so that utf8_validate() does not read
+		 * past the terminator */
+		for (i = 1; i < n; i++) {
+			if (inptr[i] == 0)
+				return (gunichar) -1;
+		}
+
+		if (!utf8_validate (inptr, n))
+			return (gunichar) -1;
+	}
+
+	/* every trailing byte is known to be 10xxxxxx here, so flipping the
+	 * top bit leaves just its six payload bits */
+	for (i = 1; i < n; i++)
+		u = (u << 6) | (*++inptr ^ 0x80);
+
+	return u;
}
+/**
+ * g_utf8_strlen:
+ * @str: a UTF-8 string
+ * @max_len: number of bytes of @str to look at, or a negative value to
+ * count up to the nul terminator
+ *
+ * Counts characters by hopping from lead byte to lead byte using
+ * g_utf8_jump_table; the bytes in between are not examined, so @str is
+ * not validated. With a non-negative @max_len counting stops at a nul byte
+ * or before the first character that would extend past @max_len bytes.
+ *
+ * Returns: the number of characters counted; 0 when @max_len is 0.
+ */
glong
-utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
+g_utf8_strlen (const gchar *str, gssize max_len)
{
- /* It is almost identical to UTF8Encoding.GetCharCount() */
- guchar ch, mb_size, mb_remain;
- gboolean overlong;
- guint32 codepoint;
- glong in_pos, ret;
-
- mb_size = 0;
- mb_remain = 0;
- overlong = 0;
- in_pos = 0;
- ret = 0;
-
- for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
- ch = str [in_pos];
- if (mb_size == 0) {
- if (ch < 0x80)
- ret++;
- else if ((ch & 0xE0) == 0xC0) {
- codepoint = ch & 0x1F;
- mb_remain = mb_size = 2;
- } else if ((ch & 0xF0) == 0xE0) {
- codepoint = ch & 0x0F;
- mb_remain = mb_size = 3;
- } else if ((ch & 0xF8) == 0xF0) {
- codepoint = ch & 7;
- mb_remain = mb_size = 4;
- } else if ((ch & 0xFC) == 0xF8) {
- codepoint = ch & 3;
- mb_remain = mb_size = 5;
- } else if ((ch & 0xFE) == 0xFC) {
- codepoint = ch & 3;
- mb_remain = mb_size = 6;
- } else {
- /* invalid utf-8 sequence */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } else {
- codepoint = 0;
- mb_remain = mb_size = 0;
- }
- }
- } else {
- if ((ch & 0xC0) == 0x80) {
- codepoint = (codepoint << 6) | (ch & 0x3F);
- if (--mb_remain == 0) {
- /* multi byte character is fully consumed now. */
- if (codepoint < 0x10000) {
- switch (mb_size) {
- case 2:
- overlong = codepoint < 0x7F;
- break;
- case 3:
- overlong = codepoint < 0x7FF;
- break;
- case 4:
- overlong = codepoint < 0xFFFF;
- break;
- case 5:
- overlong = codepoint < 0x1FFFFF;
- break;
- case 6:
- overlong = codepoint < 0x03FFFFFF;
- break;
- }
- if (overlong) {
- /* invalid utf-8 sequence (overlong) */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } else {
- codepoint = 0;
- mb_remain = mb_size = 0;
- overlong = FALSE;
- }
- }
- else
- ret++;
- } else if (codepoint < 0x110000) {
- /* surrogate pair */
- ret += 2;
- } else {
- /* invalid utf-8 sequence (excess) */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } else {
- codepoint = 0;
- mb_remain = mb_size = 0;
- }
- }
- }
- } else {
- /* invalid utf-8 sequence */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } else {
- codepoint = 0;
- mb_remain = mb_size = 0;
- }
- }
+ const guchar *inptr = (const guchar *) str;
+ /* clen: bytes consumed so far, len: characters counted so far,
+  * n: byte length of the current character */
+ glong clen = 0, len = 0, n;
+
+ if (max_len == 0)
+ return 0;
+
+ if (max_len < 0) {
+ /* NOTE(review): the jump is taken without looking at the bytes it
+  * skips, so a sequence truncated by the terminator steps over
+  * the nul byte -- callers must pass well-formed text. */
+ while (*inptr) {
+ inptr += g_utf8_jump_table[*inptr];
+ len++;
+ }
+ } else {
+ while (len < max_len && *inptr) {
+ n = g_utf8_jump_table[*inptr];
+ /* a character that does not fit entirely within max_len is not counted */
+ if ((clen + n) > max_len)
+ break;
+
+ inptr += n;
+ clen += n;
+ len++;
}
}
-
- if (items_read)
- *items_read = in_pos;
- return ret;
+
+ return len;
}
-gchar*
-g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
+/**
+ * g_utf8_get_char:
+ * @src: pointer to the first byte of a UTF-8 sequence
+ *
+ * Decodes one character without any validation: the sequence length is
+ * taken from the lead byte alone and the following bytes are read
+ * unconditionally. Lead bytes 0x80-0xDF are all treated as 2-byte leads and
+ * 0xFC-0xFF as 6-byte leads.
+ *
+ * Returns: the value assembled from the lead byte and its trailing bytes.
+ */
+gunichar
+g_utf8_get_char (const gchar *src)
{
- /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
- but error check is always done at utf16_to_utf8_len() so that
- the conversion core below simply resets erroreous bits */
- glong utf8_len;
- gchar *ret;
- glong in_pos, out_pos;
- gunichar2 ch;
- guint32 codepoint;
- gboolean surrogate;
-
- in_pos = 0;
- out_pos = 0;
- surrogate = FALSE;
+ unsigned char *inptr = (unsigned char *) src;
+ gunichar u = *inptr;
+ int n, i;
+
+ /* pick the sequence length n and keep only the payload bits of the lead byte */
+ if (u < 0x80) {
+ /* simple ascii case */
+ return u;
+ } else if (u < 0xe0) {
+ u &= 0x1f;
+ n = 2;
+ } else if (u < 0xf0) {
+ u &= 0x0f;
+ n = 3;
+ } else if (u < 0xf8) {
+ u &= 0x07;
+ n = 4;
+ } else if (u < 0xfc) {
+ u &= 0x03;
+ n = 5;
+ } else {
+ u &= 0x01;
+ n = 6;
+ }
+
+ /* append six bits per trailing byte; the XOR clears the 0x80 marker bit
+  * of a well-formed continuation byte */
+ for (i = 1; i < n; i++)
+ u = (u << 6) | (*++inptr ^ 0x80);
+
+ return u;
+}
- utf8_len = utf16_to_utf8_len (str, len, items_read, error);
- if (error)
- if (*error)
- return NULL;
- if (utf8_len < 0)
- return NULL;
+/**
+ * g_utf8_find_prev_char:
+ * @str: start of the buffer; the search never goes before it
+ * @p: position inside the buffer to search backwards from
+ *
+ * Steps back from @p over continuation bytes (bit pattern 10xxxxxx) until
+ * a byte that can start a character is found.
+ *
+ * Returns: a pointer to the start of the character preceding @p, or %NULL
+ * if no such byte exists between @str and @p.
+ */
+gchar *
+g_utf8_find_prev_char (const gchar *str, const gchar *p)
+{
+	while (p > str) {
+		p--;
+		/* continuation bytes are 10xxxxxx, i.e. (byte & 0xc0) == 0x80 */
+		if ((*p & 0xc0) != 0x80)
+			return (gchar *)p;
+	}
+	return NULL;
+}
- ret = g_malloc (utf8_len * sizeof (gchar));
+/**
+ * g_utf8_prev_char:
+ * @str: pointer into a UTF-8 string
+ *
+ * Steps back from @str to the start of the previous character. No lower
+ * bound is checked, so there must be a character start somewhere before
+ * @str.
+ *
+ * Returns: a pointer to the first byte of the previous character.
+ */
+gchar *
+g_utf8_prev_char (const gchar *str)
+{
+	const gchar *p = str;
+	do {
+		p--;
+		/* keep going while on a continuation byte (10xxxxxx) */
+	} while ((*p & 0xc0) == 0x80);
+
+	return (gchar *)p;
+}
- while (len < 0 ? str [in_pos] : in_pos < len) {
- ch = str [in_pos];
- if (surrogate) {
- surrogate = 0;
- if (ch >= 0xDC00 && ch <= 0xDFFF)
- codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
- else
- /* invalid surrogate pair */
- continue;
- } else {
- /* fast path optimization */
- if (ch < 0x80) {
- for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
- if (str [in_pos] < 0x80)
- ret [out_pos++] = str [in_pos];
- else
- break;
- }
- continue;
- }
- else if (ch >= 0xD800 && ch <= 0xDBFF)
- surrogate = ch;
- else if (ch >= 0xDC00 && ch <= 0xDFFF) {
- /* invalid surrogate pair */
- continue;
- }
- else
- codepoint = ch;
- }
- in_pos++;
+/**
+ * g_utf8_offset_to_pointer:
+ * @str: pointer to the start of a character in a UTF-8 string
+ * @offset: number of characters to move; may be negative
+ *
+ * Moves @offset characters forward (positive) or backward (negative)
+ * from @str. The text is not validated and no buffer bounds are
+ * checked, so the target must lie inside the same well-formed string.
+ *
+ * Returns: a pointer to the character @offset characters away from @str;
+ * @str itself when @offset is 0.
+ */
+gchar *
+g_utf8_offset_to_pointer (const gchar *str, glong offset)
+{
+ const gchar *p = str;
- if (codepoint < 0x80)
- ret [out_pos++] = (gchar) codepoint;
- else if (codepoint < 0x0800) {
- ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
- } else if (codepoint < 0x10000) {
- ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
- } else {
- ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
- ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
- ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
- }
+ if (offset > 0) {
+ do {
+ p = g_utf8_next_char (p);
+ offset --;
+ } while (offset > 0);
}
-
- if (items_written)
- *items_written = out_pos;
- return ret;
+ else if (offset < 0) {
+ const gchar *prev, *q;
+ do {
+ // since the minimum size of a character is 1
+ // we know we can step back at least offset bytes
+ prev = p;
+ p = p + offset;
+
+ // if we land in the middle of a character
+ // walk to the beginning (continuation bytes are 10xxxxxx)
+ while ((*p & 0xc0) == 0x80)
+ p --;
+
+ // count how many characters we've actually walked
+ // by going forward to where this step started; what is
+ // left in offset is covered by the next round
+ for (q = p; q < prev; q = g_utf8_next_char (q))
+ offset ++;
+
+ } while (offset < 0);
+ }
+
+ return (gchar *)p;
}
+/**
+ * g_utf8_pointer_to_offset:
+ * @str: pointer into a UTF-8 string
+ * @pos: another pointer into the same string
+ *
+ * Counts the characters between @str and @pos by walking forward from the
+ * lower of the two addresses with g_utf8_next_char(). The text is not
+ * validated.
+ *
+ * Returns: the character distance from @str to @pos; 0 when they are
+ * equal and negative when @pos lies before @str.
+ */
glong
-utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
+g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
{
- glong ret, in_pos;
- gunichar2 ch;
- gboolean surrogate;
-
- ret = 0;
- in_pos = 0;
- surrogate = FALSE;
-
- while (len < 0 ? str [in_pos] : in_pos < len) {
- ch = str [in_pos];
- if (surrogate) {
- if (ch >= 0xDC00 && ch <= 0xDFFF) {
- ret += 4;
- } else {
- /* invalid surrogate pair */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } /* otherwise just ignore. */
- }
- surrogate = FALSE;
- } else {
- /* fast path optimization */
- if (ch < 0x80) {
- for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
- if (str [in_pos] < 0x80)
- ++ret;
- else
- break;
- }
- continue;
- }
- else if (ch < 0x0800)
- ret += 2;
- else if (ch >= 0xD800 && ch <= 0xDBFF)
- surrogate = TRUE;
- else if (ch >= 0xDC00 && ch <= 0xDFFF) {
- /* invalid surrogate pair */
- if (error) {
- g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
- if (items_read)
- *items_read = in_pos;
- return -1;
- } /* otherwise just ignore. */
- }
- else
- ret += 3;
- }
- in_pos++;
+ const gchar *inptr, *inend;
+ glong offset = 0;
+ glong sign = 1;
+
+ if (pos == str)
+ return 0;
+
+ /* always walk forward from the lower address; sign records the direction */
+ if (str < pos) {
+ inptr = str;
+ inend = pos;
+ } else {
+ inptr = pos;
+ inend = str;
+ sign = -1;
}
-
- if (items_read)
- *items_read = in_pos;
- return ret;
+
+ /* inptr != inend here, so at least one character is always counted */
+ do {
+ inptr = g_utf8_next_char (inptr);
+ offset++;
+ } while (inptr < inend);
+
+ return offset * sign;
}