Merge pull request #4065 from akoeplinger/add-mcs-lane
[mono.git] / eglib / src / gutf8.c
index e1165663dc6ba316a9a440f522253caacfb7dc99..c4c9b912e76276e83d5504bf1a972fb42a7fca4f 100644 (file)
  *   Atsushi Enomoto  <atsushi@ximian.com>
  *
  * (C) 2006 Novell, Inc.
+ * Copyright 2012 Xamarin Inc
  */
 
 #include <stdio.h>
 #include <glib.h>
 
-gpointer error_quark = "ERROR";
+/*
+ * Index into the table below with the first byte of a UTF-8 sequence to get
+ * the total length in bytes of that sequence, lead byte included (so plain
+ * ASCII bytes map to 1).
+ *
+ * Bytes 0x80-0xBF and 0xFE/0xFF also map to 1, so stepping through a string
+ * by the table value still advances past them.
+ *
+ * Note that *legal* UTF-8 sequences can't be 5 or 6 bytes long. The table is
+ * left as-is for anyone who may want to do such conversion, which was allowed
+ * in earlier algorithms.
+ */
+const guchar g_utf8_jump_table[256] = {
+       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+       2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+       3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
 
-gpointer
-g_convert_error_quark ()
+/*
+ * utf8_case_conv:
+ * @str: UTF-8 input, handed to g_utf8_to_ucs4_fast()
+ * @len: length argument forwarded (cast to glong) to g_utf8_to_ucs4_fast()
+ * @upper: TRUE to map through g_unichar_toupper(), FALSE for g_unichar_tolower()
+ *
+ * Shared implementation of g_utf8_strup() and g_utf8_strdown(): decodes
+ * @str to UCS-4, maps every element, re-encodes the result as UTF-8 and
+ * frees the intermediate UCS-4 buffer.
+ *
+ * Returns: whatever g_ucs4_to_utf8() returns (presumably a newly allocated
+ * string the caller must free -- confirm against g_ucs4_to_utf8()).
+ */
+static gchar *
+utf8_case_conv (const gchar *str, gssize len, gboolean upper)
 {
-       return error_quark;
+       gunichar *ustr;
+       glong i, ulen;
+       gchar *utf8;
+       
+       ustr = g_utf8_to_ucs4_fast (str, (glong) len, &ulen);
+       /* case-map in place, one code point at a time */
+       for (i = 0; i < ulen; i++)
+               ustr[i] = upper ? g_unichar_toupper (ustr[i]) : g_unichar_tolower (ustr[i]);
+       utf8 = g_ucs4_to_utf8 (ustr, ulen, NULL, NULL, NULL);
+       g_free (ustr);
+       
+       return utf8;
 }
 
-gunichar2*
-g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
+/**
+ * g_utf8_strup:
+ * @str: a utf-8 encoded string
+ * @len: length argument forwarded unchanged to utf8_case_conv()
+ *
+ * Upper-case variant of utf8_case_conv().
+ *
+ * Return value: the string produced by utf8_case_conv().
+ **/
+gchar *
+g_utf8_strup (const gchar *str, gssize len)
 {
-       /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
-          but error check is always done at utf8_to_utf16_len() so that
-          the conversion core below simply resets erroreous bits */
-       glong utf16_len;
-       gunichar2 *ret;
-       gchar ch, mb_size, mb_remain;
-       guint32 codepoint;
-       glong in_pos, out_pos;
-
-       utf16_len = 0;
-       mb_size = 0;
-       mb_remain = 0;
-       in_pos = 0;
-       out_pos = 0;
-
-       if (error)
-               *error = NULL;
+       return utf8_case_conv (str, len, TRUE);
+}
 
-       utf16_len = utf8_to_utf16_len (str, len, items_read, error);
-       if (error)
-               if (*error)
-                       return NULL;
-       if (utf16_len < 0)
-               return NULL;
+/**
+ * g_utf8_strdown:
+ * @str: a utf-8 encoded string
+ * @len: length argument forwarded unchanged to utf8_case_conv()
+ *
+ * Lower-case variant of utf8_case_conv().
+ *
+ * Return value: the string produced by utf8_case_conv().
+ **/
+gchar *
+g_utf8_strdown (const gchar *str, gssize len)
+{
+       return utf8_case_conv (str, len, FALSE);
+}
 
-       ret = g_malloc (utf16_len * sizeof (gunichar2));
+/*
+ * utf8_validate:
+ * @inptr: first (lead) byte of the sequence to check
+ * @len: number of bytes to examine; only 1 to 4 are accepted
+ *
+ * Checks a single UTF-8 sequence. The bytes are examined from the last one
+ * back towards the lead byte; the switch cases fall through, so a sequence
+ * of @len bytes runs every check from "case @len" down to "case 1".
+ *
+ * Returns: TRUE if the @len bytes pass all checks, FALSE otherwise.
+ */
+static gboolean
+utf8_validate (const unsigned char *inptr, size_t len)
+{
+       const unsigned char *ptr = inptr + len;
+       unsigned char c;
+       
+       /* Everything falls through when TRUE... */
+       switch (len) {
+       default:
+               /* a length of 0 or of more than 4 bytes is never accepted */
+               return FALSE;
+       case 4:
+               /* fourth byte must lie in 0x80..0xBF */
+               if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+                       return FALSE;
+               
+               /* third byte 0xBF followed by 0xBE/0xBF, with a second byte whose
+                * low nibble is 0xF: the low 16 bits of the code point would be
+                * 0xFFFE or 0xFFFF, which is rejected */
+               if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) {
+                       if (ptr[-2] == 0x8F || ptr[-2] == 0x9F ||
+                           ptr[-2] == 0xAF || ptr[-2] == 0xBF)
+                               return FALSE;
+               }
+       case 3:
+               /* third byte must lie in 0x80..0xBF */
+               if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+                       return FALSE;
+       case 2:
+               /* second byte must lie in 0x80..0xBF; from here on c is the
+                * second byte and ptr[1] is the third one */
+               if ((c = (*--ptr)) < 0x80 || c > 0xBF)
+                       return FALSE;
+               
+               /* no fall-through in this inner switch */
+               /* Lead-byte specific limits on the second byte:
+                *   0xE0 / 0xF0: a lower second byte would be an overlong form
+                *   0xED: a higher second byte would encode U+D800..U+DFFF
+                *   0xF4: a higher second byte would exceed U+10FFFF
+                *   0xEF: rejects EF B7 90..AF (U+FDD0..U+FDEF) and
+                *         EF BF BE/BF (U+FFFE, U+FFFF)
+                * NOTE(review): the 0xEF case reads ptr[1] even when len == 2,
+                * i.e. one byte beyond the @len bytes it was asked to check --
+                * confirm callers always have that byte available. */
+               switch (*inptr) {
+               case 0xE0: if (c < 0xA0) return FALSE; break;
+               case 0xED: if (c > 0x9F) return FALSE; break;
+               case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE;
+                       if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE;
+                       break;
+               case 0xF0: if (c < 0x90) return FALSE; break;
+               case 0xF4: if (c > 0x8F) return FALSE; break;
+               default:   if (c < 0x80) return FALSE; break;
+               }
+       /* lead bytes 0x80..0xC1 are never valid */
+       case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE;
+       }
+       
+       /* lead bytes above 0xF4 are never valid either */
+       if (*inptr > 0xF4)
+               return FALSE;
+       
+       return TRUE;
+}
 
-       for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
-               ch = (guchar) str [in_pos];
-               if (mb_size == 0) {
-                       if (0 < ch)
-                               ret [out_pos++] = ch;
-                       else if ((ch & 0xE0) == 0xC0) {
-                               codepoint = ch & 0x1F;
-                               mb_remain = mb_size = 2;
-                       } else if ((ch & 0xF0) == 0xE0) {
-                               codepoint = ch & 0x0F;
-                               mb_remain = mb_size = 3;
-                       } else if ((ch & 0xF8) == 0xF0) {
-                               codepoint = ch & 7;
-                               mb_remain = mb_size = 4;
-                       } else if ((ch & 0xFC) == 0xF8) {
-                               codepoint = ch & 3;
-                               mb_remain = mb_size = 5;
-                       } else if ((ch & 0xFE) == 0xFC) {
-                               codepoint = ch & 3;
-                               mb_remain = mb_size = 6;
-                       } else {
-                               /* invalid utf-8 sequence */
-                               codepoint = 0;
-                               mb_remain = mb_size = 0;
+/**
+ * g_utf8_validate:
+ * @str: a utf-8 encoded string
+ * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
+ * @end: output parameter to mark the end of the valid input (may be %NULL)
+ *
+ * Checks @str for being valid UTF-8. When @max_len is negative, @str must
+ * be null-terminated; otherwise exactly @max_len bytes are expected and a
+ * nul byte inside that range makes the string invalid.
+ *
+ * Every sequence is checked with utf8_validate(), which only accepts
+ * sequences of 1 to 4 bytes and range-checks the lead and second bytes, so
+ * 5- and 6-byte forms, overlong encodings and lead bytes above 0xF4 are
+ * all rejected. A sequence cut short by @max_len is invalid as well.
+ *
+ * Note that a @max_len of 0 is reported as invalid; @end then points at
+ * @str.
+ *
+ * Return value: %TRUE if @str is valid or %FALSE otherwise.
+ **/
+gboolean
+g_utf8_validate (const gchar *str, gssize max_len, const gchar **end)
+{
+       guchar *inptr = (guchar *) str;
+       gboolean valid = TRUE;
+       guint length, min;
+       gssize n = 0;
+       
+       if (max_len == 0) {
+               /* Nothing was examined, but callers reading @end must still
+                * get a well-defined pointer. */
+               if (end != NULL)
+                       *end = str;
+               return FALSE;
+       }
+       
+       if (max_len < 0) {
+               while (*inptr != 0) {
+                       length = g_utf8_jump_table[*inptr];
+                       if (!utf8_validate (inptr, length)) {
+                               valid = FALSE;
+                               break;
+                       }
+                       
+                       inptr += length;
+               }
+       } else {
+               while (n < max_len) {
+                       if (*inptr == 0) {
+                               /* Note: return FALSE if we encounter nul-byte
+                                * before max_len is reached. */
+                               valid = FALSE;
+                               break;
+                       }
+                       
+                       length = g_utf8_jump_table[*inptr];
+                       /* never look past the max_len bytes we were given */
+                       min = MIN (length, max_len - n);
+                       
+                       if (!utf8_validate (inptr, min)) {
+                               valid = FALSE;
+                               break;
                         }
-               } else {
-                       if ((ch & 0xC0) == 0x80) {
-                               codepoint = (codepoint << 6) | (ch & 0x3F);
-                               if (--mb_remain == 0) {
-                                       /* multi byte character is fully consumed now. */
-                                       if (codepoint < 0x10000) {
-                                               ret [out_pos++] = codepoint;
-                                       } else if (codepoint < 0x110000) {
-                                               /* surrogate pair */
-                                               codepoint -= 0x10000;
-                                               ret [out_pos++] = (codepoint >> 10) + 0xD800;
-                                               ret [out_pos++] = (codepoint & 0x3FF) + 0xDC00;
-                                       } else {
-                                               /* invalid utf-8 sequence (excess) */
-                                               codepoint = 0;
-                                               mb_remain = mb_size = 0;
-                                       }
-                               }
-                       } else {
-                               /* invalid utf-8 sequence */
-                               codepoint = 0;
-                               mb_remain = mb_size = 0;
+                       
+                       /* the bytes seen so far are fine, but the sequence is truncated */
+                       if (min < length) {
+                               valid = FALSE;
+                               break;
                         }
+                       
+                       inptr += length;
+                       n += length;
                 }
         }
+       
+       if (end != NULL)
+               *end = (gchar *) inptr;
+       
+       return valid;
+}
 
-       if (items_written)
-               *items_written = out_pos;
-       return ret;
+/**
+ * g_utf8_get_char_validated:
+ * @str: pointer to the first byte of a utf-8 sequence
+ * @max_len: number of bytes available at @str, or a negative value when
+ * the sequence may be read up to its full encoded length
+ *
+ * Decodes the first character of @str after checking the sequence with
+ * utf8_validate().
+ *
+ * Return value: the decoded character; (gunichar) -1 if the sequence is
+ * invalid; (gunichar) -2 if @max_len is 0 or the bytes available are
+ * acceptable so far but @max_len ends before the sequence is complete.
+ **/
+gunichar
+g_utf8_get_char_validated (const gchar *str, gssize max_len)
+{
+       const unsigned char *inptr = (const unsigned char *) str;
+       gunichar u;
+       int n, i;
+       
+       /* With no readable bytes even the lead byte must not be touched. */
+       if (max_len == 0)
+               return (gunichar) -2;
+       
+       u = *inptr;
+       
+       if (u < 0x80) {
+               /* simple ascii case */
+               return u;
+       } else if (u < 0xc2) {
+               /* 0x80..0xc1 can never start a sequence */
+               return (gunichar) -1;
+       } else if (u < 0xe0) {
+               u &= 0x1f;
+               n = 2;
+       } else if (u < 0xf0) {
+               u &= 0x0f;
+               n = 3;
+       } else if (u < 0xf8) {
+               u &= 0x07;
+               n = 4;
+       } else if (u < 0xfc) {
+               u &= 0x03;
+               n = 5;
+       } else if (u < 0xfe) {
+               u &= 0x01;
+               n = 6;
+       } else {
+               return (gunichar) -1;
+       }
+       
+       if (max_len > 0) {
+               /* validate only the bytes we are allowed to read */
+               if (!utf8_validate (inptr, MIN (max_len, n)))
+                       return (gunichar) -1;
+               
+               if (max_len < n)
+                       return (gunichar) -2;
+       } else {
+               if (!utf8_validate (inptr, n))
+                       return (gunichar) -1;
+       }
+       
+       /* fold in the low six bits of every trailing byte */
+       for (i = 1; i < n; i++)
+               u = (u << 6) | (*++inptr ^ 0x80);
+       
+       return u;
 }
 
+/**
+ * g_utf8_strlen:
+ * @str: a utf-8 encoded string
+ * @max_len: maximum number of bytes to look at, or a negative value to
+ * scan up to the terminating nul byte
+ *
+ * Counts characters by stepping through @str with g_utf8_jump_table; the
+ * bytes are not validated. With a non-negative @max_len, counting also
+ * stops at a nul byte, and a character whose encoded length would extend
+ * past @max_len bytes is not counted.
+ *
+ * Return value: the number of characters counted (0 when @max_len is 0).
+ **/
 glong
-utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
+g_utf8_strlen (const gchar *str, gssize max_len)
 {
-       /* It is almost identical to UTF8Encoding.GetCharCount() */
-       guchar ch, mb_size, mb_remain;
-       gboolean overlong;
-       guint32 codepoint;
-       glong in_pos, ret;
-
-       mb_size = 0;
-       mb_remain = 0;
-       overlong = 0;
-       in_pos = 0;
-       ret = 0;
-
-       for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
-               ch = str [in_pos];
-               if (mb_size == 0) {
-                       if (ch < 0x80)
-                               ret++;
-                       else if ((ch & 0xE0) == 0xC0) {
-                               codepoint = ch & 0x1F;
-                               mb_remain = mb_size = 2;
-                       } else if ((ch & 0xF0) == 0xE0) {
-                               codepoint = ch & 0x0F;
-                               mb_remain = mb_size = 3;
-                       } else if ((ch & 0xF8) == 0xF0) {
-                               codepoint = ch & 7;
-                               mb_remain = mb_size = 4;
-                       } else if ((ch & 0xFC) == 0xF8) {
-                               codepoint = ch & 3;
-                               mb_remain = mb_size = 5;
-                       } else if ((ch & 0xFE) == 0xFC) {
-                               codepoint = ch & 3;
-                               mb_remain = mb_size = 6;
-                       } else {
-                               /* invalid utf-8 sequence */
-                               if (error) {
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
-                                       if (items_read)
-                                               *items_read = in_pos;
-                                       return -1;
-                               } else {
-                                       codepoint = 0;
-                                       mb_remain = mb_size = 0;
-                               }
-                       }
-               } else {
-                       if ((ch & 0xC0) == 0x80) {
-                               codepoint = (codepoint << 6) | (ch & 0x3F);
-                               if (--mb_remain == 0) {
-                                       /* multi byte character is fully consumed now. */
-                                       if (codepoint < 0x10000) {
-                                               switch (mb_size) {
-                                               case 2:
-                                                       overlong = codepoint < 0x7F;
-                                                       break;
-                                               case 3:
-                                                       overlong = codepoint < 0x7FF;
-                                                       break;
-                                               case 4:
-                                                       overlong = codepoint < 0xFFFF;
-                                                       break;
-                                               case 5:
-                                                       overlong = codepoint < 0x1FFFFF;
-                                                       break;
-                                               case 6:
-                                                       overlong = codepoint < 0x03FFFFFF;
-                                                       break;
-                                               }
-                                               if (overlong) {
-                                                       /* invalid utf-8 sequence (overlong) */
-                                                       if (error) {
-                                                               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
-                                                               if (items_read)
-                                                                       *items_read = in_pos;
-                                                               return -1;
-                                                       } else {
-                                                               codepoint = 0;
-                                                               mb_remain = mb_size = 0;
-                                                               overlong = FALSE;
-                                                       }
-                                               }
-                                               else
-                                                       ret++;
-                                       } else if (codepoint < 0x110000) {
-                                               /* surrogate pair */
-                                               ret += 2;
-                                       } else {
-                                               /* invalid utf-8 sequence (excess) */
-                                               if (error) {
-                                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
-                                                       if (items_read)
-                                                               *items_read = in_pos;
-                                                       return -1;
-                                               } else {
-                                                       codepoint = 0;
-                                                       mb_remain = mb_size = 0;
-                                               }
-                                       }
-                               }
-                       } else {
-                               /* invalid utf-8 sequence */
-                               if (error) {
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
-                                       if (items_read)
-                                               *items_read = in_pos;
-                                       return -1;
-                               } else {
-                                       codepoint = 0;
-                                       mb_remain = mb_size = 0;
-                               }
-                       }
+       const guchar *inptr = (const guchar *) str;
+       glong clen = 0, len = 0, n;
+       
+       if (max_len == 0)
+               return 0;
+       
+       if (max_len < 0) {
+               while (*inptr) {
+                       inptr += g_utf8_jump_table[*inptr];
+                       len++;
+               }
+       } else {
+               /* clen tracks bytes consumed so far, len tracks characters */
+               while (len < max_len && *inptr) {
+                       n = g_utf8_jump_table[*inptr];
+                       /* stop before a character that would cross max_len */
+                       if ((clen + n) > max_len)
+                               break;
+                       
+                       inptr += n;
+                       clen += n;
+                       len++;
                 }
         }
-
-       if (items_read)
-               *items_read = in_pos;
-       return ret;
+       
+       return len;
 }
 
-gchar*
-g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
+/**
+ * g_utf8_get_char:
+ * @src: pointer to the first byte of a utf-8 sequence
+ *
+ * Decodes the character at @src without any validation: the sequence
+ * length is derived from the lead byte alone (every byte from 0x80 up to
+ * 0xdf is treated as a 2-byte lead, every byte from 0xfc up as a 6-byte
+ * lead) and that many bytes are read.
+ *
+ * Return value: the decoded value.
+ **/
+gunichar
+g_utf8_get_char (const gchar *src)
 {
-       /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
-          but error check is always done at utf16_to_utf8_len() so that
-          the conversion core below simply resets erroreous bits */
-       glong utf8_len;
-       gchar *ret;
-       glong in_pos, out_pos;
-       gunichar2 ch;
-       guint32 codepoint;
-       gboolean surrogate;
-
-       in_pos = 0;
-       out_pos = 0;
-       surrogate = FALSE;
+       unsigned char *inptr = (unsigned char *) src;
+       gunichar u = *inptr;
+       int n, i;
+       
+       /* keep only the payload bits of the lead byte and note the length */
+       if (u < 0x80) {
+               /* simple ascii case */
+               return u;
+       } else if (u < 0xe0) {
+               u &= 0x1f;
+               n = 2;
+       } else if (u < 0xf0) {
+               u &= 0x0f;
+               n = 3;
+       } else if (u < 0xf8) {
+               u &= 0x07;
+               n = 4;
+       } else if (u < 0xfc) {
+               u &= 0x03;
+               n = 5;
+       } else {
+               u &= 0x01;
+               n = 6;
+       }
+       
+       /* for a 10xxxxxx trailing byte, "^ 0x80" leaves its low six bits */
+       for (i = 1; i < n; i++)
+               u = (u << 6) | (*++inptr ^ 0x80);
+       
+       return u;
+}
 
-       utf8_len = utf16_to_utf8_len (str, len, items_read, error);
-       if (error)
-               if (*error)
-                       return NULL;
-       if (utf8_len < 0)
-               return NULL;
+/**
+ * g_utf8_find_prev_char:
+ * @str: start of the buffer; the search never goes before it
+ * @p: position to search backwards from (need not be on a character start)
+ *
+ * Walks backwards from @p to the nearest preceding byte that is not a
+ * UTF-8 continuation byte, i.e. to the start of the previous character.
+ *
+ * Return value: pointer to the previous character, or %NULL if @str is
+ * reached without finding one.
+ **/
+gchar *
+g_utf8_find_prev_char (const gchar *str, const gchar *p)
+{
+       while (p > str) {
+               p--;
+               /* Continuation bytes look like 10xxxxxx, so the masked value
+                * to compare against is 0x80 (0xb0 can never match the mask). */
+               if ((*p & 0xc0) != 0x80)
+                       return (gchar *)p;
+       }
+       return NULL;
+}
 
-       ret = g_malloc (utf8_len * sizeof (gchar));
+/**
+ * g_utf8_prev_char:
+ * @str: pointer into a utf-8 string, with at least one character before it
+ *
+ * Steps back over any continuation bytes to the start of the character
+ * that precedes @str. No lower bound is checked, so the caller must
+ * guarantee that a previous character exists.
+ *
+ * Return value: pointer to the start of the previous character.
+ **/
+gchar *
+g_utf8_prev_char (const gchar *str)
+{
+       const gchar *p = str;
+       do {
+               p--;
+               /* keep going while on a 10xxxxxx continuation byte */
+       } while ((*p & 0xc0) == 0x80);
+       
+       return (gchar *)p;
+}
 
-       while (len < 0 ? str [in_pos] : in_pos < len) {
-               ch = str [in_pos];
-               if (surrogate) {
-                       surrogate = 0;
-                       if (ch >= 0xDC00 && ch <= 0xDFFF)
-                               codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
-                       else
-                               /* invalid surrogate pair */
-                               continue;
-               } else {
-                       /* fast path optimization */
-                       if (ch < 0x80) {
-                               for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
-                                       if (str [in_pos] < 0x80)
-                                               ret [out_pos++] = str [in_pos];
-                                       else
-                                               break;
-                               }
-                               continue;
-                       }
-                       else if (ch >= 0xD800 && ch <= 0xDBFF)
-                               surrogate = ch;
-                       else if (ch >= 0xDC00 && ch <= 0xDFFF) {
-                               /* invalid surrogate pair */
-                               continue;
-                       }
-                       else
-                               codepoint = ch;
-               }
-               in_pos++;
+/**
+ * g_utf8_offset_to_pointer:
+ * @str: pointer to the start of a character in a utf-8 string
+ * @offset: number of characters to move; negative values move backwards
+ *
+ * Converts a character offset relative to @str into a pointer. Nothing is
+ * validated or bounds-checked: the caller must guarantee that the string
+ * extends far enough in the requested direction.
+ *
+ * Return value: pointer to the character @offset characters away from @str.
+ **/
+gchar *
+g_utf8_offset_to_pointer (const gchar *str, glong offset)
+{
+       const gchar *p = str;
 
-               if (codepoint < 0x80)
-                       ret [out_pos++] = (gchar) codepoint;
-               else if (codepoint < 0x0800) {
-                       ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
-                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
-               } else if (codepoint < 0x10000) {
-                       ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
-                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
-                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
-               } else {
-                       ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
-                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
-                       ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
-                       ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
-               }
+       if (offset > 0) {
+               do {
+                       p = g_utf8_next_char (p);
+                       offset --;
+               } while (offset > 0);
         }
-
-       if (items_written)
-               *items_written = out_pos;
-       return ret;
+       else if (offset < 0) {
+               const gchar *anchor, *q;
+               
+               do {
+                       // since the minimum size of a character is 1
+                       // we know we can step back at least -offset bytes
+                       // without skipping more characters than requested
+                       anchor = p;
+                       p += offset;
+                       
+                       // if we land in the middle of a character
+                       // walk to the beginning (continuation bytes
+                       // look like 10xxxxxx)
+                       while ((*p & 0xc0) == 0x80)
+                               p --;
+                       
+                       // credit the characters we've actually walked
+                       // by counting forward up to where this step began
+                       for (q = p; q < anchor; q = g_utf8_next_char (q))
+                               offset ++;
+                       
+               } while (offset < 0);
+       }
+       
+       return (gchar *)p;
 }
 
+/**
+ * g_utf8_pointer_to_offset:
+ * @str: pointer to the start of a character in a utf-8 string
+ * @pos: another pointer into the same string
+ *
+ * Counts the characters between @str and @pos by walking forward with
+ * g_utf8_next_char() from whichever pointer is lower until the other one
+ * is reached or passed.
+ *
+ * Return value: the character count, 0 when the pointers are equal and
+ * negated when @pos lies before @str.
+ **/
 glong
-utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
+g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
 {
-       glong ret, in_pos;
-       gunichar2 ch;
-       gboolean surrogate;
-
-       ret = 0;
-       in_pos = 0;
-       surrogate = FALSE;
-
-       while (len < 0 ? str [in_pos] : in_pos < len) {
-               ch = str [in_pos];
-               if (surrogate) {
-                       if (ch >= 0xDC00 && ch <= 0xDFFF) {
-                               ret += 4;
-                       } else {
-                               /* invalid surrogate pair */
-                               if (error) {
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
-                                       if (items_read)
-                                               *items_read = in_pos;
-                                       return -1;
-                               } /* otherwise just ignore. */
-                       }
-                       surrogate = FALSE;
-               } else {
-                       /* fast path optimization */
-                       if (ch < 0x80) {
-                               for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
-                                       if (str [in_pos] < 0x80)
-                                               ++ret;
-                                       else
-                                               break;
-                               }
-                               continue;
-                       }
-                       else if (ch < 0x0800)
-                               ret += 2;
-                       else if (ch >= 0xD800 && ch <= 0xDBFF)
-                               surrogate = TRUE;
-                       else if (ch >= 0xDC00 && ch <= 0xDFFF) {
-                               /* invalid surrogate pair */
-                               if (error) {
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
-                                       if (items_read)
-                                               *items_read = in_pos;
-                                       return -1;
-                               } /* otherwise just ignore. */
-                       }
-                       else
-                               ret += 3;
-               }
-               in_pos++;
+       const gchar *inptr, *inend;
+       glong offset = 0;
+       glong sign = 1;
+       
+       if (pos == str)
+               return 0;
+       
+       /* always walk from the lower address to the higher one */
+       if (str < pos) {
+               inptr = str;
+               inend = pos;
+       } else {
+               inptr = pos;
+               inend = str;
+               sign = -1;
         }
-
-       if (items_read)
-               *items_read = in_pos;
-       return ret;
+       
+       /* the pointers differ, so there is at least one character to count */
+       do {
+               inptr = g_utf8_next_char (inptr);
+               offset++;
+       } while (inptr < inend);
+       
+       return offset * sign;
 }