[eglib] Move definition of GPid to eglib-config.h

[mono.git] / eglib / src / gutf8.c
diff --git a/eglib/src/gutf8.c b/eglib/src/gutf8.c

index 6239178b04b868ea758ac4edbd05ae25a82b19a8..fed6dd753ca6921ff34a30fa99b43c67906853e1 100644 (file)
--- a/eglib/src/gutf8.c
+++ b/eglib/src/gutf8.c
@@ -21,91 +21,38 @@ g_convert_error_quark ()
         return error_quark;
  }
  
-gunichar2*
-g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
+static gunichar* /* NOTE(review): the value returned is really a UTF-8 gchar* (see the cast at the end) */
+utf8_case_conv (const gchar *str, gssize len, gboolean upper) /* maps every code point of str to upper or lower case */
  {
-       /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
-          but error check is always done at utf8_to_utf16_len() so that
-          the conversion core below simply resets erroreous bits */
-       glong utf16_len;
-       gunichar2 *ret;
-       gchar ch, mb_size, mb_remain;
-       guint32 codepoint;
-       glong in_pos, out_pos;
-
-       utf16_len = 0;
-       mb_size = 0;
-       mb_remain = 0;
-       in_pos = 0;
-       out_pos = 0;
+       glong i, u16len, u32len;
+       gunichar2 *u16str;
+       gunichar *u32str;
+       gchar *u8str;
+       GError **err = NULL; /* NULL is passed as the error argument of every conversion below */
  
-       if (error)
-               *error = NULL;
-
-       utf16_len = utf8_to_utf16_len (str, len, items_read, error);
-       if (error)
-               if (*error)
-                       return NULL;
-       if (utf16_len < 0)
-               return NULL;
-
-       ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
-
-       for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
-               ch = (guchar) str [in_pos];
-               if (mb_size == 0) {
-                       if (0 < ch)
-                               ret [out_pos++] = ch;
-                       else if ((ch & 0xE0) == 0xC0) {
-                               codepoint = ch & 0x1F;
-                               mb_remain = mb_size = 2;
-                       } else if ((ch & 0xF0) == 0xE0) {
-                               codepoint = ch & 0x0F;
-                               mb_remain = mb_size = 3;
-                       } else if ((ch & 0xF8) == 0xF0) {
-                               codepoint = ch & 7;
-                               mb_remain = mb_size = 4;
-                       } else if ((ch & 0xFC) == 0xF8) {
-                               codepoint = ch & 3;
-                               mb_remain = mb_size = 5;
-                       } else if ((ch & 0xFE) == 0xFC) {
-                               codepoint = ch & 3;
-                               mb_remain = mb_size = 6;
-                       } else {
-                               /* invalid utf-8 sequence */
-                               codepoint = 0;
-                               mb_remain = mb_size = 0;
-                       }
-               } else {
-                       if ((ch & 0xC0) == 0x80) {
-                               codepoint = (codepoint << 6) | (ch & 0x3F);
-                               if (--mb_remain == 0) {
-                                       /* multi byte character is fully consumed now. */
-                                       if (codepoint < 0x10000) {
-                                               ret [out_pos++] = codepoint;
-                                       } else if (codepoint < 0x110000) {
-                                               /* surrogate pair */
-                                               codepoint -= 0x10000;
-                                               ret [out_pos++] = (codepoint >> 10) + 0xD800;
-                                               ret [out_pos++] = (codepoint & 0x3FF) + 0xDC00;
-                                       } else {
-                                               /* invalid utf-8 sequence (excess) */
-                                               codepoint = 0;
-                                               mb_remain = mb_size = 0;
-                                       }
-                               }
-                       } else {
-                               /* invalid utf-8 sequence */
-                               codepoint = 0;
-                               mb_remain = mb_size = 0;
-                       }
-               }
+       u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err); /* UTF-8 -> UTF-16; NOTE(review): result is not checked for NULL */
+       u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err); /* UTF-16 -> UCS-4, one array element per code point */
+       for (i = 0; i < u32len; i++) {
+               u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]); /* case-map in place */
         }
+       g_free (u16str); /* the first UTF-16 copy is no longer needed */
+       u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err); /* UCS-4 -> UTF-16 */
+       u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err); /* UTF-16 -> UTF-8: this is the buffer handed back */
+       g_free (u32str);
+       g_free (u16str);
+       return (gunichar*)u8str; /* pointer cast only; g_utf8_strup/g_utf8_strdown cast it back to gchar* */
+}
  
-       ret [out_pos] = 0;
-       if (items_written)
-               *items_written = out_pos;
-       return ret;
+gchar* /* result of utf8_case_conv() with upper = TRUE; NOTE(review): presumably owned by the caller - confirm */
+g_utf8_strup (const gchar *str, gssize len)
+{
+       return (gchar*)utf8_case_conv (str, len, TRUE); /* the cast undoes the helper's gunichar* return type */
+}
+
+gchar* /* result of utf8_case_conv() with upper = FALSE; NOTE(review): presumably owned by the caller - confirm */
+g_utf8_strdown (const gchar *str, gssize len)
+{
+       return (gchar*)utf8_case_conv (str, len, FALSE); /* the cast undoes the helper's gunichar* return type */
  }
  
  static glong
@@ -117,36 +64,50 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
         guint32 codepoint;
         glong in_pos, ret;
  
+       if (len < 0)
+               len = (glong) strlen (str);
+
+       in_pos = 0;
+       ret = 0;
+
+       /* Common case */
+       for (in_pos = 0; in_pos < len && (guchar) str [in_pos] < 0x80; in_pos++)
+               ret ++;
+
+       if (in_pos == len) {
+               if (items_read)
+                       *items_read = in_pos;
+               return ret;
+       }
+
         mb_size = 0;
         mb_remain = 0;
         overlong = 0;
-       in_pos = 0;
-       ret = 0;
  
-       for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+       for (; in_pos < len; in_pos++) {
                 ch = str [in_pos];
                 if (mb_size == 0) {
                         if (ch < 0x80)
                                 ret++;
                         else if ((ch & 0xE0) == 0xC0) {
                                 codepoint = ch & 0x1F;
-                               mb_remain = mb_size = 2;
+                               mb_size = 2;
                         } else if ((ch & 0xF0) == 0xE0) {
                                 codepoint = ch & 0x0F;
-                               mb_remain = mb_size = 3;
+                               mb_size = 3;
                         } else if ((ch & 0xF8) == 0xF0) {
                                 codepoint = ch & 7;
-                               mb_remain = mb_size = 4;
+                               mb_size = 4;
                         } else if ((ch & 0xFC) == 0xF8) {
                                 codepoint = ch & 3;
-                               mb_remain = mb_size = 5;
+                               mb_size = 5;
                         } else if ((ch & 0xFE) == 0xFC) {
                                 codepoint = ch & 3;
-                               mb_remain = mb_size = 6;
+                               mb_size = 6;
                         } else {
                                 /* invalid utf-8 sequence */
                                 if (error) {
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
+                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
                                         if (items_read)
                                                 *items_read = in_pos;
                                         return -1;
@@ -155,6 +116,8 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
                                         mb_remain = mb_size = 0;
                                 }
                         }
+                       if (mb_size > 1)
+                               mb_remain = mb_size - 1;
                 } else {
                         if ((ch & 0xC0) == 0x80) {
                                 codepoint = (codepoint << 6) | (ch & 0x3F);
@@ -187,7 +150,7 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
                                                                 return -1;
                                                         } else {
                                                                 codepoint = 0;
-                                                               mb_remain = mb_size = 0;
+                                                               mb_remain = 0;
                                                                 overlong = FALSE;
                                                         }
                                                 }
@@ -205,14 +168,15 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
                                                         return -1;
                                                 } else {
                                                         codepoint = 0;
-                                                       mb_remain = mb_size = 0;
+                                                       mb_remain = 0;
                                                 }
                                         }
+                                       mb_size = 0;
                                 }
                         } else {
                                 /* invalid utf-8 sequence */
                                 if (error) {
-                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
+                                       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
                                         if (items_read)
                                                 *items_read = in_pos;
                                         return -1;
@@ -229,6 +193,112 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
         return ret;
  }
  
+gunichar2*
+g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+       /* Converts UTF-8 str into a g_malloc'ed, zero-terminated UTF-16 string.  The logic
+          is almost identical to UTF8Encoding.GetChars(), but error checking is always
+          done in utf8_to_utf16_len() so the core below simply resets erroneous bits */
+       glong utf16_len;
+       gunichar2 *ret;
+       guchar ch, mb_size, mb_remain;
+       guint32 codepoint;
+       glong in_pos, out_pos;
+
+       utf16_len = 0;
+       mb_size = 0; /* length of the sequence being decoded; 0 while between characters */
+       mb_remain = 0; /* continuation bytes still expected for that sequence */
+       in_pos = 0;
+       out_pos = 0;
+
+       if (error)
+               *error = NULL;
+       /* a negative len means str is zero-terminated: measure it once up front */
+       if (len < 0)
+               len = (glong) strlen (str);
+
+       if (items_read)
+               *items_read = 0;
+       if (items_written)
+               *items_written = 0;
+       utf16_len = utf8_to_utf16_len (str, len, items_read, error); /* checking pass; its result sizes the output */
+       if (error)
+               if (*error)
+                       return NULL;
+       if (utf16_len < 0)
+               return NULL;
+
+       ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2)); /* +1 for the terminating zero */
+
+       /* Common case: copy the leading run of ASCII bytes straight across */
+       for (in_pos = 0; in_pos < len; in_pos++) {
+               ch = (guchar) str [in_pos];
+
+               if (ch >= 0x80)
+                       break;
+               ret [out_pos++] = ch;
+       }
+       /* General case: continue from the first non-ASCII byte, decoding multi-byte sequences */
+       for (; in_pos < len; in_pos++) {
+               ch = (guchar) str [in_pos];
+               if (mb_size == 0) {
+                       if (ch < 0x80)
+                               ret [out_pos++] = ch;
+                       else if ((ch & 0xE0) == 0xC0) {
+                               codepoint = ch & 0x1F;
+                               mb_size = 2;
+                       } else if ((ch & 0xF0) == 0xE0) {
+                               codepoint = ch & 0x0F;
+                               mb_size = 3;
+                       } else if ((ch & 0xF8) == 0xF0) {
+                               codepoint = ch & 7;
+                               mb_size = 4;
+                       } else if ((ch & 0xFC) == 0xF8) {
+                               codepoint = ch & 3;
+                               mb_size = 5;
+                       } else if ((ch & 0xFE) == 0xFC) {
+                               codepoint = ch & 3;
+                               mb_size = 6;
+                       } else {
+                               /* invalid utf-8 sequence */
+                               codepoint = 0;
+                               mb_remain = mb_size = 0;
+                       }
+                       if (mb_size > 1)
+                               mb_remain = mb_size - 1; /* the lead byte itself has just been consumed */
+               } else {
+                       if ((ch & 0xC0) == 0x80) {
+                               codepoint = (codepoint << 6) | (ch & 0x3F);
+                               if (--mb_remain == 0) {
+                                       /* multi byte character is fully consumed now. */
+                                       if (codepoint < 0x10000) {
+                                               ret [out_pos++] = (gunichar2)(codepoint % 0x10000); /* codepoint < 0x10000 here, so the modulo changes nothing */
+                                       } else if (codepoint < 0x110000) {
+                                               /* surrogate pair */
+                                               codepoint -= 0x10000;
+                                               ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
+                                               ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
+                                       } else {
+                                               /* invalid utf-8 sequence (excess) */
+                                               codepoint = 0;
+                                               mb_remain = 0;
+                                       }
+                                       mb_size = 0; /* back to "between characters" */
+                               }
+                       } else {
+                               /* invalid utf-8 sequence */
+                               codepoint = 0;
+                               mb_remain = mb_size = 0;
+                       }
+               }
+       }
+
+       ret [out_pos] = 0; /* zero-terminate the UTF-16 output */
+       if (items_written)
+               *items_written = out_pos;
+       return ret;
+}
+
  gchar*
  g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
  {
@@ -239,13 +309,17 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
         gchar *ret;
         glong in_pos, out_pos;
         gunichar2 ch;
-       guint32 codepoint;
+       guint32 codepoint = 0;
         gboolean surrogate;
  
         in_pos = 0;
         out_pos = 0;
         surrogate = FALSE;
  
+       if (items_read)
+               *items_read = 0;
+       if (items_written)
+               *items_written = 0;
         utf8_len = utf16_to_utf8_len (str, len, items_read, error);
         if (error)
                 if (*error)
@@ -258,18 +332,21 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
         while (len < 0 ? str [in_pos] : in_pos < len) {
                 ch = str [in_pos];
                 if (surrogate) {
-                       surrogate = 0;
-                       if (ch >= 0xDC00 && ch <= 0xDFFF)
+                       if (ch >= 0xDC00 && ch <= 0xDFFF) {
                                 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
-                       else
+                               surrogate = 0;
+                       } else {
+                               surrogate = 0;
                                 /* invalid surrogate pair */
+                               ++in_pos;
                                 continue;
+                       }
                 } else {
                         /* fast path optimization */
                         if (ch < 0x80) {
                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
                                         if (str [in_pos] < 0x80)
-                                               ret [out_pos++] = str [in_pos];
+                                               ret [out_pos++] = (gchar)(str [in_pos]);
                                         else
                                                 break;
                                 }
@@ -278,6 +355,7 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
                         else if (ch >= 0xD800 && ch <= 0xDBFF)
                                 surrogate = ch;
                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
+                               ++in_pos;
                                 /* invalid surrogate pair */
                                 continue;
                         }
@@ -286,6 +364,8 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
                 }
                 in_pos++;
  
+               if (surrogate != 0)
+                       continue;
                 if (codepoint < 0x80)
                         ret [out_pos++] = (gchar) codepoint;
                 else if (codepoint < 0x0800) {
@@ -369,3 +449,171 @@ utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **
                 *items_read = in_pos;
         return ret;
  }
+
+/* Count the UTF-16 code units needed to encode the UCS-4 string str.
+   Scanning stops at a zero code point or after len code points (a negative
+   len never runs out, so only the zero stops it).  A NULL str counts as 0.
+   On a surrogate (0xD800-0xDFFF) or a value above 0x10FFFF: sets *error when
+   error is non-NULL, stores the offending index in *items_read when
+   items_read is non-NULL, and returns 0.  Otherwise *items_read receives the
+   number of code points consumed and the unit count is returned. */
+static glong
+g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
+{
+       glong retlen = 0;
+       const gunichar *lstr = str;
+
+       if (!str)
+               return 0;
+
+       /* Test len before dereferencing so that a counted, non-terminated
+          buffer is never read past its last element. */
+       while (len-- && *lstr != '\0') {
+               gunichar ch = *lstr++;
+               const gchar *msg = NULL;
+
+               if (ch >= 0xD800 && ch <= 0xDFFF)
+                       msg = "Invalid sequence in conversion input";
+               else if (ch > 0x10FFFF)
+                       msg = "Character out of range for UTF-16";
+
+               if (msg) {
+                       if (error)
+                               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                               "%s", msg);
+                       if (items_read)
+                               *items_read = (glong)(lstr - str) - 1;
+                       return 0;
+               }
+               /* BMP code points take one unit, everything else a surrogate pair */
+               retlen += (ch <= 0x0000FFFF) ? 1 : 2;
+       }
+
+       if (items_read)
+               *items_read = (glong)(lstr - str);
+       return retlen;
+}
+
+/* Convert the UCS-4 string str (len code points, or up to the first zero when
+   len < 0) into a newly g_malloc'ed, zero-terminated UTF-16 string and store
+   the number of units written in *items_written.  If the length pass reports
+   an error, returns NULL and hands the error to *error (dropped if NULL). */
+gunichar2*
+g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+       glong allocsz, nwritten = 0;
+       gunichar2 *retstr = NULL, *retch = NULL;
+       GError *lerror = NULL;
+
+       allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
+       if (!lerror) {
+               retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
+               retstr[allocsz] = '\0';
+               /* Check str and len before reading *str: a NULL str gives an empty
+                  result, and a counted buffer need not be zero-terminated. */
+               while (str && len-- && *str != '\0') {
+                       gunichar ch = *str++;
+                       if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
+                               *retch++ = (gunichar2)ch;
+                               nwritten ++;
+                       } else {
+                               ch -= 0x0010000UL;
+                               *retch++ = (gunichar2)((ch >> 10) + 0xD800);
+                               *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
+                               nwritten +=2;
+                       }
+               }
+       }
+
+       if (items_written)
+               *items_written = nwritten;
+       if (error)
+               *error = lerror;
+       return retstr;
+}
+
+/* Count the UCS-4 characters encoded by the UTF-16 string str.  Scanning stops
+   at a zero unit or after len units (a negative len never runs out); a high
+   surrogate that is the last counted unit is left unconsumed.  An unpaired
+   surrogate sets *error and stores the index where scanning failed in
+   *items_read (each only if non-NULL) and returns 0.  NULL str counts as 0. */
+static glong
+g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
+{
+       glong retlen = 0;
+       const gunichar2 *lstr = str;
+
+       if (!str)
+               return 0;
+
+       /* Test len before dereferencing so that a counted, non-terminated
+          buffer is never read past its last unit. */
+       while (len-- && *lstr != '\0') {
+               gunichar2 ch = *lstr++;
+               const gunichar2 *bad = NULL;
+
+               if (ch >= 0xD800 && ch <= 0xDBFF) {
+                       if (!len--) {
+                               /* pair cut off by len: give the high surrogate back */
+                               lstr--;
+                               break;
+                       }
+                       if (*lstr >= 0xDC00 && *lstr <= 0xDFFF)
+                               lstr++;
+                       else
+                               bad = lstr; /* high surrogate not followed by a low one */
+               } else if (ch >= 0xDC00 && ch <= 0xDFFF) {
+                       bad = lstr - 1; /* low surrogate with no high one before it */
+               }
+
+               if (bad) {
+                       if (error)
+                               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                               "Invalid sequence in conversion input");
+                       if (items_read)
+                               *items_read = (glong)(bad - str);
+                       return 0;
+               }
+               retlen++;
+       }
+
+       if (items_read)
+               *items_read = (glong)(lstr - str);
+       return retlen;
+}
+
+/* Convert the UTF-16 string str (len units, or up to the first zero unit when
+   len < 0) into a newly g_malloc'ed, zero-terminated UCS-4 string; the character
+   count goes to *items_written.  On a length-pass error: NULL, error in *error. */
+gunichar*
+g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+       glong allocsz, nwritten = 0;
+       gunichar *retstr = NULL, *retch = NULL;
+       GError *lerror = NULL;
+       gunichar ch, ch2;
+
+       allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
+       if (!lerror) {
+               retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
+               retstr[allocsz] = '\0';
+               nwritten = allocsz;
+               /* Test the remaining count first: when it is 0, str may be NULL or
+                  an empty non-terminated buffer and must not be read. */
+               while (allocsz-- && *str != '\0') {
+                       ch = *str++;
+                       if (ch >= 0xD800 && ch <= 0xDBFF) {
+                               ch2 = *str++;
+                               ch = ((ch - (gunichar)0xD800) << 10)
+                                     + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
+                       }
+                       *retch++ = ch;
+               }
+       }
+
+       if (items_written)
+               *items_written = nwritten;
+       if (error)
+               *error = lerror;
+       return retstr;
+}