X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;ds=sidebyside;f=eglib%2Fsrc%2Fgutf8.c;h=fed6dd753ca6921ff34a30fa99b43c67906853e1;hb=a2b8537da3747d7eb506ecdbb91a8a2bc35a2248;hp=e1165663dc6ba316a9a440f522253caacfb7dc99;hpb=b5cfba1835f2ba823796f825410e0062b7e4c9a3;p=mono.git diff --git a/eglib/src/gutf8.c b/eglib/src/gutf8.c index e1165663dc6..fed6dd753ca 100644 --- a/eglib/src/gutf8.c +++ b/eglib/src/gutf8.c @@ -12,99 +12,50 @@ gpointer error_quark = "ERROR"; +static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error); +static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error); + gpointer g_convert_error_quark () { return error_quark; } -gunichar2* -g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error) +static gunichar* +utf8_case_conv (const gchar *str, gssize len, gboolean upper) { - /* The conversion logic is almost identical to UTF8Encoding.GetChars(), - but error check is always done at utf8_to_utf16_len() so that - the conversion core below simply resets erroreous bits */ - glong utf16_len; - gunichar2 *ret; - gchar ch, mb_size, mb_remain; - guint32 codepoint; - glong in_pos, out_pos; + glong i, u16len, u32len; + gunichar2 *u16str; + gunichar *u32str; + gchar *u8str; + GError **err = NULL; - utf16_len = 0; - mb_size = 0; - mb_remain = 0; - in_pos = 0; - out_pos = 0; - - if (error) - *error = NULL; - - utf16_len = utf8_to_utf16_len (str, len, items_read, error); - if (error) - if (*error) - return NULL; - if (utf16_len < 0) - return NULL; - - ret = g_malloc (utf16_len * sizeof (gunichar2)); - - for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) { - ch = (guchar) str [in_pos]; - if (mb_size == 0) { - if (0 < ch) - ret [out_pos++] = ch; - else if ((ch & 0xE0) == 0xC0) { - codepoint = ch & 0x1F; - mb_remain = mb_size = 2; - } else if ((ch & 0xF0) == 0xE0) { - codepoint = ch & 0x0F; - mb_remain = mb_size = 3; - } else if ((ch & 0xF8) == 0xF0) { - codepoint = ch & 7; - mb_remain = mb_size = 4; - } else if ((ch & 0xFC) == 0xF8) { - codepoint = ch & 3; - mb_remain = mb_size = 5; - } else if ((ch & 0xFE) == 0xFC) { - codepoint = ch & 3; - mb_remain = mb_size = 6; - } else { - /* invalid utf-8 sequence */ - codepoint = 0; - mb_remain = mb_size = 0; - } - } else { - if ((ch & 0xC0) == 0x80) { - codepoint = (codepoint << 6) | (ch & 0x3F); - if (--mb_remain == 0) { - /* multi byte character is fully consumed now. */ - if (codepoint < 0x10000) { - ret [out_pos++] = codepoint; - } else if (codepoint < 0x110000) { - /* surrogate pair */ - codepoint -= 0x10000; - ret [out_pos++] = (codepoint >> 10) + 0xD800; - ret [out_pos++] = (codepoint & 0x3FF) + 0xDC00; - } else { - /* invalid utf-8 sequence (excess) */ - codepoint = 0; - mb_remain = mb_size = 0; - } - } - } else { - /* invalid utf-8 sequence */ - codepoint = 0; - mb_remain = mb_size = 0; - } - } + u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err); + u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err); + for (i = 0; i < u32len; i++) { + u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]); } + g_free (u16str); + u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err); + u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err); + g_free (u32str); + g_free (u16str); + return (gunichar*)u8str; +} - if (items_written) - *items_written = out_pos; - return ret; +gchar* +g_utf8_strup (const gchar *str, gssize len) +{ + return (gchar*)utf8_case_conv (str, len, TRUE); } -glong +gchar* +g_utf8_strdown (const gchar *str, gssize len) +{ + return (gchar*)utf8_case_conv (str, len, FALSE); +} + +static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error) { /* It is almost identical to UTF8Encoding.GetCharCount() */ @@ -113,36 +64,50 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro guint32 codepoint; glong in_pos, ret; + if (len < 0) + len = (glong) strlen (str); + + in_pos = 0; + ret = 0; + + /* Common case */ + for (in_pos = 0; in_pos < len && (guchar) str [in_pos] < 0x80; in_pos++) + ret ++; + + if (in_pos == len) { + if (items_read) + *items_read = in_pos; + return ret; + } + mb_size = 0; mb_remain = 0; overlong = 0; - in_pos = 0; - ret = 0; - for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) { + for (; in_pos < len; in_pos++) { ch = str [in_pos]; if (mb_size == 0) { if (ch < 0x80) ret++; else if ((ch & 0xE0) == 0xC0) { codepoint = ch & 0x1F; - mb_remain = mb_size = 2; + mb_size = 2; } else if ((ch & 0xF0) == 0xE0) { codepoint = ch & 0x0F; - mb_remain = mb_size = 3; + mb_size = 3; } else if ((ch & 0xF8) == 0xF0) { codepoint = ch & 7; - mb_remain = mb_size = 4; + mb_size = 4; } else if ((ch & 0xFC) == 0xF8) { codepoint = ch & 3; - mb_remain = mb_size = 5; + mb_size = 5; } else if ((ch & 0xFE) == 0xFC) { codepoint = ch & 3; - mb_remain = mb_size = 6; + mb_size = 6; } else { /* invalid utf-8 sequence */ if (error) { - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos); + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos); if (items_read) *items_read = in_pos; return -1; @@ -151,6 +116,8 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro mb_remain = mb_size = 0; } } + if (mb_size > 1) + mb_remain = mb_size - 1; } else { if ((ch & 0xC0) == 0x80) { codepoint = (codepoint << 6) | (ch & 0x3F); @@ -183,7 +150,7 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro return -1; } else { codepoint = 0; - mb_remain = mb_size = 0; + mb_remain = 0; overlong = FALSE; } } @@ -201,14 +168,15 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro return -1; } else { codepoint = 0; - mb_remain = mb_size = 0; + mb_remain = 0; } } + mb_size = 0; } } else { /* invalid utf-8 sequence */ if (error) { - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos); + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos); if (items_read) *items_read = in_pos; return -1; @@ -225,6 +193,112 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro return ret; } +gunichar2* +g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error) +{ + /* The conversion logic is almost identical to UTF8Encoding.GetChars(), + but error check is always done at utf8_to_utf16_len() so that + the conversion core below simply resets erroreous bits */ + glong utf16_len; + gunichar2 *ret; + guchar ch, mb_size, mb_remain; + guint32 codepoint; + glong in_pos, out_pos; + + utf16_len = 0; + mb_size = 0; + mb_remain = 0; + in_pos = 0; + out_pos = 0; + + if (error) + *error = NULL; + + if (len < 0) + len = (glong) strlen (str); + + if (items_read) + *items_read = 0; + if (items_written) + *items_written = 0; + utf16_len = utf8_to_utf16_len (str, len, items_read, error); + if (error) + if (*error) + return NULL; + if (utf16_len < 0) + return NULL; + + ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2)); + + /* Common case */ + for (in_pos = 0; in_pos < len; in_pos++) { + ch = (guchar) str [in_pos]; + + if (ch >= 0x80) + break; + ret [out_pos++] = ch; + } + + for (; in_pos < len; in_pos++) { + ch = (guchar) str [in_pos]; + if (mb_size == 0) { + if (ch < 0x80) + ret [out_pos++] = ch; + else if ((ch & 0xE0) == 0xC0) { + codepoint = ch & 0x1F; + mb_size = 2; + } else if ((ch & 0xF0) == 0xE0) { + codepoint = ch & 0x0F; + mb_size = 3; + } else if ((ch & 0xF8) == 0xF0) { + codepoint = ch & 7; + mb_size = 4; + } else if ((ch & 0xFC) == 0xF8) { + codepoint = ch & 3; + mb_size = 5; + } else if ((ch & 0xFE) == 0xFC) { + codepoint = ch & 3; + mb_size = 6; + } else { + /* invalid utf-8 sequence */ + codepoint = 0; + mb_remain = mb_size = 0; + } + if (mb_size > 1) + mb_remain = mb_size - 1; + } else { + if ((ch & 0xC0) == 0x80) { + codepoint = (codepoint << 6) | (ch & 0x3F); + if (--mb_remain == 0) { + /* multi byte character is fully consumed now. */ + if (codepoint < 0x10000) { + ret [out_pos++] = (gunichar2)(codepoint % 0x10000); + } else if (codepoint < 0x110000) { + /* surrogate pair */ + codepoint -= 0x10000; + ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800); + ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00); + } else { + /* invalid utf-8 sequence (excess) */ + codepoint = 0; + mb_remain = 0; + } + mb_size = 0; + } + } else { + /* invalid utf-8 sequence */ + codepoint = 0; + mb_remain = mb_size = 0; + } + } + } + + ret [out_pos] = 0; + if (items_written) + *items_written = out_pos; + return ret; +} + gchar* g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error) { @@ -235,13 +309,17 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item gchar *ret; glong in_pos, out_pos; gunichar2 ch; - guint32 codepoint; + guint32 codepoint = 0; gboolean surrogate; in_pos = 0; out_pos = 0; surrogate = FALSE; + if (items_read) + *items_read = 0; + if (items_written) + *items_written = 0; utf8_len = utf16_to_utf8_len (str, len, items_read, error); if (error) if (*error) @@ -249,23 +327,26 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item if (utf8_len < 0) return NULL; - ret = g_malloc (utf8_len * sizeof (gchar)); + ret = g_malloc ((1+utf8_len) * sizeof (gchar)); while (len < 0 ? str [in_pos] : in_pos < len) { ch = str [in_pos]; if (surrogate) { - surrogate = 0; - if (ch >= 0xDC00 && ch <= 0xDFFF) + if (ch >= 0xDC00 && ch <= 0xDFFF) { codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10); - else + surrogate = 0; + } else { + surrogate = 0; /* invalid surrogate pair */ + ++in_pos; continue; + } } else { /* fast path optimization */ if (ch < 0x80) { for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) { if (str [in_pos] < 0x80) - ret [out_pos++] = str [in_pos]; + ret [out_pos++] = (gchar)(str [in_pos]); else break; } @@ -274,6 +355,7 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item else if (ch >= 0xD800 && ch <= 0xDBFF) surrogate = ch; else if (ch >= 0xDC00 && ch <= 0xDFFF) { + ++in_pos; /* invalid surrogate pair */ continue; } @@ -282,6 +364,8 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item } in_pos++; + if (surrogate != 0) + continue; if (codepoint < 0x80) ret [out_pos++] = (gchar) codepoint; else if (codepoint < 0x0800) { @@ -298,13 +382,14 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F)); } } + ret [out_pos] = 0; if (items_written) *items_written = out_pos; return ret; } -glong +static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error) { glong ret, in_pos; @@ -364,3 +449,171 @@ utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError ** *items_read = in_pos; return ret; } + +static glong +g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error) +{ + glong retlen = 0; + glong errindex = 0; + const gunichar *lstr = str; + + if (!str) + return 0; + + while (*lstr != '\0' && len--) { + gunichar ch; + ch = *lstr++; + if (ch <= 0x0000FFFF) { + if (ch >= 0xD800 && ch <= 0xDFFF) { + errindex = (glong)(lstr - str)-1; + if (error) + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + "Invalid sequence in conversion input"); + if (items_read) + *items_read = errindex; + return 0; + } else { + retlen++; + } + } else if (ch > 0x10FFFF) { + errindex = (glong)(lstr - str)-1; + if (error) + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + "Character out of range for UTF-16"); + if (items_read) + *items_read = errindex; + return 0; + + } else { + retlen+=2; + } + } + + if (items_read) + *items_read = (glong)(lstr - str); + return retlen; +} + +gunichar2* +g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error) +{ + glong allocsz; + gunichar2 *retstr = 0; + gunichar2 *retch = 0; + glong nwritten = 0; + GError *lerror =0 ; + + allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror); + + if (!lerror) { + retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2)); + retstr[allocsz] = '\0'; + + while (*str != '\0' && len--) { + gunichar ch; + ch = *str++; + if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) { + *retch++ = (gunichar2)ch; + nwritten ++; + } else { + ch -= 0x0010000UL; + *retch++ = (gunichar2)((ch >> 10) + 0xD800); + *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00); + nwritten +=2; + } + } + } + + if (items_written) + *items_written = nwritten; + if (error) + *error = lerror; + + return retstr; +} + +static glong +g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error) +{ + glong retlen = 0; + glong errindex = 0; + const gunichar2 *lstr = str; + gunichar2 ch,ch2; + + if (!str) + return 0; + + while (*lstr != '\0' && len--) { + ch = *lstr++; + if (ch >= 0xD800 && ch <= 0xDBFF) { + if (!len--) { + lstr--; + break; + } + ch2 = *lstr; + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + lstr++; + } else { + errindex = (glong)(lstr - str); + if (error) + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + "Invalid sequence in conversion input"); + if (items_read) + *items_read = errindex; + return 0; + } + } else { + if (ch >= 0xDC00 && ch <= 0xDFFF) { + errindex = (glong)(lstr - str)-1; + if (error) + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + "Invalid sequence in conversion input"); + if (items_read) + *items_read = errindex; + return 0; + } + } + retlen++; + } + + if (items_read) + *items_read = (glong)(lstr - str); + + return retlen; +} + +gunichar* +g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error) +{ + glong allocsz; + gunichar *retstr = 0; + gunichar *retch = 0; + glong nwritten = 0; + GError *lerror =0 ; + gunichar ch,ch2; + + allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror); + + if (!lerror) { + retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar)); + retstr[allocsz] = '\0'; + nwritten = allocsz; + + while (*str != '\0' && allocsz--) { + ch = *str++; + if (ch >= 0xD800 && ch <= 0xDBFF) { + ch2 = *str++; + ch = ((ch - (gunichar)0xD800) << 10) + + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL; + } + *retch++ = ch; + } + } + + if (items_written) + *items_written = nwritten; + if (error) + *error = lerror; + + return retstr; +}