/* * gutf8.c: UTF-8 conversion * * Author: * Atsushi Enomoto * * (C) 2006 Novell, Inc. */ #include #include gpointer error_quark = "ERROR"; static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error); static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error); gpointer g_convert_error_quark () { return error_quark; } static gunichar* utf8_case_conv (const gchar *str, gssize len, gboolean upper) { glong i, u16len, u32len; gunichar2 *u16str; gunichar *u32str; gchar *u8str; GError **err = NULL; u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err); u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err); for (i = 0; i < u32len; i++) { u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]); } g_free (u16str); u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err); u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err); g_free (u32str); g_free (u16str); return (gunichar*)u8str; } gchar* g_utf8_strup (const gchar *str, gssize len) { return (gchar*)utf8_case_conv (str, len, TRUE); } gchar* g_utf8_strdown (const gchar *str, gssize len) { return (gchar*)utf8_case_conv (str, len, FALSE); } static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error) { /* It is almost identical to UTF8Encoding.GetCharCount() */ guchar ch, mb_size, mb_remain; gboolean overlong; guint32 codepoint; glong in_pos, ret; if (len < 0) len = (glong) strlen (str); in_pos = 0; ret = 0; /* Common case */ for (in_pos = 0; in_pos < len && (guchar) str [in_pos] < 0x80; in_pos++) ret ++; if (in_pos == len) { if (items_read) *items_read = in_pos; return ret; } mb_size = 0; mb_remain = 0; overlong = 0; for (; in_pos < len; in_pos++) { ch = str [in_pos]; if (mb_size == 0) { if (ch < 0x80) ret++; else if ((ch & 0xE0) == 0xC0) { codepoint = ch & 0x1F; mb_size = 2; } else if ((ch & 0xF0) == 0xE0) { codepoint = ch & 0x0F; mb_size = 3; } else if ((ch & 0xF8) == 0xF0) { codepoint = ch & 7; mb_size = 4; } else if ((ch & 0xFC) == 0xF8) { codepoint = ch & 3; mb_size = 5; } else if ((ch & 0xFE) == 0xFC) { codepoint = ch & 3; mb_size = 6; } else { /* invalid utf-8 sequence */ if (error) { g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos); if (items_read) *items_read = in_pos; return -1; } else { codepoint = 0; mb_remain = mb_size = 0; } } if (mb_size > 1) mb_remain = mb_size - 1; } else { if ((ch & 0xC0) == 0x80) { codepoint = (codepoint << 6) | (ch & 0x3F); if (--mb_remain == 0) { /* multi byte character is fully consumed now. */ if (codepoint < 0x10000) { switch (mb_size) { case 2: overlong = codepoint < 0x7F; break; case 3: overlong = codepoint < 0x7FF; break; case 4: overlong = codepoint < 0xFFFF; break; case 5: overlong = codepoint < 0x1FFFFF; break; case 6: overlong = codepoint < 0x03FFFFFF; break; } if (overlong) { /* invalid utf-8 sequence (overlong) */ if (error) { g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos); if (items_read) *items_read = in_pos; return -1; } else { codepoint = 0; mb_remain = 0; overlong = FALSE; } } else ret++; } else if (codepoint < 0x110000) { /* surrogate pair */ ret += 2; } else { /* invalid utf-8 sequence (excess) */ if (error) { g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos); if (items_read) *items_read = in_pos; return -1; } else { codepoint = 0; mb_remain = 0; } } mb_size = 0; } } else { /* invalid utf-8 sequence */ if (error) { g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos); if (items_read) *items_read = in_pos; return -1; } else { codepoint = 0; mb_remain = mb_size = 0; } } } } if (items_read) *items_read = in_pos; return ret; } gunichar2* g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error) { /* The conversion logic is almost identical to UTF8Encoding.GetChars(), but error check is always done at utf8_to_utf16_len() so that the conversion core below simply resets erroreous bits */ glong utf16_len; gunichar2 *ret; guchar ch, mb_size, mb_remain; guint32 codepoint; glong in_pos, out_pos; utf16_len = 0; mb_size = 0; mb_remain = 0; in_pos = 0; out_pos = 0; if (error) *error = NULL; if (len < 0) len = (glong) strlen (str); if (items_read) *items_read = 0; if (items_written) *items_written = 0; utf16_len = utf8_to_utf16_len (str, len, items_read, error); if (error) if (*error) return NULL; if (utf16_len < 0) return NULL; ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2)); /* Common case */ for (in_pos = 0; in_pos < len; in_pos++) { ch = (guchar) str [in_pos]; if (ch >= 0x80) break; ret [out_pos++] = ch; } for (; in_pos < len; in_pos++) { ch = (guchar) str [in_pos]; if (mb_size == 0) { if (ch < 0x80) ret [out_pos++] = ch; else if ((ch & 0xE0) == 0xC0) { codepoint = ch & 0x1F; mb_size = 2; } else if ((ch & 0xF0) == 0xE0) { codepoint = ch & 0x0F; mb_size = 3; } else if ((ch & 0xF8) == 0xF0) { codepoint = ch & 7; mb_size = 4; } else if ((ch & 0xFC) == 0xF8) { codepoint = ch & 3; mb_size = 5; } else if ((ch & 0xFE) == 0xFC) { codepoint = ch & 3; mb_size = 6; } else { /* invalid utf-8 sequence */ codepoint = 0; mb_remain = mb_size = 0; } if (mb_size > 1) mb_remain = mb_size - 1; } else { if ((ch & 0xC0) == 0x80) { codepoint = (codepoint << 6) | (ch & 0x3F); if (--mb_remain == 0) { /* multi byte character is fully consumed now. */ if (codepoint < 0x10000) { ret [out_pos++] = (gunichar2)(codepoint % 0x10000); } else if (codepoint < 0x110000) { /* surrogate pair */ codepoint -= 0x10000; ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800); ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00); } else { /* invalid utf-8 sequence (excess) */ codepoint = 0; mb_remain = 0; } mb_size = 0; } } else { /* invalid utf-8 sequence */ codepoint = 0; mb_remain = mb_size = 0; } } } ret [out_pos] = 0; if (items_written) *items_written = out_pos; return ret; } gchar* g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error) { /* The conversion logic is almost identical to UTF8Encoding.GetBytes(), but error check is always done at utf16_to_utf8_len() so that the conversion core below simply resets erroreous bits */ glong utf8_len; gchar *ret; glong in_pos, out_pos; gunichar2 ch; guint32 codepoint = 0; gboolean surrogate; in_pos = 0; out_pos = 0; surrogate = FALSE; if (items_read) *items_read = 0; if (items_written) *items_written = 0; utf8_len = utf16_to_utf8_len (str, len, items_read, error); if (error) if (*error) return NULL; if (utf8_len < 0) return NULL; ret = g_malloc ((1+utf8_len) * sizeof (gchar)); while (len < 0 ? str [in_pos] : in_pos < len) { ch = str [in_pos]; if (surrogate) { if (ch >= 0xDC00 && ch <= 0xDFFF) { codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10); surrogate = 0; } else { surrogate = 0; /* invalid surrogate pair */ ++in_pos; continue; } } else { /* fast path optimization */ if (ch < 0x80) { for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) { if (str [in_pos] < 0x80) ret [out_pos++] = (gchar)(str [in_pos]); else break; } continue; } else if (ch >= 0xD800 && ch <= 0xDBFF) surrogate = ch; else if (ch >= 0xDC00 && ch <= 0xDFFF) { ++in_pos; /* invalid surrogate pair */ continue; } else codepoint = ch; } in_pos++; if (surrogate != 0) continue; if (codepoint < 0x80) ret [out_pos++] = (gchar) codepoint; else if (codepoint < 0x0800) { ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6)); ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F)); } else if (codepoint < 0x10000) { ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12)); ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F)); ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F)); } else { ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18)); ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F)); ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F)); ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F)); } } ret [out_pos] = 0; if (items_written) *items_written = out_pos; return ret; } static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error) { glong ret, in_pos; gunichar2 ch; gboolean surrogate; ret = 0; in_pos = 0; surrogate = FALSE; while (len < 0 ? str [in_pos] : in_pos < len) { ch = str [in_pos]; if (surrogate) { if (ch >= 0xDC00 && ch <= 0xDFFF) { ret += 4; } else { /* invalid surrogate pair */ if (error) { g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos); if (items_read) *items_read = in_pos; return -1; } /* otherwise just ignore. */ } surrogate = FALSE; } else { /* fast path optimization */ if (ch < 0x80) { for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) { if (str [in_pos] < 0x80) ++ret; else break; } continue; } else if (ch < 0x0800) ret += 2; else if (ch >= 0xD800 && ch <= 0xDBFF) surrogate = TRUE; else if (ch >= 0xDC00 && ch <= 0xDFFF) { /* invalid surrogate pair */ if (error) { g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos); if (items_read) *items_read = in_pos; return -1; } /* otherwise just ignore. */ } else ret += 3; } in_pos++; } if (items_read) *items_read = in_pos; return ret; } gchar * g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error) { gchar *outbuf, *outptr; glong nwritten = 0; glong i; gint n; if (len == -1) { for (i = 0; str[i] != 0; i++) { if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) { g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Invalid sequence in conversion input"); if (items_read) *items_read = i; return NULL; } nwritten += n; } } else { for (i = 0; i < len; i++) { if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) { g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Invalid sequence in conversion input"); if (items_read) *items_read = i; return NULL; } nwritten += n; } } outptr = outbuf = g_malloc (nwritten + 1); if (len == -1) { for (i = 0; str[i] != 0; i++) outptr += g_unichar_to_utf8 (str[i], outptr); } else { for (i = 0; i < len; i++) outptr += g_unichar_to_utf8 (str[i], outptr); } *outptr = '\0'; if (items_written) *items_written = nwritten; if (items_read != 0) *items_read = i; return outbuf; } static glong g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error) { glong retlen = 0; glong errindex = 0; const gunichar *lstr = str; if (!str) return 0; while (*lstr != '\0' && len--) { gunichar ch; ch = *lstr++; if (ch <= 0x0000FFFF) { if (ch >= 0xD800 && ch <= 0xDFFF) { errindex = (glong)(lstr - str)-1; if (error) g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Invalid sequence in conversion input"); if (items_read) *items_read = errindex; return 0; } else { retlen++; } } else if (ch > 0x10FFFF) { errindex = (glong)(lstr - str)-1; if (error) g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Character out of range for UTF-16"); if (items_read) *items_read = errindex; return 0; } else { retlen+=2; } } if (items_read) *items_read = (glong)(lstr - str); return retlen; } gunichar2* g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error) { glong allocsz; gunichar2 *retstr = 0; gunichar2 *retch = 0; glong nwritten = 0; GError *lerror =0 ; allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror); if (!lerror) { retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2)); retstr[allocsz] = '\0'; while (*str != '\0' && len--) { gunichar ch; ch = *str++; if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) { *retch++ = (gunichar2)ch; nwritten ++; } else { ch -= 0x0010000UL; *retch++ = (gunichar2)((ch >> 10) + 0xD800); *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00); nwritten +=2; } } } if (items_written) *items_written = nwritten; if (error) *error = lerror; return retstr; } static glong g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error) { glong retlen = 0; glong errindex = 0; const gunichar2 *lstr = str; gunichar2 ch,ch2; if (!str) return 0; while (*lstr != '\0' && len--) { ch = *lstr++; if (ch >= 0xD800 && ch <= 0xDBFF) { if (!len--) { lstr--; break; } ch2 = *lstr; if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { lstr++; } else { errindex = (glong)(lstr - str); if (error) g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Invalid sequence in conversion input"); if (items_read) *items_read = errindex; return 0; } } else { if (ch >= 0xDC00 && ch <= 0xDFFF) { errindex = (glong)(lstr - str)-1; if (error) g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Invalid sequence in conversion input"); if (items_read) *items_read = errindex; return 0; } } retlen++; } if (items_read) *items_read = (glong)(lstr - str); return retlen; } gunichar* g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error) { glong allocsz; gunichar *retstr = 0; gunichar *retch = 0; glong nwritten = 0; GError *lerror =0 ; gunichar ch,ch2; allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror); if (!lerror) { retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar)); retstr[allocsz] = '\0'; nwritten = allocsz; while (*str != '\0' && allocsz--) { ch = *str++; if (ch >= 0xD800 && ch <= 0xDBFF) { ch2 = *str++; ch = ((ch - (gunichar)0xD800) << 10) + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL; } *retch++ = ch; } } if (items_written) *items_written = nwritten; if (error) *error = lerror; return retstr; } gchar * g_utf8_find_prev_char (const gchar *str, const gchar *p) { while (p > str) { p--; if ((*p && 0xc0) != 0xb0) return (gchar *)p; } return NULL; } gchar * g_utf8_prev_char (const gchar *str) { const gchar *p = str; do { p--; } while ((*p & 0xc0) == 0xb0); return (gchar *)p; } gchar * g_utf8_offset_to_pointer (const gchar *str, glong offset) { const gchar *p = str; if (offset > 0) { do { p = g_utf8_next_char (p); offset --; } while (offset > 0); } else if (offset < 0) { const gchar *jump = str; do { // since the minimum size of a character is 1 // we know we can step back at least offset bytes jump = jump + offset; // if we land in the middle of a character // walk to the beginning while ((*jump & 0xc0) == 0xb0) jump --; // count how many characters we've actually walked // by going forward p = jump; do { p = g_utf8_next_char (p); offset ++; } while (p < jump); } while (offset < 0); } return (gchar *)p; } glong g_utf8_pointer_to_offset (const gchar *str, const gchar *pos) { const gchar *inptr, *inend; glong offset = 0; glong sign = 1; if (pos == str) return 0; if (str < pos) { inptr = str; inend = pos; } else { inptr = pos; inend = str; sign = -1; } do { inptr = g_utf8_next_char (inptr); offset++; } while (inptr < inend); return offset * sign; } gunichar* g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) { gunichar* ucs4; int ucs4_index; const char *p; int mb_size; gunichar codepoint; g_return_val_if_fail (str != NULL, NULL); if (len < 0) { /* we need to find the length of str, as len < 0 means it must be 0 terminated */ len = 0; p = str; while (*p) { len ++; p = g_utf8_next_char(p); } } ucs4 = g_malloc (sizeof(gunichar)*len); if (items_written) *items_written = len; p = str; ucs4_index = 0; while (len) { guint8 c = *p++; if (c < 0x80) { mb_size = 1; } else if (c < 0xe0) { c &= 0x1f; mb_size = 2; } else if (c < 0xf0) { c &= 0x0f; mb_size = 3; } else if (c < 0xf8) { c &= 0x07; mb_size = 4; } else if (c < 0xfc) { c &= 0x03; mb_size = 5; } else if (c < 0xfe) { c &= 0x01; mb_size = 6; } codepoint = c; while (--mb_size) { codepoint = (codepoint << 6) | ((*p) & 0x3f); p++; } ucs4[ucs4_index++] = codepoint; len --; } return ucs4; } /** * from http://home.tiscali.nl/t876506/utf8tbl.html * * From Unicode UCS-4 to UTF-8: * Start with the Unicode number expressed as a decimal number and call this ud. * * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud. * * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long. * byte 1 = 192 + (ud div 64) * byte 2 = 128 + (ud mod 64) * * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long. * byte 1 = 224 + (ud div 4096) * byte 2 = 128 + ((ud div 64) mod 64) * byte 3 = 128 + (ud mod 64) * * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long. * byte 1 = 240 + (ud div 262144) * byte 2 = 128 + ((ud div 4096) mod 64) * byte 3 = 128 + ((ud div 64) mod 64) * byte 4 = 128 + (ud mod 64) * * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long. * byte 1 = 248 + (ud div 16777216) * byte 2 = 128 + ((ud div 262144) mod 64) * byte 3 = 128 + ((ud div 4096) mod 64) * byte 4 = 128 + ((ud div 64) mod 64) * byte 5 = 128 + (ud mod 64) * * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long. * byte 1 = 252 + (ud div 1073741824) * byte 2 = 128 + ((ud div 16777216) mod 64) * byte 3 = 128 + ((ud div 262144) mod 64) * byte 4 = 128 + ((ud div 4096) mod 64) * byte 5 = 128 + ((ud div 64) mod 64) * byte 6 = 128 + (ud mod 64) **/ gint g_unichar_to_utf8 (gunichar c, gchar *outbuf) { size_t len, i; int base; if (c < 128UL) { base = 0; len = 1; } else if (c < 2048UL) { base = 192; len = 2; } else if (c < 65536UL) { base = 224; len = 3; } else if (c < 2097152UL) { base = 240; len = 4; } else if (c < 67108864UL) { base = 248; len = 5; } else if (c < 2147483648UL) { base = 252; len = 6; } else { return -1; } if (outbuf != NULL) { for (i = len - 1; i > 0; i--) { /* mask off 6 bits worth and add 128 */ outbuf[i] = 128 + (c & 0x3f); c >>= 6; } /* first character has a different base */ outbuf[0] = base + c; } return len; }