2 * gutf8.c: UTF-8 conversion
5 * Atsushi Enomoto <atsushi@ximian.com>
7 * (C) 2006 Novell, Inc.
13 gpointer error_quark = "ERROR";
15 static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
16 static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
19 g_convert_error_quark ()
25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
27 glong i, u16len, u32len;
33 u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
34 u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
35 for (i = 0; i < u32len; i++) {
36 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
39 u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
40 u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
43 return (gunichar*)u8str;
47 g_utf8_strup (const gchar *str, gssize len)
49 return (gchar*)utf8_case_conv (str, len, TRUE);
53 g_utf8_strdown (const gchar *str, gssize len)
55 return (gchar*)utf8_case_conv (str, len, FALSE);
59 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
61 /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
62 but error check is always done at utf8_to_utf16_len() so that
63 the conversion core below simply resets erroneous bits */
66 guchar ch, mb_size, mb_remain;
68 glong in_pos, out_pos;
81 utf16_len = utf8_to_utf16_len (str, len, items_read, error);
88 ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
90 for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
91 ch = (guchar) str [in_pos];
95 else if ((ch & 0xE0) == 0xC0) {
96 codepoint = ch & 0x1F;
98 } else if ((ch & 0xF0) == 0xE0) {
99 codepoint = ch & 0x0F;
101 } else if ((ch & 0xF8) == 0xF0) {
104 } else if ((ch & 0xFC) == 0xF8) {
107 } else if ((ch & 0xFE) == 0xFC) {
111 /* invalid utf-8 sequence */
113 mb_remain = mb_size = 0;
116 mb_remain = mb_size - 1;
118 if ((ch & 0xC0) == 0x80) {
119 codepoint = (codepoint << 6) | (ch & 0x3F);
120 if (--mb_remain == 0) {
121 /* multi byte character is fully consumed now. */
122 if (codepoint < 0x10000) {
123 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
124 } else if (codepoint < 0x110000) {
126 codepoint -= 0x10000;
127 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
128 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
130 /* invalid utf-8 sequence (excess) */
137 /* invalid utf-8 sequence */
139 mb_remain = mb_size = 0;
146 *items_written = out_pos;
151 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
153 /* It is almost identical to UTF8Encoding.GetCharCount() */
154 guchar ch, mb_size, mb_remain;
165 for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
170 else if ((ch & 0xE0) == 0xC0) {
171 codepoint = ch & 0x1F;
173 } else if ((ch & 0xF0) == 0xE0) {
174 codepoint = ch & 0x0F;
176 } else if ((ch & 0xF8) == 0xF0) {
179 } else if ((ch & 0xFC) == 0xF8) {
182 } else if ((ch & 0xFE) == 0xFC) {
186 /* invalid utf-8 sequence */
188 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
190 *items_read = in_pos;
194 mb_remain = mb_size = 0;
198 mb_remain = mb_size - 1;
200 if ((ch & 0xC0) == 0x80) {
201 codepoint = (codepoint << 6) | (ch & 0x3F);
202 if (--mb_remain == 0) {
203 /* multi byte character is fully consumed now. */
204 if (codepoint < 0x10000) {
207 overlong = codepoint < 0x7F;
210 overlong = codepoint < 0x7FF;
213 overlong = codepoint < 0xFFFF;
216 overlong = codepoint < 0x1FFFFF;
219 overlong = codepoint < 0x03FFFFFF;
223 /* invalid utf-8 sequence (overlong) */
225 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
227 *items_read = in_pos;
237 } else if (codepoint < 0x110000) {
241 /* invalid utf-8 sequence (excess) */
243 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
245 *items_read = in_pos;
255 /* invalid utf-8 sequence */
257 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
259 *items_read = in_pos;
263 mb_remain = mb_size = 0;
270 *items_read = in_pos;
275 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
277 /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
278 but error check is always done at utf16_to_utf8_len() so that
279 the conversion core below simply resets erroneous bits */
282 glong in_pos, out_pos;
284 guint32 codepoint = 0;
293 utf8_len = utf16_to_utf8_len (str, len, items_read, error);
300 ret = g_malloc ((1+utf8_len) * sizeof (gchar));
302 while (len < 0 ? str [in_pos] : in_pos < len) {
305 if (ch >= 0xDC00 && ch <= 0xDFFF) {
306 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
310 /* invalid surrogate pair */
314 /* fast path optimization */
316 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
317 if (str [in_pos] < 0x80)
318 ret [out_pos++] = (gchar)(str [in_pos]);
324 else if (ch >= 0xD800 && ch <= 0xDBFF)
326 else if (ch >= 0xDC00 && ch <= 0xDFFF) {
327 /* invalid surrogate pair */
337 if (codepoint < 0x80)
338 ret [out_pos++] = (gchar) codepoint;
339 else if (codepoint < 0x0800) {
340 ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
341 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
342 } else if (codepoint < 0x10000) {
343 ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
344 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
345 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
347 ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
348 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
349 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
350 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
356 *items_written = out_pos;
361 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
371 while (len < 0 ? str [in_pos] : in_pos < len) {
374 if (ch >= 0xDC00 && ch <= 0xDFFF) {
377 /* invalid surrogate pair */
379 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
381 *items_read = in_pos;
383 } /* otherwise just ignore. */
387 /* fast path optimization */
389 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
390 if (str [in_pos] < 0x80)
397 else if (ch < 0x0800)
399 else if (ch >= 0xD800 && ch <= 0xDBFF)
401 else if (ch >= 0xDC00 && ch <= 0xDFFF) {
402 /* invalid surrogate pair */
404 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
406 *items_read = in_pos;
408 } /* otherwise just ignore. */
417 *items_read = in_pos;
422 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
426 const gunichar *lstr = str;
431 while (*lstr != '\0' && len--) {
434 if (ch <= 0x0000FFFF) {
435 if (ch >= 0xD800 && ch <= 0xDFFF) {
436 errindex = (glong)(lstr - str)-1;
438 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
439 "Invalid sequence in conversion input");
441 *items_read = errindex;
446 } else if (ch > 0x10FFFF) {
447 errindex = (glong)(lstr - str)-1;
449 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
450 "Character out of range for UTF-16");
452 *items_read = errindex;
461 *items_read = (glong)(lstr - str);
466 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
469 gunichar2 *retstr = 0;
470 gunichar2 *retch = 0;
474 allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
477 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
478 retstr[allocsz] = '\0';
480 while (*str != '\0' && len--) {
483 if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
484 *retch++ = (gunichar2)ch;
488 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
489 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
496 *items_written = nwritten;
504 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
508 const gunichar2 *lstr = str;
514 while (*lstr != '\0' && len--) {
516 if (ch >= 0xD800 && ch <= 0xDBFF) {
522 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
525 errindex = (glong)(lstr - str);
527 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
528 "Invalid sequence in conversion input");
530 *items_read = errindex;
534 if (ch >= 0xDC00 && ch <= 0xDFFF) {
535 errindex = (glong)(lstr - str)-1;
537 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
538 "Invalid sequence in conversion input");
540 *items_read = errindex;
548 *items_read = (glong)(lstr - str);
554 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
557 gunichar *retstr = 0;
563 allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
566 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
567 retstr[allocsz] = '\0';
570 while (*str != '\0' && allocsz--) {
572 if (ch >= 0xD800 && ch <= 0xDBFF) {
574 ch = ((ch - (gunichar)0xD800) << 10)
575 + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
582 *items_written = nwritten;