2 * gutf8.c: UTF-8 conversion
5 * Atsushi Enomoto <atsushi@ximian.com>
7 * (C) 2006 Novell, Inc.
/* NOTE(review): pointer to the literal "ERROR"; presumably the value handed out by
   g_convert_error_quark (), whose body is not visible in this view - confirm. */
13 gpointer error_quark = "ERROR";
/* Forward declarations of the two length/validation helpers; utf16_to_utf8_len ()
   is called by g_utf16_to_utf8 () before its own definition appears. */
15 static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
16 static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
/* NOTE(review): only the name line of this function is visible in this view.
   Presumably it exposes error_quark as the G_CONVERT_ERROR domain used by the
   g_set_error () calls in this file - confirm against the full source. */
19 g_convert_error_quark ()
/*
 * utf8_case_conv:
 * @str: UTF-8 input
 * @len: length handed to g_utf8_to_utf16 () after a cast to glong
 * @upper: TRUE selects g_unichar_toupper (), FALSE selects g_unichar_tolower ()
 *
 * Case-maps a UTF-8 string by round-tripping it UTF-8 -> UTF-16 -> UCS-4,
 * mapping every UCS-4 element, and converting back UCS-4 -> UTF-16 -> UTF-8.
 * The UTF-8 result is returned through a (gunichar*) cast; g_utf8_strup () and
 * g_utf8_strdown () cast it back to (gchar*).
 */
25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
27 glong i, u16len, u32len;
/* Decode in two hops; the element counts come back in u16len and u32len. */
33 u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
34 u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
35 for (i = 0; i < u32len; i++) {
/* Map each UCS-4 element in place. */
36 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
/* Re-encode; u16str and u16len are reused for the mapped text. */
39 u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
40 u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
/* NOTE(review): the value is a UTF-8 buffer even though it is cast to (gunichar*);
   releasing of the intermediate buffers is not visible in this view - confirm. */
43 return (gunichar*)u8str;
/*
 * g_utf8_strup:
 * @str: UTF-8 input
 * @len: length forwarded unchanged to utf8_case_conv ()
 *
 * Upper-cases @str by calling utf8_case_conv () with upper = TRUE and casting
 * its result to (gchar*).
 */
47 g_utf8_strup (const gchar *str, gssize len)
49 return (gchar*)utf8_case_conv (str, len, TRUE);
/*
 * g_utf8_strdown:
 * @str: UTF-8 input
 * @len: length forwarded unchanged to utf8_case_conv ()
 *
 * Lower-cases @str by calling utf8_case_conv () with upper = FALSE and casting
 * its result to (gchar*).
 */
53 g_utf8_strdown (const gchar *str, gssize len)
55 return (gchar*)utf8_case_conv (str, len, FALSE);
/*
 * utf8_to_utf16_len:
 * @str: UTF-8 input bytes
 * @len: number of bytes to examine; replaced by strlen (str) on a path whose
 *       guard is not visible in this view (presumably len < 0)
 * @items_read: receives the input position reached (in_pos), both when an
 *              error is reported and at the end of the scan
 * @error: receives G_CONVERT_ERROR_ILLEGAL_SEQUENCE for an illegal first byte,
 *         an overlong form, a code point of 0x110000 or above, or an illegal
 *         continuation byte
 *
 * Validation and sizing pass used by g_utf8_to_utf16 (), which allocates
 * 1 + (the returned value) gunichar2 elements.  The return statements are not
 * visible in this view.
 *
 * Fixes relative to the previous revision:
 *  - the fast path compares the byte as guchar, so bytes >= 0x80 end the fast
 *    path even where gchar is signed;
 *  - the overlong limits are the smallest code point that needs each length
 *    (0x80, 0x800, 0x10000, 0x200000, 0x4000000) instead of one less;
 *  - in_pos is printed with "%ld" and an explicit (long) cast.
 */
59 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
61 /* It is almost identical to UTF8Encoding.GetCharCount() */
62 guchar ch, mb_size, mb_remain;
68 len = (glong) strlen (str);
/* Fast path over the leading single-byte run.  The cast matters: a signed
   gchar is never >= 0x80, which would let this loop swallow the whole buffer
   and skip validation entirely. */
74 for (in_pos = 0; in_pos < len && (guchar) str [in_pos] < 0x80; in_pos++)
87 for (; in_pos < len; in_pos++) {
/* Lead-byte classification: 110xxxxx, 1110xxxx, 11110xxx, 111110xx, 1111110x. */
92 else if ((ch & 0xE0) == 0xC0) {
93 codepoint = ch & 0x1F;
95 } else if ((ch & 0xF0) == 0xE0) {
96 codepoint = ch & 0x0F;
98 } else if ((ch & 0xF8) == 0xF0) {
101 } else if ((ch & 0xFC) == 0xF8) {
104 } else if ((ch & 0xFE) == 0xFC) {
108 /* invalid utf-8 sequence */
110 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %ld (illegal first byte)", (long) in_pos);
112 *items_read = in_pos;
116 mb_remain = mb_size = 0;
/* Continuation bytes still expected after the lead byte. */
120 mb_remain = mb_size - 1;
/* Continuation byte 10xxxxxx: append its six payload bits. */
122 if ((ch & 0xC0) == 0x80) {
123 codepoint = (codepoint << 6) | (ch & 0x3F);
124 if (--mb_remain == 0) {
125 /* multi byte character is fully consumed now. */
126 if (codepoint < 0x10000) {
/* A form is overlong when the code point would have fit in a shorter
   sequence.  The limits below are in 2-, 3-, 4-, 5-, 6-byte order, matching
   the lead-byte tests above (the selecting switch is not visible in this
   view). */
129 overlong = codepoint < 0x80;
132 overlong = codepoint < 0x800;
135 overlong = codepoint < 0x10000;
138 overlong = codepoint < 0x200000;
141 overlong = codepoint < 0x04000000;
145 /* invalid utf-8 sequence (overlong) */
147 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %ld (overlong)", (long) in_pos);
149 *items_read = in_pos;
/* 0x10000..0x10FFFF: the range g_utf8_to_utf16 () emits as a surrogate pair. */
159 } else if (codepoint < 0x110000) {
163 /* invalid utf-8 sequence (excess) */
165 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %ld (codepoint range excess)", (long) in_pos);
167 *items_read = in_pos;
177 /* invalid utf-8 sequence */
179 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %ld (illegal following bytes)", (long) in_pos);
181 *items_read = in_pos;
185 mb_remain = mb_size = 0;
192 *items_read = in_pos;
/*
 * g_utf8_to_utf16:
 * @str: UTF-8 input bytes
 * @len: number of bytes to convert; replaced by strlen (str) on a path whose
 *       guard is not visible in this view (presumably len < 0)
 * @items_read: forwarded to utf8_to_utf16_len ()
 * @items_written: receives out_pos, the number of gunichar2 elements stored
 * @error: forwarded to utf8_to_utf16_len ()
 *
 * Converts UTF-8 to UTF-16 in a buffer obtained from g_malloc ().  Sizing and
 * validation are delegated to utf8_to_utf16_len (); the loops below only
 * decode.  The return statements are not visible in this view.
 */
197 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
199 /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
200 but error check is always done at utf8_to_utf16_len() so that
201 the conversion core below simply resets erroneous bits */
204 guchar ch, mb_size, mb_remain;
206 glong in_pos, out_pos;
218 len = (glong) strlen (str);
/* Validation and sizing pass. */
224 utf16_len = utf8_to_utf16_len (str, len, items_read, error);
/* One element more than the counted length - presumably room for a terminator. */
231 ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
/* First loop: bytes are read as guchar and stored as single UTF-16 units
   (its exit condition is not visible in this view). */
234 for (in_pos = 0; in_pos < len; in_pos++) {
235 ch = (guchar) str [in_pos];
239 ret [out_pos++] = ch;
/* Second loop: general decoder, continuing from where the first loop stopped. */
242 for (; in_pos < len; in_pos++) {
243 ch = (guchar) str [in_pos];
246 ret [out_pos++] = ch;
/* Lead-byte classification: 110xxxxx, 1110xxxx, 11110xxx, 111110xx, 1111110x. */
247 else if ((ch & 0xE0) == 0xC0) {
248 codepoint = ch & 0x1F;
250 } else if ((ch & 0xF0) == 0xE0) {
251 codepoint = ch & 0x0F;
253 } else if ((ch & 0xF8) == 0xF0) {
256 } else if ((ch & 0xFC) == 0xF8) {
259 } else if ((ch & 0xFE) == 0xFC) {
263 /* invalid utf-8 sequence */
265 mb_remain = mb_size = 0;
/* Continuation bytes still expected after the lead byte. */
268 mb_remain = mb_size - 1;
/* Continuation byte 10xxxxxx: append its six payload bits. */
270 if ((ch & 0xC0) == 0x80) {
271 codepoint = (codepoint << 6) | (ch & 0x3F);
272 if (--mb_remain == 0) {
273 /* multi byte character is fully consumed now. */
/* Below 0x10000 the code point is stored as one UTF-16 unit. */
274 if (codepoint < 0x10000) {
275 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
276 } else if (codepoint < 0x110000) {
/* Surrogate pair: subtract 0x10000, top 10 bits go on 0xD800, low 10 bits on 0xDC00. */
278 codepoint -= 0x10000;
279 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
280 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
282 /* invalid utf-8 sequence (excess) */
289 /* invalid utf-8 sequence */
291 mb_remain = mb_size = 0;
298 *items_written = out_pos;
/*
 * g_utf16_to_utf8:
 * @str: UTF-16 input
 * @len: number of gunichar2 elements; when negative the loops stop at the
 *       first zero element instead
 * @items_read: forwarded to utf16_to_utf8_len ()
 * @items_written: receives out_pos, the number of bytes stored
 * @error: forwarded to utf16_to_utf8_len ()
 *
 * Converts UTF-16 to UTF-8 in a buffer obtained from g_malloc ().  Sizing and
 * validation are delegated to utf16_to_utf8_len ().  The return statements are
 * not visible in this view.
 */
303 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
305 /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
306 but error check is always done at utf16_to_utf8_len() so that
307 the conversion core below simply resets erroneous bits */
310 glong in_pos, out_pos;
312 guint32 codepoint = 0;
/* Validation and sizing pass. */
323 utf8_len = utf16_to_utf8_len (str, len, items_read, error);
/* One byte more than the counted length - presumably room for a terminator. */
330 ret = g_malloc ((1+utf8_len) * sizeof (gchar));
332 while (len < 0 ? str [in_pos] : in_pos < len) {
/* A unit in 0xDC00..0xDFFF is combined with the saved lead unit (surrogate)
   into a code point of 0x10000 or above. */
335 if (ch >= 0xDC00 && ch <= 0xDFFF) {
336 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
340 /* invalid surrogate pair */
344 /* fast path optimization */
/* Units below 0x80 are copied straight through as single bytes. */
346 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
347 if (str [in_pos] < 0x80)
348 ret [out_pos++] = (gchar)(str [in_pos]);
/* 0xD800..0xDBFF is the lead-surrogate range; 0xDC00..0xDFFF seen here has no lead. */
354 else if (ch >= 0xD800 && ch <= 0xDBFF)
356 else if (ch >= 0xDC00 && ch <= 0xDFFF) {
357 /* invalid surrogate pair */
/* Encode codepoint as 1, 2, 3 or 4 UTF-8 bytes depending on its magnitude. */
367 if (codepoint < 0x80)
368 ret [out_pos++] = (gchar) codepoint;
369 else if (codepoint < 0x0800) {
370 ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
371 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
372 } else if (codepoint < 0x10000) {
373 ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
374 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
375 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
377 ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
378 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
379 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
380 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
386 *items_written = out_pos;
/*
 * utf16_to_utf8_len:
 * @str: UTF-16 input
 * @len: number of gunichar2 elements; when negative the loops stop at the
 *       first zero element instead
 * @items_read: receives the input position reached (in_pos), both when an
 *              error is reported and at the end of the scan
 * @error: receives G_CONVERT_ERROR_ILLEGAL_SEQUENCE for a lead surrogate with
 *         no tail, or a tail surrogate with no lead
 *
 * Validation and sizing pass used by g_utf16_to_utf8 (), which allocates
 * 1 + (the returned value) bytes.  The return statements are not visible in
 * this view.
 *
 * Fix relative to the previous revision: in_pos is printed with "%ld" and an
 * explicit (long) cast instead of "%d", so the format matches the argument.
 */
391 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
401 while (len < 0 ? str [in_pos] : in_pos < len) {
/* 0xDC00..0xDFFF is the tail-surrogate range that completes a pending lead. */
404 if (ch >= 0xDC00 && ch <= 0xDFFF) {
407 /* invalid surrogate pair */
409 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %ld (missing surrogate tail)", (long) in_pos);
411 *items_read = in_pos;
413 } /* otherwise just ignore. */
417 /* fast path optimization */
419 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
420 if (str [in_pos] < 0x80)
/* Size classes: below 0x0800, lead surrogate 0xD800..0xDBFF, stray tail surrogate. */
427 else if (ch < 0x0800)
429 else if (ch >= 0xD800 && ch <= 0xDBFF)
431 else if (ch >= 0xDC00 && ch <= 0xDFFF) {
432 /* invalid surrogate pair */
434 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %ld (missing surrogate head)", (long) in_pos);
436 *items_read = in_pos;
438 } /* otherwise just ignore. */
447 *items_read = in_pos;
/*
 * g_ucs4_to_utf16_len:
 * @str: UCS-4 input
 * @len: element budget; the loop also stops at the first zero element
 * @items_read: receives the error index on failure, otherwise the number of
 *              elements walked (lstr - str)
 * @error: receives G_CONVERT_ERROR_ILLEGAL_SEQUENCE for a value in
 *         0xD800..0xDFFF or above 0x10FFFF
 *
 * Validation and sizing pass used by g_ucs4_to_utf16 ().  The return
 * statements are not visible in this view.
 */
452 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
456 const gunichar *lstr = str;
/* NOTE(review): *lstr is evaluated before len--, so once the budget is used up
   the element after the last counted one is still read - confirm callers
   always pass a zero-terminated buffer. */
461 while (*lstr != '\0' && len--) {
464 if (ch <= 0x0000FFFF) {
/* Surrogate values are not valid UCS-4 input. */
465 if (ch >= 0xD800 && ch <= 0xDFFF) {
/* The -1 presumably compensates for lstr having been advanced already - confirm. */
466 errindex = (glong)(lstr - str)-1;
468 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
469 "Invalid sequence in conversion input");
471 *items_read = errindex;
/* Values above 0x10FFFF cannot be represented in UTF-16. */
476 } else if (ch > 0x10FFFF) {
477 errindex = (glong)(lstr - str)-1;
479 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
480 "Character out of range for UTF-16");
482 *items_read = errindex;
491 *items_read = (glong)(lstr - str);
/*
 * g_ucs4_to_utf16:
 * @str: UCS-4 input
 * @len: element budget; the loop also stops at the first zero element
 * @items_read: forwarded to g_ucs4_to_utf16_len ()
 * @items_written: receives nwritten
 * @error: caller's error slot; the sizing pass reports into a local lerror
 *         (how lerror reaches @error is not visible in this view)
 *
 * Converts UCS-4 to UTF-16 in a buffer obtained from g_malloc ().  The return
 * statements are not visible in this view.
 */
496 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
499 gunichar2 *retstr = 0;
500 gunichar2 *retch = 0;
/* Validation and sizing pass. */
504 allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
/* retch is the write cursor; retstr keeps the start of the buffer. */
507 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
/* Terminator written up front at the counted length. */
508 retstr[allocsz] = '\0';
/* NOTE(review): *str is evaluated before len--, so the element after the last
   counted one is still read - confirm callers pass a zero-terminated buffer. */
510 while (*str != '\0' && len--) {
/* Non-surrogate values up to 0xFFFF are stored as a single unit. */
513 if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
514 *retch++ = (gunichar2)ch;
/* Otherwise a surrogate pair is written: lead unit first, then tail unit. */
518 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
519 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
526 *items_written = nwritten;
/*
 * g_utf16_to_ucs4_len:
 * @str: UTF-16 input
 * @len: element budget; the loop also stops at the first zero element
 * @items_read: receives the error index on failure, otherwise the number of
 *              elements walked (lstr - str)
 * @error: receives G_CONVERT_ERROR_ILLEGAL_SEQUENCE for a broken surrogate
 *         sequence
 *
 * Validation and sizing pass used by g_utf16_to_ucs4 ().  The return
 * statements are not visible in this view.
 */
534 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
538 const gunichar2 *lstr = str;
/* NOTE(review): *lstr is evaluated before len--, so once the budget is used up
   the element after the last counted one is still read - confirm callers
   always pass a zero-terminated buffer. */
544 while (*lstr != '\0' && len--) {
/* Lead surrogate: the next unit (ch2) is tested against the tail range. */
546 if (ch >= 0xD800 && ch <= 0xDBFF) {
552 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
555 errindex = (glong)(lstr - str);
557 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
558 "Invalid sequence in conversion input");
560 *items_read = errindex;
/* A tail-range unit reaching this test is reported as an invalid sequence. */
564 if (ch >= 0xDC00 && ch <= 0xDFFF) {
/* The -1 presumably compensates for lstr having been advanced already - confirm. */
565 errindex = (glong)(lstr - str)-1;
567 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
568 "Invalid sequence in conversion input");
570 *items_read = errindex;
578 *items_read = (glong)(lstr - str);
/*
 * g_utf16_to_ucs4:
 * @str: UTF-16 input
 * @len: forwarded to g_utf16_to_ucs4_len ()
 * @items_read: forwarded to g_utf16_to_ucs4_len ()
 * @items_written: receives nwritten
 * @error: caller's error slot; the sizing pass reports into a local lerror
 *         (how lerror reaches @error is not visible in this view)
 *
 * Converts UTF-16 to UCS-4 in a buffer obtained from g_malloc ().  The return
 * statements are not visible in this view.
 */
584 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
587 gunichar *retstr = 0;
/* Validation and sizing pass. */
593 allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
/* retch is the write cursor; retstr keeps the start of the buffer. */
596 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
/* Terminator written up front at the counted length. */
597 retstr[allocsz] = '\0';
/* The loop is bounded by allocsz (the counted output length), not by len.
   NOTE(review): *str is evaluated before allocsz--, so one further input
   element is read after the last output - confirm the input is
   zero-terminated. */
600 while (*str != '\0' && allocsz--) {
/* Lead surrogate: combine with the following unit (ch2) into one code point. */
602 if (ch >= 0xD800 && ch <= 0xDBFF) {
604 ch = ((ch - (gunichar)0xD800) << 10)
605 + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
612 *items_written = nwritten;