Fix broken g_int_hash/g_int_equal semantics
[mono.git] / eglib / src / gutf8.c
1 /*
2  * gutf8.c: UTF-8 conversion
3  *
4  * Author:
5  *   Atsushi Enomoto  <atsushi@ximian.com>
6  *
7  * (C) 2006 Novell, Inc.
8  */
9
10 #include <stdio.h>
11 #include <glib.h>
12
13 gpointer error_quark = "ERROR";
14
15 static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
16 static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
17
18 gpointer
19 g_convert_error_quark ()
20 {
21         return error_quark;
22 }
23
24 static gunichar*
25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
26 {
27         glong i, u16len, u32len;
28         gunichar2 *u16str;
29         gunichar *u32str;
30         gchar *u8str;
31         GError **err = NULL;
32
33         u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
34         u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
35         for (i = 0; i < u32len; i++) {
36                 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
37         }
38         g_free (u16str);
39         u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
40         u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
41         g_free (u32str);
42         g_free (u16str);
43         return (gunichar*)u8str;
44 }
45
46 gchar*
47 g_utf8_strup (const gchar *str, gssize len)
48 {
49         return (gchar*)utf8_case_conv (str, len, TRUE);
50 }
51
52 gchar*
53 g_utf8_strdown (const gchar *str, gssize len)
54 {
55         return (gchar*)utf8_case_conv (str, len, FALSE);
56 }
57
58 static glong
59 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
60 {
61         /* It is almost identical to UTF8Encoding.GetCharCount() */
62         guchar ch, mb_size, mb_remain;
63         gboolean overlong;
64         guint32 codepoint;
65         glong in_pos, ret;
66
67         if (len < 0)
68                 len = (glong) strlen (str);
69
70         in_pos = 0;
71         ret = 0;
72
73         /* Common case */
74         for (in_pos = 0; in_pos < len && str [in_pos] < 0x80; in_pos++)
75                 ret ++;
76
77         if (in_pos == len) {
78                 if (items_read)
79                         *items_read = in_pos;
80                 return ret;
81         }
82
83         mb_size = 0;
84         mb_remain = 0;
85         overlong = 0;
86
87         for (; in_pos < len; in_pos++) {
88                 ch = str [in_pos];
89                 if (mb_size == 0) {
90                         if (ch < 0x80)
91                                 ret++;
92                         else if ((ch & 0xE0) == 0xC0) {
93                                 codepoint = ch & 0x1F;
94                                 mb_size = 2;
95                         } else if ((ch & 0xF0) == 0xE0) {
96                                 codepoint = ch & 0x0F;
97                                 mb_size = 3;
98                         } else if ((ch & 0xF8) == 0xF0) {
99                                 codepoint = ch & 7;
100                                 mb_size = 4;
101                         } else if ((ch & 0xFC) == 0xF8) {
102                                 codepoint = ch & 3;
103                                 mb_size = 5;
104                         } else if ((ch & 0xFE) == 0xFC) {
105                                 codepoint = ch & 3;
106                                 mb_size = 6;
107                         } else {
108                                 /* invalid utf-8 sequence */
109                                 if (error) {
110                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
111                                         if (items_read)
112                                                 *items_read = in_pos;
113                                         return -1;
114                                 } else {
115                                         codepoint = 0;
116                                         mb_remain = mb_size = 0;
117                                 }
118                         }
119                         if (mb_size > 1)
120                                 mb_remain = mb_size - 1;
121                 } else {
122                         if ((ch & 0xC0) == 0x80) {
123                                 codepoint = (codepoint << 6) | (ch & 0x3F);
124                                 if (--mb_remain == 0) {
125                                         /* multi byte character is fully consumed now. */
126                                         if (codepoint < 0x10000) {
127                                                 switch (mb_size) {
128                                                 case 2:
129                                                         overlong = codepoint < 0x7F;
130                                                         break;
131                                                 case 3:
132                                                         overlong = codepoint < 0x7FF;
133                                                         break;
134                                                 case 4:
135                                                         overlong = codepoint < 0xFFFF;
136                                                         break;
137                                                 case 5:
138                                                         overlong = codepoint < 0x1FFFFF;
139                                                         break;
140                                                 case 6:
141                                                         overlong = codepoint < 0x03FFFFFF;
142                                                         break;
143                                                 }
144                                                 if (overlong) {
145                                                         /* invalid utf-8 sequence (overlong) */
146                                                         if (error) {
147                                                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
148                                                                 if (items_read)
149                                                                         *items_read = in_pos;
150                                                                 return -1;
151                                                         } else {
152                                                                 codepoint = 0;
153                                                                 mb_remain = 0;
154                                                                 overlong = FALSE;
155                                                         }
156                                                 }
157                                                 else
158                                                         ret++;
159                                         } else if (codepoint < 0x110000) {
160                                                 /* surrogate pair */
161                                                 ret += 2;
162                                         } else {
163                                                 /* invalid utf-8 sequence (excess) */
164                                                 if (error) {
165                                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
166                                                         if (items_read)
167                                                                 *items_read = in_pos;
168                                                         return -1;
169                                                 } else {
170                                                         codepoint = 0;
171                                                         mb_remain = 0;
172                                                 }
173                                         }
174                                         mb_size = 0;
175                                 }
176                         } else {
177                                 /* invalid utf-8 sequence */
178                                 if (error) {
179                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
180                                         if (items_read)
181                                                 *items_read = in_pos;
182                                         return -1;
183                                 } else {
184                                         codepoint = 0;
185                                         mb_remain = mb_size = 0;
186                                 }
187                         }
188                 }
189         }
190
191         if (items_read)
192                 *items_read = in_pos;
193         return ret;
194 }
195
196 gunichar2*
197 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
198 {
199         /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
200            but error check is always done at utf8_to_utf16_len() so that
201            the conversion core below simply resets erroreous bits */
202         glong utf16_len;
203         gunichar2 *ret;
204         guchar ch, mb_size, mb_remain;
205         guint32 codepoint;
206         glong in_pos, out_pos;
207
208         utf16_len = 0;
209         mb_size = 0;
210         mb_remain = 0;
211         in_pos = 0;
212         out_pos = 0;
213
214         if (error)
215                 *error = NULL;
216
217         if (len < 0)
218                 len = (glong) strlen (str);
219
220         if (items_read)
221                 *items_read = 0;
222         if (items_written)
223                 *items_written = 0;
224         utf16_len = utf8_to_utf16_len (str, len, items_read, error);
225         if (error)
226                 if (*error)
227                         return NULL;
228         if (utf16_len < 0)
229                 return NULL;
230
231         ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
232
233         /* Common case */
234         for (in_pos = 0; in_pos < len; in_pos++) {
235                 ch = (guchar) str [in_pos];
236
237                 if (ch >= 0x80)
238                         break;
239                 ret [out_pos++] = ch;
240         }
241
242         for (; in_pos < len; in_pos++) {
243                 ch = (guchar) str [in_pos];
244                 if (mb_size == 0) {
245                         if (ch < 0x80)
246                                 ret [out_pos++] = ch;
247                         else if ((ch & 0xE0) == 0xC0) {
248                                 codepoint = ch & 0x1F;
249                                 mb_size = 2;
250                         } else if ((ch & 0xF0) == 0xE0) {
251                                 codepoint = ch & 0x0F;
252                                 mb_size = 3;
253                         } else if ((ch & 0xF8) == 0xF0) {
254                                 codepoint = ch & 7;
255                                 mb_size = 4;
256                         } else if ((ch & 0xFC) == 0xF8) {
257                                 codepoint = ch & 3;
258                                 mb_size = 5;
259                         } else if ((ch & 0xFE) == 0xFC) {
260                                 codepoint = ch & 3;
261                                 mb_size = 6;
262                         } else {
263                                 /* invalid utf-8 sequence */
264                                 codepoint = 0;
265                                 mb_remain = mb_size = 0;
266                         }
267                         if (mb_size > 1)
268                                 mb_remain = mb_size - 1;
269                 } else {
270                         if ((ch & 0xC0) == 0x80) {
271                                 codepoint = (codepoint << 6) | (ch & 0x3F);
272                                 if (--mb_remain == 0) {
273                                         /* multi byte character is fully consumed now. */
274                                         if (codepoint < 0x10000) {
275                                                 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
276                                         } else if (codepoint < 0x110000) {
277                                                 /* surrogate pair */
278                                                 codepoint -= 0x10000;
279                                                 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
280                                                 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
281                                         } else {
282                                                 /* invalid utf-8 sequence (excess) */
283                                                 codepoint = 0;
284                                                 mb_remain = 0;
285                                         }
286                                         mb_size = 0;
287                                 }
288                         } else {
289                                 /* invalid utf-8 sequence */
290                                 codepoint = 0;
291                                 mb_remain = mb_size = 0;
292                         }
293                 }
294         }
295
296         ret [out_pos] = 0;
297         if (items_written)
298                 *items_written = out_pos;
299         return ret;
300 }
301
302 gchar*
303 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
304 {
305         /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
306            but error check is always done at utf16_to_utf8_len() so that
307            the conversion core below simply resets erroreous bits */
308         glong utf8_len;
309         gchar *ret;
310         glong in_pos, out_pos;
311         gunichar2 ch;
312         guint32 codepoint = 0;
313         gboolean surrogate;
314
315         in_pos = 0;
316         out_pos = 0;
317         surrogate = FALSE;
318
319         if (items_read)
320                 *items_read = 0;
321         if (items_written)
322                 *items_written = 0;
323         utf8_len = utf16_to_utf8_len (str, len, items_read, error);
324         if (error)
325                 if (*error)
326                         return NULL;
327         if (utf8_len < 0)
328                 return NULL;
329
330         ret = g_malloc ((1+utf8_len) * sizeof (gchar));
331
332         while (len < 0 ? str [in_pos] : in_pos < len) {
333                 ch = str [in_pos];
334                 if (surrogate) {
335                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
336                                 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
337                                 surrogate = 0;
338                         } else {
339                                 surrogate = 0;
340                                 /* invalid surrogate pair */
341                                 continue;
342                         }
343                 } else {
344                         /* fast path optimization */
345                         if (ch < 0x80) {
346                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
347                                         if (str [in_pos] < 0x80)
348                                                 ret [out_pos++] = (gchar)(str [in_pos]);
349                                         else
350                                                 break;
351                                 }
352                                 continue;
353                         }
354                         else if (ch >= 0xD800 && ch <= 0xDBFF)
355                                 surrogate = ch;
356                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
357                                 /* invalid surrogate pair */
358                                 continue;
359                         }
360                         else
361                                 codepoint = ch;
362                 }
363                 in_pos++;
364
365                 if (surrogate != 0)
366                         continue;
367                 if (codepoint < 0x80)
368                         ret [out_pos++] = (gchar) codepoint;
369                 else if (codepoint < 0x0800) {
370                         ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
371                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
372                 } else if (codepoint < 0x10000) {
373                         ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
374                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
375                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
376                 } else {
377                         ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
378                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
379                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
380                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
381                 }
382         }
383         ret [out_pos] = 0;
384
385         if (items_written)
386                 *items_written = out_pos;
387         return ret;
388 }
389
390 static glong
391 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
392 {
393         glong ret, in_pos;
394         gunichar2 ch;
395         gboolean surrogate;
396
397         ret = 0;
398         in_pos = 0;
399         surrogate = FALSE;
400
401         while (len < 0 ? str [in_pos] : in_pos < len) {
402                 ch = str [in_pos];
403                 if (surrogate) {
404                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
405                                 ret += 4;
406                         } else {
407                                 /* invalid surrogate pair */
408                                 if (error) {
409                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
410                                         if (items_read)
411                                                 *items_read = in_pos;
412                                         return -1;
413                                 } /* otherwise just ignore. */
414                         }
415                         surrogate = FALSE;
416                 } else {
417                         /* fast path optimization */
418                         if (ch < 0x80) {
419                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
420                                         if (str [in_pos] < 0x80)
421                                                 ++ret;
422                                         else
423                                                 break;
424                                 }
425                                 continue;
426                         }
427                         else if (ch < 0x0800)
428                                 ret += 2;
429                         else if (ch >= 0xD800 && ch <= 0xDBFF)
430                                 surrogate = TRUE;
431                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
432                                 /* invalid surrogate pair */
433                                 if (error) {
434                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
435                                         if (items_read)
436                                                 *items_read = in_pos;
437                                         return -1;
438                                 } /* otherwise just ignore. */
439                         }
440                         else
441                                 ret += 3;
442                 }
443                 in_pos++;
444         }
445
446         if (items_read)
447                 *items_read = in_pos;
448         return ret;
449 }
450
451 static glong
452 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
453 {
454         glong retlen = 0;
455         glong errindex = 0;
456         const gunichar *lstr = str;
457
458         if (!str)
459                 return 0;
460
461         while (*lstr != '\0' && len--) {
462                 gunichar ch;
463                 ch = *lstr++;
464                 if (ch <= 0x0000FFFF) { 
465                         if (ch >= 0xD800 && ch <= 0xDFFF) {
466                                 errindex = (glong)(lstr - str)-1;
467                                 if (error)
468                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
469                                         "Invalid sequence in conversion input");
470                                 if (items_read)
471                                         *items_read = errindex;
472                                 return 0;
473                         } else {
474                                 retlen++;
475                         }
476                 } else if (ch > 0x10FFFF) {
477                         errindex = (glong)(lstr - str)-1;
478                         if (error)
479                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
480                                 "Character out of range for UTF-16");
481                         if (items_read)
482                                 *items_read = errindex;
483                         return 0;
484
485                 } else {
486                         retlen+=2;
487                 }
488         }
489
490         if (items_read)
491                 *items_read = (glong)(lstr - str);
492         return retlen;
493 }
494
495 gunichar2*
496 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
497 {
498         glong allocsz;
499         gunichar2 *retstr = 0;
500         gunichar2 *retch = 0;
501         glong nwritten = 0;
502         GError *lerror =0 ;
503
504         allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
505
506         if (!lerror) {
507                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
508                 retstr[allocsz] = '\0';
509
510                 while (*str != '\0' && len--) {
511                         gunichar ch;
512                         ch = *str++;
513                         if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
514                                 *retch++ = (gunichar2)ch;
515                                 nwritten ++;
516                         } else {
517                                 ch -= 0x0010000UL;
518                                 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
519                                 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
520                                 nwritten +=2;
521                         }
522                 }
523         }
524
525         if (items_written)
526                 *items_written = nwritten;
527         if (error)
528                 *error = lerror;
529
530         return retstr;
531 }
532
533 static glong
534 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
535 {
536         glong retlen = 0;
537         glong errindex = 0;
538         const gunichar2 *lstr = str;
539         gunichar2 ch,ch2;
540
541         if (!str)
542                 return 0;
543
544         while (*lstr != '\0' && len--) {
545                 ch = *lstr++;
546                 if (ch >= 0xD800 && ch <= 0xDBFF) {
547                         if (!len--) {
548                                 lstr--;
549                                 break;
550                         }
551                         ch2 = *lstr;
552                         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
553                                 lstr++;
554                         } else {
555                                 errindex = (glong)(lstr - str);
556                                 if (error)
557                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
558                                         "Invalid sequence in conversion input");
559                                 if (items_read)
560                                         *items_read = errindex;
561                                 return 0;
562                         }
563                 } else {
564                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
565                                 errindex = (glong)(lstr - str)-1;
566                                 if (error)
567                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
568                                         "Invalid sequence in conversion input");
569                                 if (items_read)
570                                         *items_read = errindex;
571                                 return 0;
572                         }
573                 }
574                 retlen++;
575         }
576
577         if (items_read)
578                 *items_read = (glong)(lstr - str);
579
580         return retlen;
581 }
582
583 gunichar*
584 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
585 {
586         glong allocsz;
587         gunichar *retstr = 0;
588         gunichar *retch = 0;
589         glong nwritten = 0;
590         GError *lerror =0 ;
591         gunichar ch,ch2;
592
593         allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
594
595         if (!lerror) {
596                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
597                 retstr[allocsz] = '\0';
598                 nwritten = allocsz;
599
600                 while (*str != '\0' && allocsz--) {
601                         ch = *str++;
602                         if (ch >= 0xD800 && ch <= 0xDBFF) {
603                                 ch2 = *str++;
604                                 ch = ((ch - (gunichar)0xD800) << 10)
605                                       + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
606                         }
607                         *retch++ = ch;
608                 }
609         }
610
611         if (items_written)
612                 *items_written = nwritten;
613         if (error)
614                 *error = lerror;
615
616         return retstr;
617 }