Normalize line endings.
[mono.git] / eglib / src / gutf8.c
1 /*
2  * gutf8.c: UTF-8 conversion
3  *
4  * Author:
5  *   Atsushi Enomoto  <atsushi@ximian.com>
6  *
7  * (C) 2006 Novell, Inc.
8  */
9
10 #include <stdio.h>
11 #include <glib.h>
12
13 gpointer error_quark = "ERROR";
14
15 static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
16 static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
17
18 gpointer
19 g_convert_error_quark ()
20 {
21         return error_quark;
22 }
23
24 static gunichar*
25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
26 {
27         glong i, u16len, u32len;
28         gunichar2 *u16str;
29         gunichar *u32str;
30         gchar *u8str;
31         GError **err = NULL;
32
33         u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
34         u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
35         for (i = 0; i < u32len; i++) {
36                 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
37         }
38         g_free (u16str);
39         u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
40         u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
41         g_free (u32str);
42         g_free (u16str);
43         return (gunichar*)u8str;
44 }
45
46 gchar*
47 g_utf8_strup (const gchar *str, gssize len)
48 {
49         return (gchar*)utf8_case_conv (str, len, TRUE);
50 }
51
52 gchar*
53 g_utf8_strdown (const gchar *str, gssize len)
54 {
55         return (gchar*)utf8_case_conv (str, len, FALSE);
56 }
57
58 gunichar2*
59 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
60 {
61         /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
62            but error check is always done at utf8_to_utf16_len() so that
63            the conversion core below simply resets erroreous bits */
64         glong utf16_len;
65         gunichar2 *ret;
66         guchar ch, mb_size, mb_remain;
67         guint32 codepoint;
68         glong in_pos, out_pos;
69
70         utf16_len = 0;
71         mb_size = 0;
72         mb_remain = 0;
73         in_pos = 0;
74         out_pos = 0;
75
76         if (error)
77                 *error = NULL;
78
79         if (items_written)
80                 *items_written = 0;
81         utf16_len = utf8_to_utf16_len (str, len, items_read, error);
82         if (error)
83                 if (*error)
84                         return NULL;
85         if (utf16_len < 0)
86                 return NULL;
87
88         ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
89
90         for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
91                 ch = (guchar) str [in_pos];
92                 if (mb_size == 0) {
93                         if (ch < 0x80)
94                                 ret [out_pos++] = ch;
95                         else if ((ch & 0xE0) == 0xC0) {
96                                 codepoint = ch & 0x1F;
97                                 mb_size = 2;
98                         } else if ((ch & 0xF0) == 0xE0) {
99                                 codepoint = ch & 0x0F;
100                                 mb_size = 3;
101                         } else if ((ch & 0xF8) == 0xF0) {
102                                 codepoint = ch & 7;
103                                 mb_size = 4;
104                         } else if ((ch & 0xFC) == 0xF8) {
105                                 codepoint = ch & 3;
106                                 mb_size = 5;
107                         } else if ((ch & 0xFE) == 0xFC) {
108                                 codepoint = ch & 3;
109                                 mb_size = 6;
110                         } else {
111                                 /* invalid utf-8 sequence */
112                                 codepoint = 0;
113                                 mb_remain = mb_size = 0;
114                         }
115                         if (mb_size > 1)
116                                 mb_remain = mb_size - 1;
117                 } else {
118                         if ((ch & 0xC0) == 0x80) {
119                                 codepoint = (codepoint << 6) | (ch & 0x3F);
120                                 if (--mb_remain == 0) {
121                                         /* multi byte character is fully consumed now. */
122                                         if (codepoint < 0x10000) {
123                                                 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
124                                         } else if (codepoint < 0x110000) {
125                                                 /* surrogate pair */
126                                                 codepoint -= 0x10000;
127                                                 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
128                                                 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
129                                         } else {
130                                                 /* invalid utf-8 sequence (excess) */
131                                                 codepoint = 0;
132                                                 mb_remain = 0;
133                                         }
134                                         mb_size = 0;
135                                 }
136                         } else {
137                                 /* invalid utf-8 sequence */
138                                 codepoint = 0;
139                                 mb_remain = mb_size = 0;
140                         }
141                 }
142         }
143
144         ret [out_pos] = 0;
145         if (items_written)
146                 *items_written = out_pos;
147         return ret;
148 }
149
150 static glong
151 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
152 {
153         /* It is almost identical to UTF8Encoding.GetCharCount() */
154         guchar ch, mb_size, mb_remain;
155         gboolean overlong;
156         guint32 codepoint;
157         glong in_pos, ret;
158
159         mb_size = 0;
160         mb_remain = 0;
161         overlong = 0;
162         in_pos = 0;
163         ret = 0;
164
165         for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
166                 ch = str [in_pos];
167                 if (mb_size == 0) {
168                         if (ch < 0x80)
169                                 ret++;
170                         else if ((ch & 0xE0) == 0xC0) {
171                                 codepoint = ch & 0x1F;
172                                 mb_size = 2;
173                         } else if ((ch & 0xF0) == 0xE0) {
174                                 codepoint = ch & 0x0F;
175                                 mb_size = 3;
176                         } else if ((ch & 0xF8) == 0xF0) {
177                                 codepoint = ch & 7;
178                                 mb_size = 4;
179                         } else if ((ch & 0xFC) == 0xF8) {
180                                 codepoint = ch & 3;
181                                 mb_size = 5;
182                         } else if ((ch & 0xFE) == 0xFC) {
183                                 codepoint = ch & 3;
184                                 mb_size = 6;
185                         } else {
186                                 /* invalid utf-8 sequence */
187                                 if (error) {
188                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
189                                         if (items_read)
190                                                 *items_read = in_pos;
191                                         return -1;
192                                 } else {
193                                         codepoint = 0;
194                                         mb_remain = mb_size = 0;
195                                 }
196                         }
197                         if (mb_size > 1)
198                                 mb_remain = mb_size - 1;
199                 } else {
200                         if ((ch & 0xC0) == 0x80) {
201                                 codepoint = (codepoint << 6) | (ch & 0x3F);
202                                 if (--mb_remain == 0) {
203                                         /* multi byte character is fully consumed now. */
204                                         if (codepoint < 0x10000) {
205                                                 switch (mb_size) {
206                                                 case 2:
207                                                         overlong = codepoint < 0x7F;
208                                                         break;
209                                                 case 3:
210                                                         overlong = codepoint < 0x7FF;
211                                                         break;
212                                                 case 4:
213                                                         overlong = codepoint < 0xFFFF;
214                                                         break;
215                                                 case 5:
216                                                         overlong = codepoint < 0x1FFFFF;
217                                                         break;
218                                                 case 6:
219                                                         overlong = codepoint < 0x03FFFFFF;
220                                                         break;
221                                                 }
222                                                 if (overlong) {
223                                                         /* invalid utf-8 sequence (overlong) */
224                                                         if (error) {
225                                                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
226                                                                 if (items_read)
227                                                                         *items_read = in_pos;
228                                                                 return -1;
229                                                         } else {
230                                                                 codepoint = 0;
231                                                                 mb_remain = 0;
232                                                                 overlong = FALSE;
233                                                         }
234                                                 }
235                                                 else
236                                                         ret++;
237                                         } else if (codepoint < 0x110000) {
238                                                 /* surrogate pair */
239                                                 ret += 2;
240                                         } else {
241                                                 /* invalid utf-8 sequence (excess) */
242                                                 if (error) {
243                                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
244                                                         if (items_read)
245                                                                 *items_read = in_pos;
246                                                         return -1;
247                                                 } else {
248                                                         codepoint = 0;
249                                                         mb_remain = 0;
250                                                 }
251                                         }
252                                         mb_size = 0;
253                                 }
254                         } else {
255                                 /* invalid utf-8 sequence */
256                                 if (error) {
257                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
258                                         if (items_read)
259                                                 *items_read = in_pos;
260                                         return -1;
261                                 } else {
262                                         codepoint = 0;
263                                         mb_remain = mb_size = 0;
264                                 }
265                         }
266                 }
267         }
268
269         if (items_read)
270                 *items_read = in_pos;
271         return ret;
272 }
273
274 gchar*
275 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
276 {
277         /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
278            but error check is always done at utf16_to_utf8_len() so that
279            the conversion core below simply resets erroreous bits */
280         glong utf8_len;
281         gchar *ret;
282         glong in_pos, out_pos;
283         gunichar2 ch;
284         guint32 codepoint = 0;
285         gboolean surrogate;
286
287         in_pos = 0;
288         out_pos = 0;
289         surrogate = FALSE;
290
291         if (items_written)
292                 *items_written = 0;
293         utf8_len = utf16_to_utf8_len (str, len, items_read, error);
294         if (error)
295                 if (*error)
296                         return NULL;
297         if (utf8_len < 0)
298                 return NULL;
299
300         ret = g_malloc ((1+utf8_len) * sizeof (gchar));
301
302         while (len < 0 ? str [in_pos] : in_pos < len) {
303                 ch = str [in_pos];
304                 if (surrogate) {
305                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
306                                 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
307                                 surrogate = 0;
308                         } else {
309                                 surrogate = 0;
310                                 /* invalid surrogate pair */
311                                 continue;
312                         }
313                 } else {
314                         /* fast path optimization */
315                         if (ch < 0x80) {
316                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
317                                         if (str [in_pos] < 0x80)
318                                                 ret [out_pos++] = (gchar)(str [in_pos]);
319                                         else
320                                                 break;
321                                 }
322                                 continue;
323                         }
324                         else if (ch >= 0xD800 && ch <= 0xDBFF)
325                                 surrogate = ch;
326                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
327                                 /* invalid surrogate pair */
328                                 continue;
329                         }
330                         else
331                                 codepoint = ch;
332                 }
333                 in_pos++;
334
335                 if (surrogate != 0)
336                         continue;
337                 if (codepoint < 0x80)
338                         ret [out_pos++] = (gchar) codepoint;
339                 else if (codepoint < 0x0800) {
340                         ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
341                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
342                 } else if (codepoint < 0x10000) {
343                         ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
344                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
345                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
346                 } else {
347                         ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
348                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
349                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
350                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
351                 }
352         }
353         ret [out_pos] = 0;
354
355         if (items_written)
356                 *items_written = out_pos;
357         return ret;
358 }
359
360 static glong
361 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
362 {
363         glong ret, in_pos;
364         gunichar2 ch;
365         gboolean surrogate;
366
367         ret = 0;
368         in_pos = 0;
369         surrogate = FALSE;
370
371         while (len < 0 ? str [in_pos] : in_pos < len) {
372                 ch = str [in_pos];
373                 if (surrogate) {
374                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
375                                 ret += 4;
376                         } else {
377                                 /* invalid surrogate pair */
378                                 if (error) {
379                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
380                                         if (items_read)
381                                                 *items_read = in_pos;
382                                         return -1;
383                                 } /* otherwise just ignore. */
384                         }
385                         surrogate = FALSE;
386                 } else {
387                         /* fast path optimization */
388                         if (ch < 0x80) {
389                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
390                                         if (str [in_pos] < 0x80)
391                                                 ++ret;
392                                         else
393                                                 break;
394                                 }
395                                 continue;
396                         }
397                         else if (ch < 0x0800)
398                                 ret += 2;
399                         else if (ch >= 0xD800 && ch <= 0xDBFF)
400                                 surrogate = TRUE;
401                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
402                                 /* invalid surrogate pair */
403                                 if (error) {
404                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
405                                         if (items_read)
406                                                 *items_read = in_pos;
407                                         return -1;
408                                 } /* otherwise just ignore. */
409                         }
410                         else
411                                 ret += 3;
412                 }
413                 in_pos++;
414         }
415
416         if (items_read)
417                 *items_read = in_pos;
418         return ret;
419 }
420
421 static glong
422 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
423 {
424         glong retlen = 0;
425         glong errindex = 0;
426         const gunichar *lstr = str;
427
428         if (!str)
429                 return 0;
430
431         while (*lstr != '\0' && len--) {
432                 gunichar ch;
433                 ch = *lstr++;
434                 if (ch <= 0x0000FFFF) { 
435                         if (ch >= 0xD800 && ch <= 0xDFFF) {
436                                 errindex = (glong)(lstr - str)-1;
437                                 if (error)
438                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
439                                         "Invalid sequence in conversion input");
440                                 if (items_read)
441                                         *items_read = errindex;
442                                 return 0;
443                         } else {
444                                 retlen++;
445                         }
446                 } else if (ch > 0x10FFFF) {
447                         errindex = (glong)(lstr - str)-1;
448                         if (error)
449                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
450                                 "Character out of range for UTF-16");
451                         if (items_read)
452                                 *items_read = errindex;
453                         return 0;
454
455                 } else {
456                         retlen+=2;
457                 }
458         }
459
460         if (items_read)
461                 *items_read = (glong)(lstr - str);
462         return retlen;
463 }
464
465 gunichar2*
466 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
467 {
468         glong allocsz;
469         gunichar2 *retstr = 0;
470         gunichar2 *retch = 0;
471         glong nwritten = 0;
472         GError *lerror =0 ;
473
474         allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
475
476         if (!lerror) {
477                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
478                 retstr[allocsz] = '\0';
479
480                 while (*str != '\0' && len--) {
481                         gunichar ch;
482                         ch = *str++;
483                         if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
484                                 *retch++ = (gunichar2)ch;
485                                 nwritten ++;
486                         } else {
487                                 ch -= 0x0010000UL;
488                                 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
489                                 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
490                                 nwritten +=2;
491                         }
492                 }
493         }
494
495         if (items_written)
496                 *items_written = nwritten;
497         if (error)
498                 *error = lerror;
499
500         return retstr;
501 }
502
503 static glong
504 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
505 {
506         glong retlen = 0;
507         glong errindex = 0;
508         const gunichar2 *lstr = str;
509         gunichar2 ch,ch2;
510
511         if (!str)
512                 return 0;
513
514         while (*lstr != '\0' && len--) {
515                 ch = *lstr++;
516                 if (ch >= 0xD800 && ch <= 0xDBFF) {
517                         if (!len--) {
518                                 lstr--;
519                                 break;
520                         }
521                         ch2 = *lstr;
522                         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
523                                 lstr++;
524                         } else {
525                                 errindex = (glong)(lstr - str);
526                                 if (error)
527                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
528                                         "Invalid sequence in conversion input");
529                                 if (items_read)
530                                         *items_read = errindex;
531                                 return 0;
532                         }
533                 } else {
534                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
535                                 errindex = (glong)(lstr - str)-1;
536                                 if (error)
537                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
538                                         "Invalid sequence in conversion input");
539                                 if (items_read)
540                                         *items_read = errindex;
541                                 return 0;
542                         }
543                 }
544                 retlen++;
545         }
546
547         if (items_read)
548                 *items_read = (glong)(lstr - str);
549
550         return retlen;
551 }
552
553 gunichar*
554 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
555 {
556         glong allocsz;
557         gunichar *retstr = 0;
558         gunichar *retch = 0;
559         glong nwritten = 0;
560         GError *lerror =0 ;
561         gunichar ch,ch2;
562
563         allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
564
565         if (!lerror) {
566                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
567                 retstr[allocsz] = '\0';
568                 nwritten = allocsz;
569
570                 while (*str != '\0' && allocsz--) {
571                         ch = *str++;
572                         if (ch >= 0xD800 && ch <= 0xDBFF) {
573                                 ch2 = *str++;
574                                 ch = ((ch - (gunichar)0xD800) << 10)
575                                       + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
576                         }
577                         *retch++ = ch;
578                 }
579         }
580
581         if (items_written)
582                 *items_written = nwritten;
583         if (error)
584                 *error = lerror;
585
586         return retstr;
587 }