eglib/src/gutf8.c

   1 /*
   2  * gutf8.c: UTF-8 conversion
   3  *
   4  * Author:
   5  *   Atsushi Enomoto  <atsushi@ximian.com>
   6  *
   7  * (C) 2006 Novell, Inc.
   8  */
   9
  10 #include <stdio.h>
  11 #include <glib.h>
  12
/* Value handed back by g_convert_error_quark () below. */
gpointer error_quark = "ERROR";

/* Sizing passes used by g_utf8_to_utf16 () and g_utf16_to_utf8 ();
   declared up front so their definition order does not matter. */
static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
  17
  18 gpointer
  19 g_convert_error_quark ()
  20 {
  21         return error_quark;
  22 }
  23
  24 static gunichar*
  25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
  26 {
  27         glong i, u16len, u32len;
  28         gunichar2 *u16str;
  29         gunichar *u32str;
  30         gchar *u8str;
  31         GError **err = NULL;
  32
  33         u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
  34         u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
  35         for (i = 0; i < u32len; i++) {
  36                 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
  37         }
  38         g_free (u16str);
  39         u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
  40         u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
  41         g_free (u32str);
  42         g_free (u16str);
  43         return (gunichar*)u8str;
  44 }
  45
  46 gchar*
  47 g_utf8_strup (const gchar *str, gssize len)
  48 {
  49         return (gchar*)utf8_case_conv (str, len, TRUE);
  50 }
  51
  52 gchar*
  53 g_utf8_strdown (const gchar *str, gssize len)
  54 {
  55         return (gchar*)utf8_case_conv (str, len, FALSE);
  56 }
  57
  58 static glong
  59 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
  60 {
  61         /* It is almost identical to UTF8Encoding.GetCharCount() */
  62         guchar ch, mb_size, mb_remain;
  63         gboolean overlong;
  64         guint32 codepoint;
  65         glong in_pos, ret;
  66
  67         if (len < 0)
  68                 len = (glong) strlen (str);
  69
  70         in_pos = 0;
  71         ret = 0;
  72
  73         /* Common case */
  74         for (in_pos = 0; in_pos < len && str [in_pos] < 0x80; in_pos++)
  75                 ret ++;
  76
  77         if (in_pos == len) {
  78                 if (items_read)
  79                         *items_read = in_pos;
  80                 return ret;
  81         }
  82
  83         mb_size = 0;
  84         mb_remain = 0;
  85         overlong = 0;
  86
  87         for (; in_pos < len; in_pos++) {
  88                 ch = str [in_pos];
  89                 if (mb_size == 0) {
  90                         if (ch < 0x80)
  91                                 ret++;
  92                         else if ((ch & 0xE0) == 0xC0) {
  93                                 codepoint = ch & 0x1F;
  94                                 mb_size = 2;
  95                         } else if ((ch & 0xF0) == 0xE0) {
  96                                 codepoint = ch & 0x0F;
  97                                 mb_size = 3;
  98                         } else if ((ch & 0xF8) == 0xF0) {
  99                                 codepoint = ch & 7;
 100                                 mb_size = 4;
 101                         } else if ((ch & 0xFC) == 0xF8) {
 102                                 codepoint = ch & 3;
 103                                 mb_size = 5;
 104                         } else if ((ch & 0xFE) == 0xFC) {
 105                                 codepoint = ch & 3;
 106                                 mb_size = 6;
 107                         } else {
 108                                 /* invalid utf-8 sequence */
 109                                 if (error) {
 110                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
 111                                         if (items_read)
 112                                                 *items_read = in_pos;
 113                                         return -1;
 114                                 } else {
 115                                         codepoint = 0;
 116                                         mb_remain = mb_size = 0;
 117                                 }
 118                         }
 119                         if (mb_size > 1)
 120                                 mb_remain = mb_size - 1;
 121                 } else {
 122                         if ((ch & 0xC0) == 0x80) {
 123                                 codepoint = (codepoint << 6) | (ch & 0x3F);
 124                                 if (--mb_remain == 0) {
 125                                         /* multi byte character is fully consumed now. */
 126                                         if (codepoint < 0x10000) {
 127                                                 switch (mb_size) {
 128                                                 case 2:
 129                                                         overlong = codepoint < 0x7F;
 130                                                         break;
 131                                                 case 3:
 132                                                         overlong = codepoint < 0x7FF;
 133                                                         break;
 134                                                 case 4:
 135                                                         overlong = codepoint < 0xFFFF;
 136                                                         break;
 137                                                 case 5:
 138                                                         overlong = codepoint < 0x1FFFFF;
 139                                                         break;
 140                                                 case 6:
 141                                                         overlong = codepoint < 0x03FFFFFF;
 142                                                         break;
 143                                                 }
 144                                                 if (overlong) {
 145                                                         /* invalid utf-8 sequence (overlong) */
 146                                                         if (error) {
 147                                                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
 148                                                                 if (items_read)
 149                                                                         *items_read = in_pos;
 150                                                                 return -1;
 151                                                         } else {
 152                                                                 codepoint = 0;
 153                                                                 mb_remain = 0;
 154                                                                 overlong = FALSE;
 155                                                         }
 156                                                 }
 157                                                 else
 158                                                         ret++;
 159                                         } else if (codepoint < 0x110000) {
 160                                                 /* surrogate pair */
 161                                                 ret += 2;
 162                                         } else {
 163                                                 /* invalid utf-8 sequence (excess) */
 164                                                 if (error) {
 165                                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
 166                                                         if (items_read)
 167                                                                 *items_read = in_pos;
 168                                                         return -1;
 169                                                 } else {
 170                                                         codepoint = 0;
 171                                                         mb_remain = 0;
 172                                                 }
 173                                         }
 174                                         mb_size = 0;
 175                                 }
 176                         } else {
 177                                 /* invalid utf-8 sequence */
 178                                 if (error) {
 179                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
 180                                         if (items_read)
 181                                                 *items_read = in_pos;
 182                                         return -1;
 183                                 } else {
 184                                         codepoint = 0;
 185                                         mb_remain = mb_size = 0;
 186                                 }
 187                         }
 188                 }
 189         }
 190
 191         if (items_read)
 192                 *items_read = in_pos;
 193         return ret;
 194 }
 195
 196 gunichar2*
 197 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
 198 {
 199         /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
 200            but error check is always done at utf8_to_utf16_len() so that
 201            the conversion core below simply resets erroreous bits */
 202         glong utf16_len;
 203         gunichar2 *ret;
 204         guchar ch, mb_size, mb_remain;
 205         guint32 codepoint;
 206         glong in_pos, out_pos;
 207
 208         utf16_len = 0;
 209         mb_size = 0;
 210         mb_remain = 0;
 211         in_pos = 0;
 212         out_pos = 0;
 213
 214         if (error)
 215                 *error = NULL;
 216
 217         if (len < 0)
 218                 len = (glong) strlen (str);
 219
 220         if (items_read)
 221                 *items_read = 0;
 222         if (items_written)
 223                 *items_written = 0;
 224         utf16_len = utf8_to_utf16_len (str, len, items_read, error);
 225         if (error)
 226                 if (*error)
 227                         return NULL;
 228         if (utf16_len < 0)
 229                 return NULL;
 230
 231         ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
 232
 233         /* Common case */
 234         for (in_pos = 0; in_pos < len; in_pos++) {
 235                 ch = (guchar) str [in_pos];
 236
 237                 if (ch >= 0x80)
 238                         break;
 239                 ret [out_pos++] = ch;
 240         }
 241
 242         for (; in_pos < len; in_pos++) {
 243                 ch = (guchar) str [in_pos];
 244                 if (mb_size == 0) {
 245                         if (ch < 0x80)
 246                                 ret [out_pos++] = ch;
 247                         else if ((ch & 0xE0) == 0xC0) {
 248                                 codepoint = ch & 0x1F;
 249                                 mb_size = 2;
 250                         } else if ((ch & 0xF0) == 0xE0) {
 251                                 codepoint = ch & 0x0F;
 252                                 mb_size = 3;
 253                         } else if ((ch & 0xF8) == 0xF0) {
 254                                 codepoint = ch & 7;
 255                                 mb_size = 4;
 256                         } else if ((ch & 0xFC) == 0xF8) {
 257                                 codepoint = ch & 3;
 258                                 mb_size = 5;
 259                         } else if ((ch & 0xFE) == 0xFC) {
 260                                 codepoint = ch & 3;
 261                                 mb_size = 6;
 262                         } else {
 263                                 /* invalid utf-8 sequence */
 264                                 codepoint = 0;
 265                                 mb_remain = mb_size = 0;
 266                         }
 267                         if (mb_size > 1)
 268                                 mb_remain = mb_size - 1;
 269                 } else {
 270                         if ((ch & 0xC0) == 0x80) {
 271                                 codepoint = (codepoint << 6) | (ch & 0x3F);
 272                                 if (--mb_remain == 0) {
 273                                         /* multi byte character is fully consumed now. */
 274                                         if (codepoint < 0x10000) {
 275                                                 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
 276                                         } else if (codepoint < 0x110000) {
 277                                                 /* surrogate pair */
 278                                                 codepoint -= 0x10000;
 279                                                 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
 280                                                 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
 281                                         } else {
 282                                                 /* invalid utf-8 sequence (excess) */
 283                                                 codepoint = 0;
 284                                                 mb_remain = 0;
 285                                         }
 286                                         mb_size = 0;
 287                                 }
 288                         } else {
 289                                 /* invalid utf-8 sequence */
 290                                 codepoint = 0;
 291                                 mb_remain = mb_size = 0;
 292                         }
 293                 }
 294         }
 295
 296         ret [out_pos] = 0;
 297         if (items_written)
 298                 *items_written = out_pos;
 299         return ret;
 300 }
 301
 302 gchar*
 303 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
 304 {
 305         /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
 306            but error check is always done at utf16_to_utf8_len() so that
 307            the conversion core below simply resets erroreous bits */
 308         glong utf8_len;
 309         gchar *ret;
 310         glong in_pos, out_pos;
 311         gunichar2 ch;
 312         guint32 codepoint = 0;
 313         gboolean surrogate;
 314
 315         in_pos = 0;
 316         out_pos = 0;
 317         surrogate = FALSE;
 318
 319         if (items_read)
 320                 *items_read = 0;
 321         if (items_written)
 322                 *items_written = 0;
 323         utf8_len = utf16_to_utf8_len (str, len, items_read, error);
 324         if (error)
 325                 if (*error)
 326                         return NULL;
 327         if (utf8_len < 0)
 328                 return NULL;
 329
 330         ret = g_malloc ((1+utf8_len) * sizeof (gchar));
 331
 332         while (len < 0 ? str [in_pos] : in_pos < len) {
 333                 ch = str [in_pos];
 334                 if (surrogate) {
 335                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 336                                 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
 337                                 surrogate = 0;
 338                         } else {
 339                                 surrogate = 0;
 340                                 /* invalid surrogate pair */
 341                                 continue;
 342                         }
 343                 } else {
 344                         /* fast path optimization */
 345                         if (ch < 0x80) {
 346                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 347                                         if (str [in_pos] < 0x80)
 348                                                 ret [out_pos++] = (gchar)(str [in_pos]);
 349                                         else
 350                                                 break;
 351                                 }
 352                                 continue;
 353                         }
 354                         else if (ch >= 0xD800 && ch <= 0xDBFF)
 355                                 surrogate = ch;
 356                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
 357                                 /* invalid surrogate pair */
 358                                 continue;
 359                         }
 360                         else
 361                                 codepoint = ch;
 362                 }
 363                 in_pos++;
 364
 365                 if (surrogate != 0)
 366                         continue;
 367                 if (codepoint < 0x80)
 368                         ret [out_pos++] = (gchar) codepoint;
 369                 else if (codepoint < 0x0800) {
 370                         ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
 371                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 372                 } else if (codepoint < 0x10000) {
 373                         ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
 374                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
 375                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 376                 } else {
 377                         ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
 378                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
 379                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
 380                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 381                 }
 382         }
 383         ret [out_pos] = 0;
 384
 385         if (items_written)
 386                 *items_written = out_pos;
 387         return ret;
 388 }
 389
 390 static glong
 391 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
 392 {
 393         glong ret, in_pos;
 394         gunichar2 ch;
 395         gboolean surrogate;
 396
 397         ret = 0;
 398         in_pos = 0;
 399         surrogate = FALSE;
 400
 401         while (len < 0 ? str [in_pos] : in_pos < len) {
 402                 ch = str [in_pos];
 403                 if (surrogate) {
 404                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 405                                 ret += 4;
 406                         } else {
 407                                 /* invalid surrogate pair */
 408                                 if (error) {
 409                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
 410                                         if (items_read)
 411                                                 *items_read = in_pos;
 412                                         return -1;
 413                                 } /* otherwise just ignore. */
 414                         }
 415                         surrogate = FALSE;
 416                 } else {
 417                         /* fast path optimization */
 418                         if (ch < 0x80) {
 419                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 420                                         if (str [in_pos] < 0x80)
 421                                                 ++ret;
 422                                         else
 423                                                 break;
 424                                 }
 425                                 continue;
 426                         }
 427                         else if (ch < 0x0800)
 428                                 ret += 2;
 429                         else if (ch >= 0xD800 && ch <= 0xDBFF)
 430                                 surrogate = TRUE;
 431                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
 432                                 /* invalid surrogate pair */
 433                                 if (error) {
 434                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
 435                                         if (items_read)
 436                                                 *items_read = in_pos;
 437                                         return -1;
 438                                 } /* otherwise just ignore. */
 439                         }
 440                         else
 441                                 ret += 3;
 442                 }
 443                 in_pos++;
 444         }
 445
 446         if (items_read)
 447                 *items_read = in_pos;
 448         return ret;
 449 }
 450
 451 static glong
 452 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
 453 {
 454         glong retlen = 0;
 455         glong errindex = 0;
 456         const gunichar *lstr = str;
 457
 458         if (!str)
 459                 return 0;
 460
 461         while (*lstr != '\0' && len--) {
 462                 gunichar ch;
 463                 ch = *lstr++;
 464                 if (ch <= 0x0000FFFF) {
 465                         if (ch >= 0xD800 && ch <= 0xDFFF) {
 466                                 errindex = (glong)(lstr - str)-1;
 467                                 if (error)
 468                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 469                                         "Invalid sequence in conversion input");
 470                                 if (items_read)
 471                                         *items_read = errindex;
 472                                 return 0;
 473                         } else {
 474                                 retlen++;
 475                         }
 476                 } else if (ch > 0x10FFFF) {
 477                         errindex = (glong)(lstr - str)-1;
 478                         if (error)
 479                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 480                                 "Character out of range for UTF-16");
 481                         if (items_read)
 482                                 *items_read = errindex;
 483                         return 0;
 484
 485                 } else {
 486                         retlen+=2;
 487                 }
 488         }
 489
 490         if (items_read)
 491                 *items_read = (glong)(lstr - str);
 492         return retlen;
 493 }
 494
 495 gunichar2*
 496 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
 497 {
 498         glong allocsz;
 499         gunichar2 *retstr = 0;
 500         gunichar2 *retch = 0;
 501         glong nwritten = 0;
 502         GError *lerror =0 ;
 503
 504         allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
 505
 506         if (!lerror) {
 507                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
 508                 retstr[allocsz] = '\0';
 509
 510                 while (*str != '\0' && len--) {
 511                         gunichar ch;
 512                         ch = *str++;
 513                         if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
 514                                 *retch++ = (gunichar2)ch;
 515                                 nwritten ++;
 516                         } else {
 517                                 ch -= 0x0010000UL;
 518                                 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
 519                                 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
 520                                 nwritten +=2;
 521                         }
 522                 }
 523         }
 524
 525         if (items_written)
 526                 *items_written = nwritten;
 527         if (error)
 528                 *error = lerror;
 529
 530         return retstr;
 531 }
 532
 533 static glong
 534 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
 535 {
 536         glong retlen = 0;
 537         glong errindex = 0;
 538         const gunichar2 *lstr = str;
 539         gunichar2 ch,ch2;
 540
 541         if (!str)
 542                 return 0;
 543
 544         while (*lstr != '\0' && len--) {
 545                 ch = *lstr++;
 546                 if (ch >= 0xD800 && ch <= 0xDBFF) {
 547                         if (!len--) {
 548                                 lstr--;
 549                                 break;
 550                         }
 551                         ch2 = *lstr;
 552                         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
 553                                 lstr++;
 554                         } else {
 555                                 errindex = (glong)(lstr - str);
 556                                 if (error)
 557                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 558                                         "Invalid sequence in conversion input");
 559                                 if (items_read)
 560                                         *items_read = errindex;
 561                                 return 0;
 562                         }
 563                 } else {
 564                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 565                                 errindex = (glong)(lstr - str)-1;
 566                                 if (error)
 567                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 568                                         "Invalid sequence in conversion input");
 569                                 if (items_read)
 570                                         *items_read = errindex;
 571                                 return 0;
 572                         }
 573                 }
 574                 retlen++;
 575         }
 576
 577         if (items_read)
 578                 *items_read = (glong)(lstr - str);
 579
 580         return retlen;
 581 }
 582
 583 gunichar*
 584 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
 585 {
 586         glong allocsz;
 587         gunichar *retstr = 0;
 588         gunichar *retch = 0;
 589         glong nwritten = 0;
 590         GError *lerror =0 ;
 591         gunichar ch,ch2;
 592
 593         allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
 594
 595         if (!lerror) {
 596                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
 597                 retstr[allocsz] = '\0';
 598                 nwritten = allocsz;
 599
 600                 while (*str != '\0' && allocsz--) {
 601                         ch = *str++;
 602                         if (ch >= 0xD800 && ch <= 0xDBFF) {
 603                                 ch2 = *str++;
 604                                 ch = ((ch - (gunichar)0xD800) << 10)
 605                                       + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
 606                         }
 607                         *retch++ = ch;
 608                 }
 609         }
 610
 611         if (items_written)
 612                 *items_written = nwritten;
 613         if (error)
 614                 *error = lerror;
 615
 616         return retstr;
 617 }