eglib/src/giconv.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Copyright (C) 2011 Jeffrey Stedfast
   4  *
   5  *  Permission is hereby granted, free of charge, to any person
   6  *  obtaining a copy of this software and associated documentation
   7  *  files (the "Software"), to deal in the Software without
   8  *  restriction, including without limitation the rights to use, copy,
   9  *  modify, merge, publish, distribute, sublicense, and/or sell copies
  10  *  of the Software, and to permit persons to whom the Software is
  11  *  furnished to do so, subject to the following conditions:
  12  *
  13  *  The above copyright notice and this permission notice shall be
  14  *  included in all copies or substantial portions of the Software.
  15  *
  16  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  20  *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  21  *  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23  *  DEALINGS IN THE SOFTWARE.
  24  */
  25
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include <glib.h>
  31 #include <string.h>
  32 #ifdef HAVE_ICONV_H
  33 #include <iconv.h>
  34 #endif
  35 #include <errno.h>
  36
/*
 * FORCE_INLINE(RET_TYPE) expands to RET_TYPE decorated with the compiler's
 * "always inline" spelling: __forceinline under MSVC, and
 * inline + __attribute__((always_inline)) everywhere else.
 */
#ifdef _MSC_VER
#define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE
#else
#define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline))
#endif
  42
  43
/*
 * Compile-time switches: when non-zero, decode_utf8 () / encode_utf8 ()
 * use their switch-based (unrolled) byte loops instead of the plain
 * for loops.
 */
#define UNROLL_DECODE_UTF8 0
#define UNROLL_ENCODE_UTF8 0
  46
/*
 * Decoder: reads one character from inbuf (inleft bytes available) and
 * stores its code point in *outchar. The decoders in this file return
 * the number of bytes consumed, or a negative value with errno set.
 *
 * Encoder: writes code point c into outbuf (outleft bytes available).
 * The encoders in this file return the number of bytes written, or a
 * negative value with errno set.
 */
typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
  49
/* Conversion descriptor handed out by g_iconv_open (). */
struct _GIConv {
        /* built-in coders; either may be NULL when the system iconv
           descriptor below is used instead */
        Decoder decode;
        Encoder encode;
        /* code point that was decoded but not yet encoded (e.g. the
           output was full); (gunichar) -1 when nothing is pending */
        gunichar c;
#ifdef HAVE_ICONV
        /* system iconv descriptor, or (iconv_t) -1 when the built-in
           coders handle the conversion */
        iconv_t cd;
#endif
};
  58
/*
 * Forward declarations of the built-in decoders/encoders so that the
 * charsets table can reference them before their definitions.
 */
static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);

static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);

static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);

static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);

static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);

static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
  76
/*
 * The endian-less names ("UTF-16", "UTF-32") map to the coder that
 * matches the host byte order.
 */
#if G_BYTE_ORDER == G_LITTLE_ENDIAN
#define decode_utf32 decode_utf32le
#define encode_utf32 encode_utf32le
#define decode_utf16 decode_utf16le
#define encode_utf16 encode_utf16le
#else
#define decode_utf32 decode_utf32be
#define encode_utf32 encode_utf32be
#define decode_utf16 decode_utf16be
#define encode_utf16 encode_utf16be
#endif
  88
  89 static struct {
  90         const char *name;
  91         Decoder decoder;
  92         Encoder encoder;
  93 } charsets[] = {
  94         { "ISO-8859-1", decode_latin1,  encode_latin1  },
  95         { "ISO8859-1",  decode_latin1,  encode_latin1  },
  96         { "UTF-32BE",   decode_utf32be, encode_utf32be },
  97         { "UTF-32LE",   decode_utf32le, encode_utf32le },
  98         { "UTF-16BE",   decode_utf16be, encode_utf16be },
  99         { "UTF-16LE",   decode_utf16le, encode_utf16le },
 100         { "UTF-32",     decode_utf32,   encode_utf32   },
 101         { "UTF-16",     decode_utf16,   encode_utf16   },
 102         { "UTF-8",      decode_utf8,    encode_utf8    },
 103         { "US-ASCII",   decode_latin1,  encode_latin1  },
 104         { "Latin1",     decode_latin1,  encode_latin1  },
 105         { "ASCII",      decode_latin1,  encode_latin1  },
 106         { "UTF32",      decode_utf32,   encode_utf32   },
 107         { "UTF16",      decode_utf16,   encode_utf16   },
 108         { "UTF8",       decode_utf8,    encode_utf8    },
 109 };
 110
 111
 112 GIConv
 113 g_iconv_open (const char *to_charset, const char *from_charset)
 114 {
 115 #ifdef HAVE_ICONV
 116         iconv_t icd = (iconv_t) -1;
 117 #endif
 118         Decoder decoder = NULL;
 119         Encoder encoder = NULL;
 120         GIConv cd;
 121         guint i;
 122
 123         if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
 124                 errno = EINVAL;
 125
 126                 return (GIConv) -1;
 127         }
 128
 129         for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
 130                 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
 131                         decoder = charsets[i].decoder;
 132
 133                 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
 134                         encoder = charsets[i].encoder;
 135         }
 136
 137         if (!encoder || !decoder) {
 138 #ifdef HAVE_ICONV
 139                 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
 140                         return (GIConv) -1;
 141 #else
 142                 errno = EINVAL;
 143
 144                 return (GIConv) -1;
 145 #endif
 146         }
 147
 148         cd = (GIConv) g_malloc (sizeof (struct _GIConv));
 149         cd->decode = decoder;
 150         cd->encode = encoder;
 151         cd->c = -1;
 152
 153 #ifdef HAVE_ICONV
 154         cd->cd = icd;
 155 #endif
 156
 157         return cd;
 158 }
 159
 160 int
 161 g_iconv_close (GIConv cd)
 162 {
 163 #ifdef HAVE_ICONV
 164         if (cd->cd != (iconv_t) -1)
 165                 iconv_close (cd->cd);
 166 #endif
 167
 168         g_free (cd);
 169
 170         return 0;
 171 }
 172
 173 gsize
 174 g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
 175          gchar **outbytes, gsize *outbytesleft)
 176 {
 177         size_t inleft, outleft;
 178         char *inptr, *outptr;
 179         gunichar c;
 180         int rc = 0;
 181
 182 #ifdef HAVE_ICONV
 183         if (cd->cd != (iconv_t) -1) {
 184                 /* Note: gsize may have a different size than size_t, so we need to
 185                    remap inbytesleft and outbytesleft to size_t's. */
 186                 size_t *outleftptr;
 187
 188                 if (outbytesleft) {
 189                         outleft = *outbytesleft;
 190                         outleftptr = &outleft;
 191                 } else {
 192                         outleftptr = NULL;
 193                 }
 194
 195                 inleft = inbytesleft ? *inbytesleft : 0;
 196
 197                 return iconv (cd->cd, inbytes, &inleft, outbytes, outleftptr);
 198         }
 199 #endif
 200
 201         if (outbytes == NULL || outbytesleft == NULL) {
 202                 /* reset converter */
 203                 cd->c = -1;
 204                 return 0;
 205         }
 206
 207         inleft = inbytesleft ? *inbytesleft : 0;
 208         inptr = inbytes ? *inbytes : NULL;
 209         outleft = *outbytesleft;
 210         outptr = *outbytes;
 211
 212         if ((c = cd->c) != (gunichar) -1)
 213                 goto encode;
 214
 215         while (inleft > 0) {
 216                 if ((rc = cd->decode (inptr, inleft, &c)) < 0)
 217                         break;
 218
 219                 inleft -= rc;
 220                 inptr += rc;
 221
 222         encode:
 223                 if ((rc = cd->encode (c, outptr, outleft)) < 0)
 224                         break;
 225
 226                 c = (gunichar) -1;
 227                 outleft -= rc;
 228                 outptr += rc;
 229         }
 230
 231         if (inbytesleft)
 232                 *inbytesleft = inleft;
 233
 234         if (inbytes)
 235                 *inbytes = inptr;
 236
 237         *outbytesleft = outleft;
 238         *outbytes = outptr;
 239         cd->c = c;
 240
 241         return rc < 0 ? -1 : 0;
 242 }
 243
 244 /*
 245  * Unicode encoders and decoders
 246  */
 247
 248 static int
 249 decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
 250 {
 251         unsigned char *inptr = (unsigned char *) inbuf;
 252         gunichar c;
 253
 254         if (inleft < 4) {
 255                 errno = EINVAL;
 256                 return -1;
 257         }
 258
 259         c = (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
 260
 261         if (c >= 0xd800 && c < 0xe000) {
 262                 errno = EILSEQ;
 263                 return -1;
 264         } else if (c >= 0x110000) {
 265                 errno = EILSEQ;
 266                 return -1;
 267         }
 268
 269         *outchar = c;
 270
 271         return 4;
 272 }
 273
 274 static int
 275 decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
 276 {
 277         unsigned char *inptr = (unsigned char *) inbuf;
 278         gunichar c;
 279
 280         if (inleft < 4) {
 281                 errno = EINVAL;
 282                 return -1;
 283         }
 284
 285         c = (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
 286
 287         if (c >= 0xd800 && c < 0xe000) {
 288                 errno = EILSEQ;
 289                 return -1;
 290         } else if (c >= 0x110000) {
 291                 errno = EILSEQ;
 292                 return -1;
 293         }
 294
 295         *outchar = c;
 296
 297         return 4;
 298 }
 299
 300 static int
 301 encode_utf32be (gunichar c, char *outbuf, size_t outleft)
 302 {
 303         unsigned char *outptr = (unsigned char *) outbuf;
 304
 305         if (outleft < 4) {
 306                 errno = E2BIG;
 307                 return -1;
 308         }
 309
 310         outptr[0] = (c >> 24) & 0xff;
 311         outptr[1] = (c >> 16) & 0xff;
 312         outptr[2] = (c >> 8) & 0xff;
 313         outptr[3] = c & 0xff;
 314
 315         return 4;
 316 }
 317
 318 static int
 319 encode_utf32le (gunichar c, char *outbuf, size_t outleft)
 320 {
 321         unsigned char *outptr = (unsigned char *) outbuf;
 322
 323         if (outleft < 4) {
 324                 errno = E2BIG;
 325                 return -1;
 326         }
 327
 328         outptr[0] = c & 0xff;
 329         outptr[1] = (c >> 8) & 0xff;
 330         outptr[2] = (c >> 16) & 0xff;
 331         outptr[3] = (c >> 24) & 0xff;
 332
 333         return 4;
 334 }
 335
 336 static int
 337 decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
 338 {
 339         unsigned char *inptr = (unsigned char *) inbuf;
 340         gunichar2 c;
 341         gunichar u;
 342
 343         if (inleft < 2) {
 344                 errno = EINVAL;
 345                 return -1;
 346         }
 347
 348         u = (inptr[0] << 8) | inptr[1];
 349
 350         if (u < 0xd800) {
 351                 /* 0x0000 -> 0xd7ff */
 352                 *outchar = u;
 353                 return 2;
 354         } else if (u < 0xdc00) {
 355                 /* 0xd800 -> 0xdbff */
 356                 if (inleft < 4) {
 357                         errno = EINVAL;
 358                         return -2;
 359                 }
 360
 361                 c = (inptr[2] << 8) | inptr[3];
 362
 363                 if (c < 0xdc00 || c > 0xdfff) {
 364                         errno = EILSEQ;
 365                         return -2;
 366                 }
 367
 368                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 369                 *outchar = u;
 370
 371                 return 4;
 372         } else if (u < 0xe000) {
 373                 /* 0xdc00 -> 0xdfff */
 374                 errno = EILSEQ;
 375                 return -1;
 376         } else {
 377                 /* 0xe000 -> 0xffff */
 378                 *outchar = u;
 379                 return 2;
 380         }
 381 }
 382
 383 static int
 384 decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
 385 {
 386         unsigned char *inptr = (unsigned char *) inbuf;
 387         gunichar2 c;
 388         gunichar u;
 389
 390         if (inleft < 2) {
 391                 errno = EINVAL;
 392                 return -1;
 393         }
 394
 395         u = (inptr[1] << 8) | inptr[0];
 396
 397         if (u < 0xd800) {
 398                 /* 0x0000 -> 0xd7ff */
 399                 *outchar = u;
 400                 return 2;
 401         } else if (u < 0xdc00) {
 402                 /* 0xd800 -> 0xdbff */
 403                 if (inleft < 4) {
 404                         errno = EINVAL;
 405                         return -2;
 406                 }
 407
 408                 c = (inptr[3] << 8) | inptr[2];
 409
 410                 if (c < 0xdc00 || c > 0xdfff) {
 411                         errno = EILSEQ;
 412                         return -2;
 413                 }
 414
 415                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 416                 *outchar = u;
 417
 418                 return 4;
 419         } else if (u < 0xe000) {
 420                 /* 0xdc00 -> 0xdfff */
 421                 errno = EILSEQ;
 422                 return -1;
 423         } else {
 424                 /* 0xe000 -> 0xffff */
 425                 *outchar = u;
 426                 return 2;
 427         }
 428 }
 429
 430 static int
 431 encode_utf16be (gunichar c, char *outbuf, size_t outleft)
 432 {
 433         unsigned char *outptr = (unsigned char *) outbuf;
 434         gunichar2 ch;
 435         gunichar c2;
 436
 437         if (c < 0x10000) {
 438                 if (outleft < 2) {
 439                         errno = E2BIG;
 440                         return -1;
 441                 }
 442
 443                 outptr[0] = (c >> 8) & 0xff;
 444                 outptr[1] = c & 0xff;
 445
 446                 return 2;
 447         } else {
 448                 if (outleft < 4) {
 449                         errno = E2BIG;
 450                         return -1;
 451                 }
 452
 453                 c2 = c - 0x10000;
 454
 455                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 456                 outptr[0] = (ch >> 8) & 0xff;
 457                 outptr[1] = ch & 0xff;
 458
 459                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 460                 outptr[2] = (ch >> 8) & 0xff;
 461                 outptr[3] = ch & 0xff;
 462
 463                 return 4;
 464         }
 465 }
 466
 467 static int
 468 encode_utf16le (gunichar c, char *outbuf, size_t outleft)
 469 {
 470         unsigned char *outptr = (unsigned char *) outbuf;
 471         gunichar2 ch;
 472         gunichar c2;
 473
 474         if (c < 0x10000) {
 475                 if (outleft < 2) {
 476                         errno = E2BIG;
 477                         return -1;
 478                 }
 479
 480                 outptr[0] = c & 0xff;
 481                 outptr[1] = (c >> 8) & 0xff;
 482
 483                 return 2;
 484         } else {
 485                 if (outleft < 4) {
 486                         errno = E2BIG;
 487                         return -1;
 488                 }
 489
 490                 c2 = c - 0x10000;
 491
 492                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 493                 outptr[0] = ch & 0xff;
 494                 outptr[1] = (ch >> 8) & 0xff;
 495
 496                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 497                 outptr[2] = ch & 0xff;
 498                 outptr[3] = (ch >> 8) & 0xff;
 499
 500                 return 4;
 501         }
 502 }
 503
 504 static FORCE_INLINE (int)
 505 decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
 506 {
 507         unsigned char *inptr = (unsigned char *) inbuf;
 508         gunichar u;
 509         int n, i;
 510
 511         u = *inptr;
 512
 513         if (u < 0x80) {
 514                 /* simple ascii case */
 515                 *outchar = u;
 516                 return 1;
 517         } else if (u < 0xc2) {
 518                 errno = EILSEQ;
 519                 return -1;
 520         } else if (u < 0xe0) {
 521                 u &= 0x1f;
 522                 n = 2;
 523         } else if (u < 0xf0) {
 524                 u &= 0x0f;
 525                 n = 3;
 526         } else if (u < 0xf8) {
 527                 u &= 0x07;
 528                 n = 4;
 529         } else if (u < 0xfc) {
 530                 u &= 0x03;
 531                 n = 5;
 532         } else if (u < 0xfe) {
 533                 u &= 0x01;
 534                 n = 6;
 535         } else {
 536                 errno = EILSEQ;
 537                 return -1;
 538         }
 539
 540         if (n > inleft) {
 541                 errno = EINVAL;
 542                 return -1;
 543         }
 544
 545 #if UNROLL_DECODE_UTF8
 546         switch (n) {
 547         case 6: u = (u << 6) | (*++inptr ^ 0x80);
 548         case 5: u = (u << 6) | (*++inptr ^ 0x80);
 549         case 4: u = (u << 6) | (*++inptr ^ 0x80);
 550         case 3: u = (u << 6) | (*++inptr ^ 0x80);
 551         case 2: u = (u << 6) | (*++inptr ^ 0x80);
 552         }
 553 #else
 554         for (i = 1; i < n; i++)
 555                 u = (u << 6) | (*++inptr ^ 0x80);
 556 #endif
 557
 558         *outchar = u;
 559
 560         return n;
 561 }
 562
 563 static int
 564 encode_utf8 (gunichar c, char *outbuf, size_t outleft)
 565 {
 566         unsigned char *outptr = (unsigned char *) outbuf;
 567         int base, n, i;
 568
 569         if (c < 0x80) {
 570                 outptr[0] = c;
 571                 return 1;
 572         } else if (c < 0x800) {
 573                 base = 192;
 574                 n = 2;
 575         } else if (c < 0x10000) {
 576                 base = 224;
 577                 n = 3;
 578         } else if (c < 0x200000) {
 579                 base = 240;
 580                 n = 4;
 581         } else if (c < 0x4000000) {
 582                 base = 248;
 583                 n = 5;
 584         } else {
 585                 base = 252;
 586                 n = 6;
 587         }
 588
 589         if (outleft < n) {
 590                 errno = E2BIG;
 591                 return -1;
 592         }
 593
 594 #if UNROLL_ENCODE_UTF8
 595         switch (n) {
 596         case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
 597         case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
 598         case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
 599         case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
 600         case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
 601         case 1: outptr[0] = c | base;
 602         }
 603 #else
 604         for (i = n - 1; i > 0; i--) {
 605                 outptr[i] = (c & 0x3f) | 0x80;
 606                 c >>= 6;
 607         }
 608
 609         outptr[0] = c | base;
 610 #endif
 611
 612         return n;
 613 }
 614
 615 static int
 616 decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
 617 {
 618         *outchar = (unsigned char) *inbuf;
 619         return 1;
 620 }
 621
 622 static int
 623 encode_latin1 (gunichar c, char *outbuf, size_t outleft)
 624 {
 625         if (outleft < 1) {
 626                 errno = E2BIG;
 627                 return -1;
 628         }
 629
 630         if (c > 0xff) {
 631                 errno = EILSEQ;
 632                 return -1;
 633         }
 634
 635         *outbuf = (char) c;
 636
 637         return 1;
 638 }
 639
 640
 641 /*
 642  * Simple conversion API
 643  */
 644
/* Value returned by g_convert_error_quark (); points at a static string. */
static gpointer error_quark = "ConvertError";

/* Returns the pointer stored in error_quark above. */
gpointer
g_convert_error_quark (void)
{
        return error_quark;
}
 652
 653 gchar *
 654 g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
 655            gsize *bytes_read, gsize *bytes_written, GError **err)
 656 {
 657         size_t outsize, outused, outleft, inleft, grow, rc;
 658         char *result, *outbuf, *inbuf;
 659         gboolean flush = FALSE;
 660         gboolean done = FALSE;
 661         GIConv cd;
 662
 663         g_return_val_if_fail (str != NULL, NULL);
 664         g_return_val_if_fail (to_charset != NULL, NULL);
 665         g_return_val_if_fail (from_charset != NULL, NULL);
 666
 667         if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
 668                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 669                              "Conversion from %s to %s not supported.",
 670                              from_charset, to_charset);
 671
 672                 if (bytes_written)
 673                         *bytes_written = 0;
 674
 675                 if (bytes_read)
 676                         *bytes_read = 0;
 677
 678                 return NULL;
 679         }
 680
 681         inleft = len < 0 ? strlen (str) : len;
 682         inbuf = (char *) str;
 683
 684         outleft = outsize = MAX (inleft, 8);
 685         outbuf = result = g_malloc (outsize + 4);
 686
 687         do {
 688                 if (!flush)
 689                         rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
 690                 else
 691                         rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);
 692
 693                 if (rc == (size_t) -1) {
 694                         switch (errno) {
 695                         case E2BIG:
 696                                 /* grow our result buffer */
 697                                 grow = MAX (inleft, 8) << 1;
 698                                 outused = outbuf - result;
 699                                 outsize += grow;
 700                                 outleft += grow;
 701                                 result = g_realloc (result, outsize + 4);
 702                                 outbuf = result + outused;
 703                                 break;
 704                         case EINVAL:
 705                                 /* incomplete input, stop converting and terminate here */
 706                                 if (flush)
 707                                         done = TRUE;
 708                                 else
 709                                         flush = TRUE;
 710                                 break;
 711                         case EILSEQ:
 712                                 /* illegal sequence in the input */
 713                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));
 714
 715                                 if (bytes_read) {
 716                                         /* save offset of the illegal input sequence */
 717                                         *bytes_read = (inbuf - str);
 718                                 }
 719
 720                                 if (bytes_written)
 721                                         *bytes_written = 0;
 722
 723                                 g_iconv_close (cd);
 724                                 g_free (result);
 725                                 return NULL;
 726                         default:
 727                                 /* unknown errno */
 728                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));
 729
 730                                 if (bytes_written)
 731                                         *bytes_written = 0;
 732
 733                                 if (bytes_read)
 734                                         *bytes_read = 0;
 735
 736                                 g_iconv_close (cd);
 737                                 g_free (result);
 738                                 return NULL;
 739                         }
 740                 } else if (flush) {
 741                         /* input has been converted and output has been flushed */
 742                         break;
 743                 } else {
 744                         /* input has been converted, need to flush the output */
 745                         flush = TRUE;
 746                 }
 747         } while (!done);
 748
 749         g_iconv_close (cd);
 750
 751         /* Note: not all charsets can be null-terminated with a single
 752            null byte. UCS2, for example, needs 2 null bytes and UCS4
 753            needs 4. I hope that 4 null bytes is enough to terminate all
 754            multibyte charsets? */
 755
 756         /* null-terminate the result */
 757         memset (outbuf, 0, 4);
 758
 759         if (bytes_written)
 760                 *bytes_written = outbuf - result;
 761
 762         if (bytes_read)
 763                 *bytes_read = inbuf - str;
 764
 765         return result;
 766 }
 767
 768
 769 /*
 770  * Unicode conversion
 771  */
 772
 773 /**
 774  * from http://home.tiscali.nl/t876506/utf8tbl.html
 775  *
 776  * From Unicode UCS-4 to UTF-8:
 777  * Start with the Unicode number expressed as a decimal number and call this ud.
 778  *
 779  * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
 780  *
 781  * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
 782  *    byte 1 = 192 + (ud div 64)
 783  *    byte 2 = 128 + (ud mod 64)
 784  *
 785  * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
 786  *    byte 1 = 224 + (ud div 4096)
 787  *    byte 2 = 128 + ((ud div 64) mod 64)
 788  *    byte 3 = 128 + (ud mod 64)
 789  *
 790  * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
 791  *    byte 1 = 240 + (ud div 262144)
 792  *    byte 2 = 128 + ((ud div 4096) mod 64)
 793  *    byte 3 = 128 + ((ud div 64) mod 64)
 794  *    byte 4 = 128 + (ud mod 64)
 795  *
 796  * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
 797  *    byte 1 = 248 + (ud div 16777216)
 798  *    byte 2 = 128 + ((ud div 262144) mod 64)
 799  *    byte 3 = 128 + ((ud div 4096) mod 64)
 800  *    byte 4 = 128 + ((ud div 64) mod 64)
 801  *    byte 5 = 128 + (ud mod 64)
 802  *
 803  * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
 804  *    byte 1 = 252 + (ud div 1073741824)
 805  *    byte 2 = 128 + ((ud div 16777216) mod 64)
 806  *    byte 3 = 128 + ((ud div 262144) mod 64)
 807  *    byte 4 = 128 + ((ud div 4096) mod 64)
 808  *    byte 5 = 128 + ((ud div 64) mod 64)
 809  *    byte 6 = 128 + (ud mod 64)
 810  **/
 811 gint
 812 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 813 {
 814         int base, n, i;
 815
 816         if (c < 0x80) {
 817                 base = 0;
 818                 n = 1;
 819         } else if (c < 0x800) {
 820                 base = 192;
 821                 n = 2;
 822         } else if (c < 0x10000) {
 823                 base = 224;
 824                 n = 3;
 825         } else if (c < 0x200000) {
 826                 base = 240;
 827                 n = 4;
 828         } else if (c < 0x4000000) {
 829                 base = 248;
 830                 n = 5;
 831         } else if (c < 0x80000000) {
 832                 base = 252;
 833                 n = 6;
 834         } else {
 835                 return -1;
 836         }
 837
 838         if (outbuf != NULL) {
 839                 for (i = n - 1; i > 0; i--) {
 840                         /* mask off 6 bits worth and add 128 */
 841                         outbuf[i] = (c & 0x3f) | 0x80;
 842                         c >>= 6;
 843                 }
 844
 845                 /* first character has a different base */
 846                 outbuf[0] = c | base;
 847         }
 848
 849         return n;
 850 }
 851
 852 static FORCE_INLINE (int)
 853 g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
 854 {
 855         gunichar c2;
 856
 857         if (c < 0xd800) {
 858                 if (outbuf)
 859                         *outbuf = (gunichar2) c;
 860
 861                 return 1;
 862         } else if (c < 0xe000) {
 863                 return -1;
 864         } else if (c < 0x10000) {
 865                 if (outbuf)
 866                         *outbuf = (gunichar2) c;
 867
 868                 return 1;
 869         } else if (c < 0x110000) {
 870                 if (outbuf) {
 871                         c2 = c - 0x10000;
 872
 873                         outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
 874                         outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 875                 }
 876
 877                 return 2;
 878         } else {
 879                 return -1;
 880         }
 881 }
 882
 883 gunichar *
 884 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
 885 {
 886         gunichar *outbuf, *outptr;
 887         char *inptr;
 888         glong n, i;
 889
 890         g_return_val_if_fail (str != NULL, NULL);
 891
 892         n = g_utf8_strlen (str, len);
 893
 894         if (items_written)
 895                 *items_written = n;
 896
 897         outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
 898         inptr = (char *) str;
 899
 900         for (i = 0; i < n; i++) {
 901                 *outptr++ = g_utf8_get_char (inptr);
 902                 inptr = g_utf8_next_char (inptr);
 903         }
 904
 905         *outptr = 0;
 906
 907         return outbuf;
 908 }
 909
 910 static gunichar2 *
 911 eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, GError **err)
 912 {
 913         gunichar2 *outbuf, *outptr;
 914         size_t outlen = 0;
 915         size_t inleft;
 916         char *inptr;
 917         gunichar c;
 918         int n;
 919
 920         g_return_val_if_fail (str != NULL, NULL);
 921
 922         if (len < 0) {
 923                 if (include_nuls) {
 924                         g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
 925                         return NULL;
 926                 }
 927                 len = strlen (str);
 928         }
 929
 930         inptr = (char *) str;
 931         inleft = len;
 932
 933         while (inleft > 0) {
 934                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
 935                         if (errno == EILSEQ) {
 936                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 937                                              "Illegal byte sequence encounted in the input.");
 938                         } else if (items_read) {
 939                                 /* partial input is ok if we can let our caller know... */
 940                                 break;
 941                         } else {
 942                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 943                                              "Partial byte sequence encountered in the input.");
 944                         }
 945
 946                         if (items_read)
 947                                 *items_read = inptr - str;
 948
 949                         if (items_written)
 950                                 *items_written = 0;
 951
 952                         return NULL;
 953                 } else if (c == 0 && !include_nuls)
 954                         break;
 955
 956                 outlen += g_unichar_to_utf16 (c, NULL);
 957                 inleft -= n;
 958                 inptr += n;
 959         }
 960
 961         if (items_read)
 962                 *items_read = inptr - str;
 963
 964         if (items_written)
 965                 *items_written = outlen;
 966
 967         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
 968         inptr = (char *) str;
 969         inleft = len;
 970
 971         while (inleft > 0) {
 972                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 973                         break;
 974                 else if (c == 0 && !include_nuls)
 975                         break;
 976
 977                 outptr += g_unichar_to_utf16 (c, outptr);
 978                 inleft -= n;
 979                 inptr += n;
 980         }
 981
 982         *outptr = '\0';
 983
 984         return outbuf;
 985 }
 986
 987 gunichar2 *
 988 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 989 {
 990         return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, err);
 991 }
 992
 993 gunichar2 *
 994 eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 995 {
 996         return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, err);
 997 }
 998
 999 gunichar *
1000 g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
1001 {
1002         gunichar *outbuf, *outptr;
1003         size_t outlen = 0;
1004         size_t inleft;
1005         char *inptr;
1006         gunichar c;
1007         int n;
1008
1009         g_return_val_if_fail (str != NULL, NULL);
1010
1011         if (len < 0)
1012                 len = strlen (str);
1013
1014         inptr = (char *) str;
1015         inleft = len;
1016
1017         while (inleft > 0) {
1018                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
1019                         if (errno == EILSEQ) {
1020                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1021                                              "Illegal byte sequence encounted in the input.");
1022                         } else if (items_read) {
1023                                 /* partial input is ok if we can let our caller know... */
1024                                 break;
1025                         } else {
1026                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1027                                              "Partial byte sequence encountered in the input.");
1028                         }
1029
1030                         if (items_read)
1031                                 *items_read = inptr - str;
1032
1033                         if (items_written)
1034                                 *items_written = 0;
1035
1036                         return NULL;
1037                 } else if (c == 0)
1038                         break;
1039
1040                 outlen += 4;
1041                 inleft -= n;
1042                 inptr += n;
1043         }
1044
1045         if (items_written)
1046                 *items_written = outlen / 4;
1047
1048         if (items_read)
1049                 *items_read = inptr - str;
1050
1051         outptr = outbuf = g_malloc (outlen + 4);
1052         inptr = (char *) str;
1053         inleft = len;
1054
1055         while (inleft > 0) {
1056                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
1057                         break;
1058                 else if (c == 0)
1059                         break;
1060
1061                 *outptr++ = c;
1062                 inleft -= n;
1063                 inptr += n;
1064         }
1065
1066         *outptr = 0;
1067
1068         return outbuf;
1069 }
1070
1071 gchar *
1072 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1073 {
1074         char *inptr, *outbuf, *outptr;
1075         size_t outlen = 0;
1076         size_t inleft;
1077         gunichar c;
1078         int n;
1079
1080         g_return_val_if_fail (str != NULL, NULL);
1081
1082         if (len < 0) {
1083                 len = 0;
1084                 while (str[len])
1085                         len++;
1086         }
1087
1088         inptr = (char *) str;
1089         inleft = len * 2;
1090
1091         while (inleft > 0) {
1092                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1093                         if (n == -2 && inleft > 2) {
1094                                 /* This means that the first UTF-16 char was read, but second failed */
1095                                 inleft -= 2;
1096                                 inptr += 2;
1097                         }
1098
1099                         if (errno == EILSEQ) {
1100                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1101                                              "Illegal byte sequence encounted in the input.");
1102                         } else if (items_read) {
1103                                 /* partial input is ok if we can let our caller know... */
1104                                 break;
1105                         } else {
1106                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1107                                              "Partial byte sequence encountered in the input.");
1108                         }
1109
1110                         if (items_read)
1111                                 *items_read = (inptr - (char *) str) / 2;
1112
1113                         if (items_written)
1114                                 *items_written = 0;
1115
1116                         return NULL;
1117                 } else if (c == 0)
1118                         break;
1119
1120                 outlen += g_unichar_to_utf8 (c, NULL);
1121                 inleft -= n;
1122                 inptr += n;
1123         }
1124
1125         if (items_read)
1126                 *items_read = (inptr - (char *) str) / 2;
1127
1128         if (items_written)
1129                 *items_written = outlen;
1130
1131         outptr = outbuf = g_malloc (outlen + 1);
1132         inptr = (char *) str;
1133         inleft = len * 2;
1134
1135         while (inleft > 0) {
1136                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1137                         break;
1138                 else if (c == 0)
1139                         break;
1140
1141                 outptr += g_unichar_to_utf8 (c, outptr);
1142                 inleft -= n;
1143                 inptr += n;
1144         }
1145
1146         *outptr = '\0';
1147
1148         return outbuf;
1149 }
1150
1151 gunichar *
1152 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1153 {
1154         gunichar *outbuf, *outptr;
1155         size_t outlen = 0;
1156         size_t inleft;
1157         char *inptr;
1158         gunichar c;
1159         int n;
1160
1161         g_return_val_if_fail (str != NULL, NULL);
1162
1163         if (len < 0) {
1164                 len = 0;
1165                 while (str[len])
1166                         len++;
1167         }
1168
1169         inptr = (char *) str;
1170         inleft = len * 2;
1171
1172         while (inleft > 0) {
1173                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1174                         if (n == -2 && inleft > 2) {
1175                                 /* This means that the first UTF-16 char was read, but second failed */
1176                                 inleft -= 2;
1177                                 inptr += 2;
1178                         }
1179
1180                         if (errno == EILSEQ) {
1181                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1182                                              "Illegal byte sequence encounted in the input.");
1183                         } else if (items_read) {
1184                                 /* partial input is ok if we can let our caller know... */
1185                                 break;
1186                         } else {
1187                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1188                                              "Partial byte sequence encountered in the input.");
1189                         }
1190
1191                         if (items_read)
1192                                 *items_read = (inptr - (char *) str) / 2;
1193
1194                         if (items_written)
1195                                 *items_written = 0;
1196
1197                         return NULL;
1198                 } else if (c == 0)
1199                         break;
1200
1201                 outlen += 4;
1202                 inleft -= n;
1203                 inptr += n;
1204         }
1205
1206         if (items_read)
1207                 *items_read = (inptr - (char *) str) / 2;
1208
1209         if (items_written)
1210                 *items_written = outlen / 4;
1211
1212         outptr = outbuf = g_malloc (outlen + 4);
1213         inptr = (char *) str;
1214         inleft = len * 2;
1215
1216         while (inleft > 0) {
1217                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1218                         break;
1219                 else if (c == 0)
1220                         break;
1221
1222                 *outptr++ = c;
1223                 inleft -= n;
1224                 inptr += n;
1225         }
1226
1227         *outptr = 0;
1228
1229         return outbuf;
1230 }
1231
1232 gchar *
1233 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1234 {
1235         char *outbuf, *outptr;
1236         size_t outlen = 0;
1237         glong i;
1238         int n;
1239
1240         g_return_val_if_fail (str != NULL, NULL);
1241
1242         if (len < 0) {
1243                 for (i = 0; str[i] != 0; i++) {
1244                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1245                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1246                                              "Illegal byte sequence encounted in the input.");
1247
1248                                 if (items_written)
1249                                         *items_written = 0;
1250
1251                                 if (items_read)
1252                                         *items_read = i;
1253
1254                                 return NULL;
1255                         }
1256
1257                         outlen += n;
1258                 }
1259         } else {
1260                 for (i = 0; i < len && str[i] != 0; i++) {
1261                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1262                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1263                                              "Illegal byte sequence encounted in the input.");
1264
1265                                 if (items_written)
1266                                         *items_written = 0;
1267
1268                                 if (items_read)
1269                                         *items_read = i;
1270
1271                                 return NULL;
1272                         }
1273
1274                         outlen += n;
1275                 }
1276         }
1277
1278         len = i;
1279
1280         outptr = outbuf = g_malloc (outlen + 1);
1281         for (i = 0; i < len; i++)
1282                 outptr += g_unichar_to_utf8 (str[i], outptr);
1283         *outptr = 0;
1284
1285         if (items_written)
1286                 *items_written = outlen;
1287
1288         if (items_read)
1289                 *items_read = i;
1290
1291         return outbuf;
1292 }
1293
1294 gunichar2 *
1295 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1296 {
1297         gunichar2 *outbuf, *outptr;
1298         size_t outlen = 0;
1299         glong i;
1300         int n;
1301
1302         g_return_val_if_fail (str != NULL, NULL);
1303
1304         if (len < 0) {
1305                 for (i = 0; str[i] != 0; i++) {
1306                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1307                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1308                                              "Illegal byte sequence encounted in the input.");
1309
1310                                 if (items_written)
1311                                         *items_written = 0;
1312
1313                                 if (items_read)
1314                                         *items_read = i;
1315
1316                                 return NULL;
1317                         }
1318
1319                         outlen += n;
1320                 }
1321         } else {
1322                 for (i = 0; i < len && str[i] != 0; i++) {
1323                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1324                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1325                                              "Illegal byte sequence encounted in the input.");
1326
1327                                 if (items_written)
1328                                         *items_written = 0;
1329
1330                                 if (items_read)
1331                                         *items_read = i;
1332
1333                                 return NULL;
1334                         }
1335
1336                         outlen += n;
1337                 }
1338         }
1339
1340         len = i;
1341
1342         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
1343         for (i = 0; i < len; i++)
1344                 outptr += g_unichar_to_utf16 (str[i], outptr);
1345         *outptr = 0;
1346
1347         if (items_written)
1348                 *items_written = outlen;
1349
1350         if (items_read)
1351                 *items_read = i;
1352
1353         return outbuf;
1354 }