eglib/src/giconv.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Copyright (C) 2011 Jeffrey Stedfast
   4  *
   5  *  Permission is hereby granted, free of charge, to any person
   6  *  obtaining a copy of this software and associated documentation
   7  *  files (the "Software"), to deal in the Software without
   8  *  restriction, including without limitation the rights to use, copy,
   9  *  modify, merge, publish, distribute, sublicense, and/or sell copies
  10  *  of the Software, and to permit persons to whom the Software is
  11  *  furnished to do so, subject to the following conditions:
  12  *
  13  *  The above copyright notice and this permission notice shall be
  14  *  included in all copies or substantial portions of the Software.
  15  *
  16  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  20  *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  21  *  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23  *  DEALINGS IN THE SOFTWARE.
  24  */
  25
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include <glib.h>
  31 #include <string.h>
  32 #ifdef HAVE_ICONV_H
  33 #include <iconv.h>
  34 #endif
  35 #include <errno.h>
  36
  37 #ifdef _MSC_VER
  38 #define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE
  39 #else
  40 #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline))
  41 #endif
  42
  43
  44 #define UNROLL_DECODE_UTF8 0
  45 #define UNROLL_ENCODE_UTF8 0
  46
  47 typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
  48 typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
  49
  50 struct _GIConv {
  51         Decoder decode;
  52         Encoder encode;
  53         gunichar c;
  54 #ifdef HAVE_ICONV
  55         iconv_t cd;
  56 #endif
  57 };
  58
  59 static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
  60 static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);
  61
  62 static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
  63 static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);
  64
  65 static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
  66 static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);
  67
  68 static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
  69 static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);
  70
  71 static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
  72 static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);
  73
  74 static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
  75 static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
  76
  77 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
  78 #define decode_utf32 decode_utf32le
  79 #define encode_utf32 encode_utf32le
  80 #define decode_utf16 decode_utf16le
  81 #define encode_utf16 encode_utf16le
  82 #else
  83 #define decode_utf32 decode_utf32be
  84 #define encode_utf32 encode_utf32be
  85 #define decode_utf16 decode_utf16be
  86 #define encode_utf16 encode_utf16be
  87 #endif
  88
  89 static struct {
  90         const char *name;
  91         Decoder decoder;
  92         Encoder encoder;
  93 } charsets[] = {
  94         { "ISO-8859-1", decode_latin1,  encode_latin1  },
  95         { "ISO8859-1",  decode_latin1,  encode_latin1  },
  96         { "UTF-32BE",   decode_utf32be, encode_utf32be },
  97         { "UTF-32LE",   decode_utf32le, encode_utf32le },
  98         { "UTF-16BE",   decode_utf16be, encode_utf16be },
  99         { "UTF-16LE",   decode_utf16le, encode_utf16le },
 100         { "UTF-32",     decode_utf32,   encode_utf32   },
 101         { "UTF-16",     decode_utf16,   encode_utf16   },
 102         { "UTF-8",      decode_utf8,    encode_utf8    },
 103         { "US-ASCII",   decode_latin1,  encode_latin1  },
 104         { "Latin1",     decode_latin1,  encode_latin1  },
 105         { "ASCII",      decode_latin1,  encode_latin1  },
 106         { "UTF32",      decode_utf32,   encode_utf32   },
 107         { "UTF16",      decode_utf16,   encode_utf16   },
 108         { "UTF8",       decode_utf8,    encode_utf8    },
 109 };
 110
 111
 112 GIConv
 113 g_iconv_open (const char *to_charset, const char *from_charset)
 114 {
 115 #ifdef HAVE_ICONV
 116         iconv_t icd = (iconv_t) -1;
 117 #endif
 118         Decoder decoder = NULL;
 119         Encoder encoder = NULL;
 120         GIConv cd;
 121         guint i;
 122
 123         if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
 124                 errno = EINVAL;
 125
 126                 return (GIConv) -1;
 127         }
 128
 129         for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
 130                 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
 131                         decoder = charsets[i].decoder;
 132
 133                 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
 134                         encoder = charsets[i].encoder;
 135         }
 136
 137         if (!encoder || !decoder) {
 138 #ifdef HAVE_ICONV
 139                 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
 140                         return (GIConv) -1;
 141 #else
 142                 errno = EINVAL;
 143
 144                 return (GIConv) -1;
 145 #endif
 146         }
 147
 148         cd = (GIConv) g_malloc (sizeof (struct _GIConv));
 149         cd->decode = decoder;
 150         cd->encode = encoder;
 151         cd->c = -1;
 152
 153 #ifdef HAVE_ICONV
 154         cd->cd = icd;
 155 #endif
 156
 157         return cd;
 158 }
 159
 160 int
 161 g_iconv_close (GIConv cd)
 162 {
 163 #ifdef HAVE_ICONV
 164         if (cd->cd != (iconv_t) -1)
 165                 iconv_close (cd->cd);
 166 #endif
 167
 168         g_free (cd);
 169
 170         return 0;
 171 }
 172
 173 gsize
 174 g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
 175          gchar **outbytes, gsize *outbytesleft)
 176 {
 177         gsize inleft, outleft;
 178         char *inptr, *outptr;
 179         gunichar c;
 180         int rc = 0;
 181
 182 #ifdef HAVE_ICONV
 183         if (cd->cd != (iconv_t) -1) {
 184                 /* Note: gsize may have a different size than size_t, so we need to
 185                    remap inbytesleft and outbytesleft to size_t's. */
 186                 size_t *outleftptr, *inleftptr;
 187                 size_t n_outleft, n_inleft;
 188
 189                 if (inbytesleft) {
 190                         n_inleft = *inbytesleft;
 191                         inleftptr = &n_inleft;
 192                 } else {
 193                         inleftptr = NULL;
 194                 }
 195
 196                 if (outbytesleft) {
 197                         n_outleft = *outbytesleft;
 198                         outleftptr = &n_outleft;
 199                 } else {
 200                         outleftptr = NULL;
 201                 }
 202
 203                 return iconv (cd->cd, inbytes, inleftptr, outbytes, outleftptr);
 204         }
 205 #endif
 206
 207         if (outbytes == NULL || outbytesleft == NULL) {
 208                 /* reset converter */
 209                 cd->c = -1;
 210                 return 0;
 211         }
 212
 213         inleft = inbytesleft ? *inbytesleft : 0;
 214         inptr = inbytes ? *inbytes : NULL;
 215         outleft = *outbytesleft;
 216         outptr = *outbytes;
 217
 218         if ((c = cd->c) != (gunichar) -1)
 219                 goto encode;
 220
 221         while (inleft > 0) {
 222                 if ((rc = cd->decode (inptr, inleft, &c)) < 0)
 223                         break;
 224
 225                 inleft -= rc;
 226                 inptr += rc;
 227
 228         encode:
 229                 if ((rc = cd->encode (c, outptr, outleft)) < 0)
 230                         break;
 231
 232                 c = (gunichar) -1;
 233                 outleft -= rc;
 234                 outptr += rc;
 235         }
 236
 237         if (inbytesleft)
 238                 *inbytesleft = inleft;
 239
 240         if (inbytes)
 241                 *inbytes = inptr;
 242
 243         *outbytesleft = outleft;
 244         *outbytes = outptr;
 245         cd->c = c;
 246
 247         return rc < 0 ? -1 : 0;
 248 }
 249
 250 /*
 251  * Unicode encoders and decoders
 252  */
 253
 254 static int
 255 decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
 256 {
 257         unsigned char *inptr = (unsigned char *) inbuf;
 258         gunichar c;
 259
 260         if (inleft < 4) {
 261                 errno = EINVAL;
 262                 return -1;
 263         }
 264
 265         c = (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
 266
 267         if (c >= 0xd800 && c < 0xe000) {
 268                 errno = EILSEQ;
 269                 return -1;
 270         } else if (c >= 0x110000) {
 271                 errno = EILSEQ;
 272                 return -1;
 273         }
 274
 275         *outchar = c;
 276
 277         return 4;
 278 }
 279
 280 static int
 281 decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
 282 {
 283         unsigned char *inptr = (unsigned char *) inbuf;
 284         gunichar c;
 285
 286         if (inleft < 4) {
 287                 errno = EINVAL;
 288                 return -1;
 289         }
 290
 291         c = (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
 292
 293         if (c >= 0xd800 && c < 0xe000) {
 294                 errno = EILSEQ;
 295                 return -1;
 296         } else if (c >= 0x110000) {
 297                 errno = EILSEQ;
 298                 return -1;
 299         }
 300
 301         *outchar = c;
 302
 303         return 4;
 304 }
 305
 306 static int
 307 encode_utf32be (gunichar c, char *outbuf, size_t outleft)
 308 {
 309         unsigned char *outptr = (unsigned char *) outbuf;
 310
 311         if (outleft < 4) {
 312                 errno = E2BIG;
 313                 return -1;
 314         }
 315
 316         outptr[0] = (c >> 24) & 0xff;
 317         outptr[1] = (c >> 16) & 0xff;
 318         outptr[2] = (c >> 8) & 0xff;
 319         outptr[3] = c & 0xff;
 320
 321         return 4;
 322 }
 323
 324 static int
 325 encode_utf32le (gunichar c, char *outbuf, size_t outleft)
 326 {
 327         unsigned char *outptr = (unsigned char *) outbuf;
 328
 329         if (outleft < 4) {
 330                 errno = E2BIG;
 331                 return -1;
 332         }
 333
 334         outptr[0] = c & 0xff;
 335         outptr[1] = (c >> 8) & 0xff;
 336         outptr[2] = (c >> 16) & 0xff;
 337         outptr[3] = (c >> 24) & 0xff;
 338
 339         return 4;
 340 }
 341
 342 static int
 343 decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
 344 {
 345         unsigned char *inptr = (unsigned char *) inbuf;
 346         gunichar2 c;
 347         gunichar u;
 348
 349         if (inleft < 2) {
 350                 errno = EINVAL;
 351                 return -1;
 352         }
 353
 354         u = (inptr[0] << 8) | inptr[1];
 355
 356         if (u < 0xd800) {
 357                 /* 0x0000 -> 0xd7ff */
 358                 *outchar = u;
 359                 return 2;
 360         } else if (u < 0xdc00) {
 361                 /* 0xd800 -> 0xdbff */
 362                 if (inleft < 4) {
 363                         errno = EINVAL;
 364                         return -2;
 365                 }
 366
 367                 c = (inptr[2] << 8) | inptr[3];
 368
 369                 if (c < 0xdc00 || c > 0xdfff) {
 370                         errno = EILSEQ;
 371                         return -2;
 372                 }
 373
 374                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 375                 *outchar = u;
 376
 377                 return 4;
 378         } else if (u < 0xe000) {
 379                 /* 0xdc00 -> 0xdfff */
 380                 errno = EILSEQ;
 381                 return -1;
 382         } else {
 383                 /* 0xe000 -> 0xffff */
 384                 *outchar = u;
 385                 return 2;
 386         }
 387 }
 388
 389 static int
 390 decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
 391 {
 392         unsigned char *inptr = (unsigned char *) inbuf;
 393         gunichar2 c;
 394         gunichar u;
 395
 396         if (inleft < 2) {
 397                 errno = EINVAL;
 398                 return -1;
 399         }
 400
 401         u = (inptr[1] << 8) | inptr[0];
 402
 403         if (u < 0xd800) {
 404                 /* 0x0000 -> 0xd7ff */
 405                 *outchar = u;
 406                 return 2;
 407         } else if (u < 0xdc00) {
 408                 /* 0xd800 -> 0xdbff */
 409                 if (inleft < 4) {
 410                         errno = EINVAL;
 411                         return -2;
 412                 }
 413
 414                 c = (inptr[3] << 8) | inptr[2];
 415
 416                 if (c < 0xdc00 || c > 0xdfff) {
 417                         errno = EILSEQ;
 418                         return -2;
 419                 }
 420
 421                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 422                 *outchar = u;
 423
 424                 return 4;
 425         } else if (u < 0xe000) {
 426                 /* 0xdc00 -> 0xdfff */
 427                 errno = EILSEQ;
 428                 return -1;
 429         } else {
 430                 /* 0xe000 -> 0xffff */
 431                 *outchar = u;
 432                 return 2;
 433         }
 434 }
 435
 436 static int
 437 encode_utf16be (gunichar c, char *outbuf, size_t outleft)
 438 {
 439         unsigned char *outptr = (unsigned char *) outbuf;
 440         gunichar2 ch;
 441         gunichar c2;
 442
 443         if (c < 0x10000) {
 444                 if (outleft < 2) {
 445                         errno = E2BIG;
 446                         return -1;
 447                 }
 448
 449                 outptr[0] = (c >> 8) & 0xff;
 450                 outptr[1] = c & 0xff;
 451
 452                 return 2;
 453         } else {
 454                 if (outleft < 4) {
 455                         errno = E2BIG;
 456                         return -1;
 457                 }
 458
 459                 c2 = c - 0x10000;
 460
 461                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 462                 outptr[0] = (ch >> 8) & 0xff;
 463                 outptr[1] = ch & 0xff;
 464
 465                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 466                 outptr[2] = (ch >> 8) & 0xff;
 467                 outptr[3] = ch & 0xff;
 468
 469                 return 4;
 470         }
 471 }
 472
 473 static int
 474 encode_utf16le (gunichar c, char *outbuf, size_t outleft)
 475 {
 476         unsigned char *outptr = (unsigned char *) outbuf;
 477         gunichar2 ch;
 478         gunichar c2;
 479
 480         if (c < 0x10000) {
 481                 if (outleft < 2) {
 482                         errno = E2BIG;
 483                         return -1;
 484                 }
 485
 486                 outptr[0] = c & 0xff;
 487                 outptr[1] = (c >> 8) & 0xff;
 488
 489                 return 2;
 490         } else {
 491                 if (outleft < 4) {
 492                         errno = E2BIG;
 493                         return -1;
 494                 }
 495
 496                 c2 = c - 0x10000;
 497
 498                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 499                 outptr[0] = ch & 0xff;
 500                 outptr[1] = (ch >> 8) & 0xff;
 501
 502                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 503                 outptr[2] = ch & 0xff;
 504                 outptr[3] = (ch >> 8) & 0xff;
 505
 506                 return 4;
 507         }
 508 }
 509
 510 static FORCE_INLINE (int)
 511 decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
 512 {
 513         unsigned char *inptr = (unsigned char *) inbuf;
 514         gunichar u;
 515         int n, i;
 516
 517         u = *inptr;
 518
 519         if (u < 0x80) {
 520                 /* simple ascii case */
 521                 *outchar = u;
 522                 return 1;
 523         } else if (u < 0xc2) {
 524                 errno = EILSEQ;
 525                 return -1;
 526         } else if (u < 0xe0) {
 527                 u &= 0x1f;
 528                 n = 2;
 529         } else if (u < 0xf0) {
 530                 u &= 0x0f;
 531                 n = 3;
 532         } else if (u < 0xf8) {
 533                 u &= 0x07;
 534                 n = 4;
 535         } else if (u < 0xfc) {
 536                 u &= 0x03;
 537                 n = 5;
 538         } else if (u < 0xfe) {
 539                 u &= 0x01;
 540                 n = 6;
 541         } else {
 542                 errno = EILSEQ;
 543                 return -1;
 544         }
 545
 546         if (n > inleft) {
 547                 errno = EINVAL;
 548                 return -1;
 549         }
 550
 551 #if UNROLL_DECODE_UTF8
 552         switch (n) {
 553         case 6: u = (u << 6) | (*++inptr ^ 0x80);
 554         case 5: u = (u << 6) | (*++inptr ^ 0x80);
 555         case 4: u = (u << 6) | (*++inptr ^ 0x80);
 556         case 3: u = (u << 6) | (*++inptr ^ 0x80);
 557         case 2: u = (u << 6) | (*++inptr ^ 0x80);
 558         }
 559 #else
 560         for (i = 1; i < n; i++)
 561                 u = (u << 6) | (*++inptr ^ 0x80);
 562 #endif
 563
 564         *outchar = u;
 565
 566         return n;
 567 }
 568
 569 static int
 570 encode_utf8 (gunichar c, char *outbuf, size_t outleft)
 571 {
 572         unsigned char *outptr = (unsigned char *) outbuf;
 573         int base, n, i;
 574
 575         if (c < 0x80) {
 576                 outptr[0] = c;
 577                 return 1;
 578         } else if (c < 0x800) {
 579                 base = 192;
 580                 n = 2;
 581         } else if (c < 0x10000) {
 582                 base = 224;
 583                 n = 3;
 584         } else if (c < 0x200000) {
 585                 base = 240;
 586                 n = 4;
 587         } else if (c < 0x4000000) {
 588                 base = 248;
 589                 n = 5;
 590         } else {
 591                 base = 252;
 592                 n = 6;
 593         }
 594
 595         if (outleft < n) {
 596                 errno = E2BIG;
 597                 return -1;
 598         }
 599
 600 #if UNROLL_ENCODE_UTF8
 601         switch (n) {
 602         case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
 603         case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
 604         case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
 605         case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
 606         case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
 607         case 1: outptr[0] = c | base;
 608         }
 609 #else
 610         for (i = n - 1; i > 0; i--) {
 611                 outptr[i] = (c & 0x3f) | 0x80;
 612                 c >>= 6;
 613         }
 614
 615         outptr[0] = c | base;
 616 #endif
 617
 618         return n;
 619 }
 620
 621 static int
 622 decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
 623 {
 624         *outchar = (unsigned char) *inbuf;
 625         return 1;
 626 }
 627
 628 static int
 629 encode_latin1 (gunichar c, char *outbuf, size_t outleft)
 630 {
 631         if (outleft < 1) {
 632                 errno = E2BIG;
 633                 return -1;
 634         }
 635
 636         if (c > 0xff) {
 637                 errno = EILSEQ;
 638                 return -1;
 639         }
 640
 641         *outbuf = (char) c;
 642
 643         return 1;
 644 }
 645
 646
 647 /*
 648  * Simple conversion API
 649  */
 650
 651 static gpointer error_quark = "ConvertError";
 652
 653 gpointer
 654 g_convert_error_quark (void)
 655 {
 656         return error_quark;
 657 }
 658
 659 gchar *
 660 g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
 661            gsize *bytes_read, gsize *bytes_written, GError **err)
 662 {
 663         gsize outsize, outused, outleft, inleft, grow, rc;
 664         char *result, *outbuf, *inbuf;
 665         gboolean flush = FALSE;
 666         gboolean done = FALSE;
 667         GIConv cd;
 668
 669         g_return_val_if_fail (str != NULL, NULL);
 670         g_return_val_if_fail (to_charset != NULL, NULL);
 671         g_return_val_if_fail (from_charset != NULL, NULL);
 672
 673         if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
 674                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 675                              "Conversion from %s to %s not supported.",
 676                              from_charset, to_charset);
 677
 678                 if (bytes_written)
 679                         *bytes_written = 0;
 680
 681                 if (bytes_read)
 682                         *bytes_read = 0;
 683
 684                 return NULL;
 685         }
 686
 687         inleft = len < 0 ? strlen (str) : len;
 688         inbuf = (char *) str;
 689
 690         outleft = outsize = MAX (inleft, 8);
 691         outbuf = result = g_malloc (outsize + 4);
 692
 693         do {
 694                 if (!flush)
 695                         rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
 696                 else
 697                         rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);
 698
 699                 if (rc == (gsize) -1) {
 700                         switch (errno) {
 701                         case E2BIG:
 702                                 /* grow our result buffer */
 703                                 grow = MAX (inleft, 8) << 1;
 704                                 outused = outbuf - result;
 705                                 outsize += grow;
 706                                 outleft += grow;
 707                                 result = g_realloc (result, outsize + 4);
 708                                 outbuf = result + outused;
 709                                 break;
 710                         case EINVAL:
 711                                 /* incomplete input, stop converting and terminate here */
 712                                 if (flush)
 713                                         done = TRUE;
 714                                 else
 715                                         flush = TRUE;
 716                                 break;
 717                         case EILSEQ:
 718                                 /* illegal sequence in the input */
 719                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));
 720
 721                                 if (bytes_read) {
 722                                         /* save offset of the illegal input sequence */
 723                                         *bytes_read = (inbuf - str);
 724                                 }
 725
 726                                 if (bytes_written)
 727                                         *bytes_written = 0;
 728
 729                                 g_iconv_close (cd);
 730                                 g_free (result);
 731                                 return NULL;
 732                         default:
 733                                 /* unknown errno */
 734                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));
 735
 736                                 if (bytes_written)
 737                                         *bytes_written = 0;
 738
 739                                 if (bytes_read)
 740                                         *bytes_read = 0;
 741
 742                                 g_iconv_close (cd);
 743                                 g_free (result);
 744                                 return NULL;
 745                         }
 746                 } else if (flush) {
 747                         /* input has been converted and output has been flushed */
 748                         break;
 749                 } else {
 750                         /* input has been converted, need to flush the output */
 751                         flush = TRUE;
 752                 }
 753         } while (!done);
 754
 755         g_iconv_close (cd);
 756
 757         /* Note: not all charsets can be null-terminated with a single
 758            null byte. UCS2, for example, needs 2 null bytes and UCS4
 759            needs 4. I hope that 4 null bytes is enough to terminate all
 760            multibyte charsets? */
 761
 762         /* null-terminate the result */
 763         memset (outbuf, 0, 4);
 764
 765         if (bytes_written)
 766                 *bytes_written = outbuf - result;
 767
 768         if (bytes_read)
 769                 *bytes_read = inbuf - str;
 770
 771         return result;
 772 }
 773
 774
 775 /*
 776  * Unicode conversion
 777  */
 778
 779 /**
 780  * from http://home.tiscali.nl/t876506/utf8tbl.html
 781  *
 782  * From Unicode UCS-4 to UTF-8:
 783  * Start with the Unicode number expressed as a decimal number and call this ud.
 784  *
 785  * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
 786  *
 787  * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
 788  *    byte 1 = 192 + (ud div 64)
 789  *    byte 2 = 128 + (ud mod 64)
 790  *
 791  * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
 792  *    byte 1 = 224 + (ud div 4096)
 793  *    byte 2 = 128 + ((ud div 64) mod 64)
 794  *    byte 3 = 128 + (ud mod 64)
 795  *
 796  * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
 797  *    byte 1 = 240 + (ud div 262144)
 798  *    byte 2 = 128 + ((ud div 4096) mod 64)
 799  *    byte 3 = 128 + ((ud div 64) mod 64)
 800  *    byte 4 = 128 + (ud mod 64)
 801  *
 802  * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
 803  *    byte 1 = 248 + (ud div 16777216)
 804  *    byte 2 = 128 + ((ud div 262144) mod 64)
 805  *    byte 3 = 128 + ((ud div 4096) mod 64)
 806  *    byte 4 = 128 + ((ud div 64) mod 64)
 807  *    byte 5 = 128 + (ud mod 64)
 808  *
 809  * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
 810  *    byte 1 = 252 + (ud div 1073741824)
 811  *    byte 2 = 128 + ((ud div 16777216) mod 64)
 812  *    byte 3 = 128 + ((ud div 262144) mod 64)
 813  *    byte 4 = 128 + ((ud div 4096) mod 64)
 814  *    byte 5 = 128 + ((ud div 64) mod 64)
 815  *    byte 6 = 128 + (ud mod 64)
 816  **/
 817 gint
 818 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 819 {
 820         int base, n, i;
 821
 822         if (c < 0x80) {
 823                 base = 0;
 824                 n = 1;
 825         } else if (c < 0x800) {
 826                 base = 192;
 827                 n = 2;
 828         } else if (c < 0x10000) {
 829                 base = 224;
 830                 n = 3;
 831         } else if (c < 0x200000) {
 832                 base = 240;
 833                 n = 4;
 834         } else if (c < 0x4000000) {
 835                 base = 248;
 836                 n = 5;
 837         } else if (c < 0x80000000) {
 838                 base = 252;
 839                 n = 6;
 840         } else {
 841                 return -1;
 842         }
 843
 844         if (outbuf != NULL) {
 845                 for (i = n - 1; i > 0; i--) {
 846                         /* mask off 6 bits worth and add 128 */
 847                         outbuf[i] = (c & 0x3f) | 0x80;
 848                         c >>= 6;
 849                 }
 850
 851                 /* first character has a different base */
 852                 outbuf[0] = c | base;
 853         }
 854
 855         return n;
 856 }
 857
 858 static FORCE_INLINE (int)
 859 g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
 860 {
 861         gunichar c2;
 862
 863         if (c < 0xd800) {
 864                 if (outbuf)
 865                         *outbuf = (gunichar2) c;
 866
 867                 return 1;
 868         } else if (c < 0xe000) {
 869                 return -1;
 870         } else if (c < 0x10000) {
 871                 if (outbuf)
 872                         *outbuf = (gunichar2) c;
 873
 874                 return 1;
 875         } else if (c < 0x110000) {
 876                 if (outbuf) {
 877                         c2 = c - 0x10000;
 878
 879                         outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
 880                         outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 881                 }
 882
 883                 return 2;
 884         } else {
 885                 return -1;
 886         }
 887 }
 888
 889 gunichar *
 890 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
 891 {
 892         gunichar *outbuf, *outptr;
 893         char *inptr;
 894         glong n, i;
 895
 896         g_return_val_if_fail (str != NULL, NULL);
 897
 898         n = g_utf8_strlen (str, len);
 899
 900         if (items_written)
 901                 *items_written = n;
 902
 903         outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
 904         inptr = (char *) str;
 905
 906         for (i = 0; i < n; i++) {
 907                 *outptr++ = g_utf8_get_char (inptr);
 908                 inptr = g_utf8_next_char (inptr);
 909         }
 910
 911         *outptr = 0;
 912
 913         return outbuf;
 914 }
 915
 916 static gunichar2 *
 917 eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, GError **err)
 918 {
 919         gunichar2 *outbuf, *outptr;
 920         size_t outlen = 0;
 921         size_t inleft;
 922         char *inptr;
 923         gunichar c;
 924         int u, n;
 925
 926         g_return_val_if_fail (str != NULL, NULL);
 927
 928         if (len < 0) {
 929                 if (include_nuls) {
 930                         g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
 931                         return NULL;
 932                 }
 933
 934                 len = strlen (str);
 935         }
 936
 937         inptr = (char *) str;
 938         inleft = len;
 939
 940         while (inleft > 0) {
 941                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 942                         goto error;
 943
 944                 if (c == 0 && !include_nuls)
 945                         break;
 946
 947                 if ((u = g_unichar_to_utf16 (c, NULL)) < 0) {
 948                         errno = EILSEQ;
 949                         goto error;
 950                 }
 951
 952                 outlen += u;
 953                 inleft -= n;
 954                 inptr += n;
 955         }
 956
 957         if (items_read)
 958                 *items_read = inptr - str;
 959
 960         if (items_written)
 961                 *items_written = outlen;
 962
 963         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
 964         inptr = (char *) str;
 965         inleft = len;
 966
 967         while (inleft > 0) {
 968                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 969                         break;
 970
 971                 if (c == 0 && !include_nuls)
 972                         break;
 973
 974                 outptr += g_unichar_to_utf16 (c, outptr);
 975                 inleft -= n;
 976                 inptr += n;
 977         }
 978
 979         *outptr = '\0';
 980
 981         return outbuf;
 982
 983  error:
 984         if (errno == EILSEQ) {
 985                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 986                              "Illegal byte sequence encounted in the input.");
 987         } else if (items_read) {
 988                 /* partial input is ok if we can let our caller know... */
 989                 break;
 990         } else {
 991                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 992                              "Partial byte sequence encountered in the input.");
 993         }
 994
 995         if (items_read)
 996                 *items_read = inptr - str;
 997
 998         if (items_written)
 999                 *items_written = 0;
1000
1001         return NULL;
1002 }
1003
1004 gunichar2 *
1005 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
1006 {
1007         return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, err);
1008 }
1009
1010 gunichar2 *
1011 eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
1012 {
1013         return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, err);
1014 }
1015
1016 gunichar *
1017 g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
1018 {
1019         gunichar *outbuf, *outptr;
1020         size_t outlen = 0;
1021         size_t inleft;
1022         char *inptr;
1023         gunichar c;
1024         int n;
1025
1026         g_return_val_if_fail (str != NULL, NULL);
1027
1028         if (len < 0)
1029                 len = strlen (str);
1030
1031         inptr = (char *) str;
1032         inleft = len;
1033
1034         while (inleft > 0) {
1035                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
1036                         if (errno == EILSEQ) {
1037                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1038                                              "Illegal byte sequence encounted in the input.");
1039                         } else if (items_read) {
1040                                 /* partial input is ok if we can let our caller know... */
1041                                 break;
1042                         } else {
1043                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1044                                              "Partial byte sequence encountered in the input.");
1045                         }
1046
1047                         if (items_read)
1048                                 *items_read = inptr - str;
1049
1050                         if (items_written)
1051                                 *items_written = 0;
1052
1053                         return NULL;
1054                 } else if (c == 0)
1055                         break;
1056
1057                 outlen += 4;
1058                 inleft -= n;
1059                 inptr += n;
1060         }
1061
1062         if (items_written)
1063                 *items_written = outlen / 4;
1064
1065         if (items_read)
1066                 *items_read = inptr - str;
1067
1068         outptr = outbuf = g_malloc (outlen + 4);
1069         inptr = (char *) str;
1070         inleft = len;
1071
1072         while (inleft > 0) {
1073                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
1074                         break;
1075                 else if (c == 0)
1076                         break;
1077
1078                 *outptr++ = c;
1079                 inleft -= n;
1080                 inptr += n;
1081         }
1082
1083         *outptr = 0;
1084
1085         return outbuf;
1086 }
1087
1088 gchar *
1089 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1090 {
1091         char *inptr, *outbuf, *outptr;
1092         size_t outlen = 0;
1093         size_t inleft;
1094         gunichar c;
1095         int n;
1096
1097         g_return_val_if_fail (str != NULL, NULL);
1098
1099         if (len < 0) {
1100                 len = 0;
1101                 while (str[len])
1102                         len++;
1103         }
1104
1105         inptr = (char *) str;
1106         inleft = len * 2;
1107
1108         while (inleft > 0) {
1109                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1110                         if (n == -2 && inleft > 2) {
1111                                 /* This means that the first UTF-16 char was read, but second failed */
1112                                 inleft -= 2;
1113                                 inptr += 2;
1114                         }
1115
1116                         if (errno == EILSEQ) {
1117                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1118                                              "Illegal byte sequence encounted in the input.");
1119                         } else if (items_read) {
1120                                 /* partial input is ok if we can let our caller know... */
1121                                 break;
1122                         } else {
1123                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1124                                              "Partial byte sequence encountered in the input.");
1125                         }
1126
1127                         if (items_read)
1128                                 *items_read = (inptr - (char *) str) / 2;
1129
1130                         if (items_written)
1131                                 *items_written = 0;
1132
1133                         return NULL;
1134                 } else if (c == 0)
1135                         break;
1136
1137                 outlen += g_unichar_to_utf8 (c, NULL);
1138                 inleft -= n;
1139                 inptr += n;
1140         }
1141
1142         if (items_read)
1143                 *items_read = (inptr - (char *) str) / 2;
1144
1145         if (items_written)
1146                 *items_written = outlen;
1147
1148         outptr = outbuf = g_malloc (outlen + 1);
1149         inptr = (char *) str;
1150         inleft = len * 2;
1151
1152         while (inleft > 0) {
1153                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1154                         break;
1155                 else if (c == 0)
1156                         break;
1157
1158                 outptr += g_unichar_to_utf8 (c, outptr);
1159                 inleft -= n;
1160                 inptr += n;
1161         }
1162
1163         *outptr = '\0';
1164
1165         return outbuf;
1166 }
1167
1168 gunichar *
1169 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1170 {
1171         gunichar *outbuf, *outptr;
1172         size_t outlen = 0;
1173         size_t inleft;
1174         char *inptr;
1175         gunichar c;
1176         int n;
1177
1178         g_return_val_if_fail (str != NULL, NULL);
1179
1180         if (len < 0) {
1181                 len = 0;
1182                 while (str[len])
1183                         len++;
1184         }
1185
1186         inptr = (char *) str;
1187         inleft = len * 2;
1188
1189         while (inleft > 0) {
1190                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1191                         if (n == -2 && inleft > 2) {
1192                                 /* This means that the first UTF-16 char was read, but second failed */
1193                                 inleft -= 2;
1194                                 inptr += 2;
1195                         }
1196
1197                         if (errno == EILSEQ) {
1198                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1199                                              "Illegal byte sequence encounted in the input.");
1200                         } else if (items_read) {
1201                                 /* partial input is ok if we can let our caller know... */
1202                                 break;
1203                         } else {
1204                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1205                                              "Partial byte sequence encountered in the input.");
1206                         }
1207
1208                         if (items_read)
1209                                 *items_read = (inptr - (char *) str) / 2;
1210
1211                         if (items_written)
1212                                 *items_written = 0;
1213
1214                         return NULL;
1215                 } else if (c == 0)
1216                         break;
1217
1218                 outlen += 4;
1219                 inleft -= n;
1220                 inptr += n;
1221         }
1222
1223         if (items_read)
1224                 *items_read = (inptr - (char *) str) / 2;
1225
1226         if (items_written)
1227                 *items_written = outlen / 4;
1228
1229         outptr = outbuf = g_malloc (outlen + 4);
1230         inptr = (char *) str;
1231         inleft = len * 2;
1232
1233         while (inleft > 0) {
1234                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1235                         break;
1236                 else if (c == 0)
1237                         break;
1238
1239                 *outptr++ = c;
1240                 inleft -= n;
1241                 inptr += n;
1242         }
1243
1244         *outptr = 0;
1245
1246         return outbuf;
1247 }
1248
1249 gchar *
1250 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1251 {
1252         char *outbuf, *outptr;
1253         size_t outlen = 0;
1254         glong i;
1255         int n;
1256
1257         g_return_val_if_fail (str != NULL, NULL);
1258
1259         if (len < 0) {
1260                 for (i = 0; str[i] != 0; i++) {
1261                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1262                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1263                                              "Illegal byte sequence encounted in the input.");
1264
1265                                 if (items_written)
1266                                         *items_written = 0;
1267
1268                                 if (items_read)
1269                                         *items_read = i;
1270
1271                                 return NULL;
1272                         }
1273
1274                         outlen += n;
1275                 }
1276         } else {
1277                 for (i = 0; i < len && str[i] != 0; i++) {
1278                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1279                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1280                                              "Illegal byte sequence encounted in the input.");
1281
1282                                 if (items_written)
1283                                         *items_written = 0;
1284
1285                                 if (items_read)
1286                                         *items_read = i;
1287
1288                                 return NULL;
1289                         }
1290
1291                         outlen += n;
1292                 }
1293         }
1294
1295         len = i;
1296
1297         outptr = outbuf = g_malloc (outlen + 1);
1298         for (i = 0; i < len; i++)
1299                 outptr += g_unichar_to_utf8 (str[i], outptr);
1300         *outptr = 0;
1301
1302         if (items_written)
1303                 *items_written = outlen;
1304
1305         if (items_read)
1306                 *items_read = i;
1307
1308         return outbuf;
1309 }
1310
1311 gunichar2 *
1312 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1313 {
1314         gunichar2 *outbuf, *outptr;
1315         size_t outlen = 0;
1316         glong i;
1317         int n;
1318
1319         g_return_val_if_fail (str != NULL, NULL);
1320
1321         if (len < 0) {
1322                 for (i = 0; str[i] != 0; i++) {
1323                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1324                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1325                                              "Illegal byte sequence encounted in the input.");
1326
1327                                 if (items_written)
1328                                         *items_written = 0;
1329
1330                                 if (items_read)
1331                                         *items_read = i;
1332
1333                                 return NULL;
1334                         }
1335
1336                         outlen += n;
1337                 }
1338         } else {
1339                 for (i = 0; i < len && str[i] != 0; i++) {
1340                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1341                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1342                                              "Illegal byte sequence encounted in the input.");
1343
1344                                 if (items_written)
1345                                         *items_written = 0;
1346
1347                                 if (items_read)
1348                                         *items_read = i;
1349
1350                                 return NULL;
1351                         }
1352
1353                         outlen += n;
1354                 }
1355         }
1356
1357         len = i;
1358
1359         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
1360         for (i = 0; i < len; i++)
1361                 outptr += g_unichar_to_utf16 (str[i], outptr);
1362         *outptr = 0;
1363
1364         if (items_written)
1365                 *items_written = outlen;
1366
1367         if (items_read)
1368                 *items_read = i;
1369
1370         return outbuf;
1371 }