mono/eglib/giconv.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Copyright (C) 2011 Jeffrey Stedfast
   4  *
   5  *  Permission is hereby granted, free of charge, to any person
   6  *  obtaining a copy of this software and associated documentation
   7  *  files (the "Software"), to deal in the Software without
   8  *  restriction, including without limitation the rights to use, copy,
   9  *  modify, merge, publish, distribute, sublicense, and/or sell copies
  10  *  of the Software, and to permit persons to whom the Software is
  11  *  furnished to do so, subject to the following conditions:
  12  *
  13  *  The above copyright notice and this permission notice shall be
  14  *  included in all copies or substantial portions of the Software.
  15  *
  16  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  20  *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  21  *  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23  *  DEALINGS IN THE SOFTWARE.
  24  */
  25
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include <glib.h>
  31 #include <string.h>
  32 #ifdef HAVE_ICONV_H
  33 #include <iconv.h>
  34 #endif
  35 #include <errno.h>
  36
  37 #ifdef _MSC_VER
  38 #define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE
  39 #else
  40 #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline))
  41 #endif
  42
  43
  44 #define UNROLL_DECODE_UTF8 0
  45 #define UNROLL_ENCODE_UTF8 0
  46
  47 typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
  48 typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
  49
  50 struct _GIConv {
  51         Decoder decode;
  52         Encoder encode;
  53         gunichar c;
  54 #ifdef HAVE_ICONV
  55         iconv_t cd;
  56 #endif
  57 };
  58
  59 static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
  60 static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);
  61
  62 static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
  63 static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);
  64
  65 static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
  66 static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);
  67
  68 static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
  69 static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);
  70
  71 static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
  72 static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);
  73
  74 static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
  75 static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
  76
  77 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
  78 #define decode_utf32 decode_utf32le
  79 #define encode_utf32 encode_utf32le
  80 #define decode_utf16 decode_utf16le
  81 #define encode_utf16 encode_utf16le
  82 #else
  83 #define decode_utf32 decode_utf32be
  84 #define encode_utf32 encode_utf32be
  85 #define decode_utf16 decode_utf16be
  86 #define encode_utf16 encode_utf16be
  87 #endif
  88
  89 static struct {
  90         const char *name;
  91         Decoder decoder;
  92         Encoder encoder;
  93 } charsets[] = {
  94         { "ISO-8859-1", decode_latin1,  encode_latin1  },
  95         { "ISO8859-1",  decode_latin1,  encode_latin1  },
  96         { "UTF-32BE",   decode_utf32be, encode_utf32be },
  97         { "UTF-32LE",   decode_utf32le, encode_utf32le },
  98         { "UTF-16BE",   decode_utf16be, encode_utf16be },
  99         { "UTF-16LE",   decode_utf16le, encode_utf16le },
 100         { "UTF-32",     decode_utf32,   encode_utf32   },
 101         { "UTF-16",     decode_utf16,   encode_utf16   },
 102         { "UTF-8",      decode_utf8,    encode_utf8    },
 103         { "US-ASCII",   decode_latin1,  encode_latin1  },
 104         { "Latin1",     decode_latin1,  encode_latin1  },
 105         { "ASCII",      decode_latin1,  encode_latin1  },
 106         { "UTF32",      decode_utf32,   encode_utf32   },
 107         { "UTF16",      decode_utf16,   encode_utf16   },
 108         { "UTF8",       decode_utf8,    encode_utf8    },
 109 };
 110
 111
 112 GIConv
 113 g_iconv_open (const char *to_charset, const char *from_charset)
 114 {
 115 #ifdef HAVE_ICONV
 116         iconv_t icd = (iconv_t) -1;
 117 #endif
 118         Decoder decoder = NULL;
 119         Encoder encoder = NULL;
 120         GIConv cd;
 121         guint i;
 122
 123         if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
 124                 errno = EINVAL;
 125
 126                 return (GIConv) -1;
 127         }
 128
 129         for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
 130                 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
 131                         decoder = charsets[i].decoder;
 132
 133                 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
 134                         encoder = charsets[i].encoder;
 135         }
 136
 137         if (!encoder || !decoder) {
 138 #ifdef HAVE_ICONV
 139                 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
 140                         return (GIConv) -1;
 141 #else
 142                 errno = EINVAL;
 143
 144                 return (GIConv) -1;
 145 #endif
 146         }
 147
 148         cd = (GIConv) g_malloc (sizeof (struct _GIConv));
 149         cd->decode = decoder;
 150         cd->encode = encoder;
 151         cd->c = -1;
 152
 153 #ifdef HAVE_ICONV
 154         cd->cd = icd;
 155 #endif
 156
 157         return cd;
 158 }
 159
 160 int
 161 g_iconv_close (GIConv cd)
 162 {
 163 #ifdef HAVE_ICONV
 164         if (cd->cd != (iconv_t) -1)
 165                 iconv_close (cd->cd);
 166 #endif
 167
 168         g_free (cd);
 169
 170         return 0;
 171 }
 172
 173 gsize
 174 g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
 175          gchar **outbytes, gsize *outbytesleft)
 176 {
 177         gsize inleft, outleft;
 178         char *inptr, *outptr;
 179         gunichar c;
 180         int rc = 0;
 181
 182 #ifdef HAVE_ICONV
 183         if (cd->cd != (iconv_t) -1) {
 184                 /* Note: gsize may have a different size than size_t, so we need to
 185                    remap inbytesleft and outbytesleft to size_t's. */
 186                 size_t *outleftptr, *inleftptr;
 187                 size_t n_outleft, n_inleft;
 188
 189                 if (inbytesleft) {
 190                         n_inleft = *inbytesleft;
 191                         inleftptr = &n_inleft;
 192                 } else {
 193                         inleftptr = NULL;
 194                 }
 195
 196                 if (outbytesleft) {
 197                         n_outleft = *outbytesleft;
 198                         outleftptr = &n_outleft;
 199                 } else {
 200                         outleftptr = NULL;
 201                 }
 202 #if defined(__NetBSD__)
 203                 return iconv (cd->cd, (const gchar **)inbytes, inleftptr, outbytes, outleftptr);
 204 #else
 205                 return iconv (cd->cd, inbytes, inleftptr, outbytes, outleftptr);
 206 #endif
 207         }
 208 #endif
 209
 210         if (outbytes == NULL || outbytesleft == NULL) {
 211                 /* reset converter */
 212                 cd->c = -1;
 213                 return 0;
 214         }
 215
 216         inleft = inbytesleft ? *inbytesleft : 0;
 217         inptr = inbytes ? *inbytes : NULL;
 218         outleft = *outbytesleft;
 219         outptr = *outbytes;
 220
 221         if ((c = cd->c) != (gunichar) -1)
 222                 goto encode;
 223
 224         while (inleft > 0) {
 225                 if ((rc = cd->decode (inptr, inleft, &c)) < 0)
 226                         break;
 227
 228                 inleft -= rc;
 229                 inptr += rc;
 230
 231         encode:
 232                 if ((rc = cd->encode (c, outptr, outleft)) < 0)
 233                         break;
 234
 235                 c = (gunichar) -1;
 236                 outleft -= rc;
 237                 outptr += rc;
 238         }
 239
 240         if (inbytesleft)
 241                 *inbytesleft = inleft;
 242
 243         if (inbytes)
 244                 *inbytes = inptr;
 245
 246         *outbytesleft = outleft;
 247         *outbytes = outptr;
 248         cd->c = c;
 249
 250         return rc < 0 ? -1 : 0;
 251 }
 252
 253 /*
 254  * Unicode encoders and decoders
 255  */
 256
 257 static int
 258 decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
 259 {
 260         unsigned char *inptr = (unsigned char *) inbuf;
 261         gunichar c;
 262
 263         if (inleft < 4) {
 264                 errno = EINVAL;
 265                 return -1;
 266         }
 267
 268         c = (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
 269
 270         if (c >= 0xd800 && c < 0xe000) {
 271                 errno = EILSEQ;
 272                 return -1;
 273         } else if (c >= 0x110000) {
 274                 errno = EILSEQ;
 275                 return -1;
 276         }
 277
 278         *outchar = c;
 279
 280         return 4;
 281 }
 282
 283 static int
 284 decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
 285 {
 286         unsigned char *inptr = (unsigned char *) inbuf;
 287         gunichar c;
 288
 289         if (inleft < 4) {
 290                 errno = EINVAL;
 291                 return -1;
 292         }
 293
 294         c = (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
 295
 296         if (c >= 0xd800 && c < 0xe000) {
 297                 errno = EILSEQ;
 298                 return -1;
 299         } else if (c >= 0x110000) {
 300                 errno = EILSEQ;
 301                 return -1;
 302         }
 303
 304         *outchar = c;
 305
 306         return 4;
 307 }
 308
 309 static int
 310 encode_utf32be (gunichar c, char *outbuf, size_t outleft)
 311 {
 312         unsigned char *outptr = (unsigned char *) outbuf;
 313
 314         if (outleft < 4) {
 315                 errno = E2BIG;
 316                 return -1;
 317         }
 318
 319         outptr[0] = (c >> 24) & 0xff;
 320         outptr[1] = (c >> 16) & 0xff;
 321         outptr[2] = (c >> 8) & 0xff;
 322         outptr[3] = c & 0xff;
 323
 324         return 4;
 325 }
 326
 327 static int
 328 encode_utf32le (gunichar c, char *outbuf, size_t outleft)
 329 {
 330         unsigned char *outptr = (unsigned char *) outbuf;
 331
 332         if (outleft < 4) {
 333                 errno = E2BIG;
 334                 return -1;
 335         }
 336
 337         outptr[0] = c & 0xff;
 338         outptr[1] = (c >> 8) & 0xff;
 339         outptr[2] = (c >> 16) & 0xff;
 340         outptr[3] = (c >> 24) & 0xff;
 341
 342         return 4;
 343 }
 344
 345 static int
 346 decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
 347 {
 348         unsigned char *inptr = (unsigned char *) inbuf;
 349         gunichar2 c;
 350         gunichar u;
 351
 352         if (inleft < 2) {
 353                 errno = EINVAL;
 354                 return -1;
 355         }
 356
 357         u = (inptr[0] << 8) | inptr[1];
 358
 359         if (u < 0xd800) {
 360                 /* 0x0000 -> 0xd7ff */
 361                 *outchar = u;
 362                 return 2;
 363         } else if (u < 0xdc00) {
 364                 /* 0xd800 -> 0xdbff */
 365                 if (inleft < 4) {
 366                         errno = EINVAL;
 367                         return -2;
 368                 }
 369
 370                 c = (inptr[2] << 8) | inptr[3];
 371
 372                 if (c < 0xdc00 || c > 0xdfff) {
 373                         errno = EILSEQ;
 374                         return -2;
 375                 }
 376
 377                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 378                 *outchar = u;
 379
 380                 return 4;
 381         } else if (u < 0xe000) {
 382                 /* 0xdc00 -> 0xdfff */
 383                 errno = EILSEQ;
 384                 return -1;
 385         } else {
 386                 /* 0xe000 -> 0xffff */
 387                 *outchar = u;
 388                 return 2;
 389         }
 390 }
 391
 392 static int
 393 decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
 394 {
 395         unsigned char *inptr = (unsigned char *) inbuf;
 396         gunichar2 c;
 397         gunichar u;
 398
 399         if (inleft < 2) {
 400                 errno = EINVAL;
 401                 return -1;
 402         }
 403
 404         u = (inptr[1] << 8) | inptr[0];
 405
 406         if (u < 0xd800) {
 407                 /* 0x0000 -> 0xd7ff */
 408                 *outchar = u;
 409                 return 2;
 410         } else if (u < 0xdc00) {
 411                 /* 0xd800 -> 0xdbff */
 412                 if (inleft < 4) {
 413                         errno = EINVAL;
 414                         return -2;
 415                 }
 416
 417                 c = (inptr[3] << 8) | inptr[2];
 418
 419                 if (c < 0xdc00 || c > 0xdfff) {
 420                         errno = EILSEQ;
 421                         return -2;
 422                 }
 423
 424                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 425                 *outchar = u;
 426
 427                 return 4;
 428         } else if (u < 0xe000) {
 429                 /* 0xdc00 -> 0xdfff */
 430                 errno = EILSEQ;
 431                 return -1;
 432         } else {
 433                 /* 0xe000 -> 0xffff */
 434                 *outchar = u;
 435                 return 2;
 436         }
 437 }
 438
 439 static int
 440 encode_utf16be (gunichar c, char *outbuf, size_t outleft)
 441 {
 442         unsigned char *outptr = (unsigned char *) outbuf;
 443         gunichar2 ch;
 444         gunichar c2;
 445
 446         if (c < 0x10000) {
 447                 if (outleft < 2) {
 448                         errno = E2BIG;
 449                         return -1;
 450                 }
 451
 452                 outptr[0] = (c >> 8) & 0xff;
 453                 outptr[1] = c & 0xff;
 454
 455                 return 2;
 456         } else {
 457                 if (outleft < 4) {
 458                         errno = E2BIG;
 459                         return -1;
 460                 }
 461
 462                 c2 = c - 0x10000;
 463
 464                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 465                 outptr[0] = (ch >> 8) & 0xff;
 466                 outptr[1] = ch & 0xff;
 467
 468                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 469                 outptr[2] = (ch >> 8) & 0xff;
 470                 outptr[3] = ch & 0xff;
 471
 472                 return 4;
 473         }
 474 }
 475
 476 static int
 477 encode_utf16le (gunichar c, char *outbuf, size_t outleft)
 478 {
 479         unsigned char *outptr = (unsigned char *) outbuf;
 480         gunichar2 ch;
 481         gunichar c2;
 482
 483         if (c < 0x10000) {
 484                 if (outleft < 2) {
 485                         errno = E2BIG;
 486                         return -1;
 487                 }
 488
 489                 outptr[0] = c & 0xff;
 490                 outptr[1] = (c >> 8) & 0xff;
 491
 492                 return 2;
 493         } else {
 494                 if (outleft < 4) {
 495                         errno = E2BIG;
 496                         return -1;
 497                 }
 498
 499                 c2 = c - 0x10000;
 500
 501                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 502                 outptr[0] = ch & 0xff;
 503                 outptr[1] = (ch >> 8) & 0xff;
 504
 505                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 506                 outptr[2] = ch & 0xff;
 507                 outptr[3] = (ch >> 8) & 0xff;
 508
 509                 return 4;
 510         }
 511 }
 512
 513 static FORCE_INLINE (int)
 514 decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
 515 {
 516         unsigned char *inptr = (unsigned char *) inbuf;
 517         gunichar u;
 518         int n, i;
 519
 520         u = *inptr;
 521
 522         if (u < 0x80) {
 523                 /* simple ascii case */
 524                 *outchar = u;
 525                 return 1;
 526         } else if (u < 0xc2) {
 527                 errno = EILSEQ;
 528                 return -1;
 529         } else if (u < 0xe0) {
 530                 u &= 0x1f;
 531                 n = 2;
 532         } else if (u < 0xf0) {
 533                 u &= 0x0f;
 534                 n = 3;
 535         } else if (u < 0xf8) {
 536                 u &= 0x07;
 537                 n = 4;
 538         } else if (u < 0xfc) {
 539                 u &= 0x03;
 540                 n = 5;
 541         } else if (u < 0xfe) {
 542                 u &= 0x01;
 543                 n = 6;
 544         } else {
 545                 errno = EILSEQ;
 546                 return -1;
 547         }
 548
 549         if (n > inleft) {
 550                 errno = EINVAL;
 551                 return -1;
 552         }
 553
 554 #if UNROLL_DECODE_UTF8
 555         switch (n) {
 556         case 6: u = (u << 6) | (*++inptr ^ 0x80);
 557         case 5: u = (u << 6) | (*++inptr ^ 0x80);
 558         case 4: u = (u << 6) | (*++inptr ^ 0x80);
 559         case 3: u = (u << 6) | (*++inptr ^ 0x80);
 560         case 2: u = (u << 6) | (*++inptr ^ 0x80);
 561         }
 562 #else
 563         for (i = 1; i < n; i++)
 564                 u = (u << 6) | (*++inptr ^ 0x80);
 565 #endif
 566
 567         *outchar = u;
 568
 569         return n;
 570 }
 571
 572 static int
 573 encode_utf8 (gunichar c, char *outbuf, size_t outleft)
 574 {
 575         unsigned char *outptr = (unsigned char *) outbuf;
 576         int base, n, i;
 577
 578         if (c < 0x80) {
 579                 outptr[0] = c;
 580                 return 1;
 581         } else if (c < 0x800) {
 582                 base = 192;
 583                 n = 2;
 584         } else if (c < 0x10000) {
 585                 base = 224;
 586                 n = 3;
 587         } else if (c < 0x200000) {
 588                 base = 240;
 589                 n = 4;
 590         } else if (c < 0x4000000) {
 591                 base = 248;
 592                 n = 5;
 593         } else {
 594                 base = 252;
 595                 n = 6;
 596         }
 597
 598         if (outleft < n) {
 599                 errno = E2BIG;
 600                 return -1;
 601         }
 602
 603 #if UNROLL_ENCODE_UTF8
 604         switch (n) {
 605         case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
 606         case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
 607         case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
 608         case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
 609         case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
 610         case 1: outptr[0] = c | base;
 611         }
 612 #else
 613         for (i = n - 1; i > 0; i--) {
 614                 outptr[i] = (c & 0x3f) | 0x80;
 615                 c >>= 6;
 616         }
 617
 618         outptr[0] = c | base;
 619 #endif
 620
 621         return n;
 622 }
 623
 624 static int
 625 decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
 626 {
 627         *outchar = (unsigned char) *inbuf;
 628         return 1;
 629 }
 630
 631 static int
 632 encode_latin1 (gunichar c, char *outbuf, size_t outleft)
 633 {
 634         if (outleft < 1) {
 635                 errno = E2BIG;
 636                 return -1;
 637         }
 638
 639         if (c > 0xff) {
 640                 errno = EILSEQ;
 641                 return -1;
 642         }
 643
 644         *outbuf = (char) c;
 645
 646         return 1;
 647 }
 648
 649
 650 /*
 651  * Simple conversion API
 652  */
 653
 654 static gpointer error_quark = "ConvertError";
 655
 656 gpointer
 657 g_convert_error_quark (void)
 658 {
 659         return error_quark;
 660 }
 661
 662 gchar *
 663 g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
 664            gsize *bytes_read, gsize *bytes_written, GError **err)
 665 {
 666         gsize outsize, outused, outleft, inleft, grow, rc;
 667         char *result, *outbuf, *inbuf;
 668         gboolean flush = FALSE;
 669         gboolean done = FALSE;
 670         GIConv cd;
 671
 672         g_return_val_if_fail (str != NULL, NULL);
 673         g_return_val_if_fail (to_charset != NULL, NULL);
 674         g_return_val_if_fail (from_charset != NULL, NULL);
 675
 676         if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
 677                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 678                              "Conversion from %s to %s not supported.",
 679                              from_charset, to_charset);
 680
 681                 if (bytes_written)
 682                         *bytes_written = 0;
 683
 684                 if (bytes_read)
 685                         *bytes_read = 0;
 686
 687                 return NULL;
 688         }
 689
 690         inleft = len < 0 ? strlen (str) : len;
 691         inbuf = (char *) str;
 692
 693         outleft = outsize = MAX (inleft, 8);
 694         outbuf = result = g_malloc (outsize + 4);
 695
 696         do {
 697                 if (!flush)
 698                         rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
 699                 else
 700                         rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);
 701
 702                 if (rc == (gsize) -1) {
 703                         switch (errno) {
 704                         case E2BIG:
 705                                 /* grow our result buffer */
 706                                 grow = MAX (inleft, 8) << 1;
 707                                 outused = outbuf - result;
 708                                 outsize += grow;
 709                                 outleft += grow;
 710                                 result = g_realloc (result, outsize + 4);
 711                                 outbuf = result + outused;
 712                                 break;
 713                         case EINVAL:
 714                                 /* incomplete input, stop converting and terminate here */
 715                                 if (flush)
 716                                         done = TRUE;
 717                                 else
 718                                         flush = TRUE;
 719                                 break;
 720                         case EILSEQ:
 721                                 /* illegal sequence in the input */
 722                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));
 723
 724                                 if (bytes_read) {
 725                                         /* save offset of the illegal input sequence */
 726                                         *bytes_read = (inbuf - str);
 727                                 }
 728
 729                                 if (bytes_written)
 730                                         *bytes_written = 0;
 731
 732                                 g_iconv_close (cd);
 733                                 g_free (result);
 734                                 return NULL;
 735                         default:
 736                                 /* unknown errno */
 737                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));
 738
 739                                 if (bytes_written)
 740                                         *bytes_written = 0;
 741
 742                                 if (bytes_read)
 743                                         *bytes_read = 0;
 744
 745                                 g_iconv_close (cd);
 746                                 g_free (result);
 747                                 return NULL;
 748                         }
 749                 } else if (flush) {
 750                         /* input has been converted and output has been flushed */
 751                         break;
 752                 } else {
 753                         /* input has been converted, need to flush the output */
 754                         flush = TRUE;
 755                 }
 756         } while (!done);
 757
 758         g_iconv_close (cd);
 759
 760         /* Note: not all charsets can be null-terminated with a single
 761            null byte. UCS2, for example, needs 2 null bytes and UCS4
 762            needs 4. I hope that 4 null bytes is enough to terminate all
 763            multibyte charsets? */
 764
 765         /* null-terminate the result */
 766         memset (outbuf, 0, 4);
 767
 768         if (bytes_written)
 769                 *bytes_written = outbuf - result;
 770
 771         if (bytes_read)
 772                 *bytes_read = inbuf - str;
 773
 774         return result;
 775 }
 776
 777
 778 /*
 779  * Unicode conversion
 780  */
 781
 782 /**
 783  * An explanation of the conversion can be found at:
 784  * http://home.tiscali.nl/t876506/utf8tbl.html
 785  *
 786  **/
 787 gint
 788 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 789 {
 790         int base, n, i;
 791
 792         if (c < 0x80) {
 793                 base = 0;
 794                 n = 1;
 795         } else if (c < 0x800) {
 796                 base = 192;
 797                 n = 2;
 798         } else if (c < 0x10000) {
 799                 base = 224;
 800                 n = 3;
 801         } else if (c < 0x200000) {
 802                 base = 240;
 803                 n = 4;
 804         } else if (c < 0x4000000) {
 805                 base = 248;
 806                 n = 5;
 807         } else if (c < 0x80000000) {
 808                 base = 252;
 809                 n = 6;
 810         } else {
 811                 return -1;
 812         }
 813
 814         if (outbuf != NULL) {
 815                 for (i = n - 1; i > 0; i--) {
 816                         /* mask off 6 bits worth and add 128 */
 817                         outbuf[i] = (c & 0x3f) | 0x80;
 818                         c >>= 6;
 819                 }
 820
 821                 /* first character has a different base */
 822                 outbuf[0] = c | base;
 823         }
 824
 825         return n;
 826 }
 827
 828 static FORCE_INLINE (int)
 829 g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
 830 {
 831         gunichar c2;
 832
 833         if (c < 0xd800) {
 834                 if (outbuf)
 835                         *outbuf = (gunichar2) c;
 836
 837                 return 1;
 838         } else if (c < 0xe000) {
 839                 return -1;
 840         } else if (c < 0x10000) {
 841                 if (outbuf)
 842                         *outbuf = (gunichar2) c;
 843
 844                 return 1;
 845         } else if (c < 0x110000) {
 846                 if (outbuf) {
 847                         c2 = c - 0x10000;
 848
 849                         outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
 850                         outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 851                 }
 852
 853                 return 2;
 854         } else {
 855                 return -1;
 856         }
 857 }
 858
 859 gunichar *
 860 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
 861 {
 862         gunichar *outbuf, *outptr;
 863         char *inptr;
 864         glong n, i;
 865
 866         g_return_val_if_fail (str != NULL, NULL);
 867
 868         n = g_utf8_strlen (str, len);
 869
 870         if (items_written)
 871                 *items_written = n;
 872
 873         outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
 874         inptr = (char *) str;
 875
 876         for (i = 0; i < n; i++) {
 877                 *outptr++ = g_utf8_get_char (inptr);
 878                 inptr = g_utf8_next_char (inptr);
 879         }
 880
 881         *outptr = 0;
 882
 883         return outbuf;
 884 }
 885
 886 static gunichar2 *
 887 eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, GError **err)
 888 {
 889         gunichar2 *outbuf, *outptr;
 890         size_t outlen = 0;
 891         size_t inleft;
 892         char *inptr;
 893         gunichar c;
 894         int u, n;
 895
 896         g_return_val_if_fail (str != NULL, NULL);
 897
 898         if (len < 0) {
 899                 if (include_nuls) {
 900                         g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
 901                         return NULL;
 902                 }
 903
 904                 len = strlen (str);
 905         }
 906
 907         inptr = (char *) str;
 908         inleft = len;
 909
 910         while (inleft > 0) {
 911                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 912                         goto error;
 913
 914                 if (c == 0 && !include_nuls)
 915                         break;
 916
 917                 if ((u = g_unichar_to_utf16 (c, NULL)) < 0) {
 918                         errno = EILSEQ;
 919                         goto error;
 920                 }
 921
 922                 outlen += u;
 923                 inleft -= n;
 924                 inptr += n;
 925         }
 926
 927         if (items_read)
 928                 *items_read = inptr - str;
 929
 930         if (items_written)
 931                 *items_written = outlen;
 932
 933         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
 934         inptr = (char *) str;
 935         inleft = len;
 936
 937         while (inleft > 0) {
 938                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 939                         break;
 940
 941                 if (c == 0 && !include_nuls)
 942                         break;
 943
 944                 outptr += g_unichar_to_utf16 (c, outptr);
 945                 inleft -= n;
 946                 inptr += n;
 947         }
 948
 949         *outptr = '\0';
 950
 951         return outbuf;
 952
 953  error:
 954         if (errno == EILSEQ) {
 955                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 956                              "Illegal byte sequence encounted in the input.");
 957         } else if (items_read) {
 958                 /* partial input is ok if we can let our caller know... */
 959         } else {
 960                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 961                              "Partial byte sequence encountered in the input.");
 962         }
 963
 964         if (items_read)
 965                 *items_read = inptr - str;
 966
 967         if (items_written)
 968                 *items_written = 0;
 969
 970         return NULL;
 971 }
 972
 973 gunichar2 *
 974 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 975 {
 976         return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, err);
 977 }
 978
 979 gunichar2 *
 980 eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 981 {
 982         return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, err);
 983 }
 984
 985 gunichar *
 986 g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 987 {
 988         gunichar *outbuf, *outptr;
 989         size_t outlen = 0;
 990         size_t inleft;
 991         char *inptr;
 992         gunichar c;
 993         int n;
 994
 995         g_return_val_if_fail (str != NULL, NULL);
 996
 997         if (len < 0)
 998                 len = strlen (str);
 999
1000         inptr = (char *) str;
1001         inleft = len;
1002
1003         while (inleft > 0) {
1004                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
1005                         if (errno == EILSEQ) {
1006                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1007                                              "Illegal byte sequence encounted in the input.");
1008                         } else if (items_read) {
1009                                 /* partial input is ok if we can let our caller know... */
1010                                 break;
1011                         } else {
1012                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1013                                              "Partial byte sequence encountered in the input.");
1014                         }
1015
1016                         if (items_read)
1017                                 *items_read = inptr - str;
1018
1019                         if (items_written)
1020                                 *items_written = 0;
1021
1022                         return NULL;
1023                 } else if (c == 0)
1024                         break;
1025
1026                 outlen += 4;
1027                 inleft -= n;
1028                 inptr += n;
1029         }
1030
1031         if (items_written)
1032                 *items_written = outlen / 4;
1033
1034         if (items_read)
1035                 *items_read = inptr - str;
1036
1037         outptr = outbuf = g_malloc (outlen + 4);
1038         inptr = (char *) str;
1039         inleft = len;
1040
1041         while (inleft > 0) {
1042                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
1043                         break;
1044                 else if (c == 0)
1045                         break;
1046
1047                 *outptr++ = c;
1048                 inleft -= n;
1049                 inptr += n;
1050         }
1051
1052         *outptr = 0;
1053
1054         return outbuf;
1055 }
1056
1057 gchar *
1058 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1059 {
1060         char *inptr, *outbuf, *outptr;
1061         size_t outlen = 0;
1062         size_t inleft;
1063         gunichar c;
1064         int n;
1065
1066         g_return_val_if_fail (str != NULL, NULL);
1067
1068         if (len < 0) {
1069                 len = 0;
1070                 while (str[len])
1071                         len++;
1072         }
1073
1074         inptr = (char *) str;
1075         inleft = len * 2;
1076
1077         while (inleft > 0) {
1078                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1079                         if (n == -2 && inleft > 2) {
1080                                 /* This means that the first UTF-16 char was read, but second failed */
1081                                 inleft -= 2;
1082                                 inptr += 2;
1083                         }
1084
1085                         if (errno == EILSEQ) {
1086                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1087                                              "Illegal byte sequence encounted in the input.");
1088                         } else if (items_read) {
1089                                 /* partial input is ok if we can let our caller know... */
1090                                 break;
1091                         } else {
1092                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1093                                              "Partial byte sequence encountered in the input.");
1094                         }
1095
1096                         if (items_read)
1097                                 *items_read = (inptr - (char *) str) / 2;
1098
1099                         if (items_written)
1100                                 *items_written = 0;
1101
1102                         return NULL;
1103                 } else if (c == 0)
1104                         break;
1105
1106                 outlen += g_unichar_to_utf8 (c, NULL);
1107                 inleft -= n;
1108                 inptr += n;
1109         }
1110
1111         if (items_read)
1112                 *items_read = (inptr - (char *) str) / 2;
1113
1114         if (items_written)
1115                 *items_written = outlen;
1116
1117         outptr = outbuf = g_malloc (outlen + 1);
1118         inptr = (char *) str;
1119         inleft = len * 2;
1120
1121         while (inleft > 0) {
1122                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1123                         break;
1124                 else if (c == 0)
1125                         break;
1126
1127                 outptr += g_unichar_to_utf8 (c, outptr);
1128                 inleft -= n;
1129                 inptr += n;
1130         }
1131
1132         *outptr = '\0';
1133
1134         return outbuf;
1135 }
1136
1137 gunichar *
1138 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1139 {
1140         gunichar *outbuf, *outptr;
1141         size_t outlen = 0;
1142         size_t inleft;
1143         char *inptr;
1144         gunichar c;
1145         int n;
1146
1147         g_return_val_if_fail (str != NULL, NULL);
1148
1149         if (len < 0) {
1150                 len = 0;
1151                 while (str[len])
1152                         len++;
1153         }
1154
1155         inptr = (char *) str;
1156         inleft = len * 2;
1157
1158         while (inleft > 0) {
1159                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1160                         if (n == -2 && inleft > 2) {
1161                                 /* This means that the first UTF-16 char was read, but second failed */
1162                                 inleft -= 2;
1163                                 inptr += 2;
1164                         }
1165
1166                         if (errno == EILSEQ) {
1167                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1168                                              "Illegal byte sequence encounted in the input.");
1169                         } else if (items_read) {
1170                                 /* partial input is ok if we can let our caller know... */
1171                                 break;
1172                         } else {
1173                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1174                                              "Partial byte sequence encountered in the input.");
1175                         }
1176
1177                         if (items_read)
1178                                 *items_read = (inptr - (char *) str) / 2;
1179
1180                         if (items_written)
1181                                 *items_written = 0;
1182
1183                         return NULL;
1184                 } else if (c == 0)
1185                         break;
1186
1187                 outlen += 4;
1188                 inleft -= n;
1189                 inptr += n;
1190         }
1191
1192         if (items_read)
1193                 *items_read = (inptr - (char *) str) / 2;
1194
1195         if (items_written)
1196                 *items_written = outlen / 4;
1197
1198         outptr = outbuf = g_malloc (outlen + 4);
1199         inptr = (char *) str;
1200         inleft = len * 2;
1201
1202         while (inleft > 0) {
1203                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1204                         break;
1205                 else if (c == 0)
1206                         break;
1207
1208                 *outptr++ = c;
1209                 inleft -= n;
1210                 inptr += n;
1211         }
1212
1213         *outptr = 0;
1214
1215         return outbuf;
1216 }
1217
1218 gchar *
1219 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1220 {
1221         char *outbuf, *outptr;
1222         size_t outlen = 0;
1223         glong i;
1224         int n;
1225
1226         g_return_val_if_fail (str != NULL, NULL);
1227
1228         if (len < 0) {
1229                 for (i = 0; str[i] != 0; i++) {
1230                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1231                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1232                                              "Illegal byte sequence encounted in the input.");
1233
1234                                 if (items_written)
1235                                         *items_written = 0;
1236
1237                                 if (items_read)
1238                                         *items_read = i;
1239
1240                                 return NULL;
1241                         }
1242
1243                         outlen += n;
1244                 }
1245         } else {
1246                 for (i = 0; i < len && str[i] != 0; i++) {
1247                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1248                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1249                                              "Illegal byte sequence encounted in the input.");
1250
1251                                 if (items_written)
1252                                         *items_written = 0;
1253
1254                                 if (items_read)
1255                                         *items_read = i;
1256
1257                                 return NULL;
1258                         }
1259
1260                         outlen += n;
1261                 }
1262         }
1263
1264         len = i;
1265
1266         outptr = outbuf = g_malloc (outlen + 1);
1267         for (i = 0; i < len; i++)
1268                 outptr += g_unichar_to_utf8 (str[i], outptr);
1269         *outptr = 0;
1270
1271         if (items_written)
1272                 *items_written = outlen;
1273
1274         if (items_read)
1275                 *items_read = i;
1276
1277         return outbuf;
1278 }
1279
1280 gunichar2 *
1281 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1282 {
1283         gunichar2 *outbuf, *outptr;
1284         size_t outlen = 0;
1285         glong i;
1286         int n;
1287
1288         g_return_val_if_fail (str != NULL, NULL);
1289
1290         if (len < 0) {
1291                 for (i = 0; str[i] != 0; i++) {
1292                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1293                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1294                                              "Illegal byte sequence encounted in the input.");
1295
1296                                 if (items_written)
1297                                         *items_written = 0;
1298
1299                                 if (items_read)
1300                                         *items_read = i;
1301
1302                                 return NULL;
1303                         }
1304
1305                         outlen += n;
1306                 }
1307         } else {
1308                 for (i = 0; i < len && str[i] != 0; i++) {
1309                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1310                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1311                                              "Illegal byte sequence encounted in the input.");
1312
1313                                 if (items_written)
1314                                         *items_written = 0;
1315
1316                                 if (items_read)
1317                                         *items_read = i;
1318
1319                                 return NULL;
1320                         }
1321
1322                         outlen += n;
1323                 }
1324         }
1325
1326         len = i;
1327
1328         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
1329         for (i = 0; i < len; i++)
1330                 outptr += g_unichar_to_utf16 (str[i], outptr);
1331         *outptr = 0;
1332
1333         if (items_written)
1334                 *items_written = outlen;
1335
1336         if (items_read)
1337                 *items_read = i;
1338
1339         return outbuf;
1340 }