eglib/src/giconv.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Copyright (C) 2011 Jeffrey Stedfast
   4  *
   5  *  Permission is hereby granted, free of charge, to any person
   6  *  obtaining a copy of this software and associated documentation
   7  *  files (the "Software"), to deal in the Software without
   8  *  restriction, including without limitation the rights to use, copy,
   9  *  modify, merge, publish, distribute, sublicense, and/or sell copies
  10  *  of the Software, and to permit persons to whom the Software is
  11  *  furnished to do so, subject to the following conditions:
  12  *
  13  *  The above copyright notice and this permission notice shall be
  14  *  included in all copies or substantial portions of the Software.
  15  *
  16  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  20  *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  21  *  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23  *  DEALINGS IN THE SOFTWARE.
  24  */
  25
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include <glib.h>
  31 #include <string.h>
  32 #ifdef HAVE_ICONV_H
  33 #include <iconv.h>
  34 #endif
  35 #include <errno.h>
  36
  37 #define UNROLL_DECODE_UTF8 0
  38 #define UNROLL_ENCODE_UTF8 0
  39
  40 typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
  41 typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
  42
  43 struct _GIConv {
  44         Decoder decode;
  45         Encoder encode;
  46         gunichar c;
  47 #ifdef HAVE_ICONV
  48         iconv_t cd;
  49 #endif
  50 };
  51
  52 static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
  53 static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);
  54
  55 static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
  56 static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);
  57
  58 static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
  59 static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);
  60
  61 static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
  62 static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);
  63
  64 static int decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
  65 static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);
  66
  67 static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
  68 static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
  69
  70 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
  71 #define decode_utf32 decode_utf32le
  72 #define encode_utf32 encode_utf32le
  73 #define decode_utf16 decode_utf16le
  74 #define encode_utf16 encode_utf16le
  75 #else
  76 #define decode_utf32 decode_utf32be
  77 #define encode_utf32 encode_utf32be
  78 #define decode_utf16 decode_utf16be
  79 #define encode_utf16 encode_utf16be
  80 #endif
  81
  82 static struct {
  83         const char *name;
  84         Decoder decoder;
  85         Encoder encoder;
  86 } charsets[] = {
  87         { "ISO-8859-1", decode_latin1,  encode_latin1  },
  88         { "ISO8859-1",  decode_latin1,  encode_latin1  },
  89         { "UTF-32BE",   decode_utf32be, encode_utf32be },
  90         { "UTF-32LE",   decode_utf32le, encode_utf32le },
  91         { "UTF-16BE",   decode_utf16be, encode_utf16be },
  92         { "UTF-16LE",   decode_utf16le, encode_utf16le },
  93         { "UTF-32",     decode_utf32,   encode_utf32   },
  94         { "UTF-16",     decode_utf16,   encode_utf16   },
  95         { "UTF-8",      decode_utf8,    encode_utf8    },
  96         { "US-ASCII",   decode_latin1,  encode_latin1  },
  97         { "Latin1",     decode_latin1,  encode_latin1  },
  98         { "ASCII",      decode_latin1,  encode_latin1  },
  99         { "UTF32",      decode_utf32,   encode_utf32   },
 100         { "UTF16",      decode_utf16,   encode_utf16   },
 101         { "UTF8",       decode_utf8,    encode_utf8    },
 102 };
 103
 104
 105 GIConv
 106 g_iconv_open (const char *to_charset, const char *from_charset)
 107 {
 108 #ifdef HAVE_ICONV
 109         iconv_t icd = (iconv_t) -1;
 110 #endif
 111         Decoder decoder = NULL;
 112         Encoder encoder = NULL;
 113         GIConv cd;
 114         guint i;
 115
 116         if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
 117                 errno = EINVAL;
 118
 119                 return (GIConv) -1;
 120         }
 121
 122         for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
 123                 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
 124                         decoder = charsets[i].decoder;
 125
 126                 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
 127                         encoder = charsets[i].encoder;
 128         }
 129
 130         if (!encoder || !decoder) {
 131 #ifdef HAVE_ICONV
 132                 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
 133                         return (GIConv) -1;
 134 #else
 135                 errno = EINVAL;
 136
 137                 return (GIConv) -1;
 138 #endif
 139         }
 140
 141         cd = (GIConv) g_malloc (sizeof (struct _GIConv));
 142         cd->decode = decoder;
 143         cd->encode = encoder;
 144         cd->c = -1;
 145
 146 #ifdef HAVE_ICONV
 147         cd->cd = icd;
 148 #endif
 149
 150         return cd;
 151 }
 152
 153 int
 154 g_iconv_close (GIConv cd)
 155 {
 156 #ifdef HAVE_ICONV
 157         if (cd->cd != (iconv_t) -1)
 158                 iconv_close (cd->cd);
 159 #endif
 160
 161         g_free (cd);
 162
 163         return 0;
 164 }
 165
 166 gsize
 167 g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
 168          gchar **outbytes, gsize *outbytesleft)
 169 {
 170         size_t inleft, outleft;
 171         char *inptr, *outptr;
 172         gunichar c;
 173         int rc = 0;
 174
 175 #ifdef HAVE_ICONV
 176         if (cd->cd != (iconv_t) -1)
 177                 return iconv (cd->cd, inbytes, inbytesleft, outbytes, outbytesleft);
 178 #endif
 179
 180         if (outbytes == NULL || outbytesleft == NULL) {
 181                 /* reset converter */
 182                 cd->c = -1;
 183                 return 0;
 184         }
 185
 186         inleft = inbytesleft ? *inbytesleft : 0;
 187         inptr = inbytes ? *inbytes : NULL;
 188         outleft = *outbytesleft;
 189         outptr = *outbytes;
 190
 191         if ((c = cd->c) != (gunichar) -1)
 192                 goto encode;
 193
 194         while (inleft > 0) {
 195                 if ((rc = cd->decode (inptr, inleft, &c)) < 0)
 196                         break;
 197
 198                 inleft -= rc;
 199                 inptr += rc;
 200
 201         encode:
 202                 if ((rc = cd->encode (c, outptr, outleft)) < 0)
 203                         break;
 204
 205                 c = (gunichar) -1;
 206                 outleft -= rc;
 207                 outptr += rc;
 208         }
 209
 210         if (inbytesleft)
 211                 *inbytesleft = inleft;
 212
 213         if (inbytes)
 214                 *inbytes = inptr;
 215
 216         *outbytesleft = outleft;
 217         *outbytes = outptr;
 218         cd->c = c;
 219
 220         return rc < 0 ? -1 : 0;
 221 }
 222
 223 /*
 224  * Unicode encoders and decoders
 225  */
 226
 227 static int
 228 decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
 229 {
 230         unsigned char *inptr = (unsigned char *) inbuf;
 231         gunichar c;
 232
 233         if (inleft < 4) {
 234                 errno = EINVAL;
 235                 return -1;
 236         }
 237
 238         c = (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
 239
 240         if (c >= 0xd800 && c < 0xe000) {
 241                 errno = EILSEQ;
 242                 return -1;
 243         } else if (c >= 0x110000) {
 244                 errno = EILSEQ;
 245                 return -1;
 246         }
 247
 248         *outchar = c;
 249
 250         return 4;
 251 }
 252
 253 static int
 254 decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
 255 {
 256         unsigned char *inptr = (unsigned char *) inbuf;
 257         gunichar c;
 258
 259         if (inleft < 4) {
 260                 errno = EINVAL;
 261                 return -1;
 262         }
 263
 264         c = (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
 265
 266         if (c >= 0xd800 && c < 0xe000) {
 267                 errno = EILSEQ;
 268                 return -1;
 269         } else if (c >= 0x110000) {
 270                 errno = EILSEQ;
 271                 return -1;
 272         }
 273
 274         *outchar = c;
 275
 276         return 4;
 277 }
 278
 279 static int
 280 encode_utf32be (gunichar c, char *outbuf, size_t outleft)
 281 {
 282         unsigned char *outptr = (unsigned char *) outbuf;
 283
 284         if (outleft < 4) {
 285                 errno = E2BIG;
 286                 return -1;
 287         }
 288
 289         outptr[0] = (c >> 24) & 0xff;
 290         outptr[1] = (c >> 16) & 0xff;
 291         outptr[2] = (c >> 8) & 0xff;
 292         outptr[3] = c & 0xff;
 293
 294         return 4;
 295 }
 296
 297 static int
 298 encode_utf32le (gunichar c, char *outbuf, size_t outleft)
 299 {
 300         unsigned char *outptr = (unsigned char *) outbuf;
 301
 302         if (outleft < 4) {
 303                 errno = E2BIG;
 304                 return -1;
 305         }
 306
 307         outptr[0] = c & 0xff;
 308         outptr[1] = (c >> 8) & 0xff;
 309         outptr[2] = (c >> 16) & 0xff;
 310         outptr[3] = (c >> 24) & 0xff;
 311
 312         return 4;
 313 }
 314
 315 static int
 316 decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
 317 {
 318         unsigned char *inptr = (unsigned char *) inbuf;
 319         gunichar2 c;
 320         gunichar u;
 321
 322         if (inleft < 2) {
 323                 errno = EINVAL;
 324                 return -1;
 325         }
 326
 327         u = (inptr[0] << 8) | inptr[1];
 328
 329         if (u < 0xd800) {
 330                 /* 0x0000 -> 0xd7ff */
 331                 *outchar = u;
 332                 return 2;
 333         } else if (u < 0xdc00) {
 334                 /* 0xd800 -> 0xdbff */
 335                 if (inleft < 4) {
 336                         errno = EINVAL;
 337                         return -2;
 338                 }
 339
 340                 c = (inptr[2] << 8) | inptr[3];
 341
 342                 if (c < 0xdc00 || c > 0xdfff) {
 343                         errno = EILSEQ;
 344                         return -2;
 345                 }
 346
 347                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 348                 *outchar = u;
 349
 350                 return 4;
 351         } else if (u < 0xe000) {
 352                 /* 0xdc00 -> 0xdfff */
 353                 errno = EILSEQ;
 354                 return -1;
 355         } else {
 356                 /* 0xe000 -> 0xffff */
 357                 *outchar = u;
 358                 return 2;
 359         }
 360 }
 361
 362 static int
 363 decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
 364 {
 365         unsigned char *inptr = (unsigned char *) inbuf;
 366         gunichar2 c;
 367         gunichar u;
 368
 369         if (inleft < 2) {
 370                 errno = EINVAL;
 371                 return -1;
 372         }
 373
 374         u = (inptr[1] << 8) | inptr[0];
 375
 376         if (u < 0xd800) {
 377                 /* 0x0000 -> 0xd7ff */
 378                 *outchar = u;
 379                 return 2;
 380         } else if (u < 0xdc00) {
 381                 /* 0xd800 -> 0xdbff */
 382                 if (inleft < 4) {
 383                         errno = EINVAL;
 384                         return -2;
 385                 }
 386
 387                 c = (inptr[3] << 8) | inptr[2];
 388
 389                 if (c < 0xdc00 || c > 0xdfff) {
 390                         errno = EILSEQ;
 391                         return -2;
 392                 }
 393
 394                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 395                 *outchar = u;
 396
 397                 return 4;
 398         } else if (u < 0xe000) {
 399                 /* 0xdc00 -> 0xdfff */
 400                 errno = EILSEQ;
 401                 return -1;
 402         } else {
 403                 /* 0xe000 -> 0xffff */
 404                 *outchar = u;
 405                 return 2;
 406         }
 407 }
 408
 409 static int
 410 encode_utf16be (gunichar c, char *outbuf, size_t outleft)
 411 {
 412         unsigned char *outptr = (unsigned char *) outbuf;
 413         gunichar2 ch;
 414         gunichar c2;
 415
 416         if (c < 0x10000) {
 417                 if (outleft < 2) {
 418                         errno = E2BIG;
 419                         return -1;
 420                 }
 421
 422                 outptr[0] = (c >> 8) & 0xff;
 423                 outptr[1] = c & 0xff;
 424
 425                 return 2;
 426         } else {
 427                 if (outleft < 4) {
 428                         errno = E2BIG;
 429                         return -1;
 430                 }
 431
 432                 c2 = c - 0x10000;
 433
 434                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 435                 outptr[0] = (ch >> 8) & 0xff;
 436                 outptr[1] = ch & 0xff;
 437
 438                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 439                 outptr[2] = (ch >> 8) & 0xff;
 440                 outptr[3] = ch & 0xff;
 441
 442                 return 4;
 443         }
 444 }
 445
 446 static int
 447 encode_utf16le (gunichar c, char *outbuf, size_t outleft)
 448 {
 449         unsigned char *outptr = (unsigned char *) outbuf;
 450         gunichar2 ch;
 451         gunichar c2;
 452
 453         if (c < 0x10000) {
 454                 if (outleft < 2) {
 455                         errno = E2BIG;
 456                         return -1;
 457                 }
 458
 459                 outptr[0] = c & 0xff;
 460                 outptr[1] = (c >> 8) & 0xff;
 461
 462                 return 2;
 463         } else {
 464                 if (outleft < 4) {
 465                         errno = E2BIG;
 466                         return -1;
 467                 }
 468
 469                 c2 = c - 0x10000;
 470
 471                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 472                 outptr[0] = ch & 0xff;
 473                 outptr[1] = (ch >> 8) & 0xff;
 474
 475                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 476                 outptr[2] = ch & 0xff;
 477                 outptr[3] = (ch >> 8) & 0xff;
 478
 479                 return 4;
 480         }
 481 }
 482
 483 static int
 484 decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
 485 {
 486         unsigned char *inptr = (unsigned char *) inbuf;
 487         gunichar u;
 488         int n, i;
 489
 490         u = *inptr;
 491
 492         if (u < 0x80) {
 493                 /* simple ascii case */
 494                 *outchar = u;
 495                 return 1;
 496         } else if (u < 0xc2) {
 497                 errno = EILSEQ;
 498                 return -1;
 499         } else if (u < 0xe0) {
 500                 u &= 0x1f;
 501                 n = 2;
 502         } else if (u < 0xf0) {
 503                 u &= 0x0f;
 504                 n = 3;
 505         } else if (u < 0xf8) {
 506                 u &= 0x07;
 507                 n = 4;
 508         } else if (u < 0xfc) {
 509                 u &= 0x03;
 510                 n = 5;
 511         } else if (u < 0xfe) {
 512                 u &= 0x01;
 513                 n = 6;
 514         } else {
 515                 errno = EILSEQ;
 516                 return -1;
 517         }
 518
 519         if (n > inleft) {
 520                 errno = EINVAL;
 521                 return -1;
 522         }
 523
 524 #if UNROLL_DECODE_UTF8
 525         switch (n) {
 526         case 6: u = (u << 6) | (*++inptr ^ 0x80);
 527         case 5: u = (u << 6) | (*++inptr ^ 0x80);
 528         case 4: u = (u << 6) | (*++inptr ^ 0x80);
 529         case 3: u = (u << 6) | (*++inptr ^ 0x80);
 530         case 2: u = (u << 6) | (*++inptr ^ 0x80);
 531         }
 532 #else
 533         for (i = 1; i < n; i++)
 534                 u = (u << 6) | (*++inptr ^ 0x80);
 535 #endif
 536
 537         *outchar = u;
 538
 539         return n;
 540 }
 541
 542 static int
 543 encode_utf8 (gunichar c, char *outbuf, size_t outleft)
 544 {
 545         unsigned char *outptr = (unsigned char *) outbuf;
 546         int base, n, i;
 547
 548         if (c < 0x80) {
 549                 outptr[0] = c;
 550                 return 1;
 551         } else if (c < 0x800) {
 552                 base = 192;
 553                 n = 2;
 554         } else if (c < 0x10000) {
 555                 base = 224;
 556                 n = 3;
 557         } else if (c < 0x200000) {
 558                 base = 240;
 559                 n = 4;
 560         } else if (c < 0x4000000) {
 561                 base = 248;
 562                 n = 5;
 563         } else {
 564                 base = 252;
 565                 n = 6;
 566         }
 567
 568         if (outleft < n) {
 569                 errno = E2BIG;
 570                 return -1;
 571         }
 572
 573 #if UNROLL_ENCODE_UTF8
 574         switch (n) {
 575         case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
 576         case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
 577         case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
 578         case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
 579         case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
 580         case 1: outptr[0] = c | base;
 581         }
 582 #else
 583         for (i = n - 1; i > 0; i--) {
 584                 outptr[i] = (c & 0x3f) | 0x80;
 585                 c >>= 6;
 586         }
 587
 588         outptr[0] = c | base;
 589 #endif
 590
 591         return n;
 592 }
 593
 594 static int
 595 decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
 596 {
 597         *outchar = (unsigned char) *inbuf;
 598         return 1;
 599 }
 600
 601 static int
 602 encode_latin1 (gunichar c, char *outbuf, size_t outleft)
 603 {
 604         if (outleft < 1) {
 605                 errno = E2BIG;
 606                 return -1;
 607         }
 608
 609         if (c > 0xff) {
 610                 errno = EILSEQ;
 611                 return -1;
 612         }
 613
 614         *outbuf = (char) c;
 615
 616         return 1;
 617 }
 618
 619
 620 /*
 621  * Simple conversion API
 622  */
 623
 624 static gpointer error_quark = "ConvertError";
 625
 626 gpointer
 627 g_convert_error_quark (void)
 628 {
 629         return error_quark;
 630 }
 631
 632 gchar *
 633 g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
 634            gsize *bytes_read, gsize *bytes_written, GError **err)
 635 {
 636         size_t outsize, outused, outleft, inleft, grow, rc;
 637         char *result, *outbuf, *inbuf;
 638         gboolean flush = FALSE;
 639         gboolean done = FALSE;
 640         GIConv cd;
 641
 642         g_return_val_if_fail (str != NULL, NULL);
 643         g_return_val_if_fail (to_charset != NULL, NULL);
 644         g_return_val_if_fail (from_charset != NULL, NULL);
 645
 646         if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
 647                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 648                              "Conversion from %s to %s not supported.",
 649                              from_charset, to_charset);
 650
 651                 if (bytes_written)
 652                         *bytes_written = 0;
 653
 654                 if (bytes_read)
 655                         *bytes_read = 0;
 656
 657                 return NULL;
 658         }
 659
 660         inleft = len < 0 ? strlen (str) : len;
 661         inbuf = (char *) str;
 662
 663         outleft = outsize = MAX (inleft, 8);
 664         outbuf = result = g_malloc (outsize + 4);
 665
 666         do {
 667                 if (!flush)
 668                         rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
 669                 else
 670                         rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);
 671
 672                 if (rc == (size_t) -1) {
 673                         switch (errno) {
 674                         case E2BIG:
 675                                 /* grow our result buffer */
 676                                 grow = MAX (inleft, 8) << 1;
 677                                 outused = outbuf - result;
 678                                 outsize += grow;
 679                                 outleft += grow;
 680                                 result = g_realloc (result, outsize + 4);
 681                                 outbuf = result + outused;
 682                                 break;
 683                         case EINVAL:
 684                                 /* incomplete input, stop converting and terminate here */
 685                                 if (flush)
 686                                         done = TRUE;
 687                                 else
 688                                         flush = TRUE;
 689                                 break;
 690                         case EILSEQ:
 691                                 /* illegal sequence in the input */
 692                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));
 693
 694                                 if (bytes_read) {
 695                                         /* save offset of the illegal input sequence */
 696                                         *bytes_read = (inbuf - str);
 697                                 }
 698
 699                                 if (bytes_written)
 700                                         *bytes_written = 0;
 701
 702                                 g_iconv_close (cd);
 703                                 g_free (result);
 704                                 return NULL;
 705                         default:
 706                                 /* unknown errno */
 707                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));
 708
 709                                 if (bytes_written)
 710                                         *bytes_written = 0;
 711
 712                                 if (bytes_read)
 713                                         *bytes_read = 0;
 714
 715                                 g_iconv_close (cd);
 716                                 g_free (result);
 717                                 return NULL;
 718                         }
 719                 } else if (flush) {
 720                         /* input has been converted and output has been flushed */
 721                         break;
 722                 } else {
 723                         /* input has been converted, need to flush the output */
 724                         flush = TRUE;
 725                 }
 726         } while (!done);
 727
 728         g_iconv_close (cd);
 729
 730         /* Note: not all charsets can be null-terminated with a single
 731            null byte. UCS2, for example, needs 2 null bytes and UCS4
 732            needs 4. I hope that 4 null bytes is enough to terminate all
 733            multibyte charsets? */
 734
 735         /* null-terminate the result */
 736         memset (outbuf, 0, 4);
 737
 738         if (bytes_written)
 739                 *bytes_written = outbuf - result;
 740
 741         if (bytes_read)
 742                 *bytes_read = inbuf - str;
 743
 744         return result;
 745 }
 746
 747
 748 /*
 749  * Unicode conversion
 750  */
 751
 752 /**
 753  * from http://home.tiscali.nl/t876506/utf8tbl.html
 754  *
 755  * From Unicode UCS-4 to UTF-8:
 756  * Start with the Unicode number expressed as a decimal number and call this ud.
 757  *
 758  * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
 759  *
 760  * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
 761  *    byte 1 = 192 + (ud div 64)
 762  *    byte 2 = 128 + (ud mod 64)
 763  *
 764  * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
 765  *    byte 1 = 224 + (ud div 4096)
 766  *    byte 2 = 128 + ((ud div 64) mod 64)
 767  *    byte 3 = 128 + (ud mod 64)
 768  *
 769  * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
 770  *    byte 1 = 240 + (ud div 262144)
 771  *    byte 2 = 128 + ((ud div 4096) mod 64)
 772  *    byte 3 = 128 + ((ud div 64) mod 64)
 773  *    byte 4 = 128 + (ud mod 64)
 774  *
 775  * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
 776  *    byte 1 = 248 + (ud div 16777216)
 777  *    byte 2 = 128 + ((ud div 262144) mod 64)
 778  *    byte 3 = 128 + ((ud div 4096) mod 64)
 779  *    byte 4 = 128 + ((ud div 64) mod 64)
 780  *    byte 5 = 128 + (ud mod 64)
 781  *
 782  * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
 783  *    byte 1 = 252 + (ud div 1073741824)
 784  *    byte 2 = 128 + ((ud div 16777216) mod 64)
 785  *    byte 3 = 128 + ((ud div 262144) mod 64)
 786  *    byte 4 = 128 + ((ud div 4096) mod 64)
 787  *    byte 5 = 128 + ((ud div 64) mod 64)
 788  *    byte 6 = 128 + (ud mod 64)
 789  **/
 790 gint
 791 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 792 {
 793         int base, n, i;
 794
 795         if (c < 0x80) {
 796                 base = 0;
 797                 n = 1;
 798         } else if (c < 0x800) {
 799                 base = 192;
 800                 n = 2;
 801         } else if (c < 0x10000) {
 802                 base = 224;
 803                 n = 3;
 804         } else if (c < 0x200000) {
 805                 base = 240;
 806                 n = 4;
 807         } else if (c < 0x4000000) {
 808                 base = 248;
 809                 n = 5;
 810         } else if (c < 0x80000000) {
 811                 base = 252;
 812                 n = 6;
 813         } else {
 814                 return -1;
 815         }
 816
 817         if (outbuf != NULL) {
 818                 for (i = n - 1; i > 0; i--) {
 819                         /* mask off 6 bits worth and add 128 */
 820                         outbuf[i] = (c & 0x3f) | 0x80;
 821                         c >>= 6;
 822                 }
 823
 824                 /* first character has a different base */
 825                 outbuf[0] = c | base;
 826         }
 827
 828         return n;
 829 }
 830
 831 static int
 832 g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
 833 {
 834         gunichar c2;
 835
 836         if (c < 0xd800) {
 837                 if (outbuf)
 838                         *outbuf = (gunichar2) c;
 839
 840                 return 1;
 841         } else if (c < 0xe000) {
 842                 return -1;
 843         } else if (c < 0x10000) {
 844                 if (outbuf)
 845                         *outbuf = (gunichar2) c;
 846
 847                 return 1;
 848         } else if (c < 0x110000) {
 849                 if (outbuf) {
 850                         c2 = c - 0x10000;
 851
 852                         outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
 853                         outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 854                 }
 855
 856                 return 2;
 857         } else {
 858                 return -1;
 859         }
 860 }
 861
 862 gunichar *
 863 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
 864 {
 865         gunichar *outbuf, *outptr;
 866         char *inptr;
 867         glong n, i;
 868
 869         g_return_val_if_fail (str != NULL, NULL);
 870
 871         n = g_utf8_strlen (str, len);
 872
 873         if (items_written)
 874                 *items_written = n;
 875
 876         outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
 877         inptr = (char *) str;
 878
 879         for (i = 0; i < n; i++) {
 880                 *outptr++ = g_utf8_get_char (inptr);
 881                 inptr = g_utf8_next_char (inptr);
 882         }
 883
 884         *outptr = 0;
 885
 886         return outbuf;
 887 }
 888
 889 gunichar2 *
 890 g_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, GError **err)
 891 {
 892         gunichar2 *outbuf, *outptr;
 893         size_t outlen = 0;
 894         size_t inleft;
 895         char *inptr;
 896         gunichar c;
 897         int n;
 898
 899         g_return_val_if_fail (str != NULL, NULL);
 900
 901         if (len < 0) {
 902                 if (include_nuls) {
 903                         g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
 904                         return NULL;
 905                 }
 906                 len = strlen (str);
 907         }
 908
 909         inptr = (char *) str;
 910         inleft = len;
 911
 912         while (inleft > 0) {
 913                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
 914                         if (errno == EILSEQ) {
 915                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 916                                              "Illegal byte sequence encounted in the input.");
 917                         } else if (items_read) {
 918                                 /* partial input is ok if we can let our caller know... */
 919                                 break;
 920                         } else {
 921                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 922                                              "Partial byte sequence encountered in the input.");
 923                         }
 924
 925                         if (items_read)
 926                                 *items_read = inptr - str;
 927
 928                         if (items_written)
 929                                 *items_written = 0;
 930
 931                         return NULL;
 932                 } else if (c == 0 && !include_nuls)
 933                         break;
 934
 935                 outlen += g_unichar_to_utf16 (c, NULL);
 936                 inleft -= n;
 937                 inptr += n;
 938         }
 939
 940         if (items_read)
 941                 *items_read = inptr - str;
 942
 943         if (items_written)
 944                 *items_written = outlen;
 945
 946         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
 947         inptr = (char *) str;
 948         inleft = len;
 949
 950         while (inleft > 0) {
 951                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 952                         break;
 953                 else if (c == 0 && !include_nuls)
 954                         break;
 955
 956                 outptr += g_unichar_to_utf16 (c, outptr);
 957                 inleft -= n;
 958                 inptr += n;
 959         }
 960
 961         *outptr = '\0';
 962
 963         return outbuf;
 964 }
 965
 966 gunichar2 *
 967 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 968 {
 969         return g_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, err);
 970 }
 971
 972 gunichar2 *
 973 g_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 974 {
 975         return g_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, err);
 976 }
 977
 978 gunichar *
 979 g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 980 {
 981         gunichar *outbuf, *outptr;
 982         size_t outlen = 0;
 983         size_t inleft;
 984         char *inptr;
 985         gunichar c;
 986         int n;
 987
 988         g_return_val_if_fail (str != NULL, NULL);
 989
 990         if (len < 0)
 991                 len = strlen (str);
 992
 993         inptr = (char *) str;
 994         inleft = len;
 995
 996         while (inleft > 0) {
 997                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
 998                         if (errno == EILSEQ) {
 999                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1000                                              "Illegal byte sequence encounted in the input.");
1001                         } else if (items_read) {
1002                                 /* partial input is ok if we can let our caller know... */
1003                                 break;
1004                         } else {
1005                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1006                                              "Partial byte sequence encountered in the input.");
1007                         }
1008
1009                         if (items_read)
1010                                 *items_read = inptr - str;
1011
1012                         if (items_written)
1013                                 *items_written = 0;
1014
1015                         return NULL;
1016                 } else if (c == 0)
1017                         break;
1018
1019                 outlen += 4;
1020                 inleft -= n;
1021                 inptr += n;
1022         }
1023
1024         if (items_written)
1025                 *items_written = outlen / 4;
1026
1027         if (items_read)
1028                 *items_read = inptr - str;
1029
1030         outptr = outbuf = g_malloc (outlen + 4);
1031         inptr = (char *) str;
1032         inleft = len;
1033
1034         while (inleft > 0) {
1035                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
1036                         break;
1037                 else if (c == 0)
1038                         break;
1039
1040                 *outptr++ = c;
1041                 inleft -= n;
1042                 inptr += n;
1043         }
1044
1045         *outptr = 0;
1046
1047         return outbuf;
1048 }
1049
1050 gchar *
1051 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1052 {
1053         char *inptr, *outbuf, *outptr;
1054         size_t outlen = 0;
1055         size_t inleft;
1056         gunichar c;
1057         int n;
1058
1059         g_return_val_if_fail (str != NULL, NULL);
1060
1061         if (len < 0) {
1062                 len = 0;
1063                 while (str[len])
1064                         len++;
1065         }
1066
1067         inptr = (char *) str;
1068         inleft = len * 2;
1069
1070         while (inleft > 0) {
1071                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1072                         if (n == -2 && inleft > 2) {
1073                                 /* This means that the first UTF-16 char was read, but second failed */
1074                                 inleft -= 2;
1075                                 inptr += 2;
1076                         }
1077
1078                         if (errno == EILSEQ) {
1079                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1080                                              "Illegal byte sequence encounted in the input.");
1081                         } else if (items_read) {
1082                                 /* partial input is ok if we can let our caller know... */
1083                                 break;
1084                         } else {
1085                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1086                                              "Partial byte sequence encountered in the input.");
1087                         }
1088
1089                         if (items_read)
1090                                 *items_read = (inptr - (char *) str) / 2;
1091
1092                         if (items_written)
1093                                 *items_written = 0;
1094
1095                         return NULL;
1096                 } else if (c == 0)
1097                         break;
1098
1099                 outlen += g_unichar_to_utf8 (c, NULL);
1100                 inleft -= n;
1101                 inptr += n;
1102         }
1103
1104         if (items_read)
1105                 *items_read = (inptr - (char *) str) / 2;
1106
1107         if (items_written)
1108                 *items_written = outlen;
1109
1110         outptr = outbuf = g_malloc (outlen + 1);
1111         inptr = (char *) str;
1112         inleft = len * 2;
1113
1114         while (inleft > 0) {
1115                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1116                         break;
1117                 else if (c == 0)
1118                         break;
1119
1120                 outptr += g_unichar_to_utf8 (c, outptr);
1121                 inleft -= n;
1122                 inptr += n;
1123         }
1124
1125         *outptr = '\0';
1126
1127         return outbuf;
1128 }
1129
1130 gunichar *
1131 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1132 {
1133         gunichar *outbuf, *outptr;
1134         size_t outlen = 0;
1135         size_t inleft;
1136         char *inptr;
1137         gunichar c;
1138         int n;
1139
1140         g_return_val_if_fail (str != NULL, NULL);
1141
1142         if (len < 0) {
1143                 len = 0;
1144                 while (str[len])
1145                         len++;
1146         }
1147
1148         inptr = (char *) str;
1149         inleft = len * 2;
1150
1151         while (inleft > 0) {
1152                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1153                         if (n == -2 && inleft > 2) {
1154                                 /* This means that the first UTF-16 char was read, but second failed */
1155                                 inleft -= 2;
1156                                 inptr += 2;
1157                         }
1158
1159                         if (errno == EILSEQ) {
1160                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1161                                              "Illegal byte sequence encounted in the input.");
1162                         } else if (items_read) {
1163                                 /* partial input is ok if we can let our caller know... */
1164                                 break;
1165                         } else {
1166                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1167                                              "Partial byte sequence encountered in the input.");
1168                         }
1169
1170                         if (items_read)
1171                                 *items_read = (inptr - (char *) str) / 2;
1172
1173                         if (items_written)
1174                                 *items_written = 0;
1175
1176                         return NULL;
1177                 } else if (c == 0)
1178                         break;
1179
1180                 outlen += 4;
1181                 inleft -= n;
1182                 inptr += n;
1183         }
1184
1185         if (items_read)
1186                 *items_read = (inptr - (char *) str) / 2;
1187
1188         if (items_written)
1189                 *items_written = outlen / 4;
1190
1191         outptr = outbuf = g_malloc (outlen + 4);
1192         inptr = (char *) str;
1193         inleft = len * 2;
1194
1195         while (inleft > 0) {
1196                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1197                         break;
1198                 else if (c == 0)
1199                         break;
1200
1201                 *outptr++ = c;
1202                 inleft -= n;
1203                 inptr += n;
1204         }
1205
1206         *outptr = 0;
1207
1208         return outbuf;
1209 }
1210
1211 gchar *
1212 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1213 {
1214         char *outbuf, *outptr;
1215         size_t outlen = 0;
1216         glong i;
1217         int n;
1218
1219         g_return_val_if_fail (str != NULL, NULL);
1220
1221         if (len < 0) {
1222                 for (i = 0; str[i] != 0; i++) {
1223                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1224                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1225                                              "Illegal byte sequence encounted in the input.");
1226
1227                                 if (items_written)
1228                                         *items_written = 0;
1229
1230                                 if (items_read)
1231                                         *items_read = i;
1232
1233                                 return NULL;
1234                         }
1235
1236                         outlen += n;
1237                 }
1238         } else {
1239                 for (i = 0; i < len && str[i] != 0; i++) {
1240                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1241                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1242                                              "Illegal byte sequence encounted in the input.");
1243
1244                                 if (items_written)
1245                                         *items_written = 0;
1246
1247                                 if (items_read)
1248                                         *items_read = i;
1249
1250                                 return NULL;
1251                         }
1252
1253                         outlen += n;
1254                 }
1255         }
1256
1257         len = i;
1258
1259         outptr = outbuf = g_malloc (outlen + 1);
1260         for (i = 0; i < len; i++)
1261                 outptr += g_unichar_to_utf8 (str[i], outptr);
1262         *outptr = 0;
1263
1264         if (items_written)
1265                 *items_written = outlen;
1266
1267         if (items_read)
1268                 *items_read = i;
1269
1270         return outbuf;
1271 }
1272
1273 gunichar2 *
1274 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1275 {
1276         gunichar2 *outbuf, *outptr;
1277         size_t outlen = 0;
1278         glong i;
1279         int n;
1280
1281         g_return_val_if_fail (str != NULL, NULL);
1282
1283         if (len < 0) {
1284                 for (i = 0; str[i] != 0; i++) {
1285                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1286                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1287                                              "Illegal byte sequence encounted in the input.");
1288
1289                                 if (items_written)
1290                                         *items_written = 0;
1291
1292                                 if (items_read)
1293                                         *items_read = i;
1294
1295                                 return NULL;
1296                         }
1297
1298                         outlen += n;
1299                 }
1300         } else {
1301                 for (i = 0; i < len && str[i] != 0; i++) {
1302                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1303                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1304                                              "Illegal byte sequence encounted in the input.");
1305
1306                                 if (items_written)
1307                                         *items_written = 0;
1308
1309                                 if (items_read)
1310                                         *items_read = i;
1311
1312                                 return NULL;
1313                         }
1314
1315                         outlen += n;
1316                 }
1317         }
1318
1319         len = i;
1320
1321         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
1322         for (i = 0; i < len; i++)
1323                 outptr += g_unichar_to_utf16 (str[i], outptr);
1324         *outptr = 0;
1325
1326         if (items_written)
1327                 *items_written = outlen;
1328
1329         if (items_read)
1330                 *items_read = i;
1331
1332         return outbuf;
1333 }