eglib/src/giconv.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Copyright (C) 2011 Jeffrey Stedfast
   4  *
   5  *  Permission is hereby granted, free of charge, to any person
   6  *  obtaining a copy of this software and associated documentation
   7  *  files (the "Software"), to deal in the Software without
   8  *  restriction, including without limitation the rights to use, copy,
   9  *  modify, merge, publish, distribute, sublicense, and/or sell copies
  10  *  of the Software, and to permit persons to whom the Software is
  11  *  furnished to do so, subject to the following conditions:
  12  *
  13  *  The above copyright notice and this permission notice shall be
  14  *  included in all copies or substantial portions of the Software.
  15  *
  16  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  20  *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  21  *  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23  *  DEALINGS IN THE SOFTWARE.
  24  */
  25
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include <glib.h>
  31 #include <string.h>
  32 #ifdef HAVE_ICONV_H
  33 #include <iconv.h>
  34 #endif
  35 #include <errno.h>
  36
/* Compile-time switches: when non-zero, decode_utf8() / encode_utf8()
 * process the trailing bytes of a sequence with an unrolled switch
 * statement instead of a loop.  Both are off (loop form) here. */
#define UNROLL_DECODE_UTF8 0
#define UNROLL_ENCODE_UTF8 0
  39
/* Decoder: reads one character from the inleft bytes at inbuf and stores its
 * code point in *outchar.  The decoders in this file return the number of
 * bytes consumed, or a negative value with errno set on failure. */
typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
/* Encoder: writes code point c into the outleft bytes at outbuf.  The
 * encoders in this file return the number of bytes written, or a negative
 * value with errno set on failure. */
typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
  42
/* State behind a GIConv conversion descriptor. */
struct _GIConv {
        /* built-in decoder for the source charset (NULL if none matched) */
        Decoder decode;
        /* built-in encoder for the target charset (NULL if none matched) */
        Encoder encode;
        /* code point decoded but not yet encoded; (gunichar) -1 when none */
        gunichar c;
#ifdef HAVE_ICONV
        /* native iconv descriptor, or (iconv_t) -1 when built-ins are used */
        iconv_t cd;
#endif
};
  51
/* Forward declarations of the built-in codecs (defined further down) so
 * that the charsets table can reference them. */
static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);

static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);

static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);

static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);

static int decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);

static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
  69
/* The UTF-16/UTF-32 codecs without an explicit byte-order suffix are
 * aliases for the variant matching the host byte order. */
#if G_BYTE_ORDER == G_LITTLE_ENDIAN
#define decode_utf32 decode_utf32le
#define encode_utf32 encode_utf32le
#define decode_utf16 decode_utf16le
#define encode_utf16 encode_utf16le
#else
#define decode_utf32 decode_utf32be
#define encode_utf32 encode_utf32be
#define decode_utf16 decode_utf16be
#define encode_utf16 encode_utf16be
#endif
  81
/* Charset names served by the built-in codecs.  g_iconv_open() scans the
 * whole table and compares names case-insensitively.
 *
 * NOTE(review): "US-ASCII" and "ASCII" are mapped to the Latin-1 codecs,
 * which pass through every value up to 0xff -- confirm that accepting
 * values above 0x7f for ASCII is intended. */
static struct {
        const char *name;
        Decoder decoder;
        Encoder encoder;
} charsets[] = {
        { "ISO-8859-1", decode_latin1,  encode_latin1  },
        { "ISO8859-1",  decode_latin1,  encode_latin1  },
        { "UTF-32BE",   decode_utf32be, encode_utf32be },
        { "UTF-32LE",   decode_utf32le, encode_utf32le },
        { "UTF-16BE",   decode_utf16be, encode_utf16be },
        { "UTF-16LE",   decode_utf16le, encode_utf16le },
        { "UTF-32",     decode_utf32,   encode_utf32   },
        { "UTF-16",     decode_utf16,   encode_utf16   },
        { "UTF-8",      decode_utf8,    encode_utf8    },
        { "US-ASCII",   decode_latin1,  encode_latin1  },
        { "Latin1",     decode_latin1,  encode_latin1  },
        { "ASCII",      decode_latin1,  encode_latin1  },
        { "UTF32",      decode_utf32,   encode_utf32   },
        { "UTF16",      decode_utf16,   encode_utf16   },
        { "UTF8",       decode_utf8,    encode_utf8    },
};
 103
 104
 105 GIConv
 106 g_iconv_open (const char *to_charset, const char *from_charset)
 107 {
 108 #ifdef HAVE_ICONV
 109         iconv_t icd = (iconv_t) -1;
 110 #endif
 111         Decoder decoder = NULL;
 112         Encoder encoder = NULL;
 113         GIConv cd;
 114         guint i;
 115
 116         if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
 117                 errno = EINVAL;
 118
 119                 return (GIConv) -1;
 120         }
 121
 122         for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
 123                 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
 124                         decoder = charsets[i].decoder;
 125
 126                 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
 127                         encoder = charsets[i].encoder;
 128         }
 129
 130         if (!encoder || !decoder) {
 131 #ifdef HAVE_ICONV
 132                 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
 133                         return (GIConv) -1;
 134 #else
 135                 errno = EINVAL;
 136
 137                 return (GIConv) -1;
 138 #endif
 139         }
 140
 141         cd = (GIConv) g_malloc (sizeof (struct _GIConv));
 142         cd->decode = decoder;
 143         cd->encode = encoder;
 144         cd->c = -1;
 145
 146 #ifdef HAVE_ICONV
 147         cd->cd = icd;
 148 #endif
 149
 150         return cd;
 151 }
 152
 153 int
 154 g_iconv_close (GIConv cd)
 155 {
 156 #ifdef HAVE_ICONV
 157         if (cd->cd != (iconv_t) -1)
 158                 iconv_close (cd->cd);
 159 #endif
 160
 161         g_free (cd);
 162
 163         return 0;
 164 }
 165
/*
 * g_iconv:
 * @cd: conversion descriptor
 * @inbytes: in/out pointer to the input cursor (may be NULL)
 * @inbytesleft: in/out count of input bytes remaining (may be NULL)
 * @outbytes: in/out pointer to the output cursor
 * @outbytesleft: in/out count of output bytes available
 *
 * Converts as much input as possible.  When @cd wraps a native iconv
 * descriptor the call is forwarded to iconv() unchanged.  Otherwise each
 * character is decoded with cd->decode and re-encoded with cd->encode.  A
 * character that was decoded but could not be encoded is kept in cd->c and
 * is emitted first on the next call.
 *
 * Passing NULL for @outbytes or @outbytesleft resets the pending state.
 *
 * Returns 0 on success, or (gsize) -1 when a codec reported an error (errno
 * is left as the codec set it).  The cursors and counts are updated in
 * either case.
 */
gsize
g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
         gchar **outbytes, gsize *outbytesleft)
{
        size_t inleft, outleft;
        char *inptr, *outptr;
        gunichar c;
        int rc = 0;

#ifdef HAVE_ICONV
        if (cd->cd != (iconv_t) -1)
                return iconv (cd->cd, inbytes, inbytesleft, outbytes, outbytesleft);
#endif

        if (outbytes == NULL || outbytesleft == NULL) {
                /* reset converter */
                cd->c = -1;
                return 0;
        }

        /* a NULL input side is treated as "no more input" */
        inleft = inbytesleft ? *inbytesleft : 0;
        inptr = inbytes ? *inbytes : NULL;
        outleft = *outbytesleft;
        outptr = *outbytes;

        /* a code point left over from a previous call is encoded before any
         * new input is decoded; this jumps into the middle of the loop body */
        if ((c = cd->c) != (gunichar) -1)
                goto encode;

        while (inleft > 0) {
                if ((rc = cd->decode (inptr, inleft, &c)) < 0)
                        break;

                /* the input is consumed before the encode attempt, so on an
                 * encode failure c stays pending and the cursor is past it */
                inleft -= rc;
                inptr += rc;

        encode:
                if ((rc = cd->encode (c, outptr, outleft)) < 0)
                        break;

                /* written out: nothing pending any more */
                c = (gunichar) -1;
                outleft -= rc;
                outptr += rc;
        }

        if (inbytesleft)
                *inbytesleft = inleft;

        if (inbytes)
                *inbytes = inptr;

        *outbytesleft = outleft;
        *outbytes = outptr;
        /* remember a decoded-but-unwritten code point (or -1) for next time */
        cd->c = c;

        return rc < 0 ? -1 : 0;
}
 222
 223 /*
 224  * Unicode encoders and decoders
 225  */
 226
 227 static int
 228 decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
 229 {
 230         unsigned char *inptr = (unsigned char *) inbuf;
 231         gunichar c;
 232
 233         if (inleft < 4) {
 234                 errno = EINVAL;
 235                 return -1;
 236         }
 237
 238         c = (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
 239
 240         if (c >= 0xd800 && c < 0xe000) {
 241                 errno = EILSEQ;
 242                 return -1;
 243         } else if (c >= 0x110000) {
 244                 errno = EILSEQ;
 245                 return -1;
 246         }
 247
 248         *outchar = c;
 249
 250         return 4;
 251 }
 252
 253 static int
 254 decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
 255 {
 256         unsigned char *inptr = (unsigned char *) inbuf;
 257         gunichar c;
 258
 259         if (inleft < 4) {
 260                 errno = EINVAL;
 261                 return -1;
 262         }
 263
 264         c = (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
 265
 266         if (c >= 0xd800 && c < 0xe000) {
 267                 errno = EILSEQ;
 268                 return -1;
 269         } else if (c >= 0x110000) {
 270                 errno = EILSEQ;
 271                 return -1;
 272         }
 273
 274         *outchar = c;
 275
 276         return 4;
 277 }
 278
 279 static int
 280 encode_utf32be (gunichar c, char *outbuf, size_t outleft)
 281 {
 282         unsigned char *outptr = (unsigned char *) outbuf;
 283
 284         if (outleft < 4) {
 285                 errno = E2BIG;
 286                 return -1;
 287         }
 288
 289         outptr[0] = (c >> 24) & 0xff;
 290         outptr[1] = (c >> 16) & 0xff;
 291         outptr[2] = (c >> 8) & 0xff;
 292         outptr[3] = c & 0xff;
 293
 294         return 4;
 295 }
 296
 297 static int
 298 encode_utf32le (gunichar c, char *outbuf, size_t outleft)
 299 {
 300         unsigned char *outptr = (unsigned char *) outbuf;
 301
 302         if (outleft < 4) {
 303                 errno = E2BIG;
 304                 return -1;
 305         }
 306
 307         outptr[0] = c & 0xff;
 308         outptr[1] = (c >> 8) & 0xff;
 309         outptr[2] = (c >> 16) & 0xff;
 310         outptr[3] = (c >> 24) & 0xff;
 311
 312         return 4;
 313 }
 314
 315 static int
 316 decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
 317 {
 318         unsigned char *inptr = (unsigned char *) inbuf;
 319         gunichar2 c;
 320         gunichar u;
 321
 322         if (inleft < 2) {
 323                 errno = EINVAL;
 324                 return -1;
 325         }
 326
 327         u = (inptr[0] << 8) | inptr[1];
 328
 329         if (u < 0xd800) {
 330                 /* 0x0000 -> 0xd7ff */
 331                 *outchar = u;
 332                 return 2;
 333         } else if (u < 0xdc00) {
 334                 /* 0xd800 -> 0xdbff */
 335                 if (inleft < 4) {
 336                         errno = EINVAL;
 337                         return -2;
 338                 }
 339
 340                 c = (inptr[2] << 8) | inptr[3];
 341
 342                 if (c < 0xdc00 || c > 0xdfff) {
 343                         errno = EILSEQ;
 344                         return -2;
 345                 }
 346
 347                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 348                 *outchar = u;
 349
 350                 return 4;
 351         } else if (u < 0xe000) {
 352                 /* 0xdc00 -> 0xdfff */
 353                 errno = EILSEQ;
 354                 return -1;
 355         } else {
 356                 /* 0xe000 -> 0xffff */
 357                 *outchar = u;
 358                 return 2;
 359         }
 360 }
 361
 362 static int
 363 decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
 364 {
 365         unsigned char *inptr = (unsigned char *) inbuf;
 366         gunichar2 c;
 367         gunichar u;
 368
 369         if (inleft < 2) {
 370                 errno = EINVAL;
 371                 return -1;
 372         }
 373
 374         u = (inptr[1] << 8) | inptr[0];
 375
 376         if (u < 0xd800) {
 377                 /* 0x0000 -> 0xd7ff */
 378                 *outchar = u;
 379                 return 2;
 380         } else if (u < 0xdc00) {
 381                 /* 0xd800 -> 0xdbff */
 382                 if (inleft < 4) {
 383                         errno = EINVAL;
 384                         return -2;
 385                 }
 386
 387                 c = (inptr[3] << 8) | inptr[2];
 388
 389                 if (c < 0xdc00 || c > 0xdfff) {
 390                         errno = EILSEQ;
 391                         return -2;
 392                 }
 393
 394                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 395                 *outchar = u;
 396
 397                 return 4;
 398         } else if (u < 0xe000) {
 399                 /* 0xdc00 -> 0xdfff */
 400                 errno = EILSEQ;
 401                 return -1;
 402         } else {
 403                 /* 0xe000 -> 0xffff */
 404                 *outchar = u;
 405                 return 2;
 406         }
 407 }
 408
 409 static int
 410 encode_utf16be (gunichar c, char *outbuf, size_t outleft)
 411 {
 412         unsigned char *outptr = (unsigned char *) outbuf;
 413         gunichar2 ch;
 414         gunichar c2;
 415
 416         if (c < 0x10000) {
 417                 if (outleft < 2) {
 418                         errno = E2BIG;
 419                         return -1;
 420                 }
 421
 422                 outptr[0] = (c >> 8) & 0xff;
 423                 outptr[1] = c & 0xff;
 424
 425                 return 2;
 426         } else {
 427                 if (outleft < 4) {
 428                         errno = E2BIG;
 429                         return -1;
 430                 }
 431
 432                 c2 = c - 0x10000;
 433
 434                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 435                 outptr[0] = (ch >> 8) & 0xff;
 436                 outptr[1] = ch & 0xff;
 437
 438                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 439                 outptr[2] = (ch >> 8) & 0xff;
 440                 outptr[3] = ch & 0xff;
 441
 442                 return 4;
 443         }
 444 }
 445
 446 static int
 447 encode_utf16le (gunichar c, char *outbuf, size_t outleft)
 448 {
 449         unsigned char *outptr = (unsigned char *) outbuf;
 450         gunichar2 ch;
 451         gunichar c2;
 452
 453         if (c < 0x10000) {
 454                 if (outleft < 2) {
 455                         errno = E2BIG;
 456                         return -1;
 457                 }
 458
 459                 outptr[0] = c & 0xff;
 460                 outptr[1] = (c >> 8) & 0xff;
 461
 462                 return 2;
 463         } else {
 464                 if (outleft < 4) {
 465                         errno = E2BIG;
 466                         return -1;
 467                 }
 468
 469                 c2 = c - 0x10000;
 470
 471                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 472                 outptr[0] = ch & 0xff;
 473                 outptr[1] = (ch >> 8) & 0xff;
 474
 475                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 476                 outptr[2] = ch & 0xff;
 477                 outptr[3] = (ch >> 8) & 0xff;
 478
 479                 return 4;
 480         }
 481 }
 482
 483 static int
 484 decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
 485 {
 486         unsigned char *inptr = (unsigned char *) inbuf;
 487         gunichar u;
 488         int n, i;
 489
 490         u = *inptr;
 491
 492         if (u < 0x80) {
 493                 /* simple ascii case */
 494                 *outchar = u;
 495                 return 1;
 496         } else if (u < 0xc2) {
 497                 errno = EILSEQ;
 498                 return -1;
 499         } else if (u < 0xe0) {
 500                 u &= 0x1f;
 501                 n = 2;
 502         } else if (u < 0xf0) {
 503                 u &= 0x0f;
 504                 n = 3;
 505         } else if (u < 0xf8) {
 506                 u &= 0x07;
 507                 n = 4;
 508         } else if (u < 0xfc) {
 509                 u &= 0x03;
 510                 n = 5;
 511         } else if (u < 0xfe) {
 512                 u &= 0x01;
 513                 n = 6;
 514         } else {
 515                 errno = EILSEQ;
 516                 return -1;
 517         }
 518
 519         if (n > inleft) {
 520                 errno = EINVAL;
 521                 return -1;
 522         }
 523
 524 #if UNROLL_DECODE_UTF8
 525         switch (n) {
 526         case 6: u = (u << 6) | (*++inptr ^ 0x80);
 527         case 5: u = (u << 6) | (*++inptr ^ 0x80);
 528         case 4: u = (u << 6) | (*++inptr ^ 0x80);
 529         case 3: u = (u << 6) | (*++inptr ^ 0x80);
 530         case 2: u = (u << 6) | (*++inptr ^ 0x80);
 531         }
 532 #else
 533         for (i = 1; i < n; i++)
 534                 u = (u << 6) | (*++inptr ^ 0x80);
 535 #endif
 536
 537         *outchar = u;
 538
 539         return n;
 540 }
 541
 542 static int
 543 encode_utf8 (gunichar c, char *outbuf, size_t outleft)
 544 {
 545         unsigned char *outptr = (unsigned char *) outbuf;
 546         int base, n, i;
 547
 548         if (c < 0x80) {
 549                 outptr[0] = c;
 550                 return 1;
 551         } else if (c < 0x800) {
 552                 base = 192;
 553                 n = 2;
 554         } else if (c < 0x10000) {
 555                 base = 224;
 556                 n = 3;
 557         } else if (c < 0x200000) {
 558                 base = 240;
 559                 n = 4;
 560         } else if (c < 0x4000000) {
 561                 base = 248;
 562                 n = 5;
 563         } else {
 564                 base = 252;
 565                 n = 6;
 566         }
 567
 568         if (outleft < n) {
 569                 errno = E2BIG;
 570                 return -1;
 571         }
 572
 573 #if UNROLL_ENCODE_UTF8
 574         switch (n) {
 575         case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
 576         case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
 577         case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
 578         case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
 579         case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
 580         case 1: outptr[0] = c | base;
 581         }
 582 #else
 583         for (i = n - 1; i > 0; i--) {
 584                 outptr[i] = (c & 0x3f) | 0x80;
 585                 c >>= 6;
 586         }
 587
 588         outptr[0] = c | base;
 589 #endif
 590
 591         return n;
 592 }
 593
 594 static int
 595 decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
 596 {
 597         *outchar = (unsigned char) *inbuf;
 598         return 1;
 599 }
 600
 601 static int
 602 encode_latin1 (gunichar c, char *outbuf, size_t outleft)
 603 {
 604         if (outleft < 1) {
 605                 errno = E2BIG;
 606                 return -1;
 607         }
 608
 609         if (c > 0xff) {
 610                 errno = EILSEQ;
 611                 return -1;
 612         }
 613
 614         *outbuf = (char) c;
 615
 616         return 1;
 617 }
 618
 619
 620 /*
 621  * Simple conversion API
 622  */
 623
/* Pointer handed out by g_convert_error_quark(); it points at a constant
 * string.  Presumably this backs the G_CONVERT_ERROR domain passed to
 * g_set_error() in this file -- confirm against the header. */
static gpointer error_quark = "ConvertError";

/* Returns the conversion error domain identifier (same pointer each call). */
gpointer
g_convert_error_quark (void)
{
        return error_quark;
}
 631
/*
 * g_convert:
 * @str: input bytes in @from_charset
 * @len: length of @str in bytes, or negative to use strlen()
 * @to_charset: name of the target character set
 * @from_charset: name of the source character set
 * @bytes_read: if non-NULL, receives the number of input bytes consumed
 * @bytes_written: if non-NULL, receives the number of output bytes produced
 *                 (the terminator is not counted)
 * @err: location for error details
 *
 * Converts @str into a newly allocated buffer that is terminated by four
 * zero bytes.  The buffer is grown whenever g_iconv() reports E2BIG.
 * Incomplete trailing input (EINVAL) is not an error: conversion stops
 * there and @bytes_read tells the caller how far it got.
 *
 * Returns the converted buffer (to be released with g_free()), or NULL with
 * @err set when the conversion is unsupported, the input contains an
 * illegal sequence, or g_iconv() fails with an unexpected errno.
 */
gchar *
g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
           gsize *bytes_read, gsize *bytes_written, GError **err)
{
        size_t outsize, outused, outleft, inleft, grow, rc;
        char *result, *outbuf, *inbuf;
        gboolean flush = FALSE;
        gboolean done = FALSE;
        GIConv cd;

        g_return_val_if_fail (str != NULL, NULL);
        g_return_val_if_fail (to_charset != NULL, NULL);
        g_return_val_if_fail (from_charset != NULL, NULL);

        if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
                g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
                             "Conversion from %s to %s not supported.",
                             from_charset, to_charset);

                if (bytes_written)
                        *bytes_written = 0;

                if (bytes_read)
                        *bytes_read = 0;

                return NULL;
        }

        inleft = len < 0 ? strlen (str) : len;
        inbuf = (char *) str;

        /* first guess: as many output bytes as input bytes (at least 8);
         * the 4 extra bytes are reserved for the terminator */
        outleft = outsize = MAX (inleft, 8);
        outbuf = result = g_malloc (outsize + 4);

        /* two phases: convert the input, then (flush == TRUE) call g_iconv()
         * without input so that any pending output gets written */
        do {
                if (!flush)
                        rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
                else
                        rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);

                if (rc == (size_t) -1) {
                        switch (errno) {
                        case E2BIG:
                                /* grow our result buffer */
                                grow = MAX (inleft, 8) << 1;
                                /* keep the write position as an offset, since
                                 * g_realloc() may move the buffer */
                                outused = outbuf - result;
                                outsize += grow;
                                outleft += grow;
                                result = g_realloc (result, outsize + 4);
                                outbuf = result + outused;
                                break;
                        case EINVAL:
                                /* incomplete input, stop converting and terminate here */
                                if (flush)
                                        done = TRUE;
                                else
                                        flush = TRUE;
                                break;
                        case EILSEQ:
                                /* illegal sequence in the input */
                                g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));

                                if (bytes_read) {
                                        /* save offset of the illegal input sequence */
                                        /* NOTE(review): with the built-in codecs
                                         * g_iconv() advances the input before it
                                         * encodes, so for an EILSEQ raised by the
                                         * encoder this offset appears to lie just
                                         * past the offending character -- confirm */
                                        *bytes_read = (inbuf - str);
                                }

                                if (bytes_written)
                                        *bytes_written = 0;

                                g_iconv_close (cd);
                                g_free (result);
                                return NULL;
                        default:
                                /* unknown errno */
                                g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));

                                if (bytes_written)
                                        *bytes_written = 0;

                                if (bytes_read)
                                        *bytes_read = 0;

                                g_iconv_close (cd);
                                g_free (result);
                                return NULL;
                        }
                } else if (flush) {
                        /* input has been converted and output has been flushed */
                        break;
                } else {
                        /* input has been converted, need to flush the output */
                        flush = TRUE;
                }
        } while (!done);

        g_iconv_close (cd);

        /* Note: not all charsets can be null-terminated with a single
           null byte. UCS2, for example, needs 2 null bytes and UCS4
           needs 4. I hope that 4 null bytes is enough to terminate all
           multibyte charsets? */

        /* null-terminate the result */
        memset (outbuf, 0, 4);

        if (bytes_written)
                *bytes_written = outbuf - result;

        if (bytes_read)
                *bytes_read = inbuf - str;

        return result;
}
 746
 747
 748 /*
 749  * Unicode conversion
 750  */
 751
 752 /**
 753  * from http://home.tiscali.nl/t876506/utf8tbl.html
 754  *
 755  * From Unicode UCS-4 to UTF-8:
 756  * Start with the Unicode number expressed as a decimal number and call this ud.
 757  *
 758  * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
 759  *
 760  * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
 761  *    byte 1 = 192 + (ud div 64)
 762  *    byte 2 = 128 + (ud mod 64)
 763  *
 764  * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
 765  *    byte 1 = 224 + (ud div 4096)
 766  *    byte 2 = 128 + ((ud div 64) mod 64)
 767  *    byte 3 = 128 + (ud mod 64)
 768  *
 769  * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
 770  *    byte 1 = 240 + (ud div 262144)
 771  *    byte 2 = 128 + ((ud div 4096) mod 64)
 772  *    byte 3 = 128 + ((ud div 64) mod 64)
 773  *    byte 4 = 128 + (ud mod 64)
 774  *
 775  * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
 776  *    byte 1 = 248 + (ud div 16777216)
 777  *    byte 2 = 128 + ((ud div 262144) mod 64)
 778  *    byte 3 = 128 + ((ud div 4096) mod 64)
 779  *    byte 4 = 128 + ((ud div 64) mod 64)
 780  *    byte 5 = 128 + (ud mod 64)
 781  *
 782  * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
 783  *    byte 1 = 252 + (ud div 1073741824)
 784  *    byte 2 = 128 + ((ud div 16777216) mod 64)
 785  *    byte 3 = 128 + ((ud div 262144) mod 64)
 786  *    byte 4 = 128 + ((ud div 4096) mod 64)
 787  *    byte 5 = 128 + ((ud div 64) mod 64)
 788  *    byte 6 = 128 + (ud mod 64)
 789  **/
 790 gint
 791 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 792 {
 793         int base, n, i;
 794
 795         if (c < 0x80) {
 796                 base = 0;
 797                 n = 1;
 798         } else if (c < 0x800) {
 799                 base = 192;
 800                 n = 2;
 801         } else if (c < 0x10000) {
 802                 base = 224;
 803                 n = 3;
 804         } else if (c < 0x200000) {
 805                 base = 240;
 806                 n = 4;
 807         } else if (c < 0x4000000) {
 808                 base = 248;
 809                 n = 5;
 810         } else if (c < 0x80000000) {
 811                 base = 252;
 812                 n = 6;
 813         } else {
 814                 return -1;
 815         }
 816
 817         if (outbuf != NULL) {
 818                 for (i = n - 1; i > 0; i--) {
 819                         /* mask off 6 bits worth and add 128 */
 820                         outbuf[i] = (c & 0x3f) | 0x80;
 821                         c >>= 6;
 822                 }
 823
 824                 /* first character has a different base */
 825                 outbuf[0] = c | base;
 826         }
 827
 828         return n;
 829 }
 830
 831 static int
 832 g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
 833 {
 834         gunichar c2;
 835
 836         if (c < 0xd800) {
 837                 if (outbuf)
 838                         *outbuf = (gunichar2) c;
 839
 840                 return 1;
 841         } else if (c < 0xe000) {
 842                 return -1;
 843         } else if (c < 0x10000) {
 844                 if (outbuf)
 845                         *outbuf = (gunichar2) c;
 846
 847                 return 1;
 848         } else if (c < 0x110000) {
 849                 if (outbuf) {
 850                         c2 = c - 0x10000;
 851
 852                         outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
 853                         outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 854                 }
 855
 856                 return 2;
 857         } else {
 858                 return -1;
 859         }
 860 }
 861
 862 gunichar *
 863 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
 864 {
 865         gunichar *outbuf, *outptr;
 866         char *inptr;
 867         glong n, i;
 868
 869         g_return_val_if_fail (str != NULL, NULL);
 870
 871         n = g_utf8_strlen (str, len);
 872
 873         if (items_written)
 874                 *items_written = n;
 875
 876         outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
 877         inptr = (char *) str;
 878
 879         for (i = 0; i < n; i++) {
 880                 *outptr++ = g_utf8_get_char (inptr);
 881                 inptr = g_utf8_next_char (inptr);
 882         }
 883
 884         *outptr = 0;
 885
 886         return outbuf;
 887 }
 888
 889 gunichar2 *
 890 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 891 {
 892         gunichar2 *outbuf, *outptr;
 893         size_t outlen = 0;
 894         size_t inleft;
 895         char *inptr;
 896         gunichar c;
 897         int n;
 898
 899         g_return_val_if_fail (str != NULL, NULL);
 900
 901         if (len < 0)
 902                 len = strlen (str);
 903
 904         inptr = (char *) str;
 905         inleft = len;
 906
 907         while (inleft > 0) {
 908                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
 909                         if (errno == EILSEQ) {
 910                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 911                                              "Illegal byte sequence encounted in the input.");
 912                         } else if (items_read) {
 913                                 /* partial input is ok if we can let our caller know... */
 914                                 break;
 915                         } else {
 916                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 917                                              "Partial byte sequence encountered in the input.");
 918                         }
 919
 920                         if (items_read)
 921                                 *items_read = inptr - str;
 922
 923                         if (items_written)
 924                                 *items_written = 0;
 925
 926                         return NULL;
 927                 } else if (c == 0)
 928                         break;
 929
 930                 outlen += g_unichar_to_utf16 (c, NULL);
 931                 inleft -= n;
 932                 inptr += n;
 933         }
 934
 935         if (items_read)
 936                 *items_read = inptr - str;
 937
 938         if (items_written)
 939                 *items_written = outlen;
 940
 941         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
 942         inptr = (char *) str;
 943         inleft = len;
 944
 945         while (inleft > 0) {
 946                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 947                         break;
 948                 else if (c == 0)
 949                         break;
 950
 951                 outptr += g_unichar_to_utf16 (c, outptr);
 952                 inleft -= n;
 953                 inptr += n;
 954         }
 955
 956         *outptr = '\0';
 957
 958         return outbuf;
 959 }
 960
 961 gunichar *
 962 g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 963 {
 964         gunichar *outbuf, *outptr;
 965         size_t outlen = 0;
 966         size_t inleft;
 967         char *inptr;
 968         gunichar c;
 969         int n;
 970
 971         g_return_val_if_fail (str != NULL, NULL);
 972
 973         if (len < 0)
 974                 len = strlen (str);
 975
 976         inptr = (char *) str;
 977         inleft = len;
 978
 979         while (inleft > 0) {
 980                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
 981                         if (errno == EILSEQ) {
 982                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 983                                              "Illegal byte sequence encounted in the input.");
 984                         } else if (items_read) {
 985                                 /* partial input is ok if we can let our caller know... */
 986                                 break;
 987                         } else {
 988                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 989                                              "Partial byte sequence encountered in the input.");
 990                         }
 991
 992                         if (items_read)
 993                                 *items_read = inptr - str;
 994
 995                         if (items_written)
 996                                 *items_written = 0;
 997
 998                         return NULL;
 999                 } else if (c == 0)
1000                         break;
1001
1002                 outlen += 4;
1003                 inleft -= n;
1004                 inptr += n;
1005         }
1006
1007         if (items_written)
1008                 *items_written = outlen / 4;
1009
1010         if (items_read)
1011                 *items_read = inptr - str;
1012
1013         outptr = outbuf = g_malloc (outlen + 4);
1014         inptr = (char *) str;
1015         inleft = len;
1016
1017         while (inleft > 0) {
1018                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
1019                         break;
1020                 else if (c == 0)
1021                         break;
1022
1023                 *outptr++ = c;
1024                 inleft -= n;
1025                 inptr += n;
1026         }
1027
1028         *outptr = 0;
1029
1030         return outbuf;
1031 }
1032
1033 gchar *
1034 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1035 {
1036         char *inptr, *outbuf, *outptr;
1037         size_t outlen = 0;
1038         size_t inleft;
1039         gunichar c;
1040         int n;
1041
1042         g_return_val_if_fail (str != NULL, NULL);
1043
1044         if (len < 0) {
1045                 len = 0;
1046                 while (str[len])
1047                         len++;
1048         }
1049
1050         inptr = (char *) str;
1051         inleft = len * 2;
1052
1053         while (inleft > 0) {
1054                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1055                         if (n == -2 && inleft > 2) {
1056                                 /* This means that the first UTF-16 char was read, but second failed */
1057                                 inleft -= 2;
1058                                 inptr += 2;
1059                         }
1060
1061                         if (errno == EILSEQ) {
1062                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1063                                              "Illegal byte sequence encounted in the input.");
1064                         } else if (items_read) {
1065                                 /* partial input is ok if we can let our caller know... */
1066                                 break;
1067                         } else {
1068                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1069                                              "Partial byte sequence encountered in the input.");
1070                         }
1071
1072                         if (items_read)
1073                                 *items_read = (inptr - (char *) str) / 2;
1074
1075                         if (items_written)
1076                                 *items_written = 0;
1077
1078                         return NULL;
1079                 } else if (c == 0)
1080                         break;
1081
1082                 outlen += g_unichar_to_utf8 (c, NULL);
1083                 inleft -= n;
1084                 inptr += n;
1085         }
1086
1087         if (items_read)
1088                 *items_read = (inptr - (char *) str) / 2;
1089
1090         if (items_written)
1091                 *items_written = outlen;
1092
1093         outptr = outbuf = g_malloc (outlen + 1);
1094         inptr = (char *) str;
1095         inleft = len * 2;
1096
1097         while (inleft > 0) {
1098                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1099                         break;
1100                 else if (c == 0)
1101                         break;
1102
1103                 outptr += g_unichar_to_utf8 (c, outptr);
1104                 inleft -= n;
1105                 inptr += n;
1106         }
1107
1108         *outptr = '\0';
1109
1110         return outbuf;
1111 }
1112
1113 gunichar *
1114 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1115 {
1116         gunichar *outbuf, *outptr;
1117         size_t outlen = 0;
1118         size_t inleft;
1119         char *inptr;
1120         gunichar c;
1121         int n;
1122
1123         g_return_val_if_fail (str != NULL, NULL);
1124
1125         if (len < 0) {
1126                 len = 0;
1127                 while (str[len])
1128                         len++;
1129         }
1130
1131         inptr = (char *) str;
1132         inleft = len * 2;
1133
1134         while (inleft > 0) {
1135                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1136                         if (n == -2 && inleft > 2) {
1137                                 /* This means that the first UTF-16 char was read, but second failed */
1138                                 inleft -= 2;
1139                                 inptr += 2;
1140                         }
1141
1142                         if (errno == EILSEQ) {
1143                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1144                                              "Illegal byte sequence encounted in the input.");
1145                         } else if (items_read) {
1146                                 /* partial input is ok if we can let our caller know... */
1147                                 break;
1148                         } else {
1149                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1150                                              "Partial byte sequence encountered in the input.");
1151                         }
1152
1153                         if (items_read)
1154                                 *items_read = (inptr - (char *) str) / 2;
1155
1156                         if (items_written)
1157                                 *items_written = 0;
1158
1159                         return NULL;
1160                 } else if (c == 0)
1161                         break;
1162
1163                 outlen += 4;
1164                 inleft -= n;
1165                 inptr += n;
1166         }
1167
1168         if (items_read)
1169                 *items_read = (inptr - (char *) str) / 2;
1170
1171         if (items_written)
1172                 *items_written = outlen / 4;
1173
1174         outptr = outbuf = g_malloc (outlen + 4);
1175         inptr = (char *) str;
1176         inleft = len * 2;
1177
1178         while (inleft > 0) {
1179                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1180                         break;
1181                 else if (c == 0)
1182                         break;
1183
1184                 *outptr++ = c;
1185                 inleft -= n;
1186                 inptr += n;
1187         }
1188
1189         *outptr = 0;
1190
1191         return outbuf;
1192 }
1193
1194 gchar *
1195 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1196 {
1197         char *outbuf, *outptr;
1198         size_t outlen = 0;
1199         glong i;
1200         int n;
1201
1202         g_return_val_if_fail (str != NULL, NULL);
1203
1204         if (len < 0) {
1205                 for (i = 0; str[i] != 0; i++) {
1206                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1207                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1208                                              "Illegal byte sequence encounted in the input.");
1209
1210                                 if (items_written)
1211                                         *items_written = 0;
1212
1213                                 if (items_read)
1214                                         *items_read = i;
1215
1216                                 return NULL;
1217                         }
1218
1219                         outlen += n;
1220                 }
1221         } else {
1222                 for (i = 0; i < len && str[i] != 0; i++) {
1223                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1224                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1225                                              "Illegal byte sequence encounted in the input.");
1226
1227                                 if (items_written)
1228                                         *items_written = 0;
1229
1230                                 if (items_read)
1231                                         *items_read = i;
1232
1233                                 return NULL;
1234                         }
1235
1236                         outlen += n;
1237                 }
1238         }
1239
1240         len = i;
1241
1242         outptr = outbuf = g_malloc (outlen + 1);
1243         for (i = 0; i < len; i++)
1244                 outptr += g_unichar_to_utf8 (str[i], outptr);
1245         *outptr = 0;
1246
1247         if (items_written)
1248                 *items_written = outlen;
1249
1250         if (items_read)
1251                 *items_read = i;
1252
1253         return outbuf;
1254 }
1255
1256 gunichar2 *
1257 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1258 {
1259         gunichar2 *outbuf, *outptr;
1260         size_t outlen = 0;
1261         glong i;
1262         int n;
1263
1264         g_return_val_if_fail (str != NULL, NULL);
1265
1266         if (len < 0) {
1267                 for (i = 0; str[i] != 0; i++) {
1268                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1269                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1270                                              "Illegal byte sequence encounted in the input.");
1271
1272                                 if (items_written)
1273                                         *items_written = 0;
1274
1275                                 if (items_read)
1276                                         *items_read = i;
1277
1278                                 return NULL;
1279                         }
1280
1281                         outlen += n;
1282                 }
1283         } else {
1284                 for (i = 0; i < len && str[i] != 0; i++) {
1285                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1286                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1287                                              "Illegal byte sequence encounted in the input.");
1288
1289                                 if (items_written)
1290                                         *items_written = 0;
1291
1292                                 if (items_read)
1293                                         *items_read = i;
1294
1295                                 return NULL;
1296                         }
1297
1298                         outlen += n;
1299                 }
1300         }
1301
1302         len = i;
1303
1304         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
1305         for (i = 0; i < len; i++)
1306                 outptr += g_unichar_to_utf16 (str[i], outptr);
1307         *outptr = 0;
1308
1309         if (items_written)
1310                 *items_written = outlen;
1311
1312         if (items_read)
1313                 *items_read = i;
1314
1315         return outbuf;
1316 }