eglib/src/giconv.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Copyright (C) 2011 Jeffrey Stedfast
   4  *
   5  *  Permission is hereby granted, free of charge, to any person
   6  *  obtaining a copy of this software and associated documentation
   7  *  files (the "Software"), to deal in the Software without
   8  *  restriction, including without limitation the rights to use, copy,
   9  *  modify, merge, publish, distribute, sublicense, and/or sell copies
  10  *  of the Software, and to permit persons to whom the Software is
  11  *  furnished to do so, subject to the following conditions:
  12  *
  13  *  The above copyright notice and this permission notice shall be
  14  *  included in all copies or substantial portions of the Software.
  15  *
  16  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  20  *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  21  *  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23  *  DEALINGS IN THE SOFTWARE.
  24  */
  25
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include <glib.h>
  31 #include <string.h>
  32 #ifdef HAVE_ICONV_H
  33 #include <iconv.h>
  34 #endif
  35 #include <errno.h>
  36
  37 #define UNROLL_DECODE_UTF8 0
  38 #define UNROLL_ENCODE_UTF8 0
  39
  40 typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
  41 typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
  42
  43 struct _GIConv {
  44         Decoder decode;
  45         Encoder encode;
  46         gunichar c;
  47 #ifdef HAVE_ICONV
  48         iconv_t cd;
  49 #endif
  50 };
  51
  52 static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
  53 static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);
  54
  55 static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
  56 static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);
  57
  58 static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
  59 static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);
  60
  61 static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
  62 static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);
  63
  64 static int decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
  65 static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);
  66
  67 static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
  68 static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
  69
  70 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
  71 #define decode_utf32 decode_utf32le
  72 #define encode_utf32 encode_utf32le
  73 #define decode_utf16 decode_utf16le
  74 #define encode_utf16 encode_utf16le
  75 #else
  76 #define decode_utf32 decode_utf32be
  77 #define encode_utf32 encode_utf32be
  78 #define decode_utf16 decode_utf16be
  79 #define encode_utf16 encode_utf16be
  80 #endif
  81
  82 static struct {
  83         const char *name;
  84         Decoder decoder;
  85         Encoder encoder;
  86 } charsets[] = {
  87         { "ISO-8859-1", decode_latin1,  encode_latin1  },
  88         { "ISO8859-1",  decode_latin1,  encode_latin1  },
  89         { "UTF-32BE",   decode_utf32be, encode_utf32be },
  90         { "UTF-32LE",   decode_utf32le, encode_utf32le },
  91         { "UTF-16BE",   decode_utf16be, encode_utf16be },
  92         { "UTF-16LE",   decode_utf16le, encode_utf16le },
  93         { "UTF-32",     decode_utf32,   encode_utf32   },
  94         { "UTF-16",     decode_utf16,   encode_utf16   },
  95         { "UTF-8",      decode_utf8,    encode_utf8    },
  96         { "US-ASCII",   decode_latin1,  encode_latin1  },
  97         { "Latin1",     decode_latin1,  encode_latin1  },
  98         { "ASCII",      decode_latin1,  encode_latin1  },
  99         { "UTF32",      decode_utf32,   encode_utf32   },
 100         { "UTF16",      decode_utf16,   encode_utf16   },
 101         { "UTF8",       decode_utf8,    encode_utf8    },
 102 };
 103
 104
 105 GIConv
 106 g_iconv_open (const char *to_charset, const char *from_charset)
 107 {
 108 #ifdef HAVE_ICONV
 109         iconv_t icd = (iconv_t) -1;
 110 #endif
 111         Decoder decoder = NULL;
 112         Encoder encoder = NULL;
 113         GIConv cd;
 114         guint i;
 115
 116         if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
 117                 errno = EINVAL;
 118
 119                 return (GIConv) -1;
 120         }
 121
 122         for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
 123                 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
 124                         decoder = charsets[i].decoder;
 125
 126                 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
 127                         encoder = charsets[i].encoder;
 128         }
 129
 130         if (!encoder || !decoder) {
 131 #ifdef HAVE_ICONV
 132                 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
 133                         return (GIConv) -1;
 134 #else
 135                 errno = EINVAL;
 136
 137                 return (GIConv) -1;
 138 #endif
 139         }
 140
 141         cd = (GIConv) g_malloc (sizeof (struct _GIConv));
 142         cd->decode = decoder;
 143         cd->encode = encoder;
 144         cd->c = -1;
 145
 146 #ifdef HAVE_ICONV
 147         cd->cd = icd;
 148 #endif
 149
 150         return cd;
 151 }
 152
 153 int
 154 g_iconv_close (GIConv cd)
 155 {
 156 #ifdef HAVE_ICONV
 157         if (cd->cd != (iconv_t) -1)
 158                 iconv_close (cd->cd);
 159 #endif
 160
 161         g_free (cd);
 162
 163         return 0;
 164 }
 165
 166 gsize
 167 g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
 168          gchar **outbytes, gsize *outbytesleft)
 169 {
 170         size_t inleft, outleft;
 171         char *inptr, *outptr;
 172         gunichar c;
 173         int rc = 0;
 174
 175 #ifdef HAVE_ICONV
 176         if (cd->cd != (iconv_t) -1)
 177                 return iconv (cd->cd, inbytes, inbytesleft, outbytes, outbytesleft);
 178 #endif
 179
 180         if (outbytes == NULL || outbytesleft == NULL) {
 181                 /* reset converter */
 182                 cd->c = -1;
 183                 return 0;
 184         }
 185
 186         inleft = inbytesleft ? *inbytesleft : 0;
 187         inptr = inbytes ? *inbytes : NULL;
 188         outleft = *outbytesleft;
 189         outptr = *outbytes;
 190
 191         if ((c = cd->c) != (gunichar) -1)
 192                 goto encode;
 193
 194         while (inleft > 0) {
 195                 if ((rc = cd->decode (inptr, inleft, &c)) < 0)
 196                         break;
 197
 198                 inleft -= rc;
 199                 inptr += rc;
 200
 201         encode:
 202                 if ((rc = cd->encode (c, outptr, outleft)) < 0)
 203                         break;
 204
 205                 c = (gunichar) -1;
 206                 outleft -= rc;
 207                 outptr += rc;
 208         }
 209
 210         if (inbytesleft)
 211                 *inbytesleft = inleft;
 212
 213         if (inbytes)
 214                 *inbytes = inptr;
 215
 216         *outbytesleft = outleft;
 217         *outbytes = outptr;
 218         cd->c = c;
 219
 220         return rc < 0 ? -1 : 0;
 221 }
 222
 223 /*
 224  * Unicode encoders and decoders
 225  */
 226
 227 static int
 228 decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
 229 {
 230         gunichar *inptr = (gunichar *) inbuf;
 231         gunichar c;
 232
 233         if (inleft < 4) {
 234                 errno = EINVAL;
 235                 return -1;
 236         }
 237
 238         c = GUINT32_FROM_BE (*inptr);
 239
 240         if (c >= 0xd800 && c < 0xe000) {
 241                 errno = EILSEQ;
 242                 return -1;
 243         } else if (c >= 0x110000) {
 244                 errno = EILSEQ;
 245                 return -1;
 246         }
 247
 248         *outchar = c;
 249
 250         return 4;
 251 }
 252
 253 static int
 254 decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
 255 {
 256         gunichar *inptr = (gunichar *) inbuf;
 257         gunichar c;
 258
 259         if (inleft < 4) {
 260                 errno = EINVAL;
 261                 return -1;
 262         }
 263
 264         c = GUINT32_FROM_LE (*inptr);
 265
 266         if (c >= 0xd800 && c < 0xe000) {
 267                 errno = EILSEQ;
 268                 return -1;
 269         } else if (c >= 0x110000) {
 270                 errno = EILSEQ;
 271                 return -1;
 272         }
 273
 274         *outchar = c;
 275
 276         return 4;
 277 }
 278
 279 static int
 280 encode_utf32be (gunichar c, char *outbuf, size_t outleft)
 281 {
 282         gunichar *outptr = (gunichar *) outbuf;
 283
 284         if (outleft < 4) {
 285                 errno = E2BIG;
 286                 return -1;
 287         }
 288
 289         *outptr = GUINT32_TO_BE (c);
 290
 291         return 4;
 292 }
 293
 294 static int
 295 encode_utf32le (gunichar c, char *outbuf, size_t outleft)
 296 {
 297         gunichar *outptr = (gunichar *) outbuf;
 298
 299         if (outleft < 4) {
 300                 errno = E2BIG;
 301                 return -1;
 302         }
 303
 304         *outptr = GUINT32_TO_LE (c);
 305
 306         return 4;
 307 }
 308
 309 static int
 310 decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
 311 {
 312         gunichar2 *inptr = (gunichar2 *) inbuf;
 313         gunichar2 c;
 314         gunichar u;
 315
 316         if (inleft < 2) {
 317                 errno = EINVAL;
 318                 return -1;
 319         }
 320
 321         u = GUINT16_FROM_BE (*inptr);
 322
 323         if (u < 0xd800) {
 324                 /* 0x0000 -> 0xd7ff */
 325                 *outchar = u;
 326                 return 2;
 327         } else if (u < 0xdc00) {
 328                 /* 0xd800 -> 0xdbff */
 329                 if (inleft < 4) {
 330                         errno = EINVAL;
 331                         return -2;
 332                 }
 333
 334                 c = GUINT16_FROM_BE (inptr[1]);
 335
 336                 if (c < 0xdc00 || c > 0xdfff) {
 337                         errno = EILSEQ;
 338                         return -2;
 339                 }
 340
 341                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 342                 *outchar = u;
 343
 344                 return 4;
 345         } else if (u < 0xe000) {
 346                 /* 0xdc00 -> 0xdfff */
 347                 errno = EILSEQ;
 348                 return -1;
 349         } else {
 350                 /* 0xe000 -> 0xffff */
 351                 *outchar = u;
 352                 return 2;
 353         }
 354 }
 355
 356 static int
 357 decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
 358 {
 359         gunichar2 *inptr = (gunichar2 *) inbuf;
 360         gunichar2 c;
 361         gunichar u;
 362
 363         if (inleft < 2) {
 364                 errno = EINVAL;
 365                 return -1;
 366         }
 367
 368         u = GUINT16_FROM_LE (*inptr);
 369
 370         if (u < 0xd800) {
 371                 /* 0x0000 -> 0xd7ff */
 372                 *outchar = u;
 373                 return 2;
 374         } else if (u < 0xdc00) {
 375                 /* 0xd800 -> 0xdbff */
 376                 if (inleft < 4) {
 377                         errno = EINVAL;
 378                         return -2;
 379                 }
 380
 381                 c = GUINT16_FROM_LE (inptr[1]);
 382
 383                 if (c < 0xdc00 || c > 0xdfff) {
 384                         errno = EILSEQ;
 385                         return -2;
 386                 }
 387
 388                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 389                 *outchar = u;
 390
 391                 return 4;
 392         } else if (u < 0xe000) {
 393                 /* 0xdc00 -> 0xdfff */
 394                 errno = EILSEQ;
 395                 return -1;
 396         } else {
 397                 /* 0xe000 -> 0xffff */
 398                 *outchar = u;
 399                 return 2;
 400         }
 401 }
 402
 403 static int
 404 encode_utf16be (gunichar c, char *outbuf, size_t outleft)
 405 {
 406         gunichar2 *outptr = (gunichar2 *) outbuf;
 407         gunichar2 ch;
 408         gunichar c2;
 409
 410         if (c < 0x10000) {
 411                 if (outleft < 2) {
 412                         errno = E2BIG;
 413                         return -1;
 414                 }
 415
 416                 ch = (gunichar2) c;
 417
 418                 *outptr = GUINT16_TO_BE (ch);
 419
 420                 return 2;
 421         } else {
 422                 if (outleft < 4) {
 423                         errno = E2BIG;
 424                         return -1;
 425                 }
 426
 427                 c2 = c - 0x10000;
 428
 429                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 430                 outptr[0] = GUINT16_TO_BE (ch);
 431
 432                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 433                 outptr[1] = GUINT16_TO_BE (ch);
 434
 435                 return 4;
 436         }
 437 }
 438
 439 static int
 440 encode_utf16le (gunichar c, char *outbuf, size_t outleft)
 441 {
 442         gunichar2 *outptr = (gunichar2 *) outbuf;
 443         gunichar2 ch;
 444         gunichar c2;
 445
 446         if (c < 0x10000) {
 447                 if (outleft < 2) {
 448                         errno = E2BIG;
 449                         return -1;
 450                 }
 451
 452                 ch = (gunichar2) c;
 453
 454                 *outptr = GUINT16_TO_LE (ch);
 455
 456                 return 2;
 457         } else {
 458                 if (outleft < 4) {
 459                         errno = E2BIG;
 460                         return -1;
 461                 }
 462
 463                 c2 = c - 0x10000;
 464
 465                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 466                 outptr[0] = GUINT16_TO_LE (ch);
 467
 468                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 469                 outptr[1] = GUINT16_TO_LE (ch);
 470
 471                 return 4;
 472         }
 473 }
 474
 475 static int
 476 decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
 477 {
 478         unsigned char *inptr = (unsigned char *) inbuf;
 479         gunichar u;
 480         int n, i;
 481
 482         u = *inptr;
 483
 484         if (u < 0x80) {
 485                 /* simple ascii case */
 486                 *outchar = u;
 487                 return 1;
 488         } else if (u < 0xc2) {
 489                 errno = EILSEQ;
 490                 return -1;
 491         } else if (u < 0xe0) {
 492                 u &= 0x1f;
 493                 n = 2;
 494         } else if (u < 0xf0) {
 495                 u &= 0x0f;
 496                 n = 3;
 497         } else if (u < 0xf8) {
 498                 u &= 0x07;
 499                 n = 4;
 500         } else if (u < 0xfc) {
 501                 u &= 0x03;
 502                 n = 5;
 503         } else if (u < 0xfe) {
 504                 u &= 0x01;
 505                 n = 6;
 506         } else {
 507                 errno = EILSEQ;
 508                 return -1;
 509         }
 510
 511         if (n > inleft) {
 512                 errno = EINVAL;
 513                 return -1;
 514         }
 515
 516 #if UNROLL_DECODE_UTF8
 517         switch (n) {
 518         case 6: u = (u << 6) | (*++inptr ^ 0x80);
 519         case 5: u = (u << 6) | (*++inptr ^ 0x80);
 520         case 4: u = (u << 6) | (*++inptr ^ 0x80);
 521         case 3: u = (u << 6) | (*++inptr ^ 0x80);
 522         case 2: u = (u << 6) | (*++inptr ^ 0x80);
 523         }
 524 #else
 525         for (i = 1; i < n; i++)
 526                 u = (u << 6) | (*++inptr ^ 0x80);
 527 #endif
 528
 529         *outchar = u;
 530
 531         return n;
 532 }
 533
 534 static int
 535 encode_utf8 (gunichar c, char *outbuf, size_t outleft)
 536 {
 537         unsigned char *outptr = (unsigned char *) outbuf;
 538         int base, n, i;
 539
 540         if (c < 0x80) {
 541                 base = 0;
 542                 n = 1;
 543         } else if (c < 0x800) {
 544                 base = 192;
 545                 n = 2;
 546         } else if (c < 0x10000) {
 547                 base = 224;
 548                 n = 3;
 549         } else if (c < 0x200000) {
 550                 base = 240;
 551                 n = 4;
 552         } else if (c < 0x4000000) {
 553                 base = 248;
 554                 n = 5;
 555         } else {
 556                 base = 252;
 557                 n = 6;
 558         }
 559
 560         if (outleft < n) {
 561                 errno = E2BIG;
 562                 return -1;
 563         }
 564
 565 #if UNROLL_ENCODE_UTF8
 566         switch (n) {
 567         case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
 568         case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
 569         case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
 570         case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
 571         case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
 572         case 1: outptr[0] = c | base;
 573         }
 574 #else
 575         for (i = n - 1; i > 0; i--) {
 576                 outptr[i] = (c & 0x3f) | 0x80;
 577                 c >>= 6;
 578         }
 579
 580         outptr[0] = c | base;
 581 #endif
 582
 583         return n;
 584 }
 585
 586 static int
 587 decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
 588 {
 589         *outchar = (unsigned char) *inbuf;
 590         return 1;
 591 }
 592
 593 static int
 594 encode_latin1 (gunichar c, char *outbuf, size_t outleft)
 595 {
 596         if (outleft < 1) {
 597                 errno = E2BIG;
 598                 return -1;
 599         }
 600
 601         if (c > 0xff) {
 602                 errno = EILSEQ;
 603                 return -1;
 604         }
 605
 606         *outbuf = (char) c;
 607
 608         return 1;
 609 }
 610
 611
 612 /*
 613  * Simple conversion API
 614  */
 615
 616 static gpointer error_quark = "ConvertError";
 617
 618 gpointer
 619 g_convert_error_quark (void)
 620 {
 621         return error_quark;
 622 }
 623
 624 gchar *
 625 g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
 626            gsize *bytes_read, gsize *bytes_written, GError **err)
 627 {
 628         size_t outsize, outused, outleft, inleft, grow, rc;
 629         char *result, *outbuf, *inbuf;
 630         gboolean flush = FALSE;
 631         gboolean done = FALSE;
 632         GIConv cd;
 633
 634         g_return_val_if_fail (str != NULL, NULL);
 635         g_return_val_if_fail (to_charset != NULL, NULL);
 636         g_return_val_if_fail (from_charset != NULL, NULL);
 637
 638         if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
 639                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 640                              "Conversion from %s to %s not supported.",
 641                              from_charset, to_charset);
 642
 643                 if (bytes_written)
 644                         *bytes_written = 0;
 645
 646                 if (bytes_read)
 647                         *bytes_read = 0;
 648
 649                 return NULL;
 650         }
 651
 652         inleft = len < 0 ? strlen (str) : len;
 653         inbuf = (char *) str;
 654
 655         outleft = outsize = MAX (inleft, 8);
 656         outbuf = result = g_malloc (outsize + 4);
 657
 658         do {
 659                 if (!flush)
 660                         rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
 661                 else
 662                         rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);
 663
 664                 if (rc == (size_t) -1) {
 665                         switch (errno) {
 666                         case E2BIG:
 667                                 /* grow our result buffer */
 668                                 grow = MAX (inleft, 8) << 1;
 669                                 outused = outbuf - result;
 670                                 outsize += grow;
 671                                 outleft += grow;
 672                                 result = g_realloc (result, outsize + 4);
 673                                 outbuf = result + outused;
 674                                 break;
 675                         case EINVAL:
 676                                 /* incomplete input, stop converting and terminate here */
 677                                 if (flush)
 678                                         done = TRUE;
 679                                 else
 680                                         flush = TRUE;
 681                                 break;
 682                         case EILSEQ:
 683                                 /* illegal sequence in the input */
 684                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));
 685
 686                                 if (bytes_read) {
 687                                         /* save offset of the illegal input sequence */
 688                                         *bytes_read = (inbuf - str);
 689                                 }
 690
 691                                 if (bytes_written)
 692                                         *bytes_written = 0;
 693
 694                                 g_iconv_close (cd);
 695                                 g_free (result);
 696                                 return NULL;
 697                         default:
 698                                 /* unknown errno */
 699                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));
 700
 701                                 if (bytes_written)
 702                                         *bytes_written = 0;
 703
 704                                 if (bytes_read)
 705                                         *bytes_read = 0;
 706
 707                                 g_iconv_close (cd);
 708                                 g_free (result);
 709                                 return NULL;
 710                         }
 711                 } else if (flush) {
 712                         /* input has been converted and output has been flushed */
 713                         break;
 714                 } else {
 715                         /* input has been converted, need to flush the output */
 716                         flush = TRUE;
 717                 }
 718         } while (!done);
 719
 720         g_iconv_close (cd);
 721
 722         /* Note: not all charsets can be null-terminated with a single
 723            null byte. UCS2, for example, needs 2 null bytes and UCS4
 724            needs 4. I hope that 4 null bytes is enough to terminate all
 725            multibyte charsets? */
 726
 727         /* null-terminate the result */
 728         memset (outbuf, 0, 4);
 729
 730         if (bytes_written)
 731                 *bytes_written = outbuf - result;
 732
 733         if (bytes_read)
 734                 *bytes_read = inbuf - str;
 735
 736         return result;
 737 }
 738
 739
 740 /*
 741  * Unicode conversion
 742  */
 743
 744 /**
 745  * from http://home.tiscali.nl/t876506/utf8tbl.html
 746  *
 747  * From Unicode UCS-4 to UTF-8:
 748  * Start with the Unicode number expressed as a decimal number and call this ud.
 749  *
 750  * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
 751  *
 752  * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
 753  *    byte 1 = 192 + (ud div 64)
 754  *    byte 2 = 128 + (ud mod 64)
 755  *
 756  * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
 757  *    byte 1 = 224 + (ud div 4096)
 758  *    byte 2 = 128 + ((ud div 64) mod 64)
 759  *    byte 3 = 128 + (ud mod 64)
 760  *
 761  * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
 762  *    byte 1 = 240 + (ud div 262144)
 763  *    byte 2 = 128 + ((ud div 4096) mod 64)
 764  *    byte 3 = 128 + ((ud div 64) mod 64)
 765  *    byte 4 = 128 + (ud mod 64)
 766  *
 767  * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
 768  *    byte 1 = 248 + (ud div 16777216)
 769  *    byte 2 = 128 + ((ud div 262144) mod 64)
 770  *    byte 3 = 128 + ((ud div 4096) mod 64)
 771  *    byte 4 = 128 + ((ud div 64) mod 64)
 772  *    byte 5 = 128 + (ud mod 64)
 773  *
 774  * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
 775  *    byte 1 = 252 + (ud div 1073741824)
 776  *    byte 2 = 128 + ((ud div 16777216) mod 64)
 777  *    byte 3 = 128 + ((ud div 262144) mod 64)
 778  *    byte 4 = 128 + ((ud div 4096) mod 64)
 779  *    byte 5 = 128 + ((ud div 64) mod 64)
 780  *    byte 6 = 128 + (ud mod 64)
 781  **/
 782 gint
 783 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 784 {
 785         int base, n, i;
 786
 787         if (c < 0x80) {
 788                 base = 0;
 789                 n = 1;
 790         } else if (c < 0x800) {
 791                 base = 192;
 792                 n = 2;
 793         } else if (c < 0x10000) {
 794                 base = 224;
 795                 n = 3;
 796         } else if (c < 0x200000) {
 797                 base = 240;
 798                 n = 4;
 799         } else if (c < 0x4000000) {
 800                 base = 248;
 801                 n = 5;
 802         } else if (c < 0x80000000) {
 803                 base = 252;
 804                 n = 6;
 805         } else {
 806                 return -1;
 807         }
 808
 809         if (outbuf != NULL) {
 810                 for (i = n - 1; i > 0; i--) {
 811                         /* mask off 6 bits worth and add 128 */
 812                         outbuf[i] = (c & 0x3f) | 0x80;
 813                         c >>= 6;
 814                 }
 815
 816                 /* first character has a different base */
 817                 outbuf[0] = c | base;
 818         }
 819
 820         return n;
 821 }
 822
 823 static int
 824 g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
 825 {
 826         gunichar c2;
 827
 828         if (c < 0xd800) {
 829                 if (outbuf)
 830                         *outbuf = (gunichar2) c;
 831
 832                 return 1;
 833         } else if (c < 0xe000) {
 834                 return -1;
 835         } else if (c < 0x10000) {
 836                 if (outbuf)
 837                         *outbuf = (gunichar2) c;
 838
 839                 return 1;
 840         } else if (c < 0x110000) {
 841                 if (outbuf) {
 842                         c2 = c - 0x10000;
 843
 844                         outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
 845                         outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 846                 }
 847
 848                 return 2;
 849         } else {
 850                 return -1;
 851         }
 852 }
 853
 854 gunichar *
 855 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
 856 {
 857         gunichar *outbuf, *outptr;
 858         char *inptr;
 859         glong n, i;
 860
 861         g_return_val_if_fail (str != NULL, NULL);
 862
 863         n = g_utf8_strlen (str, len);
 864
 865         if (items_written)
 866                 *items_written = n;
 867
 868         outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
 869         inptr = (char *) str;
 870
 871         for (i = 0; i < n; i++) {
 872                 *outptr++ = g_utf8_get_char (inptr);
 873                 inptr = g_utf8_next_char (inptr);
 874         }
 875
 876         *outptr = 0;
 877
 878         return outbuf;
 879 }
 880
 881 gunichar2 *
 882 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 883 {
 884         gunichar2 *outbuf, *outptr;
 885         size_t outlen = 0;
 886         size_t inleft;
 887         char *inptr;
 888         gunichar c;
 889         int n;
 890
 891         g_return_val_if_fail (str != NULL, NULL);
 892
 893         if (len < 0)
 894                 len = strlen (str);
 895
 896         inptr = (char *) str;
 897         inleft = len;
 898
 899         while (inleft > 0) {
 900                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
 901                         if (errno == EILSEQ) {
 902                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 903                                              "Illegal byte sequence encounted in the input.");
 904                         } else if (items_read) {
 905                                 /* partial input is ok if we can let our caller know... */
 906                                 break;
 907                         } else {
 908                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 909                                              "Partial byte sequence encountered in the input.");
 910                         }
 911
 912                         if (items_read)
 913                                 *items_read = inptr - str;
 914
 915                         if (items_written)
 916                                 *items_written = 0;
 917
 918                         return NULL;
 919                 } else if (c == 0)
 920                         break;
 921
 922                 outlen += g_unichar_to_utf16 (c, NULL);
 923                 inleft -= n;
 924                 inptr += n;
 925         }
 926
 927         if (items_read)
 928                 *items_read = inptr - str;
 929
 930         if (items_written)
 931                 *items_written = outlen;
 932
 933         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
 934         inptr = (char *) str;
 935         inleft = len;
 936
 937         while (inleft > 0) {
 938                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 939                         break;
 940                 else if (c == 0)
 941                         break;
 942
 943                 outptr += g_unichar_to_utf16 (c, outptr);
 944                 inleft -= n;
 945                 inptr += n;
 946         }
 947
 948         *outptr = '\0';
 949
 950         return outbuf;
 951 }
 952
 953 gunichar *
 954 g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 955 {
 956         gunichar *outbuf, *outptr;
 957         size_t outlen = 0;
 958         size_t inleft;
 959         char *inptr;
 960         gunichar c;
 961         int n;
 962
 963         g_return_val_if_fail (str != NULL, NULL);
 964
 965         if (len < 0)
 966                 len = strlen (str);
 967
 968         inptr = (char *) str;
 969         inleft = len;
 970
 971         while (inleft > 0) {
 972                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
 973                         if (errno == EILSEQ) {
 974                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 975                                              "Illegal byte sequence encounted in the input.");
 976                         } else if (items_read) {
 977                                 /* partial input is ok if we can let our caller know... */
 978                                 break;
 979                         } else {
 980                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 981                                              "Partial byte sequence encountered in the input.");
 982                         }
 983
 984                         if (items_read)
 985                                 *items_read = inptr - str;
 986
 987                         if (items_written)
 988                                 *items_written = 0;
 989
 990                         return NULL;
 991                 } else if (c == 0)
 992                         break;
 993
 994                 outlen += 4;
 995                 inleft -= n;
 996                 inptr += n;
 997         }
 998
 999         if (items_written)
1000                 *items_written = outlen / 4;
1001
1002         if (items_read)
1003                 *items_read = inptr - str;
1004
1005         outptr = outbuf = g_malloc (outlen + 4);
1006         inptr = (char *) str;
1007         inleft = len;
1008
1009         while (inleft > 0) {
1010                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
1011                         break;
1012                 else if (c == 0)
1013                         break;
1014
1015                 *outptr++ = c;
1016                 inleft -= n;
1017                 inptr += n;
1018         }
1019
1020         *outptr = 0;
1021
1022         return outbuf;
1023 }
1024
1025 gchar *
1026 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1027 {
1028         char *inptr, *outbuf, *outptr;
1029         size_t outlen = 0;
1030         size_t inleft;
1031         gunichar c;
1032         int n;
1033
1034         g_return_val_if_fail (str != NULL, NULL);
1035
1036         if (len < 0) {
1037                 len = 0;
1038                 while (str[len])
1039                         len++;
1040         }
1041
1042         inptr = (char *) str;
1043         inleft = len * 2;
1044
1045         while (inleft > 0) {
1046                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1047                         if (n == -2 && inleft > 2) {
1048                                 /* This means that the first UTF-16 char was read, but second failed */
1049                                 inleft -= 2;
1050                                 inptr += 2;
1051                         }
1052
1053                         if (errno == EILSEQ) {
1054                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1055                                              "Illegal byte sequence encounted in the input.");
1056                         } else if (items_read) {
1057                                 /* partial input is ok if we can let our caller know... */
1058                                 break;
1059                         } else {
1060                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1061                                              "Partial byte sequence encountered in the input.");
1062                         }
1063
1064                         if (items_read)
1065                                 *items_read = (inptr - (char *) str) / 2;
1066
1067                         if (items_written)
1068                                 *items_written = 0;
1069
1070                         return NULL;
1071                 } else if (c == 0)
1072                         break;
1073
1074                 outlen += g_unichar_to_utf8 (c, NULL);
1075                 inleft -= n;
1076                 inptr += n;
1077         }
1078
1079         if (items_read)
1080                 *items_read = (inptr - (char *) str) / 2;
1081
1082         if (items_written)
1083                 *items_written = outlen;
1084
1085         outptr = outbuf = g_malloc (outlen + 1);
1086         inptr = (char *) str;
1087         inleft = len * 2;
1088
1089         while (inleft > 0) {
1090                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1091                         break;
1092                 else if (c == 0)
1093                         break;
1094
1095                 outptr += g_unichar_to_utf8 (c, outptr);
1096                 inleft -= n;
1097                 inptr += n;
1098         }
1099
1100         *outptr = '\0';
1101
1102         return outbuf;
1103 }
1104
1105 gunichar *
1106 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1107 {
1108         gunichar *outbuf, *outptr;
1109         size_t outlen = 0;
1110         size_t inleft;
1111         char *inptr;
1112         gunichar c;
1113         int n;
1114
1115         g_return_val_if_fail (str != NULL, NULL);
1116
1117         if (len < 0) {
1118                 len = 0;
1119                 while (str[len])
1120                         len++;
1121         }
1122
1123         inptr = (char *) str;
1124         inleft = len * 2;
1125
1126         while (inleft > 0) {
1127                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1128                         if (n == -2 && inleft > 2) {
1129                                 /* This means that the first UTF-16 char was read, but second failed */
1130                                 inleft -= 2;
1131                                 inptr += 2;
1132                         }
1133
1134                         if (errno == EILSEQ) {
1135                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1136                                              "Illegal byte sequence encounted in the input.");
1137                         } else if (items_read) {
1138                                 /* partial input is ok if we can let our caller know... */
1139                                 break;
1140                         } else {
1141                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1142                                              "Partial byte sequence encountered in the input.");
1143                         }
1144
1145                         if (items_read)
1146                                 *items_read = (inptr - (char *) str) / 2;
1147
1148                         if (items_written)
1149                                 *items_written = 0;
1150
1151                         return NULL;
1152                 } else if (c == 0)
1153                         break;
1154
1155                 outlen += 4;
1156                 inleft -= n;
1157                 inptr += n;
1158         }
1159
1160         if (items_read)
1161                 *items_read = (inptr - (char *) str) / 2;
1162
1163         if (items_written)
1164                 *items_written = outlen / 4;
1165
1166         outptr = outbuf = g_malloc (outlen + 4);
1167         inptr = (char *) str;
1168         inleft = len * 2;
1169
1170         while (inleft > 0) {
1171                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1172                         break;
1173                 else if (c == 0)
1174                         break;
1175
1176                 *outptr++ = c;
1177                 inleft -= n;
1178                 inptr += n;
1179         }
1180
1181         *outptr = 0;
1182
1183         return outbuf;
1184 }
1185
1186 gchar *
1187 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1188 {
1189         char *outbuf, *outptr;
1190         size_t outlen = 0;
1191         glong i;
1192         int n;
1193
1194         g_return_val_if_fail (str != NULL, NULL);
1195
1196         if (len < 0) {
1197                 for (i = 0; str[i] != 0; i++) {
1198                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1199                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1200                                              "Illegal byte sequence encounted in the input.");
1201
1202                                 if (items_written)
1203                                         *items_written = 0;
1204
1205                                 if (items_read)
1206                                         *items_read = i;
1207
1208                                 return NULL;
1209                         }
1210
1211                         outlen += n;
1212                 }
1213         } else {
1214                 for (i = 0; i < len && str[i] != 0; i++) {
1215                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1216                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1217                                              "Illegal byte sequence encounted in the input.");
1218
1219                                 if (items_written)
1220                                         *items_written = 0;
1221
1222                                 if (items_read)
1223                                         *items_read = i;
1224
1225                                 return NULL;
1226                         }
1227
1228                         outlen += n;
1229                 }
1230         }
1231
1232         len = i;
1233
1234         outptr = outbuf = g_malloc (outlen + 1);
1235         for (i = 0; i < len; i++)
1236                 outptr += g_unichar_to_utf8 (str[i], outptr);
1237         *outptr = 0;
1238
1239         if (items_written)
1240                 *items_written = outlen;
1241
1242         if (items_read)
1243                 *items_read = i;
1244
1245         return outbuf;
1246 }
1247
1248 gunichar2 *
1249 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1250 {
1251         gunichar2 *outbuf, *outptr;
1252         size_t outlen = 0;
1253         glong i;
1254         int n;
1255
1256         g_return_val_if_fail (str != NULL, NULL);
1257
1258         if (len < 0) {
1259                 for (i = 0; str[i] != 0; i++) {
1260                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1261                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1262                                              "Illegal byte sequence encounted in the input.");
1263
1264                                 if (items_written)
1265                                         *items_written = 0;
1266
1267                                 if (items_read)
1268                                         *items_read = i;
1269
1270                                 return NULL;
1271                         }
1272
1273                         outlen += n;
1274                 }
1275         } else {
1276                 for (i = 0; i < len && str[i] != 0; i++) {
1277                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1278                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1279                                              "Illegal byte sequence encounted in the input.");
1280
1281                                 if (items_written)
1282                                         *items_written = 0;
1283
1284                                 if (items_read)
1285                                         *items_read = i;
1286
1287                                 return NULL;
1288                         }
1289
1290                         outlen += n;
1291                 }
1292         }
1293
1294         len = i;
1295
1296         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
1297         for (i = 0; i < len; i++)
1298                 outptr += g_unichar_to_utf16 (str[i], outptr);
1299         *outptr = 0;
1300
1301         if (items_written)
1302                 *items_written = outlen;
1303
1304         if (items_read)
1305                 *items_read = i;
1306
1307         return outbuf;
1308 }