1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Copyright (C) 2011 Jeffrey Stedfast
5 * Permission is hereby granted, free of charge, to any person
6 * obtaining a copy of this software and associated documentation
7 * files (the "Software"), to deal in the Software without
8 * restriction, including without limitation the rights to use, copy,
9 * modify, merge, publish, distribute, sublicense, and/or sell copies
10 * of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be
14 * included in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
/* Callback types for the built-in charset converters.
 *
 * Decoder: receives the input cursor and the remaining input byte count
 * (both by pointer) plus a gunichar out-parameter.
 * Encoder: receives a gunichar plus the output cursor and the remaining
 * output byte count (both by pointer).
 *
 * Both return int; g_iconv() treats a return value of -1 as failure. */
42 typedef int (* Decoder) (char **inbytes, size_t *inbytesleft, gunichar *outchar);
43 typedef int (* Encoder) (gunichar c, char **outbytes, size_t *outbytesleft);
/* Forward declarations of the built-in decoder/encoder pairs. They are
 * referenced by the charset table entries that follow, ahead of their
 * definitions further down in the file. Every decoder has the Decoder
 * signature and every encoder the Encoder signature. */

/* UTF-32 with an explicit byte order */
54 static int decode_utf32be (char **inbytes, size_t *inbytesleft, gunichar *outchar);
55 static int encode_utf32be (gunichar c, char **outbytes, size_t *outbytesleft);
57 static int decode_utf32le (char **inbytes, size_t *inbytesleft, gunichar *outchar);
58 static int encode_utf32le (gunichar c, char **outbytes, size_t *outbytesleft);
/* UTF-16 with an explicit byte order */
60 static int decode_utf16be (char **inbytes, size_t *inbytesleft, gunichar *outchar);
61 static int encode_utf16be (gunichar c, char **outbytes, size_t *outbytesleft);
63 static int decode_utf16le (char **inbytes, size_t *inbytesleft, gunichar *outchar);
64 static int encode_utf16le (gunichar c, char **outbytes, size_t *outbytesleft);
/* UTF-32 / UTF-16 without an explicit byte order (resolved at compile time
 * from G_BYTE_ORDER in the definitions below) */
66 static int decode_utf32 (char **inbytes, size_t *inbytesleft, gunichar *outchar);
67 static int encode_utf32 (gunichar c, char **outbytes, size_t *outbytesleft);
69 static int decode_utf16 (char **inbytes, size_t *inbytesleft, gunichar *outchar);
70 static int encode_utf16 (gunichar c, char **outbytes, size_t *outbytesleft);
/* byte-oriented encodings */
72 static int decode_utf8 (char **inbytes, size_t *inbytesleft, gunichar *outchar);
73 static int encode_utf8 (gunichar c, char **outbytes, size_t *outbytesleft);
75 static int decode_latin1 (char **inbytes, size_t *inbytesleft, gunichar *outchar);
76 static int encode_latin1 (gunichar c, char **outbytes, size_t *outbytesleft);
/* Entries of the charset table: { name, decoder, encoder }.
 *
 * g_iconv_open() compares these names with g_ascii_strcasecmp(), so the
 * lookup ignores ASCII case; each accepted spelling (with and without the
 * hyphen, plus aliases) needs its own row.
 *
 * NOTE(review): the declaration that opens this array (accessed as
 * charsets[i].name / .decoder / .encoder in g_iconv_open) is not visible
 * in this listing.
 * NOTE(review): "US-ASCII" and "ASCII" are mapped to the Latin-1
 * converters, so no separate 7-bit validation is evident from this table;
 * confirm that this is intended. */
83 { "ISO-8859-1", decode_latin1, encode_latin1 },
84 { "ISO8859-1", decode_latin1, encode_latin1 },
85 { "UTF-32BE", decode_utf32be, encode_utf32be },
86 { "UTF-32LE", decode_utf32le, encode_utf32le },
87 { "UTF-16BE", decode_utf16be, encode_utf16be },
88 { "UTF-16LE", decode_utf16le, encode_utf16le },
89 { "UTF-32", decode_utf32, encode_utf32 },
90 { "UTF-16", decode_utf16, encode_utf16 },
91 { "UTF-8", decode_utf8, encode_utf8 },
92 { "US-ASCII", decode_latin1, encode_latin1 },
93 { "Latin1", decode_latin1, encode_latin1 },
94 { "ASCII", decode_latin1, encode_latin1 },
95 { "UTF32", decode_utf32, encode_utf32 },
96 { "UTF16", decode_utf16, encode_utf16 },
97 { "UTF8", decode_utf8, encode_utf8 },
/* g_iconv_open: set up a converter from from_charset to to_charset.
 *
 * A built-in decoder is looked up for from_charset and a built-in encoder
 * for to_charset. When either lookup fails, the system iconv_open() is
 * tried for the same pair of names. A descriptor is then allocated with
 * g_malloc() and the selected callbacks are stored in it.
 *
 * NOTE(review): the return statements, the error handling inside the
 * guards, any preprocessor conditionals and the remaining descriptor
 * initialisation are not visible in this listing; confirm the failure
 * return value against the full source. */
102 g_iconv_open (const char *to_charset, const char *from_charset)
/* (iconv_t) -1 is the value iconv_open() returns on failure; g_iconv() and
 * g_iconv_close() compare cd->cd against it to decide whether a system
 * handle is in use */
105 iconv_t icd = (iconv_t) -1;
107 Decoder decoder = NULL;
108 Encoder encoder = NULL;
/* guard: either charset name is NULL or the empty string */
112 if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0])
/* walk the whole table; the decoder (source side) and the encoder
 * (destination side) are matched independently, ignoring ASCII case */
115 for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
116 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
117 decoder = charsets[i].decoder;
119 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
120 encoder = charsets[i].encoder;
/* at least one side has no built-in converter: ask the system iconv */
123 if (encoder == NULL || decoder == NULL) {
125 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
/* allocate the descriptor and record the chosen callbacks (either may be
 * NULL when the system iconv handle is used instead) */
132 cd = (GIConv) g_malloc (sizeof (struct _GIConv));
133 cd->decode = decoder;
134 cd->encode = encoder;
/* g_iconv_close: tear down a conversion descriptor.
 *
 * When the descriptor holds a system iconv handle (cd->cd differs from
 * (iconv_t) -1), that handle is closed with iconv_close().
 *
 * NOTE(review): the rest of the function (presumably releasing cd itself
 * and the return value) is not visible in this listing. */
145 g_iconv_close (GIConv cd)
148 if (cd->cd != (iconv_t) -1)
149 iconv_close (cd->cd);
/* g_iconv: convert bytes from *inbytes into *outbytes using descriptor cd.
 *
 * If cd holds a system iconv handle, the call is forwarded to iconv() with
 * the arguments unchanged. Otherwise characters are decoded with
 * cd->decode and re-encoded with cd->encode, one per loop iteration, for
 * as long as both input bytes and output space remain.
 *
 * inbytes / inbytesleft may be NULL (read as "no input"). A NULL outbytes
 * or outbytesleft selects the "reset converter" branch.
 * The remaining input and output counts are written back through
 * inbytesleft and outbytesleft.
 *
 * NOTE(review): the return values, the body of the reset branch and the
 * write-back of the cursors are not visible in this listing. */
158 g_iconv (GIConv cd, char **inbytes, size_t *inbytesleft,
159 char **outbytes, size_t *outbytesleft)
161 size_t inleft, outleft;
162 char *inptr, *outptr;
/* system iconv in use: delegate the whole call */
167 if (cd->cd != (iconv_t) -1)
168 return iconv (cd->cd, inbytes, inbytesleft, outbytes, outbytesleft);
171 if (outbytes == NULL || outbytesleft == NULL) {
172 /* reset converter */
/* work on local copies; a NULL input argument is read as empty input */
177 inleft = inbytesleft ? *inbytesleft : 0;
178 inptr = inbytes ? *inbytes : NULL;
179 outleft = *outbytesleft;
/* decode only when c is (gunichar) -1; the && short-circuits otherwise, so
 * a c that already holds a character goes straight to the encoder.
 * NOTE(review): presumably c carries a character that was decoded but
 * could not be encoded on an earlier call; confirm where c is loaded. */
184 if (c == (gunichar) -1 && cd->decode (&inptr, &inleft, &c) == -1) {
189 if (cd->encode (c, &outptr, &outleft) == -1) {
/* stop as soon as the input is used up or the output is full */
195 } while (inleft > 0 && outleft > 0);
/* publish the remaining byte counts to the caller */
198 *inbytesleft = inleft;
203 *outbytesleft = outleft;
/* decode_utf32_be_or_le: read one 32-bit unit from *inbytes in the byte
 * order given by endian, then store the advanced cursor and remaining
 * count back into *inbytes / *inbytesleft.
 *
 * NOTE(review): the input is read through a gunichar pointer obtained by
 * casting the char pointer; this assumes *inbytes is suitably aligned for
 * 32-bit access, so confirm callers guarantee that.
 * NOTE(review): the length check, the pointer advance, the store to
 * *outchar, the error handling and the return values are not visible in
 * this listing. */
212 decode_utf32_be_or_le (Endian endian, char **inbytes, size_t *inbytesleft, gunichar *outchar)
214 gunichar *inptr = (gunichar *) *inbytes;
215 size_t inleft = *inbytesleft;
/* convert the stored unit to host byte order */
223 if (endian == BigEndian)
224 c = GUINT32_FROM_BE (*inptr);
226 c = GUINT32_FROM_LE (*inptr);
/* 2147483648 == 0x80000000: values with the top bit set are singled out
 * here (their handling is not visible) */
231 if (c >= 2147483648UL) {
236 *inbytes = (char *) inptr;
237 *inbytesleft = inleft;
/* Decoder registered for "UTF-32BE": decodes with big-endian byte order. */
244 decode_utf32be (char **inbytes, size_t *inbytesleft, gunichar *outchar)
246 return decode_utf32_be_or_le (BigEndian, inbytes, inbytesleft, outchar);
/* Decoder registered for "UTF-32LE": decodes with little-endian byte order. */
250 decode_utf32le (char **inbytes, size_t *inbytesleft, gunichar *outchar)
252 return decode_utf32_be_or_le (LittleEndian, inbytes, inbytesleft, outchar);
/* Decoder registered for "UTF-32" / "UTF32" (no byte order in the name).
 * The byte order is fixed at compile time: little-endian when
 * G_BYTE_ORDER == G_LITTLE_ENDIAN, big-endian in the other branch.
 *
 * NOTE(review): the #else / #endif lines of the conditional are not
 * visible in this listing, and no byte-order-mark handling is evident in
 * the visible lines. */
256 decode_utf32 (char **inbytes, size_t *inbytesleft, gunichar *outchar)
258 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
259 return decode_utf32_be_or_le (LittleEndian, inbytes, inbytesleft, outchar);
261 return decode_utf32_be_or_le (BigEndian, inbytes, inbytesleft, outchar);
/* encode_utf32_be_or_le: store c as one 32-bit unit in the byte order
 * given by endian, then write the advanced cursor and remaining space
 * back into *outbytes / *outbytesleft.
 *
 * NOTE(review): the output is written through a gunichar pointer obtained
 * by casting the char pointer; this assumes *outbytes is suitably aligned
 * for 32-bit access, so confirm callers guarantee that.
 * NOTE(review): the space check, the adjustment of outleft and the return
 * values are not visible in this listing. */
266 encode_utf32_be_or_le (Endian endian, gunichar c, char **outbytes, size_t *outbytesleft)
268 gunichar *outptr = (gunichar *) *outbytes;
269 size_t outleft = *outbytesleft;
/* convert from host byte order to the requested order and advance by one unit */
276 if (endian == BigEndian)
277 *outptr++ = GUINT32_TO_BE (c);
279 *outptr++ = GUINT32_TO_LE (c);
283 *outbytes = (char *) outptr;
284 *outbytesleft = outleft;
/* Encoder registered for "UTF-32BE": encodes with big-endian byte order. */
290 encode_utf32be (gunichar c, char **outbytes, size_t *outbytesleft)
292 return encode_utf32_be_or_le (BigEndian, c, outbytes, outbytesleft);
/* Encoder registered for "UTF-32LE": encodes with little-endian byte order. */
296 encode_utf32le (gunichar c, char **outbytes, size_t *outbytesleft)
298 return encode_utf32_be_or_le (LittleEndian, c, outbytes, outbytesleft);
/* Encoder registered for "UTF-32" / "UTF32" (no byte order in the name).
 * The byte order is fixed at compile time: little-endian when
 * G_BYTE_ORDER == G_LITTLE_ENDIAN, big-endian in the other branch.
 *
 * NOTE(review): the #else / #endif lines of the conditional are not
 * visible in this listing, and no byte-order-mark output is evident in
 * the visible lines. */
302 encode_utf32 (gunichar c, char **outbytes, size_t *outbytesleft)
304 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
305 return encode_utf32_be_or_le (LittleEndian, c, outbytes, outbytesleft);
307 return encode_utf32_be_or_le (BigEndian, c, outbytes, outbytesleft);
/* decode_utf16_be_or_le: read one character from UTF-16 input in the byte
 * order given by endian. A unit in the high-surrogate range is followed by
 * a second read, and the two units are combined into one code point.
 * The advanced cursor and remaining count are stored back into
 * *inbytes / *inbytesleft.
 *
 * NOTE(review): the input is read through a gunichar2 pointer obtained by
 * casting the char pointer; this assumes *inbytes is suitably aligned for
 * 16-bit access, so confirm callers guarantee that.
 * NOTE(review): the length checks, the pointer advances, the store to
 * *outchar, the error handling and the return values are not visible in
 * this listing. */
312 decode_utf16_be_or_le (Endian endian, char **inbytes, size_t *inbytesleft, gunichar *outchar)
314 gunichar2 *inptr = (gunichar2 *) *inbytes;
315 size_t inleft = *inbytesleft;
/* first 16-bit unit, converted to host byte order */
324 if (endian == BigEndian)
325 u = GUINT16_FROM_BE (*inptr);
327 u = GUINT16_FROM_LE (*inptr);
/* 0xDC00..0xDFFF is the low (trailing) surrogate range; here such a unit
 * appears without a preceding high surrogate (handling not visible) */
332 if (u >= 0xdc00 && u <= 0xdfff) {
/* 0xD800..0xDBFF is the high (leading) surrogate range: a second unit
 * is read next */
335 } else if (u >= 0xd800 && u <= 0xdbff) {
341 if (endian == BigEndian)
342 c = GUINT16_FROM_BE (*inptr);
344 c = GUINT16_FROM_LE (*inptr);
/* the second unit lies outside the low-surrogate range (handling not
 * visible) */
349 if (c < 0xdc00 || c > 0xdfff) {
/* combine the pair: 10 payload bits from each surrogate, offset by 0x10000 */
354 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
357 *inbytes = (char *) inptr;
358 *inbytesleft = inleft;
/* Decoder registered for "UTF-16BE": decodes with big-endian byte order. */
365 decode_utf16be (char **inbytes, size_t *inbytesleft, gunichar *outchar)
367 return decode_utf16_be_or_le (BigEndian, inbytes, inbytesleft, outchar);
/* Decoder registered for "UTF-16LE": decodes with little-endian byte order. */
371 decode_utf16le (char **inbytes, size_t *inbytesleft, gunichar *outchar)
373 return decode_utf16_be_or_le (LittleEndian, inbytes, inbytesleft, outchar);
/* Decoder registered for "UTF-16" / "UTF16" (no byte order in the name).
 * The byte order is fixed at compile time: little-endian when
 * G_BYTE_ORDER == G_LITTLE_ENDIAN, big-endian in the other branch.
 *
 * NOTE(review): the #else / #endif lines of the conditional are not
 * visible in this listing, and no byte-order-mark handling is evident in
 * the visible lines. */
377 decode_utf16 (char **inbytes, size_t *inbytesleft, gunichar *outchar)
379 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
380 return decode_utf16_be_or_le (LittleEndian, inbytes, inbytesleft, outchar);
382 return decode_utf16_be_or_le (BigEndian, inbytes, inbytesleft, outchar);
/* encode_utf16_be_or_le: write c as UTF-16 in the byte order given by
 * endian: one 16-bit unit for a non-surrogate value up to 0xFFFF,
 * otherwise a surrogate pair (two units, 4 bytes). The advanced cursor and
 * remaining space are stored back into *outbytes / *outbytesleft.
 *
 * NOTE(review): the output is written through a gunichar2 pointer obtained
 * by casting the char pointer; this assumes *outbytes is suitably aligned
 * for 16-bit access, so confirm callers guarantee that.
 * NOTE(review): a c in the surrogate range 0xD800..0xDFFF fails the first
 * test and therefore falls through to the later branches; confirm that
 * such values are rejected somewhere.
 * NOTE(review): the initial space check, the adjustments of outleft, the
 * error handling and the return values are not visible in this listing. */
387 encode_utf16_be_or_le (Endian endian, gunichar c, char **outbytes, size_t *outbytesleft)
389 gunichar2 *outptr = (gunichar2 *) *outbytes;
390 size_t outleft = *outbytesleft;
/* single-unit case: c fits in 16 bits and is not a surrogate value */
399 if (c <= 0xffff && (c < 0xd800 || c > 0xdfff)) {
402 if (endian == BigEndian)
403 *outptr++ = GUINT16_TO_BE (ch);
405 *outptr++ = GUINT16_TO_LE (ch);
/* the remaining case emits two units, so 4 bytes of space are required */
408 } else if (outleft < 4) {
/* high surrogate: 0xD800 plus the bits of c2 above the low 10.
 * NOTE(review): c2 is presumably c - 0x10000; its assignment is not
 * visible here. */
414 ch = (gunichar2) ((c2 >> 10) + 0xd800);
415 if (endian == BigEndian)
416 *outptr++ = GUINT16_TO_BE (ch);
418 *outptr++ = GUINT16_TO_LE (ch);
/* low surrogate: 0xDC00 plus the low 10 bits of c2 */
420 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
421 if (endian == BigEndian)
422 *outptr++ = GUINT16_TO_BE (ch);
424 *outptr++ = GUINT16_TO_LE (ch);
429 *outbytes = (char *) outptr;
430 *outbytesleft = outleft;
/* Encoder registered for "UTF-16BE": encodes with big-endian byte order. */
436 encode_utf16be (gunichar c, char **outbytes, size_t *outbytesleft)
438 return encode_utf16_be_or_le (BigEndian, c, outbytes, outbytesleft);
/* Encoder registered for "UTF-16LE": encodes with little-endian byte order. */
442 encode_utf16le (gunichar c, char **outbytes, size_t *outbytesleft)
444 return encode_utf16_be_or_le (LittleEndian, c, outbytes, outbytesleft);
/* Encoder registered for "UTF-16" / "UTF16" (no byte order in the name).
 * The byte order is fixed at compile time: little-endian when
 * G_BYTE_ORDER == G_LITTLE_ENDIAN, big-endian in the other branch.
 *
 * NOTE(review): the #else / #endif lines of the conditional are not
 * visible in this listing, and no byte-order-mark output is evident in
 * the visible lines. */
448 encode_utf16 (gunichar c, char **outbytes, size_t *outbytesleft)
450 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
451 return encode_utf16_be_or_le (LittleEndian, c, outbytes, outbytesleft);
453 return encode_utf16_be_or_le (BigEndian, c, outbytes, outbytesleft);
/* decode_utf8: decode one UTF-8 sequence from *inbytes.
 *
 * The first byte is classified by a chain of upper bounds
 * (0xE0, 0xF0, 0xF8, 0xFC, 0xFE); the following len - 1 bytes each
 * contribute their low 6 bits. *inbytesleft is reduced by len.
 *
 * NOTE(review): the per-branch length and mask assignments, the length
 * check against the available input, the stores to *inbytes / *outchar,
 * the error handling and the return values are not visible in this
 * listing.
 * NOTE(review): bytes 0x80..0xBF (continuation bytes) also satisfy
 * c < 0xe0; confirm whether a stray continuation byte in lead position is
 * rejected. Likewise, no check is visible that the following bytes have
 * the 10xxxxxx form. */
458 decode_utf8 (char **inbytes, size_t *inbytesleft, gunichar *outchar)
460 size_t inleft = *inbytesleft;
461 char *inptr = *inbytes;
472 /* simple ascii case */
/* lead byte below 0xE0 (presumably the 2-byte form) */
474 } else if (c < 0xe0) {
/* lead byte 0xE0..0xEF (presumably the 3-byte form) */
477 } else if (c < 0xf0) {
/* lead byte 0xF0..0xF7 (presumably the 4-byte form) */
480 } else if (c < 0xf8) {
/* lead bytes 0xF8..0xFB and 0xFC..0xFD: presumably the 5- and 6-byte
 * forms of the original UTF-8 definition, which RFC 3629 no longer allows */
483 } else if (c < 0xfc) {
486 } else if (c < 0xfe) {
/* append the low 6 bits of each following byte to the accumulated value */
500 for (i = 1; i < len; i++) {
501 u = (u << 6) | ((*inptr) & 0x3f);
505 *inbytesleft = inleft - len;
/* encode_utf8: write c to *outbytes as a UTF-8 sequence of len bytes.
 *
 * The sequence length is chosen by comparing c with successive upper
 * bounds. Bytes 1..len-1 are filled from the end towards the front, then
 * the first byte is written as base + the remaining bits of c.
 * *outbytes is advanced by len and *outbytesleft reduced by len.
 *
 * NOTE(review): the per-branch assignments of len and base, the shift of c
 * inside the loop, the output-space check, the error handling and the
 * return values are not visible in this listing. */
513 encode_utf8 (gunichar c, char **outbytes, size_t *outbytesleft)
515 size_t outleft = *outbytesleft;
516 char *outptr = *outbytes;
/* upper bounds are powers of two: 2048 = 2^11, 65536 = 2^16,
 * 2097152 = 2^21, 67108864 = 2^26, 2147483648 = 2^31 (presumably selecting
 * the 2-, 3-, 4-, 5- and 6-byte forms respectively) */
523 } else if (c < 2048UL) {
526 } else if (c < 65536UL) {
529 } else if (c < 2097152UL) {
532 } else if (c < 67108864UL) {
535 } else if (c < 2147483648UL) {
/* fill the trailing bytes, last one first */
548 for (i = len - 1; i > 0; i--) {
549 /* mask off 6 bits worth and add 128 */
550 outptr[i] = 128 + (c & 0x3f);
554 /* first character has a different base */
555 outptr[0] = base + c;
/* consume len bytes of output space */
557 *outbytesleft = outleft - len;
558 *outbytes = outptr + len;
/* decode_latin1: decoder used for the ISO-8859-1, Latin1 and ASCII names.
 * Consumes exactly one input byte: *inbytesleft drops by 1 and *inbytes
 * advances by 1.
 *
 * NOTE(review): the store to *outchar (and whether the byte is read as
 * unsigned), any empty-input check and the return value are not visible
 * in this listing. */
564 decode_latin1 (char **inbytes, size_t *inbytesleft, gunichar *outchar)
566 size_t inleft = *inbytesleft;
567 char *inptr = *inbytes;
573 *inbytesleft = inleft - 1;
574 *inbytes = inptr + 1;
/* encode_latin1: encoder used for the ISO-8859-1, Latin1 and ASCII names.
 * Stores c as a single byte (via a cast to char), advances the local
 * output pointer and writes the remaining space back to *outbytesleft.
 *
 * NOTE(review): the output-space check, any range check on c (values
 * above 0xFF cannot be represented in one byte), the adjustment of
 * outleft, the store to *outbytes and the return value are not visible
 * in this listing. */
581 encode_latin1 (gunichar c, char **outbytes, size_t *outbytesleft)
583 size_t outleft = *outbytesleft;
584 char *outptr = *outbytes;
596 *outptr++ = (char) c;
599 *outbytesleft = outleft;