3 * string encoding conversions
6 * Dick Porter (dick@ximian.com)
8 * (C) 2003 Ximian, Inc.
/*
 * Lookup table indexed by a byte value, read as the first byte of a UTF-8
 * sequence. Each entry is the number of bytes that follow that first byte:
 *   0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3,
 *   0xF8-0xFB -> 4, 0xFC-0xFD -> 5, 0xFE-0xFF -> 0.
 * The validators in this file compute a sequence length as entry + 1.
 */
17 static const char trailingBytesForUTF8[256] = {
18 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
23 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
24 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
25 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
29 * mono_unicode_from_external:
30 * \param in pointer to the buffer.
31 * \param bytes number of bytes in the string.
32 * Tries to turn a NULL-terminated string into UTF-16.
34 * First, see if it's valid UTF-8, in which case just turn it directly
35 * into UTF-16. Next, run through the colon-separated encodings in
36 * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
37 * returning the first successful conversion to UTF-16. If no
38 * conversion succeeds, return NULL.
40 * Callers must free the returned string if not NULL. \p bytes holds the number
41 * of bytes in the returned string, not including the terminator.
/*
 * Converts the external string \p in to UTF-16 and reports a size through
 * \p bytes.
 * NOTE(review): in the lines visible here the MONO_EXTERNAL_ENCODINGS loop
 * comes before the plain UTF-8 check, which is the reverse of the order given
 * in the header comment - confirm which order is intended.
 */
44 mono_unicode_from_external (const gchar *in, gsize *bytes)
48 const gchar *encoding_list;
/* The candidate encodings are read from the environment as a single string. */
56 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
57 if(encoding_list==NULL) {
/* Break the list apart on ':' and try the entries in order. */
61 encodings=g_strsplit (encoding_list, ":", 0);
62 for(i=0;encodings[i]!=NULL; i++) {
63 /* "default_locale" is a special case encoding */
64 if(!strcmp (encodings[i], "default_locale")) {
/* Two steps: current locale -> UTF-8, then UTF-8 -> UTF-16. */
65 gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
67 res=(gchar *) g_utf8_to_utf16 (utf8, -1, NULL, &lbytes, NULL);
/* NOTE(review): lbytes is filled through the items_written argument of
 * g_utf8_to_utf16, which GLib documents as a count of gunichar2 items -
 * confirm it is scaled to bytes in lines not shown here. */
68 *bytes = (gsize) lbytes;
72 /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
/* Named encoding: convert from encodings[i] to UTF-8. The bytes_written
 * result of g_convert lands in *bytes for the moment. */
73 res = g_convert (in, strlen (in), "UTF8", encodings[i], NULL, bytes, NULL);
/* Then widen the UTF-8 result to UTF-16; *bytes is overwritten from lbytes. */
76 res = (gchar *) g_utf8_to_utf16 (res, -1, NULL, &lbytes, NULL);
77 *bytes = (gsize) lbytes;
/* Release the split list before handing back the converted buffer. */
83 g_strfreev (encodings);
85 return((gunichar2 *)res);
/* Second release site for the split list; the control flow that separates
 * it from the first one is not visible in this excerpt. */
89 g_strfreev (encodings);
/* Input that already validates as UTF-8 is converted to UTF-16 directly. */
91 if(g_utf8_validate (in, -1, NULL)) {
/* NOTE(review): the gsize* -> glong* cast assumes both types have the same
 * width; verify on targets where long is narrower than size_t. */
92 gunichar2 *unires=g_utf8_to_utf16 (in, -1, NULL, (glong *)bytes, NULL);
101 * mono_utf8_from_external:
102 * \param in pointer to the string buffer.
103 * Tries to turn a NULL-terminated string into UTF8.
105 * First, see if it's valid UTF-8, in which case there's nothing more
106 * to be done. Next, run through the colon-separated encodings in
107 * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
108 * returning the first successful conversion to UTF-8. If no
109 * conversion succeeds, return NULL.
111 * Callers must free the returned string if not NULL.
113 * This function is identical to \c mono_unicode_from_external, apart
114 * from returning UTF-8 not UTF-16; it's handy in a few places to work
/*
 * Converts the external string \p in to UTF-8, trying each entry of
 * MONO_EXTERNAL_ENCODINGS and falling back to a copy of \p in when it is
 * already valid UTF-8.
 */
117 gchar *mono_utf8_from_external (const gchar *in)
121 const gchar *encoding_list;
/* The candidate encodings are read from the environment as a single string. */
128 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
129 if(encoding_list==NULL) {
/* Break the list apart on ':' and try the entries in order. */
133 encodings=g_strsplit (encoding_list, ":", 0);
134 for(i=0;encodings[i]!=NULL; i++) {
136 /* "default_locale" is a special case encoding */
137 if(!strcmp (encodings[i], "default_locale")) {
/* Convert from the current locale, then double-check that what came back
 * really is valid UTF-8. */
138 res=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
139 if(res!=NULL && !g_utf8_validate (res, -1, NULL)) {
/* Any other entry is treated as a source encoding name for g_convert,
 * with "UTF8" as the target; -1 asks for the length to be taken from
 * the terminator. */
144 res=g_convert (in, -1, "UTF8", encodings[i], NULL,
/* The split list is released at two sites; the control flow that separates
 * them is not visible in this excerpt. */
149 g_strfreev (encodings);
154 g_strfreev (encodings);
/* Input that already validates as UTF-8 is returned as a fresh copy. */
156 if(g_utf8_validate (in, -1, NULL)) {
157 return(g_strdup (in));
164 * mono_unicode_to_external:
165 * \param uni a UTF-16 string to convert to an external representation.
166 * Turns NULL-terminated UTF-16 into either UTF-8, or the first
167 * working item in \c MONO_EXTERNAL_ENCODINGS if set. If no conversions
168 * work, then UTF-8 is returned.
169 * Callers must free the returned string.
/*
 * Converts the UTF-16 string \p uni to UTF-8 first, then tries each
 * colon-separated entry of MONO_EXTERNAL_ENCODINGS as a target encoding.
 */
171 gchar *mono_unicode_to_external (const gunichar2 *uni)
174 const gchar *encoding_list;
176 /* Turn the unicode into utf8 to start with, because its
177 * easier to work with gchar * than gunichar2 *
// -1 asks g_utf16_to_utf8 to take the length from the terminator.
179 utf8=g_utf16_to_utf8 (uni, -1, NULL, NULL, NULL);
// A failed UTF-16 -> UTF-8 conversion is treated as fatal.
180 g_assert (utf8!=NULL);
182 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
183 if(encoding_list==NULL) {
187 gchar *res, **encodings;
// Break the list apart on ':' and try the entries in order.
190 encodings=g_strsplit (encoding_list, ":", 0);
191 for(i=0; encodings[i]!=NULL; i++) {
// "default_locale" selects the current locale as the target encoding.
192 if(!strcmp (encodings[i], "default_locale")) {
193 res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
// Any other entry is passed to g_convert as the target, with "UTF8" as the source.
196 res=g_convert (utf8, -1, encodings[i], "UTF8",
// The split list is released at two sites; the flow between them is not visible here.
202 g_strfreev (encodings);
208 g_strfreev (encodings);
211 /* Nothing else worked, so just return the utf8 */
216 * mono_utf8_validate_and_len
217 * \param source Pointer to putative UTF-8 encoded string.
218 * Checks \p source for being valid UTF-8. \p source is assumed to be
220 * \returns TRUE if \p source is valid.
221 * \p oEnd will equal the null terminator at the end of the string if valid.
222 * If not valid, it will equal the first character of the invalid sequence.
223 * \p oLength will equal the length to \p oEnd
/*
 * Checks \p source one UTF-8 sequence at a time and reports results through
 * \p oLength and \p oEnd.
 */
226 mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
/* retVal is the verdict. lastRet is tested together with it further down,
 * presumably so that only the first failure position is captured. */
228 gboolean retVal = TRUE;
229 gboolean lastRet = TRUE;
/* ptr is the byte cursor; *ptr is read as the first byte of the sequence
 * under inspection. */
230 guchar* ptr = (guchar*) source;
/* Sequence length = trailing-byte count for the first byte + 1.
 * srcPtr starts one past the end of the sequence. */
236 length = trailingBytesForUTF8 [*ptr] + 1;
237 srcPtr = (guchar*) ptr + length;
/* A length that has no case label of its own is rejected outright. */
239 default: retVal = FALSE;
240 /* Everything else falls through when "TRUE"... */
/* Each case pre-decrements srcPtr, so the sequence is examined from its
 * last byte back toward the first; every trailing byte must be 0x80..0xBF. */
241 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
/* Last byte BE/BF, third byte BF and second byte ending in F means the low
 * 16 bits of the code point are FFFE/FFFF. The statement executed for this
 * pattern is on a line outside this excerpt. */
242 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
243 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
244 *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
247 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
/* Once case 2 has run, a holds the second byte and srcPtr points at it. */
248 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
251 /* no fall-through in this inner switch */
/* Second-byte limits per first byte. The case labels look like first-byte
 * values; the switch header is not part of this excerpt - TODO confirm. */
/* E0: a second byte below A0 would be an overlong encoding. */
252 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
/* ED: a second byte above 9F would encode U+D800..U+DFFF (surrogates). */
253 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
/* EF B7 90..AF is U+FDD0..U+FDEF and EF BF BE/BF is U+FFFE/U+FFFF;
 * both ranges are rejected. */
254 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
255 if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
/* F0: a second byte below 90 would be overlong.
 * F4: a second byte above 8F would exceed U+10FFFF. */
256 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
257 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
258 default: if (a < (guchar) 0x80) retVal = FALSE;
/* First-byte check: 0x80..0xC1 is rejected (0x80..0xBF are continuation
 * bytes, C0/C1 can only start overlong forms). */
261 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
/* First bytes above F4 are tested separately; the statement guarded by
 * this test is not in the excerpt. */
263 if (*ptr > (guchar) 0xF4)
265 //If the string is invalid, set the end to the invalid byte.
266 if (!retVal && lastRet) {
268 *oEnd = (gchar*) ptr;
/* On success *oEnd receives the current cursor, but only when the caller
 * supplied a non-NULL oEnd. */
274 if (retVal && oEnd != NULL)
275 *oEnd = (gchar*) ptr;
281 * mono_utf8_validate_and_len_with_bounds
282 * \param source Pointer to putative UTF-8 encoded string.
283 * \param max_bytes Max number of bytes that can be decoded.
285 * Checks \p source for being valid UTF-8. \p source is assumed to be
288 * This function returns FALSE if it needs to decode characters beyond \p max_bytes.
290 * \returns TRUE if \p source is valid.
291 * \p oEnd will equal the null terminator at the end of the string if valid.
292 * If not valid, it will equal the first character of the invalid sequence.
293 * \p oLength will equal the length to \p oEnd
/*
 * Bounded variant of the UTF-8 validator: checks \p source one sequence at a
 * time, with \p max_bytes limiting how far it may read, and reports results
 * through \p oLength and \p oEnd.
 */
296 mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
/* retVal is the verdict. lastRet is tested together with it further down,
 * presumably so that only the first failure position is captured. */
298 gboolean retVal = TRUE;
299 gboolean lastRet = TRUE;
/* ptr is the byte cursor; end is the address max_bytes past the start. */
300 guchar* ptr = (guchar*) source;
301 guchar *end = ptr + max_bytes;
/* The condition under which this assignment runs is not in the excerpt. */
309 *oEnd = (gchar*) ptr;
/* Sequence length = trailing-byte count for the first byte + 1.
 * srcPtr starts one past the end of the sequence. */
314 length = trailingBytesForUTF8 [*ptr] + 1;
315 srcPtr = (guchar*) ptr + length;
317 /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
318 srcPtr points to the first byte after the current char.*/
/* A length that has no case label of its own is rejected outright. */
324 default: retVal = FALSE;
325 /* Everything else falls through when "TRUE"... */
/* Each case pre-decrements srcPtr, so the sequence is examined from its
 * last byte back toward the first; every trailing byte must be 0x80..0xBF. */
326 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
/* Last byte BE/BF, third byte BF and second byte ending in F means the low
 * 16 bits of the code point are FFFE/FFFF. The statement executed for this
 * pattern is on a line outside this excerpt. */
327 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
328 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
329 *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
332 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
/* Once case 2 has run, a holds the second byte and srcPtr points at it. */
333 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
336 /* no fall-through in this inner switch */
/* Second-byte limits per first byte. The case labels look like first-byte
 * values; the switch header is not part of this excerpt - TODO confirm. */
/* E0: a second byte below A0 would be an overlong encoding. */
337 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
/* ED: a second byte above 9F would encode U+D800..U+DFFF (surrogates). */
338 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
/* EF B7 90..AF is U+FDD0..U+FDEF and EF BF BE/BF is U+FFFE/U+FFFF;
 * both ranges are rejected. */
339 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
340 if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
/* F0: a second byte below 90 would be overlong.
 * F4: a second byte above 8F would exceed U+10FFFF. */
341 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
342 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
343 default: if (a < (guchar) 0x80) retVal = FALSE;
/* First-byte check: 0x80..0xC1 is rejected (0x80..0xBF are continuation
 * bytes, C0/C1 can only start overlong forms). */
346 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
/* First bytes above F4 are tested separately; the statement guarded by
 * this test is not in the excerpt. */
348 if (*ptr > (guchar) 0xF4)
350 //If the string is invalid, set the end to the invalid byte.
351 if (!retVal && lastRet) {
353 *oEnd = (gchar*) ptr;
/* On success *oEnd receives the current cursor, but only when the caller
 * supplied a non-NULL oEnd. */
359 if (retVal && oEnd != NULL)
360 *oEnd = (gchar*) ptr;