2 * strenc.c: string encoding conversions
5 * Dick Porter (dick@ximian.com)
7 * (C) 2003 Ximian, Inc.
/*
 * Lookup table indexed by a byte value. Each entry is the number of
 * trailing bytes this table associates with that value when it is the
 * first byte of a UTF-8 sequence:
 *   0x00-0xBF -> 0    0xC0-0xDF -> 1    0xE0-0xEF -> 2
 *   0xF0-0xF7 -> 3    0xF8-0xFB -> 4    0xFC-0xFD -> 5    0xFE-0xFF -> 0
 * The validators in this file compute a sequence length as
 * trailingBytesForUTF8 [*ptr] + 1.
 */
16 static const char trailingBytesForUTF8[256] = {
17 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
18 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
23 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
24 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
28 * mono_unicode_from_external:
29 * @in: pointer to the buffer.
30 * @bytes: number of bytes in the string.
32 * Tries to turn a NULL-terminated string into UTF16.
34 * First, see if it's valid UTF8, in which case just turn it directly
35 * into UTF16. Next, run through the colon-separated encodings in
36 * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
37 * returning the first successful conversion to UTF16. If no
38 * conversion succeeds, return NULL.
40 * Callers must free the returned string if not NULL. bytes holds the number
41 * of bytes in the returned string, not including the terminator.
/*
 * Converts the externally-encoded, NUL-terminated string @in to UTF-16 and
 * reports a length through @bytes.
 *
 * NOTE(review): in the statements shown here the MONO_EXTERNAL_ENCODINGS
 * loop comes before the g_utf8_validate() fallback, while the header comment
 * says the UTF-8 check happens first -- confirm the order and reconcile.
 */
44 mono_unicode_from_external (const gchar *in, gsize *bytes)
/*
 * NOTE(review): the g_getenv() result is passed to g_free() below. Stock
 * GLib returns a string owned by the environment; presumably the g_getenv()
 * used by this build returns an allocated copy -- confirm.
 */
56 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
57 if(encoding_list==NULL) {
/* Use an allocated empty string so the g_free() below is valid either way. */
58 encoding_list = g_strdup("");
/* Break the list into its colon-separated encoding names. */
61 encodings=g_strsplit (encoding_list, ":", 0);
62 g_free (encoding_list);
/* Walk the listed encodings in order. */
63 for(i=0;encodings[i]!=NULL; i++) {
64 /* "default_locale" is a special case encoding */
65 if(!strcmp (encodings[i], "default_locale")) {
/* Locale encoding -> UTF-8 -> UTF-16; lbytes from the last step is copied to *bytes. */
66 gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
68 res=(gchar *) g_utf8_to_utf16 (utf8, -1, NULL, &lbytes, NULL);
69 *bytes = (gsize) lbytes;
73 /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
/* Named encoding -> UTF-8 first, then UTF-8 -> UTF-16 below. */
74 res = g_convert (in, strlen (in), "UTF8", encodings[i], NULL, bytes, NULL);
/*
 * NOTE(review): res is overwritten with the UTF-16 buffer here, so the
 * intermediate UTF-8 buffer has to be saved and freed by statements not
 * shown in this excerpt -- confirm it is not leaked.
 */
77 res = (gchar *) g_utf8_to_utf16 (res, -1, NULL, &lbytes, NULL);
78 *bytes = (gsize) lbytes;
84 g_strfreev (encodings);
86 return((gunichar2 *)res);
90 g_strfreev (encodings);
/* Input that already validates as UTF-8 is converted to UTF-16 directly. */
92 if(g_utf8_validate (in, -1, NULL)) {
/*
 * NOTE(review): the cast below treats the gsize pointer as a glong pointer,
 * which assumes both types have the same width. Where long is narrower than
 * size_t only part of *bytes would be written -- confirm, or go through a
 * local glong as the loop above does with lbytes.
 */
93 gunichar2 *unires=g_utf8_to_utf16 (in, -1, NULL, (glong *)bytes, NULL);
102 * mono_utf8_from_external:
103 * @in: pointer to the string buffer.
105 * Tries to turn a NULL-terminated string into UTF8.
107 * First, see if it's valid UTF8, in which case there's nothing more
108 * to be done. Next, run through the colon-separated encodings in
109 * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
110 * returning the first successful conversion to utf8. If no
111 * conversion succeeds, return NULL.
113 * Callers must free the returned string if not NULL.
115 * This function is identical to mono_unicode_from_external, apart
116 * from returning utf8 not utf16; it's handy in a few places to work
/*
 * Converts the externally-encoded, NUL-terminated string @in to UTF-8.
 * The encodings named in MONO_EXTERNAL_ENCODINGS are walked in order; in the
 * statements shown here, input that already validates as UTF-8 is duplicated
 * with g_strdup() after that loop.
 */
119 gchar *mono_utf8_from_external (const gchar *in)
123 gchar *encoding_list;
/*
 * NOTE(review): the g_getenv() result is passed to g_free() below, so
 * g_getenv() is presumably expected to return an allocated copy -- confirm.
 */
130 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
131 if(encoding_list==NULL) {
/* Allocated empty string so the unconditional g_free() below stays valid. */
132 encoding_list = g_strdup("");
/* Break the list into its colon-separated encoding names. */
135 encodings=g_strsplit (encoding_list, ":", 0);
136 g_free (encoding_list);
137 for(i=0;encodings[i]!=NULL; i++) {
139 /* "default_locale" is a special case encoding */
140 if(!strcmp (encodings[i], "default_locale")) {
141 res=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
/* Singles out a locale conversion whose output is not valid UTF-8 (handling not shown in this excerpt). */
142 if(res!=NULL && !g_utf8_validate (res, -1, NULL)) {
/* Any other name is handed to g_convert() as the source encoding, with "UTF8" as the target. */
147 res=g_convert (in, -1, "UTF8", encodings[i], NULL,
152 g_strfreev (encodings);
157 g_strfreev (encodings);
/* Fallback: input that already validates as UTF-8 is returned as a fresh copy. */
159 if(g_utf8_validate (in, -1, NULL)) {
160 return(g_strdup (in));
167 * mono_unicode_to_external:
168 * @uni: a UTF16 string to convert to an external representation.
170 * Turns NULL-terminated UTF16 into either UTF8, or the first
171 * working item in MONO_EXTERNAL_ENCODINGS if set. If no conversions
172 * work, then UTF8 is returned.
174 * Callers must free the returned string.
/*
 * Converts the NUL-terminated UTF-16 string @uni to an external encoding.
 * The input is first converted to UTF-8 (asserting that this succeeds), then
 * each encoding named in MONO_EXTERNAL_ENCODINGS is attempted in order:
 * g_locale_from_utf8() for "default_locale", g_convert() for any other name.
 *
 * NOTE(review): g_assert() aborts when g_utf16_to_utf8() returns NULL, e.g.
 * for malformed UTF-16 input -- confirm callers can never pass such input.
 * NOTE(review): the g_getenv() result is later passed to g_free(); this
 * presumably relies on g_getenv() returning an allocated copy -- confirm.
 */
176 gchar *mono_unicode_to_external (const gunichar2 *uni)
179 gchar *encoding_list;
181 /* Turn the unicode into utf8 to start with, because its
182 * easier to work with gchar * than gunichar2 *
184 utf8=g_utf16_to_utf8 (uni, -1, NULL, NULL, NULL);
185 g_assert (utf8!=NULL);
187 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
188 if(encoding_list==NULL) {
192 gchar *res, **encodings;
195 encodings=g_strsplit (encoding_list, ":", 0);
196 g_free (encoding_list);
197 for(i=0; encodings[i]!=NULL; i++) {
198 if(!strcmp (encodings[i], "default_locale")) {
199 res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
202 res=g_convert (utf8, -1, encodings[i], "UTF8",
208 g_strfreev (encodings);
214 g_strfreev (encodings);
217 /* Nothing else worked, so just return the utf8 */
222 * mono_utf8_validate_and_len
223 * @source: Pointer to putative UTF-8 encoded string.
225 * Checks @source for being valid UTF-8. @source is assumed to be
228 * Return value: true if @source is valid.
229 * oEnd : will equal the null terminator at the end of the string if valid.
230 * if not valid, it will equal the first character of the invalid sequence.
231 * oLength : will equal the length to @oEnd
/*
 * Validates @source as UTF-8 one sequence at a time, reporting through
 * @oLength and @oEnd. retVal carries the verdict; lastRet is compared with
 * it further down so that the first failure position can be recorded.
 */
234 mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
236 gboolean retVal = TRUE;
237 gboolean lastRet = TRUE;
/* Work on unsigned bytes so the range comparisons below behave as written. */
238 guchar* ptr = (guchar*) source;
/*
 * Sequence length implied by the first byte. srcPtr starts one past the
 * last byte of that sequence and is stepped backwards by the cases below.
 */
244 length = trailingBytesForUTF8 [*ptr] + 1;
245 srcPtr = (guchar*) ptr + length;
/* NOTE(review): the switch header is not shown here; presumably it switches on length, so lengths other than 1-4 are rejected. */
247 default: retVal = FALSE;
248 /* Everything else falls through when "TRUE"... */
/* Cases 4, 3 and 2 each step back one byte and require it to lie in 0x80..0xBF. */
249 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
/*
 * Fourth byte BE/BF, third byte BF and second byte 8F/9F/AF/BF is the byte
 * pattern of code points whose low 16 bits are FFFE or FFFF.
 */
250 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
251 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
252 *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
255 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
256 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
/*
 * After case 2, a holds the second byte of the sequence and *(srcPtr+1) is
 * the third. NOTE(review): the inner switch header is not shown; presumably
 * it switches on the first byte (*ptr). Per label:
 *   E0: second byte below A0 would be an overlong three-byte form
 *   ED: second byte above 9F would encode U+D800..U+DFFF (surrogates)
 *   EF: B7 90..AF is U+FDD0..U+FDEF; BF BE and BF BF are U+FFFE and U+FFFF
 *   F0: second byte below 90 would be an overlong four-byte form
 *   F4: second byte above 8F would exceed U+10FFFF
 */
259 /* no fall-through in this inner switch */
260 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
261 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
262 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
263 if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
264 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
265 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
266 default: if (a < (guchar) 0x80) retVal = FALSE;
/* First-byte values 0x80..0xC1 are rejected: continuation bytes, plus C0 and C1. */
269 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
/* First-byte values above 0xF4 are tested separately (the statement guarded by this test is not shown). */
271 if (*ptr > (guchar) 0xF4)
273 //If the string is invalid, set the end to the invalid byte.
/* Only the first transition to invalid (retVal false while lastRet is still true) reaches the store below. */
274 if (!retVal && lastRet) {
276 *oEnd = (gchar*) ptr;
/* On success, and only when the caller supplied oEnd, store the current ptr. */
282 if (retVal && oEnd != NULL)
283 *oEnd = (gchar*) ptr;
289 * mono_utf8_validate_and_len_with_bounds
290 * @source: Pointer to putative UTF-8 encoded string.
291 * @max_bytes: Max number of bytes that can be decoded. This function returns FALSE if
292 * it needs to decode characters beyond that.
294 * Checks @source for being valid UTF-8. @source is assumed to be
297 * Return value: true if @source is valid.
298 * oEnd : will equal the null terminator at the end of the string if valid.
299 * if not valid, it will equal the first character of the invalid sequence.
300 * oLength : will equal the length to @oEnd
/*
 * Bounded variant of the UTF-8 validator: same per-sequence checks, with
 * @max_bytes limiting how far past @source the input may be examined.
 * Reports through @oLength and @oEnd.
 */
303 mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
305 gboolean retVal = TRUE;
306 gboolean lastRet = TRUE;
/* Work on unsigned bytes so the range comparisons below behave as written. */
307 guchar* ptr = (guchar*) source;
/* One past the last byte that lies within max_bytes of the start. */
308 guchar *end = ptr + max_bytes;
316 *oEnd = (gchar*) ptr;
/*
 * Sequence length implied by the first byte. srcPtr starts one past the
 * last byte of that sequence and is stepped backwards by the cases below.
 */
321 length = trailingBytesForUTF8 [*ptr] + 1;
322 srcPtr = (guchar*) ptr + length;
324 /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
325 srcPtr points to the first byte after the current char.*/
/* NOTE(review): the switch header is not shown here; presumably it switches on length, so lengths other than 1-4 are rejected. */
331 default: retVal = FALSE;
332 /* Everything else falls through when "TRUE"... */
/* Cases 4, 3 and 2 each step back one byte and require it to lie in 0x80..0xBF. */
333 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
/*
 * Fourth byte BE/BF, third byte BF and second byte 8F/9F/AF/BF is the byte
 * pattern of code points whose low 16 bits are FFFE or FFFF.
 */
334 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
335 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
336 *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
339 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
340 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
/*
 * After case 2, a holds the second byte of the sequence and *(srcPtr+1) is
 * the third. NOTE(review): the inner switch header is not shown; presumably
 * it switches on the first byte (*ptr). Per label:
 *   E0: second byte below A0 would be an overlong three-byte form
 *   ED: second byte above 9F would encode U+D800..U+DFFF (surrogates)
 *   EF: B7 90..AF is U+FDD0..U+FDEF; BF BE and BF BF are U+FFFE and U+FFFF
 *   F0: second byte below 90 would be an overlong four-byte form
 *   F4: second byte above 8F would exceed U+10FFFF
 */
343 /* no fall-through in this inner switch */
344 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
345 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
346 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
347 if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
348 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
349 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
350 default: if (a < (guchar) 0x80) retVal = FALSE;
/* First-byte values 0x80..0xC1 are rejected: continuation bytes, plus C0 and C1. */
353 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
/* First-byte values above 0xF4 are tested separately (the statement guarded by this test is not shown). */
355 if (*ptr > (guchar) 0xF4)
357 //If the string is invalid, set the end to the invalid byte.
/* Only the first transition to invalid (retVal false while lastRet is still true) reaches the store below. */
358 if (!retVal && lastRet) {
360 *oEnd = (gchar*) ptr;
/* On success, and only when the caller supplied oEnd, store the current ptr. */
366 if (retVal && oEnd != NULL)
367 *oEnd = (gchar*) ptr;