X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mono%2Futils%2Fstrenc.c;h=2ece0733d122a42d8ac969a7027081faf6af501a;hb=5ea6b70ca10d57d13ea7ac6cae5370f04d28c428;hp=c37bb3d20168ffb6c93473b33ce7558b4da2ffb7;hpb=af90548a08ef5effc93b083b7eec44daa178b141;p=mono.git diff --git a/mono/utils/strenc.c b/mono/utils/strenc.c index c37bb3d2016..2ece0733d12 100644 --- a/mono/utils/strenc.c +++ b/mono/utils/strenc.c @@ -1,5 +1,6 @@ -/* - * strenc.c: string encoding conversions +/** + * \file + * string encoding conversions * * Author: * Dick Porter (dick@ximian.com) @@ -13,22 +14,30 @@ #include "strenc.h" -#undef DEBUG +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0 +}; /** * mono_unicode_from_external: - * @in: pointers to the buffer. - * @bytes: number of bytes in the string. - * - * Tries to turn a NULL-terminated string into UTF16. + * \param in pointers to the buffer. + * \param bytes number of bytes in the string. + * Tries to turn a NULL-terminated string into UTF-16. * - * First, see if it's valid UTF8, in which case just turn it directly - * into UTF16. Next, run through the colon-separated encodings in - * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each, - * returning the first successful conversion to UTF16. If no + * First, see if it's valid UTF-8, in which case just turn it directly + * into UTF-16. Next, run through the colon-separated encodings in + * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each, + * returning the first successful conversion to UTF-16. If no * conversion succeeds, return NULL. * - * Callers must free the returned string if not NULL. bytes holds the number + * Callers must free the returned string if not NULL. \p bytes holds the number * of bytes in the returned string, not including the terminator. */ gunichar2 * @@ -36,7 +45,7 @@ mono_unicode_from_external (const gchar *in, gsize *bytes) { gchar *res=NULL; gchar **encodings; - const gchar *encoding_list; + gchar *encoding_list; int i; glong lbytes; @@ -46,15 +55,12 @@ mono_unicode_from_external (const gchar *in, gsize *bytes) encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS"); if(encoding_list==NULL) { - encoding_list = ""; + encoding_list = g_strdup(""); } encodings=g_strsplit (encoding_list, ":", 0); + g_free (encoding_list); for(i=0;encodings[i]!=NULL; i++) { -#ifdef DEBUG - g_message (G_GNUC_PRETTY_FUNCTION ": Trying encoding [%s]", - encodings[i]); -#endif /* "default_locale" is a special case encoding */ if(!strcmp (encodings[i], "default_locale")) { gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL); @@ -94,27 +100,26 @@ mono_unicode_from_external (const gchar *in, gsize *bytes) /** * mono_utf8_from_external: - * @in: pointer to the string buffer. - * + * \param in pointer to the string buffer. * Tries to turn a NULL-terminated string into UTF8. * - * First, see if it's valid UTF8, in which case there's nothing more + * First, see if it's valid UTF-8, in which case there's nothing more * to be done. Next, run through the colon-separated encodings in - * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each, - * returning the first successful conversion to utf8. If no + * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each, + * returning the first successful conversion to UTF-8. If no * conversion succeeds, return NULL. * * Callers must free the returned string if not NULL. * - * This function is identical to mono_unicode_from_external, apart - * from returning utf8 not utf16; it's handy in a few places to work - * in utf8. + * This function is identical to \c mono_unicode_from_external, apart + * from returning UTF-8 not UTF-16; it's handy in a few places to work + * in UTF-8. */ gchar *mono_utf8_from_external (const gchar *in) { gchar *res=NULL; gchar **encodings; - const gchar *encoding_list; + gchar *encoding_list; int i; if(in==NULL) { @@ -123,15 +128,12 @@ gchar *mono_utf8_from_external (const gchar *in) encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS"); if(encoding_list==NULL) { - encoding_list = ""; + encoding_list = g_strdup(""); } encodings=g_strsplit (encoding_list, ":", 0); + g_free (encoding_list); for(i=0;encodings[i]!=NULL; i++) { -#ifdef DEBUG - g_message (G_GNUC_PRETTY_FUNCTION ": Trying encoding [%s]", - encodings[i]); -#endif /* "default_locale" is a special case encoding */ if(!strcmp (encodings[i], "default_locale")) { @@ -162,18 +164,16 @@ gchar *mono_utf8_from_external (const gchar *in) /** * mono_unicode_to_external: - * @uni: an UTF16 string to conver to an external representation. - * - * Turns NULL-terminated UTF16 into either UTF8, or the first - * working item in MONO_EXTERNAL_ENCODINGS if set. If no conversions - * work, then UTF8 is returned. - * + * \param uni a UTF-16 string to convert to an external representation. + * Turns NULL-terminated UTF-16 into either UTF-8, or the first + * working item in \c MONO_EXTERNAL_ENCODINGS if set. If no conversions + * work, then UTF-8 is returned. * Callers must free the returned string. */ gchar *mono_unicode_to_external (const gunichar2 *uni) { gchar *utf8; - const gchar *encoding_list; + gchar *encoding_list; /* Turn the unicode into utf8 to start with, because its * easier to work with gchar * than gunichar2 * @@ -190,6 +190,7 @@ gchar *mono_unicode_to_external (const gunichar2 *uni) int i; encodings=g_strsplit (encoding_list, ":", 0); + g_free (encoding_list); for(i=0; encodings[i]!=NULL; i++) { if(!strcmp (encodings[i], "default_locale")) { res=g_locale_from_utf8 (utf8, -1, NULL, NULL, @@ -214,3 +215,152 @@ gchar *mono_unicode_to_external (const gunichar2 *uni) return(utf8); } +/** + * mono_utf8_validate_and_len + * \param source Pointer to putative UTF-8 encoded string. + * Checks \p source for being valid UTF-8. \p utf is assumed to be + * null-terminated. + * \returns TRUE if \p source is valid. + * \p oEnd will equal the null terminator at the end of the string if valid. + * if not valid, it will equal the first charater of the invalid sequence. + * \p oLength will equal the length to \p oEnd + **/ +gboolean +mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd) +{ + gboolean retVal = TRUE; + gboolean lastRet = TRUE; + guchar* ptr = (guchar*) source; + guchar* srcPtr; + guint length; + guchar a; + *oLength = 0; + while (*ptr != 0) { + length = trailingBytesForUTF8 [*ptr] + 1; + srcPtr = (guchar*) ptr + length; + switch (length) { + default: retVal = FALSE; + /* Everything else falls through when "TRUE"... */ + case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; + if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) { + if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F || + *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF) + retVal = FALSE; + } + case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; + case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; + + switch (*ptr) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break; + case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break; + case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE; + if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break; + case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break; + case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break; + default: if (a < (guchar) 0x80) retVal = FALSE; + } + + case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE; + } + if (*ptr > (guchar) 0xF4) + retVal = FALSE; + //If the string is invalid, set the end to the invalid byte. + if (!retVal && lastRet) { + if (oEnd != NULL) + *oEnd = (gchar*) ptr; + lastRet = FALSE; + } + ptr += length; + (*oLength)++; + } + if (retVal && oEnd != NULL) + *oEnd = (gchar*) ptr; + return retVal; +} + + +/** + * mono_utf8_validate_and_len_with_bounds + * \param source: Pointer to putative UTF-8 encoded string. + * \param max_bytes: Max number of bytes that can be decoded. + * + * Checks \p source for being valid UTF-8. \p utf is assumed to be + * null-terminated. + * + * This function returns FALSE if it needs to decode characters beyond \p max_bytes. + * + * \returns TRUE if \p source is valid. + * \p oEnd will equal the null terminator at the end of the string if valid. + * if not valid, it will equal the first charater of the invalid sequence. + * \p oLength will equal the length to \p oEnd + **/ +gboolean +mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd) +{ + gboolean retVal = TRUE; + gboolean lastRet = TRUE; + guchar* ptr = (guchar*) source; + guchar *end = ptr + max_bytes; + guchar* srcPtr; + guint length; + guchar a; + *oLength = 0; + + if (max_bytes < 1) { + if (oEnd) + *oEnd = (gchar*) ptr; + return FALSE; + } + + while (*ptr != 0) { + length = trailingBytesForUTF8 [*ptr] + 1; + srcPtr = (guchar*) ptr + length; + + /* since *ptr is not zero we must ensure that we can decode the current char + the byte after + srcPtr points to the first byte after the current char.*/ + if (srcPtr >= end) { + retVal = FALSE; + break; + } + switch (length) { + default: retVal = FALSE; + /* Everything else falls through when "TRUE"... */ + case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; + if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) { + if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F || + *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF) + retVal = FALSE; + } + case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; + case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; + + switch (*ptr) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break; + case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break; + case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE; + if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break; + case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break; + case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break; + default: if (a < (guchar) 0x80) retVal = FALSE; + } + + case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE; + } + if (*ptr > (guchar) 0xF4) + retVal = FALSE; + //If the string is invalid, set the end to the invalid byte. + if (!retVal && lastRet) { + if (oEnd != NULL) + *oEnd = (gchar*) ptr; + lastRet = FALSE; + } + ptr += length; + (*oLength)++; + } + if (retVal && oEnd != NULL) + *oEnd = (gchar*) ptr; + return retVal; +} +