X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mono%2Futils%2Fstrenc.c;h=2ece0733d122a42d8ac969a7027081faf6af501a;hb=5ea6b70ca10d57d13ea7ac6cae5370f04d28c428;hp=c37bb3d20168ffb6c93473b33ce7558b4da2ffb7;hpb=af90548a08ef5effc93b083b7eec44daa178b141;p=mono.git

diff --git a/mono/utils/strenc.c b/mono/utils/strenc.c
index c37bb3d2016..2ece0733d12 100644
--- a/mono/utils/strenc.c
+++ b/mono/utils/strenc.c
@@ -1,5 +1,6 @@
-/*
- * strenc.c: string encoding conversions
+/**
+ * \file
+ * string encoding conversions
  *
  * Author:
  *	Dick Porter (dick@ximian.com)
@@ -13,22 +14,30 @@
 
 #include "strenc.h"
 
-#undef DEBUG
+static const char trailingBytesForUTF8[256] = { /* indexed by byte value; entry = number of bytes expected to follow that byte in a sequence */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF */
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF */
+	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0 /* 0xE0-0xEF: 2, 0xF0-0xF7: 3, 0xF8-0xFB: 4, 0xFC-0xFD: 5, 0xFE-0xFF: 0 */
+};
 
 /**
  * mono_unicode_from_external:
- * @in: pointers to the buffer.
- * @bytes: number of bytes in the string.
- *
- * Tries to turn a NULL-terminated string into UTF16.
+ * \param in pointer to the buffer.
+ * \param bytes number of bytes in the string.
+ * Tries to turn a NULL-terminated string into UTF-16.
  *
- * First, see if it's valid UTF8, in which case just turn it directly
- * into UTF16.  Next, run through the colon-separated encodings in
- * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
- * returning the first successful conversion to UTF16.  If no
+ * First, see if it's valid UTF-8, in which case just turn it directly
+ * into UTF-16.  Next, run through the colon-separated encodings in
+ * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
+ * returning the first successful conversion to UTF-16.  If no
  * conversion succeeds, return NULL.
  *
- * Callers must free the returned string if not NULL. bytes holds the number
+ * Callers must free the returned string if not NULL. \p bytes holds the number
  * of bytes in the returned string, not including the terminator.
  */
 gunichar2 *
@@ -36,7 +45,7 @@ mono_unicode_from_external (const gchar *in, gsize *bytes)
 {
 	gchar *res=NULL;
 	gchar **encodings;
-	const gchar *encoding_list;
+	gchar *encoding_list;
 	int i;
 	glong lbytes;
 	
@@ -46,15 +55,12 @@ mono_unicode_from_external (const gchar *in, gsize *bytes)
 	
 	encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
 	if(encoding_list==NULL) {
-		encoding_list = "";
+		encoding_list = g_strdup("");
 	}
 	
 	encodings=g_strsplit (encoding_list, ":", 0);
+	g_free (encoding_list);
 	for(i=0;encodings[i]!=NULL; i++) {
-#ifdef DEBUG
-		g_message (G_GNUC_PRETTY_FUNCTION ": Trying encoding [%s]",
-			   encodings[i]);
-#endif
 		/* "default_locale" is a special case encoding */
 		if(!strcmp (encodings[i], "default_locale")) {
 			gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
@@ -94,27 +100,26 @@ mono_unicode_from_external (const gchar *in, gsize *bytes)
 
 /**
  * mono_utf8_from_external:
- * @in: pointer to the string buffer.
- *
+ * \param in pointer to the string buffer.
  * Tries to turn a NULL-terminated string into UTF8.
  *
- * First, see if it's valid UTF8, in which case there's nothing more
+ * First, see if it's valid UTF-8, in which case there's nothing more
  * to be done.  Next, run through the colon-separated encodings in
- * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
- * returning the first successful conversion to utf8.  If no
+ * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
+ * returning the first successful conversion to UTF-8.  If no
  * conversion succeeds, return NULL.
  *
  * Callers must free the returned string if not NULL.
  *
- * This function is identical to mono_unicode_from_external, apart
- * from returning utf8 not utf16; it's handy in a few places to work
- * in utf8.
+ * This function is identical to \c mono_unicode_from_external, apart
+ * from returning UTF-8 not UTF-16; it's handy in a few places to work
+ * in UTF-8.
  */
 gchar *mono_utf8_from_external (const gchar *in)
 {
 	gchar *res=NULL;
 	gchar **encodings;
-	const gchar *encoding_list;
+	gchar *encoding_list;
 	int i;
 	
 	if(in==NULL) {
@@ -123,15 +128,12 @@ gchar *mono_utf8_from_external (const gchar *in)
 	
 	encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
 	if(encoding_list==NULL) {
-		encoding_list = "";
+		encoding_list = g_strdup("");
 	}
 	
 	encodings=g_strsplit (encoding_list, ":", 0);
+	g_free (encoding_list);
 	for(i=0;encodings[i]!=NULL; i++) {
-#ifdef DEBUG
-		g_message (G_GNUC_PRETTY_FUNCTION ": Trying encoding [%s]",
-			   encodings[i]);
-#endif
 		
 		/* "default_locale" is a special case encoding */
 		if(!strcmp (encodings[i], "default_locale")) {
@@ -162,18 +164,16 @@ gchar *mono_utf8_from_external (const gchar *in)
 
 /**
  * mono_unicode_to_external:
- * @uni: an UTF16 string to conver to an external representation.
- *
- * Turns NULL-terminated UTF16 into either UTF8, or the first
- * working item in MONO_EXTERNAL_ENCODINGS if set.  If no conversions
- * work, then UTF8 is returned.
- *
+ * \param uni a UTF-16 string to convert to an external representation.
+ * Turns NULL-terminated UTF-16 into either UTF-8, or the first
+ * working item in \c MONO_EXTERNAL_ENCODINGS if set.  If no conversions
+ * work, then UTF-8 is returned.
  * Callers must free the returned string.
  */
 gchar *mono_unicode_to_external (const gunichar2 *uni)
 {
 	gchar *utf8;
-	const gchar *encoding_list;
+	gchar *encoding_list;
 	
 	/* Turn the unicode into utf8 to start with, because its
 	 * easier to work with gchar * than gunichar2 *
@@ -190,6 +190,7 @@ gchar *mono_unicode_to_external (const gunichar2 *uni)
 		int i;
 		
 		encodings=g_strsplit (encoding_list, ":", 0);
+		g_free (encoding_list);
 		for(i=0; encodings[i]!=NULL; i++) {
 			if(!strcmp (encodings[i], "default_locale")) {
 				res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
@@ -214,3 +215,152 @@ gchar *mono_unicode_to_external (const gunichar2 *uni)
 	return(utf8);
 }
 
+/**
+ * mono_utf8_validate_and_len
+ * \param source Pointer to a putative UTF-8 encoded string; it is read up to its null terminator.
+ * \param oLength Must not be NULL. Reset to 0, then incremented once for every sequence stepped over.
+ * \param oEnd May be NULL. If \p source is valid it receives the address of the null terminator;
+ * if not valid, it receives the address of the first character of the first invalid sequence.
+ * Checks \p source for being valid UTF-8. Scanning continues after an invalid sequence is found,
+ * so on failure \p oLength is the count over the whole scan rather than the length up to \p oEnd.
+ * \returns TRUE if \p source is valid, FALSE otherwise.
+ **/
+gboolean
+mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
+{
+	gboolean retVal = TRUE; /* cleared by the first failed check and never set again */
+	gboolean lastRet = TRUE; /* TRUE until the first failure has been handled below */
+	guchar* ptr = (guchar*) source; /* first byte of the sequence being examined */
+	guchar* srcPtr; /* walks backwards over the bytes of the current sequence */
+	guint length; /* byte count of the current sequence, derived from its first byte */
+	guchar a; /* byte most recently read through srcPtr */
+	*oLength = 0;
+	while (*ptr != 0) {
+		length = trailingBytesForUTF8 [*ptr] + 1;
+		srcPtr = (guchar*) ptr + length; /* one past the sequence. NOTE(review): nothing checks this against the terminator, so a sequence cut short by the NUL is read past it */
+		switch (length) {
+		default: retVal = FALSE;
+		/* Lengths above 4 are rejected by the default label; every label deliberately falls through to the ones below it. */
+		case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; /* 4th byte must be 0x80..0xBF */
+				if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) { /* 3rd byte BF, 4th byte BE or BF ... */
+				if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F || /* ... and 2nd byte ending in F: */
+					*(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF) /* the low 16 bits of the code point are FFFE or FFFF */
+					retVal = FALSE;
+				}
+		case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; /* 3rd byte must be 0x80..0xBF */
+		case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; /* 2nd byte must be 0x80..0xBF; a and srcPtr now refer to it */
+
+		switch (*ptr) {
+		/* no fall-through in this inner switch; it narrows the allowed 2nd byte for particular first bytes */
+		case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break; /* E0 80..9F would be an overlong 3-byte form */
+		case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break; /* ED A0..BF would encode U+D800..U+DFFF */
+		case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE; /* EF B7 90..AF is U+FDD0..U+FDEF */
+				   if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break; /* EF BF BE and EF BF BF are U+FFFE and U+FFFF */
+		case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break; /* F0 80..8F would be an overlong 4-byte form */
+		case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break; /* F4 90..BF would be above U+10FFFF */
+		default:   if (a < (guchar) 0x80) retVal = FALSE; /* same lower bound the case 2 check already applied */
+		}
+
+		case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE; /* 0x80..0xC1 cannot start a sequence */
+		}
+		if (*ptr > (guchar) 0xF4) /* first bytes above 0xF4 are always rejected */
+			retVal = FALSE;
+		//Record only the first failure: lastRet keeps later iterations from overwriting *oEnd.
+		if (!retVal && lastRet) {
+			if (oEnd != NULL)
+				*oEnd = (gchar*) ptr;
+			lastRet = FALSE;
+		}
+		ptr += length; /* the scan goes on even after a failure */
+		(*oLength)++;
+	}
+	if (retVal && oEnd != NULL)
+		*oEnd = (gchar*) ptr; /* ptr is at the null terminator here */
+	return retVal;
+}
+
+
+/**
+ * mono_utf8_validate_and_len_with_bounds
+ * \param source Pointer to putative UTF-8 encoded string.
+ * \param max_bytes Max number of bytes of \p source that may be read.
+ * \param oLength Must not be NULL. Reset to 0, then incremented once for every sequence stepped over.
+ * \param oEnd May be NULL; see below for when it is written.
+ *
+ * Checks \p source for being valid UTF-8. \p source is expected to be null-terminated within the
+ * first \p max_bytes bytes: the function returns FALSE as soon as a sequence plus the byte after it
+ * would not fit in that range, and also when \p max_bytes is less than 1.
+ *
+ * \returns TRUE if \p source is valid and fits within \p max_bytes.
+ * \p oEnd will equal the null terminator if valid, \p source if \p max_bytes is less than 1, and otherwise the
+ * first character of the first invalid sequence; it is left unwritten when only the bounds check fails.
+ **/
+gboolean
+mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
+{
+	gboolean retVal = TRUE; /* cleared by the first failed check and never set again */
+	gboolean lastRet = TRUE; /* TRUE until the first failure has been handled below */
+	guchar* ptr = (guchar*) source; /* first byte of the sequence being examined */
+	guchar *end = ptr + max_bytes; /* one past the last byte that may be read */
+	guchar* srcPtr; /* walks backwards over the bytes of the current sequence */
+	guint length; /* byte count of the current sequence, derived from its first byte */
+	guchar a; /* byte most recently read through srcPtr */
+	*oLength = 0;
+
+	if (max_bytes < 1) { /* not even the first byte may be read */
+		if (oEnd)
+			*oEnd = (gchar*) ptr;
+		return FALSE;
+	}
+
+	while (*ptr != 0) {
+		length = trailingBytesForUTF8 [*ptr] + 1;
+		srcPtr = (guchar*) ptr + length;
+		
+		/* *ptr is not zero, so both the current sequence and the byte after it must lie before end.
+		   srcPtr points to that byte after the sequence; this path leaves the loop without writing *oEnd. */
+		if (srcPtr >= end) {
+			retVal = FALSE;
+			break;
+		}
+		switch (length) {
+		default: retVal = FALSE;
+		/* Lengths above 4 are rejected by the default label; every label deliberately falls through to the ones below it. */
+		case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; /* 4th byte must be 0x80..0xBF */
+				if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) { /* 3rd byte BF, 4th byte BE or BF ... */
+				if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F || /* ... and 2nd byte ending in F: */
+					*(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF) /* the low 16 bits of the code point are FFFE or FFFF */
+					retVal = FALSE;
+				}
+		case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; /* 3rd byte must be 0x80..0xBF */
+		case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE; /* 2nd byte must be 0x80..0xBF; a and srcPtr now refer to it */
+
+		switch (*ptr) {
+		/* no fall-through in this inner switch; it narrows the allowed 2nd byte for particular first bytes */
+		case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break; /* E0 80..9F would be an overlong 3-byte form */
+		case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break; /* ED A0..BF would encode U+D800..U+DFFF */
+		case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE; /* EF B7 90..AF is U+FDD0..U+FDEF */
+				   if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break; /* EF BF BE and EF BF BF are U+FFFE and U+FFFF */
+		case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break; /* F0 80..8F would be an overlong 4-byte form */
+		case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break; /* F4 90..BF would be above U+10FFFF */
+		default:   if (a < (guchar) 0x80) retVal = FALSE; /* same lower bound the case 2 check already applied */
+		}
+
+		case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE; /* 0x80..0xC1 cannot start a sequence */
+		}
+		if (*ptr > (guchar) 0xF4) /* first bytes above 0xF4 are always rejected */
+			retVal = FALSE;
+		//Record only the first failure: lastRet keeps later iterations from overwriting *oEnd.
+		if (!retVal && lastRet) {
+			if (oEnd != NULL)
+				*oEnd = (gchar*) ptr;
+			lastRet = FALSE;
+		}
+		ptr += length; /* stays below end because srcPtr was checked above */
+		(*oLength)++;
+	}
+	if (retVal && oEnd != NULL)
+		*oEnd = (gchar*) ptr; /* ptr is at the null terminator here */
+	return retVal;
+}
+