X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;ds=sidebyside;f=eglib%2Fsrc%2Fgutf8.c;h=fed6dd753ca6921ff34a30fa99b43c67906853e1;hb=a2b8537da3747d7eb506ecdbb91a8a2bc35a2248;hp=e1165663dc6ba316a9a440f522253caacfb7dc99;hpb=b5cfba1835f2ba823796f825410e0062b7e4c9a3;p=mono.git

diff --git a/eglib/src/gutf8.c b/eglib/src/gutf8.c
index e1165663dc6..fed6dd753ca 100644
--- a/eglib/src/gutf8.c
+++ b/eglib/src/gutf8.c
@@ -12,99 +12,50 @@
 
 gpointer error_quark = "ERROR";
 
+static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
+static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
+
 gpointer
 g_convert_error_quark ()
 {
 	return error_quark;
 }
 
-gunichar2*
-g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
+/* Case-maps UTF-8 text via UTF-16 and UCS-4; returns new UTF-8 (typed gunichar*) or NULL on bad input. */
+static gunichar*
+utf8_case_conv (const gchar *str, gssize len, gboolean upper)
 {
-	/* The conversion logic is almost identical to UTF8Encoding.GetChars(),
-	   but error check is always done at utf8_to_utf16_len() so that
-	   the conversion core below simply resets erroreous bits */
-	glong utf16_len;
-	gunichar2 *ret;
-	gchar ch, mb_size, mb_remain;
-	guint32 codepoint;
-	glong in_pos, out_pos;
+	glong i, u16len, u32len;
+	gchar *u8str;
+	gunichar2 *u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, NULL);
+	gunichar *u32str = u16str ? g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, NULL) : NULL;
 
-	utf16_len = 0;
-	mb_size = 0;
-	mb_remain = 0;
-	in_pos = 0;
-	out_pos = 0;
-
-	if (error)
-		*error = NULL;
-
-	utf16_len = utf8_to_utf16_len (str, len, items_read, error);
-	if (error)
-		if (*error)
-			return NULL;
-	if (utf16_len < 0)
-		return NULL;
-
-	ret = g_malloc (utf16_len * sizeof (gunichar2));
-
-	for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
-		ch = (guchar) str [in_pos];
-		if (mb_size == 0) {
-			if (0 < ch)
-				ret [out_pos++] = ch;
-			else if ((ch & 0xE0) == 0xC0) {
-				codepoint = ch & 0x1F;
-				mb_remain = mb_size = 2;
-			} else if ((ch & 0xF0) == 0xE0) {
-				codepoint = ch & 0x0F;
-				mb_remain = mb_size = 3;
-			} else if ((ch & 0xF8) == 0xF0) {
-				codepoint = ch & 7;
-				mb_remain = mb_size = 4;
-			} else if ((ch & 0xFC) == 0xF8) {
-				codepoint = ch & 3;
-				mb_remain = mb_size = 5;
-			} else if ((ch & 0xFE) == 0xFC) {
-				codepoint = ch & 3;
-				mb_remain = mb_size = 6;
-			} else {
-				/* invalid utf-8 sequence */
-				codepoint = 0;
-				mb_remain = mb_size = 0;
-			}
-		} else {
-			if ((ch & 0xC0) == 0x80) {
-				codepoint = (codepoint << 6) | (ch & 0x3F);
-				if (--mb_remain == 0) {
-					/* multi byte character is fully consumed now. */
-					if (codepoint < 0x10000) {
-						ret [out_pos++] = codepoint;
-					} else if (codepoint < 0x110000) {
-						/* surrogate pair */
-						codepoint -= 0x10000;
-						ret [out_pos++] = (codepoint >> 10) + 0xD800;
-						ret [out_pos++] = (codepoint & 0x3FF) + 0xDC00;
-					} else {
-						/* invalid utf-8 sequence (excess) */
-						codepoint = 0;
-						mb_remain = mb_size = 0;
-					}
-				}
-			} else {
-				/* invalid utf-8 sequence */
-				codepoint = 0;
-				mb_remain = mb_size = 0;
-			}
-		}
+	g_free (u16str);	/* the UTF-16 staging copy is no longer needed */
+	if (u32str == NULL)
+		return NULL;	/* a conversion rejected the input; NULL must not reach g_ucs4_to_utf16 */
+	for (i = 0; i < u32len; i++) {
+		u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
 	}
+	u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, NULL);
+	g_free (u32str);
+	u8str = u16str ? g_utf16_to_utf8 (u16str, u16len, NULL, NULL, NULL) : NULL;
+	g_free (u16str);
+	return (gunichar*)u8str;	/* really a gchar*; the public wrappers cast it back */
+}
 
-	if (items_written)
-		*items_written = out_pos;
-	return ret;
+gchar*
+g_utf8_strup (const gchar *str, gssize len)
+{	/* Upper-cased copy of str built by utf8_case_conv; the cast only undoes that helper's gunichar* return type. */
+	return (gchar*)utf8_case_conv (str, len, TRUE);
 }
 
-glong
+gchar*
+g_utf8_strdown (const gchar *str, gssize len)
+{	/* Lower-cased copy of str built by utf8_case_conv; the cast only undoes that helper's gunichar* return type. */
+	return (gchar*)utf8_case_conv (str, len, FALSE);
+}
+
+static glong
 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
 {
 	/* It is almost identical to UTF8Encoding.GetCharCount() */
@@ -113,36 +64,50 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 	guint32 codepoint;
 	glong in_pos, ret;
 
+	if (len < 0)
+		len = (glong) strlen (str);
+
+	in_pos = 0;
+	ret = 0;
+
+	/* Common case */
+	for (in_pos = 0; in_pos < len && (guchar) str [in_pos] < 0x80; in_pos++)
+		ret ++;
+
+	if (in_pos == len) {
+		if (items_read)
+			*items_read = in_pos;
+		return ret;
+	}
+
 	mb_size = 0;
 	mb_remain = 0;
 	overlong = 0;
-	in_pos = 0;
-	ret = 0;
 
-	for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
+	for (; in_pos < len; in_pos++) {
 		ch = str [in_pos];
 		if (mb_size == 0) {
 			if (ch < 0x80)
 				ret++;
 			else if ((ch & 0xE0) == 0xC0) {
 				codepoint = ch & 0x1F;
-				mb_remain = mb_size = 2;
+				mb_size = 2;
 			} else if ((ch & 0xF0) == 0xE0) {
 				codepoint = ch & 0x0F;
-				mb_remain = mb_size = 3;
+				mb_size = 3;
 			} else if ((ch & 0xF8) == 0xF0) {
 				codepoint = ch & 7;
-				mb_remain = mb_size = 4;
+				mb_size = 4;
 			} else if ((ch & 0xFC) == 0xF8) {
 				codepoint = ch & 3;
-				mb_remain = mb_size = 5;
+				mb_size = 5;
 			} else if ((ch & 0xFE) == 0xFC) {
 				codepoint = ch & 3;
-				mb_remain = mb_size = 6;
+				mb_size = 6;
 			} else {
 				/* invalid utf-8 sequence */
 				if (error) {
-					g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
+					g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
 					if (items_read)
 						*items_read = in_pos;
 					return -1;
@@ -151,6 +116,8 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 					mb_remain = mb_size = 0;
 				}
 			}
+			if (mb_size > 1)
+				mb_remain = mb_size - 1;
 		} else {
 			if ((ch & 0xC0) == 0x80) {
 				codepoint = (codepoint << 6) | (ch & 0x3F);
@@ -183,7 +150,7 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 								return -1;
 							} else {
 								codepoint = 0;
-								mb_remain = mb_size = 0;
+								mb_remain = 0;
 								overlong = FALSE;
 							}
 						}
@@ -201,14 +168,15 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 							return -1;
 						} else {
 							codepoint = 0;
-							mb_remain = mb_size = 0;
+							mb_remain = 0;
 						}
 					}
+					mb_size = 0;
 				}
 			} else {
 				/* invalid utf-8 sequence */
 				if (error) {
-					g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d", in_pos);
+					g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
 					if (items_read)
 						*items_read = in_pos;
 					return -1;
@@ -225,6 +193,112 @@ utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **erro
 	return ret;
 }
 
+gunichar2*
+g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+	/* Converts UTF-8 (len bytes, or NUL-terminated when len < 0) into a g_malloc'ed,
+	   zero-terminated UTF-16 buffer. Mirrors UTF8Encoding.GetChars(); errors are detected
+	   up front by utf8_to_utf16_len(), so the core below only resets erroneous state. */
+	glong utf16_len;
+	gunichar2 *ret;
+	guchar ch, mb_size, mb_remain;	/* mb_size: bytes in the current sequence (0 = none open); mb_remain: continuation bytes still due */
+	guint32 codepoint;
+	glong in_pos, out_pos;
+
+	utf16_len = 0;
+	mb_size = 0;
+	mb_remain = 0;
+	in_pos = 0;
+	out_pos = 0;
+
+	if (error)
+		*error = NULL;	/* start from a clean slot so the check after the sizing pass is meaningful */
+
+	if (len < 0)
+		len = (glong) strlen (str);	/* negative len: measure the NUL-terminated input */
+
+	if (items_read)
+		*items_read = 0;
+	if (items_written)
+		*items_written = 0;
+	utf16_len = utf8_to_utf16_len (str, len, items_read, error);	/* validates and sizes in one pass */
+	if (error)
+		if (*error)
+			return NULL;	/* the sizing pass reported a problem */
+	if (utf16_len < 0)
+		return NULL;
+
+	ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));	/* one extra unit for the terminator written at the end */
+
+	/* Common case: a leading run of ASCII bytes is copied straight across */
+	for (in_pos = 0; in_pos < len; in_pos++) {
+		ch = (guchar) str [in_pos];
+
+		if (ch >= 0x80)
+			break;
+		ret [out_pos++] = ch;
+	}
+
+	for (; in_pos < len; in_pos++) {	/* general decoder, resuming at the first non-ASCII byte */
+		ch = (guchar) str [in_pos];
+		if (mb_size == 0) {
+			if (ch < 0x80)
+				ret [out_pos++] = ch;
+			else if ((ch & 0xE0) == 0xC0) {
+				codepoint = ch & 0x1F;
+				mb_size = 2;
+			} else if ((ch & 0xF0) == 0xE0) {
+				codepoint = ch & 0x0F;
+				mb_size = 3;
+			} else if ((ch & 0xF8) == 0xF0) {
+				codepoint = ch & 7;
+				mb_size = 4;
+			} else if ((ch & 0xFC) == 0xF8) {
+				codepoint = ch & 3;
+				mb_size = 5;
+			} else if ((ch & 0xFE) == 0xFC) {
+				codepoint = ch & 3;
+				mb_size = 6;
+			} else {
+				/* invalid utf-8 sequence */
+				codepoint = 0;
+				mb_remain = mb_size = 0;
+			}
+			if (mb_size > 1)
+				mb_remain = mb_size - 1;	/* the lead byte itself is already consumed */
+		} else {
+			if ((ch & 0xC0) == 0x80) {
+				codepoint = (codepoint << 6) | (ch & 0x3F);
+				if (--mb_remain == 0) {
+					/* multi byte character is fully consumed now. */
+					if (codepoint < 0x10000) {
+						ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
+					} else if (codepoint < 0x110000) {
+						/* surrogate pair */
+						codepoint -= 0x10000;
+						ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
+						ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
+					} else {
+						/* invalid utf-8 sequence (excess) */
+						codepoint = 0;
+						mb_remain = 0;
+					}
+					mb_size = 0;	/* sequence closed: the next byte starts a new character */
+				}
+			} else {
+				/* invalid utf-8 sequence */
+				codepoint = 0;
+				mb_remain = mb_size = 0;
+			}
+		}
+	}
+
+	ret [out_pos] = 0;
+	if (items_written)
+		*items_written = out_pos;	/* units written, terminator not counted */
+	return ret;
+}
+
 gchar*
 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
 {
@@ -235,13 +309,17 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
 	gchar *ret;
 	glong in_pos, out_pos;
 	gunichar2 ch;
-	guint32 codepoint;
+	guint32 codepoint = 0;
 	gboolean surrogate;
 
 	in_pos = 0;
 	out_pos = 0;
 	surrogate = FALSE;
 
+	if (items_read)
+		*items_read = 0;
+	if (items_written)
+		*items_written = 0;
 	utf8_len = utf16_to_utf8_len (str, len, items_read, error);
 	if (error)
 		if (*error)
@@ -249,23 +327,26 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
 	if (utf8_len < 0)
 		return NULL;
 
-	ret = g_malloc (utf8_len * sizeof (gchar));
+	ret = g_malloc ((1+utf8_len) * sizeof (gchar));
 
 	while (len < 0 ? str [in_pos] : in_pos < len) {
 		ch = str [in_pos];
 		if (surrogate) {
-			surrogate = 0;
-			if (ch >= 0xDC00 && ch <= 0xDFFF)
+			if (ch >= 0xDC00 && ch <= 0xDFFF) {
 				codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
-			else
+				surrogate = 0;
+			} else {
+				surrogate = 0;
 				/* invalid surrogate pair */
+				++in_pos;
 				continue;
+			}
 		} else {
 			/* fast path optimization */
 			if (ch < 0x80) {
 				for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 					if (str [in_pos] < 0x80)
-						ret [out_pos++] = str [in_pos];
+						ret [out_pos++] = (gchar)(str [in_pos]);
 					else
 						break;
 				}
@@ -274,6 +355,7 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
 			else if (ch >= 0xD800 && ch <= 0xDBFF)
 				surrogate = ch;
 			else if (ch >= 0xDC00 && ch <= 0xDFFF) {
+				++in_pos;
 				/* invalid surrogate pair */
 				continue;
 			}
@@ -282,6 +364,8 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
 		}
 		in_pos++;
 
+		if (surrogate != 0)
+			continue;
 		if (codepoint < 0x80)
 			ret [out_pos++] = (gchar) codepoint;
 		else if (codepoint < 0x0800) {
@@ -298,13 +382,14 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
 			ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 		}
 	}
+	ret [out_pos] = 0;
 
 	if (items_written)
 		*items_written = out_pos;
 	return ret;
 }
 
-glong
+static glong
 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
 {
 	glong ret, in_pos;
@@ -364,3 +449,171 @@ utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **
 		*items_read = in_pos;
 	return ret;
 }
+
+/* Sizing pass for g_ucs4_to_utf16: returns how many UTF-16 units str needs.
+ * The scan ends at a zero entry or after len entries; len is only counted
+ * down, so a negative len leaves the terminator as the practical bound.
+ * A surrogate value or a value above 0x10FFFF ends the scan: error is set
+ * (when non-NULL), *items_read gets the offending index and 0 is returned.
+ * Otherwise *items_read receives the number of entries consumed.
+ * items_read may be NULL. */
+static glong
+g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
+{
+	const gunichar *cur;
+	glong units = 0;
+
+	/* NULL input counts as empty; *items_read is not written in that case. */
+	if (str == NULL)
+		return 0;
+
+	for (cur = str; *cur != '\0' && len--; cur++) {
+		const gunichar c = *cur;
+		const gboolean in_bmp = (c <= 0x0000FFFF);
+
+		if (in_bmp && (c < 0xD800 || c > 0xDFFF)) {
+			units++;
+		} else if (!in_bmp && c <= 0x10FFFF) {
+			units += 2;	/* encoded as a surrogate pair */
+		} else {
+			/* Both failures share an error code; only the message differs. */
+			if (error && in_bmp)
+				g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+				"Invalid sequence in conversion input");
+			else if (error)
+				g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+				"Character out of range for UTF-16");
+			if (items_read)
+				*items_read = (glong)(cur - str);
+			return 0;
+		}
+	}
+
+	if (items_read)
+		*items_read = (glong)(cur - str);
+	return units;
+}
+
+/* Converts UCS-4 to a new g_malloc'ed, zero-terminated UTF-16 buffer. Input ends at a
+ * zero entry or after len entries; NULL str yields an empty buffer. Invalid input returns
+ * NULL and is reported via error. items_read, items_written and error may be NULL. */
+gunichar2*
+g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+	glong allocsz, nwritten = 0;
+	gunichar2 *retstr = NULL, *retch;
+	GError *lerror = NULL;
+
+	if (items_read)
+		*items_read = 0;	/* the sizing pass does not write it for a NULL str */
+	allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
+	if (!lerror) {
+		retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
+		retstr[allocsz] = '\0';
+		while (str && *str != '\0' && len--) {	/* str may be NULL: the sizing pass accepts it */
+			gunichar ch = *str++;
+			if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
+				*retch++ = (gunichar2)ch;
+			} else {
+				ch -= 0x0010000UL;
+				*retch++ = (gunichar2)((ch >> 10) + 0xD800);
+				*retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
+			}
+		}
+		nwritten = (glong)(retch - retstr);
+	}
+
+	if (items_written)
+		*items_written = nwritten;
+	if (error)
+		*error = lerror;
+	else if (lerror)
+		g_error_free (lerror);	/* no one to hand it to; do not leak it */
+	return retstr;
+}
+
+/* Sizing pass for g_utf16_to_ucs4: returns the number of code points in str.
+ * The scan ends at a zero unit or after len units; a high surrogate that is
+ * the last unit allowed by len is left unconsumed. A high surrogate without
+ * a low one after it, or a low surrogate on its own, ends the scan: error is
+ * set (when non-NULL), *items_read gets the offending index and 0 is
+ * returned. Otherwise *items_read receives the number of units consumed.
+ * A NULL str returns 0 without writing *items_read. */
+static glong
+g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
+{
+	const gunichar2 *cur = str;
+	glong count = 0;
+
+	if (str == NULL)
+		return 0;
+
+	/* count advances once per completed code point; break skips the increment */
+	for (; *cur != '\0' && len--; count++) {
+		const gunichar2 unit = *cur;
+
+		if (unit >= 0xDC00 && unit <= 0xDFFF)
+			goto invalid;	/* stray low surrogate; cur is its index */
+		cur++;
+		if (unit < 0xD800 || unit > 0xDBFF)
+			continue;	/* ordinary single-unit character */
+
+		/* High surrogate: the following unit must be its low half. */
+		if (!len--) {
+			cur--;	/* pair is cut off by len: hand the lead unit back */
+			break;
+		}
+		if (*cur < 0xDC00 || *cur > 0xDFFF)
+			goto invalid;	/* cur is the index of the bad trail unit */
+		cur++;
+	}
+
+	if (items_read)
+		*items_read = (glong)(cur - str);
+
+	return count;
+
+invalid:
+	if (error)
+		g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+		"Invalid sequence in conversion input");
+	if (items_read)
+		*items_read = (glong)(cur - str);
+	return 0;
+}
+
+/* Converts UTF-16 to a new g_malloc'ed, zero-terminated UCS-4 buffer. Input ends at a
+ * zero unit or after len units; NULL str yields an empty buffer. Invalid surrogates return
+ * NULL and are reported via error. items_read, items_written and error may be NULL. */
+gunichar*
+g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+	glong allocsz, nwritten = 0;
+	gunichar *retstr = NULL, *retch;
+	GError *lerror = NULL;
+
+	if (items_read)
+		*items_read = 0;	/* the sizing pass does not write it for a NULL str */
+	allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
+	if (!lerror) {
+		retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
+		retstr[allocsz] = '\0';
+		nwritten = allocsz;
+		while (str && *str != '\0' && allocsz--) {	/* str may be NULL; pairs were validated by the sizing pass */
+			gunichar ch = *str++;
+			if (ch >= 0xD800 && ch <= 0xDBFF) {
+				gunichar ch2 = *str++;
+				ch = ((ch - (gunichar)0xD800) << 10) + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
+			}
+			*retch++ = ch;
+		}
+	}
+
+	if (items_written)
+		*items_written = nwritten;
+	if (error)
+		*error = lerror;
+	else if (lerror)
+		g_error_free (lerror);	/* no one to hand it to; do not leak it */
+	return retstr;
+}