X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=src%2Fvm%2Futf8.c;h=4c65b583598801b950d8227a8a87118ba1998a78;hb=4f7f3896b97b7c228687fc2f4f04fcca7cf8f67f;hp=cd16f8b3f7ccfb11384b667b8883d3693d433c38;hpb=cbbf96f102bf6b0ea93fcd4f099487858b9c96d0;p=cacao.git

diff --git a/src/vm/utf8.c b/src/vm/utf8.c
index cd16f8b3f..4c65b5835 100644
--- a/src/vm/utf8.c
+++ b/src/vm/utf8.c
@@ -31,7 +31,7 @@
             Christian Thalinger
 			Edwin Steiner
 
-   $Id: utf8.c 4900 2006-05-11 09:18:28Z twisti $
+   $Id: utf8.c 5920 2006-11-05 21:23:09Z twisti $
 
 */
 
@@ -45,12 +45,10 @@
 
 #include "mm/memory.h"
 
-#if defined(USE_THREADS)
-# if defined(NATIVE_THREADS)
-#  include "threads/native/threads.h"
-# else
-#  include "threads/green/threads.h"
-# endif
+#if defined(ENABLE_THREADS)
+# include "threads/native/lock.h"
+#else
+# include "threads/none/lock.h"
 #endif
 
 #include "vm/builtin.h"
@@ -87,12 +85,14 @@ utf *utf_java_io_Serializable;
 utf *utf_java_lang_Throwable;
 utf *utf_java_lang_VMThrowable;
 utf *utf_java_lang_Error;
-utf *utf_java_lang_NoClassDefFoundError;
+utf *utf_java_lang_AbstractMethodError;
 utf *utf_java_lang_LinkageError;
+utf *utf_java_lang_NoClassDefFoundError;
 utf *utf_java_lang_NoSuchMethodError;
 utf *utf_java_lang_OutOfMemoryError;
 
 utf *utf_java_lang_Exception;
+utf *utf_java_lang_ClassCastException;
 utf *utf_java_lang_ClassNotFoundException;
 utf *utf_java_lang_IllegalArgumentException;
 utf *utf_java_lang_IllegalMonitorStateException;
@@ -121,6 +121,7 @@ utf *utf_Code;                          /* Code                               */
 utf *utf_Exceptions;                    /* Exceptions                         */
 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 utf *utf_SourceFile;                    /* SourceFile                         */
+utf *utf_Signature;                     /* Signature                          */
 
 utf *utf_init;                          /* <init>                             */
 utf *utf_clinit;                        /* <clinit>                           */
@@ -128,11 +129,12 @@ utf *utf_clone;                         /* clone                              */
 utf *utf_finalize;                      /* finalize                           */
 utf *utf_run;                           /* run                                */
 
-utf *utf_add;                           /* add                                */
-utf *utf_remove;                        /* remove                             */
-utf *utf_put;                           /* put                                */
-utf *utf_get;                           /* get                                */
-utf *utf_value;                         /* value                              */
+utf *utf_add;
+utf *utf_remove;
+utf *utf_removeThread;
+utf *utf_put;
+utf *utf_get;
+utf *utf_value;
 
 utf *utf_fillInStackTrace;
 utf *utf_getSystemClassLoader;
@@ -164,10 +166,11 @@ utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 utf *utf_java_lang_Object__java_lang_Object;
 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 utf *utf_java_lang_String__java_lang_Class;
+utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 
 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
-
+utf *utf_null;                          /* null                               */
 utf *array_packagename;
 
 
@@ -187,7 +190,7 @@ bool utf8_init(void)
 
 #if defined(ENABLE_STATISTICS)
 	if (opt_stat)
-		count_utf_len += sizeof(utf*) * hashtable_utf.size;
+		count_utf_len += sizeof(utf*) * hashtable_utf->size;
 #endif
 
 	/* create utf-symbols for pointer comparison of frequently used strings */
@@ -207,12 +210,15 @@ bool utf8_init(void)
 	utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 	utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 
-	utf_java_lang_NoClassDefFoundError =
-		utf_new_char(string_java_lang_NoClassDefFoundError);
+	utf_java_lang_AbstractMethodError =
+		utf_new_char(string_java_lang_AbstractMethodError);
 
 	utf_java_lang_LinkageError =
 		utf_new_char(string_java_lang_LinkageError);
 
+	utf_java_lang_NoClassDefFoundError =
+		utf_new_char(string_java_lang_NoClassDefFoundError);
+
 	utf_java_lang_NoSuchMethodError =
 		utf_new_char(string_java_lang_NoSuchMethodError);
 
@@ -221,6 +227,9 @@ bool utf8_init(void)
 
 	utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 
+	utf_java_lang_ClassCastException =
+		utf_new_char(string_java_lang_ClassCastException);
+
 	utf_java_lang_ClassNotFoundException =
 		utf_new_char(string_java_lang_ClassNotFoundException);
 
@@ -259,6 +268,7 @@ bool utf8_init(void)
 	utf_Exceptions	               = utf_new_char("Exceptions");
 	utf_LineNumberTable            = utf_new_char("LineNumberTable");
 	utf_SourceFile                 = utf_new_char("SourceFile");
+	utf_Signature                  = utf_new_char("Signature");
 
 	utf_init	                   = utf_new_char("<init>");
 	utf_clinit	                   = utf_new_char("<clinit>");
@@ -268,6 +278,7 @@ bool utf8_init(void)
 
 	utf_add                        = utf_new_char("add");
 	utf_remove                     = utf_new_char("remove");
+	utf_removeThread               = utf_new_char("removeThread");
 	utf_put                        = utf_new_char("put");
 	utf_get                        = utf_new_char("get");
 	utf_value                      = utf_new_char("value");
@@ -309,10 +320,11 @@ bool utf8_init(void)
 	utf_java_lang_String__java_lang_Class =
 		utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 
+	utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 	utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 
+	utf_null                       = utf_new_char("null");
 	utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
-
 	array_packagename              = utf_new_char("\t<the array package>");
 
 	/* everything's ok */
@@ -505,9 +517,7 @@ utf *utf_new(const char *text, u2 length)
 	utf *u;                             /* hashtable element                  */
 	u2 i;
 
-#if defined(USE_THREADS)
-	builtin_monitorenter(hashtable_utf->header);
-#endif
+	LOCK_MONITOR_ENTER(hashtable_utf->header);
 
 #if defined(ENABLE_STATISTICS)
 	if (opt_stat)
@@ -535,9 +545,7 @@ utf *utf_new(const char *text, u2 length)
 
 			/* symbol found in hashtable */
 
-#if defined(USE_THREADS)
-			builtin_monitorexit(hashtable_utf->header);
-#endif
+			LOCK_MONITOR_EXIT(hashtable_utf->header);
 
 			return u;
 		}
@@ -608,9 +616,7 @@ utf *utf_new(const char *text, u2 length)
 		hashtable_utf = newhash;
 	}
 
-#if defined(USE_THREADS)
-	builtin_monitorexit(hashtable_utf->header);
-#endif
+	LOCK_MONITOR_EXIT(hashtable_utf->header);
 
 	return u;
 }
@@ -733,6 +739,9 @@ utf *utf_new_char_classname(const char *text)
    Read the next unicode character from the utf string and increment
    the utf-string pointer accordingly.
 
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
 *******************************************************************************/
 
 u2 utf_nextu2(char **utf_ptr)
@@ -803,6 +812,9 @@ u4 utf_bytes(utf *u)
 
    Determine number of UTF-16 u2s in the given UTF-8 buffer
 
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
    to an array of u2s (UTF-16) and want to know how many of them you will get.
    All other uses of this function are probably wrong.
@@ -845,6 +857,9 @@ u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 
    Determine number of UTF-16 u2s in the utf string.
 
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
    CAUTION: Use this function *only* when you want to convert a utf string
    to an array of u2s and want to know how many of them you will get.
    All other uses of this function are probably wrong.
@@ -890,6 +905,309 @@ u4 utf_get_number_of_u2s(utf *u)
 }
 
 
+/* utf8_safe_number_of_u2s *****************************************************
+
+   Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
+   (Each invalid sequence is counted as one U+fffd replacement character.)
+
+   This function is safe even for invalid UTF-8 strings.
+
+   IN:
+      text..........zero-terminated(!) UTF-8 string (may be invalid)
+	                must NOT be NULL
+	  nbytes........strlen(text), must be >= 0. (This is needed to
+	                completely emulate the RI).
+
+   OUT:
+      the number of u2s needed to hold this string in UTF-16 encoding
+	  (two per code point above U+FFFF); no terminating zero is counted.
+
+*******************************************************************************/
+
+s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
+	register const unsigned char *t;        /* read cursor into text          */
+	register s4 byte;                       /* byte read at the loop head     */
+	register s4 len;                        /* u2s counted so far             */
+	register const unsigned char *tlimit;   /* text + nbytes                  */
+	s4 byte1;
+	s4 byte2;
+	s4 byte3;
+	s4 value;                               /* decoded 4-byte code point      */
+	s4 skip;                                /* continuation bytes left to skip */
+
+	assert(text);
+	assert(nbytes >= 0);
+
+	len = 0;
+	t = (const unsigned char *) text;
+	tlimit = t + nbytes;
+
+	/* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
+
+	while (1) {
+		byte = *t++;
+
+		if (byte & 0x80) {
+			/* highest bit set, non-ASCII character */
+
+			if ((byte & 0xe0) == 0xc0) {
+				/* 2-byte: should be 110..... 10...... (no tlimit check here) */
+
+				if ((*t++ & 0xc0) == 0x80)
+					; /* valid 2-byte */
+				else
+					t--; /* invalid: unread the byte, it is examined next */
+			}
+			else if ((byte & 0xf0) == 0xe0) {
+				/* 3-byte: should be 1110.... 10...... 10...... */
+				/*                            ^t                */
+
+				if (t + 2 > tlimit)
+					return len + 1; /* too few bytes left: count one, stop */
+
+				if ((*t++ & 0xc0) == 0x80) {
+					if ((*t++ & 0xc0) == 0x80)
+						; /* valid 3-byte */
+					else
+						t--; /* invalid: unread the byte */
+				}
+				else
+					t--; /* invalid: unread the byte */
+			}
+			else if ((byte & 0xf8) == 0xf0) {
+				/* 4-byte: should be 11110... 10...... 10...... 10...... */
+				/*                            ^t                         */
+
+				if (t + 3 > tlimit)
+					return len + 1; /* too few bytes left: count one, stop */
+
+				if (((byte1 = *t++) & 0xc0) == 0x80) {
+					if (((byte2 = *t++) & 0xc0) == 0x80) {
+						if (((byte3 = *t++) & 0xc0) == 0x80) {
+							/* well-formed 4-byte UTF-8; check the value range */
+							value = ((byte  & 0x07) << 18)
+								  | ((byte1 & 0x3f) << 12)
+								  | ((byte2 & 0x3f) <<  6)
+								  | ((byte3 & 0x3f)      );
+
+							if (value > 0x10FFFF)
+								; /* invalid: beyond U+10FFFF, one u2 */
+							else if (value > 0xFFFF)
+								len += 1; /* surrogate pair: one extra u2 */
+							else
+								; /* 16 bits suffice */
+						}
+						else
+							t--; /* invalid: unread the byte */
+					}
+					else
+						t--; /* invalid: unread the byte */
+				}
+				else
+					t--; /* invalid: unread the byte */
+			}
+			else if ((byte & 0xfc) == 0xf8) {
+				/* invalid 5-byte: skip up to 4 continuation bytes */
+				if (t + 4 > tlimit)
+					return len + 1; /* too few bytes left: count one, stop */
+
+				skip = 4;
+				for (; skip && ((*t & 0xc0) == 0x80); --skip)
+					t++;
+			}
+			else if ((byte & 0xfe) == 0xfc) {
+				/* invalid 6-byte: skip up to 5 continuation bytes */
+				if (t + 5 > tlimit)
+					return len + 1; /* too few bytes left: count one, stop */
+
+				skip = 5;
+				for (; skip && ((*t & 0xc0) == 0x80); --skip)
+					t++;
+			}
+			else
+				; /* invalid lead byte (10...... or 1111111.) */
+		}
+		else {
+			/* NUL terminates the string */
+
+			if (byte == 0)
+				break;
+
+			/* ASCII character, common case */
+		}
+
+		len++; /* one u2 for this character or invalid sequence */
+	}
+
+	return len;
+}
+
+
+/* utf8_safe_convert_to_u2s ****************************************************
+
+   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
+   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
+   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
+
+   This function is safe even for invalid UTF-8 strings.
+
+   IN:
+      text..........zero-terminated(!) UTF-8 string (may be invalid)
+	                must NOT be NULL
+	  nbytes........strlen(text), must be >= 0. (This is needed to
+	                completely emulate the RI).
+	  buffer........a preallocated array of u2s to receive the decoded
+	                string; no terminating zero is written. Use
+	                utf8_safe_number_of_u2s to size the allocation.
+
+*******************************************************************************/
+
+#define UNICODE_REPLACEMENT  0xfffd     /* written for each invalid sequence  */
+
+void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
+	register const unsigned char *t;        /* read cursor into text          */
+	register s4 byte;                       /* byte read at the loop head     */
+	register const unsigned char *tlimit;   /* text + nbytes                  */
+	s4 byte1;
+	s4 byte2;
+	s4 byte3;
+	s4 value;                               /* decoded 4-byte code point      */
+	s4 skip;                                /* continuation bytes left to skip */
+
+	assert(text);
+	assert(nbytes >= 0);
+
+	t = (const unsigned char *) text;
+	tlimit = t + nbytes;
+
+	/* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
+
+	while (1) {
+		byte = *t++;
+
+		if (byte & 0x80) {
+			/* highest bit set, non-ASCII character */
+
+			if ((byte & 0xe0) == 0xc0) {
+				/* 2-byte: should be 110..... 10...... (no tlimit check here) */
+
+				if (((byte1 = *t++) & 0xc0) == 0x80) {
+					/* valid 2-byte UTF-8 */
+					*buffer++ = ((byte  & 0x1f) << 6)
+							  | ((byte1 & 0x3f)     );
+				}
+				else {
+					*buffer++ = UNICODE_REPLACEMENT;
+					t--; /* unread the byte, it is examined next */
+				}
+			}
+			else if ((byte & 0xf0) == 0xe0) {
+				/* 3-byte: should be 1110.... 10...... 10...... */
+
+				if (t + 2 > tlimit) {
+					/* too few bytes left: emit one replacement and stop */
+					*buffer++ = UNICODE_REPLACEMENT;
+					return;
+				}
+
+				if (((byte1 = *t++) & 0xc0) == 0x80) {
+					if (((byte2 = *t++) & 0xc0) == 0x80) {
+						/* valid 3-byte UTF-8 */
+						*buffer++ = ((byte  & 0x0f) << 12)
+								  | ((byte1 & 0x3f) <<  6)
+								  | ((byte2 & 0x3f)      );
+					}
+					else {
+						*buffer++ = UNICODE_REPLACEMENT;
+						t--; /* unread the byte */
+					}
+				}
+				else {
+					*buffer++ = UNICODE_REPLACEMENT;
+					t--; /* unread the byte */
+				}
+			}
+			else if ((byte & 0xf8) == 0xf0) {
+				/* 4-byte: should be 11110... 10...... 10...... 10...... */
+
+				if (t + 3 > tlimit) {
+					/* too few bytes left: emit one replacement and stop */
+					*buffer++ = UNICODE_REPLACEMENT;
+					return;
+				}
+
+				if (((byte1 = *t++) & 0xc0) == 0x80) {
+					if (((byte2 = *t++) & 0xc0) == 0x80) {
+						if (((byte3 = *t++) & 0xc0) == 0x80) {
+							/* well-formed 4-byte UTF-8; check the value range */
+							value = ((byte  & 0x07) << 18)
+								  | ((byte1 & 0x3f) << 12)
+								  | ((byte2 & 0x3f) <<  6)
+								  | ((byte3 & 0x3f)      );
+
+							if (value > 0x10FFFF) {
+								*buffer++ = UNICODE_REPLACEMENT;
+							}
+							else if (value > 0xFFFF) {
+								/* surrogates; 0x40 == 0x10000 >> 10 */
+								*buffer++ = 0xd800 | ((value >> 10) - 0x40);
+								*buffer++ = 0xdc00 | (value & 0x03ff);
+							}
+							else
+								*buffer++ = value; /* 16 bits suffice */
+						}
+						else {
+							*buffer++ = UNICODE_REPLACEMENT;
+							t--; /* unread the byte */
+						}
+					}
+					else {
+						*buffer++ = UNICODE_REPLACEMENT;
+						t--; /* unread the byte */
+					}
+				}
+				else {
+					*buffer++ = UNICODE_REPLACEMENT;
+					t--; /* unread the byte */
+				}
+			}
+			else if ((byte & 0xfc) == 0xf8) { /* invalid 5-byte */
+				if (t + 4 > tlimit) {
+					*buffer++ = UNICODE_REPLACEMENT;
+					return;
+				}
+
+				skip = 4; /* skip up to 4 continuation bytes */
+				for (; skip && ((*t & 0xc0) == 0x80); --skip)
+					t++;
+				*buffer++ = UNICODE_REPLACEMENT;
+			}
+			else if ((byte & 0xfe) == 0xfc) { /* invalid 6-byte */
+				if (t + 5 > tlimit) {
+					*buffer++ = UNICODE_REPLACEMENT;
+					return;
+				}
+
+				skip = 5; /* skip up to 5 continuation bytes */
+				for (; skip && ((*t & 0xc0) == 0x80); --skip)
+					t++;
+				*buffer++ = UNICODE_REPLACEMENT;
+			}
+			else
+				*buffer++ = UNICODE_REPLACEMENT; /* invalid lead byte */
+		}
+		else {
+			/* NUL terminates the string; nothing is written for it */
+
+			if (byte == 0)
+				break;
+
+			/* ASCII character, common case */
+
+			*buffer++ = byte;
+		}
+	}
+}
+
+
 /* u2_utflength ****************************************************************
 
    Returns the utf length in bytes of a u2 array.