X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=src%2Fvm%2Futf8.c;h=4c65b583598801b950d8227a8a87118ba1998a78;hb=4f7f3896b97b7c228687fc2f4f04fcca7cf8f67f;hp=cd16f8b3f7ccfb11384b667b8883d3693d433c38;hpb=cbbf96f102bf6b0ea93fcd4f099487858b9c96d0;p=cacao.git diff --git a/src/vm/utf8.c b/src/vm/utf8.c index cd16f8b3f..4c65b5835 100644 --- a/src/vm/utf8.c +++ b/src/vm/utf8.c @@ -31,7 +31,7 @@ Christian Thalinger Edwin Steiner - $Id: utf8.c 4900 2006-05-11 09:18:28Z twisti $ + $Id: utf8.c 5920 2006-11-05 21:23:09Z twisti $ */ @@ -45,12 +45,10 @@ #include "mm/memory.h" -#if defined(USE_THREADS) -# if defined(NATIVE_THREADS) -# include "threads/native/threads.h" -# else -# include "threads/green/threads.h" -# endif +#if defined(ENABLE_THREADS) +# include "threads/native/lock.h" +#else +# include "threads/none/lock.h" #endif #include "vm/builtin.h" @@ -87,12 +85,14 @@ utf *utf_java_io_Serializable; utf *utf_java_lang_Throwable; utf *utf_java_lang_VMThrowable; utf *utf_java_lang_Error; -utf *utf_java_lang_NoClassDefFoundError; +utf *utf_java_lang_AbstractMethodError; utf *utf_java_lang_LinkageError; +utf *utf_java_lang_NoClassDefFoundError; utf *utf_java_lang_NoSuchMethodError; utf *utf_java_lang_OutOfMemoryError; utf *utf_java_lang_Exception; +utf *utf_java_lang_ClassCastException; utf *utf_java_lang_ClassNotFoundException; utf *utf_java_lang_IllegalArgumentException; utf *utf_java_lang_IllegalMonitorStateException; @@ -121,6 +121,7 @@ utf *utf_Code; /* Code */ utf *utf_Exceptions; /* Exceptions */ utf *utf_LineNumberTable; /* LineNumberTable */ utf *utf_SourceFile; /* SourceFile */ +utf *utf_Signature; utf *utf_init; /* */ utf *utf_clinit; /* */ @@ -128,11 +129,12 @@ utf *utf_clone; /* clone */ utf *utf_finalize; /* finalize */ utf *utf_run; /* run */ -utf *utf_add; /* add */ -utf *utf_remove; /* remove */ -utf *utf_put; /* put */ -utf *utf_get; /* get */ -utf *utf_value; /* value */ +utf *utf_add; +utf *utf_remove; +utf *utf_removeThread; +utf *utf_put; +utf *utf_get; +utf *utf_value; utf *utf_fillInStackTrace; utf *utf_getSystemClassLoader; @@ -164,10 +166,11 @@ utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */ utf *utf_java_lang_Object__java_lang_Object; utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */ utf *utf_java_lang_String__java_lang_Class; +utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */ utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */ utf *utf_not_named_yet; /* special name for unnamed classes */ - +utf *utf_null; utf *array_packagename; @@ -187,7 +190,7 @@ bool utf8_init(void) #if defined(ENABLE_STATISTICS) if (opt_stat) - count_utf_len += sizeof(utf*) * hashtable_utf.size; + count_utf_len += sizeof(utf*) * hashtable_utf->size; #endif /* create utf-symbols for pointer comparison of frequently used strings */ @@ -207,12 +210,15 @@ bool utf8_init(void) utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable); utf_java_lang_Error = utf_new_char(string_java_lang_Error); - utf_java_lang_NoClassDefFoundError = - utf_new_char(string_java_lang_NoClassDefFoundError); + utf_java_lang_AbstractMethodError = + utf_new_char(string_java_lang_AbstractMethodError); utf_java_lang_LinkageError = utf_new_char(string_java_lang_LinkageError); + utf_java_lang_NoClassDefFoundError = + utf_new_char(string_java_lang_NoClassDefFoundError); + utf_java_lang_NoSuchMethodError = utf_new_char(string_java_lang_NoSuchMethodError); @@ -221,6 +227,9 @@ bool utf8_init(void) utf_java_lang_Exception = utf_new_char(string_java_lang_Exception); + utf_java_lang_ClassCastException = + utf_new_char(string_java_lang_ClassCastException); + utf_java_lang_ClassNotFoundException = utf_new_char(string_java_lang_ClassNotFoundException); @@ -259,6 +268,7 @@ bool utf8_init(void) utf_Exceptions = utf_new_char("Exceptions"); utf_LineNumberTable = utf_new_char("LineNumberTable"); utf_SourceFile = utf_new_char("SourceFile"); + utf_Signature = utf_new_char("Signature"); utf_init = utf_new_char(""); utf_clinit = utf_new_char(""); @@ -268,6 +278,7 @@ bool utf8_init(void) utf_add = utf_new_char("add"); utf_remove = utf_new_char("remove"); + utf_removeThread = utf_new_char("removeThread"); utf_put = utf_new_char("put"); utf_get = utf_new_char("get"); utf_value = utf_new_char("value"); @@ -309,10 +320,11 @@ bool utf8_init(void) utf_java_lang_String__java_lang_Class = utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;"); + utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V"); utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V"); + utf_null = utf_new_char("null"); utf_not_named_yet = utf_new_char("\t"); - array_packagename = utf_new_char("\t"); /* everything's ok */ @@ -505,9 +517,7 @@ utf *utf_new(const char *text, u2 length) utf *u; /* hashtable element */ u2 i; -#if defined(USE_THREADS) - builtin_monitorenter(hashtable_utf->header); -#endif + LOCK_MONITOR_ENTER(hashtable_utf->header); #if defined(ENABLE_STATISTICS) if (opt_stat) @@ -535,9 +545,7 @@ utf *utf_new(const char *text, u2 length) /* symbol found in hashtable */ -#if defined(USE_THREADS) - builtin_monitorexit(hashtable_utf->header); -#endif + LOCK_MONITOR_EXIT(hashtable_utf->header); return u; } @@ -608,9 +616,7 @@ utf *utf_new(const char *text, u2 length) hashtable_utf = newhash; } -#if defined(USE_THREADS) - builtin_monitorexit(hashtable_utf->header); -#endif + LOCK_MONITOR_EXIT(hashtable_utf->header); return u; } @@ -733,6 +739,9 @@ utf *utf_new_char_classname(const char *text) Read the next unicode character from the utf string and increment the utf-string pointer accordingly. + CAUTION: This function is unsafe for input that was not checked + by is_valid_utf! + *******************************************************************************/ u2 utf_nextu2(char **utf_ptr) @@ -803,6 +812,9 @@ u4 utf_bytes(utf *u) Determine number of UTF-16 u2s in the given UTF-8 buffer + CAUTION: This function is unsafe for input that was not checked + by is_valid_utf! + CAUTION: Use this function *only* when you want to convert an UTF-8 buffer to an array of u2s (UTF-16) and want to know how many of them you will get. All other uses of this function are probably wrong. @@ -845,6 +857,9 @@ u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength) Determine number of UTF-16 u2s in the utf string. + CAUTION: This function is unsafe for input that was not checked + by is_valid_utf! + CAUTION: Use this function *only* when you want to convert a utf string to an array of u2s and want to know how many of them you will get. All other uses of this function are probably wrong. @@ -890,6 +905,309 @@ u4 utf_get_number_of_u2s(utf *u) } +/* utf8_safe_number_of_u2s ***************************************************** + + Determine number of UTF-16 u2s needed for decoding the given UTF-8 string. + (For invalid UTF-8 the U+fffd replacement character will be counted.) + + This function is safe even for invalid UTF-8 strings. + + IN: + text..........zero-terminated(!) UTF-8 string (may be invalid) + must NOT be NULL + nbytes........strlen(text). (This is needed to completely emulate + the RI). + + OUT: + the number of u2s needed to hold this string in UTF-16 encoding. + There is _no_ terminating zero included in this count. + +*******************************************************************************/ + +s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) { + register const unsigned char *t; + register s4 byte; + register s4 len; + register const unsigned char *tlimit; + s4 byte1; + s4 byte2; + s4 byte3; + s4 value; + s4 skip; + + assert(text); + assert(nbytes >= 0); + + len = 0; + t = (const unsigned char *) text; + tlimit = t + nbytes; + + /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */ + + while (1) { + byte = *t++; + + if (byte & 0x80) { + /* highest bit set, non-ASCII character */ + + if ((byte & 0xe0) == 0xc0) { + /* 2-byte: should be 110..... 10...... ? */ + + if ((*t++ & 0xc0) == 0x80) + ; /* valid 2-byte */ + else + t--; /* invalid */ + } + else if ((byte & 0xf0) == 0xe0) { + /* 3-byte: should be 1110.... 10...... 10...... */ + /* ^t */ + + if (t + 2 > tlimit) + return len + 1; /* invalid, stop here */ + + if ((*t++ & 0xc0) == 0x80) { + if ((*t++ & 0xc0) == 0x80) + ; /* valid 3-byte */ + else + t--; /* invalid */ + } + else + t--; /* invalid */ + } + else if ((byte & 0xf8) == 0xf0) { + /* 4-byte: should be 11110... 10...... 10...... 10...... */ + /* ^t */ + + if (t + 3 > tlimit) + return len + 1; /* invalid, stop here */ + + if (((byte1 = *t++) & 0xc0) == 0x80) { + if (((byte2 = *t++) & 0xc0) == 0x80) { + if (((byte3 = *t++) & 0xc0) == 0x80) { + /* valid 4-byte UTF-8? */ + value = ((byte & 0x07) << 18) + | ((byte1 & 0x3f) << 12) + | ((byte2 & 0x3f) << 6) + | ((byte3 & 0x3f) ); + + if (value > 0x10FFFF) + ; /* invalid */ + else if (value > 0xFFFF) + len += 1; /* we need surrogates */ + else + ; /* 16bit suffice */ + } + else + t--; /* invalid */ + } + else + t--; /* invalid */ + } + else + t--; /* invalid */ + } + else if ((byte & 0xfc) == 0xf8) { + /* invalid 5-byte */ + if (t + 4 > tlimit) + return len + 1; /* invalid, stop here */ + + skip = 4; + for (; skip && ((*t & 0xc0) == 0x80); --skip) + t++; + } + else if ((byte & 0xfe) == 0xfc) { + /* invalid 6-byte */ + if (t + 5 > tlimit) + return len + 1; /* invalid, stop here */ + + skip = 5; + for (; skip && ((*t & 0xc0) == 0x80); --skip) + t++; + } + else + ; /* invalid */ + } + else { + /* NUL */ + + if (byte == 0) + break; + + /* ASCII character, common case */ + } + + len++; + } + + return len; +} + + +/* utf8_safe_convert_to_u2s **************************************************** + + Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer. + (Invalid UTF-8 will be replaced with the U+fffd replacement character.) + Use utf8_safe_number_of_u2s to determine the number of u2s to allocate. + + This function is safe even for invalid UTF-8 strings. + + IN: + text..........zero-terminated(!) UTF-8 string (may be invalid) + must NOT be NULL + nbytes........strlen(text). (This is needed to completely emulate + the RI). + buffer........a preallocated array of u2s to receive the decoded + string. Use utf8_safe_number_of_u2s to get the + required number of u2s for allocating this. + +*******************************************************************************/ + +#define UNICODE_REPLACEMENT 0xfffd + +void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) { + register const unsigned char *t; + register s4 byte; + register const unsigned char *tlimit; + s4 byte1; + s4 byte2; + s4 byte3; + s4 value; + s4 skip; + + assert(text); + assert(nbytes >= 0); + + t = (const unsigned char *) text; + tlimit = t + nbytes; + + /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */ + + while (1) { + byte = *t++; + + if (byte & 0x80) { + /* highest bit set, non-ASCII character */ + + if ((byte & 0xe0) == 0xc0) { + /* 2-byte: should be 110..... 10...... */ + + if (((byte1 = *t++) & 0xc0) == 0x80) { + /* valid 2-byte UTF-8 */ + *buffer++ = ((byte & 0x1f) << 6) + | ((byte1 & 0x3f) ); + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else if ((byte & 0xf0) == 0xe0) { + /* 3-byte: should be 1110.... 10...... 10...... */ + + if (t + 2 > tlimit) { + *buffer++ = UNICODE_REPLACEMENT; + return; + } + + if (((byte1 = *t++) & 0xc0) == 0x80) { + if (((byte2 = *t++) & 0xc0) == 0x80) { + /* valid 3-byte UTF-8 */ + *buffer++ = ((byte & 0x0f) << 12) + | ((byte1 & 0x3f) << 6) + | ((byte2 & 0x3f) ); + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else if ((byte & 0xf8) == 0xf0) { + /* 4-byte: should be 11110... 10...... 10...... 10...... */ + + if (t + 3 > tlimit) { + *buffer++ = UNICODE_REPLACEMENT; + return; + } + + if (((byte1 = *t++) & 0xc0) == 0x80) { + if (((byte2 = *t++) & 0xc0) == 0x80) { + if (((byte3 = *t++) & 0xc0) == 0x80) { + /* valid 4-byte UTF-8? */ + value = ((byte & 0x07) << 18) + | ((byte1 & 0x3f) << 12) + | ((byte2 & 0x3f) << 6) + | ((byte3 & 0x3f) ); + + if (value > 0x10FFFF) { + *buffer++ = UNICODE_REPLACEMENT; + } + else if (value > 0xFFFF) { + /* we need surrogates */ + *buffer++ = 0xd800 | ((value >> 10) - 0x40); + *buffer++ = 0xdc00 | (value & 0x03ff); + } + else + *buffer++ = value; /* 16bit suffice */ + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else if ((byte & 0xfc) == 0xf8) { + if (t + 4 > tlimit) { + *buffer++ = UNICODE_REPLACEMENT; + return; + } + + skip = 4; + for (; skip && ((*t & 0xc0) == 0x80); --skip) + t++; + *buffer++ = UNICODE_REPLACEMENT; + } + else if ((byte & 0xfe) == 0xfc) { + if (t + 5 > tlimit) { + *buffer++ = UNICODE_REPLACEMENT; + return; + } + + skip = 5; + for (; skip && ((*t & 0xc0) == 0x80); --skip) + t++; + *buffer++ = UNICODE_REPLACEMENT; + } + else + *buffer++ = UNICODE_REPLACEMENT; + } + else { + /* NUL */ + + if (byte == 0) + break; + + /* ASCII character, common case */ + + *buffer++ = byte; + } + } +} + + /* u2_utflength **************************************************************** Returns the utf length in bytes of a u2 array.