X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=src%2Fvm%2Futf8.c;h=4c65b583598801b950d8227a8a87118ba1998a78;hb=4f7f3896b97b7c228687fc2f4f04fcca7cf8f67f;hp=f81fc82004c20949f1a3666536b3cf6e6cf9f810;hpb=826811c3fba8129283e5206651da574c4cb10b8c;p=cacao.git diff --git a/src/vm/utf8.c b/src/vm/utf8.c index f81fc8200..4c65b5835 100644 --- a/src/vm/utf8.c +++ b/src/vm/utf8.c @@ -1,9 +1,9 @@ -/* src/vm/utf.c - utf functions +/* src/vm/utf8.c - utf8 string functions - Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates, - R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner, - C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger, - Institut f. Computersprachen - TU Wien + Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel, + C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring, + E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, + J. Wenninger, Institut f. Computersprachen - TU Wien This file is part of CACAO. @@ -19,45 +19,59 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - 02111-1307, USA. + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. - Contact: cacao@complang.tuwien.ac.at + Contact: cacao@cacaojvm.org Authors: Reinhard Grafl Changes: Mark Probst Andreas Krall Christian Thalinger + Edwin Steiner - $Id: utf8.c 3683 2005-11-16 13:27:46Z twisti $ + $Id: utf8.c 5920 2006-11-05 21:23:09Z twisti $ */ +#include "config.h" + #include +#include + +#include "vm/types.h" #include "mm/memory.h" + +#if defined(ENABLE_THREADS) +# include "threads/native/lock.h" +#else +# include "threads/none/lock.h" +#endif + +#include "vm/builtin.h" #include "vm/exceptions.h" +#include "vm/hashtable.h" #include "vm/options.h" #include "vm/statistics.h" #include "vm/stringlocal.h" -#include "vm/tables.h" #include "vm/utf8.h" /* global variables ***********************************************************/ -#if defined(USE_THREADS) -static java_objectheader *lock_utf_hashtable; -#endif +/* hashsize must be power of 2 */ -hashtable utf_hash; /* hashtable for utf8-symbols */ +#define HASHTABLE_UTF_SIZE 16384 /* initial size of utf-hash */ + +hashtable *hashtable_utf; /* hashtable for utf8-symbols */ /* utf-symbols for pointer comparison of frequently used strings **************/ -utf *utf_java_lang_Object; /* java/lang/Object */ +utf *utf_java_lang_Object; utf *utf_java_lang_Class; utf *utf_java_lang_ClassLoader; @@ -71,11 +85,14 @@ utf *utf_java_io_Serializable; utf *utf_java_lang_Throwable; utf *utf_java_lang_VMThrowable; utf *utf_java_lang_Error; +utf *utf_java_lang_AbstractMethodError; +utf *utf_java_lang_LinkageError; utf *utf_java_lang_NoClassDefFoundError; utf *utf_java_lang_NoSuchMethodError; utf *utf_java_lang_OutOfMemoryError; utf *utf_java_lang_Exception; +utf *utf_java_lang_ClassCastException; utf *utf_java_lang_ClassNotFoundException; utf *utf_java_lang_IllegalArgumentException; utf *utf_java_lang_IllegalMonitorStateException; @@ -104,6 +121,7 @@ utf *utf_Code; /* Code */ utf *utf_Exceptions; /* Exceptions */ utf *utf_LineNumberTable; /* LineNumberTable */ utf *utf_SourceFile; /* SourceFile */ +utf *utf_Signature; utf *utf_init; /* */ utf *utf_clinit; /* */ @@ -111,11 +129,12 @@ utf *utf_clone; /* clone */ utf *utf_finalize; /* finalize */ utf *utf_run; /* run */ -utf *utf_add; /* add */ -utf *utf_remove; /* remove */ -utf *utf_put; /* put */ -utf *utf_get; /* get */ -utf *utf_value; /* value */ +utf *utf_add; +utf *utf_remove; +utf *utf_removeThread; +utf *utf_put; +utf *utf_get; +utf *utf_value; utf *utf_fillInStackTrace; utf *utf_getSystemClassLoader; @@ -147,10 +166,11 @@ utf *utf_void__java_lang_Throwable; /* ()Ljava/lang/Throwable; */ utf *utf_java_lang_Object__java_lang_Object; utf *utf_java_lang_String__void; /* (Ljava/lang/String;)V */ utf *utf_java_lang_String__java_lang_Class; +utf *utf_java_lang_Thread__V; /* (Ljava/lang/Thread;)V */ utf *utf_java_lang_Throwable__void; /* (Ljava/lang/Throwable;)V */ utf *utf_not_named_yet; /* special name for unnamed classes */ - +utf *utf_null; utf *array_packagename; @@ -162,14 +182,15 @@ utf *array_packagename; bool utf8_init(void) { -#if defined(USE_THREADS) - /* create utf hashtable lock object */ + /* create utf8 hashtable */ + + hashtable_utf = NEW(hashtable); - lock_utf_hashtable = NEW(java_objectheader); + hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE); -# if defined(NATIVE_THREADS) - initObjectLock(lock_utf_hashtable); -# endif +#if defined(ENABLE_STATISTICS) + if (opt_stat) + count_utf_len += sizeof(utf*) * hashtable_utf->size; #endif /* create utf-symbols for pointer comparison of frequently used strings */ @@ -189,6 +210,12 @@ bool utf8_init(void) utf_java_lang_VMThrowable = utf_new_char(string_java_lang_VMThrowable); utf_java_lang_Error = utf_new_char(string_java_lang_Error); + utf_java_lang_AbstractMethodError = + utf_new_char(string_java_lang_AbstractMethodError); + + utf_java_lang_LinkageError = + utf_new_char(string_java_lang_LinkageError); + utf_java_lang_NoClassDefFoundError = utf_new_char(string_java_lang_NoClassDefFoundError); @@ -200,6 +227,9 @@ bool utf8_init(void) utf_java_lang_Exception = utf_new_char(string_java_lang_Exception); + utf_java_lang_ClassCastException = + utf_new_char(string_java_lang_ClassCastException); + utf_java_lang_ClassNotFoundException = utf_new_char(string_java_lang_ClassNotFoundException); @@ -238,6 +268,7 @@ bool utf8_init(void) utf_Exceptions = utf_new_char("Exceptions"); utf_LineNumberTable = utf_new_char("LineNumberTable"); utf_SourceFile = utf_new_char("SourceFile"); + utf_Signature = utf_new_char("Signature"); utf_init = utf_new_char(""); utf_clinit = utf_new_char(""); @@ -247,6 +278,7 @@ bool utf8_init(void) utf_add = utf_new_char("add"); utf_remove = utf_new_char("remove"); + utf_removeThread = utf_new_char("removeThread"); utf_put = utf_new_char("put"); utf_get = utf_new_char("get"); utf_value = utf_new_char("value"); @@ -288,10 +320,11 @@ bool utf8_init(void) utf_java_lang_String__java_lang_Class = utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;"); + utf_java_lang_Thread__V = utf_new_char("(Ljava/lang/Thread;)V"); utf_java_lang_Throwable__void = utf_new_char("(Ljava/lang/Throwable;)V"); + utf_null = utf_new_char("null"); utf_not_named_yet = utf_new_char("\t"); - array_packagename = utf_new_char("\t"); /* everything's ok */ @@ -423,8 +456,36 @@ u4 utf_hashkey(const char *text, u4 length) } } +/* utf_full_hashkey ************************************************************ -/* utf_hashkey ***************************************************************** + This function computes a hash value using all bytes in the string. + + The algorithm is the "One-at-a-time" algorithm as published + by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html. + +*******************************************************************************/ + +u4 utf_full_hashkey(const char *text, u4 length) +{ + register const unsigned char *p = (const unsigned char *) text; + register u4 hash; + register u4 i; + + hash = 0; + for (i=length; i--;) + { + hash += *p++; + hash += (hash << 10); + hash ^= (hash >> 6); + } + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash; +} + +/* unicode_hashkey ************************************************************* Compute the hashkey of a unicode string. @@ -456,25 +517,16 @@ utf *utf_new(const char *text, u2 length) utf *u; /* hashtable element */ u2 i; -#if defined(USE_THREADS) - builtin_monitorenter(lock_utf_hashtable); -#endif - - /* XXX REMOVE ME! after testing of course ;-) */ - #include - static int running = 0; - assert(running == 0); - running = 1; - /* XXX REMOVE ME! */ + LOCK_MONITOR_ENTER(hashtable_utf->header); -#ifdef STATISTICS +#if defined(ENABLE_STATISTICS) if (opt_stat) count_utf_new++; #endif key = utf_hashkey(text, length); - slot = key & (utf_hash.size - 1); - u = utf_hash.ptr[slot]; + slot = key & (hashtable_utf->size - 1); + u = hashtable_utf->ptr[slot]; /* search external hash chain for utf-symbol */ @@ -486,20 +538,14 @@ utf *utf_new(const char *text, u2 length) if (text[i] != u->text[i]) goto nomatch; -#if defined(STATISTICS) +#if defined(ENABLE_STATISTICS) if (opt_stat) count_utf_new_found++; #endif /* symbol found in hashtable */ - /* XXX REMOVE ME! */ - running = 0; - /* XXX REMOVE ME! */ - -#if defined(USE_THREADS) - builtin_monitorexit(lock_utf_hashtable); -#endif + LOCK_MONITOR_EXIT(hashtable_utf->header); return u; } @@ -508,7 +554,7 @@ utf *utf_new(const char *text, u2 length) u = u->hashlink; /* next element in external chain */ } -#if defined(STATISTICS) +#if defined(ENABLE_STATISTICS) if (opt_stat) count_utf_len += sizeof(utf) + length + 1; #endif @@ -516,59 +562,61 @@ utf *utf_new(const char *text, u2 length) /* location in hashtable found, create new utf element */ u = NEW(utf); u->blength = length; /* length in bytes of utfstring */ - u->hashlink = utf_hash.ptr[slot]; /* link in external hashchain */ + u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain */ u->text = mem_alloc(length + 1);/* allocate memory for utf-text */ + memcpy(u->text, text, length); /* copy utf-text */ u->text[length] = '\0'; - utf_hash.ptr[slot] = u; /* insert symbol into table */ - utf_hash.entries++; /* update number of entries */ + hashtable_utf->ptr[slot] = u; /* insert symbol into table */ + hashtable_utf->entries++; /* update number of entries */ - if (utf_hash.entries > (utf_hash.size * 2)) { + if (hashtable_utf->entries > (hashtable_utf->size * 2)) { - /* reorganization of hashtable, average length of - the external chains is approx. 2 */ + /* reorganization of hashtable, average length of the external + chains is approx. 2 */ - u4 i; - utf *u; - hashtable newhash; /* the new hashtable */ + hashtable *newhash; /* the new hashtable */ + u4 i; + utf *u; + utf *nextu; + u4 slot; /* create new hashtable, double the size */ - init_hashtable(&newhash, utf_hash.size * 2); - newhash.entries = utf_hash.entries; -#ifdef STATISTICS + newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2); + +#if defined(ENABLE_STATISTICS) if (opt_stat) - count_utf_len += sizeof(utf*) * utf_hash.size; + count_utf_len += sizeof(utf*) * hashtable_utf->size; #endif /* transfer elements to new hashtable */ - for (i = 0; i < utf_hash.size; i++) { - u = (utf *) utf_hash.ptr[i]; + + for (i = 0; i < hashtable_utf->size; i++) { + u = hashtable_utf->ptr[i]; + while (u) { - utf *nextu = u->hashlink; - u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1); + nextu = u->hashlink; + slot = utf_hashkey(u->text, u->blength) & (newhash->size - 1); - u->hashlink = (utf *) newhash.ptr[slot]; - newhash.ptr[slot] = u; + u->hashlink = (utf *) newhash->ptr[slot]; + newhash->ptr[slot] = u; /* follow link in external hash chain */ + u = nextu; } } /* dispose old table */ - MFREE(utf_hash.ptr, void*, utf_hash.size); - utf_hash = newhash; - } - /* XXX REMOVE ME! */ - running = 0; - /* XXX REMOVE ME! */ + hashtable_free(hashtable_utf); -#if defined(USE_THREADS) - builtin_monitorexit(lock_utf_hashtable); -#endif + hashtable_utf = newhash; + } + + LOCK_MONITOR_EXIT(hashtable_utf->header); return u; } @@ -691,6 +739,9 @@ utf *utf_new_char_classname(const char *text) Read the next unicode character from the utf string and increment the utf-string pointer accordingly. + CAUTION: This function is unsafe for input that was not checked + by is_valid_utf! + *******************************************************************************/ u2 utf_nextu2(char **utf_ptr) @@ -739,20 +790,100 @@ u2 utf_nextu2(char **utf_ptr) } -/* utf_strlen ****************************************************************** +/* utf_bytes ******************************************************************* + + Determine number of bytes (aka. octets) in the utf string. - Determine number of unicode characters in the utf string. + IN: + u............utf string + + OUT: + The number of octets of this utf string. + There is _no_ terminating zero included in this count. *******************************************************************************/ -u4 utf_strlen(utf *u) +u4 utf_bytes(utf *u) +{ + return u->blength; +} + +/* utf_get_number_of_u2s_for_buffer ******************************************** + + Determine number of UTF-16 u2s in the given UTF-8 buffer + + CAUTION: This function is unsafe for input that was not checked + by is_valid_utf! + + CAUTION: Use this function *only* when you want to convert an UTF-8 buffer + to an array of u2s (UTF-16) and want to know how many of them you will get. + All other uses of this function are probably wrong. + + IN: + buffer........points to first char in buffer + blength.......number of _bytes_ in the buffer + + OUT: + the number of u2s needed to hold this string in UTF-16 encoding. + There is _no_ terminating zero included in this count. + + NOTE: Unlike utf_get_number_of_u2s, this function never throws an + exception. + +*******************************************************************************/ + +u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength) +{ + const char *endpos; /* points behind utf string */ + const char *utf_ptr; /* current position in utf text */ + u4 len = 0; /* number of unicode characters */ + + utf_ptr = buffer; + endpos = utf_ptr + blength; + + while (utf_ptr < endpos) { + len++; + /* next unicode character */ + utf_nextu2((char **)&utf_ptr); + } + + assert(utf_ptr == endpos); + + return len; +} + + +/* utf_get_number_of_u2s ******************************************************* + + Determine number of UTF-16 u2s in the utf string. + + CAUTION: This function is unsafe for input that was not checked + by is_valid_utf! + + CAUTION: Use this function *only* when you want to convert a utf string + to an array of u2s and want to know how many of them you will get. + All other uses of this function are probably wrong. + + IN: + u............utf string + + OUT: + the number of u2s needed to hold this string in UTF-16 encoding. + There is _no_ terminating zero included in this count. + XXX 0 if a NullPointerException has been thrown (see below) + +*******************************************************************************/ + +u4 utf_get_number_of_u2s(utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ u4 len = 0; /* number of unicode characters */ + /* XXX this is probably not checked by most callers! Review this after */ + /* the invalid uses of this function have been eliminated */ if (!u) { - *exceptionptr = new_nullpointerexception(); + exceptions_throw_nullpointerexception(); return 0; } @@ -774,6 +905,309 @@ u4 utf_strlen(utf *u) } +/* utf8_safe_number_of_u2s ***************************************************** + + Determine number of UTF-16 u2s needed for decoding the given UTF-8 string. + (For invalid UTF-8 the U+fffd replacement character will be counted.) + + This function is safe even for invalid UTF-8 strings. + + IN: + text..........zero-terminated(!) UTF-8 string (may be invalid) + must NOT be NULL + nbytes........strlen(text). (This is needed to completely emulate + the RI). + + OUT: + the number of u2s needed to hold this string in UTF-16 encoding. + There is _no_ terminating zero included in this count. + +*******************************************************************************/ + +s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) { + register const unsigned char *t; + register s4 byte; + register s4 len; + register const unsigned char *tlimit; + s4 byte1; + s4 byte2; + s4 byte3; + s4 value; + s4 skip; + + assert(text); + assert(nbytes >= 0); + + len = 0; + t = (const unsigned char *) text; + tlimit = t + nbytes; + + /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */ + + while (1) { + byte = *t++; + + if (byte & 0x80) { + /* highest bit set, non-ASCII character */ + + if ((byte & 0xe0) == 0xc0) { + /* 2-byte: should be 110..... 10...... ? */ + + if ((*t++ & 0xc0) == 0x80) + ; /* valid 2-byte */ + else + t--; /* invalid */ + } + else if ((byte & 0xf0) == 0xe0) { + /* 3-byte: should be 1110.... 10...... 10...... */ + /* ^t */ + + if (t + 2 > tlimit) + return len + 1; /* invalid, stop here */ + + if ((*t++ & 0xc0) == 0x80) { + if ((*t++ & 0xc0) == 0x80) + ; /* valid 3-byte */ + else + t--; /* invalid */ + } + else + t--; /* invalid */ + } + else if ((byte & 0xf8) == 0xf0) { + /* 4-byte: should be 11110... 10...... 10...... 10...... */ + /* ^t */ + + if (t + 3 > tlimit) + return len + 1; /* invalid, stop here */ + + if (((byte1 = *t++) & 0xc0) == 0x80) { + if (((byte2 = *t++) & 0xc0) == 0x80) { + if (((byte3 = *t++) & 0xc0) == 0x80) { + /* valid 4-byte UTF-8? */ + value = ((byte & 0x07) << 18) + | ((byte1 & 0x3f) << 12) + | ((byte2 & 0x3f) << 6) + | ((byte3 & 0x3f) ); + + if (value > 0x10FFFF) + ; /* invalid */ + else if (value > 0xFFFF) + len += 1; /* we need surrogates */ + else + ; /* 16bit suffice */ + } + else + t--; /* invalid */ + } + else + t--; /* invalid */ + } + else + t--; /* invalid */ + } + else if ((byte & 0xfc) == 0xf8) { + /* invalid 5-byte */ + if (t + 4 > tlimit) + return len + 1; /* invalid, stop here */ + + skip = 4; + for (; skip && ((*t & 0xc0) == 0x80); --skip) + t++; + } + else if ((byte & 0xfe) == 0xfc) { + /* invalid 6-byte */ + if (t + 5 > tlimit) + return len + 1; /* invalid, stop here */ + + skip = 5; + for (; skip && ((*t & 0xc0) == 0x80); --skip) + t++; + } + else + ; /* invalid */ + } + else { + /* NUL */ + + if (byte == 0) + break; + + /* ASCII character, common case */ + } + + len++; + } + + return len; +} + + +/* utf8_safe_convert_to_u2s **************************************************** + + Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer. + (Invalid UTF-8 will be replaced with the U+fffd replacement character.) + Use utf8_safe_number_of_u2s to determine the number of u2s to allocate. + + This function is safe even for invalid UTF-8 strings. + + IN: + text..........zero-terminated(!) UTF-8 string (may be invalid) + must NOT be NULL + nbytes........strlen(text). (This is needed to completely emulate + the RI). + buffer........a preallocated array of u2s to receive the decoded + string. Use utf8_safe_number_of_u2s to get the + required number of u2s for allocating this. + +*******************************************************************************/ + +#define UNICODE_REPLACEMENT 0xfffd + +void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) { + register const unsigned char *t; + register s4 byte; + register const unsigned char *tlimit; + s4 byte1; + s4 byte2; + s4 byte3; + s4 value; + s4 skip; + + assert(text); + assert(nbytes >= 0); + + t = (const unsigned char *) text; + tlimit = t + nbytes; + + /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */ + + while (1) { + byte = *t++; + + if (byte & 0x80) { + /* highest bit set, non-ASCII character */ + + if ((byte & 0xe0) == 0xc0) { + /* 2-byte: should be 110..... 10...... */ + + if (((byte1 = *t++) & 0xc0) == 0x80) { + /* valid 2-byte UTF-8 */ + *buffer++ = ((byte & 0x1f) << 6) + | ((byte1 & 0x3f) ); + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else if ((byte & 0xf0) == 0xe0) { + /* 3-byte: should be 1110.... 10...... 10...... */ + + if (t + 2 > tlimit) { + *buffer++ = UNICODE_REPLACEMENT; + return; + } + + if (((byte1 = *t++) & 0xc0) == 0x80) { + if (((byte2 = *t++) & 0xc0) == 0x80) { + /* valid 3-byte UTF-8 */ + *buffer++ = ((byte & 0x0f) << 12) + | ((byte1 & 0x3f) << 6) + | ((byte2 & 0x3f) ); + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else if ((byte & 0xf8) == 0xf0) { + /* 4-byte: should be 11110... 10...... 10...... 10...... */ + + if (t + 3 > tlimit) { + *buffer++ = UNICODE_REPLACEMENT; + return; + } + + if (((byte1 = *t++) & 0xc0) == 0x80) { + if (((byte2 = *t++) & 0xc0) == 0x80) { + if (((byte3 = *t++) & 0xc0) == 0x80) { + /* valid 4-byte UTF-8? */ + value = ((byte & 0x07) << 18) + | ((byte1 & 0x3f) << 12) + | ((byte2 & 0x3f) << 6) + | ((byte3 & 0x3f) ); + + if (value > 0x10FFFF) { + *buffer++ = UNICODE_REPLACEMENT; + } + else if (value > 0xFFFF) { + /* we need surrogates */ + *buffer++ = 0xd800 | ((value >> 10) - 0x40); + *buffer++ = 0xdc00 | (value & 0x03ff); + } + else + *buffer++ = value; /* 16bit suffice */ + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else { + *buffer++ = UNICODE_REPLACEMENT; + t--; + } + } + else if ((byte & 0xfc) == 0xf8) { + if (t + 4 > tlimit) { + *buffer++ = UNICODE_REPLACEMENT; + return; + } + + skip = 4; + for (; skip && ((*t & 0xc0) == 0x80); --skip) + t++; + *buffer++ = UNICODE_REPLACEMENT; + } + else if ((byte & 0xfe) == 0xfc) { + if (t + 5 > tlimit) { + *buffer++ = UNICODE_REPLACEMENT; + return; + } + + skip = 5; + for (; skip && ((*t & 0xc0) == 0x80); --skip) + t++; + *buffer++ = UNICODE_REPLACEMENT; + } + else + *buffer++ = UNICODE_REPLACEMENT; + } + else { + /* NUL */ + + if (byte == 0) + break; + + /* ASCII character, common case */ + + *buffer++ = byte; + } + } +} + + /* u2_utflength **************************************************************** Returns the utf length in bytes of a u2 array. @@ -803,18 +1237,100 @@ u4 u2_utflength(u2 *text, u4 u2_length) } -/* utf_display ***************************************************************** +/* utf_copy ******************************************************************** + + Copy the given utf string byte-for-byte to a buffer. + + IN: + buffer.......the buffer + u............the utf string + +*******************************************************************************/ + +void utf_copy(char *buffer, utf *u) +{ + /* our utf strings are zero-terminated (done by utf_new) */ + MCOPY(buffer, u->text, char, u->blength + 1); +} + + +/* utf_cat ********************************************************************* + + Append the given utf string byte-for-byte to a buffer. + + IN: + buffer.......the buffer + u............the utf string + +*******************************************************************************/ + +void utf_cat(char *buffer, utf *u) +{ + /* our utf strings are zero-terminated (done by utf_new) */ + MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1); +} + + +/* utf_copy_classname ********************************************************** + + Copy the given utf classname byte-for-byte to a buffer. + '/' is replaced by '.' + + IN: + buffer.......the buffer + u............the utf string + +*******************************************************************************/ + +void utf_copy_classname(char *buffer, utf *u) +{ + char *bufptr; + char *srcptr; + char *endptr; + char ch; + + bufptr = buffer; + srcptr = u->text; + endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */ + + while (srcptr != endptr) { + ch = *srcptr++; + if (ch == '/') + ch = '.'; + *bufptr++ = ch; + } +} + + +/* utf_cat ********************************************************************* + + Append the given utf classname byte-for-byte to a buffer. + '/' is replaced by '.' + + IN: + buffer.......the buffer + u............the utf string + +*******************************************************************************/ + +void utf_cat_classname(char *buffer, utf *u) +{ + utf_copy_classname(buffer + strlen(buffer), u); +} + +/* utf_display_printable_ascii ************************************************* Write utf symbol to stdout (for debugging purposes). + Non-printable and non-ASCII characters are printed as '?'. *******************************************************************************/ -void utf_display(utf *u) +void utf_display_printable_ascii(utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ - if (!u) { + if (u == NULL) { printf("NULL"); fflush(stdout); return; @@ -824,29 +1340,34 @@ void utf_display(utf *u) utf_ptr = u->text; while (utf_ptr < endpos) { - /* read next unicode character */ + /* read next unicode character */ + u2 c = utf_nextu2(&utf_ptr); - if (c >= 32 && c <= 127) printf("%c", c); - else printf("?"); + + if ((c >= 32) && (c <= 127)) + printf("%c", c); + else + printf("?"); } fflush(stdout); } -/* utf_display_classname ******************************************************* +/* utf_display_printable_ascii_classname *************************************** Write utf symbol to stdout with `/' converted to `.' (for debugging purposes). + Non-printable and non-ASCII characters are printed as '?'. *******************************************************************************/ -void utf_display_classname(utf *u) +void utf_display_printable_ascii_classname(utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ - if (!u) { + if (u == NULL) { printf("NULL"); fflush(stdout); return; @@ -856,24 +1377,32 @@ void utf_display_classname(utf *u) utf_ptr = u->text; while (utf_ptr < endpos) { - /* read next unicode character */ + /* read next unicode character */ + u2 c = utf_nextu2(&utf_ptr); - if (c == '/') c = '.'; - if (c >= 32 && c <= 127) printf("%c", c); - else printf("?"); + + if (c == '/') + c = '.'; + + if ((c >= 32) && (c <= 127)) + printf("%c", c); + else + printf("?"); } fflush(stdout); } -/* utf_sprint ****************************************************************** +/* utf_sprint_convert_to_latin1 ************************************************ Write utf symbol into c-string (for debugging purposes). + Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield + invalid results. *******************************************************************************/ -void utf_sprint(char *buffer, utf *u) +void utf_sprint_convert_to_latin1(char *buffer, utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ @@ -896,14 +1425,16 @@ void utf_sprint(char *buffer, utf *u) } -/* utf_sprint_classname ******************************************************** +/* utf_sprint_convert_to_latin1_classname ************************************** Write utf symbol into c-string with `/' converted to `.' (for debugging purposes). + Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield + invalid results. *******************************************************************************/ -void utf_sprint_classname(char *buffer, utf *u) +void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ @@ -929,37 +1460,42 @@ void utf_sprint_classname(char *buffer, utf *u) } -/* utf_strcat ****************************************************************** +/* utf_strcat_convert_to_latin1 ************************************************ Like libc strcat, but uses an utf8 string. + Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield + invalid results. *******************************************************************************/ -void utf_strcat(char *buffer, utf *u) +void utf_strcat_convert_to_latin1(char *buffer, utf *u) { - utf_sprint(buffer + strlen(buffer), u); + utf_sprint_convert_to_latin1(buffer + strlen(buffer), u); } -/* utf_strcat_classname ******************************************************** +/* utf_strcat_convert_to_latin1_classname ************************************** Like libc strcat, but uses an utf8 string. + Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield + invalid results. *******************************************************************************/ -void utf_strcat_classname(char *buffer, utf *u) +void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u) { - utf_sprint_classname(buffer + strlen(buffer), u); + utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u); } -/* utf_fprint ****************************************************************** +/* utf_fprint_printable_ascii ************************************************** Write utf symbol into file. + Non-printable and non-ASCII characters are printed as '?'. *******************************************************************************/ -void utf_fprint(FILE *file, utf *u) +void utf_fprint_printable_ascii(FILE *file, utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ @@ -980,13 +1516,14 @@ void utf_fprint(FILE *file, utf *u) } -/* utf_fprint_classname ******************************************************** +/* utf_fprint_printable_ascii_classname **************************************** Write utf symbol into file with `/' converted to `.'. + Non-printable and non-ASCII characters are printed as '?'. *******************************************************************************/ -void utf_fprint_classname(FILE *file, utf *u) +void utf_fprint_printable_ascii_classname(FILE *file, utf *u) { char *endpos; /* points behind utf string */ char *utf_ptr; /* current position in utf text */ @@ -1117,6 +1654,7 @@ bool is_valid_name_utf(utf *u) *******************************************************************************/ +#if !defined(NDEBUG) void utf_show(void) { @@ -1128,29 +1666,30 @@ void utf_show(void) u4 beyond_limit = 0; /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */ u4 i; - printf ("UTF-HASH:\n"); + printf("UTF-HASH:\n"); /* show element of utf-hashtable */ - for (i=0; isize; i++) { + utf *u = hashtable_utf->ptr[i]; + if (u) { - printf ("SLOT %d: ", (int) i); + printf("SLOT %d: ", (int) i); + while (u) { - printf ("'"); - utf_display (u); - printf ("' "); + printf("'"); + utf_display_printable_ascii(u); + printf("' "); u = u->hashlink; } - printf ("\n"); + printf("\n"); } - } - printf ("UTF-HASH: %d slots for %d entries\n", - (int) utf_hash.size, (int) utf_hash.entries ); - + printf("UTF-HASH: %d slots for %d entries\n", + (int) hashtable_utf->size, (int) hashtable_utf->entries ); - if (utf_hash.entries == 0) + if (hashtable_utf->entries == 0) return; printf("chains:\n chainlength number of chains %% of utfstrings\n"); @@ -1159,9 +1698,9 @@ void utf_show(void) chain_count[i]=0; /* count numbers of hashchains according to their length */ - for (i=0; isize; i++) { - utf *u = (utf*) utf_hash.ptr[i]; + utf *u = (utf*) hashtable_utf->ptr[i]; u4 chain_length = 0; /* determine chainlength */ @@ -1189,16 +1728,17 @@ void utf_show(void) /* display results */ for (i=1;ientries)); - printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries); + printf(" >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries); printf("max. chainlength:%5d\n",max_chainlength); /* avg. chainlength = sum of chainlengths / number of chains */ - printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0])); + printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0])); } +#endif /* !defined(NDEBUG) */ /* @@ -1212,4 +1752,5 @@ void utf_show(void) * c-basic-offset: 4 * tab-width: 4 * End: + * vim:noexpandtab:sw=4:ts=4: */