* src/vm/jit/parse.c, src/vm/jit/parse.h (Changes): Merged with
[cacao.git] / src / vm / utf8.c
index f81fc82004c20949f1a3666536b3cf6e6cf9f810..4c65b583598801b950d8227a8a87118ba1998a78 100644 (file)
@@ -1,9 +1,9 @@
-/* src/vm/utf.c - utf functions
+/* src/vm/utf8.c - utf8 string functions
 
-   Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
-   R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
-   C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
-   Institut f. Computersprachen - TU Wien
+   Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
+   C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
+   E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
+   J. Wenninger, Institut f. Computersprachen - TU Wien
 
    This file is part of CACAO.
 
 
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
-   02111-1307, USA.
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.
 
-   Contact: cacao@complang.tuwien.ac.at
+   Contact: cacao@cacaojvm.org
 
    Authors: Reinhard Grafl
 
    Changes: Mark Probst
             Andreas Krall
             Christian Thalinger
+                       Edwin Steiner
 
-   $Id: utf8.c 3683 2005-11-16 13:27:46Z twisti $
+   $Id: utf8.c 5920 2006-11-05 21:23:09Z twisti $
 
 */
 
 
+#include "config.h"
+
 #include <string.h>
+#include <assert.h>
+
+#include "vm/types.h"
 
 #include "mm/memory.h"
+
+#if defined(ENABLE_THREADS)
+# include "threads/native/lock.h"
+#else
+# include "threads/none/lock.h"
+#endif
+
+#include "vm/builtin.h"
 #include "vm/exceptions.h"
+#include "vm/hashtable.h"
 #include "vm/options.h"
 #include "vm/statistics.h"
 #include "vm/stringlocal.h"
-#include "vm/tables.h"
 #include "vm/utf8.h"
 
 
 /* global variables ***********************************************************/
 
-#if defined(USE_THREADS)
-static java_objectheader *lock_utf_hashtable;
-#endif
+/* hashsize must be power of 2 */
 
-hashtable utf_hash;                     /* hashtable for utf8-symbols         */
+#define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
+
+hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
 
 
 /* utf-symbols for pointer comparison of frequently used strings **************/
 
-utf *utf_java_lang_Object;              /* java/lang/Object                   */
+utf *utf_java_lang_Object;
 
 utf *utf_java_lang_Class;
 utf *utf_java_lang_ClassLoader;
@@ -71,11 +85,14 @@ utf *utf_java_io_Serializable;
 utf *utf_java_lang_Throwable;
 utf *utf_java_lang_VMThrowable;
 utf *utf_java_lang_Error;
+utf *utf_java_lang_AbstractMethodError;
+utf *utf_java_lang_LinkageError;
 utf *utf_java_lang_NoClassDefFoundError;
 utf *utf_java_lang_NoSuchMethodError;
 utf *utf_java_lang_OutOfMemoryError;
 
 utf *utf_java_lang_Exception;
+utf *utf_java_lang_ClassCastException;
 utf *utf_java_lang_ClassNotFoundException;
 utf *utf_java_lang_IllegalArgumentException;
 utf *utf_java_lang_IllegalMonitorStateException;
@@ -104,6 +121,7 @@ utf *utf_Code;                          /* Code                               */
 utf *utf_Exceptions;                    /* Exceptions                         */
 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 utf *utf_SourceFile;                    /* SourceFile                         */
+utf *utf_Signature;
 
 utf *utf_init;                          /* <init>                             */
 utf *utf_clinit;                        /* <clinit>                           */
@@ -111,11 +129,12 @@ utf *utf_clone;                         /* clone                              */
 utf *utf_finalize;                      /* finalize                           */
 utf *utf_run;                           /* run                                */
 
-utf *utf_add;                           /* add                                */
-utf *utf_remove;                        /* remove                             */
-utf *utf_put;                           /* put                                */
-utf *utf_get;                           /* get                                */
-utf *utf_value;                         /* value                              */
+utf *utf_add;
+utf *utf_remove;
+utf *utf_removeThread;
+utf *utf_put;
+utf *utf_get;
+utf *utf_value;
 
 utf *utf_fillInStackTrace;
 utf *utf_getSystemClassLoader;
@@ -147,10 +166,11 @@ utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 utf *utf_java_lang_Object__java_lang_Object;
 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 utf *utf_java_lang_String__java_lang_Class;
+utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 
 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
-
+utf *utf_null;
 utf *array_packagename;
 
 
@@ -162,14 +182,15 @@ utf *array_packagename;
 
 bool utf8_init(void)
 {
-#if defined(USE_THREADS)
-       /* create utf hashtable lock object */
+       /* create utf8 hashtable */
+
+       hashtable_utf = NEW(hashtable);
 
-       lock_utf_hashtable = NEW(java_objectheader);
+       hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 
-# if defined(NATIVE_THREADS)
-       initObjectLock(lock_utf_hashtable);
-# endif
+#if defined(ENABLE_STATISTICS)
+       if (opt_stat)
+               count_utf_len += sizeof(utf*) * hashtable_utf->size;
 #endif
 
        /* create utf-symbols for pointer comparison of frequently used strings */
@@ -189,6 +210,12 @@ bool utf8_init(void)
        utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
        utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 
+       utf_java_lang_AbstractMethodError =
+               utf_new_char(string_java_lang_AbstractMethodError);
+
+       utf_java_lang_LinkageError =
+               utf_new_char(string_java_lang_LinkageError);
+
        utf_java_lang_NoClassDefFoundError =
                utf_new_char(string_java_lang_NoClassDefFoundError);
 
@@ -200,6 +227,9 @@ bool utf8_init(void)
 
        utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 
+       utf_java_lang_ClassCastException =
+               utf_new_char(string_java_lang_ClassCastException);
+
        utf_java_lang_ClassNotFoundException =
                utf_new_char(string_java_lang_ClassNotFoundException);
 
@@ -238,6 +268,7 @@ bool utf8_init(void)
        utf_Exceptions                 = utf_new_char("Exceptions");
        utf_LineNumberTable            = utf_new_char("LineNumberTable");
        utf_SourceFile                 = utf_new_char("SourceFile");
+       utf_Signature                  = utf_new_char("Signature");
 
        utf_init                           = utf_new_char("<init>");
        utf_clinit                         = utf_new_char("<clinit>");
@@ -247,6 +278,7 @@ bool utf8_init(void)
 
        utf_add                        = utf_new_char("add");
        utf_remove                     = utf_new_char("remove");
+       utf_removeThread               = utf_new_char("removeThread");
        utf_put                        = utf_new_char("put");
        utf_get                        = utf_new_char("get");
        utf_value                      = utf_new_char("value");
@@ -288,10 +320,11 @@ bool utf8_init(void)
        utf_java_lang_String__java_lang_Class =
                utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 
+       utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
        utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 
+       utf_null                       = utf_new_char("null");
        utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
-
        array_packagename              = utf_new_char("\t<the array package>");
 
        /* everything's ok */
@@ -423,8 +456,36 @@ u4 utf_hashkey(const char *text, u4 length)
     }
 }
 
+/* utf_full_hashkey ************************************************************
 
-/* utf_hashkey *****************************************************************
+   This function computes a hash value using all bytes in the string.
+
+   The algorithm is the "One-at-a-time" algorithm as published
+   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
+
+*******************************************************************************/
+
+u4 utf_full_hashkey(const char *text, u4 length)
+{
+       register const unsigned char *p = (const unsigned char *) text; /* bytes are read as unsigned */
+       register u4 hash;
+       register u4 i;
+       /* only `length` bounds the loop: zero bytes inside text are hashed too */
+       hash = 0;                           /* length 0 yields the hash value 0   */
+       for (i=length; i--;)                /* body runs exactly `length` times   */
+       {
+           hash += *p++;                   /* mix in the next byte               */
+           hash += (hash << 10);
+           hash ^= (hash >> 6);
+       }
+       hash += (hash << 3);                /* final mixing after the last byte   */
+       hash ^= (hash >> 11);
+       hash += (hash << 15);
+
+       return hash;
+}
+
+/* unicode_hashkey *************************************************************
 
    Compute the hashkey of a unicode string.
 
@@ -456,25 +517,16 @@ utf *utf_new(const char *text, u2 length)
        utf *u;                             /* hashtable element                  */
        u2 i;
 
-#if defined(USE_THREADS)
-       builtin_monitorenter(lock_utf_hashtable);
-#endif
-
-       /* XXX REMOVE ME! after testing of course ;-) */
-       #include <assert.h>
-       static int running = 0;
-       assert(running == 0);
-       running = 1;
-       /* XXX REMOVE ME! */
+       LOCK_MONITOR_ENTER(hashtable_utf->header);
 
-#ifdef STATISTICS
+#if defined(ENABLE_STATISTICS)
        if (opt_stat)
                count_utf_new++;
 #endif
 
        key  = utf_hashkey(text, length);
-       slot = key & (utf_hash.size - 1);
-       u    = utf_hash.ptr[slot];
+       slot = key & (hashtable_utf->size - 1);
+       u    = hashtable_utf->ptr[slot];
 
        /* search external hash chain for utf-symbol */
 
@@ -486,20 +538,14 @@ utf *utf_new(const char *text, u2 length)
                                if (text[i] != u->text[i])
                                        goto nomatch;
                        
-#if defined(STATISTICS)
+#if defined(ENABLE_STATISTICS)
                        if (opt_stat)
                                count_utf_new_found++;
 #endif
 
                        /* symbol found in hashtable */
 
-                       /* XXX REMOVE ME! */
-                       running = 0;
-                       /* XXX REMOVE ME! */
-
-#if defined(USE_THREADS)
-                       builtin_monitorexit(lock_utf_hashtable);
-#endif
+                       LOCK_MONITOR_EXIT(hashtable_utf->header);
 
                        return u;
                }
@@ -508,7 +554,7 @@ utf *utf_new(const char *text, u2 length)
                u = u->hashlink; /* next element in external chain */
        }
 
-#if defined(STATISTICS)
+#if defined(ENABLE_STATISTICS)
        if (opt_stat)
                count_utf_len += sizeof(utf) + length + 1;
 #endif
@@ -516,59 +562,61 @@ utf *utf_new(const char *text, u2 length)
        /* location in hashtable found, create new utf element */
        u = NEW(utf);
        u->blength  = length;               /* length in bytes of utfstring       */
-       u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
+       u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
        u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
+
        memcpy(u->text, text, length);      /* copy utf-text                      */
        u->text[length] = '\0';
-       utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 
-       utf_hash.entries++;                 /* update number of entries           */
+       hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
+       hashtable_utf->entries++;           /* update number of entries           */
 
-       if (utf_hash.entries > (utf_hash.size * 2)) {
+       if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 
-        /* reorganization of hashtable, average length of 
-           the external chains is approx. 2                */  
+        /* reorganization of hashtable, average length of the external
+           chains is approx. 2 */
 
-               u4 i;
-               utf *u;
-               hashtable newhash; /* the new hashtable */
+               hashtable *newhash;                              /* the new hashtable */
+               u4         i;
+               utf       *u;
+               utf       *nextu;
+               u4         slot;
 
                /* create new hashtable, double the size */
-               init_hashtable(&newhash, utf_hash.size * 2);
-               newhash.entries = utf_hash.entries;
 
-#ifdef STATISTICS
+               newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
+
+#if defined(ENABLE_STATISTICS)
                if (opt_stat)
-                       count_utf_len += sizeof(utf*) * utf_hash.size;
+                       count_utf_len += sizeof(utf*) * hashtable_utf->size;
 #endif
 
                /* transfer elements to new hashtable */
-               for (i = 0; i < utf_hash.size; i++) {
-                       u = (utf *) utf_hash.ptr[i];
+
+               for (i = 0; i < hashtable_utf->size; i++) {
+                       u = hashtable_utf->ptr[i];
+
                        while (u) {
-                               utf *nextu = u->hashlink;
-                               u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
+                               nextu = u->hashlink;
+                               slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
                                                
-                               u->hashlink = (utf *) newhash.ptr[slot];
-                               newhash.ptr[slot] = u;
+                               u->hashlink = (utf *) newhash->ptr[slot];
+                               newhash->ptr[slot] = u;
 
                                /* follow link in external hash chain */
+
                                u = nextu;
                        }
                }
        
                /* dispose old table */
-               MFREE(utf_hash.ptr, void*, utf_hash.size);
-               utf_hash = newhash;
-       }
 
-       /* XXX REMOVE ME! */
-       running = 0;
-       /* XXX REMOVE ME! */
+               hashtable_free(hashtable_utf);
 
-#if defined(USE_THREADS)
-       builtin_monitorexit(lock_utf_hashtable);
-#endif
+               hashtable_utf = newhash;
+       }
+
+       LOCK_MONITOR_EXIT(hashtable_utf->header);
 
        return u;
 }
@@ -691,6 +739,9 @@ utf *utf_new_char_classname(const char *text)
    Read the next unicode character from the utf string and increment
    the utf-string pointer accordingly.
 
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
 *******************************************************************************/
 
 u2 utf_nextu2(char **utf_ptr)
@@ -739,20 +790,100 @@ u2 utf_nextu2(char **utf_ptr)
 }
 
 
-/* utf_strlen ******************************************************************
+/* utf_bytes *******************************************************************
+
+   Determine number of bytes (aka. octets) in the utf string.
 
-   Determine number of unicode characters in the utf string.
+   IN:
+      u............utf string
+
+   OUT:
+      The number of octets of this utf string.
+         There is _no_ terminating zero included in this count.
 
 *******************************************************************************/
 
-u4 utf_strlen(utf *u)
+u4 utf_bytes(utf *u)
+{
+       return u->blength;                  /* stored byte length; u is dereferenced without a NULL check */
+}
+
+/* utf_get_number_of_u2s_for_buffer ********************************************
+
+   Determine number of UTF-16 u2s in the given UTF-8 buffer
+
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
+   CAUTION: Use this function *only* when you want to convert a UTF-8 buffer
+   to an array of u2s (UTF-16) and want to know how many of them you will get.
+   All other uses of this function are probably wrong.
+
+   IN:
+      buffer........points to first char in buffer
+         blength.......number of _bytes_ in the buffer
+
+   OUT:
+      the number of u2s needed to hold this string in UTF-16 encoding.
+         There is _no_ terminating zero included in this count.
+
+   NOTE: Unlike utf_get_number_of_u2s, this function never throws an
+   exception.
+
+*******************************************************************************/
+
+u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
+{
+       const char *endpos;                 /* points behind utf string           */
+       const char *utf_ptr;                /* current position in utf text       */
+       u4 len = 0;                         /* number of unicode characters       */
+
+       utf_ptr = buffer;
+       endpos = utf_ptr + blength;
+       /* every utf_nextu2 call below is counted as exactly one u2 */
+       while (utf_ptr < endpos) {
+               len++;
+               /* next unicode character; the cast drops const because utf_nextu2 takes a char ** */
+               utf_nextu2((char **)&utf_ptr);
+       }
+       /* the last character must end exactly at endpos (checked only when asserts are enabled) */
+       assert(utf_ptr == endpos);
+
+       return len;
+}
+
+
+/* utf_get_number_of_u2s *******************************************************
+
+   Determine number of UTF-16 u2s in the utf string.
+
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
+   CAUTION: Use this function *only* when you want to convert a utf string
+   to an array of u2s and want to know how many of them you will get.
+   All other uses of this function are probably wrong.
+
+   IN:
+      u............utf string
+
+   OUT:
+      the number of u2s needed to hold this string in UTF-16 encoding.
+         There is _no_ terminating zero included in this count.
+         XXX 0 if a NullPointerException has been thrown (see below)
+
+*******************************************************************************/
+
+u4 utf_get_number_of_u2s(utf *u)
 {
        char *endpos;                       /* points behind utf string           */
        char *utf_ptr;                      /* current position in utf text       */
        u4 len = 0;                         /* number of unicode characters       */
 
+       /* XXX this is probably not checked by most callers! Review this after */
+       /* the invalid uses of this function have been eliminated */
        if (!u) {
-               *exceptionptr = new_nullpointerexception();
+               exceptions_throw_nullpointerexception();
                return 0;
        }
 
@@ -774,6 +905,309 @@ u4 utf_strlen(utf *u)
 }
 
 
+/* utf8_safe_number_of_u2s *****************************************************
+
+   Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
+   (For invalid UTF-8 the U+fffd replacement character will be counted.)
+
+   This function is safe even for invalid UTF-8 strings.
+
+   IN:
+      text..........zero-terminated(!) UTF-8 string (may be invalid)
+                       must NOT be NULL
+         nbytes........strlen(text). (This is needed to completely emulate
+                       the RI).
+
+   OUT:
+      the number of u2s needed to hold this string in UTF-16 encoding.
+         There is _no_ terminating zero included in this count.
+
+*******************************************************************************/
+
+s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
+       register const unsigned char *t;
+       register s4 byte;
+       register s4 len;
+       register const unsigned char *tlimit;
+       s4 byte1;
+       s4 byte2;
+       s4 byte3;
+       s4 value;
+       s4 skip;
+       /* each decoded character or invalid sequence counts as one u2, a surrogate pair as two */
+       assert(text);
+       assert(nbytes >= 0);
+
+       len = 0;
+       t = (const unsigned char *) text;
+       tlimit = t + nbytes;                /* first byte behind the nbytes of text */
+
+       /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
+
+       while (1) {
+               byte = *t++;                /* lead byte of the next sequence     */
+
+               if (byte & 0x80) {
+                       /* highest bit set, non-ASCII character */
+
+                       if ((byte & 0xe0) == 0xc0) {
+                               /* 2-byte: should be 110..... 10...... ? */
+                               /* no tlimit check here: a terminating NUL fails the test and is re-read */
+                               if ((*t++ & 0xc0) == 0x80)
+                                       ; /* valid 2-byte */
+                               else
+                                       t--; /* invalid: re-read this byte as the next lead byte */
+                       }
+                       else if ((byte & 0xf0) == 0xe0) {
+                               /* 3-byte: should be 1110.... 10...... 10...... */
+                               /*                            ^t                */
+                               /* fewer than 2 bytes left before tlimit: count the truncated sequence once */
+                               if (t + 2 > tlimit)
+                                       return len + 1; /* invalid, stop here */
+
+                               if ((*t++ & 0xc0) == 0x80) {
+                                       if ((*t++ & 0xc0) == 0x80)
+                                               ; /* valid 3-byte */
+                                       else
+                                               t--; /* invalid: re-read this byte as the next lead byte */
+                               }
+                               else
+                                       t--; /* invalid: re-read this byte as the next lead byte */
+                       }
+                       else if ((byte & 0xf8) == 0xf0) {
+                               /* 4-byte: should be 11110... 10...... 10...... 10...... */
+                               /*                            ^t                         */
+                               /* fewer than 3 bytes left before tlimit: count the truncated sequence once */
+                               if (t + 3 > tlimit)
+                                       return len + 1; /* invalid, stop here */
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               if (((byte3 = *t++) & 0xc0) == 0x80) {
+                                                       /* valid 4-byte UTF-8? */
+                                                       value = ((byte  & 0x07) << 18)
+                                                                 | ((byte1 & 0x3f) << 12)
+                                                                 | ((byte2 & 0x3f) <<  6)
+                                                                 | ((byte3 & 0x3f)      );
+
+                                                       if (value > 0x10FFFF)
+                                                               ; /* invalid: counted once by len++ below */
+                                                       else if (value > 0xFFFF)
+                                                               len += 1; /* we need surrogates: second u2 comes from len++ below */
+                                                       else
+                                                               ; /* 16bit suffice */
+                                               }
+                                               else
+                                                       t--; /* invalid: re-read this byte as the next lead byte */
+                                       }
+                                       else
+                                               t--; /* invalid: re-read this byte as the next lead byte */
+                               }
+                               else
+                                       t--; /* invalid: re-read this byte as the next lead byte */
+                       }
+                       else if ((byte & 0xfc) == 0xf8) {
+                               /* invalid 5-byte */
+                               if (t + 4 > tlimit)
+                                       return len + 1; /* invalid, stop here */
+
+                               skip = 4;   /* swallow at most 4 continuation bytes */
+                               for (; skip && ((*t & 0xc0) == 0x80); --skip)
+                                       t++;
+                       }
+                       else if ((byte & 0xfe) == 0xfc) {
+                               /* invalid 6-byte */
+                               if (t + 5 > tlimit)
+                                       return len + 1; /* invalid, stop here */
+
+                               skip = 5;   /* swallow at most 5 continuation bytes */
+                               for (; skip && ((*t & 0xc0) == 0x80); --skip)
+                                       t++;
+                       }
+                       else
+                               ; /* invalid lead byte (10xxxxxx, 0xfe or 0xff) */
+               }
+               else {
+                       /* NUL */
+
+                       if (byte == 0)
+                               break;      /* the terminator itself is not counted */
+
+                       /* ASCII character, common case */
+               }
+
+               len++;                      /* one u2 for the sequence just scanned */
+       }
+
+       return len;
+}
+
+
+/* utf8_safe_convert_to_u2s ****************************************************
+
+   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
+   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
+   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
+
+   This function is safe even for invalid UTF-8 strings.
+
+   IN:
+      text..........zero-terminated(!) UTF-8 string (may be invalid)
+                       must NOT be NULL
+         nbytes........strlen(text). (This is needed to completely emulate
+                                       the RI).
+         buffer........a preallocated array of u2s to receive the decoded
+                       string. Use utf8_safe_number_of_u2s to get the
+                                       required number of u2s for allocating this.
+
+*******************************************************************************/
+
+#define UNICODE_REPLACEMENT  0xfffd
+
+void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
+       register const unsigned char *t;
+       register s4 byte;
+       register const unsigned char *tlimit;
+       s4 byte1;
+       s4 byte2;
+       s4 byte3;
+       s4 value;
+       s4 skip;
+       /* buffer capacity is not checked here; nothing is stored for the terminating NUL */
+       assert(text);
+       assert(nbytes >= 0);
+
+       t = (const unsigned char *) text;
+       tlimit = t + nbytes;                /* first byte behind the nbytes of text */
+
+       /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
+
+       while (1) {
+               byte = *t++;                /* lead byte of the next sequence     */
+
+               if (byte & 0x80) {
+                       /* highest bit set, non-ASCII character */
+
+                       if ((byte & 0xe0) == 0xc0) {
+                               /* 2-byte: should be 110..... 10...... */
+                               /* no tlimit check here: a terminating NUL fails the test and is re-read */
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       /* valid 2-byte UTF-8 */
+                                       *buffer++ = ((byte  & 0x1f) << 6)
+                                                         | ((byte1 & 0x3f)     );
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--; /* re-read the offending byte as the next lead byte */
+                               }
+                       }
+                       else if ((byte & 0xf0) == 0xe0) {
+                               /* 3-byte: should be 1110.... 10...... 10...... */
+                               /* fewer than 2 bytes left before tlimit: emit one replacement and stop */
+                               if (t + 2 > tlimit) {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       return;
+                               }
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               /* valid 3-byte UTF-8 */
+                                               *buffer++ = ((byte  & 0x0f) << 12)
+                                                                 | ((byte1 & 0x3f) <<  6)
+                                                                 | ((byte2 & 0x3f)      );
+                                       }
+                                       else {
+                                               *buffer++ = UNICODE_REPLACEMENT;
+                                               t--; /* re-read the offending byte as the next lead byte */
+                                       }
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--; /* re-read the offending byte as the next lead byte */
+                               }
+                       }
+                       else if ((byte & 0xf8) == 0xf0) {
+                               /* 4-byte: should be 11110... 10...... 10...... 10...... */
+                               /* fewer than 3 bytes left before tlimit: emit one replacement and stop */
+                               if (t + 3 > tlimit) {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       return;
+                               }
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               if (((byte3 = *t++) & 0xc0) == 0x80) {
+                                                       /* valid 4-byte UTF-8? */
+                                                       value = ((byte  & 0x07) << 18)
+                                                                 | ((byte1 & 0x3f) << 12)
+                                                                 | ((byte2 & 0x3f) <<  6)
+                                                                 | ((byte3 & 0x3f)      );
+
+                                                       if (value > 0x10FFFF) {
+                                                               *buffer++ = UNICODE_REPLACEMENT;
+                                                       }
+                                                       else if (value > 0xFFFF) {
+                                                               /* we need surrogates */
+                                                               *buffer++ = 0xd800 | ((value >> 10) - 0x40); /* high: (value - 0x10000) >> 10 */
+                                                               *buffer++ = 0xdc00 | (value & 0x03ff);       /* low: lowest 10 bits */
+                                                       }
+                                                       else
+                                                               *buffer++ = value; /* 16bit suffice */
+                                               }
+                                               else {
+                                                       *buffer++ = UNICODE_REPLACEMENT;
+                                                       t--; /* re-read the offending byte as the next lead byte */
+                                               }
+                                       }
+                                       else {
+                                               *buffer++ = UNICODE_REPLACEMENT;
+                                               t--; /* re-read the offending byte as the next lead byte */
+                                       }
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--; /* re-read the offending byte as the next lead byte */
+                               }
+                       }
+                       else if ((byte & 0xfc) == 0xf8) {
+                               if (t + 4 > tlimit) {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       return;
+                               }
+                               /* invalid 5-byte form: swallow at most 4 continuation bytes, emit one replacement */
+                               skip = 4;
+                               for (; skip && ((*t & 0xc0) == 0x80); --skip)
+                                       t++;
+                               *buffer++ = UNICODE_REPLACEMENT;
+                       }
+                       else if ((byte & 0xfe) == 0xfc) {
+                               if (t + 5 > tlimit) {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       return;
+                               }
+                               /* invalid 6-byte form: swallow at most 5 continuation bytes, emit one replacement */
+                               skip = 5;
+                               for (; skip && ((*t & 0xc0) == 0x80); --skip)
+                                       t++;
+                               *buffer++ = UNICODE_REPLACEMENT;
+                       }
+                       else
+                               *buffer++ = UNICODE_REPLACEMENT; /* invalid lead byte (10xxxxxx, 0xfe or 0xff) */
+               }
+               else {
+                       /* NUL */
+
+                       if (byte == 0)
+                               break;      /* end of text; the output is not zero-terminated */
+
+                       /* ASCII character, common case */
+
+                       *buffer++ = byte;
+               }
+       }
+}
+
+
 /* u2_utflength ****************************************************************
 
    Returns the utf length in bytes of a u2 array.
@@ -803,18 +1237,100 @@ u4 u2_utflength(u2 *text, u4 u2_length)
 }
 
 
-/* utf_display *****************************************************************
+/* utf_copy ********************************************************************
+
+   Copy the given utf string byte-for-byte to a buffer, terminating zero included.
+
+   IN:
+      buffer.......destination, receives u->blength + 1 chars
+      u............the utf string
+
+*******************************************************************************/
+
+void utf_copy(char *buffer, utf *u)
+{
+       /* blength + 1: our utf strings are zero-terminated (done by utf_new) */
+       MCOPY(buffer, u->text, char, u->blength + 1);
+}
+
+
+/* utf_cat *********************************************************************
+
+   Append the given utf string byte-for-byte to the C string held in a buffer.
+
+   IN:
+      buffer.......zero-terminated C string to append to (strlen is applied)
+      u............the utf string
+
+*******************************************************************************/
+
+void utf_cat(char *buffer, utf *u)
+{
+       /* blength + 1: our utf strings are zero-terminated (done by utf_new) */
+       MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
+}
+
+
+/* utf_copy_classname **********************************************************
+
+   Copy the given utf classname byte-for-byte to a buffer.
+   '/' is replaced by '.'; the terminating zero is copied as well.
+
+   IN:
+      buffer.......the buffer
+      u............the utf string
+
+*******************************************************************************/
+
+void utf_copy_classname(char *buffer, utf *u)
+{
+       char *bufptr;
+       char *srcptr;
+       char *endptr;
+       char ch;
+
+       bufptr = buffer;
+       srcptr = u->text;
+       endptr = UTF_END(u) + 1; /* utfs are zero-terminated by utf_new */
+
+       while (srcptr != endptr) {
+               ch = *srcptr++;
+               if (ch == '/')
+                       ch = '.';
+               *bufptr++ = ch;
+       }
+}
+
+
+/* utf_cat_classname ***********************************************************
+
+   Append the given utf classname byte-for-byte to the C string in a buffer.
+   '/' is replaced by '.'
+
+   IN:
+      buffer.......zero-terminated C string to append to (strlen is applied)
+      u............the utf string
+
+*******************************************************************************/
+
+void utf_cat_classname(char *buffer, utf *u)
+{
+       utf_copy_classname(buffer + strlen(buffer), u);
+}
+
+/* utf_display_printable_ascii *************************************************
 
    Write utf symbol to stdout (for debugging purposes).
+   Non-printable and non-ASCII characters are printed as '?'.
 
 *******************************************************************************/
 
-void utf_display(utf *u)
+void utf_display_printable_ascii(utf *u)
 {
        char *endpos;                       /* points behind utf string           */
        char *utf_ptr;                      /* current position in utf text       */
 
-       if (!u) {
+       if (u == NULL) {
                printf("NULL");
                fflush(stdout);
                return;
@@ -824,29 +1340,34 @@ void utf_display(utf *u)
        utf_ptr = u->text;
 
        while (utf_ptr < endpos) {
-               /* read next unicode character */                
+               /* read next unicode character */
+
                u2 c = utf_nextu2(&utf_ptr);
-               if (c >= 32 && c <= 127) printf("%c", c);
-               else printf("?");
+
+               if ((c >= 32) && (c <= 127))
+                       printf("%c", c);
+               else
+                       printf("?");
        }
 
        fflush(stdout);
 }
 
 
-/* utf_display_classname *******************************************************
+/* utf_display_printable_ascii_classname ***************************************
 
    Write utf symbol to stdout with `/' converted to `.' (for debugging
    purposes).
+   Non-printable and non-ASCII characters are printed as '?'.
 
 *******************************************************************************/
 
-void utf_display_classname(utf *u)
+void utf_display_printable_ascii_classname(utf *u)
 {
        char *endpos;                       /* points behind utf string           */
        char *utf_ptr;                      /* current position in utf text       */
 
-       if (!u) {
+       if (u == NULL) {
                printf("NULL");
                fflush(stdout);
                return;
@@ -856,24 +1377,32 @@ void utf_display_classname(utf *u)
        utf_ptr = u->text;
 
        while (utf_ptr < endpos) {
-               /* read next unicode character */                
+               /* read next unicode character */
+
                u2 c = utf_nextu2(&utf_ptr);
-               if (c == '/') c = '.';
-               if (c >= 32 && c <= 127) printf("%c", c);
-               else printf("?");
+
+               if (c == '/')
+                       c = '.';
+
+               if ((c >= 32) && (c <= 127))
+                       printf("%c", c);
+               else
+                       printf("?");
        }
 
        fflush(stdout);
 }
 
 
-/* utf_sprint ******************************************************************
+/* utf_sprint_convert_to_latin1 ************************************************
        
    Write utf symbol into c-string (for debugging purposes).
+   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
+   invalid results.
 
 *******************************************************************************/
 
-void utf_sprint(char *buffer, utf *u)
+void utf_sprint_convert_to_latin1(char *buffer, utf *u)
 {
        char *endpos;                       /* points behind utf string           */
        char *utf_ptr;                      /* current position in utf text       */
@@ -896,14 +1425,16 @@ void utf_sprint(char *buffer, utf *u)
 }
 
 
-/* utf_sprint_classname ********************************************************
+/* utf_sprint_convert_to_latin1_classname **************************************
        
    Write utf symbol into c-string with `/' converted to `.' (for debugging
    purposes).
+   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
+   invalid results.
 
 *******************************************************************************/
 
-void utf_sprint_classname(char *buffer, utf *u)
+void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
 {
        char *endpos;                       /* points behind utf string           */
        char *utf_ptr;                      /* current position in utf text       */
@@ -929,37 +1460,42 @@ void utf_sprint_classname(char *buffer, utf *u)
 }
 
 
-/* utf_strcat ******************************************************************
+/* utf_strcat_convert_to_latin1 ************************************************
        
    Like libc strcat, but uses an utf8 string.
+   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
+   invalid results.
 
 *******************************************************************************/
 
-void utf_strcat(char *buffer, utf *u)
+void utf_strcat_convert_to_latin1(char *buffer, utf *u)
 {
-       utf_sprint(buffer + strlen(buffer), u);
+       utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
 }
 
 
-/* utf_strcat_classname ********************************************************
+/* utf_strcat_convert_to_latin1_classname **************************************
        
    Like libc strcat, but uses an utf8 string.
+   `/' is converted to `.'. Characters are converted to 8-bit Latin-1;
+   non-Latin-1 characters yield invalid results.
 
 *******************************************************************************/
 
-void utf_strcat_classname(char *buffer, utf *u)
+void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
 {
-       utf_sprint_classname(buffer + strlen(buffer), u);
+       utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
 }
 
 
-/* utf_fprint ******************************************************************
+/* utf_fprint_printable_ascii **************************************************
        
    Write utf symbol into file.
+   Non-printable and non-ASCII characters are printed as '?'.
 
 *******************************************************************************/
 
-void utf_fprint(FILE *file, utf *u)
+void utf_fprint_printable_ascii(FILE *file, utf *u)
 {
        char *endpos;                       /* points behind utf string           */
        char *utf_ptr;                      /* current position in utf text       */
@@ -980,13 +1516,14 @@ void utf_fprint(FILE *file, utf *u)
 }
 
 
-/* utf_fprint_classname ********************************************************
+/* utf_fprint_printable_ascii_classname ****************************************
        
    Write utf symbol into file with `/' converted to `.'.
+   Non-printable and non-ASCII characters are printed as '?'.
 
 *******************************************************************************/
 
-void utf_fprint_classname(FILE *file, utf *u)
+void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
 {
        char *endpos;                       /* points behind utf string           */
        char *utf_ptr;                      /* current position in utf text       */
@@ -1117,6 +1654,7 @@ bool is_valid_name_utf(utf *u)
 
 *******************************************************************************/
 
+#if !defined(NDEBUG)
 void utf_show(void)
 {
 
@@ -1128,29 +1666,30 @@ void utf_show(void)
        u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
        u4 i;
 
-       printf ("UTF-HASH:\n");
+       printf("UTF-HASH:\n");
 
        /* show element of utf-hashtable */
-       for (i=0; i<utf_hash.size; i++) {
-               utf *u = utf_hash.ptr[i];
+
+       for (i = 0; i < hashtable_utf->size; i++) {
+               utf *u = hashtable_utf->ptr[i];
+
                if (u) {
-                       printf ("SLOT %d: ", (int) i);
+                       printf("SLOT %d: ", (int) i);
+
                        while (u) {
-                               printf ("'");
-                               utf_display (u);
-                               printf ("' ");
+                               printf("'");
+                               utf_display_printable_ascii(u);
+                               printf("' ");
                                u = u->hashlink;
                        }       
-                       printf ("\n");
+                       printf("\n");
                }
-               
        }
 
-       printf ("UTF-HASH: %d slots for %d entries\n", 
-                       (int) utf_hash.size, (int) utf_hash.entries );
-
+       printf("UTF-HASH: %d slots for %d entries\n", 
+                  (int) hashtable_utf->size, (int) hashtable_utf->entries );
 
-       if (utf_hash.entries == 0)
+       if (hashtable_utf->entries == 0)
                return;
 
        printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
@@ -1159,9 +1698,9 @@ void utf_show(void)
                chain_count[i]=0;
 
        /* count numbers of hashchains according to their length */
-       for (i=0; i<utf_hash.size; i++) {
+       for (i=0; i<hashtable_utf->size; i++) {
                  
-               utf *u = (utf*) utf_hash.ptr[i];
+               utf *u = (utf*) hashtable_utf->ptr[i];
                u4 chain_length = 0;
 
                /* determine chainlength */
@@ -1189,16 +1728,17 @@ void utf_show(void)
 
        /* display results */  
        for (i=1;i<CHAIN_LIMIT-1;i++) 
-               printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
+               printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
          
-       printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
+       printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
 
 
        printf("max. chainlength:%5d\n",max_chainlength);
 
        /* avg. chainlength = sum of chainlengths / number of chains */
-       printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
+       printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
 }
+#endif /* !defined(NDEBUG) */
 
 
 /*
@@ -1212,4 +1752,5 @@ void utf_show(void)
  * c-basic-offset: 4
  * tab-width: 4
  * End:
+ * vim:noexpandtab:sw=4:ts=4:
  */