merged volatile memory barriers
[cacao.git] / src / vm / utf8.c
index d9b27f32a9f7c2885963622da72d346be886539c..7afcb2ce14d540538f992ffd5cea05ba62934c18 100644 (file)
@@ -1,9 +1,7 @@
 /* src/vm/utf8.c - utf8 string functions
 
-   Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
-   C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
-   E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
-   J. Wenninger, Institut f. Computersprachen - TU Wien
+   Copyright (C) 1996-2005, 2006, 2007, 2008
+   CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
 
    This file is part of CACAO.
 
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
    02110-1301, USA.
 
-   Contact: cacao@cacaojvm.org
-
-   Authors: Reinhard Grafl
-
-   Changes: Mark Probst
-            Andreas Krall
-            Christian Thalinger
-                       Edwin Steiner
-
-   $Id: utf8.c 5046 2006-06-22 14:39:10Z twisti $
-
 */
 
 
 
 #include "vm/types.h"
 
-#include "mm/memory.h"
+#include "mm/memory.hpp"
 
-#if defined(ENABLE_THREADS)
-# include "threads/native/threads.h"
-#endif
+#include "threads/mutex.hpp"
+
+#include "toolbox/hashtable.h"
 
-#include "vm/builtin.h"
-#include "vm/exceptions.h"
-#include "vm/hashtable.h"
+#include "vm/exceptions.hpp"
 #include "vm/options.h"
-#include "vm/statistics.h"
-#include "vm/stringlocal.h"
+
+#if defined(ENABLE_STATISTICS)
+# include "vm/statistics.h"
+#endif
+
 #include "vm/utf8.h"
 
 
@@ -76,26 +64,58 @@ utf *utf_java_lang_ClassLoader;
 utf *utf_java_lang_Cloneable;
 utf *utf_java_lang_SecurityManager;
 utf *utf_java_lang_String;
-utf *utf_java_lang_System;
 utf *utf_java_lang_ThreadGroup;
+utf *utf_java_lang_ref_SoftReference;
+utf *utf_java_lang_ref_WeakReference;
+utf *utf_java_lang_ref_PhantomReference;
 utf *utf_java_io_Serializable;
 
 utf *utf_java_lang_Throwable;
-utf *utf_java_lang_VMThrowable;
 utf *utf_java_lang_Error;
-utf *utf_java_lang_NoClassDefFoundError;
+
+utf *utf_java_lang_AbstractMethodError;
+utf *utf_java_lang_ClassCircularityError;
+utf *utf_java_lang_ClassFormatError;
+utf *utf_java_lang_ExceptionInInitializerError;
+utf *utf_java_lang_IncompatibleClassChangeError;
+utf *utf_java_lang_InstantiationError;
+utf *utf_java_lang_InternalError;
 utf *utf_java_lang_LinkageError;
+utf *utf_java_lang_NoClassDefFoundError;
+utf *utf_java_lang_NoSuchFieldError;
 utf *utf_java_lang_NoSuchMethodError;
 utf *utf_java_lang_OutOfMemoryError;
+utf *utf_java_lang_UnsatisfiedLinkError;
+utf *utf_java_lang_UnsupportedClassVersionError;
+utf *utf_java_lang_VerifyError;
+utf *utf_java_lang_VirtualMachineError;
 
 utf *utf_java_lang_Exception;
+
+utf *utf_java_lang_ArithmeticException;
+utf *utf_java_lang_ArrayIndexOutOfBoundsException;
+utf *utf_java_lang_ArrayStoreException;
+utf *utf_java_lang_ClassCastException;
 utf *utf_java_lang_ClassNotFoundException;
+utf *utf_java_lang_CloneNotSupportedException;
+utf *utf_java_lang_IllegalAccessException;
 utf *utf_java_lang_IllegalArgumentException;
 utf *utf_java_lang_IllegalMonitorStateException;
-
+utf *utf_java_lang_InstantiationException;
+utf *utf_java_lang_InterruptedException;
+utf *utf_java_lang_NegativeArraySizeException;
 utf *utf_java_lang_NullPointerException;
+utf *utf_java_lang_RuntimeException;
+utf *utf_java_lang_StringIndexOutOfBoundsException;
+
+utf *utf_java_lang_reflect_InvocationTargetException;
 
+utf *utf_java_security_PrivilegedActionException;
+
+#if defined(ENABLE_JAVASE)
 utf* utf_java_lang_Void;
+#endif
+
 utf* utf_java_lang_Boolean;
 utf* utf_java_lang_Byte;
 utf* utf_java_lang_Character;
@@ -105,11 +125,20 @@ utf* utf_java_lang_Long;
 utf* utf_java_lang_Float;
 utf* utf_java_lang_Double;
 
+#if defined(ENABLE_JAVASE)
 utf *utf_java_lang_StackTraceElement;
 utf *utf_java_lang_reflect_Constructor;
 utf *utf_java_lang_reflect_Field;
 utf *utf_java_lang_reflect_Method;
+
+# if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
+utf *utf_java_lang_reflect_VMConstructor;
+utf *utf_java_lang_reflect_VMField;
+utf *utf_java_lang_reflect_VMMethod;
+# endif
+
 utf *utf_java_util_Vector;
+#endif
 
 utf *utf_InnerClasses;                  /* InnerClasses                       */
 utf *utf_ConstantValue;                 /* ConstantValue                      */
@@ -118,23 +147,52 @@ utf *utf_Exceptions;                    /* Exceptions                         */
 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 utf *utf_SourceFile;                    /* SourceFile                         */
 
+#if defined(ENABLE_JAVASE)
+utf *utf_EnclosingMethod;
+utf *utf_Signature;
+utf *utf_StackMapTable;
+
+# if defined(ENABLE_JVMTI)
+utf *utf_LocalVariableTable;
+# endif
+
+# if defined(ENABLE_ANNOTATIONS)
+utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
+utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
+utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
+utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
+utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
+# endif
+#endif
+
 utf *utf_init;                          /* <init>                             */
 utf *utf_clinit;                        /* <clinit>                           */
 utf *utf_clone;                         /* clone                              */
 utf *utf_finalize;                      /* finalize                           */
+utf *utf_invoke;
+utf *utf_main;
 utf *utf_run;                           /* run                                */
 
-utf *utf_add;                           /* add                                */
-utf *utf_remove;                        /* remove                             */
-utf *utf_put;                           /* put                                */
-utf *utf_get;                           /* get                                */
-utf *utf_value;                         /* value                              */
+utf *utf_add;
+utf *utf_dispatch;
+utf *utf_remove;
+utf *utf_addThread;
+utf *utf_removeThread;
+utf *utf_put;
+utf *utf_get;
+utf *utf_uncaughtException;
+utf *utf_value;
 
 utf *utf_fillInStackTrace;
+utf *utf_findNative;
 utf *utf_getSystemClassLoader;
+utf *utf_initCause;
 utf *utf_loadClass;
+utf *utf_loadClassInternal;
 utf *utf_printStackTrace;
 
+utf *utf_division_by_zero;
+
 utf *utf_Z;                             /* Z                                  */
 utf *utf_B;                             /* B                                  */
 utf *utf_C;                             /* C                                  */
@@ -157,13 +215,19 @@ utf *utf_double__void;                  /* (D)V                               */
 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
+utf *utf_java_lang_ClassLoader_java_lang_String__J;
+utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 utf *utf_java_lang_Object__java_lang_Object;
 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 utf *utf_java_lang_String__java_lang_Class;
+utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
+utf *utf_java_lang_Thread_java_lang_Throwable__V;
+utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
+utf *utf_java_lang_Throwable__java_lang_Throwable;
 
 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
-
+utf *utf_null;
 utf *array_packagename;
 
 
@@ -173,8 +237,10 @@ utf *array_packagename;
 
 *******************************************************************************/
 
-bool utf8_init(void)
+void utf8_init(void)
 {
+       TRACESUBSYSTEMINITIALIZATION("utf8_init");
+
        /* create utf8 hashtable */
 
        hashtable_utf = NEW(hashtable);
@@ -195,41 +261,123 @@ bool utf8_init(void)
        utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
        utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
        utf_java_lang_String           = utf_new_char("java/lang/String");
-       utf_java_lang_System           = utf_new_char("java/lang/System");
        utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
+
+       utf_java_lang_ref_SoftReference =
+               utf_new_char("java/lang/ref/SoftReference");
+
+       utf_java_lang_ref_WeakReference =
+               utf_new_char("java/lang/ref/WeakReference");
+
+       utf_java_lang_ref_PhantomReference =
+               utf_new_char("java/lang/ref/PhantomReference");
+
        utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 
-       utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
-       utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
-       utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
+       utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
+       utf_java_lang_Error            = utf_new_char("java/lang/Error");
+
+       utf_java_lang_ClassCircularityError =
+               utf_new_char("java/lang/ClassCircularityError");
+
+       utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
+
+       utf_java_lang_ExceptionInInitializerError =
+               utf_new_char("java/lang/ExceptionInInitializerError");
+
+       utf_java_lang_IncompatibleClassChangeError =
+               utf_new_char("java/lang/IncompatibleClassChangeError");
+
+       utf_java_lang_InstantiationError =
+               utf_new_char("java/lang/InstantiationError");
+
+       utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
+       utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 
        utf_java_lang_NoClassDefFoundError =
-               utf_new_char(string_java_lang_NoClassDefFoundError);
+               utf_new_char("java/lang/NoClassDefFoundError");
+
+       utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
+
+       utf_java_lang_UnsatisfiedLinkError =
+               utf_new_char("java/lang/UnsatisfiedLinkError");
+
+       utf_java_lang_UnsupportedClassVersionError =
+               utf_new_char("java/lang/UnsupportedClassVersionError");
+
+       utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
+
+       utf_java_lang_VirtualMachineError =
+               utf_new_char("java/lang/VirtualMachineError");
 
-       utf_java_lang_LinkageError =
-               utf_new_char(string_java_lang_LinkageError);
+#if defined(ENABLE_JAVASE)
+       utf_java_lang_AbstractMethodError =
+               utf_new_char("java/lang/AbstractMethodError");
+
+       utf_java_lang_NoSuchFieldError =
+               utf_new_char("java/lang/NoSuchFieldError");
 
        utf_java_lang_NoSuchMethodError =
-               utf_new_char(string_java_lang_NoSuchMethodError);
+               utf_new_char("java/lang/NoSuchMethodError");
+#endif
+
+       utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 
-       utf_java_lang_OutOfMemoryError =
-               utf_new_char(string_java_lang_OutOfMemoryError);
+       utf_java_lang_ArithmeticException =
+               utf_new_char("java/lang/ArithmeticException");
 
-       utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
+       utf_java_lang_ArrayIndexOutOfBoundsException =
+               utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
+
+       utf_java_lang_ArrayStoreException =
+               utf_new_char("java/lang/ArrayStoreException");
+
+       utf_java_lang_ClassCastException =
+               utf_new_char("java/lang/ClassCastException");
 
        utf_java_lang_ClassNotFoundException =
-               utf_new_char(string_java_lang_ClassNotFoundException);
+               utf_new_char("java/lang/ClassNotFoundException");
+
+       utf_java_lang_CloneNotSupportedException =
+               utf_new_char("java/lang/CloneNotSupportedException");
+
+       utf_java_lang_IllegalAccessException =
+               utf_new_char("java/lang/IllegalAccessException");
 
        utf_java_lang_IllegalArgumentException =
-               utf_new_char(string_java_lang_IllegalArgumentException);
+               utf_new_char("java/lang/IllegalArgumentException");
 
        utf_java_lang_IllegalMonitorStateException =
-               utf_new_char(string_java_lang_IllegalMonitorStateException);
+               utf_new_char("java/lang/IllegalMonitorStateException");
+
+       utf_java_lang_InstantiationException =
+               utf_new_char("java/lang/InstantiationException");
+
+       utf_java_lang_InterruptedException =
+               utf_new_char("java/lang/InterruptedException");
+
+       utf_java_lang_NegativeArraySizeException =
+               utf_new_char("java/lang/NegativeArraySizeException");
 
        utf_java_lang_NullPointerException =
-               utf_new_char(string_java_lang_NullPointerException);
+               utf_new_char("java/lang/NullPointerException");
+
+       utf_java_lang_RuntimeException =
+               utf_new_char("java/lang/RuntimeException");
 
+       utf_java_lang_StringIndexOutOfBoundsException =
+               utf_new_char("java/lang/StringIndexOutOfBoundsException");
+
+       utf_java_lang_reflect_InvocationTargetException =
+               utf_new_char("java/lang/reflect/InvocationTargetException");
+
+       utf_java_security_PrivilegedActionException =
+               utf_new_char("java/security/PrivilegedActionException");
+#if defined(ENABLE_JAVASE)
        utf_java_lang_Void             = utf_new_char("java/lang/Void");
+#endif
+
        utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
        utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
        utf_java_lang_Character        = utf_new_char("java/lang/Character");
@@ -239,6 +387,7 @@ bool utf8_init(void)
        utf_java_lang_Float            = utf_new_char("java/lang/Float");
        utf_java_lang_Double           = utf_new_char("java/lang/Double");
 
+#if defined(ENABLE_JAVASE)
        utf_java_lang_StackTraceElement =
                utf_new_char("java/lang/StackTraceElement");
 
@@ -247,7 +396,15 @@ bool utf8_init(void)
 
        utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
        utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
+
+# if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
+       utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
+       utf_java_lang_reflect_VMField       = utf_new_char("java/lang/reflect/VMField");
+       utf_java_lang_reflect_VMMethod      = utf_new_char("java/lang/reflect/VMMethod");
+# endif
+
        utf_java_util_Vector           = utf_new_char("java/util/Vector");
+#endif
 
        utf_InnerClasses               = utf_new_char("InnerClasses");
        utf_ConstantValue              = utf_new_char("ConstantValue");
@@ -256,22 +413,51 @@ bool utf8_init(void)
        utf_LineNumberTable            = utf_new_char("LineNumberTable");
        utf_SourceFile                 = utf_new_char("SourceFile");
 
+#if defined(ENABLE_JAVASE)
+       utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
+       utf_Signature                  = utf_new_char("Signature");
+       utf_StackMapTable              = utf_new_char("StackMapTable");
+
+# if defined(ENABLE_JVMTI)
+       utf_LocalVariableTable         = utf_new_char("LocalVariableTable");
+# endif
+
+# if defined(ENABLE_ANNOTATIONS)
+       utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
+       utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
+       utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
+       utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
+       utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
+# endif
+#endif
+
        utf_init                           = utf_new_char("<init>");
        utf_clinit                         = utf_new_char("<clinit>");
        utf_clone                      = utf_new_char("clone");
        utf_finalize                   = utf_new_char("finalize");
+       utf_invoke                     = utf_new_char("invoke");
+       utf_main                       = utf_new_char("main");
        utf_run                        = utf_new_char("run");
 
        utf_add                        = utf_new_char("add");
+       utf_dispatch                   = utf_new_char("dispatch");
        utf_remove                     = utf_new_char("remove");
+       utf_addThread                  = utf_new_char("addThread");
+       utf_removeThread               = utf_new_char("removeThread");
        utf_put                        = utf_new_char("put");
        utf_get                        = utf_new_char("get");
+       utf_uncaughtException          = utf_new_char("uncaughtException");
        utf_value                      = utf_new_char("value");
 
-       utf_printStackTrace            = utf_new_char("printStackTrace");
        utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
-       utf_loadClass                  = utf_new_char("loadClass");
+       utf_findNative                 = utf_new_char("findNative");
        utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
+       utf_initCause                  = utf_new_char("initCause");
+       utf_loadClass                  = utf_new_char("loadClass");
+       utf_loadClassInternal          = utf_new_char("loadClassInternal");
+       utf_printStackTrace            = utf_new_char("printStackTrace");
+
+       utf_division_by_zero           = utf_new_char("/ by zero");
 
        utf_Z                          = utf_new_char("Z");
        utf_B                          = utf_new_char("B");
@@ -297,6 +483,11 @@ bool utf8_init(void)
        utf_void__java_lang_ClassLoader =
                utf_new_char("()Ljava/lang/ClassLoader;");
 
+       utf_java_lang_ClassLoader_java_lang_String__J =
+               utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
+
+       utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
+
        utf_java_lang_Object__java_lang_Object =
                utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 
@@ -305,15 +496,22 @@ bool utf8_init(void)
        utf_java_lang_String__java_lang_Class =
                utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 
-       utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
+       utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 
-       utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
+       utf_java_lang_Thread_java_lang_Throwable__V =
+               utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 
-       array_packagename              = utf_new_char("\t<the array package>");
+       utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
+               utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
 
-       /* everything's ok */
+       utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 
-       return true;
+       utf_java_lang_Throwable__java_lang_Throwable =
+               utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
+
+       utf_null                       = utf_new_char("null");
+       utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
+       array_packagename              = utf_new_char("\t<the array package>");
 }
 
 
@@ -501,9 +699,7 @@ utf *utf_new(const char *text, u2 length)
        utf *u;                             /* hashtable element                  */
        u2 i;
 
-#if defined(ENABLE_THREADS)
-       builtin_monitorenter(hashtable_utf->header);
-#endif
+       Mutex_lock(hashtable_utf->mutex);
 
 #if defined(ENABLE_STATISTICS)
        if (opt_stat)
@@ -531,9 +727,7 @@ utf *utf_new(const char *text, u2 length)
 
                        /* symbol found in hashtable */
 
-#if defined(ENABLE_THREADS)
-                       builtin_monitorexit(hashtable_utf->header);
-#endif
+                       Mutex_unlock(hashtable_utf->mutex);
 
                        return u;
                }
@@ -542,13 +736,10 @@ utf *utf_new(const char *text, u2 length)
                u = u->hashlink; /* next element in external chain */
        }
 
-#if defined(ENABLE_STATISTICS)
-       if (opt_stat)
-               count_utf_len += sizeof(utf) + length + 1;
-#endif
-
        /* location in hashtable found, create new utf element */
+
        u = NEW(utf);
+
        u->blength  = length;               /* length in bytes of utfstring       */
        u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
        u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
@@ -556,6 +747,11 @@ utf *utf_new(const char *text, u2 length)
        memcpy(u->text, text, length);      /* copy utf-text                      */
        u->text[length] = '\0';
 
+#if defined(ENABLE_STATISTICS)
+       if (opt_stat)
+               count_utf_len += sizeof(utf) + length + 1;
+#endif
+
        hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
        hashtable_utf->entries++;           /* update number of entries           */
 
@@ -604,9 +800,7 @@ utf *utf_new(const char *text, u2 length)
                hashtable_utf = newhash;
        }
 
-#if defined(ENABLE_THREADS)
-       builtin_monitorexit(hashtable_utf->header);
-#endif
+       Mutex_unlock(hashtable_utf->mutex);
 
        return u;
 }
@@ -626,7 +820,7 @@ utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
        u4 left;                        /* unicode characters left                */
        u4 buflength;                   /* utf length in bytes of the u2 array    */
        utf *result;                    /* resulting utf-string                   */
-       int i;          
+       int i;
 
        /* determine utf length in bytes and allocate memory */
 
@@ -729,6 +923,9 @@ utf *utf_new_char_classname(const char *text)
    Read the next unicode character from the utf string and increment
    the utf-string pointer accordingly.
 
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
 *******************************************************************************/
 
 u2 utf_nextu2(char **utf_ptr)
@@ -795,10 +992,14 @@ u4 utf_bytes(utf *u)
        return u->blength;
 }
 
+
 /* utf_get_number_of_u2s_for_buffer ********************************************
 
    Determine number of UTF-16 u2s in the given UTF-8 buffer
 
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
    to an array of u2s (UTF-16) and want to know how many of them you will get.
    All other uses of this function are probably wrong.
@@ -841,6 +1042,9 @@ u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 
    Determine number of UTF-16 u2s in the utf string.
 
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
    CAUTION: Use this function *only* when you want to convert a utf string
    to an array of u2s and want to know how many of them you will get.
    All other uses of this function are probably wrong.
@@ -863,7 +1067,7 @@ u4 utf_get_number_of_u2s(utf *u)
 
        /* XXX this is probably not checked by most callers! Review this after */
        /* the invalid uses of this function have been eliminated */
-       if (!u) {
+       if (u == NULL) {
                exceptions_throw_nullpointerexception();
                return 0;
        }
@@ -877,15 +1081,319 @@ u4 utf_get_number_of_u2s(utf *u)
                utf_nextu2(&utf_ptr);
        }
 
-       if (utf_ptr != endpos)
+       if (utf_ptr != endpos) {
                /* string ended abruptly */
-               throw_cacao_exception_exit(string_java_lang_InternalError,
-                                                                  "Illegal utf8 string");
+               exceptions_throw_internalerror("Illegal utf8 string");
+               return 0;
+       }
 
        return len;
 }
 
 
+/* utf8_safe_number_of_u2s *****************************************************
+
+   Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
+   (For invalid UTF-8 the U+fffd replacement character will be counted.)
+
+   This function is safe even for invalid UTF-8 strings.
+
+   IN:
+      text..........zero-terminated(!) UTF-8 string (may be invalid)
+                       must NOT be NULL
+         nbytes........strlen(text). (This is needed to completely emulate
+                       the RI).
+
+   OUT:
+      the number of u2s needed to hold this string in UTF-16 encoding.
+         There is _no_ terminating zero included in this count.
+
+*******************************************************************************/
+
+s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
+       register const unsigned char *t;
+       register s4 byte;
+       register s4 len;
+       register const unsigned char *tlimit;
+       s4 byte1;
+       s4 byte2;
+       s4 byte3;
+       s4 value;
+       s4 skip;
+
+       assert(text);
+       assert(nbytes >= 0);
+
+       len = 0;
+       t = (const unsigned char *) text;
+       tlimit = t + nbytes;
+
+       /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
+
+       while (1) {
+               byte = *t++;
+
+               if (byte & 0x80) {
+                       /* highest bit set, non-ASCII character */
+
+                       if ((byte & 0xe0) == 0xc0) {
+                               /* 2-byte: should be 110..... 10...... ? */
+
+                               if ((*t++ & 0xc0) == 0x80)
+                                       ; /* valid 2-byte */
+                               else
+                                       t--; /* invalid */
+                       }
+                       else if ((byte & 0xf0) == 0xe0) {
+                               /* 3-byte: should be 1110.... 10...... 10...... */
+                               /*                            ^t                */
+
+                               if (t + 2 > tlimit)
+                                       return len + 1; /* invalid, stop here */
+
+                               if ((*t++ & 0xc0) == 0x80) {
+                                       if ((*t++ & 0xc0) == 0x80)
+                                               ; /* valid 3-byte */
+                                       else
+                                               t--; /* invalid */
+                               }
+                               else
+                                       t--; /* invalid */
+                       }
+                       else if ((byte & 0xf8) == 0xf0) {
+                               /* 4-byte: should be 11110... 10...... 10...... 10...... */
+                               /*                            ^t                         */
+
+                               if (t + 3 > tlimit)
+                                       return len + 1; /* invalid, stop here */
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               if (((byte3 = *t++) & 0xc0) == 0x80) {
+                                                       /* valid 4-byte UTF-8? */
+                                                       value = ((byte  & 0x07) << 18)
+                                                                 | ((byte1 & 0x3f) << 12)
+                                                                 | ((byte2 & 0x3f) <<  6)
+                                                                 | ((byte3 & 0x3f)      );
+
+                                                       if (value > 0x10FFFF)
+                                                               ; /* invalid */
+                                                       else if (value > 0xFFFF)
+                                                               len += 1; /* we need surrogates */
+                                                       else
+                                                               ; /* 16bit suffice */
+                                               }
+                                               else
+                                                       t--; /* invalid */
+                                       }
+                                       else
+                                               t--; /* invalid */
+                               }
+                               else
+                                       t--; /* invalid */
+                       }
+                       else if ((byte & 0xfc) == 0xf8) {
+                               /* invalid 5-byte */
+                               if (t + 4 > tlimit)
+                                       return len + 1; /* invalid, stop here */
+
+                               skip = 4;
+                               for (; skip && ((*t & 0xc0) == 0x80); --skip)
+                                       t++;
+                       }
+                       else if ((byte & 0xfe) == 0xfc) {
+                               /* invalid 6-byte */
+                               if (t + 5 > tlimit)
+                                       return len + 1; /* invalid, stop here */
+
+                               skip = 5;
+                               for (; skip && ((*t & 0xc0) == 0x80); --skip)
+                                       t++;
+                       }
+                       else
+                               ; /* invalid */
+               }
+               else {
+                       /* NUL */
+
+                       if (byte == 0)
+                               break;
+
+                       /* ASCII character, common case */
+               }
+
+               len++;
+       }
+
+       return len;
+}
+
+
+/* utf8_safe_convert_to_u2s ****************************************************
+
+   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
+   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
+   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
+
+   This function is safe even for invalid UTF-8 strings.
+
+   IN:
+      text..........zero-terminated(!) UTF-8 string (may be invalid)
+                       must NOT be NULL
+         nbytes........strlen(text). (This is needed to completely emulate
+                                       the RI).
+         buffer........a preallocated array of u2s to receive the decoded
+                       string. Use utf8_safe_number_of_u2s to get the
+                                       required number of u2s for allocating this.
+
+*******************************************************************************/
+
+#define UNICODE_REPLACEMENT  0xfffd
+
+void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
+       register const unsigned char *t;
+       register s4 byte;
+       register const unsigned char *tlimit;
+       s4 byte1;
+       s4 byte2;
+       s4 byte3;
+       s4 value;
+       s4 skip;
+
+       assert(text);
+       assert(nbytes >= 0);
+
+       t = (const unsigned char *) text;
+       tlimit = t + nbytes;
+
+       /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
+
+       while (1) {
+               byte = *t++;
+
+               if (byte & 0x80) {
+                       /* highest bit set, non-ASCII character */
+
+                       if ((byte & 0xe0) == 0xc0) {
+                               /* 2-byte: should be 110..... 10...... */
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       /* valid 2-byte UTF-8 */
+                                       *buffer++ = ((byte  & 0x1f) << 6)
+                                                         | ((byte1 & 0x3f)     );
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--;
+                               }
+                       }
+                       else if ((byte & 0xf0) == 0xe0) {
+                               /* 3-byte: should be 1110.... 10...... 10...... */
+
+                               if (t + 2 > tlimit) {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       return;
+                               }
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               /* valid 3-byte UTF-8 */
+                                               *buffer++ = ((byte  & 0x0f) << 12)
+                                                                 | ((byte1 & 0x3f) <<  6)
+                                                                 | ((byte2 & 0x3f)      );
+                                       }
+                                       else {
+                                               *buffer++ = UNICODE_REPLACEMENT;
+                                               t--;
+                                       }
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--;
+                               }
+                       }
+                       else if ((byte & 0xf8) == 0xf0) {
+                               /* 4-byte: should be 11110... 10...... 10...... 10...... */
+
+                               if (t + 3 > tlimit) {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       return;
+                               }
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               if (((byte3 = *t++) & 0xc0) == 0x80) {
+                                                       /* valid 4-byte UTF-8? */
+                                                       value = ((byte  & 0x07) << 18)
+                                                                 | ((byte1 & 0x3f) << 12)
+                                                                 | ((byte2 & 0x3f) <<  6)
+                                                                 | ((byte3 & 0x3f)      );
+
+                                                       if (value > 0x10FFFF) {
+                                                               *buffer++ = UNICODE_REPLACEMENT;
+                                                       }
+                                                       else if (value > 0xFFFF) {
+                                                               /* we need surrogates */
+                                                               *buffer++ = 0xd800 | ((value >> 10) - 0x40);
+                                                               *buffer++ = 0xdc00 | (value & 0x03ff);
+                                                       }
+                                                       else
+                                                               *buffer++ = value; /* 16bit suffice */
+                                               }
+                                               else {
+                                                       *buffer++ = UNICODE_REPLACEMENT;
+                                                       t--;
+                                               }
+                                       }
+                                       else {
+                                               *buffer++ = UNICODE_REPLACEMENT;
+                                               t--;
+                                       }
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--;
+                               }
+                       }
+                       else if ((byte & 0xfc) == 0xf8) {
+                               if (t + 4 > tlimit) {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       return;
+                               }
+
+                               skip = 4;
+                               for (; skip && ((*t & 0xc0) == 0x80); --skip)
+                                       t++;
+                               *buffer++ = UNICODE_REPLACEMENT;
+                       }
+                       else if ((byte & 0xfe) == 0xfc) {
+                               if (t + 5 > tlimit) {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       return;
+                               }
+
+                               skip = 5;
+                               for (; skip && ((*t & 0xc0) == 0x80); --skip)
+                                       t++;
+                               *buffer++ = UNICODE_REPLACEMENT;
+                       }
+                       else
+                               *buffer++ = UNICODE_REPLACEMENT;
+               }
+               else {
+                       /* NUL */
+
+                       if (byte == 0)
+                               break;
+
+                       /* ASCII character, common case */
+
+                       *buffer++ = byte;
+               }
+       }
+}
+
+
 /* u2_utflength ****************************************************************
 
    Returns the utf length in bytes of a u2 array.