/* src/vm/utf8.c - utf8 string functions

   Copyright (C) 1996-2005, 2006, 2007, 2008
   CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO

   This file is part of CACAO.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2, or (at
   your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

*/


#include "config.h"

#include <string.h>
#include <assert.h>

#include "vm/types.h"

#include "mm/memory.hpp"

#include "threads/mutex.hpp"

#include "toolbox/hashtable.h"

#include "vm/exceptions.hpp"
#include "vm/options.h"

#if defined(ENABLE_STATISTICS)
# include "vm/statistics.h"
#endif

#include "vm/utf8.h"


/* global variables ***********************************************************/

/* hashsize must be power of 2 */

#define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */

hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */


/* utf-symbols for pointer comparison of frequently used strings **************/

utf *utf_java_lang_Object;

utf *utf_java_lang_Class;
utf *utf_java_lang_ClassLoader;
utf *utf_java_lang_Cloneable;
utf *utf_java_lang_SecurityManager;
utf *utf_java_lang_String;
utf *utf_java_lang_ThreadGroup;
utf *utf_java_lang_ref_SoftReference;
utf *utf_java_lang_ref_WeakReference;
utf *utf_java_lang_ref_PhantomReference;
utf *utf_java_io_Serializable;

utf *utf_java_lang_Throwable;
utf *utf_java_lang_Error;

utf *utf_java_lang_AbstractMethodError;
utf *utf_java_lang_ClassCircularityError;
utf *utf_java_lang_ClassFormatError;
utf *utf_java_lang_ExceptionInInitializerError;
utf *utf_java_lang_IncompatibleClassChangeError;
utf *utf_java_lang_InstantiationError;
utf *utf_java_lang_InternalError;
utf *utf_java_lang_LinkageError;
utf *utf_java_lang_NoClassDefFoundError;
utf *utf_java_lang_NoSuchFieldError;
utf *utf_java_lang_NoSuchMethodError;
utf *utf_java_lang_OutOfMemoryError;
utf *utf_java_lang_UnsatisfiedLinkError;
utf *utf_java_lang_UnsupportedClassVersionError;
utf *utf_java_lang_VerifyError;
utf *utf_java_lang_VirtualMachineError;

utf *utf_java_lang_Exception;

utf *utf_java_lang_ArithmeticException;
utf *utf_java_lang_ArrayIndexOutOfBoundsException;
utf *utf_java_lang_ArrayStoreException;
utf *utf_java_lang_ClassCastException;
utf *utf_java_lang_ClassNotFoundException;
utf *utf_java_lang_CloneNotSupportedException;
utf *utf_java_lang_IllegalAccessException;
utf *utf_java_lang_IllegalArgumentException;
utf *utf_java_lang_IllegalMonitorStateException;
utf *utf_java_lang_InstantiationException;
utf *utf_java_lang_InterruptedException;
utf *utf_java_lang_NegativeArraySizeException;
utf *utf_java_lang_NullPointerException;
utf *utf_java_lang_RuntimeException;
utf *utf_java_lang_StringIndexOutOfBoundsException;

utf *utf_java_lang_reflect_InvocationTargetException;

utf *utf_java_security_PrivilegedActionException;

#if defined(ENABLE_JAVASE)
utf* utf_java_lang_Void;
#endif

utf* utf_java_lang_Boolean;
utf* utf_java_lang_Byte;
utf* utf_java_lang_Character;
utf* utf_java_lang_Short;
utf* utf_java_lang_Integer;
utf* utf_java_lang_Long;
utf* utf_java_lang_Float;
utf* utf_java_lang_Double;

#if defined(ENABLE_JAVASE)
utf *utf_java_lang_StackTraceElement;
utf *utf_java_lang_reflect_Constructor;
utf *utf_java_lang_reflect_Field;
utf *utf_java_lang_reflect_Method;

# if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
utf *utf_java_lang_reflect_VMConstructor;
utf *utf_java_lang_reflect_VMField;
utf *utf_java_lang_reflect_VMMethod;
# endif

utf *utf_java_util_Vector;
#endif

utf *utf_InnerClasses;                  /* InnerClasses                       */
utf *utf_ConstantValue;                 /* ConstantValue                      */
utf *utf_Code;                          /* Code                               */
utf *utf_Exceptions;                    /* Exceptions                         */
utf *utf_LineNumberTable;               /* LineNumberTable                    */
utf *utf_SourceFile;                    /* SourceFile                         */

#if defined(ENABLE_JAVASE)
utf *utf_EnclosingMethod;
utf *utf_Signature;
utf *utf_StackMapTable;

# if defined(ENABLE_JVMTI)
utf *utf_LocalVariableTable;
# endif

# if defined(ENABLE_ANNOTATIONS)
utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
# endif
#endif

utf *utf_init;                          /* <init>                             */
utf *utf_clinit;                        /* <clinit>                           */
utf *utf_clone;                         /* clone                              */
utf *utf_finalize;                      /* finalize                           */
utf *utf_invoke;
utf *utf_main;
utf *utf_run;                           /* run                                */

utf *utf_add;
utf *utf_dispatch;
utf *utf_remove;
utf *utf_addThread;
utf *utf_removeThread;
utf *utf_put;
utf *utf_get;
utf *utf_uncaughtException;
utf *utf_value;

utf *utf_fillInStackTrace;
utf *utf_findNative;
utf *utf_getSystemClassLoader;
utf *utf_initCause;
utf *utf_loadClass;
utf *utf_loadClassInternal;
utf *utf_printStackTrace;

utf *utf_division_by_zero;

utf *utf_Z;                             /* Z                                  */
utf *utf_B;                             /* B                                  */
utf *utf_C;                             /* C                                  */
utf *utf_S;                             /* S                                  */
utf *utf_I;                             /* I                                  */
utf *utf_J;                             /* J                                  */
utf *utf_F;                             /* F                                  */
utf *utf_D;                             /* D                                  */

utf *utf_void__void;                    /* ()V                                */
utf *utf_boolean__void;                 /* (Z)V                               */
utf *utf_byte__void;                    /* (B)V                               */
utf *utf_char__void;                    /* (C)V                               */
utf *utf_short__void;                   /* (S)V                               */
utf *utf_int__void;                     /* (I)V                               */
utf *utf_long__void;                    /* (J)V                               */
utf *utf_float__void;                   /* (F)V                               */
utf *utf_double__void;                  /* (D)V                               */

utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
utf *utf_java_lang_ClassLoader_java_lang_String__J;
utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
utf *utf_java_lang_Object__java_lang_Object;
utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
utf *utf_java_lang_String__java_lang_Class;
utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
utf *utf_java_lang_Thread_java_lang_Throwable__V;
utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
utf *utf_java_lang_Throwable__java_lang_Throwable;

utf *utf_not_named_yet;                 /* special name for unnamed classes   */
utf *utf_null;
utf *array_packagename;


/* utf_init ********************************************************************

   Initializes the utf8 subsystem.

*******************************************************************************/

void utf8_init(void)
{
	TRACESUBSYSTEMINITIALIZATION("utf8_init");

	/* create utf8 hashtable */

	hashtable_utf = NEW(hashtable);

	hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);

#if defined(ENABLE_STATISTICS)
	if (opt_stat)
		count_utf_len += sizeof(utf*) * hashtable_utf->size;
#endif

	/* create utf-symbols for pointer comparison of frequently used strings */

	utf_java_lang_Object           = utf_new_char("java/lang/Object");

	utf_java_lang_Class            = utf_new_char("java/lang/Class");
	utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
	utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
	utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
	utf_java_lang_String           = utf_new_char("java/lang/String");
	utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");

	utf_java_lang_ref_SoftReference =
		utf_new_char("java/lang/ref/SoftReference");

	utf_java_lang_ref_WeakReference =
		utf_new_char("java/lang/ref/WeakReference");

	utf_java_lang_ref_PhantomReference =
		utf_new_char("java/lang/ref/PhantomReference");

	utf_java_io_Serializable       = utf_new_char("java/io/Serializable");

	utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
	utf_java_lang_Error            = utf_new_char("java/lang/Error");

	utf_java_lang_ClassCircularityError =
		utf_new_char("java/lang/ClassCircularityError");

	utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");

	utf_java_lang_ExceptionInInitializerError =
		utf_new_char("java/lang/ExceptionInInitializerError");

	utf_java_lang_IncompatibleClassChangeError =
		utf_new_char("java/lang/IncompatibleClassChangeError");

	utf_java_lang_InstantiationError =
		utf_new_char("java/lang/InstantiationError");

	utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
	utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");

	utf_java_lang_NoClassDefFoundError =
		utf_new_char("java/lang/NoClassDefFoundError");

	utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");

	utf_java_lang_UnsatisfiedLinkError =
		utf_new_char("java/lang/UnsatisfiedLinkError");

	utf_java_lang_UnsupportedClassVersionError =
		utf_new_char("java/lang/UnsupportedClassVersionError");

	utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");

	utf_java_lang_VirtualMachineError =
		utf_new_char("java/lang/VirtualMachineError");

#if defined(ENABLE_JAVASE)
	utf_java_lang_AbstractMethodError =
		utf_new_char("java/lang/AbstractMethodError");

	utf_java_lang_NoSuchFieldError =
		utf_new_char("java/lang/NoSuchFieldError");

	utf_java_lang_NoSuchMethodError =
		utf_new_char("java/lang/NoSuchMethodError");
#endif

	utf_java_lang_Exception        = utf_new_char("java/lang/Exception");

	utf_java_lang_ArithmeticException =
		utf_new_char("java/lang/ArithmeticException");

	utf_java_lang_ArrayIndexOutOfBoundsException =
		utf_new_char("java/lang/ArrayIndexOutOfBoundsException");

	utf_java_lang_ArrayStoreException =
		utf_new_char("java/lang/ArrayStoreException");

	utf_java_lang_ClassCastException =
		utf_new_char("java/lang/ClassCastException");

	utf_java_lang_ClassNotFoundException =
		utf_new_char("java/lang/ClassNotFoundException");

	utf_java_lang_CloneNotSupportedException =
		utf_new_char("java/lang/CloneNotSupportedException");

	utf_java_lang_IllegalAccessException =
		utf_new_char("java/lang/IllegalAccessException");

	utf_java_lang_IllegalArgumentException =
		utf_new_char("java/lang/IllegalArgumentException");

	utf_java_lang_IllegalMonitorStateException =
		utf_new_char("java/lang/IllegalMonitorStateException");

	utf_java_lang_InstantiationException =
		utf_new_char("java/lang/InstantiationException");

	utf_java_lang_InterruptedException =
		utf_new_char("java/lang/InterruptedException");

	utf_java_lang_NegativeArraySizeException =
		utf_new_char("java/lang/NegativeArraySizeException");

	utf_java_lang_NullPointerException =
		utf_new_char("java/lang/NullPointerException");

	utf_java_lang_RuntimeException =
		utf_new_char("java/lang/RuntimeException");

	utf_java_lang_StringIndexOutOfBoundsException =
		utf_new_char("java/lang/StringIndexOutOfBoundsException");

	utf_java_lang_reflect_InvocationTargetException =
		utf_new_char("java/lang/reflect/InvocationTargetException");

	utf_java_security_PrivilegedActionException =
		utf_new_char("java/security/PrivilegedActionException");
 
#if defined(ENABLE_JAVASE)
	utf_java_lang_Void             = utf_new_char("java/lang/Void");
#endif

	utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
	utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
	utf_java_lang_Character        = utf_new_char("java/lang/Character");
	utf_java_lang_Short            = utf_new_char("java/lang/Short");
	utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
	utf_java_lang_Long             = utf_new_char("java/lang/Long");
	utf_java_lang_Float            = utf_new_char("java/lang/Float");
	utf_java_lang_Double           = utf_new_char("java/lang/Double");

#if defined(ENABLE_JAVASE)
	utf_java_lang_StackTraceElement =
		utf_new_char("java/lang/StackTraceElement");

	utf_java_lang_reflect_Constructor =
		utf_new_char("java/lang/reflect/Constructor");

	utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
	utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");

# if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
	utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
	utf_java_lang_reflect_VMField       = utf_new_char("java/lang/reflect/VMField");
	utf_java_lang_reflect_VMMethod      = utf_new_char("java/lang/reflect/VMMethod");
# endif

	utf_java_util_Vector           = utf_new_char("java/util/Vector");
#endif

	utf_InnerClasses               = utf_new_char("InnerClasses");
	utf_ConstantValue              = utf_new_char("ConstantValue");
	utf_Code                       = utf_new_char("Code");
	utf_Exceptions	               = utf_new_char("Exceptions");
	utf_LineNumberTable            = utf_new_char("LineNumberTable");
	utf_SourceFile                 = utf_new_char("SourceFile");

#if defined(ENABLE_JAVASE)
	utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
	utf_Signature                  = utf_new_char("Signature");
	utf_StackMapTable              = utf_new_char("StackMapTable");

# if defined(ENABLE_JVMTI)
	utf_LocalVariableTable         = utf_new_char("LocalVariableTable");
# endif

# if defined(ENABLE_ANNOTATIONS)
	utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
	utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
	utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
	utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
	utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
# endif
#endif

	utf_init	                   = utf_new_char("<init>");
	utf_clinit	                   = utf_new_char("<clinit>");
	utf_clone                      = utf_new_char("clone");
	utf_finalize	               = utf_new_char("finalize");
	utf_invoke                     = utf_new_char("invoke");
	utf_main                       = utf_new_char("main");
	utf_run                        = utf_new_char("run");

	utf_add                        = utf_new_char("add");
	utf_dispatch                   = utf_new_char("dispatch");
	utf_remove                     = utf_new_char("remove");
	utf_addThread                  = utf_new_char("addThread");
	utf_removeThread               = utf_new_char("removeThread");
	utf_put                        = utf_new_char("put");
	utf_get                        = utf_new_char("get");
	utf_uncaughtException          = utf_new_char("uncaughtException");
	utf_value                      = utf_new_char("value");

	utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
	utf_findNative                 = utf_new_char("findNative");
	utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
	utf_initCause                  = utf_new_char("initCause");
	utf_loadClass                  = utf_new_char("loadClass");
	utf_loadClassInternal          = utf_new_char("loadClassInternal");
	utf_printStackTrace            = utf_new_char("printStackTrace");

	utf_division_by_zero           = utf_new_char("/ by zero");

	utf_Z                          = utf_new_char("Z");
	utf_B                          = utf_new_char("B");
	utf_C                          = utf_new_char("C");
	utf_S                          = utf_new_char("S");
	utf_I                          = utf_new_char("I");
	utf_J                          = utf_new_char("J");
	utf_F                          = utf_new_char("F");
	utf_D                          = utf_new_char("D");

	utf_void__void                 = utf_new_char("()V");
	utf_boolean__void              = utf_new_char("(Z)V");
	utf_byte__void                 = utf_new_char("(B)V");
	utf_char__void                 = utf_new_char("(C)V");
	utf_short__void                = utf_new_char("(S)V");
	utf_int__void                  = utf_new_char("(I)V");
	utf_long__void                 = utf_new_char("(J)V");
	utf_float__void                = utf_new_char("(F)V");
	utf_double__void               = utf_new_char("(D)V");
	utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
	utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");

	utf_void__java_lang_ClassLoader =
		utf_new_char("()Ljava/lang/ClassLoader;");

	utf_java_lang_ClassLoader_java_lang_String__J =
		utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");

	utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");

	utf_java_lang_Object__java_lang_Object =
		utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");

	utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");

	utf_java_lang_String__java_lang_Class =
		utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");

	utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");

	utf_java_lang_Thread_java_lang_Throwable__V =
		utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");

	utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
		utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");

	utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");

	utf_java_lang_Throwable__java_lang_Throwable =
		utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");

	utf_null                       = utf_new_char("null");
	utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
	array_packagename              = utf_new_char("\t<the array package>");
}


/* utf_hashkey *****************************************************************

   The hashkey is computed from the utf-text by using up to 8
   characters.  For utf-symbols longer than 15 characters 3 characters
   are taken from the beginning and the end, 2 characters are taken
   from the middle.

*******************************************************************************/

#define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
#define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

u4 utf_hashkey(const char *text, u4 length)
{
	const char *start_pos = text;       /* pointer to utf text                */
	u4 a;

	switch (length) {
	case 0: /* empty string */
		return 0;

	case 1: return fbs(0);
	case 2: return fbs(0) ^ nbs(3);
	case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
	case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
	case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
	case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
	case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
	case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);

	case 9:
		a = fbs(0);
		a ^= nbs(1);
		a ^= nbs(2);
		text++;
		return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);

	case 10:
		a = fbs(0);
		text++;
		a ^= nbs(2);
		a ^= nbs(3);
		a ^= nbs(4);
		text++;
		return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);

	case 11:
		a = fbs(0);
		text++;
		a ^= nbs(2);
		a ^= nbs(3);
		a ^= nbs(4);
		text++;
		return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);

	case 12:
		a = fbs(0);
		text += 2;
		a ^= nbs(2);
		a ^= nbs(3);
		text++;
		a ^= nbs(5);
		a ^= nbs(6);
		a ^= nbs(7);
		text++;
		return a ^ nbs(9) ^ nbs(10);

	case 13:
		a = fbs(0);
		a ^= nbs(1);
		text++;
		a ^= nbs(3);
		a ^= nbs(4);
		text += 2;	
		a ^= nbs(7);
		a ^= nbs(8);
		text += 2;
		return a ^ nbs(9) ^ nbs(10);

	case 14:
		a = fbs(0);
		text += 2;	
		a ^= nbs(3);
		a ^= nbs(4);
		text += 2;	
		a ^= nbs(7);
		a ^= nbs(8);
		text += 2;
		return a ^ nbs(9) ^ nbs(10) ^ nbs(11);

	case 15:
		a = fbs(0);
		text += 2;	
		a ^= nbs(3);
		a ^= nbs(4);
		text += 2;	
		a ^= nbs(7);
		a ^= nbs(8);
		text += 2;
		return a ^ nbs(9) ^ nbs(10) ^ nbs(11);

	default:  /* 3 characters from beginning */
		a = fbs(0);
		text += 2;
		a ^= nbs(3);
		a ^= nbs(4);

		/* 2 characters from middle */
		text = start_pos + (length / 2);
		a ^= fbs(5);
		text += 2;
		a ^= nbs(6);	

		/* 3 characters from end */
		text = start_pos + length - 4;

		a ^= fbs(7);
		text++;

		return a ^ nbs(10) ^ nbs(11);
    }
}

/* utf_full_hashkey ************************************************************

   This function computes a hash value using all bytes in the string.

   The algorithm is the "One-at-a-time" algorithm as published
   by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.

*******************************************************************************/

u4 utf_full_hashkey(const char *text, u4 length)
{
	register const unsigned char *p = (const unsigned char *) text;
	register u4 hash;
	register u4 i;

	hash = 0;
	for (i=length; i--;)
	{
	    hash += *p++;
	    hash += (hash << 10);
	    hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);

	return hash;
}

/* unicode_hashkey *************************************************************

   Compute the hashkey of a unicode string.

*******************************************************************************/

u4 unicode_hashkey(u2 *text, u2 len)
{
	return utf_hashkey((char *) text, len);
}


/* utf_new *********************************************************************

   Creates a new utf-symbol, the text of the symbol is passed as a
   u1-array. The function searches the utf-hashtable for a utf-symbol
   with this text. On success the element returned, otherwise a new
   hashtable element is created.

   If the number of entries in the hashtable exceeds twice the size of
   the hashtable slots a reorganization of the hashtable is done and
   the utf symbols are copied to a new hashtable with doubled size.

*******************************************************************************/

utf *utf_new(const char *text, u2 length)
{
	u4 key;                             /* hashkey computed from utf-text     */
	u4 slot;                            /* slot in hashtable                  */
	utf *u;                             /* hashtable element                  */
	u2 i;

	Mutex_lock(hashtable_utf->mutex);

#if defined(ENABLE_STATISTICS)
	if (opt_stat)
		count_utf_new++;
#endif

	key  = utf_hashkey(text, length);
	slot = key & (hashtable_utf->size - 1);
	u    = hashtable_utf->ptr[slot];

	/* search external hash chain for utf-symbol */

	while (u) {
		if (u->blength == length) {
			/* compare text of hashtable elements */

			for (i = 0; i < length; i++)
				if (text[i] != u->text[i])
					goto nomatch;
			
#if defined(ENABLE_STATISTICS)
			if (opt_stat)
				count_utf_new_found++;
#endif

			/* symbol found in hashtable */

			Mutex_unlock(hashtable_utf->mutex);

			return u;
		}

	nomatch:
		u = u->hashlink; /* next element in external chain */
	}

	/* location in hashtable found, create new utf element */

	u = NEW(utf);

	u->blength  = length;               /* length in bytes of utfstring       */
	u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
	u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */

	memcpy(u->text, text, length);      /* copy utf-text                      */
	u->text[length] = '\0';

#if defined(ENABLE_STATISTICS)
	if (opt_stat)
		count_utf_len += sizeof(utf) + length + 1;
#endif

	hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
	hashtable_utf->entries++;           /* update number of entries           */

	if (hashtable_utf->entries > (hashtable_utf->size * 2)) {

        /* reorganization of hashtable, average length of the external
           chains is approx. 2 */

		hashtable *newhash;                              /* the new hashtable */
		u4         i;
		utf       *u;
		utf       *nextu;
		u4         slot;

		/* create new hashtable, double the size */

		newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);

#if defined(ENABLE_STATISTICS)
		if (opt_stat)
			count_utf_len += sizeof(utf*) * hashtable_utf->size;
#endif

		/* transfer elements to new hashtable */

		for (i = 0; i < hashtable_utf->size; i++) {
			u = hashtable_utf->ptr[i];

			while (u) {
				nextu = u->hashlink;
				slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
						
				u->hashlink = (utf *) newhash->ptr[slot];
				newhash->ptr[slot] = u;

				/* follow link in external hash chain */

				u = nextu;
			}
		}
	
		/* dispose old table */

		hashtable_free(hashtable_utf);

		hashtable_utf = newhash;
	}

	Mutex_unlock(hashtable_utf->mutex);

	return u;
}


/* utf_new_u2 ******************************************************************

   Make utf symbol from u2 array, if isclassname is true '.' is
   replaced by '/'.

*******************************************************************************/

utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
{
	char *buffer;                   /* memory buffer for  unicode characters  */
	char *pos;                      /* pointer to current position in buffer  */
	u4 left;                        /* unicode characters left                */
	u4 buflength;                   /* utf length in bytes of the u2 array    */
	utf *result;                    /* resulting utf-string                   */
	int i;

	/* determine utf length in bytes and allocate memory */

	buflength = u2_utflength(unicode_pos, unicode_length); 
	buffer    = MNEW(char, buflength);
 
	left = buflength;
	pos  = buffer;

	for (i = 0; i++ < unicode_length; unicode_pos++) {
		/* next unicode character */
		u2 c = *unicode_pos;
		
		if ((c != 0) && (c < 0x80)) {
			/* 1 character */	
			left--;
	    	if ((int) left < 0) break;
			/* convert classname */
			if (isclassname && c == '.')
				*pos++ = '/';
			else
				*pos++ = (char) c;

		} else if (c < 0x800) { 	    
			/* 2 characters */				
	    	unsigned char high = c >> 6;
	    	unsigned char low  = c & 0x3F;
			left = left - 2;
	    	if ((int) left < 0) break;
	    	*pos++ = high | 0xC0; 
	    	*pos++ = low  | 0x80;	  

		} else {	 
	    	/* 3 characters */				
	    	char low  = c & 0x3f;
	    	char mid  = (c >> 6) & 0x3F;
	    	char high = c >> 12;
			left = left - 3;
	    	if ((int) left < 0) break;
	    	*pos++ = high | 0xE0; 
	    	*pos++ = mid  | 0x80;  
	    	*pos++ = low  | 0x80;   
		}
	}
	
	/* insert utf-string into symbol-table */
	result = utf_new(buffer,buflength);

	MFREE(buffer, char, buflength);

	return result;
}


/* utf_new_char ****************************************************************

   Creates a new utf symbol, the text for this symbol is passed as a
   c-string ( = char* ).

*******************************************************************************/

utf *utf_new_char(const char *text)
{
	return utf_new(text, strlen(text));
}


/* utf_new_char_classname ******************************************************

   Creates a new utf symbol, the text for this symbol is passed as a
   c-string ( = char* ) "." characters are going to be replaced by
   "/". Since the above function is used often, this is a separte
   function, instead of an if.

*******************************************************************************/

utf *utf_new_char_classname(const char *text)
{
	if (strchr(text, '.')) {
		char *txt = strdup(text);
		char *end = txt + strlen(txt);
		char *c;
		utf *tmpRes;

		for (c = txt; c < end; c++)
			if (*c == '.') *c = '/';

		tmpRes = utf_new(txt, strlen(txt));
		FREE(txt, 0);

		return tmpRes;

	} else
		return utf_new(text, strlen(text));
}


/* utf_nextu2 ******************************************************************

   Read the next unicode character from the utf string and increment
   the utf-string pointer accordingly.

   CAUTION: This function is unsafe for input that was not checked 
            by is_valid_utf!

*******************************************************************************/

u2 utf_nextu2(char **utf_ptr)
{
    /* uncompressed unicode character */
    u2 unicode_char = 0;
    /* current position in utf text */	
    unsigned char *utf = (unsigned char *) (*utf_ptr);
    /* bytes representing the unicode character */
    unsigned char ch1, ch2, ch3;
    /* number of bytes used to represent the unicode character */
    int len = 0;
	
    switch ((ch1 = utf[0]) >> 4) {
	default: /* 1 byte */
		(*utf_ptr)++;
		return (u2) ch1;
	case 0xC: 
	case 0xD: /* 2 bytes */
		if (((ch2 = utf[1]) & 0xC0) == 0x80) {
			unsigned char high = ch1 & 0x1F;
			unsigned char low  = ch2 & 0x3F;
			unicode_char = (high << 6) + low;
			len = 2;
		}
		break;

	case 0xE: /* 2 or 3 bytes */
		if (((ch2 = utf[1]) & 0xC0) == 0x80) {
			if (((ch3 = utf[2]) & 0xC0) == 0x80) {
				unsigned char low  = ch3 & 0x3f;
				unsigned char mid  = ch2 & 0x3f;
				unsigned char high = ch1 & 0x0f;
				unicode_char = (((high << 6) + mid) << 6) + low;
				len = 3;
			} else
				len = 2;					   
		}
		break;
    }

    /* update position in utf-text */
    *utf_ptr = (char *) (utf + len);

    return unicode_char;
}


/* utf_bytes *******************************************************************

   Determine number of bytes (aka. octets) in the utf string.

   IN:
      u............utf string

   OUT:
      The number of octets of this utf string.
	  There is _no_ terminating zero included in this count.

*******************************************************************************/

u4 utf_bytes(utf *u)
{
	return u->blength;
}


/* utf_get_number_of_u2s_for_buffer ********************************************

   Determine number of UTF-16 u2s in the given UTF-8 buffer

   CAUTION: This function is unsafe for input that was not checked 
            by is_valid_utf!

   CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
   to an array of u2s (UTF-16) and want to know how many of them you will get.
   All other uses of this function are probably wrong.

   IN:
      buffer........points to first char in buffer
	  blength.......number of _bytes_ in the buffer

   OUT:
      the number of u2s needed to hold this string in UTF-16 encoding.
	  There is _no_ terminating zero included in this count.

   NOTE: Unlike utf_get_number_of_u2s, this function never throws an
   exception.

*******************************************************************************/

u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
{
	const char *endpos;                 /* points behind utf string           */
	const char *utf_ptr;                /* current position in utf text       */
	u4 len = 0;                         /* number of unicode characters       */

	utf_ptr = buffer;
	endpos = utf_ptr + blength;

	while (utf_ptr < endpos) {
		len++;
		/* next unicode character */
		utf_nextu2((char **)&utf_ptr);
	}

	assert(utf_ptr == endpos);

	return len;
}


/* utf_get_number_of_u2s *******************************************************

   Determine number of UTF-16 u2s in the utf string.

   CAUTION: This function is unsafe for input that was not checked 
            by is_valid_utf!

   CAUTION: Use this function *only* when you want to convert a utf string
   to an array of u2s and want to know how many of them you will get.
   All other uses of this function are probably wrong.

   IN:
      u............utf string

   OUT:
      the number of u2s needed to hold this string in UTF-16 encoding.
	  There is _no_ terminating zero included in this count.
	  XXX 0 if a NullPointerException has been thrown (see below)

*******************************************************************************/

u4 utf_get_number_of_u2s(utf *u)
{
	char *endpos;                       /* points behind utf string           */
	char *utf_ptr;                      /* current position in utf text       */
	u4 len = 0;                         /* number of unicode characters       */

	/* XXX this is probably not checked by most callers! Review this after */
	/* the invalid uses of this function have been eliminated */
	if (u == NULL) {
		exceptions_throw_nullpointerexception();
		return 0;
	}

	endpos = UTF_END(u);
	utf_ptr = u->text;

	while (utf_ptr < endpos) {
		len++;
		/* next unicode character */
		utf_nextu2(&utf_ptr);
	}

	if (utf_ptr != endpos) {
		/* string ended abruptly */
		exceptions_throw_internalerror("Illegal utf8 string");
		return 0;
	}

	return len;
}


/* utf8_safe_number_of_u2s *****************************************************

   Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
   (For invalid UTF-8 the U+fffd replacement character will be counted.)

   This function is safe even for invalid UTF-8 strings.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
	                must NOT be NULL
	  nbytes........strlen(text). (This is needed to completely emulate
	                the RI).

   OUT:
      the number of u2s needed to hold this string in UTF-16 encoding.
	  There is _no_ terminating zero included in this count.

*******************************************************************************/

s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
	register const unsigned char *t;
	register s4 byte;
	register s4 len;
	register const unsigned char *tlimit;
	s4 byte1;
	s4 byte2;
	s4 byte3;
	s4 value;
	s4 skip;

	assert(text);
	assert(nbytes >= 0);

	len = 0;
	t = (const unsigned char *) text;
	tlimit = t + nbytes;

	/* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */

	while (1) {
		byte = *t++;

		if (byte & 0x80) {
			/* highest bit set, non-ASCII character */

			if ((byte & 0xe0) == 0xc0) {
				/* 2-byte: should be 110..... 10...... ? */

				if ((*t++ & 0xc0) == 0x80)
					; /* valid 2-byte */
				else
					t--; /* invalid */
			}
			else if ((byte & 0xf0) == 0xe0) {
				/* 3-byte: should be 1110.... 10...... 10...... */
				/*                            ^t                */

				if (t + 2 > tlimit)
					return len + 1; /* invalid, stop here */

				if ((*t++ & 0xc0) == 0x80) {
					if ((*t++ & 0xc0) == 0x80)
						; /* valid 3-byte */
					else
						t--; /* invalid */
				}
				else
					t--; /* invalid */
			}
			else if ((byte & 0xf8) == 0xf0) {
				/* 4-byte: should be 11110... 10...... 10...... 10...... */
				/*                            ^t                         */

				if (t + 3 > tlimit)
					return len + 1; /* invalid, stop here */

				if (((byte1 = *t++) & 0xc0) == 0x80) {
					if (((byte2 = *t++) & 0xc0) == 0x80) {
						if (((byte3 = *t++) & 0xc0) == 0x80) {
							/* valid 4-byte UTF-8? */
							value = ((byte  & 0x07) << 18)
								  | ((byte1 & 0x3f) << 12)
								  | ((byte2 & 0x3f) <<  6)
								  | ((byte3 & 0x3f)      );

							if (value > 0x10FFFF)
								; /* invalid */
							else if (value > 0xFFFF)
								len += 1; /* we need surrogates */
							else
								; /* 16bit suffice */
						}
						else
							t--; /* invalid */
					}
					else
						t--; /* invalid */
				}
				else
					t--; /* invalid */
			}
			else if ((byte & 0xfc) == 0xf8) {
				/* invalid 5-byte */
				if (t + 4 > tlimit)
					return len + 1; /* invalid, stop here */

				skip = 4;
				for (; skip && ((*t & 0xc0) == 0x80); --skip)
					t++;
			}
			else if ((byte & 0xfe) == 0xfc) {
				/* invalid 6-byte */
				if (t + 5 > tlimit)
					return len + 1; /* invalid, stop here */

				skip = 5;
				for (; skip && ((*t & 0xc0) == 0x80); --skip)
					t++;
			}
			else
				; /* invalid */
		}
		else {
			/* NUL */

			if (byte == 0)
				break;

			/* ASCII character, common case */
		}

		len++;
	}

	return len;
}


/* utf8_safe_convert_to_u2s ****************************************************

   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.

   This function is safe even for invalid UTF-8 strings.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
	                must NOT be NULL
	  nbytes........strlen(text). (This is needed to completely emulate
	  				the RI).
	  buffer........a preallocated array of u2s to receive the decoded
	                string. Use utf8_safe_number_of_u2s to get the
					required number of u2s for allocating this.

*******************************************************************************/

#define UNICODE_REPLACEMENT  0xfffd

void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
	register const unsigned char *t;
	register s4 byte;
	register const unsigned char *tlimit;
	s4 byte1;
	s4 byte2;
	s4 byte3;
	s4 value;
	s4 skip;

	assert(text);
	assert(nbytes >= 0);

	t = (const unsigned char *) text;
	tlimit = t + nbytes;

	/* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */

	while (1) {
		byte = *t++;

		if (byte & 0x80) {
			/* highest bit set, non-ASCII character */

			if ((byte & 0xe0) == 0xc0) {
				/* 2-byte: should be 110..... 10...... */

				if (((byte1 = *t++) & 0xc0) == 0x80) {
					/* valid 2-byte UTF-8 */
					*buffer++ = ((byte  & 0x1f) << 6)
							  | ((byte1 & 0x3f)     );
				}
				else {
					*buffer++ = UNICODE_REPLACEMENT;
					t--;
				}
			}
			else if ((byte & 0xf0) == 0xe0) {
				/* 3-byte: should be 1110.... 10...... 10...... */

				if (t + 2 > tlimit) {
					*buffer++ = UNICODE_REPLACEMENT;
					return;
				}

				if (((byte1 = *t++) & 0xc0) == 0x80) {
					if (((byte2 = *t++) & 0xc0) == 0x80) {
						/* valid 3-byte UTF-8 */
						*buffer++ = ((byte  & 0x0f) << 12)
								  | ((byte1 & 0x3f) <<  6)
								  | ((byte2 & 0x3f)      );
					}
					else {
						*buffer++ = UNICODE_REPLACEMENT;
						t--;
					}
				}
				else {
					*buffer++ = UNICODE_REPLACEMENT;
					t--;
				}
			}
			else if ((byte & 0xf8) == 0xf0) {
				/* 4-byte: should be 11110... 10...... 10...... 10...... */

				if (t + 3 > tlimit) {
					*buffer++ = UNICODE_REPLACEMENT;
					return;
				}

				if (((byte1 = *t++) & 0xc0) == 0x80) {
					if (((byte2 = *t++) & 0xc0) == 0x80) {
						if (((byte3 = *t++) & 0xc0) == 0x80) {
							/* valid 4-byte UTF-8? */
							value = ((byte  & 0x07) << 18)
								  | ((byte1 & 0x3f) << 12)
								  | ((byte2 & 0x3f) <<  6)
								  | ((byte3 & 0x3f)      );

							if (value > 0x10FFFF) {
								*buffer++ = UNICODE_REPLACEMENT;
							}
							else if (value > 0xFFFF) {
								/* we need surrogates */
								*buffer++ = 0xd800 | ((value >> 10) - 0x40);
								*buffer++ = 0xdc00 | (value & 0x03ff);
							}
							else
								*buffer++ = value; /* 16bit suffice */
						}
						else {
							*buffer++ = UNICODE_REPLACEMENT;
							t--;
						}
					}
					else {
						*buffer++ = UNICODE_REPLACEMENT;
						t--;
					}
				}
				else {
					*buffer++ = UNICODE_REPLACEMENT;
					t--;
				}
			}
			else if ((byte & 0xfc) == 0xf8) {
				if (t + 4 > tlimit) {
					*buffer++ = UNICODE_REPLACEMENT;
					return;
				}

				skip = 4;
				for (; skip && ((*t & 0xc0) == 0x80); --skip)
					t++;
				*buffer++ = UNICODE_REPLACEMENT;
			}
			else if ((byte & 0xfe) == 0xfc) {
				if (t + 5 > tlimit) {
					*buffer++ = UNICODE_REPLACEMENT;
					return;
				}

				skip = 5;
				for (; skip && ((*t & 0xc0) == 0x80); --skip)
					t++;
				*buffer++ = UNICODE_REPLACEMENT;
			}
			else
				*buffer++ = UNICODE_REPLACEMENT;
		}
		else {
			/* NUL */

			if (byte == 0)
				break;

			/* ASCII character, common case */

			*buffer++ = byte;
		}
	}
}


/* u2_utflength ****************************************************************

   Returns the utf length in bytes of a u2 array.

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
	u4 result_len = 0;                  /* utf length in bytes                */
	u2 ch;                              /* current unicode character          */
	u4 len;
	
	for (len = 0; len < u2_length; len++) {
		/* next unicode character */
		ch = *text++;
	  
		/* determine bytes required to store unicode character as utf */
		if (ch && (ch < 0x80)) 
			result_len++;
		else if (ch < 0x800)
			result_len += 2;	
		else 
			result_len += 3;	
	}

    return result_len;
}


/* utf_copy ********************************************************************

   Copy the given utf string byte-for-byte to a buffer.

   IN:
      buffer.......the buffer
	  u............the utf string

*******************************************************************************/

void utf_copy(char *buffer, utf *u)
{
	/* our utf strings are zero-terminated (done by utf_new) */
	MCOPY(buffer, u->text, char, u->blength + 1);
}


/* utf_cat *********************************************************************

   Append the given utf string byte-for-byte to a buffer.

   IN:
      buffer.......the buffer
	  u............the utf string

*******************************************************************************/

void utf_cat(char *buffer, utf *u)
{
	/* our utf strings are zero-terminated (done by utf_new) */
	MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
}


/* utf_copy_classname **********************************************************

   Copy the given utf classname byte-for-byte to a buffer.
   '/' is replaced by '.'

   IN:
      buffer.......the buffer
	  u............the utf string

*******************************************************************************/

void utf_copy_classname(char *buffer, utf *u)
{
	char *bufptr;
	char *srcptr;
	char *endptr;
	char ch;

	bufptr = buffer;
	srcptr = u->text;
	endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */

	while (srcptr != endptr) {
		ch = *srcptr++;
		if (ch == '/')
			ch = '.';
		*bufptr++ = ch;
	}
}


/* utf_cat *********************************************************************

   Append the given utf classname byte-for-byte to a buffer.
   '/' is replaced by '.'

   IN:
      buffer.......the buffer
	  u............the utf string

*******************************************************************************/

void utf_cat_classname(char *buffer, utf *u)
{
	utf_copy_classname(buffer + strlen(buffer), u);
}

/* utf_display_printable_ascii *************************************************

   Write utf symbol to stdout (for debugging purposes).
   Non-printable and non-ASCII characters are printed as '?'.

*******************************************************************************/

void utf_display_printable_ascii(utf *u)
{
	char *endpos;                       /* points behind utf string           */
	char *utf_ptr;                      /* current position in utf text       */

	if (u == NULL) {
		printf("NULL");
		fflush(stdout);
		return;
	}

	endpos = UTF_END(u);
	utf_ptr = u->text;

	while (utf_ptr < endpos) {
		/* read next unicode character */

		u2 c = utf_nextu2(&utf_ptr);

		if ((c >= 32) && (c <= 127))
			printf("%c", c);
		else
			printf("?");
	}

	fflush(stdout);
}


/* utf_display_printable_ascii_classname ***************************************

   Write utf symbol to stdout with `/' converted to `.' (for debugging
   purposes).
   Non-printable and non-ASCII characters are printed as '?'.

*******************************************************************************/

void utf_display_printable_ascii_classname(utf *u)
{
	char *endpos;                       /* points behind utf string           */
	char *utf_ptr;                      /* current position in utf text       */

	if (u == NULL) {
		printf("NULL");
		fflush(stdout);
		return;
	}

	endpos = UTF_END(u);
	utf_ptr = u->text;

	while (utf_ptr < endpos) {
		/* read next unicode character */

		u2 c = utf_nextu2(&utf_ptr);

		if (c == '/')
			c = '.';

		if ((c >= 32) && (c <= 127))
			printf("%c", c);
		else
			printf("?");
	}

	fflush(stdout);
}


/* utf_sprint_convert_to_latin1 ************************************************
	
   Write utf symbol into c-string (for debugging purposes).
   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   invalid results.

*******************************************************************************/

void utf_sprint_convert_to_latin1(char *buffer, utf *u)
{
	char *endpos;                       /* points behind utf string           */
	char *utf_ptr;                      /* current position in utf text       */
	u2 pos = 0;                         /* position in c-string               */

	if (!u) {
		strcpy(buffer, "NULL");
		return;
	}

	endpos = UTF_END(u);
	utf_ptr = u->text;

	while (utf_ptr < endpos) 
		/* copy next unicode character */       
		buffer[pos++] = utf_nextu2(&utf_ptr);

	/* terminate string */
	buffer[pos] = '\0';
}


/* utf_sprint_convert_to_latin1_classname **************************************
	
   Write utf symbol into c-string with `/' converted to `.' (for debugging
   purposes).
   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   invalid results.

*******************************************************************************/

void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
{
	char *endpos;                       /* points behind utf string           */
	char *utf_ptr;                      /* current position in utf text       */
	u2 pos = 0;                         /* position in c-string               */

	if (!u) {
		strcpy(buffer, "NULL");
		return;
	}

	endpos = UTF_END(u);
	utf_ptr = u->text;

	while (utf_ptr < endpos) {
		/* copy next unicode character */       
		u2 c = utf_nextu2(&utf_ptr);
		if (c == '/') c = '.';
		buffer[pos++] = c;
	}

	/* terminate string */
	buffer[pos] = '\0';
}


/* utf_strcat_convert_to_latin1 ************************************************
	
   Like libc strcat, but uses an utf8 string.
   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   invalid results.

*******************************************************************************/

void utf_strcat_convert_to_latin1(char *buffer, utf *u)
{
	utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
}


/* utf_strcat_convert_to_latin1_classname **************************************
	
   Like libc strcat, but uses an utf8 string.
   Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
   invalid results.

*******************************************************************************/

void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
{
	utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
}


/* utf_fprint_printable_ascii **************************************************
	
   Write utf symbol into file.
   Non-printable and non-ASCII characters are printed as '?'.

*******************************************************************************/

void utf_fprint_printable_ascii(FILE *file, utf *u)
{
	char *endpos;                       /* points behind utf string           */
	char *utf_ptr;                      /* current position in utf text       */

	if (!u)
		return;

	endpos = UTF_END(u);
	utf_ptr = u->text;

	while (utf_ptr < endpos) { 
		/* read next unicode character */                
		u2 c = utf_nextu2(&utf_ptr);				

		if (c >= 32 && c <= 127) fprintf(file, "%c", c);
		else fprintf(file, "?");
	}
}


/* utf_fprint_printable_ascii_classname ****************************************
	
   Write utf symbol into file with `/' converted to `.'.
   Non-printable and non-ASCII characters are printed as '?'.

*******************************************************************************/

void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
{
	char *endpos;                       /* points behind utf string           */
	char *utf_ptr;                      /* current position in utf text       */

    if (!u)
		return;

	endpos = UTF_END(u);
	utf_ptr = u->text;

	while (utf_ptr < endpos) { 
		/* read next unicode character */                
		u2 c = utf_nextu2(&utf_ptr);				
		if (c == '/') c = '.';

		if (c >= 32 && c <= 127) fprintf(file, "%c", c);
		else fprintf(file, "?");
	}
}


/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

/*  static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int bytes;
	int len,i;
	char c;
	unsigned long v;

	if (end_pos < utf_ptr) return false;
	bytes = end_pos - utf_ptr;
	while (bytes--) {
		c = *utf_ptr++;

		if (!c) return false;                     /* 0x00 is not allowed */
		if ((c & 0x80) == 0) continue;            /* ASCII */

		if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
		else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
		else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
		else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
		else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
		else return false;                        /* invalid leading byte */

		if (len > 2) return false;                /* Java limitation */

		v = (unsigned long)c & (0x3f >> len);
		
		if ((bytes -= len) < 0) return false;     /* missing bytes */

		for (i = len; i--; ) {
			c = *utf_ptr++;
			if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
				return false;
			v = (v << 6) | (c & 0x3f);
		}

		if (v == 0) {
			if (len != 1) return false;           /* Java special */

		} else {
			/* Sun Java seems to allow overlong UTF-8 encodings */
			
			/* if (v < min_codepoint[len]) */
				/* XXX throw exception? */
		}

		/* surrogates in UTF-8 seem to be allowed in Java classfiles */
		/* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */

		/* even these seem to be allowed */
		/* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
	}

	return true;
}


/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings and control
   characters.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */
		if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
			return false;
	}

	return true;
}

bool is_valid_name_utf(utf *u)
{
	return is_valid_name(u->text, UTF_END(u));
}


/* utf_show ********************************************************************

   Writes the utf symbols in the utfhash to stdout and displays the
   number of external hash chains grouped according to the chainlength
   (for debugging purposes).

*******************************************************************************/

#if !defined(NDEBUG)
void utf_show(void)
{

#define CHAIN_LIMIT 20               /* limit for seperated enumeration */

	u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
	u4 max_chainlength = 0;      /* maximum length of the chains */
	u4 sum_chainlength = 0;      /* sum of the chainlengths */
	u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
	u4 i;

	printf("UTF-HASH:\n");

	/* show element of utf-hashtable */

	for (i = 0; i < hashtable_utf->size; i++) {
		utf *u = hashtable_utf->ptr[i];

		if (u) {
			printf("SLOT %d: ", (int) i);

			while (u) {
				printf("'");
				utf_display_printable_ascii(u);
				printf("' ");
				u = u->hashlink;
			}	
			printf("\n");
		}
	}

	printf("UTF-HASH: %d slots for %d entries\n", 
		   (int) hashtable_utf->size, (int) hashtable_utf->entries );

	if (hashtable_utf->entries == 0)
		return;

	printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");

	for (i=0;i<CHAIN_LIMIT;i++)
		chain_count[i]=0;

	/* count numbers of hashchains according to their length */
	for (i=0; i<hashtable_utf->size; i++) {
		  
		utf *u = (utf*) hashtable_utf->ptr[i];
		u4 chain_length = 0;

		/* determine chainlength */
		while (u) {
			u = u->hashlink;
			chain_length++;
		}

		/* update sum of all chainlengths */
		sum_chainlength+=chain_length;

		/* determine the maximum length of the chains */
		if (chain_length>max_chainlength)
			max_chainlength = chain_length;

		/* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
		if (chain_length>=CHAIN_LIMIT) {
			beyond_limit+=chain_length;
			chain_length=CHAIN_LIMIT-1;
		}

		/* update number of hashchains of current length */
		chain_count[chain_length]++;
	}

	/* display results */  
	for (i=1;i<CHAIN_LIMIT-1;i++) 
		printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
	  
	printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);


	printf("max. chainlength:%5d\n",max_chainlength);

	/* avg. chainlength = sum of chainlengths / number of chains */
	printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
}
#endif /* !defined(NDEBUG) */


/*
 * These are local overrides for various environment variables in Emacs.
 * Please do not remove this and leave it at the end of the file, where
 * Emacs will automagically detect them.
 * ---------------------------------------------------------------------
 * Local variables:
 * mode: c
 * indent-tabs-mode: t
 * c-basic-offset: 4
 * tab-width: 4
 * End:
 * vim:noexpandtab:sw=4:ts=4:
 */