src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25 */
  26
  27
  28 #include "config.h"
  29
  30 #include <string.h>
  31 #include <assert.h>
  32
  33 #include "vm/types.h"
  34
  35 #include "mm/memory.h"
  36
  37 #include "threads/lock-common.h"
  38
  39 #include "toolbox/hashtable.h"
  40
  41 #include "vm/exceptions.h"
  42
  43 #include "vmcore/options.h"
  44
  45 #if defined(ENABLE_STATISTICS)
  46 # include "vmcore/statistics.h"
  47 #endif
  48
  49 #include "vmcore/utf8.h"
  50
  51
  52 /* global variables ***********************************************************/
  53
  54 /* hashsize must be power of 2 */
  55
  56 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  57
  58 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  59
  60
  61 /* utf-symbols for pointer comparison of frequently used strings **************/
  62
  63 utf *utf_java_lang_Object;
  64
  65 utf *utf_java_lang_Class;
  66 utf *utf_java_lang_ClassLoader;
  67 utf *utf_java_lang_Cloneable;
  68 utf *utf_java_lang_SecurityManager;
  69 utf *utf_java_lang_String;
  70 utf *utf_java_lang_System;
  71 utf *utf_java_lang_ThreadGroup;
  72 utf *utf_java_lang_ref_SoftReference;
  73 utf *utf_java_lang_ref_WeakReference;
  74 utf *utf_java_lang_ref_PhantomReference;
  75 utf *utf_java_io_Serializable;
  76
  77 utf *utf_java_lang_Throwable;
  78 utf *utf_java_lang_Error;
  79
  80 utf *utf_java_lang_AbstractMethodError;
  81 utf *utf_java_lang_ClassCircularityError;
  82 utf *utf_java_lang_ClassFormatError;
  83 utf *utf_java_lang_ExceptionInInitializerError;
  84 utf *utf_java_lang_IncompatibleClassChangeError;
  85 utf *utf_java_lang_InstantiationError;
  86 utf *utf_java_lang_InternalError;
  87 utf *utf_java_lang_LinkageError;
  88 utf *utf_java_lang_NoClassDefFoundError;
  89 utf *utf_java_lang_NoSuchFieldError;
  90 utf *utf_java_lang_NoSuchMethodError;
  91 utf *utf_java_lang_OutOfMemoryError;
  92 utf *utf_java_lang_UnsatisfiedLinkError;
  93 utf *utf_java_lang_UnsupportedClassVersionError;
  94 utf *utf_java_lang_VerifyError;
  95 utf *utf_java_lang_VirtualMachineError;
  96
  97 #if defined(WITH_CLASSPATH_GNU)
  98 utf *utf_java_lang_VMThrowable;
  99 #endif
 100
 101 utf *utf_java_lang_Exception;
 102
 103 utf *utf_java_lang_ArithmeticException;
 104 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
 105 utf *utf_java_lang_ArrayStoreException;
 106 utf *utf_java_lang_ClassCastException;
 107 utf *utf_java_lang_ClassNotFoundException;
 108 utf *utf_java_lang_CloneNotSupportedException;
 109 utf *utf_java_lang_IllegalAccessException;
 110 utf *utf_java_lang_IllegalArgumentException;
 111 utf *utf_java_lang_IllegalMonitorStateException;
 112 utf *utf_java_lang_InstantiationException;
 113 utf *utf_java_lang_InterruptedException;
 114 utf *utf_java_lang_NegativeArraySizeException;
 115 utf *utf_java_lang_NullPointerException;
 116 utf *utf_java_lang_StringIndexOutOfBoundsException;
 117
 118 utf *utf_java_lang_reflect_InvocationTargetException;
 119
 120 utf *utf_java_security_PrivilegedActionException;
 121
 122 #if defined(ENABLE_JAVASE)
 123 utf* utf_java_lang_Void;
 124 #endif
 125
 126 utf* utf_java_lang_Boolean;
 127 utf* utf_java_lang_Byte;
 128 utf* utf_java_lang_Character;
 129 utf* utf_java_lang_Short;
 130 utf* utf_java_lang_Integer;
 131 utf* utf_java_lang_Long;
 132 utf* utf_java_lang_Float;
 133 utf* utf_java_lang_Double;
 134
 135 #if defined(ENABLE_JAVASE)
 136 utf *utf_java_lang_StackTraceElement;
 137 utf *utf_java_lang_reflect_Constructor;
 138 utf *utf_java_lang_reflect_Field;
 139 utf *utf_java_lang_reflect_Method;
 140 utf *utf_java_util_Vector;
 141 #endif
 142
 143 utf *utf_InnerClasses;                  /* InnerClasses                       */
 144 utf *utf_ConstantValue;                 /* ConstantValue                      */
 145 utf *utf_Code;                          /* Code                               */
 146 utf *utf_Exceptions;                    /* Exceptions                         */
 147 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 148 utf *utf_SourceFile;                    /* SourceFile                         */
 149
 150 #if defined(ENABLE_JAVASE)
 151 utf *utf_EnclosingMethod;
 152 utf *utf_Signature;
 153 utf *utf_StackMapTable;
 154
 155 #if defined(ENABLE_ANNOTATIONS)
 156 utf *utf_sun_reflect_ConstantPool;
 157 #if defined(WITH_CLASSPATH_GNU)
 158 utf *utf_sun_reflect_annotation_AnnotationParser;
 159 #endif
 160
 161 utf *utf_RuntimeVisibleAnnotations;
 162 utf *utf_RuntimeInvisibleAnnotations;
 163 utf *utf_RuntimeVisibleParameterAnnotations;
 164 utf *utf_RuntimeInvisibleParameterAnnotations;
 165 utf *utf_AnnotationDefault;
 166 #endif
 167 #endif
 168
 169 utf *utf_init;                          /* <init>                             */
 170 utf *utf_clinit;                        /* <clinit>                           */
 171 utf *utf_clone;                         /* clone                              */
 172 utf *utf_finalize;                      /* finalize                           */
 173 utf *utf_run;                           /* run                                */
 174
 175 utf *utf_add;
 176 utf *utf_remove;
 177 utf *utf_addThread;
 178 utf *utf_removeThread;
 179 utf *utf_put;
 180 utf *utf_get;
 181 utf *utf_uncaughtException;
 182 utf *utf_value;
 183
 184 utf *utf_fillInStackTrace;
 185 utf *utf_findNative;
 186 utf *utf_getSystemClassLoader;
 187 utf *utf_initCause;
 188 utf *utf_loadClass;
 189 utf *utf_printStackTrace;
 190
 191 utf *utf_division_by_zero;
 192
 193 utf *utf_Z;                             /* Z                                  */
 194 utf *utf_B;                             /* B                                  */
 195 utf *utf_C;                             /* C                                  */
 196 utf *utf_S;                             /* S                                  */
 197 utf *utf_I;                             /* I                                  */
 198 utf *utf_J;                             /* J                                  */
 199 utf *utf_F;                             /* F                                  */
 200 utf *utf_D;                             /* D                                  */
 201
 202 utf *utf_void__void;                    /* ()V                                */
 203 utf *utf_boolean__void;                 /* (Z)V                               */
 204 utf *utf_byte__void;                    /* (B)V                               */
 205 utf *utf_char__void;                    /* (C)V                               */
 206 utf *utf_short__void;                   /* (S)V                               */
 207 utf *utf_int__void;                     /* (I)V                               */
 208 utf *utf_long__void;                    /* (J)V                               */
 209 utf *utf_float__void;                   /* (F)V                               */
 210 utf *utf_double__void;                  /* (D)V                               */
 211
 212 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 213 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 214 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 215 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 216 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 217 utf *utf_java_lang_Object__java_lang_Object;
 218 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 219 utf *utf_java_lang_String__java_lang_Class;
 220 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 221 utf *utf_java_lang_Thread_java_lang_Throwable__V;
 222 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 223 utf *utf_java_lang_Throwable__java_lang_Throwable;
 224
 225 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 226 utf *utf_null;
 227 utf *array_packagename;
 228
 229
 230 /* utf_init ********************************************************************
 231
 232    Initializes the utf8 subsystem.
 233
 234 *******************************************************************************/
 235
 236 bool utf8_init(void)
 237 {
 238         /* create utf8 hashtable */
 239
 240         hashtable_utf = NEW(hashtable);
 241
 242         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 243
 244 #if defined(ENABLE_STATISTICS)
 245         if (opt_stat)
 246                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 247 #endif
 248
 249         /* create utf-symbols for pointer comparison of frequently used strings */
 250
 251         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 252
 253         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 254         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 255         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 256         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 257         utf_java_lang_String           = utf_new_char("java/lang/String");
 258         utf_java_lang_System           = utf_new_char("java/lang/System");
 259         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 260
 261         utf_java_lang_ref_SoftReference =
 262                 utf_new_char("java/lang/ref/SoftReference");
 263
 264         utf_java_lang_ref_WeakReference =
 265                 utf_new_char("java/lang/ref/WeakReference");
 266
 267         utf_java_lang_ref_PhantomReference =
 268                 utf_new_char("java/lang/ref/PhantomReference");
 269
 270         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 271
 272         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 273         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 274
 275         utf_java_lang_ClassCircularityError =
 276                 utf_new_char("java/lang/ClassCircularityError");
 277
 278         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 279
 280         utf_java_lang_ExceptionInInitializerError =
 281                 utf_new_char("java/lang/ExceptionInInitializerError");
 282
 283         utf_java_lang_IncompatibleClassChangeError =
 284                 utf_new_char("java/lang/IncompatibleClassChangeError");
 285
 286         utf_java_lang_InstantiationError =
 287                 utf_new_char("java/lang/InstantiationError");
 288
 289         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 290         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 291
 292         utf_java_lang_NoClassDefFoundError =
 293                 utf_new_char("java/lang/NoClassDefFoundError");
 294
 295         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 296
 297         utf_java_lang_UnsatisfiedLinkError =
 298                 utf_new_char("java/lang/UnsatisfiedLinkError");
 299
 300         utf_java_lang_UnsupportedClassVersionError =
 301                 utf_new_char("java/lang/UnsupportedClassVersionError");
 302
 303         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 304
 305         utf_java_lang_VirtualMachineError =
 306                 utf_new_char("java/lang/VirtualMachineError");
 307
 308 #if defined(ENABLE_JAVASE)
 309         utf_java_lang_AbstractMethodError =
 310                 utf_new_char("java/lang/AbstractMethodError");
 311
 312         utf_java_lang_NoSuchFieldError =
 313                 utf_new_char("java/lang/NoSuchFieldError");
 314
 315         utf_java_lang_NoSuchMethodError =
 316                 utf_new_char("java/lang/NoSuchMethodError");
 317 #endif
 318
 319 #if defined(WITH_CLASSPATH_GNU)
 320         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
 321 #endif
 322
 323         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 324
 325         utf_java_lang_ArithmeticException =
 326                 utf_new_char("java/lang/ArithmeticException");
 327
 328         utf_java_lang_ArrayIndexOutOfBoundsException =
 329                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 330
 331         utf_java_lang_ArrayStoreException =
 332                 utf_new_char("java/lang/ArrayStoreException");
 333
 334         utf_java_lang_ClassCastException =
 335                 utf_new_char("java/lang/ClassCastException");
 336
 337         utf_java_lang_ClassNotFoundException =
 338                 utf_new_char("java/lang/ClassNotFoundException");
 339
 340         utf_java_lang_CloneNotSupportedException =
 341                 utf_new_char("java/lang/CloneNotSupportedException");
 342
 343         utf_java_lang_IllegalAccessException =
 344                 utf_new_char("java/lang/IllegalAccessException");
 345
 346         utf_java_lang_IllegalArgumentException =
 347                 utf_new_char("java/lang/IllegalArgumentException");
 348
 349         utf_java_lang_IllegalMonitorStateException =
 350                 utf_new_char("java/lang/IllegalMonitorStateException");
 351
 352         utf_java_lang_InstantiationException =
 353                 utf_new_char("java/lang/InstantiationException");
 354
 355         utf_java_lang_InterruptedException =
 356                 utf_new_char("java/lang/InterruptedException");
 357
 358         utf_java_lang_NegativeArraySizeException =
 359                 utf_new_char("java/lang/NegativeArraySizeException");
 360
 361         utf_java_lang_NullPointerException =
 362                 utf_new_char("java/lang/NullPointerException");
 363
 364         utf_java_lang_StringIndexOutOfBoundsException =
 365                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 366
 367         utf_java_lang_reflect_InvocationTargetException =
 368                 utf_new_char("java/lang/reflect/InvocationTargetException");
 369
 370         utf_java_security_PrivilegedActionException =
 371                 utf_new_char("java/security/PrivilegedActionException");
 372
 373 #if defined(ENABLE_JAVASE)
 374         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 375 #endif
 376
 377         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 378         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 379         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 380         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 381         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 382         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 383         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 384         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 385
 386 #if defined(ENABLE_JAVASE)
 387         utf_java_lang_StackTraceElement =
 388                 utf_new_char("java/lang/StackTraceElement");
 389
 390         utf_java_lang_reflect_Constructor =
 391                 utf_new_char("java/lang/reflect/Constructor");
 392
 393         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 394         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 395         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 396 #endif
 397
 398         utf_InnerClasses               = utf_new_char("InnerClasses");
 399         utf_ConstantValue              = utf_new_char("ConstantValue");
 400         utf_Code                       = utf_new_char("Code");
 401         utf_Exceptions                 = utf_new_char("Exceptions");
 402         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 403         utf_SourceFile                 = utf_new_char("SourceFile");
 404
 405 #if defined(ENABLE_JAVASE)
 406         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 407         utf_Signature                  = utf_new_char("Signature");
 408         utf_StackMapTable              = utf_new_char("StackMapTable");
 409
 410 #if defined(ENABLE_ANNOTATIONS)
 411         utf_sun_reflect_ConstantPool                = utf_new_char("sun/reflect/ConstantPool");
 412 #if defined(WITH_CLASSPATH_GNU)
 413         utf_sun_reflect_annotation_AnnotationParser = utf_new_char("sun/reflect/annotation/AnnotationParser");
 414 #endif
 415
 416         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 417         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 418         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 419         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 420         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 421 #endif
 422 #endif
 423
 424         utf_init                           = utf_new_char("<init>");
 425         utf_clinit                         = utf_new_char("<clinit>");
 426         utf_clone                      = utf_new_char("clone");
 427         utf_finalize                   = utf_new_char("finalize");
 428         utf_run                        = utf_new_char("run");
 429
 430         utf_add                        = utf_new_char("add");
 431         utf_remove                     = utf_new_char("remove");
 432         utf_addThread                  = utf_new_char("addThread");
 433         utf_removeThread               = utf_new_char("removeThread");
 434         utf_put                        = utf_new_char("put");
 435         utf_get                        = utf_new_char("get");
 436         utf_uncaughtException          = utf_new_char("uncaughtException");
 437         utf_value                      = utf_new_char("value");
 438
 439         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 440         utf_findNative                 = utf_new_char("findNative");
 441         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 442         utf_initCause                  = utf_new_char("initCause");
 443         utf_loadClass                  = utf_new_char("loadClass");
 444         utf_printStackTrace            = utf_new_char("printStackTrace");
 445
 446         utf_division_by_zero           = utf_new_char("/ by zero");
 447
 448         utf_Z                          = utf_new_char("Z");
 449         utf_B                          = utf_new_char("B");
 450         utf_C                          = utf_new_char("C");
 451         utf_S                          = utf_new_char("S");
 452         utf_I                          = utf_new_char("I");
 453         utf_J                          = utf_new_char("J");
 454         utf_F                          = utf_new_char("F");
 455         utf_D                          = utf_new_char("D");
 456
 457         utf_void__void                 = utf_new_char("()V");
 458         utf_boolean__void              = utf_new_char("(Z)V");
 459         utf_byte__void                 = utf_new_char("(B)V");
 460         utf_char__void                 = utf_new_char("(C)V");
 461         utf_short__void                = utf_new_char("(S)V");
 462         utf_int__void                  = utf_new_char("(I)V");
 463         utf_long__void                 = utf_new_char("(J)V");
 464         utf_float__void                = utf_new_char("(F)V");
 465         utf_double__void               = utf_new_char("(D)V");
 466         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 467         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 468
 469         utf_void__java_lang_ClassLoader =
 470                 utf_new_char("()Ljava/lang/ClassLoader;");
 471
 472         utf_java_lang_ClassLoader_java_lang_String__J =
 473                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 474
 475         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 476
 477         utf_java_lang_Object__java_lang_Object =
 478                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 479
 480         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 481
 482         utf_java_lang_String__java_lang_Class =
 483                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 484
 485         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 486
 487         utf_java_lang_Thread_java_lang_Throwable__V =
 488                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 489
 490         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 491
 492         utf_java_lang_Throwable__java_lang_Throwable =
 493                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 494
 495         utf_null                       = utf_new_char("null");
 496         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 497         array_packagename              = utf_new_char("\t<the array package>");
 498
 499         /* everything's ok */
 500
 501         return true;
 502 }
 503
 504
 505 /* utf_hashkey *****************************************************************
 506
 507    The hashkey is computed from the utf-text by using up to 8
 508    characters.  For utf-symbols longer than 15 characters 3 characters
 509    are taken from the beginning and the end, 2 characters are taken
 510    from the middle.
 511
 512 *******************************************************************************/
 513
 514 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 515 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 516
 517 u4 utf_hashkey(const char *text, u4 length)
 518 {
 519         const char *start_pos = text;       /* pointer to utf text                */
 520         u4 a;
 521
 522         switch (length) {
 523         case 0: /* empty string */
 524                 return 0;
 525
 526         case 1: return fbs(0);
 527         case 2: return fbs(0) ^ nbs(3);
 528         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 529         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 530         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 531         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 532         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 533         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 534
 535         case 9:
 536                 a = fbs(0);
 537                 a ^= nbs(1);
 538                 a ^= nbs(2);
 539                 text++;
 540                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 541
 542         case 10:
 543                 a = fbs(0);
 544                 text++;
 545                 a ^= nbs(2);
 546                 a ^= nbs(3);
 547                 a ^= nbs(4);
 548                 text++;
 549                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 550
 551         case 11:
 552                 a = fbs(0);
 553                 text++;
 554                 a ^= nbs(2);
 555                 a ^= nbs(3);
 556                 a ^= nbs(4);
 557                 text++;
 558                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 559
 560         case 12:
 561                 a = fbs(0);
 562                 text += 2;
 563                 a ^= nbs(2);
 564                 a ^= nbs(3);
 565                 text++;
 566                 a ^= nbs(5);
 567                 a ^= nbs(6);
 568                 a ^= nbs(7);
 569                 text++;
 570                 return a ^ nbs(9) ^ nbs(10);
 571
 572         case 13:
 573                 a = fbs(0);
 574                 a ^= nbs(1);
 575                 text++;
 576                 a ^= nbs(3);
 577                 a ^= nbs(4);
 578                 text += 2;
 579                 a ^= nbs(7);
 580                 a ^= nbs(8);
 581                 text += 2;
 582                 return a ^ nbs(9) ^ nbs(10);
 583
 584         case 14:
 585                 a = fbs(0);
 586                 text += 2;
 587                 a ^= nbs(3);
 588                 a ^= nbs(4);
 589                 text += 2;
 590                 a ^= nbs(7);
 591                 a ^= nbs(8);
 592                 text += 2;
 593                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 594
 595         case 15:
 596                 a = fbs(0);
 597                 text += 2;
 598                 a ^= nbs(3);
 599                 a ^= nbs(4);
 600                 text += 2;
 601                 a ^= nbs(7);
 602                 a ^= nbs(8);
 603                 text += 2;
 604                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 605
 606         default:  /* 3 characters from beginning */
 607                 a = fbs(0);
 608                 text += 2;
 609                 a ^= nbs(3);
 610                 a ^= nbs(4);
 611
 612                 /* 2 characters from middle */
 613                 text = start_pos + (length / 2);
 614                 a ^= fbs(5);
 615                 text += 2;
 616                 a ^= nbs(6);
 617
 618                 /* 3 characters from end */
 619                 text = start_pos + length - 4;
 620
 621                 a ^= fbs(7);
 622                 text++;
 623
 624                 return a ^ nbs(10) ^ nbs(11);
 625     }
 626 }
 627
 628 /* utf_full_hashkey ************************************************************
 629
 630    This function computes a hash value using all bytes in the string.
 631
 632    The algorithm is the "One-at-a-time" algorithm as published
 633    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 634
 635 *******************************************************************************/
 636
 637 u4 utf_full_hashkey(const char *text, u4 length)
 638 {
 639         register const unsigned char *p = (const unsigned char *) text;
 640         register u4 hash;
 641         register u4 i;
 642
 643         hash = 0;
 644         for (i=length; i--;)
 645         {
 646             hash += *p++;
 647             hash += (hash << 10);
 648             hash ^= (hash >> 6);
 649         }
 650         hash += (hash << 3);
 651         hash ^= (hash >> 11);
 652         hash += (hash << 15);
 653
 654         return hash;
 655 }
 656
 657 /* unicode_hashkey *************************************************************
 658
 659    Compute the hashkey of a unicode string.
 660
 661 *******************************************************************************/
 662
 663 u4 unicode_hashkey(u2 *text, u2 len)
 664 {
 665         return utf_hashkey((char *) text, len);
 666 }
 667
 668
 669 /* utf_new *********************************************************************
 670
 671    Creates a new utf-symbol, the text of the symbol is passed as a
 672    u1-array. The function searches the utf-hashtable for a utf-symbol
 673    with this text. On success the element returned, otherwise a new
 674    hashtable element is created.
 675
 676    If the number of entries in the hashtable exceeds twice the size of
 677    the hashtable slots a reorganization of the hashtable is done and
 678    the utf symbols are copied to a new hashtable with doubled size.
 679
 680 *******************************************************************************/
 681
 682 utf *utf_new(const char *text, u2 length)
 683 {
 684         u4 key;                             /* hashkey computed from utf-text     */
 685         u4 slot;                            /* slot in hashtable                  */
 686         utf *u;                             /* hashtable element                  */
 687         u2 i;
 688
 689         LOCK_MONITOR_ENTER(hashtable_utf->header);
 690
 691 #if defined(ENABLE_STATISTICS)
 692         if (opt_stat)
 693                 count_utf_new++;
 694 #endif
 695
 696         key  = utf_hashkey(text, length);
 697         slot = key & (hashtable_utf->size - 1);
 698         u    = hashtable_utf->ptr[slot];
 699
 700         /* search external hash chain for utf-symbol */
 701
 702         while (u) {
 703                 if (u->blength == length) {
 704                         /* compare text of hashtable elements */
 705
 706                         for (i = 0; i < length; i++)
 707                                 if (text[i] != u->text[i])
 708                                         goto nomatch;
 709
 710 #if defined(ENABLE_STATISTICS)
 711                         if (opt_stat)
 712                                 count_utf_new_found++;
 713 #endif
 714
 715                         /* symbol found in hashtable */
 716
 717                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 718
 719                         return u;
 720                 }
 721
 722         nomatch:
 723                 u = u->hashlink; /* next element in external chain */
 724         }
 725
 726         /* location in hashtable found, create new utf element */
 727
 728         u = NEW(utf);
 729
 730         u->blength  = length;               /* length in bytes of utfstring       */
 731         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 732         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 733
 734         memcpy(u->text, text, length);      /* copy utf-text                      */
 735         u->text[length] = '\0';
 736
 737 #if defined(ENABLE_STATISTICS)
 738         if (opt_stat)
 739                 count_utf_len += sizeof(utf) + length + 1;
 740 #endif
 741
 742         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 743         hashtable_utf->entries++;           /* update number of entries           */
 744
 745         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 746
 747         /* reorganization of hashtable, average length of the external
 748            chains is approx. 2 */
 749
 750                 hashtable *newhash;                              /* the new hashtable */
 751                 u4         i;
 752                 utf       *u;
 753                 utf       *nextu;
 754                 u4         slot;
 755
 756                 /* create new hashtable, double the size */
 757
 758                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 759
 760 #if defined(ENABLE_STATISTICS)
 761                 if (opt_stat)
 762                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 763 #endif
 764
 765                 /* transfer elements to new hashtable */
 766
 767                 for (i = 0; i < hashtable_utf->size; i++) {
 768                         u = hashtable_utf->ptr[i];
 769
 770                         while (u) {
 771                                 nextu = u->hashlink;
 772                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 773
 774                                 u->hashlink = (utf *) newhash->ptr[slot];
 775                                 newhash->ptr[slot] = u;
 776
 777                                 /* follow link in external hash chain */
 778
 779                                 u = nextu;
 780                         }
 781                 }
 782
 783                 /* dispose old table */
 784
 785                 hashtable_free(hashtable_utf);
 786
 787                 hashtable_utf = newhash;
 788         }
 789
 790         LOCK_MONITOR_EXIT(hashtable_utf->header);
 791
 792         return u;
 793 }
 794
 795
 796 /* utf_new_u2 ******************************************************************
 797
 798    Make utf symbol from u2 array, if isclassname is true '.' is
 799    replaced by '/'.
 800
 801 *******************************************************************************/
 802
 803 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 804 {
 805         char *buffer;                   /* memory buffer for  unicode characters  */
 806         char *pos;                      /* pointer to current position in buffer  */
 807         u4 left;                        /* unicode characters left                */
 808         u4 buflength;                   /* utf length in bytes of the u2 array    */
 809         utf *result;                    /* resulting utf-string                   */
 810         int i;
 811
 812         /* determine utf length in bytes and allocate memory */
 813
 814         buflength = u2_utflength(unicode_pos, unicode_length);
 815         buffer    = MNEW(char, buflength);
 816
 817         left = buflength;
 818         pos  = buffer;
 819
 820         for (i = 0; i++ < unicode_length; unicode_pos++) {
 821                 /* next unicode character */
 822                 u2 c = *unicode_pos;
 823
 824                 if ((c != 0) && (c < 0x80)) {
 825                         /* 1 character */
 826                         left--;
 827                 if ((int) left < 0) break;
 828                         /* convert classname */
 829                         if (isclassname && c == '.')
 830                                 *pos++ = '/';
 831                         else
 832                                 *pos++ = (char) c;
 833
 834                 } else if (c < 0x800) {
 835                         /* 2 characters */
 836                 unsigned char high = c >> 6;
 837                 unsigned char low  = c & 0x3F;
 838                         left = left - 2;
 839                 if ((int) left < 0) break;
 840                 *pos++ = high | 0xC0;
 841                 *pos++ = low  | 0x80;
 842
 843                 } else {
 844                 /* 3 characters */
 845                 char low  = c & 0x3f;
 846                 char mid  = (c >> 6) & 0x3F;
 847                 char high = c >> 12;
 848                         left = left - 3;
 849                 if ((int) left < 0) break;
 850                 *pos++ = high | 0xE0;
 851                 *pos++ = mid  | 0x80;
 852                 *pos++ = low  | 0x80;
 853                 }
 854         }
 855
 856         /* insert utf-string into symbol-table */
 857         result = utf_new(buffer,buflength);
 858
 859         MFREE(buffer, char, buflength);
 860
 861         return result;
 862 }
 863
 864
 865 /* utf_new_char ****************************************************************
 866
 867    Creates a new utf symbol, the text for this symbol is passed as a
 868    c-string ( = char* ).
 869
 870 *******************************************************************************/
 871
 872 utf *utf_new_char(const char *text)
 873 {
 874         return utf_new(text, strlen(text));
 875 }
 876
 877
 878 /* utf_new_char_classname ******************************************************
 879
 880    Creates a new utf symbol, the text for this symbol is passed as a
 881    c-string ( = char* ) "." characters are going to be replaced by
 882    "/". Since the above function is used often, this is a separte
 883    function, instead of an if.
 884
 885 *******************************************************************************/
 886
 887 utf *utf_new_char_classname(const char *text)
 888 {
 889         if (strchr(text, '.')) {
 890                 char *txt = strdup(text);
 891                 char *end = txt + strlen(txt);
 892                 char *c;
 893                 utf *tmpRes;
 894
 895                 for (c = txt; c < end; c++)
 896                         if (*c == '.') *c = '/';
 897
 898                 tmpRes = utf_new(txt, strlen(txt));
 899                 FREE(txt, 0);
 900
 901                 return tmpRes;
 902
 903         } else
 904                 return utf_new(text, strlen(text));
 905 }
 906
 907
 908 /* utf_nextu2 ******************************************************************
 909
 910    Read the next unicode character from the utf string and increment
 911    the utf-string pointer accordingly.
 912
 913    CAUTION: This function is unsafe for input that was not checked
 914             by is_valid_utf!
 915
 916 *******************************************************************************/
 917
 918 u2 utf_nextu2(char **utf_ptr)
 919 {
 920     /* uncompressed unicode character */
 921     u2 unicode_char = 0;
 922     /* current position in utf text */
 923     unsigned char *utf = (unsigned char *) (*utf_ptr);
 924     /* bytes representing the unicode character */
 925     unsigned char ch1, ch2, ch3;
 926     /* number of bytes used to represent the unicode character */
 927     int len = 0;
 928
 929     switch ((ch1 = utf[0]) >> 4) {
 930         default: /* 1 byte */
 931                 (*utf_ptr)++;
 932                 return (u2) ch1;
 933         case 0xC:
 934         case 0xD: /* 2 bytes */
 935                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 936                         unsigned char high = ch1 & 0x1F;
 937                         unsigned char low  = ch2 & 0x3F;
 938                         unicode_char = (high << 6) + low;
 939                         len = 2;
 940                 }
 941                 break;
 942
 943         case 0xE: /* 2 or 3 bytes */
 944                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 945                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 946                                 unsigned char low  = ch3 & 0x3f;
 947                                 unsigned char mid  = ch2 & 0x3f;
 948                                 unsigned char high = ch1 & 0x0f;
 949                                 unicode_char = (((high << 6) + mid) << 6) + low;
 950                                 len = 3;
 951                         } else
 952                                 len = 2;
 953                 }
 954                 break;
 955     }
 956
 957     /* update position in utf-text */
 958     *utf_ptr = (char *) (utf + len);
 959
 960     return unicode_char;
 961 }
 962
 963
 964 /* utf_bytes *******************************************************************
 965
 966    Determine number of bytes (aka. octets) in the utf string.
 967
 968    IN:
 969       u............utf string
 970
 971    OUT:
 972       The number of octets of this utf string.
 973           There is _no_ terminating zero included in this count.
 974
 975 *******************************************************************************/
 976
 977 u4 utf_bytes(utf *u)
 978 {
 979         return u->blength;
 980 }
 981
 982
 983 /* utf_get_number_of_u2s_for_buffer ********************************************
 984
 985    Determine number of UTF-16 u2s in the given UTF-8 buffer
 986
 987    CAUTION: This function is unsafe for input that was not checked
 988             by is_valid_utf!
 989
 990    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 991    to an array of u2s (UTF-16) and want to know how many of them you will get.
 992    All other uses of this function are probably wrong.
 993
 994    IN:
 995       buffer........points to first char in buffer
 996           blength.......number of _bytes_ in the buffer
 997
 998    OUT:
 999       the number of u2s needed to hold this string in UTF-16 encoding.
1000           There is _no_ terminating zero included in this count.
1001
1002    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1003    exception.
1004
1005 *******************************************************************************/
1006
1007 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1008 {
1009         const char *endpos;                 /* points behind utf string           */
1010         const char *utf_ptr;                /* current position in utf text       */
1011         u4 len = 0;                         /* number of unicode characters       */
1012
1013         utf_ptr = buffer;
1014         endpos = utf_ptr + blength;
1015
1016         while (utf_ptr < endpos) {
1017                 len++;
1018                 /* next unicode character */
1019                 utf_nextu2((char **)&utf_ptr);
1020         }
1021
1022         assert(utf_ptr == endpos);
1023
1024         return len;
1025 }
1026
1027
1028 /* utf_get_number_of_u2s *******************************************************
1029
1030    Determine number of UTF-16 u2s in the utf string.
1031
1032    CAUTION: This function is unsafe for input that was not checked
1033             by is_valid_utf!
1034
1035    CAUTION: Use this function *only* when you want to convert a utf string
1036    to an array of u2s and want to know how many of them you will get.
1037    All other uses of this function are probably wrong.
1038
1039    IN:
1040       u............utf string
1041
1042    OUT:
1043       the number of u2s needed to hold this string in UTF-16 encoding.
1044           There is _no_ terminating zero included in this count.
1045           XXX 0 if a NullPointerException has been thrown (see below)
1046
1047 *******************************************************************************/
1048
1049 u4 utf_get_number_of_u2s(utf *u)
1050 {
1051         char *endpos;                       /* points behind utf string           */
1052         char *utf_ptr;                      /* current position in utf text       */
1053         u4 len = 0;                         /* number of unicode characters       */
1054
1055         /* XXX this is probably not checked by most callers! Review this after */
1056         /* the invalid uses of this function have been eliminated */
1057         if (u == NULL) {
1058                 exceptions_throw_nullpointerexception();
1059                 return 0;
1060         }
1061
1062         endpos = UTF_END(u);
1063         utf_ptr = u->text;
1064
1065         while (utf_ptr < endpos) {
1066                 len++;
1067                 /* next unicode character */
1068                 utf_nextu2(&utf_ptr);
1069         }
1070
1071         if (utf_ptr != endpos) {
1072                 /* string ended abruptly */
1073                 exceptions_throw_internalerror("Illegal utf8 string");
1074                 return 0;
1075         }
1076
1077         return len;
1078 }
1079
1080
/* utf8_safe_number_of_u2s *****************************************************

   Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
   (For invalid UTF-8 the U+fffd replacement character will be counted.)

   This function is safe even for invalid UTF-8 strings.

   The lead byte fixes how many continuation bytes (10xxxxxx) are
   expected. For 3-byte and longer forms the scan stops with one extra
   u2 if fewer than that many bytes remain before text + nbytes. Only
   the continuation bytes actually present are consumed; each lead byte
   counts as one u2, or two if a complete 4-byte form decodes to a value
   in 0x10000..0x10FFFF (surrogate pair).

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).

   OUT:
      the number of u2s needed to hold this string in UTF-16 encoding.
      There is _no_ terminating zero included in this count.

*******************************************************************************/

s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
    const unsigned char *cur;       /* next byte to read                      */
    const unsigned char *end;       /* text + nbytes                          */
    s4 count;                       /* u2s counted so far                     */
    s4 lead;                        /* first byte of the current sequence     */
    s4 want;                        /* continuation bytes the lead asks for   */
    s4 got;                         /* continuation bytes actually present    */
    s4 codepoint;                   /* value of a complete 4-byte sequence    */

    assert(text);
    assert(nbytes >= 0);

    cur   = (const unsigned char *) text;
    end   = cur + nbytes;
    count = 0;

    /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */

    for (;;) {
        lead = *cur++;

        /* NUL terminates the scan */

        if (lead == 0)
            break;

        /* ASCII character, common case */

        if ((lead & 0x80) == 0) {
            count++;
            continue;
        }

        /* highest bit set: classify the lead byte */

        if ((lead & 0xe0) == 0xc0)
            want = 1;               /* 110..... 10......                      */
        else if ((lead & 0xf0) == 0xe0)
            want = 2;               /* 1110.... 10...... 10......             */
        else if ((lead & 0xf8) == 0xf0)
            want = 3;               /* 11110... 10...... 10...... 10......    */
        else if ((lead & 0xfc) == 0xf8)
            want = 4;               /* invalid 5-byte form                    */
        else if ((lead & 0xfe) == 0xfc)
            want = 5;               /* invalid 6-byte form                    */
        else
            want = 0;               /* not a lead byte at all                 */

        /* the 2-byte form is not limit-checked; the terminating NUL */
        /* stops its single look-ahead                                */

        if ((want >= 2) && ((end - cur) < want))
            return count + 1; /* invalid, stop here */

        /* consume the continuation bytes that are really there */

        got = 0;
        while ((got < want) && ((cur[got] & 0xc0) == 0x80))
            got++;

        if ((want == 3) && (got == 3)) {
            /* valid 4-byte UTF-8? */

            codepoint = ((lead   & 0x07) << 18)
                      | ((cur[0] & 0x3f) << 12)
                      | ((cur[1] & 0x3f) <<  6)
                      | ((cur[2] & 0x3f)      );

            /* above 0xFFFF (and still in range) we need surrogates */

            if ((codepoint > 0xFFFF) && (codepoint <= 0x10FFFF))
                count++;
        }

        cur += got;
        count++;
    }

    return count;
}
1217
1218
/* utf8_safe_convert_to_u2s ****************************************************

   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.

   This function is safe even for invalid UTF-8 strings.

   The lead byte fixes how many continuation bytes (10xxxxxx) are
   expected. For 3-byte and longer forms the conversion stops after
   writing one replacement character if fewer than that many bytes
   remain before text + nbytes. Only the continuation bytes actually
   present are consumed. No terminating zero is written to the buffer.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).
      buffer........a preallocated array of u2s to receive the decoded
                    string. Use utf8_safe_number_of_u2s to get the
                    required number of u2s for allocating this.

*******************************************************************************/

#define UNICODE_REPLACEMENT  0xfffd

void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
    const unsigned char *cur;       /* next byte to read                      */
    const unsigned char *end;       /* text + nbytes                          */
    u2 *out;                        /* next u2 to write                       */
    s4  lead;                       /* first byte of the current sequence     */
    s4  want;                       /* continuation bytes the lead asks for   */
    s4  got;                        /* continuation bytes actually present    */
    s4  value;                      /* decoded 4-byte value                   */

    assert(text);
    assert(nbytes >= 0);

    cur = (const unsigned char *) text;
    end = cur + nbytes;
    out = buffer;

    /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */

    for (;;) {
        lead = *cur++;

        /* NUL terminates the conversion */

        if (lead == 0)
            break;

        /* ASCII character, common case */

        if ((lead & 0x80) == 0) {
            *out++ = lead;
            continue;
        }

        /* highest bit set: classify the lead byte */

        if ((lead & 0xe0) == 0xc0)
            want = 1;               /* 110..... 10......                      */
        else if ((lead & 0xf0) == 0xe0)
            want = 2;               /* 1110.... 10...... 10......             */
        else if ((lead & 0xf8) == 0xf0)
            want = 3;               /* 11110... 10...... 10...... 10......    */
        else if ((lead & 0xfc) == 0xf8)
            want = 4;               /* invalid 5-byte form                    */
        else if ((lead & 0xfe) == 0xfc)
            want = 5;               /* invalid 6-byte form                    */
        else
            want = 0;               /* not a lead byte at all                 */

        /* the 2-byte form is not limit-checked; the terminating NUL */
        /* stops its single look-ahead                                */

        if ((want >= 2) && ((end - cur) < want)) {
            *out++ = UNICODE_REPLACEMENT;
            return;
        }

        /* consume the continuation bytes that are really there */

        got = 0;
        while ((got < want) && ((cur[got] & 0xc0) == 0x80))
            got++;

        if (got != want) {
            /* sequence is cut short by a non-continuation byte */

            *out++ = UNICODE_REPLACEMENT;
        }
        else {
            switch (want) {
            case 1:
                /* valid 2-byte UTF-8 */
                *out++ = ((lead   & 0x1f) << 6)
                       | ((cur[0] & 0x3f)     );
                break;

            case 2:
                /* valid 3-byte UTF-8 */
                *out++ = ((lead   & 0x0f) << 12)
                       | ((cur[0] & 0x3f) <<  6)
                       | ((cur[1] & 0x3f)      );
                break;

            case 3:
                /* valid 4-byte UTF-8? */
                value = ((lead   & 0x07) << 18)
                      | ((cur[0] & 0x3f) << 12)
                      | ((cur[1] & 0x3f) <<  6)
                      | ((cur[2] & 0x3f)      );

                if (value > 0x10FFFF) {
                    *out++ = UNICODE_REPLACEMENT;
                }
                else if (value > 0xFFFF) {
                    /* we need surrogates */
                    *out++ = 0xd800 | ((value >> 10) - 0x40);
                    *out++ = 0xdc00 | (value & 0x03ff);
                }
                else {
                    *out++ = value; /* 16bit suffice */
                }
                break;

            default:
                /* 5-/6-byte forms and stray non-lead bytes */
                *out++ = UNICODE_REPLACEMENT;
                break;
            }
        }

        cur += got;
    }
}
1382
1383
/* u2_utflength ****************************************************************

   Return the number of bytes needed to store a u2 array as utf text.

   Per u2:
      0x0001..0x007f............1 byte
      0x0000 and 0x0080..0x07ff..2 bytes
      0x0800 and above..........3 bytes

   IN:
      text.........first u2 of the array
      u2_length....number of u2s in the array

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
    u4 nbytes = 0;                      /* accumulated utf length in bytes    */
    u4 idx;
    u2 c;

    for (idx = 0; idx < u2_length; idx++) {
        c = text[idx];

        /* zero fails the first test and therefore takes two bytes */

        nbytes += ((c != 0) && (c < 0x80)) ? 1
                : (c < 0x800)              ? 2
                :                            3;
    }

    return nbytes;
}
1411
1412
1413 /* utf_copy ********************************************************************
1414
1415    Copy the given utf string byte-for-byte to a buffer.
1416
1417    IN:
1418       buffer.......the buffer
1419           u............the utf string
1420
1421 *******************************************************************************/
1422
1423 void utf_copy(char *buffer, utf *u)
1424 {
1425         /* our utf strings are zero-terminated (done by utf_new) */
1426         MCOPY(buffer, u->text, char, u->blength + 1);
1427 }
1428
1429
1430 /* utf_cat *********************************************************************
1431
1432    Append the given utf string byte-for-byte to a buffer.
1433
1434    IN:
1435       buffer.......the buffer
1436           u............the utf string
1437
1438 *******************************************************************************/
1439
1440 void utf_cat(char *buffer, utf *u)
1441 {
1442         /* our utf strings are zero-terminated (done by utf_new) */
1443         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1444 }
1445
1446
1447 /* utf_copy_classname **********************************************************
1448
1449    Copy the given utf classname byte-for-byte to a buffer.
1450    '/' is replaced by '.'
1451
1452    IN:
1453       buffer.......the buffer
1454           u............the utf string
1455
1456 *******************************************************************************/
1457
1458 void utf_copy_classname(char *buffer, utf *u)
1459 {
1460         char *bufptr;
1461         char *srcptr;
1462         char *endptr;
1463         char ch;
1464
1465         bufptr = buffer;
1466         srcptr = u->text;
1467         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1468
1469         while (srcptr != endptr) {
1470                 ch = *srcptr++;
1471                 if (ch == '/')
1472                         ch = '.';
1473                 *bufptr++ = ch;
1474         }
1475 }
1476
1477
1478 /* utf_cat *********************************************************************
1479
1480    Append the given utf classname byte-for-byte to a buffer.
1481    '/' is replaced by '.'
1482
1483    IN:
1484       buffer.......the buffer
1485           u............the utf string
1486
1487 *******************************************************************************/
1488
1489 void utf_cat_classname(char *buffer, utf *u)
1490 {
1491         utf_copy_classname(buffer + strlen(buffer), u);
1492 }
1493
1494 /* utf_display_printable_ascii *************************************************
1495
1496    Write utf symbol to stdout (for debugging purposes).
1497    Non-printable and non-ASCII characters are printed as '?'.
1498
1499 *******************************************************************************/
1500
1501 void utf_display_printable_ascii(utf *u)
1502 {
1503         char *endpos;                       /* points behind utf string           */
1504         char *utf_ptr;                      /* current position in utf text       */
1505
1506         if (u == NULL) {
1507                 printf("NULL");
1508                 fflush(stdout);
1509                 return;
1510         }
1511
1512         endpos = UTF_END(u);
1513         utf_ptr = u->text;
1514
1515         while (utf_ptr < endpos) {
1516                 /* read next unicode character */
1517
1518                 u2 c = utf_nextu2(&utf_ptr);
1519
1520                 if ((c >= 32) && (c <= 127))
1521                         printf("%c", c);
1522                 else
1523                         printf("?");
1524         }
1525
1526         fflush(stdout);
1527 }
1528
1529
1530 /* utf_display_printable_ascii_classname ***************************************
1531
1532    Write utf symbol to stdout with `/' converted to `.' (for debugging
1533    purposes).
1534    Non-printable and non-ASCII characters are printed as '?'.
1535
1536 *******************************************************************************/
1537
1538 void utf_display_printable_ascii_classname(utf *u)
1539 {
1540         char *endpos;                       /* points behind utf string           */
1541         char *utf_ptr;                      /* current position in utf text       */
1542
1543         if (u == NULL) {
1544                 printf("NULL");
1545                 fflush(stdout);
1546                 return;
1547         }
1548
1549         endpos = UTF_END(u);
1550         utf_ptr = u->text;
1551
1552         while (utf_ptr < endpos) {
1553                 /* read next unicode character */
1554
1555                 u2 c = utf_nextu2(&utf_ptr);
1556
1557                 if (c == '/')
1558                         c = '.';
1559
1560                 if ((c >= 32) && (c <= 127))
1561                         printf("%c", c);
1562                 else
1563                         printf("?");
1564         }
1565
1566         fflush(stdout);
1567 }
1568
1569
1570 /* utf_sprint_convert_to_latin1 ************************************************
1571
1572    Write utf symbol into c-string (for debugging purposes).
1573    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1574    invalid results.
1575
1576 *******************************************************************************/
1577
1578 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1579 {
1580         char *endpos;                       /* points behind utf string           */
1581         char *utf_ptr;                      /* current position in utf text       */
1582         u2 pos = 0;                         /* position in c-string               */
1583
1584         if (!u) {
1585                 strcpy(buffer, "NULL");
1586                 return;
1587         }
1588
1589         endpos = UTF_END(u);
1590         utf_ptr = u->text;
1591
1592         while (utf_ptr < endpos)
1593                 /* copy next unicode character */
1594                 buffer[pos++] = utf_nextu2(&utf_ptr);
1595
1596         /* terminate string */
1597         buffer[pos] = '\0';
1598 }
1599
1600
1601 /* utf_sprint_convert_to_latin1_classname **************************************
1602
1603    Write utf symbol into c-string with `/' converted to `.' (for debugging
1604    purposes).
1605    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1606    invalid results.
1607
1608 *******************************************************************************/
1609
1610 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1611 {
1612         char *endpos;                       /* points behind utf string           */
1613         char *utf_ptr;                      /* current position in utf text       */
1614         u2 pos = 0;                         /* position in c-string               */
1615
1616         if (!u) {
1617                 strcpy(buffer, "NULL");
1618                 return;
1619         }
1620
1621         endpos = UTF_END(u);
1622         utf_ptr = u->text;
1623
1624         while (utf_ptr < endpos) {
1625                 /* copy next unicode character */
1626                 u2 c = utf_nextu2(&utf_ptr);
1627                 if (c == '/') c = '.';
1628                 buffer[pos++] = c;
1629         }
1630
1631         /* terminate string */
1632         buffer[pos] = '\0';
1633 }
1634
1635
1636 /* utf_strcat_convert_to_latin1 ************************************************
1637
1638    Like libc strcat, but uses an utf8 string.
1639    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1640    invalid results.
1641
1642 *******************************************************************************/
1643
1644 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1645 {
1646         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1647 }
1648
1649
1650 /* utf_strcat_convert_to_latin1_classname **************************************
1651
1652    Like libc strcat, but uses an utf8 string.
1653    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1654    invalid results.
1655
1656 *******************************************************************************/
1657
1658 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1659 {
1660         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1661 }
1662
1663
1664 /* utf_fprint_printable_ascii **************************************************
1665
1666    Write utf symbol into file.
1667    Non-printable and non-ASCII characters are printed as '?'.
1668
1669 *******************************************************************************/
1670
1671 void utf_fprint_printable_ascii(FILE *file, utf *u)
1672 {
1673         char *endpos;                       /* points behind utf string           */
1674         char *utf_ptr;                      /* current position in utf text       */
1675
1676         if (!u)
1677                 return;
1678
1679         endpos = UTF_END(u);
1680         utf_ptr = u->text;
1681
1682         while (utf_ptr < endpos) {
1683                 /* read next unicode character */
1684                 u2 c = utf_nextu2(&utf_ptr);
1685
1686                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1687                 else fprintf(file, "?");
1688         }
1689 }
1690
1691
1692 /* utf_fprint_printable_ascii_classname ****************************************
1693
1694    Write utf symbol into file with `/' converted to `.'.
1695    Non-printable and non-ASCII characters are printed as '?'.
1696
1697 *******************************************************************************/
1698
1699 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1700 {
1701         char *endpos;                       /* points behind utf string           */
1702         char *utf_ptr;                      /* current position in utf text       */
1703
1704     if (!u)
1705                 return;
1706
1707         endpos = UTF_END(u);
1708         utf_ptr = u->text;
1709
1710         while (utf_ptr < endpos) {
1711                 /* read next unicode character */
1712                 u2 c = utf_nextu2(&utf_ptr);
1713                 if (c == '/') c = '.';
1714
1715                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1716                 else fprintf(file, "?");
1717         }
1718 }
1719
1720
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   accepted here:

   - a 0x00 byte is rejected,
   - only one-, two- and three-byte sequences are accepted (longer
     sequences are refused as a Java limitation),
   - every continuation byte must have the form 10xx xxxx,
   - the value zero must be written as a two-byte sequence (Java special).

   Deliberately NOT rejected, because Sun Java seems to allow them in
   classfiles: overlong encodings, surrogates (0xd800..0xdfff) and the
   codepoints 0xfffe / 0xffff.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
        int remaining;                      /* bytes not consumed yet             */

        if (end_pos < utf_ptr)
                return false;

        remaining = end_pos - utf_ptr;

        while (remaining > 0) {
                unsigned char lead = (unsigned char) *utf_ptr++;
                unsigned long value;            /* decoded character              */
                int           trail;            /* number of continuation bytes   */
                int           k;

                remaining--;

                if (lead == 0x00)
                        return false;               /* 0x00 is not allowed            */

                if (lead < 0x80)
                        continue;                   /* ASCII                          */

                if ((lead & 0xe0) == 0xc0)
                        trail = 1;                  /* 110x xxxx                      */
                else if ((lead & 0xf0) == 0xe0)
                        trail = 2;                  /* 1110 xxxx                      */
                else
                        return false;               /* stray continuation byte,
                                                       invalid leading byte, or a
                                                       sequence too long for Java     */

                if (remaining < trail)
                        return false;               /* missing bytes                  */

                remaining -= trail;

                /* payload bits of the leading byte */
                value = lead & (0x3f >> trail);

                for (k = 0; k < trail; k++) {
                        unsigned char cont = (unsigned char) *utf_ptr++;

                        if ((cont & 0xc0) != 0x80)  /* 10xx xxxx                      */
                                return false;

                        value = (value << 6) | (cont & 0x3f);
                }

                /* Java special: zero is only valid as the two-byte sequence */
                if ((value == 0) && (trail != 1))
                        return false;
        }

        return true;
}
1786
1787
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoded zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Bytes at or behind end_pos are never read, even if the string does
   not fulfil the assumption above and ends in a 0xc0 byte.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        if (end_pos <= utf_ptr)
                return false;                   /* disallow empty names           */

        while (utf_ptr < end_pos) {
                unsigned char c = (unsigned char) *utf_ptr++;

                if (c < 0x20)
                        return false;               /* disallow control characters    */

                /* 0xc0 0x80 is the two-byte form of zero; only look at the
                   second byte if it still belongs to the string */

                if ((c == 0xc0) && (utf_ptr < end_pos) &&
                        ((unsigned char) *utf_ptr == 0x80))
                        return false;               /* disallow zero                  */
        }

        return true;
}
1815
1816 bool is_valid_name_utf(utf *u)
1817 {
1818         return is_valid_name(u->text, UTF_END(u));
1819 }
1820
1821
1822 /* utf_show ********************************************************************
1823
1824    Writes the utf symbols in the utfhash to stdout and displays the
1825    number of external hash chains grouped according to the chainlength
1826    (for debugging purposes).
1827
1828 *******************************************************************************/
1829
1830 #if !defined(NDEBUG)
1831 void utf_show(void)
1832 {
1833
1834 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1835
1836         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1837         u4 max_chainlength = 0;      /* maximum length of the chains */
1838         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1839         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1840         u4 i;
1841
1842         printf("UTF-HASH:\n");
1843
1844         /* show element of utf-hashtable */
1845
1846         for (i = 0; i < hashtable_utf->size; i++) {
1847                 utf *u = hashtable_utf->ptr[i];
1848
1849                 if (u) {
1850                         printf("SLOT %d: ", (int) i);
1851
1852                         while (u) {
1853                                 printf("'");
1854                                 utf_display_printable_ascii(u);
1855                                 printf("' ");
1856                                 u = u->hashlink;
1857                         }
1858                         printf("\n");
1859                 }
1860         }
1861
1862         printf("UTF-HASH: %d slots for %d entries\n",
1863                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1864
1865         if (hashtable_utf->entries == 0)
1866                 return;
1867
1868         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1869
1870         for (i=0;i<CHAIN_LIMIT;i++)
1871                 chain_count[i]=0;
1872
1873         /* count numbers of hashchains according to their length */
1874         for (i=0; i<hashtable_utf->size; i++) {
1875
1876                 utf *u = (utf*) hashtable_utf->ptr[i];
1877                 u4 chain_length = 0;
1878
1879                 /* determine chainlength */
1880                 while (u) {
1881                         u = u->hashlink;
1882                         chain_length++;
1883                 }
1884
1885                 /* update sum of all chainlengths */
1886                 sum_chainlength+=chain_length;
1887
1888                 /* determine the maximum length of the chains */
1889                 if (chain_length>max_chainlength)
1890                         max_chainlength = chain_length;
1891
1892                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1893                 if (chain_length>=CHAIN_LIMIT) {
1894                         beyond_limit+=chain_length;
1895                         chain_length=CHAIN_LIMIT-1;
1896                 }
1897
1898                 /* update number of hashchains of current length */
1899                 chain_count[chain_length]++;
1900         }
1901
1902         /* display results */
1903         for (i=1;i<CHAIN_LIMIT-1;i++)
1904                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1905
1906         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1907
1908
1909         printf("max. chainlength:%5d\n",max_chainlength);
1910
1911         /* avg. chainlength = sum of chainlengths / number of chains */
1912         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1913 }
1914 #endif /* !defined(NDEBUG) */
1915
1916
1917 /*
1918  * These are local overrides for various environment variables in Emacs.
1919  * Please do not remove this and leave it at the end of the file, where
1920  * Emacs will automagically detect them.
1921  * ---------------------------------------------------------------------
1922  * Local variables:
1923  * mode: c
1924  * indent-tabs-mode: t
1925  * c-basic-offset: 4
1926  * tab-width: 4
1927  * End:
1928  * vim:noexpandtab:sw=4:ts=4:
1929  */