src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007, 2008
   4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
   5
   6    This file is part of CACAO.
   7
   8    This program is free software; you can redistribute it and/or
   9    modify it under the terms of the GNU General Public License as
  10    published by the Free Software Foundation; either version 2, or (at
  11    your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful, but
  14    WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16    General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; if not, write to the Free Software
  20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  21    02110-1301, USA.
  22
  23 */
  24
  25
  26 #include "config.h"
  27
  28 #include <string.h>
  29 #include <assert.h>
  30
  31 #include "vm/types.h"
  32
  33 #include "mm/memory.h"
  34
  35 #include "threads/lock-common.h"
  36
  37 #include "toolbox/hashtable.h"
  38
  39 #include "vm/exceptions.h"
  40
  41 #include "vmcore/options.h"
  42
  43 #if defined(ENABLE_STATISTICS)
  44 # include "vmcore/statistics.h"
  45 #endif
  46
  47 #include "vmcore/utf8.h"
  48
  49
  50 /* global variables ***********************************************************/
  51
  52 /* hashsize must be power of 2 */
  53
  54 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  55
  56 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  57
  58
  59 /* utf-symbols for pointer comparison of frequently used strings **************/
  60
  61 utf *utf_java_lang_Object;
  62
  63 utf *utf_java_lang_Class;
  64 utf *utf_java_lang_ClassLoader;
  65 utf *utf_java_lang_Cloneable;
  66 utf *utf_java_lang_SecurityManager;
  67 utf *utf_java_lang_String;
  68 utf *utf_java_lang_ThreadGroup;
  69 utf *utf_java_lang_ref_SoftReference;
  70 utf *utf_java_lang_ref_WeakReference;
  71 utf *utf_java_lang_ref_PhantomReference;
  72 utf *utf_java_io_Serializable;
  73
  74 utf *utf_java_lang_Throwable;
  75 utf *utf_java_lang_Error;
  76
  77 utf *utf_java_lang_AbstractMethodError;
  78 utf *utf_java_lang_ClassCircularityError;
  79 utf *utf_java_lang_ClassFormatError;
  80 utf *utf_java_lang_ExceptionInInitializerError;
  81 utf *utf_java_lang_IncompatibleClassChangeError;
  82 utf *utf_java_lang_InstantiationError;
  83 utf *utf_java_lang_InternalError;
  84 utf *utf_java_lang_LinkageError;
  85 utf *utf_java_lang_NoClassDefFoundError;
  86 utf *utf_java_lang_NoSuchFieldError;
  87 utf *utf_java_lang_NoSuchMethodError;
  88 utf *utf_java_lang_OutOfMemoryError;
  89 utf *utf_java_lang_UnsatisfiedLinkError;
  90 utf *utf_java_lang_UnsupportedClassVersionError;
  91 utf *utf_java_lang_VerifyError;
  92 utf *utf_java_lang_VirtualMachineError;
  93
  94 utf *utf_java_lang_Exception;
  95
  96 utf *utf_java_lang_ArithmeticException;
  97 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
  98 utf *utf_java_lang_ArrayStoreException;
  99 utf *utf_java_lang_ClassCastException;
 100 utf *utf_java_lang_ClassNotFoundException;
 101 utf *utf_java_lang_CloneNotSupportedException;
 102 utf *utf_java_lang_IllegalAccessException;
 103 utf *utf_java_lang_IllegalArgumentException;
 104 utf *utf_java_lang_IllegalMonitorStateException;
 105 utf *utf_java_lang_InstantiationException;
 106 utf *utf_java_lang_InterruptedException;
 107 utf *utf_java_lang_NegativeArraySizeException;
 108 utf *utf_java_lang_NullPointerException;
 109 utf *utf_java_lang_RuntimeException;
 110 utf *utf_java_lang_StringIndexOutOfBoundsException;
 111
 112 utf *utf_java_lang_reflect_InvocationTargetException;
 113
 114 utf *utf_java_security_PrivilegedActionException;
 115
 116 #if defined(ENABLE_JAVASE)
 117 utf* utf_java_lang_Void;
 118 #endif
 119
 120 utf* utf_java_lang_Boolean;
 121 utf* utf_java_lang_Byte;
 122 utf* utf_java_lang_Character;
 123 utf* utf_java_lang_Short;
 124 utf* utf_java_lang_Integer;
 125 utf* utf_java_lang_Long;
 126 utf* utf_java_lang_Float;
 127 utf* utf_java_lang_Double;
 128
 129 #if defined(ENABLE_JAVASE)
 130 utf *utf_java_lang_StackTraceElement;
 131 utf *utf_java_lang_reflect_Constructor;
 132 utf *utf_java_lang_reflect_Field;
 133 utf *utf_java_lang_reflect_Method;
 134 utf *utf_java_util_Vector;
 135 #endif
 136
 137 utf *utf_InnerClasses;                  /* InnerClasses                       */
 138 utf *utf_ConstantValue;                 /* ConstantValue                      */
 139 utf *utf_Code;                          /* Code                               */
 140 utf *utf_Exceptions;                    /* Exceptions                         */
 141 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 142 utf *utf_SourceFile;                    /* SourceFile                         */
 143
 144 #if defined(ENABLE_JAVASE)
 145 utf *utf_EnclosingMethod;
 146 utf *utf_Signature;
 147 utf *utf_StackMapTable;
 148
 149 #if defined(ENABLE_ANNOTATIONS)
 150 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
 151 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
 152 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
 153 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
 154 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
 155 #endif
 156 #endif
 157
 158 utf *utf_init;                          /* <init>                             */
 159 utf *utf_clinit;                        /* <clinit>                           */
 160 utf *utf_clone;                         /* clone                              */
 161 utf *utf_finalize;                      /* finalize                           */
 162 utf *utf_invoke;
 163 utf *utf_main;
 164 utf *utf_run;                           /* run                                */
 165
 166 utf *utf_add;
 167 utf *utf_remove;
 168 utf *utf_addThread;
 169 utf *utf_removeThread;
 170 utf *utf_put;
 171 utf *utf_get;
 172 utf *utf_uncaughtException;
 173 utf *utf_value;
 174
 175 utf *utf_fillInStackTrace;
 176 utf *utf_findNative;
 177 utf *utf_getSystemClassLoader;
 178 utf *utf_initCause;
 179 utf *utf_loadClass;
 180 utf *utf_loadClassInternal;
 181 utf *utf_printStackTrace;
 182
 183 utf *utf_division_by_zero;
 184
 185 utf *utf_Z;                             /* Z                                  */
 186 utf *utf_B;                             /* B                                  */
 187 utf *utf_C;                             /* C                                  */
 188 utf *utf_S;                             /* S                                  */
 189 utf *utf_I;                             /* I                                  */
 190 utf *utf_J;                             /* J                                  */
 191 utf *utf_F;                             /* F                                  */
 192 utf *utf_D;                             /* D                                  */
 193
 194 utf *utf_void__void;                    /* ()V                                */
 195 utf *utf_boolean__void;                 /* (Z)V                               */
 196 utf *utf_byte__void;                    /* (B)V                               */
 197 utf *utf_char__void;                    /* (C)V                               */
 198 utf *utf_short__void;                   /* (S)V                               */
 199 utf *utf_int__void;                     /* (I)V                               */
 200 utf *utf_long__void;                    /* (J)V                               */
 201 utf *utf_float__void;                   /* (F)V                               */
 202 utf *utf_double__void;                  /* (D)V                               */
 203
 204 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 205 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 206 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 207 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 208 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 209 utf *utf_java_lang_Object__java_lang_Object;
 210 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 211 utf *utf_java_lang_String__java_lang_Class;
 212 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 213 utf *utf_java_lang_Thread_java_lang_Throwable__V;
 214 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
 215 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 216 utf *utf_java_lang_Throwable__java_lang_Throwable;
 217
 218 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 219 utf *utf_null;
 220 utf *array_packagename;
 221
 222
 223 /* utf_init ********************************************************************
 224
 225    Initializes the utf8 subsystem.
 226
 227 *******************************************************************************/
 228
 229 void utf8_init(void)
 230 {
 231         TRACESUBSYSTEMINITIALIZATION("utf8_init");
 232
 233         /* create utf8 hashtable */
 234
 235         hashtable_utf = NEW(hashtable);
 236
 237         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 238
 239 #if defined(ENABLE_STATISTICS)
 240         if (opt_stat)
 241                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 242 #endif
 243
 244         /* create utf-symbols for pointer comparison of frequently used strings */
 245
 246         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 247
 248         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 249         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 250         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 251         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 252         utf_java_lang_String           = utf_new_char("java/lang/String");
 253         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 254
 255         utf_java_lang_ref_SoftReference =
 256                 utf_new_char("java/lang/ref/SoftReference");
 257
 258         utf_java_lang_ref_WeakReference =
 259                 utf_new_char("java/lang/ref/WeakReference");
 260
 261         utf_java_lang_ref_PhantomReference =
 262                 utf_new_char("java/lang/ref/PhantomReference");
 263
 264         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 265
 266         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 267         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 268
 269         utf_java_lang_ClassCircularityError =
 270                 utf_new_char("java/lang/ClassCircularityError");
 271
 272         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 273
 274         utf_java_lang_ExceptionInInitializerError =
 275                 utf_new_char("java/lang/ExceptionInInitializerError");
 276
 277         utf_java_lang_IncompatibleClassChangeError =
 278                 utf_new_char("java/lang/IncompatibleClassChangeError");
 279
 280         utf_java_lang_InstantiationError =
 281                 utf_new_char("java/lang/InstantiationError");
 282
 283         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 284         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 285
 286         utf_java_lang_NoClassDefFoundError =
 287                 utf_new_char("java/lang/NoClassDefFoundError");
 288
 289         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 290
 291         utf_java_lang_UnsatisfiedLinkError =
 292                 utf_new_char("java/lang/UnsatisfiedLinkError");
 293
 294         utf_java_lang_UnsupportedClassVersionError =
 295                 utf_new_char("java/lang/UnsupportedClassVersionError");
 296
 297         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 298
 299         utf_java_lang_VirtualMachineError =
 300                 utf_new_char("java/lang/VirtualMachineError");
 301
 302 #if defined(ENABLE_JAVASE)
 303         utf_java_lang_AbstractMethodError =
 304                 utf_new_char("java/lang/AbstractMethodError");
 305
 306         utf_java_lang_NoSuchFieldError =
 307                 utf_new_char("java/lang/NoSuchFieldError");
 308
 309         utf_java_lang_NoSuchMethodError =
 310                 utf_new_char("java/lang/NoSuchMethodError");
 311 #endif
 312
 313         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 314
 315         utf_java_lang_ArithmeticException =
 316                 utf_new_char("java/lang/ArithmeticException");
 317
 318         utf_java_lang_ArrayIndexOutOfBoundsException =
 319                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 320
 321         utf_java_lang_ArrayStoreException =
 322                 utf_new_char("java/lang/ArrayStoreException");
 323
 324         utf_java_lang_ClassCastException =
 325                 utf_new_char("java/lang/ClassCastException");
 326
 327         utf_java_lang_ClassNotFoundException =
 328                 utf_new_char("java/lang/ClassNotFoundException");
 329
 330         utf_java_lang_CloneNotSupportedException =
 331                 utf_new_char("java/lang/CloneNotSupportedException");
 332
 333         utf_java_lang_IllegalAccessException =
 334                 utf_new_char("java/lang/IllegalAccessException");
 335
 336         utf_java_lang_IllegalArgumentException =
 337                 utf_new_char("java/lang/IllegalArgumentException");
 338
 339         utf_java_lang_IllegalMonitorStateException =
 340                 utf_new_char("java/lang/IllegalMonitorStateException");
 341
 342         utf_java_lang_InstantiationException =
 343                 utf_new_char("java/lang/InstantiationException");
 344
 345         utf_java_lang_InterruptedException =
 346                 utf_new_char("java/lang/InterruptedException");
 347
 348         utf_java_lang_NegativeArraySizeException =
 349                 utf_new_char("java/lang/NegativeArraySizeException");
 350
 351         utf_java_lang_NullPointerException =
 352                 utf_new_char("java/lang/NullPointerException");
 353
 354         utf_java_lang_RuntimeException =
 355                 utf_new_char("java/lang/RuntimeException");
 356
 357         utf_java_lang_StringIndexOutOfBoundsException =
 358                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 359
 360         utf_java_lang_reflect_InvocationTargetException =
 361                 utf_new_char("java/lang/reflect/InvocationTargetException");
 362
 363         utf_java_security_PrivilegedActionException =
 364                 utf_new_char("java/security/PrivilegedActionException");
 365
 366 #if defined(ENABLE_JAVASE)
 367         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 368 #endif
 369
 370         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 371         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 372         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 373         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 374         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 375         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 376         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 377         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 378
 379 #if defined(ENABLE_JAVASE)
 380         utf_java_lang_StackTraceElement =
 381                 utf_new_char("java/lang/StackTraceElement");
 382
 383         utf_java_lang_reflect_Constructor =
 384                 utf_new_char("java/lang/reflect/Constructor");
 385
 386         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 387         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 388         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 389 #endif
 390
 391         utf_InnerClasses               = utf_new_char("InnerClasses");
 392         utf_ConstantValue              = utf_new_char("ConstantValue");
 393         utf_Code                       = utf_new_char("Code");
 394         utf_Exceptions                 = utf_new_char("Exceptions");
 395         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 396         utf_SourceFile                 = utf_new_char("SourceFile");
 397
 398 #if defined(ENABLE_JAVASE)
 399         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 400         utf_Signature                  = utf_new_char("Signature");
 401         utf_StackMapTable              = utf_new_char("StackMapTable");
 402
 403 #if defined(ENABLE_ANNOTATIONS)
 404         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 405         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 406         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 407         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 408         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 409 #endif
 410 #endif
 411
 412         utf_init                           = utf_new_char("<init>");
 413         utf_clinit                         = utf_new_char("<clinit>");
 414         utf_clone                      = utf_new_char("clone");
 415         utf_finalize                   = utf_new_char("finalize");
 416         utf_invoke                     = utf_new_char("invoke");
 417         utf_main                       = utf_new_char("main");
 418         utf_run                        = utf_new_char("run");
 419
 420         utf_add                        = utf_new_char("add");
 421         utf_remove                     = utf_new_char("remove");
 422         utf_addThread                  = utf_new_char("addThread");
 423         utf_removeThread               = utf_new_char("removeThread");
 424         utf_put                        = utf_new_char("put");
 425         utf_get                        = utf_new_char("get");
 426         utf_uncaughtException          = utf_new_char("uncaughtException");
 427         utf_value                      = utf_new_char("value");
 428
 429         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 430         utf_findNative                 = utf_new_char("findNative");
 431         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 432         utf_initCause                  = utf_new_char("initCause");
 433         utf_loadClass                  = utf_new_char("loadClass");
 434         utf_loadClassInternal          = utf_new_char("loadClassInternal");
 435         utf_printStackTrace            = utf_new_char("printStackTrace");
 436
 437         utf_division_by_zero           = utf_new_char("/ by zero");
 438
 439         utf_Z                          = utf_new_char("Z");
 440         utf_B                          = utf_new_char("B");
 441         utf_C                          = utf_new_char("C");
 442         utf_S                          = utf_new_char("S");
 443         utf_I                          = utf_new_char("I");
 444         utf_J                          = utf_new_char("J");
 445         utf_F                          = utf_new_char("F");
 446         utf_D                          = utf_new_char("D");
 447
 448         utf_void__void                 = utf_new_char("()V");
 449         utf_boolean__void              = utf_new_char("(Z)V");
 450         utf_byte__void                 = utf_new_char("(B)V");
 451         utf_char__void                 = utf_new_char("(C)V");
 452         utf_short__void                = utf_new_char("(S)V");
 453         utf_int__void                  = utf_new_char("(I)V");
 454         utf_long__void                 = utf_new_char("(J)V");
 455         utf_float__void                = utf_new_char("(F)V");
 456         utf_double__void               = utf_new_char("(D)V");
 457         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 458         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 459
 460         utf_void__java_lang_ClassLoader =
 461                 utf_new_char("()Ljava/lang/ClassLoader;");
 462
 463         utf_java_lang_ClassLoader_java_lang_String__J =
 464                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 465
 466         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 467
 468         utf_java_lang_Object__java_lang_Object =
 469                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 470
 471         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 472
 473         utf_java_lang_String__java_lang_Class =
 474                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 475
 476         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 477
 478         utf_java_lang_Thread_java_lang_Throwable__V =
 479                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 480
 481         utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
 482                 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
 483
 484         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 485
 486         utf_java_lang_Throwable__java_lang_Throwable =
 487                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 488
 489         utf_null                       = utf_new_char("null");
 490         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 491         array_packagename              = utf_new_char("\t<the array package>");
 492 }
 493
 494
 495 /* utf_hashkey *****************************************************************
 496
 497    The hashkey is computed from the utf-text by using up to 8
 498    characters.  For utf-symbols longer than 15 characters 3 characters
 499    are taken from the beginning and the end, 2 characters are taken
 500    from the middle.
 501
 502 *******************************************************************************/
 503
 504 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 505 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 506
 507 u4 utf_hashkey(const char *text, u4 length)
 508 {
 509         const char *start_pos = text;       /* pointer to utf text                */
 510         u4 a;
 511
 512         switch (length) {
 513         case 0: /* empty string */
 514                 return 0;
 515
 516         case 1: return fbs(0);
 517         case 2: return fbs(0) ^ nbs(3);
 518         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 519         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 520         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 521         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 522         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 523         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 524
 525         case 9:
 526                 a = fbs(0);
 527                 a ^= nbs(1);
 528                 a ^= nbs(2);
 529                 text++;
 530                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 531
 532         case 10:
 533                 a = fbs(0);
 534                 text++;
 535                 a ^= nbs(2);
 536                 a ^= nbs(3);
 537                 a ^= nbs(4);
 538                 text++;
 539                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 540
 541         case 11:
 542                 a = fbs(0);
 543                 text++;
 544                 a ^= nbs(2);
 545                 a ^= nbs(3);
 546                 a ^= nbs(4);
 547                 text++;
 548                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 549
 550         case 12:
 551                 a = fbs(0);
 552                 text += 2;
 553                 a ^= nbs(2);
 554                 a ^= nbs(3);
 555                 text++;
 556                 a ^= nbs(5);
 557                 a ^= nbs(6);
 558                 a ^= nbs(7);
 559                 text++;
 560                 return a ^ nbs(9) ^ nbs(10);
 561
 562         case 13:
 563                 a = fbs(0);
 564                 a ^= nbs(1);
 565                 text++;
 566                 a ^= nbs(3);
 567                 a ^= nbs(4);
 568                 text += 2;
 569                 a ^= nbs(7);
 570                 a ^= nbs(8);
 571                 text += 2;
 572                 return a ^ nbs(9) ^ nbs(10);
 573
 574         case 14:
 575                 a = fbs(0);
 576                 text += 2;
 577                 a ^= nbs(3);
 578                 a ^= nbs(4);
 579                 text += 2;
 580                 a ^= nbs(7);
 581                 a ^= nbs(8);
 582                 text += 2;
 583                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 584
 585         case 15:
 586                 a = fbs(0);
 587                 text += 2;
 588                 a ^= nbs(3);
 589                 a ^= nbs(4);
 590                 text += 2;
 591                 a ^= nbs(7);
 592                 a ^= nbs(8);
 593                 text += 2;
 594                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 595
 596         default:  /* 3 characters from beginning */
 597                 a = fbs(0);
 598                 text += 2;
 599                 a ^= nbs(3);
 600                 a ^= nbs(4);
 601
 602                 /* 2 characters from middle */
 603                 text = start_pos + (length / 2);
 604                 a ^= fbs(5);
 605                 text += 2;
 606                 a ^= nbs(6);
 607
 608                 /* 3 characters from end */
 609                 text = start_pos + length - 4;
 610
 611                 a ^= fbs(7);
 612                 text++;
 613
 614                 return a ^ nbs(10) ^ nbs(11);
 615     }
 616 }
 617
 618 /* utf_full_hashkey ************************************************************
 619
 620    This function computes a hash value using all bytes in the string.
 621
 622    The algorithm is the "One-at-a-time" algorithm as published
 623    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 624
 625 *******************************************************************************/
 626
 627 u4 utf_full_hashkey(const char *text, u4 length)
 628 {
 629         register const unsigned char *p = (const unsigned char *) text;
 630         register u4 hash;
 631         register u4 i;
 632
 633         hash = 0;
 634         for (i=length; i--;)
 635         {
 636             hash += *p++;
 637             hash += (hash << 10);
 638             hash ^= (hash >> 6);
 639         }
 640         hash += (hash << 3);
 641         hash ^= (hash >> 11);
 642         hash += (hash << 15);
 643
 644         return hash;
 645 }
 646
 647 /* unicode_hashkey *************************************************************
 648
 649    Compute the hashkey of a unicode string.
 650
 651 *******************************************************************************/
 652
 653 u4 unicode_hashkey(u2 *text, u2 len)
 654 {
 655         return utf_hashkey((char *) text, len);
 656 }
 657
 658
 659 /* utf_new *********************************************************************
 660
 661    Creates a new utf-symbol, the text of the symbol is passed as a
 662    u1-array. The function searches the utf-hashtable for a utf-symbol
 663    with this text. On success the element returned, otherwise a new
 664    hashtable element is created.
 665
 666    If the number of entries in the hashtable exceeds twice the size of
 667    the hashtable slots a reorganization of the hashtable is done and
 668    the utf symbols are copied to a new hashtable with doubled size.
 669
 670 *******************************************************************************/
 671
 672 utf *utf_new(const char *text, u2 length)
 673 {
 674         u4 key;                             /* hashkey computed from utf-text     */
 675         u4 slot;                            /* slot in hashtable                  */
 676         utf *u;                             /* hashtable element                  */
 677         u2 i;
 678
 679         LOCK_MONITOR_ENTER(hashtable_utf->header);
 680
 681 #if defined(ENABLE_STATISTICS)
 682         if (opt_stat)
 683                 count_utf_new++;
 684 #endif
 685
 686         key  = utf_hashkey(text, length);
 687         slot = key & (hashtable_utf->size - 1);
 688         u    = hashtable_utf->ptr[slot];
 689
 690         /* search external hash chain for utf-symbol */
 691
 692         while (u) {
 693                 if (u->blength == length) {
 694                         /* compare text of hashtable elements */
 695
 696                         for (i = 0; i < length; i++)
 697                                 if (text[i] != u->text[i])
 698                                         goto nomatch;
 699
 700 #if defined(ENABLE_STATISTICS)
 701                         if (opt_stat)
 702                                 count_utf_new_found++;
 703 #endif
 704
 705                         /* symbol found in hashtable */
 706
 707                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 708
 709                         return u;
 710                 }
 711
 712         nomatch:
 713                 u = u->hashlink; /* next element in external chain */
 714         }
 715
 716         /* location in hashtable found, create new utf element */
 717
 718         u = NEW(utf);
 719
 720         u->blength  = length;               /* length in bytes of utfstring       */
 721         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 722         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 723
 724         memcpy(u->text, text, length);      /* copy utf-text                      */
 725         u->text[length] = '\0';
 726
 727 #if defined(ENABLE_STATISTICS)
 728         if (opt_stat)
 729                 count_utf_len += sizeof(utf) + length + 1;
 730 #endif
 731
 732         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 733         hashtable_utf->entries++;           /* update number of entries           */
 734
 735         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 736
 737         /* reorganization of hashtable, average length of the external
 738            chains is approx. 2 */
 739
 740                 hashtable *newhash;                              /* the new hashtable */
 741                 u4         i;
 742                 utf       *u;
 743                 utf       *nextu;
 744                 u4         slot;
 745
 746                 /* create new hashtable, double the size */
 747
 748                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 749
 750 #if defined(ENABLE_STATISTICS)
 751                 if (opt_stat)
 752                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 753 #endif
 754
 755                 /* transfer elements to new hashtable */
 756
 757                 for (i = 0; i < hashtable_utf->size; i++) {
 758                         u = hashtable_utf->ptr[i];
 759
 760                         while (u) {
 761                                 nextu = u->hashlink;
 762                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 763
 764                                 u->hashlink = (utf *) newhash->ptr[slot];
 765                                 newhash->ptr[slot] = u;
 766
 767                                 /* follow link in external hash chain */
 768
 769                                 u = nextu;
 770                         }
 771                 }
 772
 773                 /* dispose old table */
 774
 775                 hashtable_free(hashtable_utf);
 776
 777                 hashtable_utf = newhash;
 778         }
 779
 780         LOCK_MONITOR_EXIT(hashtable_utf->header);
 781
 782         return u;
 783 }
 784
 785
 786 /* utf_new_u2 ******************************************************************
 787
 788    Make utf symbol from u2 array, if isclassname is true '.' is
 789    replaced by '/'.
 790
 791 *******************************************************************************/
 792
 793 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 794 {
 795         char *buffer;                   /* memory buffer for  unicode characters  */
 796         char *pos;                      /* pointer to current position in buffer  */
 797         u4 left;                        /* unicode characters left                */
 798         u4 buflength;                   /* utf length in bytes of the u2 array    */
 799         utf *result;                    /* resulting utf-string                   */
 800         int i;
 801
 802         /* determine utf length in bytes and allocate memory */
 803
 804         buflength = u2_utflength(unicode_pos, unicode_length);
 805         buffer    = MNEW(char, buflength);
 806
 807         left = buflength;
 808         pos  = buffer;
 809
 810         for (i = 0; i++ < unicode_length; unicode_pos++) {
 811                 /* next unicode character */
 812                 u2 c = *unicode_pos;
 813
 814                 if ((c != 0) && (c < 0x80)) {
 815                         /* 1 character */
 816                         left--;
 817                 if ((int) left < 0) break;
 818                         /* convert classname */
 819                         if (isclassname && c == '.')
 820                                 *pos++ = '/';
 821                         else
 822                                 *pos++ = (char) c;
 823
 824                 } else if (c < 0x800) {
 825                         /* 2 characters */
 826                 unsigned char high = c >> 6;
 827                 unsigned char low  = c & 0x3F;
 828                         left = left - 2;
 829                 if ((int) left < 0) break;
 830                 *pos++ = high | 0xC0;
 831                 *pos++ = low  | 0x80;
 832
 833                 } else {
 834                 /* 3 characters */
 835                 char low  = c & 0x3f;
 836                 char mid  = (c >> 6) & 0x3F;
 837                 char high = c >> 12;
 838                         left = left - 3;
 839                 if ((int) left < 0) break;
 840                 *pos++ = high | 0xE0;
 841                 *pos++ = mid  | 0x80;
 842                 *pos++ = low  | 0x80;
 843                 }
 844         }
 845
 846         /* insert utf-string into symbol-table */
 847         result = utf_new(buffer,buflength);
 848
 849         MFREE(buffer, char, buflength);
 850
 851         return result;
 852 }
 853
 854
 855 /* utf_new_char ****************************************************************
 856
 857    Creates a new utf symbol, the text for this symbol is passed as a
 858    c-string ( = char* ).
 859
 860 *******************************************************************************/
 861
 862 utf *utf_new_char(const char *text)
 863 {
 864         return utf_new(text, strlen(text));
 865 }
 866
 867
 868 /* utf_new_char_classname ******************************************************
 869
 870    Creates a new utf symbol, the text for this symbol is passed as a
 871    c-string ( = char* ) "." characters are going to be replaced by
 872    "/". Since the above function is used often, this is a separte
 873    function, instead of an if.
 874
 875 *******************************************************************************/
 876
 877 utf *utf_new_char_classname(const char *text)
 878 {
 879         if (strchr(text, '.')) {
 880                 char *txt = strdup(text);
 881                 char *end = txt + strlen(txt);
 882                 char *c;
 883                 utf *tmpRes;
 884
 885                 for (c = txt; c < end; c++)
 886                         if (*c == '.') *c = '/';
 887
 888                 tmpRes = utf_new(txt, strlen(txt));
 889                 FREE(txt, 0);
 890
 891                 return tmpRes;
 892
 893         } else
 894                 return utf_new(text, strlen(text));
 895 }
 896
 897
 898 /* utf_nextu2 ******************************************************************
 899
 900    Read the next unicode character from the utf string and increment
 901    the utf-string pointer accordingly.
 902
 903    CAUTION: This function is unsafe for input that was not checked
 904             by is_valid_utf!
 905
 906 *******************************************************************************/
 907
 908 u2 utf_nextu2(char **utf_ptr)
 909 {
 910     /* uncompressed unicode character */
 911     u2 unicode_char = 0;
 912     /* current position in utf text */
 913     unsigned char *utf = (unsigned char *) (*utf_ptr);
 914     /* bytes representing the unicode character */
 915     unsigned char ch1, ch2, ch3;
 916     /* number of bytes used to represent the unicode character */
 917     int len = 0;
 918
 919     switch ((ch1 = utf[0]) >> 4) {
 920         default: /* 1 byte */
 921                 (*utf_ptr)++;
 922                 return (u2) ch1;
 923         case 0xC:
 924         case 0xD: /* 2 bytes */
 925                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 926                         unsigned char high = ch1 & 0x1F;
 927                         unsigned char low  = ch2 & 0x3F;
 928                         unicode_char = (high << 6) + low;
 929                         len = 2;
 930                 }
 931                 break;
 932
 933         case 0xE: /* 2 or 3 bytes */
 934                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 935                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 936                                 unsigned char low  = ch3 & 0x3f;
 937                                 unsigned char mid  = ch2 & 0x3f;
 938                                 unsigned char high = ch1 & 0x0f;
 939                                 unicode_char = (((high << 6) + mid) << 6) + low;
 940                                 len = 3;
 941                         } else
 942                                 len = 2;
 943                 }
 944                 break;
 945     }
 946
 947     /* update position in utf-text */
 948     *utf_ptr = (char *) (utf + len);
 949
 950     return unicode_char;
 951 }
 952
 953
 954 /* utf_bytes *******************************************************************
 955
 956    Determine number of bytes (aka. octets) in the utf string.
 957
 958    IN:
 959       u............utf string
 960
 961    OUT:
 962       The number of octets of this utf string.
 963           There is _no_ terminating zero included in this count.
 964
 965 *******************************************************************************/
 966
 967 u4 utf_bytes(utf *u)
 968 {
 969         return u->blength;
 970 }
 971
 972
 973 /* utf_get_number_of_u2s_for_buffer ********************************************
 974
 975    Determine number of UTF-16 u2s in the given UTF-8 buffer
 976
 977    CAUTION: This function is unsafe for input that was not checked
 978             by is_valid_utf!
 979
 980    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 981    to an array of u2s (UTF-16) and want to know how many of them you will get.
 982    All other uses of this function are probably wrong.
 983
 984    IN:
 985       buffer........points to first char in buffer
 986           blength.......number of _bytes_ in the buffer
 987
 988    OUT:
 989       the number of u2s needed to hold this string in UTF-16 encoding.
 990           There is _no_ terminating zero included in this count.
 991
 992    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 993    exception.
 994
 995 *******************************************************************************/
 996
 997 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 998 {
 999         const char *endpos;                 /* points behind utf string           */
1000         const char *utf_ptr;                /* current position in utf text       */
1001         u4 len = 0;                         /* number of unicode characters       */
1002
1003         utf_ptr = buffer;
1004         endpos = utf_ptr + blength;
1005
1006         while (utf_ptr < endpos) {
1007                 len++;
1008                 /* next unicode character */
1009                 utf_nextu2((char **)&utf_ptr);
1010         }
1011
1012         assert(utf_ptr == endpos);
1013
1014         return len;
1015 }
1016
1017
1018 /* utf_get_number_of_u2s *******************************************************
1019
1020    Determine number of UTF-16 u2s in the utf string.
1021
1022    CAUTION: This function is unsafe for input that was not checked
1023             by is_valid_utf!
1024
1025    CAUTION: Use this function *only* when you want to convert a utf string
1026    to an array of u2s and want to know how many of them you will get.
1027    All other uses of this function are probably wrong.
1028
1029    IN:
1030       u............utf string
1031
1032    OUT:
1033       the number of u2s needed to hold this string in UTF-16 encoding.
1034           There is _no_ terminating zero included in this count.
1035           XXX 0 if a NullPointerException has been thrown (see below)
1036
1037 *******************************************************************************/
1038
1039 u4 utf_get_number_of_u2s(utf *u)
1040 {
1041         char *endpos;                       /* points behind utf string           */
1042         char *utf_ptr;                      /* current position in utf text       */
1043         u4 len = 0;                         /* number of unicode characters       */
1044
1045         /* XXX this is probably not checked by most callers! Review this after */
1046         /* the invalid uses of this function have been eliminated */
1047         if (u == NULL) {
1048                 exceptions_throw_nullpointerexception();
1049                 return 0;
1050         }
1051
1052         endpos = UTF_END(u);
1053         utf_ptr = u->text;
1054
1055         while (utf_ptr < endpos) {
1056                 len++;
1057                 /* next unicode character */
1058                 utf_nextu2(&utf_ptr);
1059         }
1060
1061         if (utf_ptr != endpos) {
1062                 /* string ended abruptly */
1063                 exceptions_throw_internalerror("Illegal utf8 string");
1064                 return 0;
1065         }
1066
1067         return len;
1068 }
1069
1070
/* utf8_safe_number_of_u2s *****************************************************

   Determine the number of UTF-16 u2s needed for decoding the given UTF-8
   string.  (For invalid UTF-8 the U+fffd replacement character will be
   counted.)

   This function is safe even for invalid UTF-8 strings.

   Every lead byte counts as one u2; the continuation bytes that follow
   it (up to the number its bit pattern announces) are skipped.  A
   complete 4-byte sequence whose value lies in 0x10000..0x10FFFF counts
   as two u2s (surrogate pair).  If a sequence of three or more bytes
   would extend past text + nbytes, counting stops there and the
   truncated sequence counts as one u2.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).

   RETURN VALUE:
      the number of u2s needed to hold this string in UTF-16 encoding.
      There is _no_ terminating zero included in this count.

*******************************************************************************/

s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
	const unsigned char *cur;           /* current position in text           */
	const unsigned char *limit;         /* text + nbytes                      */
	s4 lead;                            /* first byte of current sequence     */
	s4 expected;                        /* continuation bytes announced       */
	s4 seen;                            /* continuation bytes actually found  */
	s4 value;                           /* decoded 4-byte value               */
	s4 count;                           /* number of u2s so far               */

	assert(text);
	assert(nbytes >= 0);

	count = 0;
	cur   = (const unsigned char *) text;
	limit = cur + nbytes;

	/* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */

	while ((lead = *cur++) != 0) {

		if ((lead & 0x80) == 0) {
			/* ASCII character, common case */

			count++;
			continue;
		}

		/* highest bit set, non-ASCII character: how many continuation
		   bytes does the lead byte announce? */

		if ((lead & 0xe0) == 0xc0)
			expected = 1;               /* 110..... 10......                  */
		else if ((lead & 0xf0) == 0xe0)
			expected = 2;               /* 1110.... 10...... 10......         */
		else if ((lead & 0xf8) == 0xf0)
			expected = 3;               /* 11110... and three continuations   */
		else if ((lead & 0xfc) == 0xf8)
			expected = 4;               /* invalid 5-byte form                */
		else if ((lead & 0xfe) == 0xfc)
			expected = 5;               /* invalid 6-byte form                */
		else
			expected = 0;               /* invalid lead byte                  */

		/* Sequences of three or more bytes must fit into the nbytes
		   given; the 2-byte form is not checked here (its single
		   continuation read hits the terminating zero at worst). */

		if ((expected >= 2) && (cur + expected > limit))
			return count + 1;           /* invalid, stop here                 */

		/* skip continuation bytes, stopping at the first byte that is
		   not of the form 10...... */

		for (seen = 0; (seen < expected) && ((*cur & 0xc0) == 0x80); seen++)
			cur++;

		if ((expected == 3) && (seen == 3)) {
			/* valid 4-byte UTF-8? */

			value = ((lead    & 0x07) << 18)
				  | ((cur[-3] & 0x3f) << 12)
				  | ((cur[-2] & 0x3f) <<  6)
				  | ((cur[-1] & 0x3f)      );

			if ((value > 0xFFFF) && (value <= 0x10FFFF))
				count++;                /* we need surrogates                 */
		}

		count++;
	}

	return count;
}
1207
1208
/* utf8_safe_convert_to_u2s ****************************************************

   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.

   This function is safe even for invalid UTF-8 strings.

   Complete 2-, 3- and 4-byte sequences are decoded; a 4-byte value
   above 0xFFFF is written as a surrogate pair, one above 0x10FFFF as
   the replacement character.  Incomplete sequences, the 5- and 6-byte
   forms and stray bytes each yield one replacement character.  If a
   sequence of three or more bytes would extend past text + nbytes, one
   replacement character is written and conversion stops.

   No terminating zero is written to the buffer.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).
      buffer........a preallocated array of u2s to receive the decoded
                    string. Use utf8_safe_number_of_u2s to get the
                    required number of u2s for allocating this.

*******************************************************************************/

#define UNICODE_REPLACEMENT  0xfffd

void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
	const unsigned char *cur;           /* current position in text           */
	const unsigned char *limit;         /* text + nbytes                      */
	s4 lead;                            /* first byte of current sequence     */
	s4 expected;                        /* continuation bytes announced       */
	s4 seen;                            /* continuation bytes actually found  */
	s4 value;                           /* decoded 4-byte value               */

	assert(text);
	assert(nbytes >= 0);

	cur   = (const unsigned char *) text;
	limit = cur + nbytes;

	/* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */

	while ((lead = *cur++) != 0) {

		if ((lead & 0x80) == 0) {
			/* ASCII character, common case */

			*buffer++ = lead;
			continue;
		}

		/* highest bit set, non-ASCII character: how many continuation
		   bytes does the lead byte announce? */

		if ((lead & 0xe0) == 0xc0)
			expected = 1;               /* 110..... 10......                  */
		else if ((lead & 0xf0) == 0xe0)
			expected = 2;               /* 1110.... 10...... 10......         */
		else if ((lead & 0xf8) == 0xf0)
			expected = 3;               /* 11110... and three continuations   */
		else if ((lead & 0xfc) == 0xf8)
			expected = 4;               /* invalid 5-byte form                */
		else if ((lead & 0xfe) == 0xfc)
			expected = 5;               /* invalid 6-byte form                */
		else
			expected = 0;               /* invalid lead byte                  */

		/* Sequences of three or more bytes must fit into the nbytes
		   given; the 2-byte form is not checked here (its single
		   continuation read hits the terminating zero at worst). */

		if ((expected >= 2) && (cur + expected > limit)) {
			*buffer++ = UNICODE_REPLACEMENT;
			return;
		}

		/* consume continuation bytes, stopping at the first byte that is
		   not of the form 10...... */

		for (seen = 0; (seen < expected) && ((*cur & 0xc0) == 0x80); seen++)
			cur++;

		/* incomplete sequence, 5-/6-byte form or stray byte */

		if ((seen < expected) || (expected < 1) || (expected > 3)) {
			*buffer++ = UNICODE_REPLACEMENT;
			continue;
		}

		if (expected == 1) {
			/* valid 2-byte UTF-8 */

			*buffer++ = ((lead    & 0x1f) << 6)
					  | ((cur[-1] & 0x3f)     );
		}
		else if (expected == 2) {
			/* valid 3-byte UTF-8 */

			*buffer++ = ((lead    & 0x0f) << 12)
					  | ((cur[-2] & 0x3f) <<  6)
					  | ((cur[-1] & 0x3f)      );
		}
		else {
			/* valid 4-byte UTF-8? */

			value = ((lead    & 0x07) << 18)
				  | ((cur[-3] & 0x3f) << 12)
				  | ((cur[-2] & 0x3f) <<  6)
				  | ((cur[-1] & 0x3f)      );

			if (value > 0x10FFFF) {
				*buffer++ = UNICODE_REPLACEMENT;
			}
			else if (value > 0xFFFF) {
				/* we need surrogates */

				*buffer++ = 0xd800 | ((value >> 10) - 0x40);
				*buffer++ = 0xdc00 | (value & 0x03ff);
			}
			else {
				*buffer++ = value;      /* 16bit suffice                      */
			}
		}
	}
}
1372
1373
/* u2_utflength ****************************************************************

   Return the number of bytes needed to store a u2 array as utf text.

   Per u2: 1 byte for 0x0001..0x007f, 2 bytes for 0x0000 and
   0x0080..0x07ff, 3 bytes for everything else.

   IN:
      text.........points to the first u2
      u2_length....number of u2s

   RETURN VALUE:
      the utf length in bytes

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
	u2 *end;                            /* points behind the last u2          */
	u4  nbytes;                         /* utf length in bytes                */

	end    = text + u2_length;
	nbytes = 0;

	while (text != end) {
		/* next unicode character */

		u2 ch = *text++;

		/* determine bytes required to store unicode character as utf */

		if (ch >= 0x800)
			nbytes += 3;
		else if ((ch == 0) || (ch >= 0x80))
			nbytes += 2;
		else
			nbytes += 1;
	}

	return nbytes;
}
1401
1402
1403 /* utf_copy ********************************************************************
1404
1405    Copy the given utf string byte-for-byte to a buffer.
1406
1407    IN:
1408       buffer.......the buffer
1409           u............the utf string
1410
1411 *******************************************************************************/
1412
1413 void utf_copy(char *buffer, utf *u)
1414 {
1415         /* our utf strings are zero-terminated (done by utf_new) */
1416         MCOPY(buffer, u->text, char, u->blength + 1);
1417 }
1418
1419
1420 /* utf_cat *********************************************************************
1421
1422    Append the given utf string byte-for-byte to a buffer.
1423
1424    IN:
1425       buffer.......the buffer
1426           u............the utf string
1427
1428 *******************************************************************************/
1429
1430 void utf_cat(char *buffer, utf *u)
1431 {
1432         /* our utf strings are zero-terminated (done by utf_new) */
1433         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1434 }
1435
1436
1437 /* utf_copy_classname **********************************************************
1438
1439    Copy the given utf classname byte-for-byte to a buffer.
1440    '/' is replaced by '.'
1441
1442    IN:
1443       buffer.......the buffer
1444           u............the utf string
1445
1446 *******************************************************************************/
1447
1448 void utf_copy_classname(char *buffer, utf *u)
1449 {
1450         char *bufptr;
1451         char *srcptr;
1452         char *endptr;
1453         char ch;
1454
1455         bufptr = buffer;
1456         srcptr = u->text;
1457         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1458
1459         while (srcptr != endptr) {
1460                 ch = *srcptr++;
1461                 if (ch == '/')
1462                         ch = '.';
1463                 *bufptr++ = ch;
1464         }
1465 }
1466
1467
1468 /* utf_cat *********************************************************************
1469
1470    Append the given utf classname byte-for-byte to a buffer.
1471    '/' is replaced by '.'
1472
1473    IN:
1474       buffer.......the buffer
1475           u............the utf string
1476
1477 *******************************************************************************/
1478
1479 void utf_cat_classname(char *buffer, utf *u)
1480 {
1481         utf_copy_classname(buffer + strlen(buffer), u);
1482 }
1483
1484 /* utf_display_printable_ascii *************************************************
1485
1486    Write utf symbol to stdout (for debugging purposes).
1487    Non-printable and non-ASCII characters are printed as '?'.
1488
1489 *******************************************************************************/
1490
1491 void utf_display_printable_ascii(utf *u)
1492 {
1493         char *endpos;                       /* points behind utf string           */
1494         char *utf_ptr;                      /* current position in utf text       */
1495
1496         if (u == NULL) {
1497                 printf("NULL");
1498                 fflush(stdout);
1499                 return;
1500         }
1501
1502         endpos = UTF_END(u);
1503         utf_ptr = u->text;
1504
1505         while (utf_ptr < endpos) {
1506                 /* read next unicode character */
1507
1508                 u2 c = utf_nextu2(&utf_ptr);
1509
1510                 if ((c >= 32) && (c <= 127))
1511                         printf("%c", c);
1512                 else
1513                         printf("?");
1514         }
1515
1516         fflush(stdout);
1517 }
1518
1519
1520 /* utf_display_printable_ascii_classname ***************************************
1521
1522    Write utf symbol to stdout with `/' converted to `.' (for debugging
1523    purposes).
1524    Non-printable and non-ASCII characters are printed as '?'.
1525
1526 *******************************************************************************/
1527
1528 void utf_display_printable_ascii_classname(utf *u)
1529 {
1530         char *endpos;                       /* points behind utf string           */
1531         char *utf_ptr;                      /* current position in utf text       */
1532
1533         if (u == NULL) {
1534                 printf("NULL");
1535                 fflush(stdout);
1536                 return;
1537         }
1538
1539         endpos = UTF_END(u);
1540         utf_ptr = u->text;
1541
1542         while (utf_ptr < endpos) {
1543                 /* read next unicode character */
1544
1545                 u2 c = utf_nextu2(&utf_ptr);
1546
1547                 if (c == '/')
1548                         c = '.';
1549
1550                 if ((c >= 32) && (c <= 127))
1551                         printf("%c", c);
1552                 else
1553                         printf("?");
1554         }
1555
1556         fflush(stdout);
1557 }
1558
1559
1560 /* utf_sprint_convert_to_latin1 ************************************************
1561
1562    Write utf symbol into c-string (for debugging purposes).
1563    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1564    invalid results.
1565
1566 *******************************************************************************/
1567
1568 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1569 {
1570         char *endpos;                       /* points behind utf string           */
1571         char *utf_ptr;                      /* current position in utf text       */
1572         u2 pos = 0;                         /* position in c-string               */
1573
1574         if (!u) {
1575                 strcpy(buffer, "NULL");
1576                 return;
1577         }
1578
1579         endpos = UTF_END(u);
1580         utf_ptr = u->text;
1581
1582         while (utf_ptr < endpos)
1583                 /* copy next unicode character */
1584                 buffer[pos++] = utf_nextu2(&utf_ptr);
1585
1586         /* terminate string */
1587         buffer[pos] = '\0';
1588 }
1589
1590
1591 /* utf_sprint_convert_to_latin1_classname **************************************
1592
1593    Write utf symbol into c-string with `/' converted to `.' (for debugging
1594    purposes).
1595    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1596    invalid results.
1597
1598 *******************************************************************************/
1599
1600 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1601 {
1602         char *endpos;                       /* points behind utf string           */
1603         char *utf_ptr;                      /* current position in utf text       */
1604         u2 pos = 0;                         /* position in c-string               */
1605
1606         if (!u) {
1607                 strcpy(buffer, "NULL");
1608                 return;
1609         }
1610
1611         endpos = UTF_END(u);
1612         utf_ptr = u->text;
1613
1614         while (utf_ptr < endpos) {
1615                 /* copy next unicode character */
1616                 u2 c = utf_nextu2(&utf_ptr);
1617                 if (c == '/') c = '.';
1618                 buffer[pos++] = c;
1619         }
1620
1621         /* terminate string */
1622         buffer[pos] = '\0';
1623 }
1624
1625
1626 /* utf_strcat_convert_to_latin1 ************************************************
1627
1628    Like libc strcat, but uses an utf8 string.
1629    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1630    invalid results.
1631
1632 *******************************************************************************/
1633
1634 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1635 {
1636         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1637 }
1638
1639
1640 /* utf_strcat_convert_to_latin1_classname **************************************
1641
1642    Like libc strcat, but uses an utf8 string.
1643    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1644    invalid results.
1645
1646 *******************************************************************************/
1647
1648 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1649 {
1650         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1651 }
1652
1653
1654 /* utf_fprint_printable_ascii **************************************************
1655
1656    Write utf symbol into file.
1657    Non-printable and non-ASCII characters are printed as '?'.
1658
1659 *******************************************************************************/
1660
1661 void utf_fprint_printable_ascii(FILE *file, utf *u)
1662 {
1663         char *endpos;                       /* points behind utf string           */
1664         char *utf_ptr;                      /* current position in utf text       */
1665
1666         if (!u)
1667                 return;
1668
1669         endpos = UTF_END(u);
1670         utf_ptr = u->text;
1671
1672         while (utf_ptr < endpos) {
1673                 /* read next unicode character */
1674                 u2 c = utf_nextu2(&utf_ptr);
1675
1676                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1677                 else fprintf(file, "?");
1678         }
1679 }
1680
1681
1682 /* utf_fprint_printable_ascii_classname ****************************************
1683
1684    Write utf symbol into file with `/' converted to `.'.
1685    Non-printable and non-ASCII characters are printed as '?'.
1686
1687 *******************************************************************************/
1688
1689 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1690 {
1691         char *endpos;                       /* points behind utf string           */
1692         char *utf_ptr;                      /* current position in utf text       */
1693
1694     if (!u)
1695                 return;
1696
1697         endpos = UTF_END(u);
1698         utf_ptr = u->text;
1699
1700         while (utf_ptr < endpos) {
1701                 /* read next unicode character */
1702                 u2 c = utf_nextu2(&utf_ptr);
1703                 if (c == '/') c = '.';
1704
1705                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1706                 else fprintf(file, "?");
1707         }
1708 }
1709
1710
/* is_valid_utf ****************************************************************

   Check whether the byte range [utf_ptr, end_pos) is an acceptable
   UTF-8 string.

   Accepted are:
      - single bytes 0x01..0x7f,
      - two-byte sequences   110x xxxx  10xx xxxx,
      - three-byte sequences 1110 xxxx  10xx xxxx  10xx xxxx.

   Rejected are:
      - a range whose end lies before its start,
      - the byte 0x00,
      - every other leading byte (stray continuation bytes and the
        leading bytes of longer sequences -- Java limitation),
      - truncated sequences and wrong continuation bytes,
      - a three-byte sequence that decodes to zero (zero is only
        accepted in its two-byte form -- Java special).

   Deliberately NOT rejected:
      - overlong encodings (Sun Java seems to allow them),
      - surrogates 0xd800..0xdfff (they seem to be allowed in Java
        classfiles),
      - the codepoints 0xfffe and 0xffff (even these seem to be allowed).

   utf_ptr...points to first character
   end_pos...points after last character

   RETURN VALUE:
      true.........the string passed all checks
      false........otherwise

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur;           /* next byte to examine               */
	const unsigned char *limit;         /* points behind the last byte        */

	if (end_pos < utf_ptr)
		return false;

	cur   = (const unsigned char *) utf_ptr;
	limit = (const unsigned char *) end_pos;

	while (cur < limit) {
		unsigned char lead = *cur++;
		unsigned long codepoint;        /* decoded value of the sequence      */
		int           ntrail;           /* number of continuation bytes       */
		int           k;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* plain ASCII                        */

		/* classify the leading byte by its high nibble */

		switch (lead & 0xf0) {
		case 0xc0:
		case 0xd0:                      /* 110x xxxx                          */
			ntrail    = 1;
			codepoint = lead & 0x1f;
			break;

		case 0xe0:                      /* 1110 xxxx                          */
			ntrail    = 2;
			codepoint = lead & 0x0f;
			break;

		default:                        /* 10xx xxxx or 1111 xxxx             */
			return false;
		}

		/* all continuation bytes have to lie inside the range */

		if ((limit - cur) < ntrail)
			return false;

		for (k = 0; k < ntrail; k++) {
			unsigned char cont = *cur++;

			if ((cont & 0xc0) != 0x80)  /* 10xx xxxx                          */
				return false;

			codepoint = (codepoint << 6) | (cont & 0x3f);
		}

		/* zero may only appear as the two-byte sequence */

		if ((codepoint == 0) && (ntrail != 1))
			return false;
	}

	return true;
}
1776
1777
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters, i.e. bytes below 0x20, and the two-byte encoding
   0xc0 0x80 of the zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are inspected.

   utf_ptr...points to first character
   end_pos...points after last character

   RETURN VALUE:
      true.........the name is acceptable
      false........the name is empty or contains a disallowed character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names               */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters        */

		/* Disallow zero in its two-byte form. The second byte is only
		   looked at if it still belongs to the string, so that a 0xc0
		   at the very end does not cause a read behind end_pos. */

		if ((c == 0xc0) && (utf_ptr < end_pos)
			&& ((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1805
1806 bool is_valid_name_utf(utf *u)
1807 {
1808         return is_valid_name(u->text, UTF_END(u));
1809 }
1810
1811
1812 /* utf_show ********************************************************************
1813
1814    Writes the utf symbols in the utfhash to stdout and displays the
1815    number of external hash chains grouped according to the chainlength
1816    (for debugging purposes).
1817
1818 *******************************************************************************/
1819
1820 #if !defined(NDEBUG)
1821 void utf_show(void)
1822 {
1823
1824 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1825
1826         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1827         u4 max_chainlength = 0;      /* maximum length of the chains */
1828         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1829         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1830         u4 i;
1831
1832         printf("UTF-HASH:\n");
1833
1834         /* show element of utf-hashtable */
1835
1836         for (i = 0; i < hashtable_utf->size; i++) {
1837                 utf *u = hashtable_utf->ptr[i];
1838
1839                 if (u) {
1840                         printf("SLOT %d: ", (int) i);
1841
1842                         while (u) {
1843                                 printf("'");
1844                                 utf_display_printable_ascii(u);
1845                                 printf("' ");
1846                                 u = u->hashlink;
1847                         }
1848                         printf("\n");
1849                 }
1850         }
1851
1852         printf("UTF-HASH: %d slots for %d entries\n",
1853                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1854
1855         if (hashtable_utf->entries == 0)
1856                 return;
1857
1858         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1859
1860         for (i=0;i<CHAIN_LIMIT;i++)
1861                 chain_count[i]=0;
1862
1863         /* count numbers of hashchains according to their length */
1864         for (i=0; i<hashtable_utf->size; i++) {
1865
1866                 utf *u = (utf*) hashtable_utf->ptr[i];
1867                 u4 chain_length = 0;
1868
1869                 /* determine chainlength */
1870                 while (u) {
1871                         u = u->hashlink;
1872                         chain_length++;
1873                 }
1874
1875                 /* update sum of all chainlengths */
1876                 sum_chainlength+=chain_length;
1877
1878                 /* determine the maximum length of the chains */
1879                 if (chain_length>max_chainlength)
1880                         max_chainlength = chain_length;
1881
1882                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1883                 if (chain_length>=CHAIN_LIMIT) {
1884                         beyond_limit+=chain_length;
1885                         chain_length=CHAIN_LIMIT-1;
1886                 }
1887
1888                 /* update number of hashchains of current length */
1889                 chain_count[chain_length]++;
1890         }
1891
1892         /* display results */
1893         for (i=1;i<CHAIN_LIMIT-1;i++)
1894                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1895
1896         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1897
1898
1899         printf("max. chainlength:%5d\n",max_chainlength);
1900
1901         /* avg. chainlength = sum of chainlengths / number of chains */
1902         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1903 }
1904 #endif /* !defined(NDEBUG) */
1905
1906
1907 /*
1908  * These are local overrides for various environment variables in Emacs.
1909  * Please do not remove this and leave it at the end of the file, where
1910  * Emacs will automagically detect them.
1911  * ---------------------------------------------------------------------
1912  * Local variables:
1913  * mode: c
1914  * indent-tabs-mode: t
1915  * c-basic-offset: 4
1916  * tab-width: 4
1917  * End:
1918  * vim:noexpandtab:sw=4:ts=4:
1919  */