src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007, 2008
   4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
   5
   6    This file is part of CACAO.
   7
   8    This program is free software; you can redistribute it and/or
   9    modify it under the terms of the GNU General Public License as
  10    published by the Free Software Foundation; either version 2, or (at
  11    your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful, but
  14    WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16    General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; if not, write to the Free Software
  20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  21    02110-1301, USA.
  22
  23 */
  24
  25
  26 #include "config.h"
  27
  28 #include <string.h>
  29 #include <assert.h>
  30
  31 #include "vm/types.h"
  32
  33 #include "mm/memory.h"
  34
  35 #include "threads/lock-common.h"
  36
  37 #include "toolbox/hashtable.h"
  38
  39 #include "vm/exceptions.h"
  40
  41 #include "vmcore/options.h"
  42
  43 #if defined(ENABLE_STATISTICS)
  44 # include "vmcore/statistics.h"
  45 #endif
  46
  47 #include "vmcore/utf8.h"
  48
  49
/* global variables ***********************************************************/

/* The hashtable size must be a power of 2: utf_new selects a slot by
   masking the hash key with (size - 1). */

#define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */

hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  57
  58
  59 /* utf-symbols for pointer comparison of frequently used strings **************/
  60
  61 utf *utf_java_lang_Object;
  62
  63 utf *utf_java_lang_Class;
  64 utf *utf_java_lang_ClassLoader;
  65 utf *utf_java_lang_Cloneable;
  66 utf *utf_java_lang_SecurityManager;
  67 utf *utf_java_lang_String;
  68 utf *utf_java_lang_ThreadGroup;
  69 utf *utf_java_lang_ref_SoftReference;
  70 utf *utf_java_lang_ref_WeakReference;
  71 utf *utf_java_lang_ref_PhantomReference;
  72 utf *utf_java_io_Serializable;
  73
  74 utf *utf_java_lang_Throwable;
  75 utf *utf_java_lang_Error;
  76
  77 utf *utf_java_lang_AbstractMethodError;
  78 utf *utf_java_lang_ClassCircularityError;
  79 utf *utf_java_lang_ClassFormatError;
  80 utf *utf_java_lang_ExceptionInInitializerError;
  81 utf *utf_java_lang_IncompatibleClassChangeError;
  82 utf *utf_java_lang_InstantiationError;
  83 utf *utf_java_lang_InternalError;
  84 utf *utf_java_lang_LinkageError;
  85 utf *utf_java_lang_NoClassDefFoundError;
  86 utf *utf_java_lang_NoSuchFieldError;
  87 utf *utf_java_lang_NoSuchMethodError;
  88 utf *utf_java_lang_OutOfMemoryError;
  89 utf *utf_java_lang_UnsatisfiedLinkError;
  90 utf *utf_java_lang_UnsupportedClassVersionError;
  91 utf *utf_java_lang_VerifyError;
  92 utf *utf_java_lang_VirtualMachineError;
  93
  94 utf *utf_java_lang_Exception;
  95
  96 utf *utf_java_lang_ArithmeticException;
  97 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
  98 utf *utf_java_lang_ArrayStoreException;
  99 utf *utf_java_lang_ClassCastException;
 100 utf *utf_java_lang_ClassNotFoundException;
 101 utf *utf_java_lang_CloneNotSupportedException;
 102 utf *utf_java_lang_IllegalAccessException;
 103 utf *utf_java_lang_IllegalArgumentException;
 104 utf *utf_java_lang_IllegalMonitorStateException;
 105 utf *utf_java_lang_InstantiationException;
 106 utf *utf_java_lang_InterruptedException;
 107 utf *utf_java_lang_NegativeArraySizeException;
 108 utf *utf_java_lang_NullPointerException;
 109 utf *utf_java_lang_StringIndexOutOfBoundsException;
 110
 111 utf *utf_java_lang_reflect_InvocationTargetException;
 112
 113 utf *utf_java_security_PrivilegedActionException;
 114
 115 #if defined(ENABLE_JAVASE)
 116 utf* utf_java_lang_Void;
 117 #endif
 118
 119 utf* utf_java_lang_Boolean;
 120 utf* utf_java_lang_Byte;
 121 utf* utf_java_lang_Character;
 122 utf* utf_java_lang_Short;
 123 utf* utf_java_lang_Integer;
 124 utf* utf_java_lang_Long;
 125 utf* utf_java_lang_Float;
 126 utf* utf_java_lang_Double;
 127
 128 #if defined(ENABLE_JAVASE)
 129 utf *utf_java_lang_StackTraceElement;
 130 utf *utf_java_lang_reflect_Constructor;
 131 utf *utf_java_lang_reflect_Field;
 132 utf *utf_java_lang_reflect_Method;
 133 utf *utf_java_util_Vector;
 134 #endif
 135
utf *utf_InnerClasses;                  /* InnerClasses                       */
utf *utf_ConstantValue;                 /* ConstantValue                      */
utf *utf_Code;                          /* Code                               */
utf *utf_Exceptions;                    /* Exceptions                         */
utf *utf_LineNumberTable;               /* LineNumberTable                    */
utf *utf_SourceFile;                    /* SourceFile                         */

#if defined(ENABLE_JAVASE)
utf *utf_EnclosingMethod;               /* EnclosingMethod                    */
utf *utf_Signature;                     /* Signature                          */
utf *utf_StackMapTable;                 /* StackMapTable                      */

#if defined(ENABLE_ANNOTATIONS)
utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
#endif
#endif

utf *utf_init;                          /* <init>                             */
utf *utf_clinit;                        /* <clinit>                           */
utf *utf_clone;                         /* clone                              */
utf *utf_finalize;                      /* finalize                           */
utf *utf_invoke;                        /* invoke                             */
utf *utf_main;                          /* main                               */
utf *utf_run;                           /* run                                */

utf *utf_add;                           /* add                                */
utf *utf_remove;                        /* remove                             */
utf *utf_addThread;                     /* addThread                          */
utf *utf_removeThread;                  /* removeThread                       */
utf *utf_put;                           /* put                                */
utf *utf_get;                           /* get                                */
utf *utf_uncaughtException;             /* uncaughtException                  */
utf *utf_value;                         /* value                              */

utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
utf *utf_findNative;                    /* findNative                         */
utf *utf_getSystemClassLoader;          /* getSystemClassLoader               */
utf *utf_initCause;                     /* initCause                          */
utf *utf_loadClass;                     /* loadClass                          */
utf *utf_loadClassInternal;             /* loadClassInternal                  */
utf *utf_printStackTrace;               /* printStackTrace                    */

utf *utf_division_by_zero;              /* "/ by zero"                        */

utf *utf_Z;                             /* Z                                  */
utf *utf_B;                             /* B                                  */
utf *utf_C;                             /* C                                  */
utf *utf_S;                             /* S                                  */
utf *utf_I;                             /* I                                  */
utf *utf_J;                             /* J                                  */
utf *utf_F;                             /* F                                  */
utf *utf_D;                             /* D                                  */

utf *utf_void__void;                    /* ()V                                */
utf *utf_boolean__void;                 /* (Z)V                               */
utf *utf_byte__void;                    /* (B)V                               */
utf *utf_char__void;                    /* (C)V                               */
utf *utf_short__void;                   /* (S)V                               */
utf *utf_int__void;                     /* (I)V                               */
utf *utf_long__void;                    /* (J)V                               */
utf *utf_float__void;                   /* (F)V                               */
utf *utf_double__void;                  /* (D)V                               */

utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
/* (Ljava/lang/ClassLoader;Ljava/lang/String;)J */
utf *utf_java_lang_ClassLoader_java_lang_String__J;
utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
/* (Ljava/lang/Object;)Ljava/lang/Object; */
utf *utf_java_lang_Object__java_lang_Object;
utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
/* (Ljava/lang/String;)Ljava/lang/Class; */
utf *utf_java_lang_String__java_lang_Class;
utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
/* (Ljava/lang/Thread;Ljava/lang/Throwable;)V */
utf *utf_java_lang_Thread_java_lang_Throwable__V;
/* (Ljava/lang/ThreadGroup;Ljava/lang/String;)V */
utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
/* (Ljava/lang/Throwable;)Ljava/lang/Throwable; */
utf *utf_java_lang_Throwable__java_lang_Throwable;

utf *utf_not_named_yet;                 /* special name for unnamed classes   */
utf *utf_null;                          /* null                               */
utf *array_packagename;                 /* "\t<the array package>"            */
 220
 221
 222 /* utf_init ********************************************************************
 223
 224    Initializes the utf8 subsystem.
 225
 226 *******************************************************************************/
 227
 228 void utf8_init(void)
 229 {
 230         TRACESUBSYSTEMINITIALIZATION("utf8_init");
 231
 232         /* create utf8 hashtable */
 233
 234         hashtable_utf = NEW(hashtable);
 235
 236         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 237
 238 #if defined(ENABLE_STATISTICS)
 239         if (opt_stat)
 240                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 241 #endif
 242
 243         /* create utf-symbols for pointer comparison of frequently used strings */
 244
 245         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 246
 247         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 248         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 249         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 250         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 251         utf_java_lang_String           = utf_new_char("java/lang/String");
 252         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 253
 254         utf_java_lang_ref_SoftReference =
 255                 utf_new_char("java/lang/ref/SoftReference");
 256
 257         utf_java_lang_ref_WeakReference =
 258                 utf_new_char("java/lang/ref/WeakReference");
 259
 260         utf_java_lang_ref_PhantomReference =
 261                 utf_new_char("java/lang/ref/PhantomReference");
 262
 263         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 264
 265         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 266         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 267
 268         utf_java_lang_ClassCircularityError =
 269                 utf_new_char("java/lang/ClassCircularityError");
 270
 271         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 272
 273         utf_java_lang_ExceptionInInitializerError =
 274                 utf_new_char("java/lang/ExceptionInInitializerError");
 275
 276         utf_java_lang_IncompatibleClassChangeError =
 277                 utf_new_char("java/lang/IncompatibleClassChangeError");
 278
 279         utf_java_lang_InstantiationError =
 280                 utf_new_char("java/lang/InstantiationError");
 281
 282         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 283         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 284
 285         utf_java_lang_NoClassDefFoundError =
 286                 utf_new_char("java/lang/NoClassDefFoundError");
 287
 288         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 289
 290         utf_java_lang_UnsatisfiedLinkError =
 291                 utf_new_char("java/lang/UnsatisfiedLinkError");
 292
 293         utf_java_lang_UnsupportedClassVersionError =
 294                 utf_new_char("java/lang/UnsupportedClassVersionError");
 295
 296         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 297
 298         utf_java_lang_VirtualMachineError =
 299                 utf_new_char("java/lang/VirtualMachineError");
 300
 301 #if defined(ENABLE_JAVASE)
 302         utf_java_lang_AbstractMethodError =
 303                 utf_new_char("java/lang/AbstractMethodError");
 304
 305         utf_java_lang_NoSuchFieldError =
 306                 utf_new_char("java/lang/NoSuchFieldError");
 307
 308         utf_java_lang_NoSuchMethodError =
 309                 utf_new_char("java/lang/NoSuchMethodError");
 310 #endif
 311
 312         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 313
 314         utf_java_lang_ArithmeticException =
 315                 utf_new_char("java/lang/ArithmeticException");
 316
 317         utf_java_lang_ArrayIndexOutOfBoundsException =
 318                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 319
 320         utf_java_lang_ArrayStoreException =
 321                 utf_new_char("java/lang/ArrayStoreException");
 322
 323         utf_java_lang_ClassCastException =
 324                 utf_new_char("java/lang/ClassCastException");
 325
 326         utf_java_lang_ClassNotFoundException =
 327                 utf_new_char("java/lang/ClassNotFoundException");
 328
 329         utf_java_lang_CloneNotSupportedException =
 330                 utf_new_char("java/lang/CloneNotSupportedException");
 331
 332         utf_java_lang_IllegalAccessException =
 333                 utf_new_char("java/lang/IllegalAccessException");
 334
 335         utf_java_lang_IllegalArgumentException =
 336                 utf_new_char("java/lang/IllegalArgumentException");
 337
 338         utf_java_lang_IllegalMonitorStateException =
 339                 utf_new_char("java/lang/IllegalMonitorStateException");
 340
 341         utf_java_lang_InstantiationException =
 342                 utf_new_char("java/lang/InstantiationException");
 343
 344         utf_java_lang_InterruptedException =
 345                 utf_new_char("java/lang/InterruptedException");
 346
 347         utf_java_lang_NegativeArraySizeException =
 348                 utf_new_char("java/lang/NegativeArraySizeException");
 349
 350         utf_java_lang_NullPointerException =
 351                 utf_new_char("java/lang/NullPointerException");
 352
 353         utf_java_lang_StringIndexOutOfBoundsException =
 354                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 355
 356         utf_java_lang_reflect_InvocationTargetException =
 357                 utf_new_char("java/lang/reflect/InvocationTargetException");
 358
 359         utf_java_security_PrivilegedActionException =
 360                 utf_new_char("java/security/PrivilegedActionException");
 361
 362 #if defined(ENABLE_JAVASE)
 363         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 364 #endif
 365
 366         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 367         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 368         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 369         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 370         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 371         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 372         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 373         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 374
 375 #if defined(ENABLE_JAVASE)
 376         utf_java_lang_StackTraceElement =
 377                 utf_new_char("java/lang/StackTraceElement");
 378
 379         utf_java_lang_reflect_Constructor =
 380                 utf_new_char("java/lang/reflect/Constructor");
 381
 382         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 383         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 384         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 385 #endif
 386
 387         utf_InnerClasses               = utf_new_char("InnerClasses");
 388         utf_ConstantValue              = utf_new_char("ConstantValue");
 389         utf_Code                       = utf_new_char("Code");
 390         utf_Exceptions                 = utf_new_char("Exceptions");
 391         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 392         utf_SourceFile                 = utf_new_char("SourceFile");
 393
 394 #if defined(ENABLE_JAVASE)
 395         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 396         utf_Signature                  = utf_new_char("Signature");
 397         utf_StackMapTable              = utf_new_char("StackMapTable");
 398
 399 #if defined(ENABLE_ANNOTATIONS)
 400         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 401         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 402         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 403         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 404         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 405 #endif
 406 #endif
 407
 408         utf_init                           = utf_new_char("<init>");
 409         utf_clinit                         = utf_new_char("<clinit>");
 410         utf_clone                      = utf_new_char("clone");
 411         utf_finalize                   = utf_new_char("finalize");
 412         utf_invoke                     = utf_new_char("invoke");
 413         utf_main                       = utf_new_char("main");
 414         utf_run                        = utf_new_char("run");
 415
 416         utf_add                        = utf_new_char("add");
 417         utf_remove                     = utf_new_char("remove");
 418         utf_addThread                  = utf_new_char("addThread");
 419         utf_removeThread               = utf_new_char("removeThread");
 420         utf_put                        = utf_new_char("put");
 421         utf_get                        = utf_new_char("get");
 422         utf_uncaughtException          = utf_new_char("uncaughtException");
 423         utf_value                      = utf_new_char("value");
 424
 425         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 426         utf_findNative                 = utf_new_char("findNative");
 427         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 428         utf_initCause                  = utf_new_char("initCause");
 429         utf_loadClass                  = utf_new_char("loadClass");
 430         utf_loadClassInternal          = utf_new_char("loadClassInternal");
 431         utf_printStackTrace            = utf_new_char("printStackTrace");
 432
 433         utf_division_by_zero           = utf_new_char("/ by zero");
 434
 435         utf_Z                          = utf_new_char("Z");
 436         utf_B                          = utf_new_char("B");
 437         utf_C                          = utf_new_char("C");
 438         utf_S                          = utf_new_char("S");
 439         utf_I                          = utf_new_char("I");
 440         utf_J                          = utf_new_char("J");
 441         utf_F                          = utf_new_char("F");
 442         utf_D                          = utf_new_char("D");
 443
 444         utf_void__void                 = utf_new_char("()V");
 445         utf_boolean__void              = utf_new_char("(Z)V");
 446         utf_byte__void                 = utf_new_char("(B)V");
 447         utf_char__void                 = utf_new_char("(C)V");
 448         utf_short__void                = utf_new_char("(S)V");
 449         utf_int__void                  = utf_new_char("(I)V");
 450         utf_long__void                 = utf_new_char("(J)V");
 451         utf_float__void                = utf_new_char("(F)V");
 452         utf_double__void               = utf_new_char("(D)V");
 453         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 454         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 455
 456         utf_void__java_lang_ClassLoader =
 457                 utf_new_char("()Ljava/lang/ClassLoader;");
 458
 459         utf_java_lang_ClassLoader_java_lang_String__J =
 460                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 461
 462         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 463
 464         utf_java_lang_Object__java_lang_Object =
 465                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 466
 467         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 468
 469         utf_java_lang_String__java_lang_Class =
 470                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 471
 472         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 473
 474         utf_java_lang_Thread_java_lang_Throwable__V =
 475                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 476
 477         utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
 478                 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
 479
 480         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 481
 482         utf_java_lang_Throwable__java_lang_Throwable =
 483                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 484
 485         utf_null                       = utf_new_char("null");
 486         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 487         array_packagename              = utf_new_char("\t<the array package>");
 488 }
 489
 490
 491 /* utf_hashkey *****************************************************************
 492
 493    The hashkey is computed from the utf-text by using up to 8
 494    characters.  For utf-symbols longer than 15 characters 3 characters
 495    are taken from the beginning and the end, 2 characters are taken
 496    from the middle.
 497
 498 *******************************************************************************/
 499
 500 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 501 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 502
 503 u4 utf_hashkey(const char *text, u4 length)
 504 {
 505         const char *start_pos = text;       /* pointer to utf text                */
 506         u4 a;
 507
 508         switch (length) {
 509         case 0: /* empty string */
 510                 return 0;
 511
 512         case 1: return fbs(0);
 513         case 2: return fbs(0) ^ nbs(3);
 514         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 515         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 516         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 517         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 518         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 519         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 520
 521         case 9:
 522                 a = fbs(0);
 523                 a ^= nbs(1);
 524                 a ^= nbs(2);
 525                 text++;
 526                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 527
 528         case 10:
 529                 a = fbs(0);
 530                 text++;
 531                 a ^= nbs(2);
 532                 a ^= nbs(3);
 533                 a ^= nbs(4);
 534                 text++;
 535                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 536
 537         case 11:
 538                 a = fbs(0);
 539                 text++;
 540                 a ^= nbs(2);
 541                 a ^= nbs(3);
 542                 a ^= nbs(4);
 543                 text++;
 544                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 545
 546         case 12:
 547                 a = fbs(0);
 548                 text += 2;
 549                 a ^= nbs(2);
 550                 a ^= nbs(3);
 551                 text++;
 552                 a ^= nbs(5);
 553                 a ^= nbs(6);
 554                 a ^= nbs(7);
 555                 text++;
 556                 return a ^ nbs(9) ^ nbs(10);
 557
 558         case 13:
 559                 a = fbs(0);
 560                 a ^= nbs(1);
 561                 text++;
 562                 a ^= nbs(3);
 563                 a ^= nbs(4);
 564                 text += 2;
 565                 a ^= nbs(7);
 566                 a ^= nbs(8);
 567                 text += 2;
 568                 return a ^ nbs(9) ^ nbs(10);
 569
 570         case 14:
 571                 a = fbs(0);
 572                 text += 2;
 573                 a ^= nbs(3);
 574                 a ^= nbs(4);
 575                 text += 2;
 576                 a ^= nbs(7);
 577                 a ^= nbs(8);
 578                 text += 2;
 579                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 580
 581         case 15:
 582                 a = fbs(0);
 583                 text += 2;
 584                 a ^= nbs(3);
 585                 a ^= nbs(4);
 586                 text += 2;
 587                 a ^= nbs(7);
 588                 a ^= nbs(8);
 589                 text += 2;
 590                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 591
 592         default:  /* 3 characters from beginning */
 593                 a = fbs(0);
 594                 text += 2;
 595                 a ^= nbs(3);
 596                 a ^= nbs(4);
 597
 598                 /* 2 characters from middle */
 599                 text = start_pos + (length / 2);
 600                 a ^= fbs(5);
 601                 text += 2;
 602                 a ^= nbs(6);
 603
 604                 /* 3 characters from end */
 605                 text = start_pos + length - 4;
 606
 607                 a ^= fbs(7);
 608                 text++;
 609
 610                 return a ^ nbs(10) ^ nbs(11);
 611     }
 612 }
 613
 614 /* utf_full_hashkey ************************************************************
 615
 616    This function computes a hash value using all bytes in the string.
 617
 618    The algorithm is the "One-at-a-time" algorithm as published
 619    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 620
 621 *******************************************************************************/
 622
 623 u4 utf_full_hashkey(const char *text, u4 length)
 624 {
 625         register const unsigned char *p = (const unsigned char *) text;
 626         register u4 hash;
 627         register u4 i;
 628
 629         hash = 0;
 630         for (i=length; i--;)
 631         {
 632             hash += *p++;
 633             hash += (hash << 10);
 634             hash ^= (hash >> 6);
 635         }
 636         hash += (hash << 3);
 637         hash ^= (hash >> 11);
 638         hash += (hash << 15);
 639
 640         return hash;
 641 }
 642
 643 /* unicode_hashkey *************************************************************
 644
 645    Compute the hashkey of a unicode string.
 646
 647 *******************************************************************************/
 648
 649 u4 unicode_hashkey(u2 *text, u2 len)
 650 {
 651         return utf_hashkey((char *) text, len);
 652 }
 653
 654
 655 /* utf_new *********************************************************************
 656
 657    Creates a new utf-symbol, the text of the symbol is passed as a
 658    u1-array. The function searches the utf-hashtable for a utf-symbol
 659    with this text. On success the element returned, otherwise a new
 660    hashtable element is created.
 661
 662    If the number of entries in the hashtable exceeds twice the size of
 663    the hashtable slots a reorganization of the hashtable is done and
 664    the utf symbols are copied to a new hashtable with doubled size.
 665
 666 *******************************************************************************/
 667
 668 utf *utf_new(const char *text, u2 length)
 669 {
 670         u4 key;                             /* hashkey computed from utf-text     */
 671         u4 slot;                            /* slot in hashtable                  */
 672         utf *u;                             /* hashtable element                  */
 673         u2 i;
 674
 675         LOCK_MONITOR_ENTER(hashtable_utf->header);
 676
 677 #if defined(ENABLE_STATISTICS)
 678         if (opt_stat)
 679                 count_utf_new++;
 680 #endif
 681
 682         key  = utf_hashkey(text, length);
 683         slot = key & (hashtable_utf->size - 1);
 684         u    = hashtable_utf->ptr[slot];
 685
 686         /* search external hash chain for utf-symbol */
 687
 688         while (u) {
 689                 if (u->blength == length) {
 690                         /* compare text of hashtable elements */
 691
 692                         for (i = 0; i < length; i++)
 693                                 if (text[i] != u->text[i])
 694                                         goto nomatch;
 695
 696 #if defined(ENABLE_STATISTICS)
 697                         if (opt_stat)
 698                                 count_utf_new_found++;
 699 #endif
 700
 701                         /* symbol found in hashtable */
 702
 703                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 704
 705                         return u;
 706                 }
 707
 708         nomatch:
 709                 u = u->hashlink; /* next element in external chain */
 710         }
 711
 712         /* location in hashtable found, create new utf element */
 713
 714         u = NEW(utf);
 715
 716         u->blength  = length;               /* length in bytes of utfstring       */
 717         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 718         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 719
 720         memcpy(u->text, text, length);      /* copy utf-text                      */
 721         u->text[length] = '\0';
 722
 723 #if defined(ENABLE_STATISTICS)
 724         if (opt_stat)
 725                 count_utf_len += sizeof(utf) + length + 1;
 726 #endif
 727
 728         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 729         hashtable_utf->entries++;           /* update number of entries           */
 730
 731         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 732
 733         /* reorganization of hashtable, average length of the external
 734            chains is approx. 2 */
 735
 736                 hashtable *newhash;                              /* the new hashtable */
 737                 u4         i;
 738                 utf       *u;
 739                 utf       *nextu;
 740                 u4         slot;
 741
 742                 /* create new hashtable, double the size */
 743
 744                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 745
 746 #if defined(ENABLE_STATISTICS)
 747                 if (opt_stat)
 748                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 749 #endif
 750
 751                 /* transfer elements to new hashtable */
 752
 753                 for (i = 0; i < hashtable_utf->size; i++) {
 754                         u = hashtable_utf->ptr[i];
 755
 756                         while (u) {
 757                                 nextu = u->hashlink;
 758                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 759
 760                                 u->hashlink = (utf *) newhash->ptr[slot];
 761                                 newhash->ptr[slot] = u;
 762
 763                                 /* follow link in external hash chain */
 764
 765                                 u = nextu;
 766                         }
 767                 }
 768
 769                 /* dispose old table */
 770
 771                 hashtable_free(hashtable_utf);
 772
 773                 hashtable_utf = newhash;
 774         }
 775
 776         LOCK_MONITOR_EXIT(hashtable_utf->header);
 777
 778         return u;
 779 }
 780
 781
 782 /* utf_new_u2 ******************************************************************
 783
 784    Make utf symbol from u2 array, if isclassname is true '.' is
 785    replaced by '/'.
 786
 787 *******************************************************************************/
 788
 789 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 790 {
 791         char *buffer;                   /* memory buffer for  unicode characters  */
 792         char *pos;                      /* pointer to current position in buffer  */
 793         u4 left;                        /* unicode characters left                */
 794         u4 buflength;                   /* utf length in bytes of the u2 array    */
 795         utf *result;                    /* resulting utf-string                   */
 796         int i;
 797
 798         /* determine utf length in bytes and allocate memory */
 799
 800         buflength = u2_utflength(unicode_pos, unicode_length);
 801         buffer    = MNEW(char, buflength);
 802
 803         left = buflength;
 804         pos  = buffer;
 805
 806         for (i = 0; i++ < unicode_length; unicode_pos++) {
 807                 /* next unicode character */
 808                 u2 c = *unicode_pos;
 809
 810                 if ((c != 0) && (c < 0x80)) {
 811                         /* 1 character */
 812                         left--;
 813                 if ((int) left < 0) break;
 814                         /* convert classname */
 815                         if (isclassname && c == '.')
 816                                 *pos++ = '/';
 817                         else
 818                                 *pos++ = (char) c;
 819
 820                 } else if (c < 0x800) {
 821                         /* 2 characters */
 822                 unsigned char high = c >> 6;
 823                 unsigned char low  = c & 0x3F;
 824                         left = left - 2;
 825                 if ((int) left < 0) break;
 826                 *pos++ = high | 0xC0;
 827                 *pos++ = low  | 0x80;
 828
 829                 } else {
 830                 /* 3 characters */
 831                 char low  = c & 0x3f;
 832                 char mid  = (c >> 6) & 0x3F;
 833                 char high = c >> 12;
 834                         left = left - 3;
 835                 if ((int) left < 0) break;
 836                 *pos++ = high | 0xE0;
 837                 *pos++ = mid  | 0x80;
 838                 *pos++ = low  | 0x80;
 839                 }
 840         }
 841
 842         /* insert utf-string into symbol-table */
 843         result = utf_new(buffer,buflength);
 844
 845         MFREE(buffer, char, buflength);
 846
 847         return result;
 848 }
 849
 850
 851 /* utf_new_char ****************************************************************
 852
 853    Creates a new utf symbol, the text for this symbol is passed as a
 854    c-string ( = char* ).
 855
 856 *******************************************************************************/
 857
 858 utf *utf_new_char(const char *text)
 859 {
 860         return utf_new(text, strlen(text));
 861 }
 862
 863
 864 /* utf_new_char_classname ******************************************************
 865
 866    Creates a new utf symbol, the text for this symbol is passed as a
 867    c-string ( = char* ) "." characters are going to be replaced by
 868    "/". Since the above function is used often, this is a separte
 869    function, instead of an if.
 870
 871 *******************************************************************************/
 872
 873 utf *utf_new_char_classname(const char *text)
 874 {
 875         if (strchr(text, '.')) {
 876                 char *txt = strdup(text);
 877                 char *end = txt + strlen(txt);
 878                 char *c;
 879                 utf *tmpRes;
 880
 881                 for (c = txt; c < end; c++)
 882                         if (*c == '.') *c = '/';
 883
 884                 tmpRes = utf_new(txt, strlen(txt));
 885                 FREE(txt, 0);
 886
 887                 return tmpRes;
 888
 889         } else
 890                 return utf_new(text, strlen(text));
 891 }
 892
 893
 894 /* utf_nextu2 ******************************************************************
 895
 896    Read the next unicode character from the utf string and increment
 897    the utf-string pointer accordingly.
 898
 899    CAUTION: This function is unsafe for input that was not checked
 900             by is_valid_utf!
 901
 902 *******************************************************************************/
 903
 904 u2 utf_nextu2(char **utf_ptr)
 905 {
 906     /* uncompressed unicode character */
 907     u2 unicode_char = 0;
 908     /* current position in utf text */
 909     unsigned char *utf = (unsigned char *) (*utf_ptr);
 910     /* bytes representing the unicode character */
 911     unsigned char ch1, ch2, ch3;
 912     /* number of bytes used to represent the unicode character */
 913     int len = 0;
 914
 915     switch ((ch1 = utf[0]) >> 4) {
 916         default: /* 1 byte */
 917                 (*utf_ptr)++;
 918                 return (u2) ch1;
 919         case 0xC:
 920         case 0xD: /* 2 bytes */
 921                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 922                         unsigned char high = ch1 & 0x1F;
 923                         unsigned char low  = ch2 & 0x3F;
 924                         unicode_char = (high << 6) + low;
 925                         len = 2;
 926                 }
 927                 break;
 928
 929         case 0xE: /* 2 or 3 bytes */
 930                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 931                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 932                                 unsigned char low  = ch3 & 0x3f;
 933                                 unsigned char mid  = ch2 & 0x3f;
 934                                 unsigned char high = ch1 & 0x0f;
 935                                 unicode_char = (((high << 6) + mid) << 6) + low;
 936                                 len = 3;
 937                         } else
 938                                 len = 2;
 939                 }
 940                 break;
 941     }
 942
 943     /* update position in utf-text */
 944     *utf_ptr = (char *) (utf + len);
 945
 946     return unicode_char;
 947 }
 948
 949
 950 /* utf_bytes *******************************************************************
 951
 952    Determine number of bytes (aka. octets) in the utf string.
 953
 954    IN:
 955       u............utf string
 956
 957    OUT:
 958       The number of octets of this utf string.
 959           There is _no_ terminating zero included in this count.
 960
 961 *******************************************************************************/
 962
 963 u4 utf_bytes(utf *u)
 964 {
 965         return u->blength;
 966 }
 967
 968
 969 /* utf_get_number_of_u2s_for_buffer ********************************************
 970
 971    Determine number of UTF-16 u2s in the given UTF-8 buffer
 972
 973    CAUTION: This function is unsafe for input that was not checked
 974             by is_valid_utf!
 975
 976    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 977    to an array of u2s (UTF-16) and want to know how many of them you will get.
 978    All other uses of this function are probably wrong.
 979
 980    IN:
 981       buffer........points to first char in buffer
 982           blength.......number of _bytes_ in the buffer
 983
 984    OUT:
 985       the number of u2s needed to hold this string in UTF-16 encoding.
 986           There is _no_ terminating zero included in this count.
 987
 988    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 989    exception.
 990
 991 *******************************************************************************/
 992
 993 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 994 {
 995         const char *endpos;                 /* points behind utf string           */
 996         const char *utf_ptr;                /* current position in utf text       */
 997         u4 len = 0;                         /* number of unicode characters       */
 998
 999         utf_ptr = buffer;
1000         endpos = utf_ptr + blength;
1001
1002         while (utf_ptr < endpos) {
1003                 len++;
1004                 /* next unicode character */
1005                 utf_nextu2((char **)&utf_ptr);
1006         }
1007
1008         assert(utf_ptr == endpos);
1009
1010         return len;
1011 }
1012
1013
1014 /* utf_get_number_of_u2s *******************************************************
1015
1016    Determine number of UTF-16 u2s in the utf string.
1017
1018    CAUTION: This function is unsafe for input that was not checked
1019             by is_valid_utf!
1020
1021    CAUTION: Use this function *only* when you want to convert a utf string
1022    to an array of u2s and want to know how many of them you will get.
1023    All other uses of this function are probably wrong.
1024
1025    IN:
1026       u............utf string
1027
1028    OUT:
1029       the number of u2s needed to hold this string in UTF-16 encoding.
1030           There is _no_ terminating zero included in this count.
1031           XXX 0 if a NullPointerException has been thrown (see below)
1032
1033 *******************************************************************************/
1034
1035 u4 utf_get_number_of_u2s(utf *u)
1036 {
1037         char *endpos;                       /* points behind utf string           */
1038         char *utf_ptr;                      /* current position in utf text       */
1039         u4 len = 0;                         /* number of unicode characters       */
1040
1041         /* XXX this is probably not checked by most callers! Review this after */
1042         /* the invalid uses of this function have been eliminated */
1043         if (u == NULL) {
1044                 exceptions_throw_nullpointerexception();
1045                 return 0;
1046         }
1047
1048         endpos = UTF_END(u);
1049         utf_ptr = u->text;
1050
1051         while (utf_ptr < endpos) {
1052                 len++;
1053                 /* next unicode character */
1054                 utf_nextu2(&utf_ptr);
1055         }
1056
1057         if (utf_ptr != endpos) {
1058                 /* string ended abruptly */
1059                 exceptions_throw_internalerror("Illegal utf8 string");
1060                 return 0;
1061         }
1062
1063         return len;
1064 }
1065
1066
1067 /* utf8_safe_number_of_u2s *****************************************************
1068
1069    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1070    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1071
1072    This function is safe even for invalid UTF-8 strings.
1073
1074    IN:
1075       text..........zero-terminated(!) UTF-8 string (may be invalid)
1076                         must NOT be NULL
1077           nbytes........strlen(text). (This is needed to completely emulate
1078                         the RI).
1079
1080    OUT:
1081       the number of u2s needed to hold this string in UTF-16 encoding.
1082           There is _no_ terminating zero included in this count.
1083
1084 *******************************************************************************/
1085
1086 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1087         register const unsigned char *t;
1088         register s4 byte;
1089         register s4 len;
1090         register const unsigned char *tlimit;
1091         s4 byte1;
1092         s4 byte2;
1093         s4 byte3;
1094         s4 value;
1095         s4 skip;
1096
1097         assert(text);
1098         assert(nbytes >= 0);
1099
1100         len = 0;
1101         t = (const unsigned char *) text;
1102         tlimit = t + nbytes;
1103
1104         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1105
1106         while (1) {
1107                 byte = *t++;
1108
1109                 if (byte & 0x80) {
1110                         /* highest bit set, non-ASCII character */
1111
1112                         if ((byte & 0xe0) == 0xc0) {
1113                                 /* 2-byte: should be 110..... 10...... ? */
1114
1115                                 if ((*t++ & 0xc0) == 0x80)
1116                                         ; /* valid 2-byte */
1117                                 else
1118                                         t--; /* invalid */
1119                         }
1120                         else if ((byte & 0xf0) == 0xe0) {
1121                                 /* 3-byte: should be 1110.... 10...... 10...... */
1122                                 /*                            ^t                */
1123
1124                                 if (t + 2 > tlimit)
1125                                         return len + 1; /* invalid, stop here */
1126
1127                                 if ((*t++ & 0xc0) == 0x80) {
1128                                         if ((*t++ & 0xc0) == 0x80)
1129                                                 ; /* valid 3-byte */
1130                                         else
1131                                                 t--; /* invalid */
1132                                 }
1133                                 else
1134                                         t--; /* invalid */
1135                         }
1136                         else if ((byte & 0xf8) == 0xf0) {
1137                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1138                                 /*                            ^t                         */
1139
1140                                 if (t + 3 > tlimit)
1141                                         return len + 1; /* invalid, stop here */
1142
1143                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1144                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1145                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1146                                                         /* valid 4-byte UTF-8? */
1147                                                         value = ((byte  & 0x07) << 18)
1148                                                                   | ((byte1 & 0x3f) << 12)
1149                                                                   | ((byte2 & 0x3f) <<  6)
1150                                                                   | ((byte3 & 0x3f)      );
1151
1152                                                         if (value > 0x10FFFF)
1153                                                                 ; /* invalid */
1154                                                         else if (value > 0xFFFF)
1155                                                                 len += 1; /* we need surrogates */
1156                                                         else
1157                                                                 ; /* 16bit suffice */
1158                                                 }
1159                                                 else
1160                                                         t--; /* invalid */
1161                                         }
1162                                         else
1163                                                 t--; /* invalid */
1164                                 }
1165                                 else
1166                                         t--; /* invalid */
1167                         }
1168                         else if ((byte & 0xfc) == 0xf8) {
1169                                 /* invalid 5-byte */
1170                                 if (t + 4 > tlimit)
1171                                         return len + 1; /* invalid, stop here */
1172
1173                                 skip = 4;
1174                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1175                                         t++;
1176                         }
1177                         else if ((byte & 0xfe) == 0xfc) {
1178                                 /* invalid 6-byte */
1179                                 if (t + 5 > tlimit)
1180                                         return len + 1; /* invalid, stop here */
1181
1182                                 skip = 5;
1183                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1184                                         t++;
1185                         }
1186                         else
1187                                 ; /* invalid */
1188                 }
1189                 else {
1190                         /* NUL */
1191
1192                         if (byte == 0)
1193                                 break;
1194
1195                         /* ASCII character, common case */
1196                 }
1197
1198                 len++;
1199         }
1200
1201         return len;
1202 }
1203
1204
1205 /* utf8_safe_convert_to_u2s ****************************************************
1206
1207    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1208    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1209    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1210
1211    This function is safe even for invalid UTF-8 strings.
1212
1213    IN:
1214       text..........zero-terminated(!) UTF-8 string (may be invalid)
1215                         must NOT be NULL
1216           nbytes........strlen(text). (This is needed to completely emulate
1217                                         the RI).
1218           buffer........a preallocated array of u2s to receive the decoded
1219                         string. Use utf8_safe_number_of_u2s to get the
1220                                         required number of u2s for allocating this.
1221
1222 *******************************************************************************/
1223
1224 #define UNICODE_REPLACEMENT  0xfffd
1225
1226 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1227         register const unsigned char *t;
1228         register s4 byte;
1229         register const unsigned char *tlimit;
1230         s4 byte1;
1231         s4 byte2;
1232         s4 byte3;
1233         s4 value;
1234         s4 skip;
1235
1236         assert(text);
1237         assert(nbytes >= 0);
1238
1239         t = (const unsigned char *) text;
1240         tlimit = t + nbytes;
1241
1242         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1243
1244         while (1) {
1245                 byte = *t++;
1246
1247                 if (byte & 0x80) {
1248                         /* highest bit set, non-ASCII character */
1249
1250                         if ((byte & 0xe0) == 0xc0) {
1251                                 /* 2-byte: should be 110..... 10...... */
1252
1253                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1254                                         /* valid 2-byte UTF-8 */
1255                                         *buffer++ = ((byte  & 0x1f) << 6)
1256                                                           | ((byte1 & 0x3f)     );
1257                                 }
1258                                 else {
1259                                         *buffer++ = UNICODE_REPLACEMENT;
1260                                         t--;
1261                                 }
1262                         }
1263                         else if ((byte & 0xf0) == 0xe0) {
1264                                 /* 3-byte: should be 1110.... 10...... 10...... */
1265
1266                                 if (t + 2 > tlimit) {
1267                                         *buffer++ = UNICODE_REPLACEMENT;
1268                                         return;
1269                                 }
1270
1271                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1272                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1273                                                 /* valid 3-byte UTF-8 */
1274                                                 *buffer++ = ((byte  & 0x0f) << 12)
1275                                                                   | ((byte1 & 0x3f) <<  6)
1276                                                                   | ((byte2 & 0x3f)      );
1277                                         }
1278                                         else {
1279                                                 *buffer++ = UNICODE_REPLACEMENT;
1280                                                 t--;
1281                                         }
1282                                 }
1283                                 else {
1284                                         *buffer++ = UNICODE_REPLACEMENT;
1285                                         t--;
1286                                 }
1287                         }
1288                         else if ((byte & 0xf8) == 0xf0) {
1289                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1290
1291                                 if (t + 3 > tlimit) {
1292                                         *buffer++ = UNICODE_REPLACEMENT;
1293                                         return;
1294                                 }
1295
1296                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1297                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1298                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1299                                                         /* valid 4-byte UTF-8? */
1300                                                         value = ((byte  & 0x07) << 18)
1301                                                                   | ((byte1 & 0x3f) << 12)
1302                                                                   | ((byte2 & 0x3f) <<  6)
1303                                                                   | ((byte3 & 0x3f)      );
1304
1305                                                         if (value > 0x10FFFF) {
1306                                                                 *buffer++ = UNICODE_REPLACEMENT;
1307                                                         }
1308                                                         else if (value > 0xFFFF) {
1309                                                                 /* we need surrogates */
1310                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1311                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1312                                                         }
1313                                                         else
1314                                                                 *buffer++ = value; /* 16bit suffice */
1315                                                 }
1316                                                 else {
1317                                                         *buffer++ = UNICODE_REPLACEMENT;
1318                                                         t--;
1319                                                 }
1320                                         }
1321                                         else {
1322                                                 *buffer++ = UNICODE_REPLACEMENT;
1323                                                 t--;
1324                                         }
1325                                 }
1326                                 else {
1327                                         *buffer++ = UNICODE_REPLACEMENT;
1328                                         t--;
1329                                 }
1330                         }
1331                         else if ((byte & 0xfc) == 0xf8) {
1332                                 if (t + 4 > tlimit) {
1333                                         *buffer++ = UNICODE_REPLACEMENT;
1334                                         return;
1335                                 }
1336
1337                                 skip = 4;
1338                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1339                                         t++;
1340                                 *buffer++ = UNICODE_REPLACEMENT;
1341                         }
1342                         else if ((byte & 0xfe) == 0xfc) {
1343                                 if (t + 5 > tlimit) {
1344                                         *buffer++ = UNICODE_REPLACEMENT;
1345                                         return;
1346                                 }
1347
1348                                 skip = 5;
1349                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1350                                         t++;
1351                                 *buffer++ = UNICODE_REPLACEMENT;
1352                         }
1353                         else
1354                                 *buffer++ = UNICODE_REPLACEMENT;
1355                 }
1356                 else {
1357                         /* NUL */
1358
1359                         if (byte == 0)
1360                                 break;
1361
1362                         /* ASCII character, common case */
1363
1364                         *buffer++ = byte;
1365                 }
1366         }
1367 }
1368
1369
1370 /* u2_utflength ****************************************************************
1371
1372    Returns the utf length in bytes of a u2 array.
1373
1374 *******************************************************************************/
1375
1376 u4 u2_utflength(u2 *text, u4 u2_length)
1377 {
1378         u4 result_len = 0;                  /* utf length in bytes                */
1379         u2 ch;                              /* current unicode character          */
1380         u4 len;
1381
1382         for (len = 0; len < u2_length; len++) {
1383                 /* next unicode character */
1384                 ch = *text++;
1385
1386                 /* determine bytes required to store unicode character as utf */
1387                 if (ch && (ch < 0x80))
1388                         result_len++;
1389                 else if (ch < 0x800)
1390                         result_len += 2;
1391                 else
1392                         result_len += 3;
1393         }
1394
1395     return result_len;
1396 }
1397
1398
1399 /* utf_copy ********************************************************************
1400
1401    Copy the given utf string byte-for-byte to a buffer.
1402
1403    IN:
1404       buffer.......the buffer
1405           u............the utf string
1406
1407 *******************************************************************************/
1408
1409 void utf_copy(char *buffer, utf *u)
1410 {
1411         /* our utf strings are zero-terminated (done by utf_new) */
1412         MCOPY(buffer, u->text, char, u->blength + 1);
1413 }
1414
1415
1416 /* utf_cat *********************************************************************
1417
1418    Append the given utf string byte-for-byte to a buffer.
1419
1420    IN:
1421       buffer.......the buffer
1422           u............the utf string
1423
1424 *******************************************************************************/
1425
1426 void utf_cat(char *buffer, utf *u)
1427 {
1428         /* our utf strings are zero-terminated (done by utf_new) */
1429         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1430 }
1431
1432
1433 /* utf_copy_classname **********************************************************
1434
1435    Copy the given utf classname byte-for-byte to a buffer.
1436    '/' is replaced by '.'
1437
1438    IN:
1439       buffer.......the buffer
1440           u............the utf string
1441
1442 *******************************************************************************/
1443
1444 void utf_copy_classname(char *buffer, utf *u)
1445 {
1446         char *bufptr;
1447         char *srcptr;
1448         char *endptr;
1449         char ch;
1450
1451         bufptr = buffer;
1452         srcptr = u->text;
1453         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1454
1455         while (srcptr != endptr) {
1456                 ch = *srcptr++;
1457                 if (ch == '/')
1458                         ch = '.';
1459                 *bufptr++ = ch;
1460         }
1461 }
1462
1463
1464 /* utf_cat *********************************************************************
1465
1466    Append the given utf classname byte-for-byte to a buffer.
1467    '/' is replaced by '.'
1468
1469    IN:
1470       buffer.......the buffer
1471           u............the utf string
1472
1473 *******************************************************************************/
1474
1475 void utf_cat_classname(char *buffer, utf *u)
1476 {
1477         utf_copy_classname(buffer + strlen(buffer), u);
1478 }
1479
1480 /* utf_display_printable_ascii *************************************************
1481
1482    Write utf symbol to stdout (for debugging purposes).
1483    Non-printable and non-ASCII characters are printed as '?'.
1484
1485 *******************************************************************************/
1486
1487 void utf_display_printable_ascii(utf *u)
1488 {
1489         char *endpos;                       /* points behind utf string           */
1490         char *utf_ptr;                      /* current position in utf text       */
1491
1492         if (u == NULL) {
1493                 printf("NULL");
1494                 fflush(stdout);
1495                 return;
1496         }
1497
1498         endpos = UTF_END(u);
1499         utf_ptr = u->text;
1500
1501         while (utf_ptr < endpos) {
1502                 /* read next unicode character */
1503
1504                 u2 c = utf_nextu2(&utf_ptr);
1505
1506                 if ((c >= 32) && (c <= 127))
1507                         printf("%c", c);
1508                 else
1509                         printf("?");
1510         }
1511
1512         fflush(stdout);
1513 }
1514
1515
1516 /* utf_display_printable_ascii_classname ***************************************
1517
1518    Write utf symbol to stdout with `/' converted to `.' (for debugging
1519    purposes).
1520    Non-printable and non-ASCII characters are printed as '?'.
1521
1522 *******************************************************************************/
1523
1524 void utf_display_printable_ascii_classname(utf *u)
1525 {
1526         char *endpos;                       /* points behind utf string           */
1527         char *utf_ptr;                      /* current position in utf text       */
1528
1529         if (u == NULL) {
1530                 printf("NULL");
1531                 fflush(stdout);
1532                 return;
1533         }
1534
1535         endpos = UTF_END(u);
1536         utf_ptr = u->text;
1537
1538         while (utf_ptr < endpos) {
1539                 /* read next unicode character */
1540
1541                 u2 c = utf_nextu2(&utf_ptr);
1542
1543                 if (c == '/')
1544                         c = '.';
1545
1546                 if ((c >= 32) && (c <= 127))
1547                         printf("%c", c);
1548                 else
1549                         printf("?");
1550         }
1551
1552         fflush(stdout);
1553 }
1554
1555
1556 /* utf_sprint_convert_to_latin1 ************************************************
1557
1558    Write utf symbol into c-string (for debugging purposes).
1559    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1560    invalid results.
1561
1562 *******************************************************************************/
1563
1564 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1565 {
1566         char *endpos;                       /* points behind utf string           */
1567         char *utf_ptr;                      /* current position in utf text       */
1568         u2 pos = 0;                         /* position in c-string               */
1569
1570         if (!u) {
1571                 strcpy(buffer, "NULL");
1572                 return;
1573         }
1574
1575         endpos = UTF_END(u);
1576         utf_ptr = u->text;
1577
1578         while (utf_ptr < endpos)
1579                 /* copy next unicode character */
1580                 buffer[pos++] = utf_nextu2(&utf_ptr);
1581
1582         /* terminate string */
1583         buffer[pos] = '\0';
1584 }
1585
1586
1587 /* utf_sprint_convert_to_latin1_classname **************************************
1588
1589    Write utf symbol into c-string with `/' converted to `.' (for debugging
1590    purposes).
1591    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1592    invalid results.
1593
1594 *******************************************************************************/
1595
1596 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1597 {
1598         char *endpos;                       /* points behind utf string           */
1599         char *utf_ptr;                      /* current position in utf text       */
1600         u2 pos = 0;                         /* position in c-string               */
1601
1602         if (!u) {
1603                 strcpy(buffer, "NULL");
1604                 return;
1605         }
1606
1607         endpos = UTF_END(u);
1608         utf_ptr = u->text;
1609
1610         while (utf_ptr < endpos) {
1611                 /* copy next unicode character */
1612                 u2 c = utf_nextu2(&utf_ptr);
1613                 if (c == '/') c = '.';
1614                 buffer[pos++] = c;
1615         }
1616
1617         /* terminate string */
1618         buffer[pos] = '\0';
1619 }
1620
1621
1622 /* utf_strcat_convert_to_latin1 ************************************************
1623
1624    Like libc strcat, but uses an utf8 string.
1625    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1626    invalid results.
1627
1628 *******************************************************************************/
1629
1630 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1631 {
1632         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1633 }
1634
1635
1636 /* utf_strcat_convert_to_latin1_classname **************************************
1637
1638    Like libc strcat, but uses an utf8 string.
1639    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1640    invalid results.
1641
1642 *******************************************************************************/
1643
1644 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1645 {
1646         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1647 }
1648
1649
1650 /* utf_fprint_printable_ascii **************************************************
1651
1652    Write utf symbol into file.
1653    Non-printable and non-ASCII characters are printed as '?'.
1654
1655 *******************************************************************************/
1656
1657 void utf_fprint_printable_ascii(FILE *file, utf *u)
1658 {
1659         char *endpos;                       /* points behind utf string           */
1660         char *utf_ptr;                      /* current position in utf text       */
1661
1662         if (!u)
1663                 return;
1664
1665         endpos = UTF_END(u);
1666         utf_ptr = u->text;
1667
1668         while (utf_ptr < endpos) {
1669                 /* read next unicode character */
1670                 u2 c = utf_nextu2(&utf_ptr);
1671
1672                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1673                 else fprintf(file, "?");
1674         }
1675 }
1676
1677
1678 /* utf_fprint_printable_ascii_classname ****************************************
1679
1680    Write utf symbol into file with `/' converted to `.'.
1681    Non-printable and non-ASCII characters are printed as '?'.
1682
1683 *******************************************************************************/
1684
1685 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1686 {
1687         char *endpos;                       /* points behind utf string           */
1688         char *utf_ptr;                      /* current position in utf text       */
1689
1690     if (!u)
1691                 return;
1692
1693         endpos = UTF_END(u);
1694         utf_ptr = u->text;
1695
1696         while (utf_ptr < endpos) {
1697                 /* read next unicode character */
1698                 u2 c = utf_nextu2(&utf_ptr);
1699                 if (c == '/') c = '.';
1700
1701                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1702                 else fprintf(file, "?");
1703         }
1704 }
1705
1706
/* is_valid_utf ****************************************************************

   Check whether a byte range holds a well-formed UTF-8 string in the
   restricted form accepted here:

   - a plain 0x00 byte is rejected,
   - only one-, two- and three-byte sequences are accepted (Java
     limitation); longer forms and stray continuation bytes are
     rejected,
   - every sequence must be complete and its trailing bytes must have
     the form 10xx xxxx,
   - the value zero is only accepted in its two-byte encoding.

   Overlong encodings, surrogates and the codepoints 0xfffe/0xffff are
   deliberately NOT rejected, since Sun Java and Java classfiles seem
   to allow them.

   utf_ptr...points to first character
   end_pos...points after last character

   Returns true for an accepted (possibly empty) string, false
   otherwise, including the case that end_pos lies before utf_ptr.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
        const unsigned char *cur  = (const unsigned char *) utf_ptr;
        const unsigned char *stop = (const unsigned char *) end_pos;

        if (stop < cur)
                return false;

        while (cur < stop) {
                unsigned char lead = *cur++;
                unsigned long value;            /* decoded codepoint              */
                int           trailing;         /* number of continuation bytes   */
                int           k;

                if (lead == 0x00)
                        return false;               /* 0x00 is not allowed */

                if (lead < 0x80)
                        continue;                   /* ASCII */

                /* Only 110x xxxx and 1110 xxxx may start a sequence; longer
                   forms (Java limitation) and anything else are refused. */

                if ((lead & 0xe0) == 0xc0)
                        trailing = 1;
                else if ((lead & 0xf0) == 0xe0)
                        trailing = 2;
                else
                        return false;

                if ((stop - cur) < trailing)
                        return false;               /* missing bytes */

                /* payload bits of the leading byte */

                value = (unsigned long) (lead & (0x3f >> trailing));

                for (k = 0; k < trailing; k++) {
                        unsigned char next = *cur++;

                        if ((next & 0xc0) != 0x80)  /* 10xx xxxx */
                                return false;

                        value = (value << 6) | (unsigned long) (next & 0x3f);
                }

                /* Java special: zero has to be written with two bytes */

                if ((value == 0) && (trailing != 1))
                        return false;
        }

        return true;
}
1772
1773
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the two-byte encoding of zero,
   0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!  Malformed
   sequences are not detected here.  In particular a 0xc0 byte at the
   very end of the string is not rejected; the byte behind end_pos is
   never read.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        if (end_pos <= utf_ptr)
                return false;                   /* disallow empty names */

        while (utf_ptr < end_pos) {
                unsigned char c = (unsigned char) *utf_ptr++;

                if (c < 0x20)
                        return false;               /* disallow control characters */

                /* disallow zero; the second byte is only inspected if it
                   still belongs to the string */

                if ((c == 0xc0) && (utf_ptr < end_pos)
                        && ((unsigned char) *utf_ptr == 0x80))
                        return false;
        }

        return true;
}
1801
1802 bool is_valid_name_utf(utf *u)
1803 {
1804         return is_valid_name(u->text, UTF_END(u));
1805 }
1806
1807
1808 /* utf_show ********************************************************************
1809
1810    Writes the utf symbols in the utfhash to stdout and displays the
1811    number of external hash chains grouped according to the chainlength
1812    (for debugging purposes).
1813
1814 *******************************************************************************/
1815
1816 #if !defined(NDEBUG)
1817 void utf_show(void)
1818 {
1819
1820 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1821
1822         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1823         u4 max_chainlength = 0;      /* maximum length of the chains */
1824         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1825         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1826         u4 i;
1827
1828         printf("UTF-HASH:\n");
1829
1830         /* show element of utf-hashtable */
1831
1832         for (i = 0; i < hashtable_utf->size; i++) {
1833                 utf *u = hashtable_utf->ptr[i];
1834
1835                 if (u) {
1836                         printf("SLOT %d: ", (int) i);
1837
1838                         while (u) {
1839                                 printf("'");
1840                                 utf_display_printable_ascii(u);
1841                                 printf("' ");
1842                                 u = u->hashlink;
1843                         }
1844                         printf("\n");
1845                 }
1846         }
1847
1848         printf("UTF-HASH: %d slots for %d entries\n",
1849                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1850
1851         if (hashtable_utf->entries == 0)
1852                 return;
1853
1854         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1855
1856         for (i=0;i<CHAIN_LIMIT;i++)
1857                 chain_count[i]=0;
1858
1859         /* count numbers of hashchains according to their length */
1860         for (i=0; i<hashtable_utf->size; i++) {
1861
1862                 utf *u = (utf*) hashtable_utf->ptr[i];
1863                 u4 chain_length = 0;
1864
1865                 /* determine chainlength */
1866                 while (u) {
1867                         u = u->hashlink;
1868                         chain_length++;
1869                 }
1870
1871                 /* update sum of all chainlengths */
1872                 sum_chainlength+=chain_length;
1873
1874                 /* determine the maximum length of the chains */
1875                 if (chain_length>max_chainlength)
1876                         max_chainlength = chain_length;
1877
1878                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1879                 if (chain_length>=CHAIN_LIMIT) {
1880                         beyond_limit+=chain_length;
1881                         chain_length=CHAIN_LIMIT-1;
1882                 }
1883
1884                 /* update number of hashchains of current length */
1885                 chain_count[chain_length]++;
1886         }
1887
1888         /* display results */
1889         for (i=1;i<CHAIN_LIMIT-1;i++)
1890                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1891
1892         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1893
1894
1895         printf("max. chainlength:%5d\n",max_chainlength);
1896
1897         /* avg. chainlength = sum of chainlengths / number of chains */
1898         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1899 }
1900 #endif /* !defined(NDEBUG) */
1901
1902
1903 /*
1904  * These are local overrides for various environment variables in Emacs.
1905  * Please do not remove this and leave it at the end of the file, where
1906  * Emacs will automagically detect them.
1907  * ---------------------------------------------------------------------
1908  * Local variables:
1909  * mode: c
1910  * indent-tabs-mode: t
1911  * c-basic-offset: 4
1912  * tab-width: 4
1913  * End:
1914  * vim:noexpandtab:sw=4:ts=4:
1915  */