src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25 */
  26
  27
  28 #include "config.h"
  29
  30 #include <string.h>
  31 #include <assert.h>
  32
  33 #include "vm/types.h"
  34
  35 #include "mm/memory.h"
  36
  37 #include "threads/lock-common.h"
  38
  39 #include "toolbox/hashtable.h"
  40
  41 #include "vm/exceptions.h"
  42
  43 #include "vmcore/options.h"
  44
  45 #if defined(ENABLE_STATISTICS)
  46 # include "vmcore/statistics.h"
  47 #endif
  48
  49 #include "vmcore/utf8.h"
  50
  51
  52 /* global variables ***********************************************************/
  53
  54 /* hashsize must be power of 2 */
  55
  56 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  57
  58 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  59
  60
  61 /* utf-symbols for pointer comparison of frequently used strings **************/
  62
  63 utf *utf_java_lang_Object;
  64
  65 utf *utf_java_lang_Class;
  66 utf *utf_java_lang_ClassLoader;
  67 utf *utf_java_lang_Cloneable;
  68 utf *utf_java_lang_SecurityManager;
  69 utf *utf_java_lang_String;
  70 utf *utf_java_lang_System;
  71 utf *utf_java_lang_ThreadGroup;
  72 utf *utf_java_lang_ref_SoftReference;
  73 utf *utf_java_lang_ref_WeakReference;
  74 utf *utf_java_lang_ref_PhantomReference;
  75 utf *utf_java_io_Serializable;
  76
  77 utf *utf_java_lang_Throwable;
  78 utf *utf_java_lang_Error;
  79
  80 utf *utf_java_lang_AbstractMethodError;
  81 utf *utf_java_lang_ClassCircularityError;
  82 utf *utf_java_lang_ClassFormatError;
  83 utf *utf_java_lang_ExceptionInInitializerError;
  84 utf *utf_java_lang_IncompatibleClassChangeError;
  85 utf *utf_java_lang_InstantiationError;
  86 utf *utf_java_lang_InternalError;
  87 utf *utf_java_lang_LinkageError;
  88 utf *utf_java_lang_NoClassDefFoundError;
  89 utf *utf_java_lang_NoSuchFieldError;
  90 utf *utf_java_lang_NoSuchMethodError;
  91 utf *utf_java_lang_OutOfMemoryError;
  92 utf *utf_java_lang_UnsatisfiedLinkError;
  93 utf *utf_java_lang_UnsupportedClassVersionError;
  94 utf *utf_java_lang_VerifyError;
  95 utf *utf_java_lang_VirtualMachineError;
  96
  97 utf *utf_java_lang_Exception;
  98
  99 utf *utf_java_lang_ArithmeticException;
 100 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
 101 utf *utf_java_lang_ArrayStoreException;
 102 utf *utf_java_lang_ClassCastException;
 103 utf *utf_java_lang_ClassNotFoundException;
 104 utf *utf_java_lang_CloneNotSupportedException;
 105 utf *utf_java_lang_IllegalAccessException;
 106 utf *utf_java_lang_IllegalArgumentException;
 107 utf *utf_java_lang_IllegalMonitorStateException;
 108 utf *utf_java_lang_InstantiationException;
 109 utf *utf_java_lang_InterruptedException;
 110 utf *utf_java_lang_NegativeArraySizeException;
 111 utf *utf_java_lang_NullPointerException;
 112 utf *utf_java_lang_StringIndexOutOfBoundsException;
 113
 114 utf *utf_java_lang_reflect_InvocationTargetException;
 115
 116 utf *utf_java_security_PrivilegedActionException;
 117
 118 #if defined(ENABLE_JAVASE)
 119 utf* utf_java_lang_Void;
 120 #endif
 121
 122 utf* utf_java_lang_Boolean;
 123 utf* utf_java_lang_Byte;
 124 utf* utf_java_lang_Character;
 125 utf* utf_java_lang_Short;
 126 utf* utf_java_lang_Integer;
 127 utf* utf_java_lang_Long;
 128 utf* utf_java_lang_Float;
 129 utf* utf_java_lang_Double;
 130
 131 #if defined(ENABLE_JAVASE)
 132 utf *utf_java_lang_StackTraceElement;
 133 utf *utf_java_lang_reflect_Constructor;
 134 utf *utf_java_lang_reflect_Field;
 135 utf *utf_java_lang_reflect_Method;
 136 utf *utf_java_util_Vector;
 137 #endif
 138
 139 utf *utf_InnerClasses;                  /* InnerClasses                       */
 140 utf *utf_ConstantValue;                 /* ConstantValue                      */
 141 utf *utf_Code;                          /* Code                               */
 142 utf *utf_Exceptions;                    /* Exceptions                         */
 143 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 144 utf *utf_SourceFile;                    /* SourceFile                         */
 145
 146 #if defined(ENABLE_JAVASE)
 147 utf *utf_EnclosingMethod;
 148 utf *utf_Signature;
 149 utf *utf_StackMapTable;
 150
 151 #if defined(ENABLE_ANNOTATIONS)
 152 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
 153 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
 154 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
 155 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
 156 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
 157 #endif
 158 #endif
 159
 160 utf *utf_init;                          /* <init>                             */
 161 utf *utf_clinit;                        /* <clinit>                           */
 162 utf *utf_clone;                         /* clone                              */
 163 utf *utf_finalize;                      /* finalize                           */
 164 utf *utf_run;                           /* run                                */
 165
 166 utf *utf_add;
 167 utf *utf_remove;
 168 utf *utf_addThread;
 169 utf *utf_removeThread;
 170 utf *utf_put;
 171 utf *utf_get;
 172 utf *utf_uncaughtException;
 173 utf *utf_value;
 174
 175 utf *utf_fillInStackTrace;
 176 utf *utf_findNative;
 177 utf *utf_getSystemClassLoader;
 178 utf *utf_initCause;
 179 utf *utf_loadClass;
 180 utf *utf_loadClassInternal;
 181 utf *utf_printStackTrace;
 182
 183 utf *utf_division_by_zero;
 184
 185 utf *utf_Z;                             /* Z                                  */
 186 utf *utf_B;                             /* B                                  */
 187 utf *utf_C;                             /* C                                  */
 188 utf *utf_S;                             /* S                                  */
 189 utf *utf_I;                             /* I                                  */
 190 utf *utf_J;                             /* J                                  */
 191 utf *utf_F;                             /* F                                  */
 192 utf *utf_D;                             /* D                                  */
 193
 194 utf *utf_void__void;                    /* ()V                                */
 195 utf *utf_boolean__void;                 /* (Z)V                               */
 196 utf *utf_byte__void;                    /* (B)V                               */
 197 utf *utf_char__void;                    /* (C)V                               */
 198 utf *utf_short__void;                   /* (S)V                               */
 199 utf *utf_int__void;                     /* (I)V                               */
 200 utf *utf_long__void;                    /* (J)V                               */
 201 utf *utf_float__void;                   /* (F)V                               */
 202 utf *utf_double__void;                  /* (D)V                               */
 203
 204 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 205 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 206 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 207 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 208 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 209 utf *utf_java_lang_Object__java_lang_Object;
 210 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 211 utf *utf_java_lang_String__java_lang_Class;
 212 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 213 utf *utf_java_lang_Thread_java_lang_Throwable__V;
 214 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 215 utf *utf_java_lang_Throwable__java_lang_Throwable;
 216
 217 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 218 utf *utf_null;
 219 utf *array_packagename;
 220
 221
 222 /* utf_init ********************************************************************
 223
 224    Initializes the utf8 subsystem.
 225
 226 *******************************************************************************/
 227
 228 bool utf8_init(void)
 229 {
 230         /* create utf8 hashtable */
 231
 232         hashtable_utf = NEW(hashtable);
 233
 234         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 235
 236 #if defined(ENABLE_STATISTICS)
 237         if (opt_stat)
 238                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 239 #endif
 240
 241         /* create utf-symbols for pointer comparison of frequently used strings */
 242
 243         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 244
 245         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 246         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 247         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 248         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 249         utf_java_lang_String           = utf_new_char("java/lang/String");
 250         utf_java_lang_System           = utf_new_char("java/lang/System");
 251         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 252
 253         utf_java_lang_ref_SoftReference =
 254                 utf_new_char("java/lang/ref/SoftReference");
 255
 256         utf_java_lang_ref_WeakReference =
 257                 utf_new_char("java/lang/ref/WeakReference");
 258
 259         utf_java_lang_ref_PhantomReference =
 260                 utf_new_char("java/lang/ref/PhantomReference");
 261
 262         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 263
 264         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 265         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 266
 267         utf_java_lang_ClassCircularityError =
 268                 utf_new_char("java/lang/ClassCircularityError");
 269
 270         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 271
 272         utf_java_lang_ExceptionInInitializerError =
 273                 utf_new_char("java/lang/ExceptionInInitializerError");
 274
 275         utf_java_lang_IncompatibleClassChangeError =
 276                 utf_new_char("java/lang/IncompatibleClassChangeError");
 277
 278         utf_java_lang_InstantiationError =
 279                 utf_new_char("java/lang/InstantiationError");
 280
 281         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 282         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 283
 284         utf_java_lang_NoClassDefFoundError =
 285                 utf_new_char("java/lang/NoClassDefFoundError");
 286
 287         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 288
 289         utf_java_lang_UnsatisfiedLinkError =
 290                 utf_new_char("java/lang/UnsatisfiedLinkError");
 291
 292         utf_java_lang_UnsupportedClassVersionError =
 293                 utf_new_char("java/lang/UnsupportedClassVersionError");
 294
 295         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 296
 297         utf_java_lang_VirtualMachineError =
 298                 utf_new_char("java/lang/VirtualMachineError");
 299
 300 #if defined(ENABLE_JAVASE)
 301         utf_java_lang_AbstractMethodError =
 302                 utf_new_char("java/lang/AbstractMethodError");
 303
 304         utf_java_lang_NoSuchFieldError =
 305                 utf_new_char("java/lang/NoSuchFieldError");
 306
 307         utf_java_lang_NoSuchMethodError =
 308                 utf_new_char("java/lang/NoSuchMethodError");
 309 #endif
 310
 311         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 312
 313         utf_java_lang_ArithmeticException =
 314                 utf_new_char("java/lang/ArithmeticException");
 315
 316         utf_java_lang_ArrayIndexOutOfBoundsException =
 317                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 318
 319         utf_java_lang_ArrayStoreException =
 320                 utf_new_char("java/lang/ArrayStoreException");
 321
 322         utf_java_lang_ClassCastException =
 323                 utf_new_char("java/lang/ClassCastException");
 324
 325         utf_java_lang_ClassNotFoundException =
 326                 utf_new_char("java/lang/ClassNotFoundException");
 327
 328         utf_java_lang_CloneNotSupportedException =
 329                 utf_new_char("java/lang/CloneNotSupportedException");
 330
 331         utf_java_lang_IllegalAccessException =
 332                 utf_new_char("java/lang/IllegalAccessException");
 333
 334         utf_java_lang_IllegalArgumentException =
 335                 utf_new_char("java/lang/IllegalArgumentException");
 336
 337         utf_java_lang_IllegalMonitorStateException =
 338                 utf_new_char("java/lang/IllegalMonitorStateException");
 339
 340         utf_java_lang_InstantiationException =
 341                 utf_new_char("java/lang/InstantiationException");
 342
 343         utf_java_lang_InterruptedException =
 344                 utf_new_char("java/lang/InterruptedException");
 345
 346         utf_java_lang_NegativeArraySizeException =
 347                 utf_new_char("java/lang/NegativeArraySizeException");
 348
 349         utf_java_lang_NullPointerException =
 350                 utf_new_char("java/lang/NullPointerException");
 351
 352         utf_java_lang_StringIndexOutOfBoundsException =
 353                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 354
 355         utf_java_lang_reflect_InvocationTargetException =
 356                 utf_new_char("java/lang/reflect/InvocationTargetException");
 357
 358         utf_java_security_PrivilegedActionException =
 359                 utf_new_char("java/security/PrivilegedActionException");
 360
 361 #if defined(ENABLE_JAVASE)
 362         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 363 #endif
 364
 365         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 366         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 367         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 368         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 369         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 370         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 371         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 372         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 373
 374 #if defined(ENABLE_JAVASE)
 375         utf_java_lang_StackTraceElement =
 376                 utf_new_char("java/lang/StackTraceElement");
 377
 378         utf_java_lang_reflect_Constructor =
 379                 utf_new_char("java/lang/reflect/Constructor");
 380
 381         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 382         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 383         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 384 #endif
 385
 386         utf_InnerClasses               = utf_new_char("InnerClasses");
 387         utf_ConstantValue              = utf_new_char("ConstantValue");
 388         utf_Code                       = utf_new_char("Code");
 389         utf_Exceptions                 = utf_new_char("Exceptions");
 390         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 391         utf_SourceFile                 = utf_new_char("SourceFile");
 392
 393 #if defined(ENABLE_JAVASE)
 394         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 395         utf_Signature                  = utf_new_char("Signature");
 396         utf_StackMapTable              = utf_new_char("StackMapTable");
 397
 398 #if defined(ENABLE_ANNOTATIONS)
 399         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 400         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 401         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 402         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 403         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 404 #endif
 405 #endif
 406
 407         utf_init                           = utf_new_char("<init>");
 408         utf_clinit                         = utf_new_char("<clinit>");
 409         utf_clone                      = utf_new_char("clone");
 410         utf_finalize                   = utf_new_char("finalize");
 411         utf_run                        = utf_new_char("run");
 412
 413         utf_add                        = utf_new_char("add");
 414         utf_remove                     = utf_new_char("remove");
 415         utf_addThread                  = utf_new_char("addThread");
 416         utf_removeThread               = utf_new_char("removeThread");
 417         utf_put                        = utf_new_char("put");
 418         utf_get                        = utf_new_char("get");
 419         utf_uncaughtException          = utf_new_char("uncaughtException");
 420         utf_value                      = utf_new_char("value");
 421
 422         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 423         utf_findNative                 = utf_new_char("findNative");
 424         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 425         utf_initCause                  = utf_new_char("initCause");
 426         utf_loadClass                  = utf_new_char("loadClass");
 427         utf_loadClassInternal          = utf_new_char("loadClassInternal");
 428         utf_printStackTrace            = utf_new_char("printStackTrace");
 429
 430         utf_division_by_zero           = utf_new_char("/ by zero");
 431
 432         utf_Z                          = utf_new_char("Z");
 433         utf_B                          = utf_new_char("B");
 434         utf_C                          = utf_new_char("C");
 435         utf_S                          = utf_new_char("S");
 436         utf_I                          = utf_new_char("I");
 437         utf_J                          = utf_new_char("J");
 438         utf_F                          = utf_new_char("F");
 439         utf_D                          = utf_new_char("D");
 440
 441         utf_void__void                 = utf_new_char("()V");
 442         utf_boolean__void              = utf_new_char("(Z)V");
 443         utf_byte__void                 = utf_new_char("(B)V");
 444         utf_char__void                 = utf_new_char("(C)V");
 445         utf_short__void                = utf_new_char("(S)V");
 446         utf_int__void                  = utf_new_char("(I)V");
 447         utf_long__void                 = utf_new_char("(J)V");
 448         utf_float__void                = utf_new_char("(F)V");
 449         utf_double__void               = utf_new_char("(D)V");
 450         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 451         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 452
 453         utf_void__java_lang_ClassLoader =
 454                 utf_new_char("()Ljava/lang/ClassLoader;");
 455
 456         utf_java_lang_ClassLoader_java_lang_String__J =
 457                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 458
 459         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 460
 461         utf_java_lang_Object__java_lang_Object =
 462                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 463
 464         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 465
 466         utf_java_lang_String__java_lang_Class =
 467                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 468
 469         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 470
 471         utf_java_lang_Thread_java_lang_Throwable__V =
 472                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 473
 474         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 475
 476         utf_java_lang_Throwable__java_lang_Throwable =
 477                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 478
 479         utf_null                       = utf_new_char("null");
 480         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 481         array_packagename              = utf_new_char("\t<the array package>");
 482
 483         /* everything's ok */
 484
 485         return true;
 486 }
 487
 488
 489 /* utf_hashkey *****************************************************************
 490
 491    The hashkey is computed from the utf-text by using up to 8
 492    characters.  For utf-symbols longer than 15 characters 3 characters
 493    are taken from the beginning and the end, 2 characters are taken
 494    from the middle.
 495
 496 *******************************************************************************/
 497
 498 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 499 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 500
 501 u4 utf_hashkey(const char *text, u4 length)
 502 {
 503         const char *start_pos = text;       /* pointer to utf text                */
 504         u4 a;
 505
 506         switch (length) {
 507         case 0: /* empty string */
 508                 return 0;
 509
 510         case 1: return fbs(0);
 511         case 2: return fbs(0) ^ nbs(3);
 512         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 513         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 514         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 515         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 516         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 517         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 518
 519         case 9:
 520                 a = fbs(0);
 521                 a ^= nbs(1);
 522                 a ^= nbs(2);
 523                 text++;
 524                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 525
 526         case 10:
 527                 a = fbs(0);
 528                 text++;
 529                 a ^= nbs(2);
 530                 a ^= nbs(3);
 531                 a ^= nbs(4);
 532                 text++;
 533                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 534
 535         case 11:
 536                 a = fbs(0);
 537                 text++;
 538                 a ^= nbs(2);
 539                 a ^= nbs(3);
 540                 a ^= nbs(4);
 541                 text++;
 542                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 543
 544         case 12:
 545                 a = fbs(0);
 546                 text += 2;
 547                 a ^= nbs(2);
 548                 a ^= nbs(3);
 549                 text++;
 550                 a ^= nbs(5);
 551                 a ^= nbs(6);
 552                 a ^= nbs(7);
 553                 text++;
 554                 return a ^ nbs(9) ^ nbs(10);
 555
 556         case 13:
 557                 a = fbs(0);
 558                 a ^= nbs(1);
 559                 text++;
 560                 a ^= nbs(3);
 561                 a ^= nbs(4);
 562                 text += 2;
 563                 a ^= nbs(7);
 564                 a ^= nbs(8);
 565                 text += 2;
 566                 return a ^ nbs(9) ^ nbs(10);
 567
 568         case 14:
 569                 a = fbs(0);
 570                 text += 2;
 571                 a ^= nbs(3);
 572                 a ^= nbs(4);
 573                 text += 2;
 574                 a ^= nbs(7);
 575                 a ^= nbs(8);
 576                 text += 2;
 577                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 578
 579         case 15:
 580                 a = fbs(0);
 581                 text += 2;
 582                 a ^= nbs(3);
 583                 a ^= nbs(4);
 584                 text += 2;
 585                 a ^= nbs(7);
 586                 a ^= nbs(8);
 587                 text += 2;
 588                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 589
 590         default:  /* 3 characters from beginning */
 591                 a = fbs(0);
 592                 text += 2;
 593                 a ^= nbs(3);
 594                 a ^= nbs(4);
 595
 596                 /* 2 characters from middle */
 597                 text = start_pos + (length / 2);
 598                 a ^= fbs(5);
 599                 text += 2;
 600                 a ^= nbs(6);
 601
 602                 /* 3 characters from end */
 603                 text = start_pos + length - 4;
 604
 605                 a ^= fbs(7);
 606                 text++;
 607
 608                 return a ^ nbs(10) ^ nbs(11);
 609     }
 610 }
 611
 612 /* utf_full_hashkey ************************************************************
 613
 614    This function computes a hash value using all bytes in the string.
 615
 616    The algorithm is the "One-at-a-time" algorithm as published
 617    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 618
 619 *******************************************************************************/
 620
 621 u4 utf_full_hashkey(const char *text, u4 length)
 622 {
 623         register const unsigned char *p = (const unsigned char *) text;
 624         register u4 hash;
 625         register u4 i;
 626
 627         hash = 0;
 628         for (i=length; i--;)
 629         {
 630             hash += *p++;
 631             hash += (hash << 10);
 632             hash ^= (hash >> 6);
 633         }
 634         hash += (hash << 3);
 635         hash ^= (hash >> 11);
 636         hash += (hash << 15);
 637
 638         return hash;
 639 }
 640
 641 /* unicode_hashkey *************************************************************
 642
 643    Compute the hashkey of a unicode string.
 644
 645 *******************************************************************************/
 646
 647 u4 unicode_hashkey(u2 *text, u2 len)
 648 {
 649         return utf_hashkey((char *) text, len);
 650 }
 651
 652
 653 /* utf_new *********************************************************************
 654
 655    Creates a new utf-symbol, the text of the symbol is passed as a
 656    u1-array. The function searches the utf-hashtable for a utf-symbol
 657    with this text. On success the element returned, otherwise a new
 658    hashtable element is created.
 659
 660    If the number of entries in the hashtable exceeds twice the size of
 661    the hashtable slots a reorganization of the hashtable is done and
 662    the utf symbols are copied to a new hashtable with doubled size.
 663
 664 *******************************************************************************/
 665
 666 utf *utf_new(const char *text, u2 length)
 667 {
 668         u4 key;                             /* hashkey computed from utf-text     */
 669         u4 slot;                            /* slot in hashtable                  */
 670         utf *u;                             /* hashtable element                  */
 671         u2 i;
 672
 673         LOCK_MONITOR_ENTER(hashtable_utf->header);
 674
 675 #if defined(ENABLE_STATISTICS)
 676         if (opt_stat)
 677                 count_utf_new++;
 678 #endif
 679
 680         key  = utf_hashkey(text, length);
 681         slot = key & (hashtable_utf->size - 1);
 682         u    = hashtable_utf->ptr[slot];
 683
 684         /* search external hash chain for utf-symbol */
 685
 686         while (u) {
 687                 if (u->blength == length) {
 688                         /* compare text of hashtable elements */
 689
 690                         for (i = 0; i < length; i++)
 691                                 if (text[i] != u->text[i])
 692                                         goto nomatch;
 693
 694 #if defined(ENABLE_STATISTICS)
 695                         if (opt_stat)
 696                                 count_utf_new_found++;
 697 #endif
 698
 699                         /* symbol found in hashtable */
 700
 701                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 702
 703                         return u;
 704                 }
 705
 706         nomatch:
 707                 u = u->hashlink; /* next element in external chain */
 708         }
 709
 710         /* location in hashtable found, create new utf element */
 711
 712         u = NEW(utf);
 713
 714         u->blength  = length;               /* length in bytes of utfstring       */
 715         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 716         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 717
 718         memcpy(u->text, text, length);      /* copy utf-text                      */
 719         u->text[length] = '\0';
 720
 721 #if defined(ENABLE_STATISTICS)
 722         if (opt_stat)
 723                 count_utf_len += sizeof(utf) + length + 1;
 724 #endif
 725
 726         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 727         hashtable_utf->entries++;           /* update number of entries           */
 728
 729         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 730
 731         /* reorganization of hashtable, average length of the external
 732            chains is approx. 2 */
 733
 734                 hashtable *newhash;                              /* the new hashtable */
 735                 u4         i;
 736                 utf       *u;
 737                 utf       *nextu;
 738                 u4         slot;
 739
 740                 /* create new hashtable, double the size */
 741
 742                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 743
 744 #if defined(ENABLE_STATISTICS)
 745                 if (opt_stat)
 746                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 747 #endif
 748
 749                 /* transfer elements to new hashtable */
 750
 751                 for (i = 0; i < hashtable_utf->size; i++) {
 752                         u = hashtable_utf->ptr[i];
 753
 754                         while (u) {
 755                                 nextu = u->hashlink;
 756                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 757
 758                                 u->hashlink = (utf *) newhash->ptr[slot];
 759                                 newhash->ptr[slot] = u;
 760
 761                                 /* follow link in external hash chain */
 762
 763                                 u = nextu;
 764                         }
 765                 }
 766
 767                 /* dispose old table */
 768
 769                 hashtable_free(hashtable_utf);
 770
 771                 hashtable_utf = newhash;
 772         }
 773
 774         LOCK_MONITOR_EXIT(hashtable_utf->header);
 775
 776         return u;
 777 }
 778
 779
 780 /* utf_new_u2 ******************************************************************
 781
 782    Make utf symbol from u2 array, if isclassname is true '.' is
 783    replaced by '/'.
 784
 785 *******************************************************************************/
 786
 787 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 788 {
 789         char *buffer;                   /* memory buffer for  unicode characters  */
 790         char *pos;                      /* pointer to current position in buffer  */
 791         u4 left;                        /* unicode characters left                */
 792         u4 buflength;                   /* utf length in bytes of the u2 array    */
 793         utf *result;                    /* resulting utf-string                   */
 794         int i;
 795
 796         /* determine utf length in bytes and allocate memory */
 797
 798         buflength = u2_utflength(unicode_pos, unicode_length);
 799         buffer    = MNEW(char, buflength);
 800
 801         left = buflength;
 802         pos  = buffer;
 803
 804         for (i = 0; i++ < unicode_length; unicode_pos++) {
 805                 /* next unicode character */
 806                 u2 c = *unicode_pos;
 807
 808                 if ((c != 0) && (c < 0x80)) {
 809                         /* 1 character */
 810                         left--;
 811                 if ((int) left < 0) break;
 812                         /* convert classname */
 813                         if (isclassname && c == '.')
 814                                 *pos++ = '/';
 815                         else
 816                                 *pos++ = (char) c;
 817
 818                 } else if (c < 0x800) {
 819                         /* 2 characters */
 820                 unsigned char high = c >> 6;
 821                 unsigned char low  = c & 0x3F;
 822                         left = left - 2;
 823                 if ((int) left < 0) break;
 824                 *pos++ = high | 0xC0;
 825                 *pos++ = low  | 0x80;
 826
 827                 } else {
 828                 /* 3 characters */
 829                 char low  = c & 0x3f;
 830                 char mid  = (c >> 6) & 0x3F;
 831                 char high = c >> 12;
 832                         left = left - 3;
 833                 if ((int) left < 0) break;
 834                 *pos++ = high | 0xE0;
 835                 *pos++ = mid  | 0x80;
 836                 *pos++ = low  | 0x80;
 837                 }
 838         }
 839
 840         /* insert utf-string into symbol-table */
 841         result = utf_new(buffer,buflength);
 842
 843         MFREE(buffer, char, buflength);
 844
 845         return result;
 846 }
 847
 848
 849 /* utf_new_char ****************************************************************
 850
 851    Creates a new utf symbol, the text for this symbol is passed as a
 852    c-string ( = char* ).
 853
 854 *******************************************************************************/
 855
 856 utf *utf_new_char(const char *text)
 857 {
 858         return utf_new(text, strlen(text));
 859 }
 860
 861
 862 /* utf_new_char_classname ******************************************************
 863
 864    Creates a new utf symbol, the text for this symbol is passed as a
 865    c-string ( = char* ) "." characters are going to be replaced by
 866    "/". Since the above function is used often, this is a separte
 867    function, instead of an if.
 868
 869 *******************************************************************************/
 870
 871 utf *utf_new_char_classname(const char *text)
 872 {
 873         if (strchr(text, '.')) {
 874                 char *txt = strdup(text);
 875                 char *end = txt + strlen(txt);
 876                 char *c;
 877                 utf *tmpRes;
 878
 879                 for (c = txt; c < end; c++)
 880                         if (*c == '.') *c = '/';
 881
 882                 tmpRes = utf_new(txt, strlen(txt));
 883                 FREE(txt, 0);
 884
 885                 return tmpRes;
 886
 887         } else
 888                 return utf_new(text, strlen(text));
 889 }
 890
 891
 892 /* utf_nextu2 ******************************************************************
 893
 894    Read the next unicode character from the utf string and increment
 895    the utf-string pointer accordingly.
 896
 897    CAUTION: This function is unsafe for input that was not checked
 898             by is_valid_utf!
 899
 900 *******************************************************************************/
 901
 902 u2 utf_nextu2(char **utf_ptr)
 903 {
 904     /* uncompressed unicode character */
 905     u2 unicode_char = 0;
 906     /* current position in utf text */
 907     unsigned char *utf = (unsigned char *) (*utf_ptr);
 908     /* bytes representing the unicode character */
 909     unsigned char ch1, ch2, ch3;
 910     /* number of bytes used to represent the unicode character */
 911     int len = 0;
 912
 913     switch ((ch1 = utf[0]) >> 4) {
 914         default: /* 1 byte */
 915                 (*utf_ptr)++;
 916                 return (u2) ch1;
 917         case 0xC:
 918         case 0xD: /* 2 bytes */
 919                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 920                         unsigned char high = ch1 & 0x1F;
 921                         unsigned char low  = ch2 & 0x3F;
 922                         unicode_char = (high << 6) + low;
 923                         len = 2;
 924                 }
 925                 break;
 926
 927         case 0xE: /* 2 or 3 bytes */
 928                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 929                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 930                                 unsigned char low  = ch3 & 0x3f;
 931                                 unsigned char mid  = ch2 & 0x3f;
 932                                 unsigned char high = ch1 & 0x0f;
 933                                 unicode_char = (((high << 6) + mid) << 6) + low;
 934                                 len = 3;
 935                         } else
 936                                 len = 2;
 937                 }
 938                 break;
 939     }
 940
 941     /* update position in utf-text */
 942     *utf_ptr = (char *) (utf + len);
 943
 944     return unicode_char;
 945 }
 946
 947
 948 /* utf_bytes *******************************************************************
 949
 950    Determine number of bytes (aka. octets) in the utf string.
 951
 952    IN:
 953       u............utf string
 954
 955    OUT:
 956       The number of octets of this utf string.
 957           There is _no_ terminating zero included in this count.
 958
 959 *******************************************************************************/
 960
 961 u4 utf_bytes(utf *u)
 962 {
 963         return u->blength;
 964 }
 965
 966
 967 /* utf_get_number_of_u2s_for_buffer ********************************************
 968
 969    Determine number of UTF-16 u2s in the given UTF-8 buffer
 970
 971    CAUTION: This function is unsafe for input that was not checked
 972             by is_valid_utf!
 973
 974    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 975    to an array of u2s (UTF-16) and want to know how many of them you will get.
 976    All other uses of this function are probably wrong.
 977
 978    IN:
 979       buffer........points to first char in buffer
 980           blength.......number of _bytes_ in the buffer
 981
 982    OUT:
 983       the number of u2s needed to hold this string in UTF-16 encoding.
 984           There is _no_ terminating zero included in this count.
 985
 986    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 987    exception.
 988
 989 *******************************************************************************/
 990
 991 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 992 {
 993         const char *endpos;                 /* points behind utf string           */
 994         const char *utf_ptr;                /* current position in utf text       */
 995         u4 len = 0;                         /* number of unicode characters       */
 996
 997         utf_ptr = buffer;
 998         endpos = utf_ptr + blength;
 999
1000         while (utf_ptr < endpos) {
1001                 len++;
1002                 /* next unicode character */
1003                 utf_nextu2((char **)&utf_ptr);
1004         }
1005
1006         assert(utf_ptr == endpos);
1007
1008         return len;
1009 }
1010
1011
1012 /* utf_get_number_of_u2s *******************************************************
1013
1014    Determine number of UTF-16 u2s in the utf string.
1015
1016    CAUTION: This function is unsafe for input that was not checked
1017             by is_valid_utf!
1018
1019    CAUTION: Use this function *only* when you want to convert a utf string
1020    to an array of u2s and want to know how many of them you will get.
1021    All other uses of this function are probably wrong.
1022
1023    IN:
1024       u............utf string
1025
1026    OUT:
1027       the number of u2s needed to hold this string in UTF-16 encoding.
1028           There is _no_ terminating zero included in this count.
1029           XXX 0 if a NullPointerException has been thrown (see below)
1030
1031 *******************************************************************************/
1032
1033 u4 utf_get_number_of_u2s(utf *u)
1034 {
1035         char *endpos;                       /* points behind utf string           */
1036         char *utf_ptr;                      /* current position in utf text       */
1037         u4 len = 0;                         /* number of unicode characters       */
1038
1039         /* XXX this is probably not checked by most callers! Review this after */
1040         /* the invalid uses of this function have been eliminated */
1041         if (u == NULL) {
1042                 exceptions_throw_nullpointerexception();
1043                 return 0;
1044         }
1045
1046         endpos = UTF_END(u);
1047         utf_ptr = u->text;
1048
1049         while (utf_ptr < endpos) {
1050                 len++;
1051                 /* next unicode character */
1052                 utf_nextu2(&utf_ptr);
1053         }
1054
1055         if (utf_ptr != endpos) {
1056                 /* string ended abruptly */
1057                 exceptions_throw_internalerror("Illegal utf8 string");
1058                 return 0;
1059         }
1060
1061         return len;
1062 }
1063
1064
1065 /* utf8_safe_number_of_u2s *****************************************************
1066
1067    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1068    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1069
1070    This function is safe even for invalid UTF-8 strings.
1071
1072    IN:
1073       text..........zero-terminated(!) UTF-8 string (may be invalid)
1074                         must NOT be NULL
1075           nbytes........strlen(text). (This is needed to completely emulate
1076                         the RI).
1077
1078    OUT:
1079       the number of u2s needed to hold this string in UTF-16 encoding.
1080           There is _no_ terminating zero included in this count.
1081
1082 *******************************************************************************/
1083
1084 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1085         register const unsigned char *t;
1086         register s4 byte;
1087         register s4 len;
1088         register const unsigned char *tlimit;
1089         s4 byte1;
1090         s4 byte2;
1091         s4 byte3;
1092         s4 value;
1093         s4 skip;
1094
1095         assert(text);
1096         assert(nbytes >= 0);
1097
1098         len = 0;
1099         t = (const unsigned char *) text;
1100         tlimit = t + nbytes;
1101
1102         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1103
1104         while (1) {
1105                 byte = *t++;
1106
1107                 if (byte & 0x80) {
1108                         /* highest bit set, non-ASCII character */
1109
1110                         if ((byte & 0xe0) == 0xc0) {
1111                                 /* 2-byte: should be 110..... 10...... ? */
1112
1113                                 if ((*t++ & 0xc0) == 0x80)
1114                                         ; /* valid 2-byte */
1115                                 else
1116                                         t--; /* invalid */
1117                         }
1118                         else if ((byte & 0xf0) == 0xe0) {
1119                                 /* 3-byte: should be 1110.... 10...... 10...... */
1120                                 /*                            ^t                */
1121
1122                                 if (t + 2 > tlimit)
1123                                         return len + 1; /* invalid, stop here */
1124
1125                                 if ((*t++ & 0xc0) == 0x80) {
1126                                         if ((*t++ & 0xc0) == 0x80)
1127                                                 ; /* valid 3-byte */
1128                                         else
1129                                                 t--; /* invalid */
1130                                 }
1131                                 else
1132                                         t--; /* invalid */
1133                         }
1134                         else if ((byte & 0xf8) == 0xf0) {
1135                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1136                                 /*                            ^t                         */
1137
1138                                 if (t + 3 > tlimit)
1139                                         return len + 1; /* invalid, stop here */
1140
1141                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1142                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1143                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1144                                                         /* valid 4-byte UTF-8? */
1145                                                         value = ((byte  & 0x07) << 18)
1146                                                                   | ((byte1 & 0x3f) << 12)
1147                                                                   | ((byte2 & 0x3f) <<  6)
1148                                                                   | ((byte3 & 0x3f)      );
1149
1150                                                         if (value > 0x10FFFF)
1151                                                                 ; /* invalid */
1152                                                         else if (value > 0xFFFF)
1153                                                                 len += 1; /* we need surrogates */
1154                                                         else
1155                                                                 ; /* 16bit suffice */
1156                                                 }
1157                                                 else
1158                                                         t--; /* invalid */
1159                                         }
1160                                         else
1161                                                 t--; /* invalid */
1162                                 }
1163                                 else
1164                                         t--; /* invalid */
1165                         }
1166                         else if ((byte & 0xfc) == 0xf8) {
1167                                 /* invalid 5-byte */
1168                                 if (t + 4 > tlimit)
1169                                         return len + 1; /* invalid, stop here */
1170
1171                                 skip = 4;
1172                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1173                                         t++;
1174                         }
1175                         else if ((byte & 0xfe) == 0xfc) {
1176                                 /* invalid 6-byte */
1177                                 if (t + 5 > tlimit)
1178                                         return len + 1; /* invalid, stop here */
1179
1180                                 skip = 5;
1181                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1182                                         t++;
1183                         }
1184                         else
1185                                 ; /* invalid */
1186                 }
1187                 else {
1188                         /* NUL */
1189
1190                         if (byte == 0)
1191                                 break;
1192
1193                         /* ASCII character, common case */
1194                 }
1195
1196                 len++;
1197         }
1198
1199         return len;
1200 }
1201
1202
1203 /* utf8_safe_convert_to_u2s ****************************************************
1204
1205    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1206    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1207    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1208
1209    This function is safe even for invalid UTF-8 strings.
1210
1211    IN:
1212       text..........zero-terminated(!) UTF-8 string (may be invalid)
1213                         must NOT be NULL
1214           nbytes........strlen(text). (This is needed to completely emulate
1215                                         the RI).
1216           buffer........a preallocated array of u2s to receive the decoded
1217                         string. Use utf8_safe_number_of_u2s to get the
1218                                         required number of u2s for allocating this.
1219
1220 *******************************************************************************/
1221
1222 #define UNICODE_REPLACEMENT  0xfffd
1223
1224 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1225         register const unsigned char *t;
1226         register s4 byte;
1227         register const unsigned char *tlimit;
1228         s4 byte1;
1229         s4 byte2;
1230         s4 byte3;
1231         s4 value;
1232         s4 skip;
1233
1234         assert(text);
1235         assert(nbytes >= 0);
1236
1237         t = (const unsigned char *) text;
1238         tlimit = t + nbytes;
1239
1240         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1241
1242         while (1) {
1243                 byte = *t++;
1244
1245                 if (byte & 0x80) {
1246                         /* highest bit set, non-ASCII character */
1247
1248                         if ((byte & 0xe0) == 0xc0) {
1249                                 /* 2-byte: should be 110..... 10...... */
1250
1251                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1252                                         /* valid 2-byte UTF-8 */
1253                                         *buffer++ = ((byte  & 0x1f) << 6)
1254                                                           | ((byte1 & 0x3f)     );
1255                                 }
1256                                 else {
1257                                         *buffer++ = UNICODE_REPLACEMENT;
1258                                         t--;
1259                                 }
1260                         }
1261                         else if ((byte & 0xf0) == 0xe0) {
1262                                 /* 3-byte: should be 1110.... 10...... 10...... */
1263
1264                                 if (t + 2 > tlimit) {
1265                                         *buffer++ = UNICODE_REPLACEMENT;
1266                                         return;
1267                                 }
1268
1269                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1270                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1271                                                 /* valid 3-byte UTF-8 */
1272                                                 *buffer++ = ((byte  & 0x0f) << 12)
1273                                                                   | ((byte1 & 0x3f) <<  6)
1274                                                                   | ((byte2 & 0x3f)      );
1275                                         }
1276                                         else {
1277                                                 *buffer++ = UNICODE_REPLACEMENT;
1278                                                 t--;
1279                                         }
1280                                 }
1281                                 else {
1282                                         *buffer++ = UNICODE_REPLACEMENT;
1283                                         t--;
1284                                 }
1285                         }
1286                         else if ((byte & 0xf8) == 0xf0) {
1287                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1288
1289                                 if (t + 3 > tlimit) {
1290                                         *buffer++ = UNICODE_REPLACEMENT;
1291                                         return;
1292                                 }
1293
1294                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1295                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1296                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1297                                                         /* valid 4-byte UTF-8? */
1298                                                         value = ((byte  & 0x07) << 18)
1299                                                                   | ((byte1 & 0x3f) << 12)
1300                                                                   | ((byte2 & 0x3f) <<  6)
1301                                                                   | ((byte3 & 0x3f)      );
1302
1303                                                         if (value > 0x10FFFF) {
1304                                                                 *buffer++ = UNICODE_REPLACEMENT;
1305                                                         }
1306                                                         else if (value > 0xFFFF) {
1307                                                                 /* we need surrogates */
1308                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1309                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1310                                                         }
1311                                                         else
1312                                                                 *buffer++ = value; /* 16bit suffice */
1313                                                 }
1314                                                 else {
1315                                                         *buffer++ = UNICODE_REPLACEMENT;
1316                                                         t--;
1317                                                 }
1318                                         }
1319                                         else {
1320                                                 *buffer++ = UNICODE_REPLACEMENT;
1321                                                 t--;
1322                                         }
1323                                 }
1324                                 else {
1325                                         *buffer++ = UNICODE_REPLACEMENT;
1326                                         t--;
1327                                 }
1328                         }
1329                         else if ((byte & 0xfc) == 0xf8) {
1330                                 if (t + 4 > tlimit) {
1331                                         *buffer++ = UNICODE_REPLACEMENT;
1332                                         return;
1333                                 }
1334
1335                                 skip = 4;
1336                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1337                                         t++;
1338                                 *buffer++ = UNICODE_REPLACEMENT;
1339                         }
1340                         else if ((byte & 0xfe) == 0xfc) {
1341                                 if (t + 5 > tlimit) {
1342                                         *buffer++ = UNICODE_REPLACEMENT;
1343                                         return;
1344                                 }
1345
1346                                 skip = 5;
1347                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1348                                         t++;
1349                                 *buffer++ = UNICODE_REPLACEMENT;
1350                         }
1351                         else
1352                                 *buffer++ = UNICODE_REPLACEMENT;
1353                 }
1354                 else {
1355                         /* NUL */
1356
1357                         if (byte == 0)
1358                                 break;
1359
1360                         /* ASCII character, common case */
1361
1362                         *buffer++ = byte;
1363                 }
1364         }
1365 }
1366
1367
1368 /* u2_utflength ****************************************************************
1369
1370    Returns the utf length in bytes of a u2 array.
1371
1372 *******************************************************************************/
1373
1374 u4 u2_utflength(u2 *text, u4 u2_length)
1375 {
1376         u4 result_len = 0;                  /* utf length in bytes                */
1377         u2 ch;                              /* current unicode character          */
1378         u4 len;
1379
1380         for (len = 0; len < u2_length; len++) {
1381                 /* next unicode character */
1382                 ch = *text++;
1383
1384                 /* determine bytes required to store unicode character as utf */
1385                 if (ch && (ch < 0x80))
1386                         result_len++;
1387                 else if (ch < 0x800)
1388                         result_len += 2;
1389                 else
1390                         result_len += 3;
1391         }
1392
1393     return result_len;
1394 }
1395
1396
1397 /* utf_copy ********************************************************************
1398
1399    Copy the given utf string byte-for-byte to a buffer.
1400
1401    IN:
1402       buffer.......the buffer
1403           u............the utf string
1404
1405 *******************************************************************************/
1406
1407 void utf_copy(char *buffer, utf *u)
1408 {
1409         /* our utf strings are zero-terminated (done by utf_new) */
1410         MCOPY(buffer, u->text, char, u->blength + 1);
1411 }
1412
1413
1414 /* utf_cat *********************************************************************
1415
1416    Append the given utf string byte-for-byte to a buffer.
1417
1418    IN:
1419       buffer.......the buffer
1420           u............the utf string
1421
1422 *******************************************************************************/
1423
1424 void utf_cat(char *buffer, utf *u)
1425 {
1426         /* our utf strings are zero-terminated (done by utf_new) */
1427         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1428 }
1429
1430
1431 /* utf_copy_classname **********************************************************
1432
1433    Copy the given utf classname byte-for-byte to a buffer.
1434    '/' is replaced by '.'
1435
1436    IN:
1437       buffer.......the buffer
1438           u............the utf string
1439
1440 *******************************************************************************/
1441
1442 void utf_copy_classname(char *buffer, utf *u)
1443 {
1444         char *bufptr;
1445         char *srcptr;
1446         char *endptr;
1447         char ch;
1448
1449         bufptr = buffer;
1450         srcptr = u->text;
1451         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1452
1453         while (srcptr != endptr) {
1454                 ch = *srcptr++;
1455                 if (ch == '/')
1456                         ch = '.';
1457                 *bufptr++ = ch;
1458         }
1459 }
1460
1461
1462 /* utf_cat *********************************************************************
1463
1464    Append the given utf classname byte-for-byte to a buffer.
1465    '/' is replaced by '.'
1466
1467    IN:
1468       buffer.......the buffer
1469           u............the utf string
1470
1471 *******************************************************************************/
1472
1473 void utf_cat_classname(char *buffer, utf *u)
1474 {
1475         utf_copy_classname(buffer + strlen(buffer), u);
1476 }
1477
1478 /* utf_display_printable_ascii *************************************************
1479
1480    Write utf symbol to stdout (for debugging purposes).
1481    Non-printable and non-ASCII characters are printed as '?'.
1482
1483 *******************************************************************************/
1484
1485 void utf_display_printable_ascii(utf *u)
1486 {
1487         char *endpos;                       /* points behind utf string           */
1488         char *utf_ptr;                      /* current position in utf text       */
1489
1490         if (u == NULL) {
1491                 printf("NULL");
1492                 fflush(stdout);
1493                 return;
1494         }
1495
1496         endpos = UTF_END(u);
1497         utf_ptr = u->text;
1498
1499         while (utf_ptr < endpos) {
1500                 /* read next unicode character */
1501
1502                 u2 c = utf_nextu2(&utf_ptr);
1503
1504                 if ((c >= 32) && (c <= 127))
1505                         printf("%c", c);
1506                 else
1507                         printf("?");
1508         }
1509
1510         fflush(stdout);
1511 }
1512
1513
1514 /* utf_display_printable_ascii_classname ***************************************
1515
1516    Write utf symbol to stdout with `/' converted to `.' (for debugging
1517    purposes).
1518    Non-printable and non-ASCII characters are printed as '?'.
1519
1520 *******************************************************************************/
1521
1522 void utf_display_printable_ascii_classname(utf *u)
1523 {
1524         char *endpos;                       /* points behind utf string           */
1525         char *utf_ptr;                      /* current position in utf text       */
1526
1527         if (u == NULL) {
1528                 printf("NULL");
1529                 fflush(stdout);
1530                 return;
1531         }
1532
1533         endpos = UTF_END(u);
1534         utf_ptr = u->text;
1535
1536         while (utf_ptr < endpos) {
1537                 /* read next unicode character */
1538
1539                 u2 c = utf_nextu2(&utf_ptr);
1540
1541                 if (c == '/')
1542                         c = '.';
1543
1544                 if ((c >= 32) && (c <= 127))
1545                         printf("%c", c);
1546                 else
1547                         printf("?");
1548         }
1549
1550         fflush(stdout);
1551 }
1552
1553
1554 /* utf_sprint_convert_to_latin1 ************************************************
1555
1556    Write utf symbol into c-string (for debugging purposes).
1557    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1558    invalid results.
1559
1560 *******************************************************************************/
1561
1562 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1563 {
1564         char *endpos;                       /* points behind utf string           */
1565         char *utf_ptr;                      /* current position in utf text       */
1566         u2 pos = 0;                         /* position in c-string               */
1567
1568         if (!u) {
1569                 strcpy(buffer, "NULL");
1570                 return;
1571         }
1572
1573         endpos = UTF_END(u);
1574         utf_ptr = u->text;
1575
1576         while (utf_ptr < endpos)
1577                 /* copy next unicode character */
1578                 buffer[pos++] = utf_nextu2(&utf_ptr);
1579
1580         /* terminate string */
1581         buffer[pos] = '\0';
1582 }
1583
1584
1585 /* utf_sprint_convert_to_latin1_classname **************************************
1586
1587    Write utf symbol into c-string with `/' converted to `.' (for debugging
1588    purposes).
1589    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1590    invalid results.
1591
1592 *******************************************************************************/
1593
1594 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1595 {
1596         char *endpos;                       /* points behind utf string           */
1597         char *utf_ptr;                      /* current position in utf text       */
1598         u2 pos = 0;                         /* position in c-string               */
1599
1600         if (!u) {
1601                 strcpy(buffer, "NULL");
1602                 return;
1603         }
1604
1605         endpos = UTF_END(u);
1606         utf_ptr = u->text;
1607
1608         while (utf_ptr < endpos) {
1609                 /* copy next unicode character */
1610                 u2 c = utf_nextu2(&utf_ptr);
1611                 if (c == '/') c = '.';
1612                 buffer[pos++] = c;
1613         }
1614
1615         /* terminate string */
1616         buffer[pos] = '\0';
1617 }
1618
1619
1620 /* utf_strcat_convert_to_latin1 ************************************************
1621
1622    Like libc strcat, but uses an utf8 string.
1623    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1624    invalid results.
1625
1626 *******************************************************************************/
1627
1628 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1629 {
1630         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1631 }
1632
1633
1634 /* utf_strcat_convert_to_latin1_classname **************************************
1635
1636    Like libc strcat, but uses an utf8 string.
1637    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1638    invalid results.
1639
1640 *******************************************************************************/
1641
1642 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1643 {
1644         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1645 }
1646
1647
1648 /* utf_fprint_printable_ascii **************************************************
1649
1650    Write utf symbol into file.
1651    Non-printable and non-ASCII characters are printed as '?'.
1652
1653 *******************************************************************************/
1654
1655 void utf_fprint_printable_ascii(FILE *file, utf *u)
1656 {
1657         char *endpos;                       /* points behind utf string           */
1658         char *utf_ptr;                      /* current position in utf text       */
1659
1660         if (!u)
1661                 return;
1662
1663         endpos = UTF_END(u);
1664         utf_ptr = u->text;
1665
1666         while (utf_ptr < endpos) {
1667                 /* read next unicode character */
1668                 u2 c = utf_nextu2(&utf_ptr);
1669
1670                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1671                 else fprintf(file, "?");
1672         }
1673 }
1674
1675
1676 /* utf_fprint_printable_ascii_classname ****************************************
1677
1678    Write utf symbol into file with `/' converted to `.'.
1679    Non-printable and non-ASCII characters are printed as '?'.
1680
1681 *******************************************************************************/
1682
1683 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1684 {
1685         char *endpos;                       /* points behind utf string           */
1686         char *utf_ptr;                      /* current position in utf text       */
1687
1688     if (!u)
1689                 return;
1690
1691         endpos = UTF_END(u);
1692         utf_ptr = u->text;
1693
1694         while (utf_ptr < endpos) {
1695                 /* read next unicode character */
1696                 u2 c = utf_nextu2(&utf_ptr);
1697                 if (c == '/') c = '.';
1698
1699                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1700                 else fprintf(file, "?");
1701         }
1702 }
1703
1704
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   accepted here:

   - a 0x00 byte is never allowed,
   - only one-, two- and three-byte sequences are accepted (longer
     sequences are rejected: Java limitation),
   - every continuation byte must have the form 10xx xxxx,
   - the code point zero is accepted only in its two-byte form
     (Java special).

   Deliberately NOT rejected: overlong encodings (Sun Java seems to allow
   them), surrogates (0xd800..0xdfff) and the code points 0xfffe/0xffff
   (they seem to be allowed in Java classfiles).

   An empty range (utf_ptr == end_pos) is valid; end_pos < utf_ptr is not.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur   = (const unsigned char *) utf_ptr;
	const unsigned char *limit = (const unsigned char *) end_pos;

	if (limit < cur)
		return false;

	while (cur < limit) {
		unsigned char lead = *cur++;    /* first byte of the sequence         */
		unsigned long value;            /* decoded code point                 */
		int           trailing;         /* number of continuation bytes       */
		int           k;

		if (lead == 0x00)
			return false;                           /* 0x00 is not allowed */

		if (lead < 0x80)
			continue;                               /* ASCII */

		/* Only 110x xxxx and 1110 xxxx may start a sequence. Longer
		   forms, stray continuation bytes and 0xfe/0xff are refused. */

		if ((lead & 0xe0) == 0xc0)
			trailing = 1;
		else if ((lead & 0xf0) == 0xe0)
			trailing = 2;
		else
			return false;

		if ((limit - cur) < trailing)
			return false;                           /* missing bytes */

		/* payload bits of the lead byte */

		value = lead & (0x3f >> trailing);

		for (k = 0; k < trailing; k++) {
			unsigned char next = *cur++;

			if ((next & 0xc0) != 0x80)              /* 10xx xxxx */
				return false;

			value = (value << 6) | (next & 0x3f);
		}

		/* zero may only appear as the two-byte sequence */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1770
1771
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are inspected.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* disallow zero; the second byte is only examined if it still
		   belongs to the string, so that a byte behind end_pos can
		   neither be read nor influence the result */

		if (c == 0xc0 && utf_ptr < end_pos && (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1799
1800 bool is_valid_name_utf(utf *u)
1801 {
1802         return is_valid_name(u->text, UTF_END(u));
1803 }
1804
1805
1806 /* utf_show ********************************************************************
1807
1808    Writes the utf symbols in the utfhash to stdout and displays the
1809    number of external hash chains grouped according to the chainlength
1810    (for debugging purposes).
1811
1812 *******************************************************************************/
1813
1814 #if !defined(NDEBUG)
1815 void utf_show(void)
1816 {
1817
1818 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1819
1820         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1821         u4 max_chainlength = 0;      /* maximum length of the chains */
1822         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1823         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1824         u4 i;
1825
1826         printf("UTF-HASH:\n");
1827
1828         /* show element of utf-hashtable */
1829
1830         for (i = 0; i < hashtable_utf->size; i++) {
1831                 utf *u = hashtable_utf->ptr[i];
1832
1833                 if (u) {
1834                         printf("SLOT %d: ", (int) i);
1835
1836                         while (u) {
1837                                 printf("'");
1838                                 utf_display_printable_ascii(u);
1839                                 printf("' ");
1840                                 u = u->hashlink;
1841                         }
1842                         printf("\n");
1843                 }
1844         }
1845
1846         printf("UTF-HASH: %d slots for %d entries\n",
1847                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1848
1849         if (hashtable_utf->entries == 0)
1850                 return;
1851
1852         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1853
1854         for (i=0;i<CHAIN_LIMIT;i++)
1855                 chain_count[i]=0;
1856
1857         /* count numbers of hashchains according to their length */
1858         for (i=0; i<hashtable_utf->size; i++) {
1859
1860                 utf *u = (utf*) hashtable_utf->ptr[i];
1861                 u4 chain_length = 0;
1862
1863                 /* determine chainlength */
1864                 while (u) {
1865                         u = u->hashlink;
1866                         chain_length++;
1867                 }
1868
1869                 /* update sum of all chainlengths */
1870                 sum_chainlength+=chain_length;
1871
1872                 /* determine the maximum length of the chains */
1873                 if (chain_length>max_chainlength)
1874                         max_chainlength = chain_length;
1875
1876                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1877                 if (chain_length>=CHAIN_LIMIT) {
1878                         beyond_limit+=chain_length;
1879                         chain_length=CHAIN_LIMIT-1;
1880                 }
1881
1882                 /* update number of hashchains of current length */
1883                 chain_count[chain_length]++;
1884         }
1885
1886         /* display results */
1887         for (i=1;i<CHAIN_LIMIT-1;i++)
1888                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1889
1890         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1891
1892
1893         printf("max. chainlength:%5d\n",max_chainlength);
1894
1895         /* avg. chainlength = sum of chainlengths / number of chains */
1896         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1897 }
1898 #endif /* !defined(NDEBUG) */
1899
1900
1901 /*
1902  * These are local overrides for various environment variables in Emacs.
1903  * Please do not remove this and leave it at the end of the file, where
1904  * Emacs will automagically detect them.
1905  * ---------------------------------------------------------------------
1906  * Local variables:
1907  * mode: c
1908  * indent-tabs-mode: t
1909  * c-basic-offset: 4
1910  * tab-width: 4
1911  * End:
1912  * vim:noexpandtab:sw=4:ts=4:
1913  */