src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007, 2008
   4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
   5
   6    This file is part of CACAO.
   7
   8    This program is free software; you can redistribute it and/or
   9    modify it under the terms of the GNU General Public License as
  10    published by the Free Software Foundation; either version 2, or (at
  11    your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful, but
  14    WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16    General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; if not, write to the Free Software
  20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  21    02110-1301, USA.
  22
  23 */
  24
  25
  26 #include "config.h"
  27
  28 #include <string.h>
  29 #include <assert.h>
  30
  31 #include "vm/types.h"
  32
  33 #include "mm/memory.h"
  34
  35 #include "threads/lock-common.h"
  36
  37 #include "toolbox/hashtable.h"
  38
  39 #include "vm/exceptions.h"
  40
  41 #include "vmcore/options.h"
  42
  43 #if defined(ENABLE_STATISTICS)
  44 # include "vmcore/statistics.h"
  45 #endif
  46
  47 #include "vmcore/utf8.h"
  48
  49
  50 /* global variables ***********************************************************/
  51
  52 /* hashsize must be power of 2 */
  53
  54 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  55
  56 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  57
  58
  59 /* utf-symbols for pointer comparison of frequently used strings **************/
  60
  61 utf *utf_java_lang_Object;
  62
  63 utf *utf_java_lang_Class;
  64 utf *utf_java_lang_ClassLoader;
  65 utf *utf_java_lang_Cloneable;
  66 utf *utf_java_lang_SecurityManager;
  67 utf *utf_java_lang_String;
  68 utf *utf_java_lang_ThreadGroup;
  69 utf *utf_java_lang_ref_SoftReference;
  70 utf *utf_java_lang_ref_WeakReference;
  71 utf *utf_java_lang_ref_PhantomReference;
  72 utf *utf_java_io_Serializable;
  73
  74 utf *utf_java_lang_Throwable;
  75 utf *utf_java_lang_Error;
  76
  77 utf *utf_java_lang_AbstractMethodError;
  78 utf *utf_java_lang_ClassCircularityError;
  79 utf *utf_java_lang_ClassFormatError;
  80 utf *utf_java_lang_ExceptionInInitializerError;
  81 utf *utf_java_lang_IncompatibleClassChangeError;
  82 utf *utf_java_lang_InstantiationError;
  83 utf *utf_java_lang_InternalError;
  84 utf *utf_java_lang_LinkageError;
  85 utf *utf_java_lang_NoClassDefFoundError;
  86 utf *utf_java_lang_NoSuchFieldError;
  87 utf *utf_java_lang_NoSuchMethodError;
  88 utf *utf_java_lang_OutOfMemoryError;
  89 utf *utf_java_lang_UnsatisfiedLinkError;
  90 utf *utf_java_lang_UnsupportedClassVersionError;
  91 utf *utf_java_lang_VerifyError;
  92 utf *utf_java_lang_VirtualMachineError;
  93
  94 utf *utf_java_lang_Exception;
  95
  96 utf *utf_java_lang_ArithmeticException;
  97 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
  98 utf *utf_java_lang_ArrayStoreException;
  99 utf *utf_java_lang_ClassCastException;
 100 utf *utf_java_lang_ClassNotFoundException;
 101 utf *utf_java_lang_CloneNotSupportedException;
 102 utf *utf_java_lang_IllegalAccessException;
 103 utf *utf_java_lang_IllegalArgumentException;
 104 utf *utf_java_lang_IllegalMonitorStateException;
 105 utf *utf_java_lang_InstantiationException;
 106 utf *utf_java_lang_InterruptedException;
 107 utf *utf_java_lang_NegativeArraySizeException;
 108 utf *utf_java_lang_NullPointerException;
 109 utf *utf_java_lang_StringIndexOutOfBoundsException;
 110
 111 utf *utf_java_lang_reflect_InvocationTargetException;
 112
 113 utf *utf_java_security_PrivilegedActionException;
 114
 115 #if defined(ENABLE_JAVASE)
 116 utf* utf_java_lang_Void;
 117 #endif
 118
 119 utf* utf_java_lang_Boolean;
 120 utf* utf_java_lang_Byte;
 121 utf* utf_java_lang_Character;
 122 utf* utf_java_lang_Short;
 123 utf* utf_java_lang_Integer;
 124 utf* utf_java_lang_Long;
 125 utf* utf_java_lang_Float;
 126 utf* utf_java_lang_Double;
 127
 128 #if defined(ENABLE_JAVASE)
 129 utf *utf_java_lang_StackTraceElement;
 130 utf *utf_java_lang_reflect_Constructor;
 131 utf *utf_java_lang_reflect_Field;
 132 utf *utf_java_lang_reflect_Method;
 133 utf *utf_java_util_Vector;
 134 #endif
 135
 136 utf *utf_InnerClasses;                  /* InnerClasses                       */
 137 utf *utf_ConstantValue;                 /* ConstantValue                      */
 138 utf *utf_Code;                          /* Code                               */
 139 utf *utf_Exceptions;                    /* Exceptions                         */
 140 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 141 utf *utf_SourceFile;                    /* SourceFile                         */
 142
 143 #if defined(ENABLE_JAVASE)
 144 utf *utf_EnclosingMethod;
 145 utf *utf_Signature;
 146 utf *utf_StackMapTable;
 147
 148 #if defined(ENABLE_ANNOTATIONS)
 149 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
 150 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
 151 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
 152 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
 153 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
 154 #endif
 155 #endif
 156
 157 utf *utf_init;                          /* <init>                             */
 158 utf *utf_clinit;                        /* <clinit>                           */
 159 utf *utf_clone;                         /* clone                              */
 160 utf *utf_finalize;                      /* finalize                           */
 161 utf *utf_run;                           /* run                                */
 162
 163 utf *utf_add;
 164 utf *utf_remove;
 165 utf *utf_addThread;
 166 utf *utf_removeThread;
 167 utf *utf_put;
 168 utf *utf_get;
 169 utf *utf_uncaughtException;
 170 utf *utf_value;
 171
 172 utf *utf_fillInStackTrace;
 173 utf *utf_findNative;
 174 utf *utf_getSystemClassLoader;
 175 utf *utf_initCause;
 176 utf *utf_loadClass;
 177 utf *utf_loadClassInternal;
 178 utf *utf_printStackTrace;
 179
 180 utf *utf_division_by_zero;
 181
 182 utf *utf_Z;                             /* Z                                  */
 183 utf *utf_B;                             /* B                                  */
 184 utf *utf_C;                             /* C                                  */
 185 utf *utf_S;                             /* S                                  */
 186 utf *utf_I;                             /* I                                  */
 187 utf *utf_J;                             /* J                                  */
 188 utf *utf_F;                             /* F                                  */
 189 utf *utf_D;                             /* D                                  */
 190
 191 utf *utf_void__void;                    /* ()V                                */
 192 utf *utf_boolean__void;                 /* (Z)V                               */
 193 utf *utf_byte__void;                    /* (B)V                               */
 194 utf *utf_char__void;                    /* (C)V                               */
 195 utf *utf_short__void;                   /* (S)V                               */
 196 utf *utf_int__void;                     /* (I)V                               */
 197 utf *utf_long__void;                    /* (J)V                               */
 198 utf *utf_float__void;                   /* (F)V                               */
 199 utf *utf_double__void;                  /* (D)V                               */
 200
 201 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 202 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 203 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 204 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 205 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 206 utf *utf_java_lang_Object__java_lang_Object;
 207 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 208 utf *utf_java_lang_String__java_lang_Class;
 209 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 210 utf *utf_java_lang_Thread_java_lang_Throwable__V;
 211 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 212 utf *utf_java_lang_Throwable__java_lang_Throwable;
 213
 214 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 215 utf *utf_null;
 216 utf *array_packagename;
 217
 218
 219 /* utf_init ********************************************************************
 220
 221    Initializes the utf8 subsystem.
 222
 223 *******************************************************************************/
 224
 225 bool utf8_init(void)
 226 {
 227         TRACESUBSYSTEMINITIALIZATION("utf8_init");
 228
 229         /* create utf8 hashtable */
 230
 231         hashtable_utf = NEW(hashtable);
 232
 233         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 234
 235 #if defined(ENABLE_STATISTICS)
 236         if (opt_stat)
 237                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 238 #endif
 239
 240         /* create utf-symbols for pointer comparison of frequently used strings */
 241
 242         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 243
 244         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 245         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 246         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 247         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 248         utf_java_lang_String           = utf_new_char("java/lang/String");
 249         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 250
 251         utf_java_lang_ref_SoftReference =
 252                 utf_new_char("java/lang/ref/SoftReference");
 253
 254         utf_java_lang_ref_WeakReference =
 255                 utf_new_char("java/lang/ref/WeakReference");
 256
 257         utf_java_lang_ref_PhantomReference =
 258                 utf_new_char("java/lang/ref/PhantomReference");
 259
 260         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 261
 262         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 263         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 264
 265         utf_java_lang_ClassCircularityError =
 266                 utf_new_char("java/lang/ClassCircularityError");
 267
 268         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 269
 270         utf_java_lang_ExceptionInInitializerError =
 271                 utf_new_char("java/lang/ExceptionInInitializerError");
 272
 273         utf_java_lang_IncompatibleClassChangeError =
 274                 utf_new_char("java/lang/IncompatibleClassChangeError");
 275
 276         utf_java_lang_InstantiationError =
 277                 utf_new_char("java/lang/InstantiationError");
 278
 279         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 280         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 281
 282         utf_java_lang_NoClassDefFoundError =
 283                 utf_new_char("java/lang/NoClassDefFoundError");
 284
 285         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 286
 287         utf_java_lang_UnsatisfiedLinkError =
 288                 utf_new_char("java/lang/UnsatisfiedLinkError");
 289
 290         utf_java_lang_UnsupportedClassVersionError =
 291                 utf_new_char("java/lang/UnsupportedClassVersionError");
 292
 293         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 294
 295         utf_java_lang_VirtualMachineError =
 296                 utf_new_char("java/lang/VirtualMachineError");
 297
 298 #if defined(ENABLE_JAVASE)
 299         utf_java_lang_AbstractMethodError =
 300                 utf_new_char("java/lang/AbstractMethodError");
 301
 302         utf_java_lang_NoSuchFieldError =
 303                 utf_new_char("java/lang/NoSuchFieldError");
 304
 305         utf_java_lang_NoSuchMethodError =
 306                 utf_new_char("java/lang/NoSuchMethodError");
 307 #endif
 308
 309         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 310
 311         utf_java_lang_ArithmeticException =
 312                 utf_new_char("java/lang/ArithmeticException");
 313
 314         utf_java_lang_ArrayIndexOutOfBoundsException =
 315                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 316
 317         utf_java_lang_ArrayStoreException =
 318                 utf_new_char("java/lang/ArrayStoreException");
 319
 320         utf_java_lang_ClassCastException =
 321                 utf_new_char("java/lang/ClassCastException");
 322
 323         utf_java_lang_ClassNotFoundException =
 324                 utf_new_char("java/lang/ClassNotFoundException");
 325
 326         utf_java_lang_CloneNotSupportedException =
 327                 utf_new_char("java/lang/CloneNotSupportedException");
 328
 329         utf_java_lang_IllegalAccessException =
 330                 utf_new_char("java/lang/IllegalAccessException");
 331
 332         utf_java_lang_IllegalArgumentException =
 333                 utf_new_char("java/lang/IllegalArgumentException");
 334
 335         utf_java_lang_IllegalMonitorStateException =
 336                 utf_new_char("java/lang/IllegalMonitorStateException");
 337
 338         utf_java_lang_InstantiationException =
 339                 utf_new_char("java/lang/InstantiationException");
 340
 341         utf_java_lang_InterruptedException =
 342                 utf_new_char("java/lang/InterruptedException");
 343
 344         utf_java_lang_NegativeArraySizeException =
 345                 utf_new_char("java/lang/NegativeArraySizeException");
 346
 347         utf_java_lang_NullPointerException =
 348                 utf_new_char("java/lang/NullPointerException");
 349
 350         utf_java_lang_StringIndexOutOfBoundsException =
 351                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 352
 353         utf_java_lang_reflect_InvocationTargetException =
 354                 utf_new_char("java/lang/reflect/InvocationTargetException");
 355
 356         utf_java_security_PrivilegedActionException =
 357                 utf_new_char("java/security/PrivilegedActionException");
 358
 359 #if defined(ENABLE_JAVASE)
 360         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 361 #endif
 362
 363         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 364         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 365         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 366         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 367         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 368         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 369         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 370         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 371
 372 #if defined(ENABLE_JAVASE)
 373         utf_java_lang_StackTraceElement =
 374                 utf_new_char("java/lang/StackTraceElement");
 375
 376         utf_java_lang_reflect_Constructor =
 377                 utf_new_char("java/lang/reflect/Constructor");
 378
 379         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 380         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 381         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 382 #endif
 383
 384         utf_InnerClasses               = utf_new_char("InnerClasses");
 385         utf_ConstantValue              = utf_new_char("ConstantValue");
 386         utf_Code                       = utf_new_char("Code");
 387         utf_Exceptions                 = utf_new_char("Exceptions");
 388         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 389         utf_SourceFile                 = utf_new_char("SourceFile");
 390
 391 #if defined(ENABLE_JAVASE)
 392         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 393         utf_Signature                  = utf_new_char("Signature");
 394         utf_StackMapTable              = utf_new_char("StackMapTable");
 395
 396 #if defined(ENABLE_ANNOTATIONS)
 397         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 398         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 399         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 400         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 401         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 402 #endif
 403 #endif
 404
 405         utf_init                           = utf_new_char("<init>");
 406         utf_clinit                         = utf_new_char("<clinit>");
 407         utf_clone                      = utf_new_char("clone");
 408         utf_finalize                   = utf_new_char("finalize");
 409         utf_run                        = utf_new_char("run");
 410
 411         utf_add                        = utf_new_char("add");
 412         utf_remove                     = utf_new_char("remove");
 413         utf_addThread                  = utf_new_char("addThread");
 414         utf_removeThread               = utf_new_char("removeThread");
 415         utf_put                        = utf_new_char("put");
 416         utf_get                        = utf_new_char("get");
 417         utf_uncaughtException          = utf_new_char("uncaughtException");
 418         utf_value                      = utf_new_char("value");
 419
 420         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 421         utf_findNative                 = utf_new_char("findNative");
 422         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 423         utf_initCause                  = utf_new_char("initCause");
 424         utf_loadClass                  = utf_new_char("loadClass");
 425         utf_loadClassInternal          = utf_new_char("loadClassInternal");
 426         utf_printStackTrace            = utf_new_char("printStackTrace");
 427
 428         utf_division_by_zero           = utf_new_char("/ by zero");
 429
 430         utf_Z                          = utf_new_char("Z");
 431         utf_B                          = utf_new_char("B");
 432         utf_C                          = utf_new_char("C");
 433         utf_S                          = utf_new_char("S");
 434         utf_I                          = utf_new_char("I");
 435         utf_J                          = utf_new_char("J");
 436         utf_F                          = utf_new_char("F");
 437         utf_D                          = utf_new_char("D");
 438
 439         utf_void__void                 = utf_new_char("()V");
 440         utf_boolean__void              = utf_new_char("(Z)V");
 441         utf_byte__void                 = utf_new_char("(B)V");
 442         utf_char__void                 = utf_new_char("(C)V");
 443         utf_short__void                = utf_new_char("(S)V");
 444         utf_int__void                  = utf_new_char("(I)V");
 445         utf_long__void                 = utf_new_char("(J)V");
 446         utf_float__void                = utf_new_char("(F)V");
 447         utf_double__void               = utf_new_char("(D)V");
 448         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 449         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 450
 451         utf_void__java_lang_ClassLoader =
 452                 utf_new_char("()Ljava/lang/ClassLoader;");
 453
 454         utf_java_lang_ClassLoader_java_lang_String__J =
 455                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 456
 457         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 458
 459         utf_java_lang_Object__java_lang_Object =
 460                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 461
 462         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 463
 464         utf_java_lang_String__java_lang_Class =
 465                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 466
 467         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 468
 469         utf_java_lang_Thread_java_lang_Throwable__V =
 470                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 471
 472         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 473
 474         utf_java_lang_Throwable__java_lang_Throwable =
 475                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 476
 477         utf_null                       = utf_new_char("null");
 478         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 479         array_packagename              = utf_new_char("\t<the array package>");
 480
 481         /* everything's ok */
 482
 483         return true;
 484 }
 485
 486
 487 /* utf_hashkey *****************************************************************
 488
 489    The hashkey is computed from the utf-text by using up to 8
 490    characters.  For utf-symbols longer than 15 characters 3 characters
 491    are taken from the beginning and the end, 2 characters are taken
 492    from the middle.
 493
 494 *******************************************************************************/
 495
 496 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 497 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 498
 499 u4 utf_hashkey(const char *text, u4 length)
 500 {
 501         const char *start_pos = text;       /* pointer to utf text                */
 502         u4 a;
 503
 504         switch (length) {
 505         case 0: /* empty string */
 506                 return 0;
 507
 508         case 1: return fbs(0);
 509         case 2: return fbs(0) ^ nbs(3);
 510         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 511         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 512         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 513         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 514         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 515         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 516
 517         case 9:
 518                 a = fbs(0);
 519                 a ^= nbs(1);
 520                 a ^= nbs(2);
 521                 text++;
 522                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 523
 524         case 10:
 525                 a = fbs(0);
 526                 text++;
 527                 a ^= nbs(2);
 528                 a ^= nbs(3);
 529                 a ^= nbs(4);
 530                 text++;
 531                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 532
 533         case 11:
 534                 a = fbs(0);
 535                 text++;
 536                 a ^= nbs(2);
 537                 a ^= nbs(3);
 538                 a ^= nbs(4);
 539                 text++;
 540                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 541
 542         case 12:
 543                 a = fbs(0);
 544                 text += 2;
 545                 a ^= nbs(2);
 546                 a ^= nbs(3);
 547                 text++;
 548                 a ^= nbs(5);
 549                 a ^= nbs(6);
 550                 a ^= nbs(7);
 551                 text++;
 552                 return a ^ nbs(9) ^ nbs(10);
 553
 554         case 13:
 555                 a = fbs(0);
 556                 a ^= nbs(1);
 557                 text++;
 558                 a ^= nbs(3);
 559                 a ^= nbs(4);
 560                 text += 2;
 561                 a ^= nbs(7);
 562                 a ^= nbs(8);
 563                 text += 2;
 564                 return a ^ nbs(9) ^ nbs(10);
 565
 566         case 14:
 567                 a = fbs(0);
 568                 text += 2;
 569                 a ^= nbs(3);
 570                 a ^= nbs(4);
 571                 text += 2;
 572                 a ^= nbs(7);
 573                 a ^= nbs(8);
 574                 text += 2;
 575                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 576
 577         case 15:
 578                 a = fbs(0);
 579                 text += 2;
 580                 a ^= nbs(3);
 581                 a ^= nbs(4);
 582                 text += 2;
 583                 a ^= nbs(7);
 584                 a ^= nbs(8);
 585                 text += 2;
 586                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 587
 588         default:  /* 3 characters from beginning */
 589                 a = fbs(0);
 590                 text += 2;
 591                 a ^= nbs(3);
 592                 a ^= nbs(4);
 593
 594                 /* 2 characters from middle */
 595                 text = start_pos + (length / 2);
 596                 a ^= fbs(5);
 597                 text += 2;
 598                 a ^= nbs(6);
 599
 600                 /* 3 characters from end */
 601                 text = start_pos + length - 4;
 602
 603                 a ^= fbs(7);
 604                 text++;
 605
 606                 return a ^ nbs(10) ^ nbs(11);
 607     }
 608 }
 609
 610 /* utf_full_hashkey ************************************************************
 611
 612    This function computes a hash value using all bytes in the string.
 613
 614    The algorithm is the "One-at-a-time" algorithm as published
 615    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 616
 617 *******************************************************************************/
 618
 619 u4 utf_full_hashkey(const char *text, u4 length)
 620 {
 621         register const unsigned char *p = (const unsigned char *) text;
 622         register u4 hash;
 623         register u4 i;
 624
 625         hash = 0;
 626         for (i=length; i--;)
 627         {
 628             hash += *p++;
 629             hash += (hash << 10);
 630             hash ^= (hash >> 6);
 631         }
 632         hash += (hash << 3);
 633         hash ^= (hash >> 11);
 634         hash += (hash << 15);
 635
 636         return hash;
 637 }
 638
 639 /* unicode_hashkey *************************************************************
 640
 641    Compute the hashkey of a unicode string.
 642
 643 *******************************************************************************/
 644
 645 u4 unicode_hashkey(u2 *text, u2 len)
 646 {
 647         return utf_hashkey((char *) text, len);
 648 }
 649
 650
 651 /* utf_new *********************************************************************
 652
 653    Creates a new utf-symbol, the text of the symbol is passed as a
 654    u1-array. The function searches the utf-hashtable for a utf-symbol
 655    with this text. On success the element returned, otherwise a new
 656    hashtable element is created.
 657
 658    If the number of entries in the hashtable exceeds twice the size of
 659    the hashtable slots a reorganization of the hashtable is done and
 660    the utf symbols are copied to a new hashtable with doubled size.
 661
 662 *******************************************************************************/
 663
 664 utf *utf_new(const char *text, u2 length)
 665 {
 666         u4 key;                             /* hashkey computed from utf-text     */
 667         u4 slot;                            /* slot in hashtable                  */
 668         utf *u;                             /* hashtable element                  */
 669         u2 i;
 670
 671         LOCK_MONITOR_ENTER(hashtable_utf->header);
 672
 673 #if defined(ENABLE_STATISTICS)
 674         if (opt_stat)
 675                 count_utf_new++;
 676 #endif
 677
 678         key  = utf_hashkey(text, length);
 679         slot = key & (hashtable_utf->size - 1);
 680         u    = hashtable_utf->ptr[slot];
 681
 682         /* search external hash chain for utf-symbol */
 683
 684         while (u) {
 685                 if (u->blength == length) {
 686                         /* compare text of hashtable elements */
 687
 688                         for (i = 0; i < length; i++)
 689                                 if (text[i] != u->text[i])
 690                                         goto nomatch;
 691
 692 #if defined(ENABLE_STATISTICS)
 693                         if (opt_stat)
 694                                 count_utf_new_found++;
 695 #endif
 696
 697                         /* symbol found in hashtable */
 698
 699                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 700
 701                         return u;
 702                 }
 703
 704         nomatch:
 705                 u = u->hashlink; /* next element in external chain */
 706         }
 707
 708         /* location in hashtable found, create new utf element */
 709
 710         u = NEW(utf);
 711
 712         u->blength  = length;               /* length in bytes of utfstring       */
 713         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 714         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 715
 716         memcpy(u->text, text, length);      /* copy utf-text                      */
 717         u->text[length] = '\0';
 718
 719 #if defined(ENABLE_STATISTICS)
 720         if (opt_stat)
 721                 count_utf_len += sizeof(utf) + length + 1;
 722 #endif
 723
 724         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 725         hashtable_utf->entries++;           /* update number of entries           */
 726
 727         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 728
 729         /* reorganization of hashtable, average length of the external
 730            chains is approx. 2 */
 731
 732                 hashtable *newhash;                              /* the new hashtable */
 733                 u4         i;
 734                 utf       *u;
 735                 utf       *nextu;
 736                 u4         slot;
 737
 738                 /* create new hashtable, double the size */
 739
 740                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 741
 742 #if defined(ENABLE_STATISTICS)
 743                 if (opt_stat)
 744                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 745 #endif
 746
 747                 /* transfer elements to new hashtable */
 748
 749                 for (i = 0; i < hashtable_utf->size; i++) {
 750                         u = hashtable_utf->ptr[i];
 751
 752                         while (u) {
 753                                 nextu = u->hashlink;
 754                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 755
 756                                 u->hashlink = (utf *) newhash->ptr[slot];
 757                                 newhash->ptr[slot] = u;
 758
 759                                 /* follow link in external hash chain */
 760
 761                                 u = nextu;
 762                         }
 763                 }
 764
 765                 /* dispose old table */
 766
 767                 hashtable_free(hashtable_utf);
 768
 769                 hashtable_utf = newhash;
 770         }
 771
 772         LOCK_MONITOR_EXIT(hashtable_utf->header);
 773
 774         return u;
 775 }
 776
 777
 778 /* utf_new_u2 ******************************************************************
 779
 780    Make utf symbol from u2 array, if isclassname is true '.' is
 781    replaced by '/'.
 782
 783 *******************************************************************************/
 784
 785 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 786 {
 787         char *buffer;                   /* memory buffer for  unicode characters  */
 788         char *pos;                      /* pointer to current position in buffer  */
 789         u4 left;                        /* unicode characters left                */
 790         u4 buflength;                   /* utf length in bytes of the u2 array    */
 791         utf *result;                    /* resulting utf-string                   */
 792         int i;
 793
 794         /* determine utf length in bytes and allocate memory */
 795
 796         buflength = u2_utflength(unicode_pos, unicode_length);
 797         buffer    = MNEW(char, buflength);
 798
 799         left = buflength;
 800         pos  = buffer;
 801
 802         for (i = 0; i++ < unicode_length; unicode_pos++) {
 803                 /* next unicode character */
 804                 u2 c = *unicode_pos;
 805
 806                 if ((c != 0) && (c < 0x80)) {
 807                         /* 1 character */
 808                         left--;
 809                 if ((int) left < 0) break;
 810                         /* convert classname */
 811                         if (isclassname && c == '.')
 812                                 *pos++ = '/';
 813                         else
 814                                 *pos++ = (char) c;
 815
 816                 } else if (c < 0x800) {
 817                         /* 2 characters */
 818                 unsigned char high = c >> 6;
 819                 unsigned char low  = c & 0x3F;
 820                         left = left - 2;
 821                 if ((int) left < 0) break;
 822                 *pos++ = high | 0xC0;
 823                 *pos++ = low  | 0x80;
 824
 825                 } else {
 826                 /* 3 characters */
 827                 char low  = c & 0x3f;
 828                 char mid  = (c >> 6) & 0x3F;
 829                 char high = c >> 12;
 830                         left = left - 3;
 831                 if ((int) left < 0) break;
 832                 *pos++ = high | 0xE0;
 833                 *pos++ = mid  | 0x80;
 834                 *pos++ = low  | 0x80;
 835                 }
 836         }
 837
 838         /* insert utf-string into symbol-table */
 839         result = utf_new(buffer,buflength);
 840
 841         MFREE(buffer, char, buflength);
 842
 843         return result;
 844 }
 845
 846
 847 /* utf_new_char ****************************************************************
 848
 849    Creates a new utf symbol, the text for this symbol is passed as a
 850    c-string ( = char* ).
 851
 852 *******************************************************************************/
 853
 854 utf *utf_new_char(const char *text)
 855 {
 856         return utf_new(text, strlen(text));
 857 }
 858
 859
 860 /* utf_new_char_classname ******************************************************
 861
 862    Creates a new utf symbol, the text for this symbol is passed as a
 863    c-string ( = char* ) "." characters are going to be replaced by
 864    "/". Since the above function is used often, this is a separte
 865    function, instead of an if.
 866
 867 *******************************************************************************/
 868
 869 utf *utf_new_char_classname(const char *text)
 870 {
 871         if (strchr(text, '.')) {
 872                 char *txt = strdup(text);
 873                 char *end = txt + strlen(txt);
 874                 char *c;
 875                 utf *tmpRes;
 876
 877                 for (c = txt; c < end; c++)
 878                         if (*c == '.') *c = '/';
 879
 880                 tmpRes = utf_new(txt, strlen(txt));
 881                 FREE(txt, 0);
 882
 883                 return tmpRes;
 884
 885         } else
 886                 return utf_new(text, strlen(text));
 887 }
 888
 889
 890 /* utf_nextu2 ******************************************************************
 891
 892    Read the next unicode character from the utf string and increment
 893    the utf-string pointer accordingly.
 894
 895    CAUTION: This function is unsafe for input that was not checked
 896             by is_valid_utf!
 897
 898 *******************************************************************************/
 899
 900 u2 utf_nextu2(char **utf_ptr)
 901 {
 902     /* uncompressed unicode character */
 903     u2 unicode_char = 0;
 904     /* current position in utf text */
 905     unsigned char *utf = (unsigned char *) (*utf_ptr);
 906     /* bytes representing the unicode character */
 907     unsigned char ch1, ch2, ch3;
 908     /* number of bytes used to represent the unicode character */
 909     int len = 0;
 910
 911     switch ((ch1 = utf[0]) >> 4) {
 912         default: /* 1 byte */
 913                 (*utf_ptr)++;
 914                 return (u2) ch1;
 915         case 0xC:
 916         case 0xD: /* 2 bytes */
 917                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 918                         unsigned char high = ch1 & 0x1F;
 919                         unsigned char low  = ch2 & 0x3F;
 920                         unicode_char = (high << 6) + low;
 921                         len = 2;
 922                 }
 923                 break;
 924
 925         case 0xE: /* 2 or 3 bytes */
 926                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 927                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 928                                 unsigned char low  = ch3 & 0x3f;
 929                                 unsigned char mid  = ch2 & 0x3f;
 930                                 unsigned char high = ch1 & 0x0f;
 931                                 unicode_char = (((high << 6) + mid) << 6) + low;
 932                                 len = 3;
 933                         } else
 934                                 len = 2;
 935                 }
 936                 break;
 937     }
 938
 939     /* update position in utf-text */
 940     *utf_ptr = (char *) (utf + len);
 941
 942     return unicode_char;
 943 }
 944
 945
 946 /* utf_bytes *******************************************************************
 947
 948    Determine number of bytes (aka. octets) in the utf string.
 949
 950    IN:
 951       u............utf string
 952
 953    OUT:
 954       The number of octets of this utf string.
 955           There is _no_ terminating zero included in this count.
 956
 957 *******************************************************************************/
 958
 959 u4 utf_bytes(utf *u)
 960 {
 961         return u->blength;
 962 }
 963
 964
 965 /* utf_get_number_of_u2s_for_buffer ********************************************
 966
 967    Determine number of UTF-16 u2s in the given UTF-8 buffer
 968
 969    CAUTION: This function is unsafe for input that was not checked
 970             by is_valid_utf!
 971
 972    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 973    to an array of u2s (UTF-16) and want to know how many of them you will get.
 974    All other uses of this function are probably wrong.
 975
 976    IN:
 977       buffer........points to first char in buffer
 978           blength.......number of _bytes_ in the buffer
 979
 980    OUT:
 981       the number of u2s needed to hold this string in UTF-16 encoding.
 982           There is _no_ terminating zero included in this count.
 983
 984    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 985    exception.
 986
 987 *******************************************************************************/
 988
 989 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 990 {
 991         const char *endpos;                 /* points behind utf string           */
 992         const char *utf_ptr;                /* current position in utf text       */
 993         u4 len = 0;                         /* number of unicode characters       */
 994
 995         utf_ptr = buffer;
 996         endpos = utf_ptr + blength;
 997
 998         while (utf_ptr < endpos) {
 999                 len++;
1000                 /* next unicode character */
1001                 utf_nextu2((char **)&utf_ptr);
1002         }
1003
1004         assert(utf_ptr == endpos);
1005
1006         return len;
1007 }
1008
1009
1010 /* utf_get_number_of_u2s *******************************************************
1011
1012    Determine number of UTF-16 u2s in the utf string.
1013
1014    CAUTION: This function is unsafe for input that was not checked
1015             by is_valid_utf!
1016
1017    CAUTION: Use this function *only* when you want to convert a utf string
1018    to an array of u2s and want to know how many of them you will get.
1019    All other uses of this function are probably wrong.
1020
1021    IN:
1022       u............utf string
1023
1024    OUT:
1025       the number of u2s needed to hold this string in UTF-16 encoding.
1026           There is _no_ terminating zero included in this count.
1027           XXX 0 if a NullPointerException has been thrown (see below)
1028
1029 *******************************************************************************/
1030
1031 u4 utf_get_number_of_u2s(utf *u)
1032 {
1033         char *endpos;                       /* points behind utf string           */
1034         char *utf_ptr;                      /* current position in utf text       */
1035         u4 len = 0;                         /* number of unicode characters       */
1036
1037         /* XXX this is probably not checked by most callers! Review this after */
1038         /* the invalid uses of this function have been eliminated */
1039         if (u == NULL) {
1040                 exceptions_throw_nullpointerexception();
1041                 return 0;
1042         }
1043
1044         endpos = UTF_END(u);
1045         utf_ptr = u->text;
1046
1047         while (utf_ptr < endpos) {
1048                 len++;
1049                 /* next unicode character */
1050                 utf_nextu2(&utf_ptr);
1051         }
1052
1053         if (utf_ptr != endpos) {
1054                 /* string ended abruptly */
1055                 exceptions_throw_internalerror("Illegal utf8 string");
1056                 return 0;
1057         }
1058
1059         return len;
1060 }
1061
1062
1063 /* utf8_safe_number_of_u2s *****************************************************
1064
1065    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1066    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1067
1068    This function is safe even for invalid UTF-8 strings.
1069
1070    IN:
1071       text..........zero-terminated(!) UTF-8 string (may be invalid)
1072                         must NOT be NULL
1073           nbytes........strlen(text). (This is needed to completely emulate
1074                         the RI).
1075
1076    OUT:
1077       the number of u2s needed to hold this string in UTF-16 encoding.
1078           There is _no_ terminating zero included in this count.
1079
1080 *******************************************************************************/
1081
1082 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1083         register const unsigned char *t;
1084         register s4 byte;
1085         register s4 len;
1086         register const unsigned char *tlimit;
1087         s4 byte1;
1088         s4 byte2;
1089         s4 byte3;
1090         s4 value;
1091         s4 skip;
1092
1093         assert(text);
1094         assert(nbytes >= 0);
1095
1096         len = 0;
1097         t = (const unsigned char *) text;
1098         tlimit = t + nbytes;
1099
1100         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1101
1102         while (1) {
1103                 byte = *t++;
1104
1105                 if (byte & 0x80) {
1106                         /* highest bit set, non-ASCII character */
1107
1108                         if ((byte & 0xe0) == 0xc0) {
1109                                 /* 2-byte: should be 110..... 10...... ? */
1110
1111                                 if ((*t++ & 0xc0) == 0x80)
1112                                         ; /* valid 2-byte */
1113                                 else
1114                                         t--; /* invalid */
1115                         }
1116                         else if ((byte & 0xf0) == 0xe0) {
1117                                 /* 3-byte: should be 1110.... 10...... 10...... */
1118                                 /*                            ^t                */
1119
1120                                 if (t + 2 > tlimit)
1121                                         return len + 1; /* invalid, stop here */
1122
1123                                 if ((*t++ & 0xc0) == 0x80) {
1124                                         if ((*t++ & 0xc0) == 0x80)
1125                                                 ; /* valid 3-byte */
1126                                         else
1127                                                 t--; /* invalid */
1128                                 }
1129                                 else
1130                                         t--; /* invalid */
1131                         }
1132                         else if ((byte & 0xf8) == 0xf0) {
1133                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1134                                 /*                            ^t                         */
1135
1136                                 if (t + 3 > tlimit)
1137                                         return len + 1; /* invalid, stop here */
1138
1139                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1140                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1141                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1142                                                         /* valid 4-byte UTF-8? */
1143                                                         value = ((byte  & 0x07) << 18)
1144                                                                   | ((byte1 & 0x3f) << 12)
1145                                                                   | ((byte2 & 0x3f) <<  6)
1146                                                                   | ((byte3 & 0x3f)      );
1147
1148                                                         if (value > 0x10FFFF)
1149                                                                 ; /* invalid */
1150                                                         else if (value > 0xFFFF)
1151                                                                 len += 1; /* we need surrogates */
1152                                                         else
1153                                                                 ; /* 16bit suffice */
1154                                                 }
1155                                                 else
1156                                                         t--; /* invalid */
1157                                         }
1158                                         else
1159                                                 t--; /* invalid */
1160                                 }
1161                                 else
1162                                         t--; /* invalid */
1163                         }
1164                         else if ((byte & 0xfc) == 0xf8) {
1165                                 /* invalid 5-byte */
1166                                 if (t + 4 > tlimit)
1167                                         return len + 1; /* invalid, stop here */
1168
1169                                 skip = 4;
1170                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1171                                         t++;
1172                         }
1173                         else if ((byte & 0xfe) == 0xfc) {
1174                                 /* invalid 6-byte */
1175                                 if (t + 5 > tlimit)
1176                                         return len + 1; /* invalid, stop here */
1177
1178                                 skip = 5;
1179                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1180                                         t++;
1181                         }
1182                         else
1183                                 ; /* invalid */
1184                 }
1185                 else {
1186                         /* NUL */
1187
1188                         if (byte == 0)
1189                                 break;
1190
1191                         /* ASCII character, common case */
1192                 }
1193
1194                 len++;
1195         }
1196
1197         return len;
1198 }
1199
1200
1201 /* utf8_safe_convert_to_u2s ****************************************************
1202
1203    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1204    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1205    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1206
1207    This function is safe even for invalid UTF-8 strings.
1208
1209    IN:
1210       text..........zero-terminated(!) UTF-8 string (may be invalid)
1211                         must NOT be NULL
1212           nbytes........strlen(text). (This is needed to completely emulate
1213                                         the RI).
1214           buffer........a preallocated array of u2s to receive the decoded
1215                         string. Use utf8_safe_number_of_u2s to get the
1216                                         required number of u2s for allocating this.
1217
1218 *******************************************************************************/
1219
1220 #define UNICODE_REPLACEMENT  0xfffd
1221
1222 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1223         register const unsigned char *t;
1224         register s4 byte;
1225         register const unsigned char *tlimit;
1226         s4 byte1;
1227         s4 byte2;
1228         s4 byte3;
1229         s4 value;
1230         s4 skip;
1231
1232         assert(text);
1233         assert(nbytes >= 0);
1234
1235         t = (const unsigned char *) text;
1236         tlimit = t + nbytes;
1237
1238         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1239
1240         while (1) {
1241                 byte = *t++;
1242
1243                 if (byte & 0x80) {
1244                         /* highest bit set, non-ASCII character */
1245
1246                         if ((byte & 0xe0) == 0xc0) {
1247                                 /* 2-byte: should be 110..... 10...... */
1248
1249                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1250                                         /* valid 2-byte UTF-8 */
1251                                         *buffer++ = ((byte  & 0x1f) << 6)
1252                                                           | ((byte1 & 0x3f)     );
1253                                 }
1254                                 else {
1255                                         *buffer++ = UNICODE_REPLACEMENT;
1256                                         t--;
1257                                 }
1258                         }
1259                         else if ((byte & 0xf0) == 0xe0) {
1260                                 /* 3-byte: should be 1110.... 10...... 10...... */
1261
1262                                 if (t + 2 > tlimit) {
1263                                         *buffer++ = UNICODE_REPLACEMENT;
1264                                         return;
1265                                 }
1266
1267                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1268                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1269                                                 /* valid 3-byte UTF-8 */
1270                                                 *buffer++ = ((byte  & 0x0f) << 12)
1271                                                                   | ((byte1 & 0x3f) <<  6)
1272                                                                   | ((byte2 & 0x3f)      );
1273                                         }
1274                                         else {
1275                                                 *buffer++ = UNICODE_REPLACEMENT;
1276                                                 t--;
1277                                         }
1278                                 }
1279                                 else {
1280                                         *buffer++ = UNICODE_REPLACEMENT;
1281                                         t--;
1282                                 }
1283                         }
1284                         else if ((byte & 0xf8) == 0xf0) {
1285                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1286
1287                                 if (t + 3 > tlimit) {
1288                                         *buffer++ = UNICODE_REPLACEMENT;
1289                                         return;
1290                                 }
1291
1292                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1293                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1294                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1295                                                         /* valid 4-byte UTF-8? */
1296                                                         value = ((byte  & 0x07) << 18)
1297                                                                   | ((byte1 & 0x3f) << 12)
1298                                                                   | ((byte2 & 0x3f) <<  6)
1299                                                                   | ((byte3 & 0x3f)      );
1300
1301                                                         if (value > 0x10FFFF) {
1302                                                                 *buffer++ = UNICODE_REPLACEMENT;
1303                                                         }
1304                                                         else if (value > 0xFFFF) {
1305                                                                 /* we need surrogates */
1306                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1307                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1308                                                         }
1309                                                         else
1310                                                                 *buffer++ = value; /* 16bit suffice */
1311                                                 }
1312                                                 else {
1313                                                         *buffer++ = UNICODE_REPLACEMENT;
1314                                                         t--;
1315                                                 }
1316                                         }
1317                                         else {
1318                                                 *buffer++ = UNICODE_REPLACEMENT;
1319                                                 t--;
1320                                         }
1321                                 }
1322                                 else {
1323                                         *buffer++ = UNICODE_REPLACEMENT;
1324                                         t--;
1325                                 }
1326                         }
1327                         else if ((byte & 0xfc) == 0xf8) {
1328                                 if (t + 4 > tlimit) {
1329                                         *buffer++ = UNICODE_REPLACEMENT;
1330                                         return;
1331                                 }
1332
1333                                 skip = 4;
1334                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1335                                         t++;
1336                                 *buffer++ = UNICODE_REPLACEMENT;
1337                         }
1338                         else if ((byte & 0xfe) == 0xfc) {
1339                                 if (t + 5 > tlimit) {
1340                                         *buffer++ = UNICODE_REPLACEMENT;
1341                                         return;
1342                                 }
1343
1344                                 skip = 5;
1345                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1346                                         t++;
1347                                 *buffer++ = UNICODE_REPLACEMENT;
1348                         }
1349                         else
1350                                 *buffer++ = UNICODE_REPLACEMENT;
1351                 }
1352                 else {
1353                         /* NUL */
1354
1355                         if (byte == 0)
1356                                 break;
1357
1358                         /* ASCII character, common case */
1359
1360                         *buffer++ = byte;
1361                 }
1362         }
1363 }
1364
1365
1366 /* u2_utflength ****************************************************************
1367
1368    Returns the utf length in bytes of a u2 array.
1369
1370 *******************************************************************************/
1371
1372 u4 u2_utflength(u2 *text, u4 u2_length)
1373 {
1374         u4 result_len = 0;                  /* utf length in bytes                */
1375         u2 ch;                              /* current unicode character          */
1376         u4 len;
1377
1378         for (len = 0; len < u2_length; len++) {
1379                 /* next unicode character */
1380                 ch = *text++;
1381
1382                 /* determine bytes required to store unicode character as utf */
1383                 if (ch && (ch < 0x80))
1384                         result_len++;
1385                 else if (ch < 0x800)
1386                         result_len += 2;
1387                 else
1388                         result_len += 3;
1389         }
1390
1391     return result_len;
1392 }
1393
1394
1395 /* utf_copy ********************************************************************
1396
1397    Copy the given utf string byte-for-byte to a buffer.
1398
1399    IN:
1400       buffer.......the buffer
1401           u............the utf string
1402
1403 *******************************************************************************/
1404
1405 void utf_copy(char *buffer, utf *u)
1406 {
1407         /* our utf strings are zero-terminated (done by utf_new) */
1408         MCOPY(buffer, u->text, char, u->blength + 1);
1409 }
1410
1411
1412 /* utf_cat *********************************************************************
1413
1414    Append the given utf string byte-for-byte to a buffer.
1415
1416    IN:
1417       buffer.......the buffer
1418           u............the utf string
1419
1420 *******************************************************************************/
1421
1422 void utf_cat(char *buffer, utf *u)
1423 {
1424         /* our utf strings are zero-terminated (done by utf_new) */
1425         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1426 }
1427
1428
1429 /* utf_copy_classname **********************************************************
1430
1431    Copy the given utf classname byte-for-byte to a buffer.
1432    '/' is replaced by '.'
1433
1434    IN:
1435       buffer.......the buffer
1436           u............the utf string
1437
1438 *******************************************************************************/
1439
1440 void utf_copy_classname(char *buffer, utf *u)
1441 {
1442         char *bufptr;
1443         char *srcptr;
1444         char *endptr;
1445         char ch;
1446
1447         bufptr = buffer;
1448         srcptr = u->text;
1449         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1450
1451         while (srcptr != endptr) {
1452                 ch = *srcptr++;
1453                 if (ch == '/')
1454                         ch = '.';
1455                 *bufptr++ = ch;
1456         }
1457 }
1458
1459
1460 /* utf_cat *********************************************************************
1461
1462    Append the given utf classname byte-for-byte to a buffer.
1463    '/' is replaced by '.'
1464
1465    IN:
1466       buffer.......the buffer
1467           u............the utf string
1468
1469 *******************************************************************************/
1470
1471 void utf_cat_classname(char *buffer, utf *u)
1472 {
1473         utf_copy_classname(buffer + strlen(buffer), u);
1474 }
1475
1476 /* utf_display_printable_ascii *************************************************
1477
1478    Write utf symbol to stdout (for debugging purposes).
1479    Non-printable and non-ASCII characters are printed as '?'.
1480
1481 *******************************************************************************/
1482
1483 void utf_display_printable_ascii(utf *u)
1484 {
1485         char *endpos;                       /* points behind utf string           */
1486         char *utf_ptr;                      /* current position in utf text       */
1487
1488         if (u == NULL) {
1489                 printf("NULL");
1490                 fflush(stdout);
1491                 return;
1492         }
1493
1494         endpos = UTF_END(u);
1495         utf_ptr = u->text;
1496
1497         while (utf_ptr < endpos) {
1498                 /* read next unicode character */
1499
1500                 u2 c = utf_nextu2(&utf_ptr);
1501
1502                 if ((c >= 32) && (c <= 127))
1503                         printf("%c", c);
1504                 else
1505                         printf("?");
1506         }
1507
1508         fflush(stdout);
1509 }
1510
1511
1512 /* utf_display_printable_ascii_classname ***************************************
1513
1514    Write utf symbol to stdout with `/' converted to `.' (for debugging
1515    purposes).
1516    Non-printable and non-ASCII characters are printed as '?'.
1517
1518 *******************************************************************************/
1519
1520 void utf_display_printable_ascii_classname(utf *u)
1521 {
1522         char *endpos;                       /* points behind utf string           */
1523         char *utf_ptr;                      /* current position in utf text       */
1524
1525         if (u == NULL) {
1526                 printf("NULL");
1527                 fflush(stdout);
1528                 return;
1529         }
1530
1531         endpos = UTF_END(u);
1532         utf_ptr = u->text;
1533
1534         while (utf_ptr < endpos) {
1535                 /* read next unicode character */
1536
1537                 u2 c = utf_nextu2(&utf_ptr);
1538
1539                 if (c == '/')
1540                         c = '.';
1541
1542                 if ((c >= 32) && (c <= 127))
1543                         printf("%c", c);
1544                 else
1545                         printf("?");
1546         }
1547
1548         fflush(stdout);
1549 }
1550
1551
1552 /* utf_sprint_convert_to_latin1 ************************************************
1553
1554    Write utf symbol into c-string (for debugging purposes).
1555    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1556    invalid results.
1557
1558 *******************************************************************************/
1559
1560 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1561 {
1562         char *endpos;                       /* points behind utf string           */
1563         char *utf_ptr;                      /* current position in utf text       */
1564         u2 pos = 0;                         /* position in c-string               */
1565
1566         if (!u) {
1567                 strcpy(buffer, "NULL");
1568                 return;
1569         }
1570
1571         endpos = UTF_END(u);
1572         utf_ptr = u->text;
1573
1574         while (utf_ptr < endpos)
1575                 /* copy next unicode character */
1576                 buffer[pos++] = utf_nextu2(&utf_ptr);
1577
1578         /* terminate string */
1579         buffer[pos] = '\0';
1580 }
1581
1582
1583 /* utf_sprint_convert_to_latin1_classname **************************************
1584
1585    Write utf symbol into c-string with `/' converted to `.' (for debugging
1586    purposes).
1587    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1588    invalid results.
1589
1590 *******************************************************************************/
1591
1592 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1593 {
1594         char *endpos;                       /* points behind utf string           */
1595         char *utf_ptr;                      /* current position in utf text       */
1596         u2 pos = 0;                         /* position in c-string               */
1597
1598         if (!u) {
1599                 strcpy(buffer, "NULL");
1600                 return;
1601         }
1602
1603         endpos = UTF_END(u);
1604         utf_ptr = u->text;
1605
1606         while (utf_ptr < endpos) {
1607                 /* copy next unicode character */
1608                 u2 c = utf_nextu2(&utf_ptr);
1609                 if (c == '/') c = '.';
1610                 buffer[pos++] = c;
1611         }
1612
1613         /* terminate string */
1614         buffer[pos] = '\0';
1615 }
1616
1617
1618 /* utf_strcat_convert_to_latin1 ************************************************
1619
1620    Like libc strcat, but uses an utf8 string.
1621    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1622    invalid results.
1623
1624 *******************************************************************************/
1625
1626 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1627 {
1628         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1629 }
1630
1631
1632 /* utf_strcat_convert_to_latin1_classname **************************************
1633
1634    Like libc strcat, but uses an utf8 string.
1635    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1636    invalid results.
1637
1638 *******************************************************************************/
1639
1640 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1641 {
1642         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1643 }
1644
1645
1646 /* utf_fprint_printable_ascii **************************************************
1647
1648    Write utf symbol into file.
1649    Non-printable and non-ASCII characters are printed as '?'.
1650
1651 *******************************************************************************/
1652
1653 void utf_fprint_printable_ascii(FILE *file, utf *u)
1654 {
1655         char *endpos;                       /* points behind utf string           */
1656         char *utf_ptr;                      /* current position in utf text       */
1657
1658         if (!u)
1659                 return;
1660
1661         endpos = UTF_END(u);
1662         utf_ptr = u->text;
1663
1664         while (utf_ptr < endpos) {
1665                 /* read next unicode character */
1666                 u2 c = utf_nextu2(&utf_ptr);
1667
1668                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1669                 else fprintf(file, "?");
1670         }
1671 }
1672
1673
1674 /* utf_fprint_printable_ascii_classname ****************************************
1675
1676    Write utf symbol into file with `/' converted to `.'.
1677    Non-printable and non-ASCII characters are printed as '?'.
1678
1679 *******************************************************************************/
1680
1681 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1682 {
1683         char *endpos;                       /* points behind utf string           */
1684         char *utf_ptr;                      /* current position in utf text       */
1685
1686     if (!u)
1687                 return;
1688
1689         endpos = UTF_END(u);
1690         utf_ptr = u->text;
1691
1692         while (utf_ptr < endpos) {
1693                 /* read next unicode character */
1694                 u2 c = utf_nextu2(&utf_ptr);
1695                 if (c == '/') c = '.';
1696
1697                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1698                 else fprintf(file, "?");
1699         }
1700 }
1701
1702
/* is_valid_utf ****************************************************************

   Check whether a byte range holds a well-formed UTF-8 string in the
   restricted form accepted here.

   Accepted are:
     - single bytes 0x01..0x7f,
     - two-byte sequences   110x xxxx  10xx xxxx,
     - three-byte sequences 1110 xxxx  10xx xxxx  10xx xxxx.

   Rejected are:
     - a raw 0x00 byte (zero has to be written as the two-byte sequence
       0xc0 0x80, which is the only accepted encoding of the value 0),
     - sequences of four or more bytes (Java limitation),
     - stray continuation bytes and the lead bytes 0xfe/0xff,
     - sequences that are cut off by the end of the range.

   Not rejected (these seem to be allowed in Java classfiles):
     - overlong encodings of non-zero values,
     - surrogates (0xd800..0xdfff),
     - the codepoints 0xfffe and 0xffff.

   utf_ptr...points to first character
   end_pos...points after last character

   Returns false as well if end_pos lies before utf_ptr.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur;           /* next byte to examine               */
	const unsigned char *end;           /* first byte behind the string       */

	if (end_pos < utf_ptr)
		return false;

	cur = (const unsigned char *) utf_ptr;
	end = (const unsigned char *) end_pos;

	while (cur < end) {
		unsigned char lead = *cur++;
		unsigned long value;            /* decoded codepoint                  */
		int           trail;            /* number of continuation bytes       */
		int           k;

		if (lead == 0x00)               /* 0x00 is not allowed                */
			return false;

		if (lead < 0x80)                /* ASCII                              */
			continue;

		if ((lead & 0xe0) == 0xc0) {        /* 110x xxxx */
			trail = 1;
			value = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0) {   /* 1110 xxxx */
			trail = 2;
			value = lead & 0x0f;
		}
		else {
			/* longer sequences (Java limitation) or invalid lead byte */
			return false;
		}

		if ((end - cur) < trail)        /* missing bytes                      */
			return false;

		for (k = 0; k < trail; k++) {
			unsigned char next = *cur++;

			if ((next & 0xc0) != 0x80)  /* 10xx xxxx                          */
				return false;

			value = (value << 6) | (next & 0x3f);
		}

		/* Java special: zero may only appear as the two-byte sequence */

		if ((value == 0) && (trail != 1))
			return false;
	}

	return true;
}
1768
1769
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the encoded zero character 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters */

		/* Disallow zero (0xc0 0x80). The second byte is only inspected
		   if it still belongs to the string, so that a 0xc0 in the last
		   position never causes a read behind end_pos. */

		if ((c == 0xc0) && (utf_ptr < end_pos)
			&& ((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1797
1798 bool is_valid_name_utf(utf *u)
1799 {
1800         return is_valid_name(u->text, UTF_END(u));
1801 }
1802
1803
1804 /* utf_show ********************************************************************
1805
1806    Writes the utf symbols in the utfhash to stdout and displays the
1807    number of external hash chains grouped according to the chainlength
1808    (for debugging purposes).
1809
1810 *******************************************************************************/
1811
1812 #if !defined(NDEBUG)
1813 void utf_show(void)
1814 {
1815
1816 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1817
1818         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1819         u4 max_chainlength = 0;      /* maximum length of the chains */
1820         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1821         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1822         u4 i;
1823
1824         printf("UTF-HASH:\n");
1825
1826         /* show element of utf-hashtable */
1827
1828         for (i = 0; i < hashtable_utf->size; i++) {
1829                 utf *u = hashtable_utf->ptr[i];
1830
1831                 if (u) {
1832                         printf("SLOT %d: ", (int) i);
1833
1834                         while (u) {
1835                                 printf("'");
1836                                 utf_display_printable_ascii(u);
1837                                 printf("' ");
1838                                 u = u->hashlink;
1839                         }
1840                         printf("\n");
1841                 }
1842         }
1843
1844         printf("UTF-HASH: %d slots for %d entries\n",
1845                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1846
1847         if (hashtable_utf->entries == 0)
1848                 return;
1849
1850         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1851
1852         for (i=0;i<CHAIN_LIMIT;i++)
1853                 chain_count[i]=0;
1854
1855         /* count numbers of hashchains according to their length */
1856         for (i=0; i<hashtable_utf->size; i++) {
1857
1858                 utf *u = (utf*) hashtable_utf->ptr[i];
1859                 u4 chain_length = 0;
1860
1861                 /* determine chainlength */
1862                 while (u) {
1863                         u = u->hashlink;
1864                         chain_length++;
1865                 }
1866
1867                 /* update sum of all chainlengths */
1868                 sum_chainlength+=chain_length;
1869
1870                 /* determine the maximum length of the chains */
1871                 if (chain_length>max_chainlength)
1872                         max_chainlength = chain_length;
1873
1874                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1875                 if (chain_length>=CHAIN_LIMIT) {
1876                         beyond_limit+=chain_length;
1877                         chain_length=CHAIN_LIMIT-1;
1878                 }
1879
1880                 /* update number of hashchains of current length */
1881                 chain_count[chain_length]++;
1882         }
1883
1884         /* display results */
1885         for (i=1;i<CHAIN_LIMIT-1;i++)
1886                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1887
1888         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1889
1890
1891         printf("max. chainlength:%5d\n",max_chainlength);
1892
1893         /* avg. chainlength = sum of chainlengths / number of chains */
1894         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1895 }
1896 #endif /* !defined(NDEBUG) */
1897
1898
1899 /*
1900  * These are local overrides for various environment variables in Emacs.
1901  * Please do not remove this and leave it at the end of the file, where
1902  * Emacs will automagically detect them.
1903  * ---------------------------------------------------------------------
1904  * Local variables:
1905  * mode: c
1906  * indent-tabs-mode: t
1907  * c-basic-offset: 4
1908  * tab-width: 4
1909  * End:
1910  * vim:noexpandtab:sw=4:ts=4:
1911  */