src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007, 2008
   4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
   5
   6    This file is part of CACAO.
   7
   8    This program is free software; you can redistribute it and/or
   9    modify it under the terms of the GNU General Public License as
  10    published by the Free Software Foundation; either version 2, or (at
  11    your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful, but
  14    WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16    General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; if not, write to the Free Software
  20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  21    02110-1301, USA.
  22
  23 */
  24
  25
  26 #include "config.h"
  27
  28 #include <string.h>
  29 #include <assert.h>
  30
  31 #include "vm/types.h"
  32
  33 #include "mm/memory.hpp"
  34
  35 #include "threads/mutex.hpp"
  36
  37 #include "toolbox/hashtable.h"
  38
  39 #include "vm/exceptions.hpp"
  40 #include "vm/options.h"
  41
  42 #if defined(ENABLE_STATISTICS)
  43 # include "vm/statistics.h"
  44 #endif
  45
  46 #include "vm/utf8.h"
  47
  48
  49 /* global variables ***********************************************************/
  50
  51 /* hashsize must be power of 2 */
  52
  53 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  54
  55 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  56
  57
  58 /* utf-symbols for pointer comparison of frequently used strings **************/
  59
  60 utf *utf_java_lang_Object;
  61
  62 utf *utf_java_lang_Class;
  63 utf *utf_java_lang_ClassLoader;
  64 utf *utf_java_lang_ClassLoader_NativeLibrary;
  65 utf *utf_java_lang_Cloneable;
  66 utf *utf_java_lang_SecurityManager;
  67 utf *utf_java_lang_String;
  68 utf *utf_java_lang_ThreadGroup;
  69 utf *utf_java_lang_ref_SoftReference;
  70 utf *utf_java_lang_ref_WeakReference;
  71 utf *utf_java_lang_ref_PhantomReference;
  72 utf *utf_java_io_Serializable;
  73
  74 utf *utf_java_lang_Throwable;
  75 utf *utf_java_lang_Error;
  76
  77 utf *utf_java_lang_AbstractMethodError;
  78 utf *utf_java_lang_ClassCircularityError;
  79 utf *utf_java_lang_ClassFormatError;
  80 utf *utf_java_lang_ExceptionInInitializerError;
  81 utf *utf_java_lang_IncompatibleClassChangeError;
  82 utf *utf_java_lang_InstantiationError;
  83 utf *utf_java_lang_InternalError;
  84 utf *utf_java_lang_LinkageError;
  85 utf *utf_java_lang_NoClassDefFoundError;
  86 utf *utf_java_lang_NoSuchFieldError;
  87 utf *utf_java_lang_NoSuchMethodError;
  88 utf *utf_java_lang_OutOfMemoryError;
  89 utf *utf_java_lang_UnsatisfiedLinkError;
  90 utf *utf_java_lang_UnsupportedClassVersionError;
  91 utf *utf_java_lang_VerifyError;
  92 utf *utf_java_lang_VirtualMachineError;
  93
  94 utf *utf_java_lang_Exception;
  95
  96 utf *utf_java_lang_ArithmeticException;
  97 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
  98 utf *utf_java_lang_ArrayStoreException;
  99 utf *utf_java_lang_ClassCastException;
 100 utf *utf_java_lang_ClassNotFoundException;
 101 utf *utf_java_lang_CloneNotSupportedException;
 102 utf *utf_java_lang_IllegalAccessException;
 103 utf *utf_java_lang_IllegalArgumentException;
 104 utf *utf_java_lang_IllegalMonitorStateException;
 105 utf *utf_java_lang_InstantiationException;
 106 utf *utf_java_lang_InterruptedException;
 107 utf *utf_java_lang_NegativeArraySizeException;
 108 utf *utf_java_lang_NullPointerException;
 109 utf *utf_java_lang_RuntimeException;
 110 utf *utf_java_lang_StringIndexOutOfBoundsException;
 111
 112 utf *utf_java_lang_reflect_InvocationTargetException;
 113
 114 utf *utf_java_security_PrivilegedActionException;
 115
 116 #if defined(ENABLE_JAVASE)
 117 utf* utf_java_lang_Void;
 118 #endif
 119
 120 utf* utf_java_lang_Boolean;
 121 utf* utf_java_lang_Byte;
 122 utf* utf_java_lang_Character;
 123 utf* utf_java_lang_Short;
 124 utf* utf_java_lang_Integer;
 125 utf* utf_java_lang_Long;
 126 utf* utf_java_lang_Float;
 127 utf* utf_java_lang_Double;
 128
 129 #if defined(ENABLE_JAVASE)
 130 utf *utf_java_lang_StackTraceElement;
 131 utf *utf_java_lang_reflect_Constructor;
 132 utf *utf_java_lang_reflect_Field;
 133 utf *utf_java_lang_reflect_Method;
 134
 135 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
 136 utf *utf_java_lang_reflect_VMConstructor;
 137 utf *utf_java_lang_reflect_VMField;
 138 utf *utf_java_lang_reflect_VMMethod;
 139 # endif
 140
 141 utf *utf_java_util_Vector;
 142 #endif
 143
 144 utf *utf_InnerClasses;                  /* InnerClasses                       */
 145 utf *utf_ConstantValue;                 /* ConstantValue                      */
 146 utf *utf_Code;                          /* Code                               */
 147 utf *utf_Exceptions;                    /* Exceptions                         */
 148 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 149 utf *utf_SourceFile;                    /* SourceFile                         */
 150
 151 #if defined(ENABLE_JAVASE)
 152 utf *utf_EnclosingMethod;
 153 utf *utf_Signature;
 154 utf *utf_StackMapTable;
 155
 156 # if defined(ENABLE_JVMTI)
 157 utf *utf_LocalVariableTable;
 158 # endif
 159
 160 # if defined(ENABLE_ANNOTATIONS)
 161 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
 162 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
 163 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
 164 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
 165 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
 166 # endif
 167 #endif
 168
 169 utf *utf_init;                          /* <init>                             */
 170 utf *utf_clinit;                        /* <clinit>                           */
 171 utf *utf_clone;                         /* clone                              */
 172 utf *utf_finalize;                      /* finalize                           */
 173 utf *utf_invoke;
 174 utf *utf_main;
 175 utf *utf_run;                           /* run                                */
 176
 177 utf *utf_add;
 178 utf *utf_dispatch;
 179 utf *utf_remove;
 180 utf *utf_addThread;
 181 utf *utf_removeThread;
 182 utf *utf_put;
 183 utf *utf_get;
 184 utf *utf_uncaughtException;
 185 utf *utf_value;
 186
 187 utf *utf_fillInStackTrace;
 188 utf *utf_findNative;
 189 utf *utf_getSystemClassLoader;
 190 utf *utf_initCause;
 191 utf *utf_loadClass;
 192 utf *utf_loadClassInternal;
 193 utf *utf_printStackTrace;
 194
 195 utf *utf_division_by_zero;
 196
 197 utf *utf_Z;                             /* Z                                  */
 198 utf *utf_B;                             /* B                                  */
 199 utf *utf_C;                             /* C                                  */
 200 utf *utf_S;                             /* S                                  */
 201 utf *utf_I;                             /* I                                  */
 202 utf *utf_J;                             /* J                                  */
 203 utf *utf_F;                             /* F                                  */
 204 utf *utf_D;                             /* D                                  */
 205
 206 utf *utf_void__void;                    /* ()V                                */
 207 utf *utf_boolean__void;                 /* (Z)V                               */
 208 utf *utf_byte__void;                    /* (B)V                               */
 209 utf *utf_char__void;                    /* (C)V                               */
 210 utf *utf_short__void;                   /* (S)V                               */
 211 utf *utf_int__void;                     /* (I)V                               */
 212 utf *utf_long__void;                    /* (J)V                               */
 213 utf *utf_float__void;                   /* (F)V                               */
 214 utf *utf_double__void;                  /* (D)V                               */
 215
 216 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 217 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 218 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 219 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 220 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 221 utf *utf_java_lang_Object__java_lang_Object;
 222 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 223 utf *utf_java_lang_String__java_lang_Class;
 224 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 225 utf *utf_java_lang_Thread_java_lang_Throwable__V;
 226 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
 227 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 228 utf *utf_java_lang_Throwable__java_lang_Throwable;
 229
 230 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 231 utf *utf_null;
 232 utf *array_packagename;
 233
 234
 235 /* utf_init ********************************************************************
 236
 237    Initializes the utf8 subsystem.
 238
 239 *******************************************************************************/
 240
 241 void utf8_init(void)
 242 {
 243         TRACESUBSYSTEMINITIALIZATION("utf8_init");
 244
 245         /* create utf8 hashtable */
 246
 247         hashtable_utf = NEW(hashtable);
 248
 249         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 250
 251 #if defined(ENABLE_STATISTICS)
 252         if (opt_stat)
 253                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 254 #endif
 255
 256         /* create utf-symbols for pointer comparison of frequently used strings */
 257
 258         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 259
 260         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 261         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 262         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 263         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 264         utf_java_lang_String           = utf_new_char("java/lang/String");
 265         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 266
 267         utf_java_lang_ClassLoader_NativeLibrary =
 268                 utf_new_char("java/lang/ClassLoader$NativeLibrary");
 269
 270         utf_java_lang_ref_SoftReference =
 271                 utf_new_char("java/lang/ref/SoftReference");
 272
 273         utf_java_lang_ref_WeakReference =
 274                 utf_new_char("java/lang/ref/WeakReference");
 275
 276         utf_java_lang_ref_PhantomReference =
 277                 utf_new_char("java/lang/ref/PhantomReference");
 278
 279         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 280
 281         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 282         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 283
 284         utf_java_lang_ClassCircularityError =
 285                 utf_new_char("java/lang/ClassCircularityError");
 286
 287         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 288
 289         utf_java_lang_ExceptionInInitializerError =
 290                 utf_new_char("java/lang/ExceptionInInitializerError");
 291
 292         utf_java_lang_IncompatibleClassChangeError =
 293                 utf_new_char("java/lang/IncompatibleClassChangeError");
 294
 295         utf_java_lang_InstantiationError =
 296                 utf_new_char("java/lang/InstantiationError");
 297
 298         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 299         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 300
 301         utf_java_lang_NoClassDefFoundError =
 302                 utf_new_char("java/lang/NoClassDefFoundError");
 303
 304         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 305
 306         utf_java_lang_UnsatisfiedLinkError =
 307                 utf_new_char("java/lang/UnsatisfiedLinkError");
 308
 309         utf_java_lang_UnsupportedClassVersionError =
 310                 utf_new_char("java/lang/UnsupportedClassVersionError");
 311
 312         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 313
 314         utf_java_lang_VirtualMachineError =
 315                 utf_new_char("java/lang/VirtualMachineError");
 316
 317 #if defined(ENABLE_JAVASE)
 318         utf_java_lang_AbstractMethodError =
 319                 utf_new_char("java/lang/AbstractMethodError");
 320
 321         utf_java_lang_NoSuchFieldError =
 322                 utf_new_char("java/lang/NoSuchFieldError");
 323
 324         utf_java_lang_NoSuchMethodError =
 325                 utf_new_char("java/lang/NoSuchMethodError");
 326 #endif
 327
 328         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 329
 330         utf_java_lang_ArithmeticException =
 331                 utf_new_char("java/lang/ArithmeticException");
 332
 333         utf_java_lang_ArrayIndexOutOfBoundsException =
 334                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 335
 336         utf_java_lang_ArrayStoreException =
 337                 utf_new_char("java/lang/ArrayStoreException");
 338
 339         utf_java_lang_ClassCastException =
 340                 utf_new_char("java/lang/ClassCastException");
 341
 342         utf_java_lang_ClassNotFoundException =
 343                 utf_new_char("java/lang/ClassNotFoundException");
 344
 345         utf_java_lang_CloneNotSupportedException =
 346                 utf_new_char("java/lang/CloneNotSupportedException");
 347
 348         utf_java_lang_IllegalAccessException =
 349                 utf_new_char("java/lang/IllegalAccessException");
 350
 351         utf_java_lang_IllegalArgumentException =
 352                 utf_new_char("java/lang/IllegalArgumentException");
 353
 354         utf_java_lang_IllegalMonitorStateException =
 355                 utf_new_char("java/lang/IllegalMonitorStateException");
 356
 357         utf_java_lang_InstantiationException =
 358                 utf_new_char("java/lang/InstantiationException");
 359
 360         utf_java_lang_InterruptedException =
 361                 utf_new_char("java/lang/InterruptedException");
 362
 363         utf_java_lang_NegativeArraySizeException =
 364                 utf_new_char("java/lang/NegativeArraySizeException");
 365
 366         utf_java_lang_NullPointerException =
 367                 utf_new_char("java/lang/NullPointerException");
 368
 369         utf_java_lang_RuntimeException =
 370                 utf_new_char("java/lang/RuntimeException");
 371
 372         utf_java_lang_StringIndexOutOfBoundsException =
 373                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 374
 375         utf_java_lang_reflect_InvocationTargetException =
 376                 utf_new_char("java/lang/reflect/InvocationTargetException");
 377
 378         utf_java_security_PrivilegedActionException =
 379                 utf_new_char("java/security/PrivilegedActionException");
 380
 381 #if defined(ENABLE_JAVASE)
 382         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 383 #endif
 384
 385         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 386         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 387         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 388         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 389         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 390         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 391         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 392         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 393
 394 #if defined(ENABLE_JAVASE)
 395         utf_java_lang_StackTraceElement =
 396                 utf_new_char("java/lang/StackTraceElement");
 397
 398         utf_java_lang_reflect_Constructor =
 399                 utf_new_char("java/lang/reflect/Constructor");
 400
 401         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 402         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 403
 404 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
 405         utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
 406         utf_java_lang_reflect_VMField       = utf_new_char("java/lang/reflect/VMField");
 407         utf_java_lang_reflect_VMMethod      = utf_new_char("java/lang/reflect/VMMethod");
 408 # endif
 409
 410         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 411 #endif
 412
 413         utf_InnerClasses               = utf_new_char("InnerClasses");
 414         utf_ConstantValue              = utf_new_char("ConstantValue");
 415         utf_Code                       = utf_new_char("Code");
 416         utf_Exceptions                 = utf_new_char("Exceptions");
 417         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 418         utf_SourceFile                 = utf_new_char("SourceFile");
 419
 420 #if defined(ENABLE_JAVASE)
 421         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 422         utf_Signature                  = utf_new_char("Signature");
 423         utf_StackMapTable              = utf_new_char("StackMapTable");
 424
 425 # if defined(ENABLE_JVMTI)
 426         utf_LocalVariableTable         = utf_new_char("LocalVariableTable");
 427 # endif
 428
 429 # if defined(ENABLE_ANNOTATIONS)
 430         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 431         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 432         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 433         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 434         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 435 # endif
 436 #endif
 437
 438         utf_init                           = utf_new_char("<init>");
 439         utf_clinit                         = utf_new_char("<clinit>");
 440         utf_clone                      = utf_new_char("clone");
 441         utf_finalize                   = utf_new_char("finalize");
 442         utf_invoke                     = utf_new_char("invoke");
 443         utf_main                       = utf_new_char("main");
 444         utf_run                        = utf_new_char("run");
 445
 446         utf_add                        = utf_new_char("add");
 447         utf_dispatch                   = utf_new_char("dispatch");
 448         utf_remove                     = utf_new_char("remove");
 449         utf_addThread                  = utf_new_char("addThread");
 450         utf_removeThread               = utf_new_char("removeThread");
 451         utf_put                        = utf_new_char("put");
 452         utf_get                        = utf_new_char("get");
 453         utf_uncaughtException          = utf_new_char("uncaughtException");
 454         utf_value                      = utf_new_char("value");
 455
 456         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 457         utf_findNative                 = utf_new_char("findNative");
 458         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 459         utf_initCause                  = utf_new_char("initCause");
 460         utf_loadClass                  = utf_new_char("loadClass");
 461         utf_loadClassInternal          = utf_new_char("loadClassInternal");
 462         utf_printStackTrace            = utf_new_char("printStackTrace");
 463
 464         utf_division_by_zero           = utf_new_char("/ by zero");
 465
 466         utf_Z                          = utf_new_char("Z");
 467         utf_B                          = utf_new_char("B");
 468         utf_C                          = utf_new_char("C");
 469         utf_S                          = utf_new_char("S");
 470         utf_I                          = utf_new_char("I");
 471         utf_J                          = utf_new_char("J");
 472         utf_F                          = utf_new_char("F");
 473         utf_D                          = utf_new_char("D");
 474
 475         utf_void__void                 = utf_new_char("()V");
 476         utf_boolean__void              = utf_new_char("(Z)V");
 477         utf_byte__void                 = utf_new_char("(B)V");
 478         utf_char__void                 = utf_new_char("(C)V");
 479         utf_short__void                = utf_new_char("(S)V");
 480         utf_int__void                  = utf_new_char("(I)V");
 481         utf_long__void                 = utf_new_char("(J)V");
 482         utf_float__void                = utf_new_char("(F)V");
 483         utf_double__void               = utf_new_char("(D)V");
 484         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 485         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 486
 487         utf_void__java_lang_ClassLoader =
 488                 utf_new_char("()Ljava/lang/ClassLoader;");
 489
 490         utf_java_lang_ClassLoader_java_lang_String__J =
 491                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 492
 493         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 494
 495         utf_java_lang_Object__java_lang_Object =
 496                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 497
 498         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 499
 500         utf_java_lang_String__java_lang_Class =
 501                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 502
 503         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 504
 505         utf_java_lang_Thread_java_lang_Throwable__V =
 506                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 507
 508         utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
 509                 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
 510
 511         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 512
 513         utf_java_lang_Throwable__java_lang_Throwable =
 514                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 515
 516         utf_null                       = utf_new_char("null");
 517         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 518         array_packagename              = utf_new_char("\t<the array package>");
 519 }
 520
 521
 522 /* utf_hashkey *****************************************************************
 523
 524    The hashkey is computed from the utf-text by using up to 8
 525    characters.  For utf-symbols longer than 15 characters 3 characters
 526    are taken from the beginning and the end, 2 characters are taken
 527    from the middle.
 528
 529 *******************************************************************************/
 530
 531 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 532 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 533
 534 u4 utf_hashkey(const char *text, u4 length)
 535 {
 536         const char *start_pos = text;       /* pointer to utf text                */
 537         u4 a;
 538
 539         switch (length) {
 540         case 0: /* empty string */
 541                 return 0;
 542
 543         case 1: return fbs(0);
 544         case 2: return fbs(0) ^ nbs(3);
 545         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 546         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 547         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 548         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 549         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 550         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 551
 552         case 9:
 553                 a = fbs(0);
 554                 a ^= nbs(1);
 555                 a ^= nbs(2);
 556                 text++;
 557                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 558
 559         case 10:
 560                 a = fbs(0);
 561                 text++;
 562                 a ^= nbs(2);
 563                 a ^= nbs(3);
 564                 a ^= nbs(4);
 565                 text++;
 566                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 567
 568         case 11:
 569                 a = fbs(0);
 570                 text++;
 571                 a ^= nbs(2);
 572                 a ^= nbs(3);
 573                 a ^= nbs(4);
 574                 text++;
 575                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 576
 577         case 12:
 578                 a = fbs(0);
 579                 text += 2;
 580                 a ^= nbs(2);
 581                 a ^= nbs(3);
 582                 text++;
 583                 a ^= nbs(5);
 584                 a ^= nbs(6);
 585                 a ^= nbs(7);
 586                 text++;
 587                 return a ^ nbs(9) ^ nbs(10);
 588
 589         case 13:
 590                 a = fbs(0);
 591                 a ^= nbs(1);
 592                 text++;
 593                 a ^= nbs(3);
 594                 a ^= nbs(4);
 595                 text += 2;
 596                 a ^= nbs(7);
 597                 a ^= nbs(8);
 598                 text += 2;
 599                 return a ^ nbs(9) ^ nbs(10);
 600
 601         case 14:
 602                 a = fbs(0);
 603                 text += 2;
 604                 a ^= nbs(3);
 605                 a ^= nbs(4);
 606                 text += 2;
 607                 a ^= nbs(7);
 608                 a ^= nbs(8);
 609                 text += 2;
 610                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 611
 612         case 15:
 613                 a = fbs(0);
 614                 text += 2;
 615                 a ^= nbs(3);
 616                 a ^= nbs(4);
 617                 text += 2;
 618                 a ^= nbs(7);
 619                 a ^= nbs(8);
 620                 text += 2;
 621                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 622
 623         default:  /* 3 characters from beginning */
 624                 a = fbs(0);
 625                 text += 2;
 626                 a ^= nbs(3);
 627                 a ^= nbs(4);
 628
 629                 /* 2 characters from middle */
 630                 text = start_pos + (length / 2);
 631                 a ^= fbs(5);
 632                 text += 2;
 633                 a ^= nbs(6);
 634
 635                 /* 3 characters from end */
 636                 text = start_pos + length - 4;
 637
 638                 a ^= fbs(7);
 639                 text++;
 640
 641                 return a ^ nbs(10) ^ nbs(11);
 642     }
 643 }
 644
 645 /* utf_full_hashkey ************************************************************
 646
 647    This function computes a hash value using all bytes in the string.
 648
 649    The algorithm is the "One-at-a-time" algorithm as published
 650    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 651
 652 *******************************************************************************/
 653
 654 u4 utf_full_hashkey(const char *text, u4 length)
 655 {
 656         register const unsigned char *p = (const unsigned char *) text;
 657         register u4 hash;
 658         register u4 i;
 659
 660         hash = 0;
 661         for (i=length; i--;)
 662         {
 663             hash += *p++;
 664             hash += (hash << 10);
 665             hash ^= (hash >> 6);
 666         }
 667         hash += (hash << 3);
 668         hash ^= (hash >> 11);
 669         hash += (hash << 15);
 670
 671         return hash;
 672 }
 673
 674 /* unicode_hashkey *************************************************************
 675
 676    Compute the hashkey of a unicode string.
 677
 678 *******************************************************************************/
 679
 680 u4 unicode_hashkey(u2 *text, u2 len)
 681 {
 682         return utf_hashkey((char *) text, len);
 683 }
 684
 685
 686 /* utf_new *********************************************************************
 687
 688    Creates a new utf-symbol, the text of the symbol is passed as a
 689    u1-array. The function searches the utf-hashtable for a utf-symbol
  690    with this text. On success the element is returned, otherwise a new
 691    hashtable element is created.
 692
 693    If the number of entries in the hashtable exceeds twice the size of
 694    the hashtable slots a reorganization of the hashtable is done and
 695    the utf symbols are copied to a new hashtable with doubled size.
 696
 697 *******************************************************************************/
 698
 699 utf *utf_new(const char *text, u2 length)
 700 {
 701         u4 key;                             /* hashkey computed from utf-text     */
 702         u4 slot;                            /* slot in hashtable                  */
 703         utf *u;                             /* hashtable element                  */
 704         u2 i;
 705
 706         Mutex_lock(hashtable_utf->mutex);
 707
 708 #if defined(ENABLE_STATISTICS)
 709         if (opt_stat)
 710                 count_utf_new++;
 711 #endif
 712
 713         key  = utf_hashkey(text, length);
 714         slot = key & (hashtable_utf->size - 1);
 715         u    = hashtable_utf->ptr[slot];
 716
 717         /* search external hash chain for utf-symbol */
 718
 719         while (u) {
 720                 if (u->blength == length) {
 721                         /* compare text of hashtable elements */
 722
 723                         for (i = 0; i < length; i++)
 724                                 if (text[i] != u->text[i])
 725                                         goto nomatch;
 726
 727 #if defined(ENABLE_STATISTICS)
 728                         if (opt_stat)
 729                                 count_utf_new_found++;
 730 #endif
 731
 732                         /* symbol found in hashtable */
 733
 734                         Mutex_unlock(hashtable_utf->mutex);
 735
 736                         return u;
 737                 }
 738
 739         nomatch:
 740                 u = u->hashlink; /* next element in external chain */
 741         }
 742
 743         /* location in hashtable found, create new utf element */
 744
 745         u = NEW(utf);
 746
 747         u->blength  = length;               /* length in bytes of utfstring       */
 748         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 749         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 750
 751         memcpy(u->text, text, length);      /* copy utf-text                      */
 752         u->text[length] = '\0';
 753
 754 #if defined(ENABLE_STATISTICS)
 755         if (opt_stat)
 756                 count_utf_len += sizeof(utf) + length + 1;
 757 #endif
 758
 759         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 760         hashtable_utf->entries++;           /* update number of entries           */
 761
 762         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 763
 764         /* reorganization of hashtable, average length of the external
 765            chains is approx. 2 */
 766
 767                 hashtable *newhash;                              /* the new hashtable */
 768                 u4         i;
 769                 utf       *u;
 770                 utf       *nextu;
 771                 u4         slot;
 772
 773                 /* create new hashtable, double the size */
 774
 775                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 776
 777 #if defined(ENABLE_STATISTICS)
 778                 if (opt_stat)
 779                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 780 #endif
 781
 782                 /* transfer elements to new hashtable */
 783
 784                 for (i = 0; i < hashtable_utf->size; i++) {
 785                         u = hashtable_utf->ptr[i];
 786
 787                         while (u) {
 788                                 nextu = u->hashlink;
 789                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 790
 791                                 u->hashlink = (utf *) newhash->ptr[slot];
 792                                 newhash->ptr[slot] = u;
 793
 794                                 /* follow link in external hash chain */
 795
 796                                 u = nextu;
 797                         }
 798                 }
 799
 800                 /* dispose old table */
 801
 802                 hashtable_free(hashtable_utf);
 803
 804                 hashtable_utf = newhash;
 805         }
 806
 807         Mutex_unlock(hashtable_utf->mutex);
 808
 809         return u;
 810 }
 811
 812
 813 /* utf_new_u2 ******************************************************************
 814
 815    Make utf symbol from u2 array, if isclassname is true '.' is
 816    replaced by '/'.
 817
 818 *******************************************************************************/
 819
 820 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 821 {
 822         char *buffer;                   /* memory buffer for  unicode characters  */
 823         char *pos;                      /* pointer to current position in buffer  */
 824         u4 left;                        /* unicode characters left                */
 825         u4 buflength;                   /* utf length in bytes of the u2 array    */
 826         utf *result;                    /* resulting utf-string                   */
 827         int i;
 828
 829         /* determine utf length in bytes and allocate memory */
 830
 831         buflength = u2_utflength(unicode_pos, unicode_length);
 832         buffer    = MNEW(char, buflength);
 833
 834         left = buflength;
 835         pos  = buffer;
 836
 837         for (i = 0; i++ < unicode_length; unicode_pos++) {
 838                 /* next unicode character */
 839                 u2 c = *unicode_pos;
 840
 841                 if ((c != 0) && (c < 0x80)) {
 842                         /* 1 character */
 843                         left--;
 844                 if ((int) left < 0) break;
 845                         /* convert classname */
 846                         if (isclassname && c == '.')
 847                                 *pos++ = '/';
 848                         else
 849                                 *pos++ = (char) c;
 850
 851                 } else if (c < 0x800) {
 852                         /* 2 characters */
 853                 unsigned char high = c >> 6;
 854                 unsigned char low  = c & 0x3F;
 855                         left = left - 2;
 856                 if ((int) left < 0) break;
 857                 *pos++ = high | 0xC0;
 858                 *pos++ = low  | 0x80;
 859
 860                 } else {
 861                 /* 3 characters */
 862                 char low  = c & 0x3f;
 863                 char mid  = (c >> 6) & 0x3F;
 864                 char high = c >> 12;
 865                         left = left - 3;
 866                 if ((int) left < 0) break;
 867                 *pos++ = high | 0xE0;
 868                 *pos++ = mid  | 0x80;
 869                 *pos++ = low  | 0x80;
 870                 }
 871         }
 872
 873         /* insert utf-string into symbol-table */
 874         result = utf_new(buffer,buflength);
 875
 876         MFREE(buffer, char, buflength);
 877
 878         return result;
 879 }
 880
 881
 882 /* utf_new_char ****************************************************************
 883
 884    Creates a new utf symbol, the text for this symbol is passed as a
 885    c-string ( = char* ).
 886
 887 *******************************************************************************/
 888
 889 utf *utf_new_char(const char *text)
 890 {
 891         return utf_new(text, strlen(text));
 892 }
 893
 894
 895 /* utf_new_char_classname ******************************************************
 896
 897    Creates a new utf symbol, the text for this symbol is passed as a
 898    c-string ( = char* ) "." characters are going to be replaced by
 899    "/". Since the above function is used often, this is a separte
 900    function, instead of an if.
 901
 902 *******************************************************************************/
 903
 904 utf *utf_new_char_classname(const char *text)
 905 {
 906         if (strchr(text, '.')) {
 907                 char *txt = strdup(text);
 908                 char *end = txt + strlen(txt);
 909                 char *c;
 910                 utf *tmpRes;
 911
 912                 for (c = txt; c < end; c++)
 913                         if (*c == '.') *c = '/';
 914
 915                 tmpRes = utf_new(txt, strlen(txt));
 916                 FREE(txt, 0);
 917
 918                 return tmpRes;
 919
 920         } else
 921                 return utf_new(text, strlen(text));
 922 }
 923
 924
 925 /* utf_nextu2 ******************************************************************
 926
 927    Read the next unicode character from the utf string and increment
 928    the utf-string pointer accordingly.
 929
 930    CAUTION: This function is unsafe for input that was not checked
 931             by is_valid_utf!
 932
 933 *******************************************************************************/
 934
 935 u2 utf_nextu2(char **utf_ptr)
 936 {
 937     /* uncompressed unicode character */
 938     u2 unicode_char = 0;
 939     /* current position in utf text */
 940     unsigned char *utf = (unsigned char *) (*utf_ptr);
 941     /* bytes representing the unicode character */
 942     unsigned char ch1, ch2, ch3;
 943     /* number of bytes used to represent the unicode character */
 944     int len = 0;
 945
 946     switch ((ch1 = utf[0]) >> 4) {
 947         default: /* 1 byte */
 948                 (*utf_ptr)++;
 949                 return (u2) ch1;
 950         case 0xC:
 951         case 0xD: /* 2 bytes */
 952                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 953                         unsigned char high = ch1 & 0x1F;
 954                         unsigned char low  = ch2 & 0x3F;
 955                         unicode_char = (high << 6) + low;
 956                         len = 2;
 957                 }
 958                 break;
 959
 960         case 0xE: /* 2 or 3 bytes */
 961                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 962                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 963                                 unsigned char low  = ch3 & 0x3f;
 964                                 unsigned char mid  = ch2 & 0x3f;
 965                                 unsigned char high = ch1 & 0x0f;
 966                                 unicode_char = (((high << 6) + mid) << 6) + low;
 967                                 len = 3;
 968                         } else
 969                                 len = 2;
 970                 }
 971                 break;
 972     }
 973
 974     /* update position in utf-text */
 975     *utf_ptr = (char *) (utf + len);
 976
 977     return unicode_char;
 978 }
 979
 980
 981 /* utf_bytes *******************************************************************
 982
 983    Determine number of bytes (aka. octets) in the utf string.
 984
 985    IN:
 986       u............utf string
 987
 988    OUT:
 989       The number of octets of this utf string.
 990           There is _no_ terminating zero included in this count.
 991
 992 *******************************************************************************/
 993
 994 u4 utf_bytes(utf *u)
 995 {
 996         return u->blength;
 997 }
 998
 999
1000 /* utf_get_number_of_u2s_for_buffer ********************************************
1001
1002    Determine number of UTF-16 u2s in the given UTF-8 buffer
1003
1004    CAUTION: This function is unsafe for input that was not checked
1005             by is_valid_utf!
1006
1007    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
1008    to an array of u2s (UTF-16) and want to know how many of them you will get.
1009    All other uses of this function are probably wrong.
1010
1011    IN:
1012       buffer........points to first char in buffer
1013           blength.......number of _bytes_ in the buffer
1014
1015    OUT:
1016       the number of u2s needed to hold this string in UTF-16 encoding.
1017           There is _no_ terminating zero included in this count.
1018
1019    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1020    exception.
1021
1022 *******************************************************************************/
1023
1024 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1025 {
1026         const char *endpos;                 /* points behind utf string           */
1027         const char *utf_ptr;                /* current position in utf text       */
1028         u4 len = 0;                         /* number of unicode characters       */
1029
1030         utf_ptr = buffer;
1031         endpos = utf_ptr + blength;
1032
1033         while (utf_ptr < endpos) {
1034                 len++;
1035                 /* next unicode character */
1036                 utf_nextu2((char **)&utf_ptr);
1037         }
1038
1039         assert(utf_ptr == endpos);
1040
1041         return len;
1042 }
1043
1044
1045 /* utf_get_number_of_u2s *******************************************************
1046
1047    Determine number of UTF-16 u2s in the utf string.
1048
1049    CAUTION: This function is unsafe for input that was not checked
1050             by is_valid_utf!
1051
1052    CAUTION: Use this function *only* when you want to convert a utf string
1053    to an array of u2s and want to know how many of them you will get.
1054    All other uses of this function are probably wrong.
1055
1056    IN:
1057       u............utf string
1058
1059    OUT:
1060       the number of u2s needed to hold this string in UTF-16 encoding.
1061           There is _no_ terminating zero included in this count.
1062           XXX 0 if a NullPointerException has been thrown (see below)
1063
1064 *******************************************************************************/
1065
1066 u4 utf_get_number_of_u2s(utf *u)
1067 {
1068         char *endpos;                       /* points behind utf string           */
1069         char *utf_ptr;                      /* current position in utf text       */
1070         u4 len = 0;                         /* number of unicode characters       */
1071
1072         /* XXX this is probably not checked by most callers! Review this after */
1073         /* the invalid uses of this function have been eliminated */
1074         if (u == NULL) {
1075                 exceptions_throw_nullpointerexception();
1076                 return 0;
1077         }
1078
1079         endpos = UTF_END(u);
1080         utf_ptr = u->text;
1081
1082         while (utf_ptr < endpos) {
1083                 len++;
1084                 /* next unicode character */
1085                 utf_nextu2(&utf_ptr);
1086         }
1087
1088         if (utf_ptr != endpos) {
1089                 /* string ended abruptly */
1090                 exceptions_throw_internalerror("Illegal utf8 string");
1091                 return 0;
1092         }
1093
1094         return len;
1095 }
1096
1097
/* utf8_safe_number_of_u2s *****************************************************

   Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
   (For invalid UTF-8 the U+fffd replacement character will be counted.)

   This function is safe even for invalid UTF-8 strings.

   How a sequence is counted:
      - a lead byte is followed by up to 1..5 continuation bytes
        (10xxxxxx), depending on its bit pattern; scanning stops at the
        first byte that is not a continuation byte
      - for lead bytes announcing 2 or more continuation bytes, if that
        many bytes are not left before text + nbytes, one u2 is counted
        and the function returns immediately
      - a complete 4-byte sequence with a value in 0x10000..0x10FFFF
        counts two u2s (surrogate pair), everything else counts one

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).

   OUT:
      the number of u2s needed to hold this string in UTF-16 encoding.
      There is _no_ terminating zero included in this count.

*******************************************************************************/

s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
	const unsigned char *cur;           /* next byte to read                  */
	const unsigned char *limit;         /* text + nbytes                      */
	s4 count;                           /* u2s counted so far                 */
	s4 lead;                            /* first byte of current sequence     */
	s4 expect;                          /* continuation bytes announced       */
	s4 seen;                            /* continuation bytes actually found  */
	s4 value;                           /* code point of a 4-byte sequence    */

	assert(text);
	assert(nbytes >= 0);

	count = 0;
	cur   = (const unsigned char *) text;
	limit = cur + nbytes;

	/* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */

	for (;;) {
		lead = *cur++;

		/* NUL */

		if (lead == 0)
			break;

		/* ASCII character, common case */

		if ((lead & 0x80) == 0) {
			count++;
			continue;
		}

		/* highest bit set, non-ASCII character: classify the lead byte */

		if ((lead & 0xe0) == 0xc0)
			expect = 1;                 /* 110..... 10......                  */
		else if ((lead & 0xf0) == 0xe0)
			expect = 2;                 /* 1110.... 10...... 10......         */
		else if ((lead & 0xf8) == 0xf0)
			expect = 3;                 /* 11110... 10...... 10...... 10......*/
		else if ((lead & 0xfc) == 0xf8)
			expect = 4;                 /* invalid 5-byte                     */
		else if ((lead & 0xfe) == 0xfc)
			expect = 5;                 /* invalid 6-byte                     */
		else
			expect = 0;                 /* invalid lead byte                  */

		/* the 2-byte form is deliberately not limit-checked; the
		   terminating zero stops it */

		if ((expect >= 2) && (cur + expect > limit))
			return count + 1; /* invalid, stop here */

		/* consume continuation bytes, stop at the first other byte */

		for (seen = 0; (seen < expect) && ((*cur & 0xc0) == 0x80); seen++)
			cur++;

		if ((expect == 3) && (seen == 3)) {
			/* valid 4-byte UTF-8? */
			value = ((lead    & 0x07) << 18)
				  | ((cur[-3] & 0x3f) << 12)
				  | ((cur[-2] & 0x3f) <<  6)
				  | ((cur[-1] & 0x3f)      );

			if ((value > 0xFFFF) && (value <= 0x10FFFF))
				count++; /* we need surrogates */
		}

		count++;
	}

	return count;
}
1234
1235
/* utf8_safe_convert_to_u2s ****************************************************

   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.

   This function is safe even for invalid UTF-8 strings.

   What is written per sequence:
      - complete 2- and 3-byte sequences: the decoded u2
      - complete 4-byte sequences: a surrogate pair for values in
        0x10000..0x10FFFF, the value itself up to 0xFFFF, and one
        replacement character above 0x10FFFF
      - 5- and 6-byte forms, stray bytes and sequences cut short by a
        non-continuation byte: one replacement character
      - for lead bytes announcing 2 or more continuation bytes, if that
        many bytes are not left before text + nbytes: one replacement
        character, then the function returns

   No terminating zero is written to the buffer.

   IN:
      text..........zero-terminated(!) UTF-8 string (may be invalid)
                    must NOT be NULL
      nbytes........strlen(text). (This is needed to completely emulate
                    the RI).
      buffer........a preallocated array of u2s to receive the decoded
                    string. Use utf8_safe_number_of_u2s to get the
                    required number of u2s for allocating this.

*******************************************************************************/

#define UNICODE_REPLACEMENT  0xfffd

void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
	const unsigned char *cur;           /* next byte to read                  */
	const unsigned char *limit;         /* text + nbytes                      */
	const unsigned char *tail;          /* first continuation byte            */
	s4 lead;                            /* first byte of current sequence     */
	s4 expect;                          /* continuation bytes announced       */
	s4 seen;                            /* continuation bytes actually found  */
	s4 value;                           /* code point of a 4-byte sequence    */

	assert(text);
	assert(nbytes >= 0);

	cur   = (const unsigned char *) text;
	limit = cur + nbytes;

	/* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */

	for (;;) {
		lead = *cur++;

		/* NUL */

		if (lead == 0)
			return;

		/* ASCII character, common case */

		if ((lead & 0x80) == 0) {
			*buffer++ = lead;
			continue;
		}

		/* highest bit set, non-ASCII character: classify the lead byte */

		if ((lead & 0xe0) == 0xc0)
			expect = 1;                 /* 110..... 10......                  */
		else if ((lead & 0xf0) == 0xe0)
			expect = 2;                 /* 1110.... 10...... 10......         */
		else if ((lead & 0xf8) == 0xf0)
			expect = 3;                 /* 11110... 10...... 10...... 10......*/
		else if ((lead & 0xfc) == 0xf8)
			expect = 4;                 /* invalid 5-byte                     */
		else if ((lead & 0xfe) == 0xfc)
			expect = 5;                 /* invalid 6-byte                     */
		else
			expect = 0;                 /* invalid lead byte                  */

		/* the 2-byte form is deliberately not limit-checked; the
		   terminating zero stops it */

		if ((expect >= 2) && (cur + expect > limit)) {
			*buffer++ = UNICODE_REPLACEMENT;
			return;
		}

		/* consume continuation bytes, stop at the first other byte */

		tail = cur;

		for (seen = 0; (seen < expect) && ((*cur & 0xc0) == 0x80); seen++)
			cur++;

		if (seen != expect) {
			/* sequence cut short */
			*buffer++ = UNICODE_REPLACEMENT;
			continue;
		}

		switch (expect) {
		case 1:
			/* valid 2-byte UTF-8 */
			*buffer++ = ((lead    & 0x1f) << 6)
					  | ((tail[0] & 0x3f)     );
			break;

		case 2:
			/* valid 3-byte UTF-8 */
			*buffer++ = ((lead    & 0x0f) << 12)
					  | ((tail[0] & 0x3f) <<  6)
					  | ((tail[1] & 0x3f)      );
			break;

		case 3:
			/* valid 4-byte UTF-8? */
			value = ((lead    & 0x07) << 18)
				  | ((tail[0] & 0x3f) << 12)
				  | ((tail[1] & 0x3f) <<  6)
				  | ((tail[2] & 0x3f)      );

			if (value > 0x10FFFF) {
				*buffer++ = UNICODE_REPLACEMENT;
			}
			else if (value > 0xFFFF) {
				/* we need surrogates */
				*buffer++ = 0xd800 | ((value >> 10) - 0x40);
				*buffer++ = 0xdc00 | (value & 0x03ff);
			}
			else {
				*buffer++ = value; /* 16bit suffice */
			}
			break;

		default:
			/* stray byte, or 5-/6-byte form */
			*buffer++ = UNICODE_REPLACEMENT;
			break;
		}
	}
}
1399
1400
/* u2_utflength ****************************************************************

   Returns the utf length in bytes of a u2 array.

   Per u2: 0x0001..0x007f need one byte, 0x0000 and 0x0080..0x07ff need
   two bytes, everything else needs three bytes.

   IN:
      text.........points to the first u2
      u2_length....number of u2s in the array

   OUT:
      the number of bytes needed, without a terminating zero

*******************************************************************************/

u4 u2_utflength(u2 *text, u4 u2_length)
{
	u4 nbytes;                          /* utf length in bytes                */
	u4 idx;

	nbytes = 0;

	for (idx = 0; idx < u2_length; idx++) {
		/* next unicode character */
		u2 ch = text[idx];

		/* determine bytes required to store unicode character as utf */
		if ((ch == 0) || (ch >= 0x80))
			nbytes += (ch < 0x800) ? 2 : 3;
		else
			nbytes += 1;
	}

	return nbytes;
}
1428
1429
1430 /* utf_copy ********************************************************************
1431
1432    Copy the given utf string byte-for-byte to a buffer.
1433
1434    IN:
1435       buffer.......the buffer
1436           u............the utf string
1437
1438 *******************************************************************************/
1439
1440 void utf_copy(char *buffer, utf *u)
1441 {
1442         /* our utf strings are zero-terminated (done by utf_new) */
1443         MCOPY(buffer, u->text, char, u->blength + 1);
1444 }
1445
1446
1447 /* utf_cat *********************************************************************
1448
1449    Append the given utf string byte-for-byte to a buffer.
1450
1451    IN:
1452       buffer.......the buffer
1453           u............the utf string
1454
1455 *******************************************************************************/
1456
1457 void utf_cat(char *buffer, utf *u)
1458 {
1459         /* our utf strings are zero-terminated (done by utf_new) */
1460         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1461 }
1462
1463
1464 /* utf_copy_classname **********************************************************
1465
1466    Copy the given utf classname byte-for-byte to a buffer.
1467    '/' is replaced by '.'
1468
1469    IN:
1470       buffer.......the buffer
1471           u............the utf string
1472
1473 *******************************************************************************/
1474
1475 void utf_copy_classname(char *buffer, utf *u)
1476 {
1477         char *bufptr;
1478         char *srcptr;
1479         char *endptr;
1480         char ch;
1481
1482         bufptr = buffer;
1483         srcptr = u->text;
1484         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1485
1486         while (srcptr != endptr) {
1487                 ch = *srcptr++;
1488                 if (ch == '/')
1489                         ch = '.';
1490                 *bufptr++ = ch;
1491         }
1492 }
1493
1494
1495 /* utf_cat *********************************************************************
1496
1497    Append the given utf classname byte-for-byte to a buffer.
1498    '/' is replaced by '.'
1499
1500    IN:
1501       buffer.......the buffer
1502           u............the utf string
1503
1504 *******************************************************************************/
1505
1506 void utf_cat_classname(char *buffer, utf *u)
1507 {
1508         utf_copy_classname(buffer + strlen(buffer), u);
1509 }
1510
1511 /* utf_display_printable_ascii *************************************************
1512
1513    Write utf symbol to stdout (for debugging purposes).
1514    Non-printable and non-ASCII characters are printed as '?'.
1515
1516 *******************************************************************************/
1517
1518 void utf_display_printable_ascii(utf *u)
1519 {
1520         char *endpos;                       /* points behind utf string           */
1521         char *utf_ptr;                      /* current position in utf text       */
1522
1523         if (u == NULL) {
1524                 printf("NULL");
1525                 fflush(stdout);
1526                 return;
1527         }
1528
1529         endpos = UTF_END(u);
1530         utf_ptr = u->text;
1531
1532         while (utf_ptr < endpos) {
1533                 /* read next unicode character */
1534
1535                 u2 c = utf_nextu2(&utf_ptr);
1536
1537                 if ((c >= 32) && (c <= 127))
1538                         printf("%c", c);
1539                 else
1540                         printf("?");
1541         }
1542
1543         fflush(stdout);
1544 }
1545
1546
1547 /* utf_display_printable_ascii_classname ***************************************
1548
1549    Write utf symbol to stdout with `/' converted to `.' (for debugging
1550    purposes).
1551    Non-printable and non-ASCII characters are printed as '?'.
1552
1553 *******************************************************************************/
1554
1555 void utf_display_printable_ascii_classname(utf *u)
1556 {
1557         char *endpos;                       /* points behind utf string           */
1558         char *utf_ptr;                      /* current position in utf text       */
1559
1560         if (u == NULL) {
1561                 printf("NULL");
1562                 fflush(stdout);
1563                 return;
1564         }
1565
1566         endpos = UTF_END(u);
1567         utf_ptr = u->text;
1568
1569         while (utf_ptr < endpos) {
1570                 /* read next unicode character */
1571
1572                 u2 c = utf_nextu2(&utf_ptr);
1573
1574                 if (c == '/')
1575                         c = '.';
1576
1577                 if ((c >= 32) && (c <= 127))
1578                         printf("%c", c);
1579                 else
1580                         printf("?");
1581         }
1582
1583         fflush(stdout);
1584 }
1585
1586
1587 /* utf_sprint_convert_to_latin1 ************************************************
1588
1589    Write utf symbol into c-string (for debugging purposes).
1590    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1591    invalid results.
1592
1593 *******************************************************************************/
1594
1595 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1596 {
1597         char *endpos;                       /* points behind utf string           */
1598         char *utf_ptr;                      /* current position in utf text       */
1599         u2 pos = 0;                         /* position in c-string               */
1600
1601         if (!u) {
1602                 strcpy(buffer, "NULL");
1603                 return;
1604         }
1605
1606         endpos = UTF_END(u);
1607         utf_ptr = u->text;
1608
1609         while (utf_ptr < endpos)
1610                 /* copy next unicode character */
1611                 buffer[pos++] = utf_nextu2(&utf_ptr);
1612
1613         /* terminate string */
1614         buffer[pos] = '\0';
1615 }
1616
1617
1618 /* utf_sprint_convert_to_latin1_classname **************************************
1619
1620    Write utf symbol into c-string with `/' converted to `.' (for debugging
1621    purposes).
1622    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1623    invalid results.
1624
1625 *******************************************************************************/
1626
1627 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1628 {
1629         char *endpos;                       /* points behind utf string           */
1630         char *utf_ptr;                      /* current position in utf text       */
1631         u2 pos = 0;                         /* position in c-string               */
1632
1633         if (!u) {
1634                 strcpy(buffer, "NULL");
1635                 return;
1636         }
1637
1638         endpos = UTF_END(u);
1639         utf_ptr = u->text;
1640
1641         while (utf_ptr < endpos) {
1642                 /* copy next unicode character */
1643                 u2 c = utf_nextu2(&utf_ptr);
1644                 if (c == '/') c = '.';
1645                 buffer[pos++] = c;
1646         }
1647
1648         /* terminate string */
1649         buffer[pos] = '\0';
1650 }
1651
1652
1653 /* utf_strcat_convert_to_latin1 ************************************************
1654
1655    Like libc strcat, but uses an utf8 string.
1656    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1657    invalid results.
1658
1659 *******************************************************************************/
1660
1661 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1662 {
1663         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1664 }
1665
1666
1667 /* utf_strcat_convert_to_latin1_classname **************************************
1668
1669    Like libc strcat, but uses an utf8 string.
1670    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1671    invalid results.
1672
1673 *******************************************************************************/
1674
1675 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1676 {
1677         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1678 }
1679
1680
1681 /* utf_fprint_printable_ascii **************************************************
1682
1683    Write utf symbol into file.
1684    Non-printable and non-ASCII characters are printed as '?'.
1685
1686 *******************************************************************************/
1687
1688 void utf_fprint_printable_ascii(FILE *file, utf *u)
1689 {
1690         char *endpos;                       /* points behind utf string           */
1691         char *utf_ptr;                      /* current position in utf text       */
1692
1693         if (!u)
1694                 return;
1695
1696         endpos = UTF_END(u);
1697         utf_ptr = u->text;
1698
1699         while (utf_ptr < endpos) {
1700                 /* read next unicode character */
1701                 u2 c = utf_nextu2(&utf_ptr);
1702
1703                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1704                 else fprintf(file, "?");
1705         }
1706 }
1707
1708
1709 /* utf_fprint_printable_ascii_classname ****************************************
1710
1711    Write utf symbol into file with `/' converted to `.'.
1712    Non-printable and non-ASCII characters are printed as '?'.
1713
1714 *******************************************************************************/
1715
1716 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1717 {
1718         char *endpos;                       /* points behind utf string           */
1719         char *utf_ptr;                      /* current position in utf text       */
1720
1721     if (!u)
1722                 return;
1723
1724         endpos = UTF_END(u);
1725         utf_ptr = u->text;
1726
1727         while (utf_ptr < endpos) {
1728                 /* read next unicode character */
1729                 u2 c = utf_nextu2(&utf_ptr);
1730                 if (c == '/') c = '.';
1731
1732                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1733                 else fprintf(file, "?");
1734         }
1735 }
1736
1737
/* is_valid_utf ****************************************************************

   Return true if the given byte range is a valid UTF-8 string in the
   form accepted for Java.

   Accepted are the single bytes 0x01..0x7f and sequences of two or
   three bytes (lead byte 110xxxxx resp. 1110xxxx, followed by one
   resp. two continuation bytes 10xxxxxx). Rejected are:

      - a 0x00 byte,
      - any other lead byte (this covers all sequences longer than
        three bytes),
      - a sequence which is cut off by end_pos,
      - a continuation byte which is not of the form 10xxxxxx,
      - a three-byte sequence which encodes the value zero (the
        two-byte form 0xc0 0x80 is accepted).

   Overlong encodings, surrogates and the codepoints 0xfffe and 0xffff
   are deliberately not rejected, as they seem to be allowed in Java
   classfiles.

   An empty range is valid, a range with end_pos < utf_ptr is not.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur  = (const unsigned char *) utf_ptr;
	const unsigned char *stop = (const unsigned char *) end_pos;

	if (stop < cur)
		return false;

	while (cur < stop) {
		unsigned char lead = *cur++;
		unsigned long value;            /* decoded value of the sequence  */
		int           trailing;         /* number of continuation bytes   */
		int           k;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed            */

		if (lead < 0x80)
			continue;                   /* ASCII                          */

		if ((lead & 0xe0) == 0xc0) {    /* 110x xxxx                      */
			trailing = 1;
			value    = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0) {   /* 1110 xxxx                  */
			trailing = 2;
			value    = lead & 0x0f;
		}
		else {
			/* stray continuation byte, invalid lead byte, or a lead
			   byte of a longer sequence (Java limitation) */
			return false;
		}

		if ((stop - cur) < trailing)
			return false;               /* missing bytes                  */

		for (k = 0; k < trailing; k++) {
			unsigned char next = *cur++;

			if ((next & 0xc0) != 0x80)  /* 10xx xxxx                      */
				return false;

			value = (value << 6) | (next & 0x3f);
		}

		/* Java special: zero may only be encoded with two bytes */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1803
1804
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, bytes below
   0x20 and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside the given range are looked at. If the range ends
   directly after a 0xc0 byte, the byte at end_pos is not read.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur  = (const unsigned char *) utf_ptr;
	const unsigned char *stop = (const unsigned char *) end_pos;

	if (stop <= cur)
		return false;                   /* disallow empty names           */

	while (cur < stop) {
		unsigned char c = *cur++;

		if (c < 0x20)
			return false;               /* disallow control characters    */

		/* disallow zero; the second byte is checked only if it still
		   belongs to the range */

		if ((c == 0xc0) && (cur < stop) && (*cur == 0x80))
			return false;
	}

	return true;
}
1832
1833 bool is_valid_name_utf(utf *u)
1834 {
1835         return is_valid_name(u->text, UTF_END(u));
1836 }
1837
1838
1839 /* utf_show ********************************************************************
1840
1841    Writes the utf symbols in the utfhash to stdout and displays the
1842    number of external hash chains grouped according to the chainlength
1843    (for debugging purposes).
1844
1845 *******************************************************************************/
1846
1847 #if !defined(NDEBUG)
1848 void utf_show(void)
1849 {
1850
1851 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1852
1853         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1854         u4 max_chainlength = 0;      /* maximum length of the chains */
1855         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1856         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1857         u4 i;
1858
1859         printf("UTF-HASH:\n");
1860
1861         /* show element of utf-hashtable */
1862
1863         for (i = 0; i < hashtable_utf->size; i++) {
1864                 utf *u = hashtable_utf->ptr[i];
1865
1866                 if (u) {
1867                         printf("SLOT %d: ", (int) i);
1868
1869                         while (u) {
1870                                 printf("'");
1871                                 utf_display_printable_ascii(u);
1872                                 printf("' ");
1873                                 u = u->hashlink;
1874                         }
1875                         printf("\n");
1876                 }
1877         }
1878
1879         printf("UTF-HASH: %d slots for %d entries\n",
1880                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1881
1882         if (hashtable_utf->entries == 0)
1883                 return;
1884
1885         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1886
1887         for (i=0;i<CHAIN_LIMIT;i++)
1888                 chain_count[i]=0;
1889
1890         /* count numbers of hashchains according to their length */
1891         for (i=0; i<hashtable_utf->size; i++) {
1892
1893                 utf *u = (utf*) hashtable_utf->ptr[i];
1894                 u4 chain_length = 0;
1895
1896                 /* determine chainlength */
1897                 while (u) {
1898                         u = u->hashlink;
1899                         chain_length++;
1900                 }
1901
1902                 /* update sum of all chainlengths */
1903                 sum_chainlength+=chain_length;
1904
1905                 /* determine the maximum length of the chains */
1906                 if (chain_length>max_chainlength)
1907                         max_chainlength = chain_length;
1908
1909                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1910                 if (chain_length>=CHAIN_LIMIT) {
1911                         beyond_limit+=chain_length;
1912                         chain_length=CHAIN_LIMIT-1;
1913                 }
1914
1915                 /* update number of hashchains of current length */
1916                 chain_count[chain_length]++;
1917         }
1918
1919         /* display results */
1920         for (i=1;i<CHAIN_LIMIT-1;i++)
1921                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1922
1923         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1924
1925
1926         printf("max. chainlength:%5d\n",max_chainlength);
1927
1928         /* avg. chainlength = sum of chainlengths / number of chains */
1929         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1930 }
1931 #endif /* !defined(NDEBUG) */
1932
1933
1934 /*
1935  * These are local overrides for various environment variables in Emacs.
1936  * Please do not remove this and leave it at the end of the file, where
1937  * Emacs will automagically detect them.
1938  * ---------------------------------------------------------------------
1939  * Local variables:
1940  * mode: c
1941  * indent-tabs-mode: t
1942  * c-basic-offset: 4
1943  * tab-width: 4
1944  * End:
1945  * vim:noexpandtab:sw=4:ts=4:
1946  */