src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007, 2008
   4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
   5
   6    This file is part of CACAO.
   7
   8    This program is free software; you can redistribute it and/or
   9    modify it under the terms of the GNU General Public License as
  10    published by the Free Software Foundation; either version 2, or (at
  11    your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful, but
  14    WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16    General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; if not, write to the Free Software
  20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  21    02110-1301, USA.
  22
  23 */
  24
  25
  26 #include "config.h"
  27
  28 #include <string.h>
  29 #include <assert.h>
  30
  31 #include "vm/types.h"
  32
  33 #include "mm/memory.hpp"
  34
  35 #include "threads/mutex.hpp"
  36
  37 #include "toolbox/hashtable.h"
  38
  39 #include "vm/exceptions.hpp"
  40 #include "vm/options.h"
  41
  42 #if defined(ENABLE_STATISTICS)
  43 # include "vm/statistics.h"
  44 #endif
  45
  46 #include "vm/utf8.h"
  47
  48
  49 /* global variables ***********************************************************/
  50
  51 /* hashsize must be power of 2 */
  52
  53 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  54
  55 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  56
  57
  58 /* utf-symbols for pointer comparison of frequently used strings **************/
  59
  60 utf *utf_java_lang_Object;
  61
  62 utf *utf_java_lang_Class;
  63 utf *utf_java_lang_ClassLoader;
  64 utf *utf_java_lang_Cloneable;
  65 utf *utf_java_lang_SecurityManager;
  66 utf *utf_java_lang_String;
  67 utf *utf_java_lang_ThreadGroup;
  68 utf *utf_java_lang_ref_SoftReference;
  69 utf *utf_java_lang_ref_WeakReference;
  70 utf *utf_java_lang_ref_PhantomReference;
  71 utf *utf_java_io_Serializable;
  72
  73 utf *utf_java_lang_Throwable;
  74 utf *utf_java_lang_Error;
  75
  76 utf *utf_java_lang_AbstractMethodError;
  77 utf *utf_java_lang_ClassCircularityError;
  78 utf *utf_java_lang_ClassFormatError;
  79 utf *utf_java_lang_ExceptionInInitializerError;
  80 utf *utf_java_lang_IncompatibleClassChangeError;
  81 utf *utf_java_lang_InstantiationError;
  82 utf *utf_java_lang_InternalError;
  83 utf *utf_java_lang_LinkageError;
  84 utf *utf_java_lang_NoClassDefFoundError;
  85 utf *utf_java_lang_NoSuchFieldError;
  86 utf *utf_java_lang_NoSuchMethodError;
  87 utf *utf_java_lang_OutOfMemoryError;
  88 utf *utf_java_lang_UnsatisfiedLinkError;
  89 utf *utf_java_lang_UnsupportedClassVersionError;
  90 utf *utf_java_lang_VerifyError;
  91 utf *utf_java_lang_VirtualMachineError;
  92
  93 utf *utf_java_lang_Exception;
  94
  95 utf *utf_java_lang_ArithmeticException;
  96 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
  97 utf *utf_java_lang_ArrayStoreException;
  98 utf *utf_java_lang_ClassCastException;
  99 utf *utf_java_lang_ClassNotFoundException;
 100 utf *utf_java_lang_CloneNotSupportedException;
 101 utf *utf_java_lang_IllegalAccessException;
 102 utf *utf_java_lang_IllegalArgumentException;
 103 utf *utf_java_lang_IllegalMonitorStateException;
 104 utf *utf_java_lang_InstantiationException;
 105 utf *utf_java_lang_InterruptedException;
 106 utf *utf_java_lang_NegativeArraySizeException;
 107 utf *utf_java_lang_NullPointerException;
 108 utf *utf_java_lang_RuntimeException;
 109 utf *utf_java_lang_StringIndexOutOfBoundsException;
 110
 111 utf *utf_java_lang_reflect_InvocationTargetException;
 112
 113 utf *utf_java_security_PrivilegedActionException;
 114
 115 #if defined(ENABLE_JAVASE)
 116 utf* utf_java_lang_Void;
 117 #endif
 118
 119 utf* utf_java_lang_Boolean;
 120 utf* utf_java_lang_Byte;
 121 utf* utf_java_lang_Character;
 122 utf* utf_java_lang_Short;
 123 utf* utf_java_lang_Integer;
 124 utf* utf_java_lang_Long;
 125 utf* utf_java_lang_Float;
 126 utf* utf_java_lang_Double;
 127
 128 #if defined(ENABLE_JAVASE)
 129 utf *utf_java_lang_StackTraceElement;
 130 utf *utf_java_lang_reflect_Constructor;
 131 utf *utf_java_lang_reflect_Field;
 132 utf *utf_java_lang_reflect_Method;
 133
 134 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
 135 utf *utf_java_lang_reflect_VMConstructor;
 136 utf *utf_java_lang_reflect_VMField;
 137 utf *utf_java_lang_reflect_VMMethod;
 138 # endif
 139
 140 utf *utf_java_util_Vector;
 141 #endif
 142
 143 utf *utf_InnerClasses;                  /* InnerClasses                       */
 144 utf *utf_ConstantValue;                 /* ConstantValue                      */
 145 utf *utf_Code;                          /* Code                               */
 146 utf *utf_Exceptions;                    /* Exceptions                         */
 147 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 148 utf *utf_SourceFile;                    /* SourceFile                         */
 149
 150 #if defined(ENABLE_JAVASE)
 151 utf *utf_EnclosingMethod;
 152 utf *utf_Signature;
 153 utf *utf_StackMapTable;
 154
 155 # if defined(ENABLE_JVMTI)
 156 utf *utf_LocalVariableTable;
 157 # endif
 158
 159 # if defined(ENABLE_ANNOTATIONS)
 160 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
 161 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
 162 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
 163 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
 164 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
 165 # endif
 166 #endif
 167
 168 utf *utf_init;                          /* <init>                             */
 169 utf *utf_clinit;                        /* <clinit>                           */
 170 utf *utf_clone;                         /* clone                              */
 171 utf *utf_finalize;                      /* finalize                           */
 172 utf *utf_invoke;
 173 utf *utf_main;
 174 utf *utf_run;                           /* run                                */
 175
 176 utf *utf_add;
 177 utf *utf_dispatch;
 178 utf *utf_remove;
 179 utf *utf_addThread;
 180 utf *utf_removeThread;
 181 utf *utf_put;
 182 utf *utf_get;
 183 utf *utf_uncaughtException;
 184 utf *utf_value;
 185
 186 utf *utf_fillInStackTrace;
 187 utf *utf_findNative;
 188 utf *utf_getSystemClassLoader;
 189 utf *utf_initCause;
 190 utf *utf_loadClass;
 191 utf *utf_loadClassInternal;
 192 utf *utf_printStackTrace;
 193
 194 utf *utf_division_by_zero;
 195
 196 utf *utf_Z;                             /* Z                                  */
 197 utf *utf_B;                             /* B                                  */
 198 utf *utf_C;                             /* C                                  */
 199 utf *utf_S;                             /* S                                  */
 200 utf *utf_I;                             /* I                                  */
 201 utf *utf_J;                             /* J                                  */
 202 utf *utf_F;                             /* F                                  */
 203 utf *utf_D;                             /* D                                  */
 204
 205 utf *utf_void__void;                    /* ()V                                */
 206 utf *utf_boolean__void;                 /* (Z)V                               */
 207 utf *utf_byte__void;                    /* (B)V                               */
 208 utf *utf_char__void;                    /* (C)V                               */
 209 utf *utf_short__void;                   /* (S)V                               */
 210 utf *utf_int__void;                     /* (I)V                               */
 211 utf *utf_long__void;                    /* (J)V                               */
 212 utf *utf_float__void;                   /* (F)V                               */
 213 utf *utf_double__void;                  /* (D)V                               */
 214
 215 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 216 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 217 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 218 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 219 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 220 utf *utf_java_lang_Object__java_lang_Object;
 221 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 222 utf *utf_java_lang_String__java_lang_Class;
 223 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 224 utf *utf_java_lang_Thread_java_lang_Throwable__V;
 225 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
 226 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 227 utf *utf_java_lang_Throwable__java_lang_Throwable;
 228
 229 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 230 utf *utf_null;
 231 utf *array_packagename;
 232
 233
 234 /* utf_init ********************************************************************
 235
 236    Initializes the utf8 subsystem.
 237
 238 *******************************************************************************/
 239
 240 void utf8_init(void)
 241 {
 242         TRACESUBSYSTEMINITIALIZATION("utf8_init");
 243
 244         /* create utf8 hashtable */
 245
 246         hashtable_utf = NEW(hashtable);
 247
 248         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 249
 250 #if defined(ENABLE_STATISTICS)
 251         if (opt_stat)
 252                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 253 #endif
 254
 255         /* create utf-symbols for pointer comparison of frequently used strings */
 256
 257         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 258
 259         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 260         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 261         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 262         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 263         utf_java_lang_String           = utf_new_char("java/lang/String");
 264         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 265
 266         utf_java_lang_ref_SoftReference =
 267                 utf_new_char("java/lang/ref/SoftReference");
 268
 269         utf_java_lang_ref_WeakReference =
 270                 utf_new_char("java/lang/ref/WeakReference");
 271
 272         utf_java_lang_ref_PhantomReference =
 273                 utf_new_char("java/lang/ref/PhantomReference");
 274
 275         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 276
 277         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 278         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 279
 280         utf_java_lang_ClassCircularityError =
 281                 utf_new_char("java/lang/ClassCircularityError");
 282
 283         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 284
 285         utf_java_lang_ExceptionInInitializerError =
 286                 utf_new_char("java/lang/ExceptionInInitializerError");
 287
 288         utf_java_lang_IncompatibleClassChangeError =
 289                 utf_new_char("java/lang/IncompatibleClassChangeError");
 290
 291         utf_java_lang_InstantiationError =
 292                 utf_new_char("java/lang/InstantiationError");
 293
 294         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 295         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 296
 297         utf_java_lang_NoClassDefFoundError =
 298                 utf_new_char("java/lang/NoClassDefFoundError");
 299
 300         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 301
 302         utf_java_lang_UnsatisfiedLinkError =
 303                 utf_new_char("java/lang/UnsatisfiedLinkError");
 304
 305         utf_java_lang_UnsupportedClassVersionError =
 306                 utf_new_char("java/lang/UnsupportedClassVersionError");
 307
 308         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 309
 310         utf_java_lang_VirtualMachineError =
 311                 utf_new_char("java/lang/VirtualMachineError");
 312
 313 #if defined(ENABLE_JAVASE)
 314         utf_java_lang_AbstractMethodError =
 315                 utf_new_char("java/lang/AbstractMethodError");
 316
 317         utf_java_lang_NoSuchFieldError =
 318                 utf_new_char("java/lang/NoSuchFieldError");
 319
 320         utf_java_lang_NoSuchMethodError =
 321                 utf_new_char("java/lang/NoSuchMethodError");
 322 #endif
 323
 324         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 325
 326         utf_java_lang_ArithmeticException =
 327                 utf_new_char("java/lang/ArithmeticException");
 328
 329         utf_java_lang_ArrayIndexOutOfBoundsException =
 330                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 331
 332         utf_java_lang_ArrayStoreException =
 333                 utf_new_char("java/lang/ArrayStoreException");
 334
 335         utf_java_lang_ClassCastException =
 336                 utf_new_char("java/lang/ClassCastException");
 337
 338         utf_java_lang_ClassNotFoundException =
 339                 utf_new_char("java/lang/ClassNotFoundException");
 340
 341         utf_java_lang_CloneNotSupportedException =
 342                 utf_new_char("java/lang/CloneNotSupportedException");
 343
 344         utf_java_lang_IllegalAccessException =
 345                 utf_new_char("java/lang/IllegalAccessException");
 346
 347         utf_java_lang_IllegalArgumentException =
 348                 utf_new_char("java/lang/IllegalArgumentException");
 349
 350         utf_java_lang_IllegalMonitorStateException =
 351                 utf_new_char("java/lang/IllegalMonitorStateException");
 352
 353         utf_java_lang_InstantiationException =
 354                 utf_new_char("java/lang/InstantiationException");
 355
 356         utf_java_lang_InterruptedException =
 357                 utf_new_char("java/lang/InterruptedException");
 358
 359         utf_java_lang_NegativeArraySizeException =
 360                 utf_new_char("java/lang/NegativeArraySizeException");
 361
 362         utf_java_lang_NullPointerException =
 363                 utf_new_char("java/lang/NullPointerException");
 364
 365         utf_java_lang_RuntimeException =
 366                 utf_new_char("java/lang/RuntimeException");
 367
 368         utf_java_lang_StringIndexOutOfBoundsException =
 369                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 370
 371         utf_java_lang_reflect_InvocationTargetException =
 372                 utf_new_char("java/lang/reflect/InvocationTargetException");
 373
 374         utf_java_security_PrivilegedActionException =
 375                 utf_new_char("java/security/PrivilegedActionException");
 376
 377 #if defined(ENABLE_JAVASE)
 378         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 379 #endif
 380
 381         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 382         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 383         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 384         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 385         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 386         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 387         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 388         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 389
 390 #if defined(ENABLE_JAVASE)
 391         utf_java_lang_StackTraceElement =
 392                 utf_new_char("java/lang/StackTraceElement");
 393
 394         utf_java_lang_reflect_Constructor =
 395                 utf_new_char("java/lang/reflect/Constructor");
 396
 397         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 398         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 399
 400 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
 401         utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
 402         utf_java_lang_reflect_VMField       = utf_new_char("java/lang/reflect/VMField");
 403         utf_java_lang_reflect_VMMethod      = utf_new_char("java/lang/reflect/VMMethod");
 404 # endif
 405
 406         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 407 #endif
 408
 409         utf_InnerClasses               = utf_new_char("InnerClasses");
 410         utf_ConstantValue              = utf_new_char("ConstantValue");
 411         utf_Code                       = utf_new_char("Code");
 412         utf_Exceptions                 = utf_new_char("Exceptions");
 413         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 414         utf_SourceFile                 = utf_new_char("SourceFile");
 415
 416 #if defined(ENABLE_JAVASE)
 417         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 418         utf_Signature                  = utf_new_char("Signature");
 419         utf_StackMapTable              = utf_new_char("StackMapTable");
 420
 421 # if defined(ENABLE_JVMTI)
 422         utf_LocalVariableTable         = utf_new_char("LocalVariableTable");
 423 # endif
 424
 425 # if defined(ENABLE_ANNOTATIONS)
 426         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 427         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 428         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 429         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 430         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 431 # endif
 432 #endif
 433
 434         utf_init                           = utf_new_char("<init>");
 435         utf_clinit                         = utf_new_char("<clinit>");
 436         utf_clone                      = utf_new_char("clone");
 437         utf_finalize                   = utf_new_char("finalize");
 438         utf_invoke                     = utf_new_char("invoke");
 439         utf_main                       = utf_new_char("main");
 440         utf_run                        = utf_new_char("run");
 441
 442         utf_add                        = utf_new_char("add");
 443         utf_dispatch                   = utf_new_char("dispatch");
 444         utf_remove                     = utf_new_char("remove");
 445         utf_addThread                  = utf_new_char("addThread");
 446         utf_removeThread               = utf_new_char("removeThread");
 447         utf_put                        = utf_new_char("put");
 448         utf_get                        = utf_new_char("get");
 449         utf_uncaughtException          = utf_new_char("uncaughtException");
 450         utf_value                      = utf_new_char("value");
 451
 452         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 453         utf_findNative                 = utf_new_char("findNative");
 454         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 455         utf_initCause                  = utf_new_char("initCause");
 456         utf_loadClass                  = utf_new_char("loadClass");
 457         utf_loadClassInternal          = utf_new_char("loadClassInternal");
 458         utf_printStackTrace            = utf_new_char("printStackTrace");
 459
 460         utf_division_by_zero           = utf_new_char("/ by zero");
 461
 462         utf_Z                          = utf_new_char("Z");
 463         utf_B                          = utf_new_char("B");
 464         utf_C                          = utf_new_char("C");
 465         utf_S                          = utf_new_char("S");
 466         utf_I                          = utf_new_char("I");
 467         utf_J                          = utf_new_char("J");
 468         utf_F                          = utf_new_char("F");
 469         utf_D                          = utf_new_char("D");
 470
 471         utf_void__void                 = utf_new_char("()V");
 472         utf_boolean__void              = utf_new_char("(Z)V");
 473         utf_byte__void                 = utf_new_char("(B)V");
 474         utf_char__void                 = utf_new_char("(C)V");
 475         utf_short__void                = utf_new_char("(S)V");
 476         utf_int__void                  = utf_new_char("(I)V");
 477         utf_long__void                 = utf_new_char("(J)V");
 478         utf_float__void                = utf_new_char("(F)V");
 479         utf_double__void               = utf_new_char("(D)V");
 480         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 481         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 482
 483         utf_void__java_lang_ClassLoader =
 484                 utf_new_char("()Ljava/lang/ClassLoader;");
 485
 486         utf_java_lang_ClassLoader_java_lang_String__J =
 487                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 488
 489         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 490
 491         utf_java_lang_Object__java_lang_Object =
 492                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 493
 494         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 495
 496         utf_java_lang_String__java_lang_Class =
 497                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 498
 499         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 500
 501         utf_java_lang_Thread_java_lang_Throwable__V =
 502                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 503
 504         utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
 505                 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
 506
 507         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 508
 509         utf_java_lang_Throwable__java_lang_Throwable =
 510                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 511
 512         utf_null                       = utf_new_char("null");
 513         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 514         array_packagename              = utf_new_char("\t<the array package>");
 515 }
 516
 517
 518 /* utf_hashkey *****************************************************************
 519
 520    The hashkey is computed from the utf-text by using up to 8
 521    characters.  For utf-symbols longer than 15 characters 3 characters
 522    are taken from the beginning and the end, 2 characters are taken
 523    from the middle.
 524
 525 *******************************************************************************/
 526
 527 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 528 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 529
 530 u4 utf_hashkey(const char *text, u4 length)
 531 {
 532         const char *start_pos = text;       /* pointer to utf text                */
 533         u4 a;
 534
 535         switch (length) {
 536         case 0: /* empty string */
 537                 return 0;
 538
 539         case 1: return fbs(0);
 540         case 2: return fbs(0) ^ nbs(3);
 541         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 542         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 543         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 544         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 545         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 546         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 547
 548         case 9:
 549                 a = fbs(0);
 550                 a ^= nbs(1);
 551                 a ^= nbs(2);
 552                 text++;
 553                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 554
 555         case 10:
 556                 a = fbs(0);
 557                 text++;
 558                 a ^= nbs(2);
 559                 a ^= nbs(3);
 560                 a ^= nbs(4);
 561                 text++;
 562                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 563
 564         case 11:
 565                 a = fbs(0);
 566                 text++;
 567                 a ^= nbs(2);
 568                 a ^= nbs(3);
 569                 a ^= nbs(4);
 570                 text++;
 571                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 572
 573         case 12:
 574                 a = fbs(0);
 575                 text += 2;
 576                 a ^= nbs(2);
 577                 a ^= nbs(3);
 578                 text++;
 579                 a ^= nbs(5);
 580                 a ^= nbs(6);
 581                 a ^= nbs(7);
 582                 text++;
 583                 return a ^ nbs(9) ^ nbs(10);
 584
 585         case 13:
 586                 a = fbs(0);
 587                 a ^= nbs(1);
 588                 text++;
 589                 a ^= nbs(3);
 590                 a ^= nbs(4);
 591                 text += 2;
 592                 a ^= nbs(7);
 593                 a ^= nbs(8);
 594                 text += 2;
 595                 return a ^ nbs(9) ^ nbs(10);
 596
 597         case 14:
 598                 a = fbs(0);
 599                 text += 2;
 600                 a ^= nbs(3);
 601                 a ^= nbs(4);
 602                 text += 2;
 603                 a ^= nbs(7);
 604                 a ^= nbs(8);
 605                 text += 2;
 606                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 607
 608         case 15:
 609                 a = fbs(0);
 610                 text += 2;
 611                 a ^= nbs(3);
 612                 a ^= nbs(4);
 613                 text += 2;
 614                 a ^= nbs(7);
 615                 a ^= nbs(8);
 616                 text += 2;
 617                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 618
 619         default:  /* 3 characters from beginning */
 620                 a = fbs(0);
 621                 text += 2;
 622                 a ^= nbs(3);
 623                 a ^= nbs(4);
 624
 625                 /* 2 characters from middle */
 626                 text = start_pos + (length / 2);
 627                 a ^= fbs(5);
 628                 text += 2;
 629                 a ^= nbs(6);
 630
 631                 /* 3 characters from end */
 632                 text = start_pos + length - 4;
 633
 634                 a ^= fbs(7);
 635                 text++;
 636
 637                 return a ^ nbs(10) ^ nbs(11);
 638     }
 639 }
 640
 641 /* utf_full_hashkey ************************************************************
 642
 643    This function computes a hash value using all bytes in the string.
 644
 645    The algorithm is the "One-at-a-time" algorithm as published
 646    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 647
 648 *******************************************************************************/
 649
 650 u4 utf_full_hashkey(const char *text, u4 length)
 651 {
 652         register const unsigned char *p = (const unsigned char *) text;
 653         register u4 hash;
 654         register u4 i;
 655
 656         hash = 0;
 657         for (i=length; i--;)
 658         {
 659             hash += *p++;
 660             hash += (hash << 10);
 661             hash ^= (hash >> 6);
 662         }
 663         hash += (hash << 3);
 664         hash ^= (hash >> 11);
 665         hash += (hash << 15);
 666
 667         return hash;
 668 }
 669
 670 /* unicode_hashkey *************************************************************
 671
 672    Compute the hashkey of a unicode string.
 673
 674 *******************************************************************************/
 675
 676 u4 unicode_hashkey(u2 *text, u2 len)
 677 {
 678         return utf_hashkey((char *) text, len);
 679 }
 680
 681
 682 /* utf_new *********************************************************************
 683
 684    Creates a new utf-symbol, the text of the symbol is passed as a
 685    u1-array. The function searches the utf-hashtable for a utf-symbol
 686    with this text. On success the element returned, otherwise a new
 687    hashtable element is created.
 688
 689    If the number of entries in the hashtable exceeds twice the size of
 690    the hashtable slots a reorganization of the hashtable is done and
 691    the utf symbols are copied to a new hashtable with doubled size.
 692
 693 *******************************************************************************/
 694
 695 utf *utf_new(const char *text, u2 length)
 696 {
 697         u4 key;                             /* hashkey computed from utf-text     */
 698         u4 slot;                            /* slot in hashtable                  */
 699         utf *u;                             /* hashtable element                  */
 700         u2 i;
 701
 702         Mutex_lock(hashtable_utf->mutex);
 703
 704 #if defined(ENABLE_STATISTICS)
 705         if (opt_stat)
 706                 count_utf_new++;
 707 #endif
 708
 709         key  = utf_hashkey(text, length);
 710         slot = key & (hashtable_utf->size - 1);
 711         u    = hashtable_utf->ptr[slot];
 712
 713         /* search external hash chain for utf-symbol */
 714
 715         while (u) {
 716                 if (u->blength == length) {
 717                         /* compare text of hashtable elements */
 718
 719                         for (i = 0; i < length; i++)
 720                                 if (text[i] != u->text[i])
 721                                         goto nomatch;
 722
 723 #if defined(ENABLE_STATISTICS)
 724                         if (opt_stat)
 725                                 count_utf_new_found++;
 726 #endif
 727
 728                         /* symbol found in hashtable */
 729
 730                         Mutex_unlock(hashtable_utf->mutex);
 731
 732                         return u;
 733                 }
 734
 735         nomatch:
 736                 u = u->hashlink; /* next element in external chain */
 737         }
 738
 739         /* location in hashtable found, create new utf element */
 740
 741         u = NEW(utf);
 742
 743         u->blength  = length;               /* length in bytes of utfstring       */
 744         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 745         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 746
 747         memcpy(u->text, text, length);      /* copy utf-text                      */
 748         u->text[length] = '\0';
 749
 750 #if defined(ENABLE_STATISTICS)
 751         if (opt_stat)
 752                 count_utf_len += sizeof(utf) + length + 1;
 753 #endif
 754
 755         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 756         hashtable_utf->entries++;           /* update number of entries           */
 757
 758         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 759
 760         /* reorganization of hashtable, average length of the external
 761            chains is approx. 2 */
 762
 763                 hashtable *newhash;                              /* the new hashtable */
 764                 u4         i;
 765                 utf       *u;
 766                 utf       *nextu;
 767                 u4         slot;
 768
 769                 /* create new hashtable, double the size */
 770
 771                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 772
 773 #if defined(ENABLE_STATISTICS)
 774                 if (opt_stat)
 775                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 776 #endif
 777
 778                 /* transfer elements to new hashtable */
 779
 780                 for (i = 0; i < hashtable_utf->size; i++) {
 781                         u = hashtable_utf->ptr[i];
 782
 783                         while (u) {
 784                                 nextu = u->hashlink;
 785                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 786
 787                                 u->hashlink = (utf *) newhash->ptr[slot];
 788                                 newhash->ptr[slot] = u;
 789
 790                                 /* follow link in external hash chain */
 791
 792                                 u = nextu;
 793                         }
 794                 }
 795
 796                 /* dispose old table */
 797
 798                 hashtable_free(hashtable_utf);
 799
 800                 hashtable_utf = newhash;
 801         }
 802
 803         Mutex_unlock(hashtable_utf->mutex);
 804
 805         return u;
 806 }
 807
 808
 809 /* utf_new_u2 ******************************************************************
 810
 811    Make utf symbol from u2 array, if isclassname is true '.' is
 812    replaced by '/'.
 813
 814 *******************************************************************************/
 815
 816 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 817 {
 818         char *buffer;                   /* memory buffer for  unicode characters  */
 819         char *pos;                      /* pointer to current position in buffer  */
 820         u4 left;                        /* unicode characters left                */
 821         u4 buflength;                   /* utf length in bytes of the u2 array    */
 822         utf *result;                    /* resulting utf-string                   */
 823         int i;
 824
 825         /* determine utf length in bytes and allocate memory */
 826
 827         buflength = u2_utflength(unicode_pos, unicode_length);
 828         buffer    = MNEW(char, buflength);
 829
 830         left = buflength;
 831         pos  = buffer;
 832
 833         for (i = 0; i++ < unicode_length; unicode_pos++) {
 834                 /* next unicode character */
 835                 u2 c = *unicode_pos;
 836
 837                 if ((c != 0) && (c < 0x80)) {
 838                         /* 1 character */
 839                         left--;
 840                 if ((int) left < 0) break;
 841                         /* convert classname */
 842                         if (isclassname && c == '.')
 843                                 *pos++ = '/';
 844                         else
 845                                 *pos++ = (char) c;
 846
 847                 } else if (c < 0x800) {
 848                         /* 2 characters */
 849                 unsigned char high = c >> 6;
 850                 unsigned char low  = c & 0x3F;
 851                         left = left - 2;
 852                 if ((int) left < 0) break;
 853                 *pos++ = high | 0xC0;
 854                 *pos++ = low  | 0x80;
 855
 856                 } else {
 857                 /* 3 characters */
 858                 char low  = c & 0x3f;
 859                 char mid  = (c >> 6) & 0x3F;
 860                 char high = c >> 12;
 861                         left = left - 3;
 862                 if ((int) left < 0) break;
 863                 *pos++ = high | 0xE0;
 864                 *pos++ = mid  | 0x80;
 865                 *pos++ = low  | 0x80;
 866                 }
 867         }
 868
 869         /* insert utf-string into symbol-table */
 870         result = utf_new(buffer,buflength);
 871
 872         MFREE(buffer, char, buflength);
 873
 874         return result;
 875 }
 876
 877
 878 /* utf_new_char ****************************************************************
 879
 880    Creates a new utf symbol, the text for this symbol is passed as a
 881    c-string ( = char* ).
 882
 883 *******************************************************************************/
 884
 885 utf *utf_new_char(const char *text)
 886 {
 887         return utf_new(text, strlen(text));
 888 }
 889
 890
 891 /* utf_new_char_classname ******************************************************
 892
 893    Creates a new utf symbol, the text for this symbol is passed as a
 894    c-string ( = char* ) "." characters are going to be replaced by
 895    "/". Since the above function is used often, this is a separte
 896    function, instead of an if.
 897
 898 *******************************************************************************/
 899
 900 utf *utf_new_char_classname(const char *text)
 901 {
 902         if (strchr(text, '.')) {
 903                 char *txt = strdup(text);
 904                 char *end = txt + strlen(txt);
 905                 char *c;
 906                 utf *tmpRes;
 907
 908                 for (c = txt; c < end; c++)
 909                         if (*c == '.') *c = '/';
 910
 911                 tmpRes = utf_new(txt, strlen(txt));
 912                 FREE(txt, 0);
 913
 914                 return tmpRes;
 915
 916         } else
 917                 return utf_new(text, strlen(text));
 918 }
 919
 920
 921 /* utf_nextu2 ******************************************************************
 922
 923    Read the next unicode character from the utf string and increment
 924    the utf-string pointer accordingly.
 925
 926    CAUTION: This function is unsafe for input that was not checked
 927             by is_valid_utf!
 928
 929 *******************************************************************************/
 930
 931 u2 utf_nextu2(char **utf_ptr)
 932 {
 933     /* uncompressed unicode character */
 934     u2 unicode_char = 0;
 935     /* current position in utf text */
 936     unsigned char *utf = (unsigned char *) (*utf_ptr);
 937     /* bytes representing the unicode character */
 938     unsigned char ch1, ch2, ch3;
 939     /* number of bytes used to represent the unicode character */
 940     int len = 0;
 941
 942     switch ((ch1 = utf[0]) >> 4) {
 943         default: /* 1 byte */
 944                 (*utf_ptr)++;
 945                 return (u2) ch1;
 946         case 0xC:
 947         case 0xD: /* 2 bytes */
 948                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 949                         unsigned char high = ch1 & 0x1F;
 950                         unsigned char low  = ch2 & 0x3F;
 951                         unicode_char = (high << 6) + low;
 952                         len = 2;
 953                 }
 954                 break;
 955
 956         case 0xE: /* 2 or 3 bytes */
 957                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 958                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 959                                 unsigned char low  = ch3 & 0x3f;
 960                                 unsigned char mid  = ch2 & 0x3f;
 961                                 unsigned char high = ch1 & 0x0f;
 962                                 unicode_char = (((high << 6) + mid) << 6) + low;
 963                                 len = 3;
 964                         } else
 965                                 len = 2;
 966                 }
 967                 break;
 968     }
 969
 970     /* update position in utf-text */
 971     *utf_ptr = (char *) (utf + len);
 972
 973     return unicode_char;
 974 }
 975
 976
 977 /* utf_bytes *******************************************************************
 978
 979    Determine number of bytes (aka. octets) in the utf string.
 980
 981    IN:
 982       u............utf string
 983
 984    OUT:
 985       The number of octets of this utf string.
 986           There is _no_ terminating zero included in this count.
 987
 988 *******************************************************************************/
 989
 990 u4 utf_bytes(utf *u)
 991 {
 992         return u->blength;
 993 }
 994
 995
 996 /* utf_get_number_of_u2s_for_buffer ********************************************
 997
 998    Determine number of UTF-16 u2s in the given UTF-8 buffer
 999
1000    CAUTION: This function is unsafe for input that was not checked
1001             by is_valid_utf!
1002
1003    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
1004    to an array of u2s (UTF-16) and want to know how many of them you will get.
1005    All other uses of this function are probably wrong.
1006
1007    IN:
1008       buffer........points to first char in buffer
1009           blength.......number of _bytes_ in the buffer
1010
1011    OUT:
1012       the number of u2s needed to hold this string in UTF-16 encoding.
1013           There is _no_ terminating zero included in this count.
1014
1015    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1016    exception.
1017
1018 *******************************************************************************/
1019
1020 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1021 {
1022         const char *endpos;                 /* points behind utf string           */
1023         const char *utf_ptr;                /* current position in utf text       */
1024         u4 len = 0;                         /* number of unicode characters       */
1025
1026         utf_ptr = buffer;
1027         endpos = utf_ptr + blength;
1028
1029         while (utf_ptr < endpos) {
1030                 len++;
1031                 /* next unicode character */
1032                 utf_nextu2((char **)&utf_ptr);
1033         }
1034
1035         assert(utf_ptr == endpos);
1036
1037         return len;
1038 }
1039
1040
1041 /* utf_get_number_of_u2s *******************************************************
1042
1043    Determine number of UTF-16 u2s in the utf string.
1044
1045    CAUTION: This function is unsafe for input that was not checked
1046             by is_valid_utf!
1047
1048    CAUTION: Use this function *only* when you want to convert a utf string
1049    to an array of u2s and want to know how many of them you will get.
1050    All other uses of this function are probably wrong.
1051
1052    IN:
1053       u............utf string
1054
1055    OUT:
1056       the number of u2s needed to hold this string in UTF-16 encoding.
1057           There is _no_ terminating zero included in this count.
1058           XXX 0 if a NullPointerException has been thrown (see below)
1059
1060 *******************************************************************************/
1061
1062 u4 utf_get_number_of_u2s(utf *u)
1063 {
1064         char *endpos;                       /* points behind utf string           */
1065         char *utf_ptr;                      /* current position in utf text       */
1066         u4 len = 0;                         /* number of unicode characters       */
1067
1068         /* XXX this is probably not checked by most callers! Review this after */
1069         /* the invalid uses of this function have been eliminated */
1070         if (u == NULL) {
1071                 exceptions_throw_nullpointerexception();
1072                 return 0;
1073         }
1074
1075         endpos = UTF_END(u);
1076         utf_ptr = u->text;
1077
1078         while (utf_ptr < endpos) {
1079                 len++;
1080                 /* next unicode character */
1081                 utf_nextu2(&utf_ptr);
1082         }
1083
1084         if (utf_ptr != endpos) {
1085                 /* string ended abruptly */
1086                 exceptions_throw_internalerror("Illegal utf8 string");
1087                 return 0;
1088         }
1089
1090         return len;
1091 }
1092
1093
1094 /* utf8_safe_number_of_u2s *****************************************************
1095
1096    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1097    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1098
1099    This function is safe even for invalid UTF-8 strings.
1100
1101    IN:
1102       text..........zero-terminated(!) UTF-8 string (may be invalid)
1103                         must NOT be NULL
1104           nbytes........strlen(text). (This is needed to completely emulate
1105                         the RI).
1106
1107    OUT:
1108       the number of u2s needed to hold this string in UTF-16 encoding.
1109           There is _no_ terminating zero included in this count.
1110
1111 *******************************************************************************/
1112
1113 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1114         register const unsigned char *t;
1115         register s4 byte;
1116         register s4 len;
1117         register const unsigned char *tlimit;
1118         s4 byte1;
1119         s4 byte2;
1120         s4 byte3;
1121         s4 value;
1122         s4 skip;
1123
1124         assert(text);
1125         assert(nbytes >= 0);
1126
1127         len = 0;
1128         t = (const unsigned char *) text;
1129         tlimit = t + nbytes;
1130
1131         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1132
1133         while (1) {
1134                 byte = *t++;
1135
1136                 if (byte & 0x80) {
1137                         /* highest bit set, non-ASCII character */
1138
1139                         if ((byte & 0xe0) == 0xc0) {
1140                                 /* 2-byte: should be 110..... 10...... ? */
1141
1142                                 if ((*t++ & 0xc0) == 0x80)
1143                                         ; /* valid 2-byte */
1144                                 else
1145                                         t--; /* invalid */
1146                         }
1147                         else if ((byte & 0xf0) == 0xe0) {
1148                                 /* 3-byte: should be 1110.... 10...... 10...... */
1149                                 /*                            ^t                */
1150
1151                                 if (t + 2 > tlimit)
1152                                         return len + 1; /* invalid, stop here */
1153
1154                                 if ((*t++ & 0xc0) == 0x80) {
1155                                         if ((*t++ & 0xc0) == 0x80)
1156                                                 ; /* valid 3-byte */
1157                                         else
1158                                                 t--; /* invalid */
1159                                 }
1160                                 else
1161                                         t--; /* invalid */
1162                         }
1163                         else if ((byte & 0xf8) == 0xf0) {
1164                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1165                                 /*                            ^t                         */
1166
1167                                 if (t + 3 > tlimit)
1168                                         return len + 1; /* invalid, stop here */
1169
1170                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1171                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1172                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1173                                                         /* valid 4-byte UTF-8? */
1174                                                         value = ((byte  & 0x07) << 18)
1175                                                                   | ((byte1 & 0x3f) << 12)
1176                                                                   | ((byte2 & 0x3f) <<  6)
1177                                                                   | ((byte3 & 0x3f)      );
1178
1179                                                         if (value > 0x10FFFF)
1180                                                                 ; /* invalid */
1181                                                         else if (value > 0xFFFF)
1182                                                                 len += 1; /* we need surrogates */
1183                                                         else
1184                                                                 ; /* 16bit suffice */
1185                                                 }
1186                                                 else
1187                                                         t--; /* invalid */
1188                                         }
1189                                         else
1190                                                 t--; /* invalid */
1191                                 }
1192                                 else
1193                                         t--; /* invalid */
1194                         }
1195                         else if ((byte & 0xfc) == 0xf8) {
1196                                 /* invalid 5-byte */
1197                                 if (t + 4 > tlimit)
1198                                         return len + 1; /* invalid, stop here */
1199
1200                                 skip = 4;
1201                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1202                                         t++;
1203                         }
1204                         else if ((byte & 0xfe) == 0xfc) {
1205                                 /* invalid 6-byte */
1206                                 if (t + 5 > tlimit)
1207                                         return len + 1; /* invalid, stop here */
1208
1209                                 skip = 5;
1210                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1211                                         t++;
1212                         }
1213                         else
1214                                 ; /* invalid */
1215                 }
1216                 else {
1217                         /* NUL */
1218
1219                         if (byte == 0)
1220                                 break;
1221
1222                         /* ASCII character, common case */
1223                 }
1224
1225                 len++;
1226         }
1227
1228         return len;
1229 }
1230
1231
1232 /* utf8_safe_convert_to_u2s ****************************************************
1233
1234    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1235    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1236    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1237
1238    This function is safe even for invalid UTF-8 strings.
1239
1240    IN:
1241       text..........zero-terminated(!) UTF-8 string (may be invalid)
1242                         must NOT be NULL
1243           nbytes........strlen(text). (This is needed to completely emulate
1244                                         the RI).
1245           buffer........a preallocated array of u2s to receive the decoded
1246                         string. Use utf8_safe_number_of_u2s to get the
1247                                         required number of u2s for allocating this.
1248
1249 *******************************************************************************/
1250
1251 #define UNICODE_REPLACEMENT  0xfffd
1252
1253 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1254         register const unsigned char *t;
1255         register s4 byte;
1256         register const unsigned char *tlimit;
1257         s4 byte1;
1258         s4 byte2;
1259         s4 byte3;
1260         s4 value;
1261         s4 skip;
1262
1263         assert(text);
1264         assert(nbytes >= 0);
1265
1266         t = (const unsigned char *) text;
1267         tlimit = t + nbytes;
1268
1269         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1270
1271         while (1) {
1272                 byte = *t++;
1273
1274                 if (byte & 0x80) {
1275                         /* highest bit set, non-ASCII character */
1276
1277                         if ((byte & 0xe0) == 0xc0) {
1278                                 /* 2-byte: should be 110..... 10...... */
1279
1280                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1281                                         /* valid 2-byte UTF-8 */
1282                                         *buffer++ = ((byte  & 0x1f) << 6)
1283                                                           | ((byte1 & 0x3f)     );
1284                                 }
1285                                 else {
1286                                         *buffer++ = UNICODE_REPLACEMENT;
1287                                         t--;
1288                                 }
1289                         }
1290                         else if ((byte & 0xf0) == 0xe0) {
1291                                 /* 3-byte: should be 1110.... 10...... 10...... */
1292
1293                                 if (t + 2 > tlimit) {
1294                                         *buffer++ = UNICODE_REPLACEMENT;
1295                                         return;
1296                                 }
1297
1298                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1299                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1300                                                 /* valid 3-byte UTF-8 */
1301                                                 *buffer++ = ((byte  & 0x0f) << 12)
1302                                                                   | ((byte1 & 0x3f) <<  6)
1303                                                                   | ((byte2 & 0x3f)      );
1304                                         }
1305                                         else {
1306                                                 *buffer++ = UNICODE_REPLACEMENT;
1307                                                 t--;
1308                                         }
1309                                 }
1310                                 else {
1311                                         *buffer++ = UNICODE_REPLACEMENT;
1312                                         t--;
1313                                 }
1314                         }
1315                         else if ((byte & 0xf8) == 0xf0) {
1316                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1317
1318                                 if (t + 3 > tlimit) {
1319                                         *buffer++ = UNICODE_REPLACEMENT;
1320                                         return;
1321                                 }
1322
1323                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1324                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1325                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1326                                                         /* valid 4-byte UTF-8? */
1327                                                         value = ((byte  & 0x07) << 18)
1328                                                                   | ((byte1 & 0x3f) << 12)
1329                                                                   | ((byte2 & 0x3f) <<  6)
1330                                                                   | ((byte3 & 0x3f)      );
1331
1332                                                         if (value > 0x10FFFF) {
1333                                                                 *buffer++ = UNICODE_REPLACEMENT;
1334                                                         }
1335                                                         else if (value > 0xFFFF) {
1336                                                                 /* we need surrogates */
1337                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1338                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1339                                                         }
1340                                                         else
1341                                                                 *buffer++ = value; /* 16bit suffice */
1342                                                 }
1343                                                 else {
1344                                                         *buffer++ = UNICODE_REPLACEMENT;
1345                                                         t--;
1346                                                 }
1347                                         }
1348                                         else {
1349                                                 *buffer++ = UNICODE_REPLACEMENT;
1350                                                 t--;
1351                                         }
1352                                 }
1353                                 else {
1354                                         *buffer++ = UNICODE_REPLACEMENT;
1355                                         t--;
1356                                 }
1357                         }
1358                         else if ((byte & 0xfc) == 0xf8) {
1359                                 if (t + 4 > tlimit) {
1360                                         *buffer++ = UNICODE_REPLACEMENT;
1361                                         return;
1362                                 }
1363
1364                                 skip = 4;
1365                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1366                                         t++;
1367                                 *buffer++ = UNICODE_REPLACEMENT;
1368                         }
1369                         else if ((byte & 0xfe) == 0xfc) {
1370                                 if (t + 5 > tlimit) {
1371                                         *buffer++ = UNICODE_REPLACEMENT;
1372                                         return;
1373                                 }
1374
1375                                 skip = 5;
1376                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1377                                         t++;
1378                                 *buffer++ = UNICODE_REPLACEMENT;
1379                         }
1380                         else
1381                                 *buffer++ = UNICODE_REPLACEMENT;
1382                 }
1383                 else {
1384                         /* NUL */
1385
1386                         if (byte == 0)
1387                                 break;
1388
1389                         /* ASCII character, common case */
1390
1391                         *buffer++ = byte;
1392                 }
1393         }
1394 }
1395
1396
1397 /* u2_utflength ****************************************************************
1398
1399    Returns the utf length in bytes of a u2 array.
1400
1401 *******************************************************************************/
1402
1403 u4 u2_utflength(u2 *text, u4 u2_length)
1404 {
1405         u4 result_len = 0;                  /* utf length in bytes                */
1406         u2 ch;                              /* current unicode character          */
1407         u4 len;
1408
1409         for (len = 0; len < u2_length; len++) {
1410                 /* next unicode character */
1411                 ch = *text++;
1412
1413                 /* determine bytes required to store unicode character as utf */
1414                 if (ch && (ch < 0x80))
1415                         result_len++;
1416                 else if (ch < 0x800)
1417                         result_len += 2;
1418                 else
1419                         result_len += 3;
1420         }
1421
1422     return result_len;
1423 }
1424
1425
1426 /* utf_copy ********************************************************************
1427
1428    Copy the given utf string byte-for-byte to a buffer.
1429
1430    IN:
1431       buffer.......the buffer
1432           u............the utf string
1433
1434 *******************************************************************************/
1435
1436 void utf_copy(char *buffer, utf *u)
1437 {
1438         /* our utf strings are zero-terminated (done by utf_new) */
1439         MCOPY(buffer, u->text, char, u->blength + 1);
1440 }
1441
1442
1443 /* utf_cat *********************************************************************
1444
1445    Append the given utf string byte-for-byte to a buffer.
1446
1447    IN:
1448       buffer.......the buffer
1449           u............the utf string
1450
1451 *******************************************************************************/
1452
1453 void utf_cat(char *buffer, utf *u)
1454 {
1455         /* our utf strings are zero-terminated (done by utf_new) */
1456         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1457 }
1458
1459
1460 /* utf_copy_classname **********************************************************
1461
1462    Copy the given utf classname byte-for-byte to a buffer.
1463    '/' is replaced by '.'
1464
1465    IN:
1466       buffer.......the buffer
1467           u............the utf string
1468
1469 *******************************************************************************/
1470
1471 void utf_copy_classname(char *buffer, utf *u)
1472 {
1473         char *bufptr;
1474         char *srcptr;
1475         char *endptr;
1476         char ch;
1477
1478         bufptr = buffer;
1479         srcptr = u->text;
1480         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1481
1482         while (srcptr != endptr) {
1483                 ch = *srcptr++;
1484                 if (ch == '/')
1485                         ch = '.';
1486                 *bufptr++ = ch;
1487         }
1488 }
1489
1490
1491 /* utf_cat *********************************************************************
1492
1493    Append the given utf classname byte-for-byte to a buffer.
1494    '/' is replaced by '.'
1495
1496    IN:
1497       buffer.......the buffer
1498           u............the utf string
1499
1500 *******************************************************************************/
1501
1502 void utf_cat_classname(char *buffer, utf *u)
1503 {
1504         utf_copy_classname(buffer + strlen(buffer), u);
1505 }
1506
1507 /* utf_display_printable_ascii *************************************************
1508
1509    Write utf symbol to stdout (for debugging purposes).
1510    Non-printable and non-ASCII characters are printed as '?'.
1511
1512 *******************************************************************************/
1513
1514 void utf_display_printable_ascii(utf *u)
1515 {
1516         char *endpos;                       /* points behind utf string           */
1517         char *utf_ptr;                      /* current position in utf text       */
1518
1519         if (u == NULL) {
1520                 printf("NULL");
1521                 fflush(stdout);
1522                 return;
1523         }
1524
1525         endpos = UTF_END(u);
1526         utf_ptr = u->text;
1527
1528         while (utf_ptr < endpos) {
1529                 /* read next unicode character */
1530
1531                 u2 c = utf_nextu2(&utf_ptr);
1532
1533                 if ((c >= 32) && (c <= 127))
1534                         printf("%c", c);
1535                 else
1536                         printf("?");
1537         }
1538
1539         fflush(stdout);
1540 }
1541
1542
1543 /* utf_display_printable_ascii_classname ***************************************
1544
1545    Write utf symbol to stdout with `/' converted to `.' (for debugging
1546    purposes).
1547    Non-printable and non-ASCII characters are printed as '?'.
1548
1549 *******************************************************************************/
1550
1551 void utf_display_printable_ascii_classname(utf *u)
1552 {
1553         char *endpos;                       /* points behind utf string           */
1554         char *utf_ptr;                      /* current position in utf text       */
1555
1556         if (u == NULL) {
1557                 printf("NULL");
1558                 fflush(stdout);
1559                 return;
1560         }
1561
1562         endpos = UTF_END(u);
1563         utf_ptr = u->text;
1564
1565         while (utf_ptr < endpos) {
1566                 /* read next unicode character */
1567
1568                 u2 c = utf_nextu2(&utf_ptr);
1569
1570                 if (c == '/')
1571                         c = '.';
1572
1573                 if ((c >= 32) && (c <= 127))
1574                         printf("%c", c);
1575                 else
1576                         printf("?");
1577         }
1578
1579         fflush(stdout);
1580 }
1581
1582
1583 /* utf_sprint_convert_to_latin1 ************************************************
1584
1585    Write utf symbol into c-string (for debugging purposes).
1586    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1587    invalid results.
1588
1589 *******************************************************************************/
1590
1591 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1592 {
1593         char *endpos;                       /* points behind utf string           */
1594         char *utf_ptr;                      /* current position in utf text       */
1595         u2 pos = 0;                         /* position in c-string               */
1596
1597         if (!u) {
1598                 strcpy(buffer, "NULL");
1599                 return;
1600         }
1601
1602         endpos = UTF_END(u);
1603         utf_ptr = u->text;
1604
1605         while (utf_ptr < endpos)
1606                 /* copy next unicode character */
1607                 buffer[pos++] = utf_nextu2(&utf_ptr);
1608
1609         /* terminate string */
1610         buffer[pos] = '\0';
1611 }
1612
1613
1614 /* utf_sprint_convert_to_latin1_classname **************************************
1615
1616    Write utf symbol into c-string with `/' converted to `.' (for debugging
1617    purposes).
1618    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1619    invalid results.
1620
1621 *******************************************************************************/
1622
1623 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1624 {
1625         char *endpos;                       /* points behind utf string           */
1626         char *utf_ptr;                      /* current position in utf text       */
1627         u2 pos = 0;                         /* position in c-string               */
1628
1629         if (!u) {
1630                 strcpy(buffer, "NULL");
1631                 return;
1632         }
1633
1634         endpos = UTF_END(u);
1635         utf_ptr = u->text;
1636
1637         while (utf_ptr < endpos) {
1638                 /* copy next unicode character */
1639                 u2 c = utf_nextu2(&utf_ptr);
1640                 if (c == '/') c = '.';
1641                 buffer[pos++] = c;
1642         }
1643
1644         /* terminate string */
1645         buffer[pos] = '\0';
1646 }
1647
1648
1649 /* utf_strcat_convert_to_latin1 ************************************************
1650
1651    Like libc strcat, but uses an utf8 string.
1652    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1653    invalid results.
1654
1655 *******************************************************************************/
1656
1657 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1658 {
1659         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1660 }
1661
1662
1663 /* utf_strcat_convert_to_latin1_classname **************************************
1664
1665    Like libc strcat, but uses an utf8 string.
1666    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1667    invalid results.
1668
1669 *******************************************************************************/
1670
1671 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1672 {
1673         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1674 }
1675
1676
1677 /* utf_fprint_printable_ascii **************************************************
1678
1679    Write utf symbol into file.
1680    Non-printable and non-ASCII characters are printed as '?'.
1681
1682 *******************************************************************************/
1683
1684 void utf_fprint_printable_ascii(FILE *file, utf *u)
1685 {
1686         char *endpos;                       /* points behind utf string           */
1687         char *utf_ptr;                      /* current position in utf text       */
1688
1689         if (!u)
1690                 return;
1691
1692         endpos = UTF_END(u);
1693         utf_ptr = u->text;
1694
1695         while (utf_ptr < endpos) {
1696                 /* read next unicode character */
1697                 u2 c = utf_nextu2(&utf_ptr);
1698
1699                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1700                 else fprintf(file, "?");
1701         }
1702 }
1703
1704
1705 /* utf_fprint_printable_ascii_classname ****************************************
1706
1707    Write utf symbol into file with `/' converted to `.'.
1708    Non-printable and non-ASCII characters are printed as '?'.
1709
1710 *******************************************************************************/
1711
1712 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1713 {
1714         char *endpos;                       /* points behind utf string           */
1715         char *utf_ptr;                      /* current position in utf text       */
1716
1717     if (!u)
1718                 return;
1719
1720         endpos = UTF_END(u);
1721         utf_ptr = u->text;
1722
1723         while (utf_ptr < endpos) {
1724                 /* read next unicode character */
1725                 u2 c = utf_nextu2(&utf_ptr);
1726                 if (c == '/') c = '.';
1727
1728                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1729                 else fprintf(file, "?");
1730         }
1731 }
1732
1733
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string.

   Accepted are plain ASCII bytes except 0x00, and two- and three-byte
   sequences with well-formed continuation bytes. Longer sequences are
   rejected (Java limitation). A decoded value of zero is only accepted
   in its two-byte form (Java special).

   Overlong encodings, surrogates and the codepoints 0xfffe/0xffff are
   deliberately NOT rejected, as Sun Java seems to allow them in
   classfiles.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet consumed             */
	int           trail;                /* number of continuation bytes       */
	int           k;
	unsigned char lead;
	unsigned char cont;
	unsigned long codepoint;

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		lead = (unsigned char) *utf_ptr++;
		remaining--;

		if (lead == 0x00)
			return false;                   /* 0x00 is not allowed        */

		if (lead < 0x80)
			continue;                       /* ASCII                      */

		if ((lead & 0xe0) == 0xc0)
			trail = 1;                      /* 110x xxxx                  */
		else if ((lead & 0xf0) == 0xe0)
			trail = 2;                      /* 1110 xxxx                  */
		else
			return false;   /* invalid leading byte, or a sequence with
			                   more than two continuation bytes (Java
			                   limitation) */

		if (remaining < trail)
			return false;                   /* missing bytes              */

		remaining -= trail;

		/* payload bits of the leading byte */

		codepoint = (unsigned long) (lead & (0x3f >> trail));

		for (k = 0; k < trail; k++) {
			cont = (unsigned char) *utf_ptr++;

			if ((cont & 0xc0) != 0x80)      /* 10xx xxxx                  */
				return false;

			codepoint = (codepoint << 6) | (unsigned long) (cont & 0x3f);
		}

		/* zero must be encoded with exactly one continuation byte */

		if ((codepoint == 0) && (trail != 1))
			return false;                   /* Java special               */
	}

	return true;
}
1799
1800
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes in the range [utf_ptr, end_pos) are examined.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names       */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters */

		/* disallow zero (0xc0 0x80); the second byte is only inspected
		   if it still lies inside the given range, so that no byte at
		   or behind end_pos is ever read */

		if ((c == 0xc0) && (utf_ptr < end_pos)
			&& ((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1828
1829 bool is_valid_name_utf(utf *u)
1830 {
1831         return is_valid_name(u->text, UTF_END(u));
1832 }
1833
1834
1835 /* utf_show ********************************************************************
1836
1837    Writes the utf symbols in the utfhash to stdout and displays the
1838    number of external hash chains grouped according to the chainlength
1839    (for debugging purposes).
1840
1841 *******************************************************************************/
1842
1843 #if !defined(NDEBUG)
1844 void utf_show(void)
1845 {
1846
1847 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1848
1849         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1850         u4 max_chainlength = 0;      /* maximum length of the chains */
1851         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1852         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1853         u4 i;
1854
1855         printf("UTF-HASH:\n");
1856
1857         /* show element of utf-hashtable */
1858
1859         for (i = 0; i < hashtable_utf->size; i++) {
1860                 utf *u = hashtable_utf->ptr[i];
1861
1862                 if (u) {
1863                         printf("SLOT %d: ", (int) i);
1864
1865                         while (u) {
1866                                 printf("'");
1867                                 utf_display_printable_ascii(u);
1868                                 printf("' ");
1869                                 u = u->hashlink;
1870                         }
1871                         printf("\n");
1872                 }
1873         }
1874
1875         printf("UTF-HASH: %d slots for %d entries\n",
1876                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1877
1878         if (hashtable_utf->entries == 0)
1879                 return;
1880
1881         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1882
1883         for (i=0;i<CHAIN_LIMIT;i++)
1884                 chain_count[i]=0;
1885
1886         /* count numbers of hashchains according to their length */
1887         for (i=0; i<hashtable_utf->size; i++) {
1888
1889                 utf *u = (utf*) hashtable_utf->ptr[i];
1890                 u4 chain_length = 0;
1891
1892                 /* determine chainlength */
1893                 while (u) {
1894                         u = u->hashlink;
1895                         chain_length++;
1896                 }
1897
1898                 /* update sum of all chainlengths */
1899                 sum_chainlength+=chain_length;
1900
1901                 /* determine the maximum length of the chains */
1902                 if (chain_length>max_chainlength)
1903                         max_chainlength = chain_length;
1904
1905                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1906                 if (chain_length>=CHAIN_LIMIT) {
1907                         beyond_limit+=chain_length;
1908                         chain_length=CHAIN_LIMIT-1;
1909                 }
1910
1911                 /* update number of hashchains of current length */
1912                 chain_count[chain_length]++;
1913         }
1914
1915         /* display results */
1916         for (i=1;i<CHAIN_LIMIT-1;i++)
1917                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1918
1919         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1920
1921
1922         printf("max. chainlength:%5d\n",max_chainlength);
1923
1924         /* avg. chainlength = sum of chainlengths / number of chains */
1925         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1926 }
1927 #endif /* !defined(NDEBUG) */
1928
1929
1930 /*
1931  * These are local overrides for various environment variables in Emacs.
1932  * Please do not remove this and leave it at the end of the file, where
1933  * Emacs will automagically detect them.
1934  * ---------------------------------------------------------------------
1935  * Local variables:
1936  * mode: c
1937  * indent-tabs-mode: t
1938  * c-basic-offset: 4
1939  * tab-width: 4
1940  * End:
1941  * vim:noexpandtab:sw=4:ts=4:
1942  */