src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007, 2008
   4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
   5
   6    This file is part of CACAO.
   7
   8    This program is free software; you can redistribute it and/or
   9    modify it under the terms of the GNU General Public License as
  10    published by the Free Software Foundation; either version 2, or (at
  11    your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful, but
  14    WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16    General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; if not, write to the Free Software
  20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  21    02110-1301, USA.
  22
  23 */
  24
  25
  26 #include "config.h"
  27
  28 #include <string.h>
  29 #include <assert.h>
  30
  31 #include "vm/types.h"
  32
  33 #include "mm/memory.hpp"
  34
  35 #include "threads/mutex.hpp"
  36
  37 #include "toolbox/hashtable.h"
  38
  39 #include "vm/exceptions.hpp"
  40 #include "vm/options.h"
  41
  42 #if defined(ENABLE_STATISTICS)
  43 # include "vm/statistics.h"
  44 #endif
  45
  46 #include "vm/utf8.h"
  47
  48
  49 /* global variables ***********************************************************/
  50
  51 /* hashsize must be power of 2 */
  52
  53 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  54
  55 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  56
  57
  58 /* utf-symbols for pointer comparison of frequently used strings **************/
  59
  60 utf *utf_java_lang_Object;
  61
  62 utf *utf_java_lang_Class;
  63 utf *utf_java_lang_ClassLoader;
  64 utf *utf_java_lang_Cloneable;
  65 utf *utf_java_lang_SecurityManager;
  66 utf *utf_java_lang_String;
  67 utf *utf_java_lang_ThreadGroup;
  68 utf *utf_java_lang_ref_SoftReference;
  69 utf *utf_java_lang_ref_WeakReference;
  70 utf *utf_java_lang_ref_PhantomReference;
  71 utf *utf_java_io_Serializable;
  72
  73 utf *utf_java_lang_Throwable;
  74 utf *utf_java_lang_Error;
  75
  76 utf *utf_java_lang_AbstractMethodError;
  77 utf *utf_java_lang_ClassCircularityError;
  78 utf *utf_java_lang_ClassFormatError;
  79 utf *utf_java_lang_ExceptionInInitializerError;
  80 utf *utf_java_lang_IncompatibleClassChangeError;
  81 utf *utf_java_lang_InstantiationError;
  82 utf *utf_java_lang_InternalError;
  83 utf *utf_java_lang_LinkageError;
  84 utf *utf_java_lang_NoClassDefFoundError;
  85 utf *utf_java_lang_NoSuchFieldError;
  86 utf *utf_java_lang_NoSuchMethodError;
  87 utf *utf_java_lang_OutOfMemoryError;
  88 utf *utf_java_lang_UnsatisfiedLinkError;
  89 utf *utf_java_lang_UnsupportedClassVersionError;
  90 utf *utf_java_lang_VerifyError;
  91 utf *utf_java_lang_VirtualMachineError;
  92
  93 utf *utf_java_lang_Exception;
  94
  95 utf *utf_java_lang_ArithmeticException;
  96 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
  97 utf *utf_java_lang_ArrayStoreException;
  98 utf *utf_java_lang_ClassCastException;
  99 utf *utf_java_lang_ClassNotFoundException;
 100 utf *utf_java_lang_CloneNotSupportedException;
 101 utf *utf_java_lang_IllegalAccessException;
 102 utf *utf_java_lang_IllegalArgumentException;
 103 utf *utf_java_lang_IllegalMonitorStateException;
 104 utf *utf_java_lang_InstantiationException;
 105 utf *utf_java_lang_InterruptedException;
 106 utf *utf_java_lang_NegativeArraySizeException;
 107 utf *utf_java_lang_NullPointerException;
 108 utf *utf_java_lang_RuntimeException;
 109 utf *utf_java_lang_StringIndexOutOfBoundsException;
 110
 111 utf *utf_java_lang_reflect_InvocationTargetException;
 112
 113 utf *utf_java_security_PrivilegedActionException;
 114
 115 #if defined(ENABLE_JAVASE)
 116 utf* utf_java_lang_Void;
 117 #endif
 118
 119 utf* utf_java_lang_Boolean;
 120 utf* utf_java_lang_Byte;
 121 utf* utf_java_lang_Character;
 122 utf* utf_java_lang_Short;
 123 utf* utf_java_lang_Integer;
 124 utf* utf_java_lang_Long;
 125 utf* utf_java_lang_Float;
 126 utf* utf_java_lang_Double;
 127
 128 #if defined(ENABLE_JAVASE)
 129 utf *utf_java_lang_StackTraceElement;
 130 utf *utf_java_lang_reflect_Constructor;
 131 utf *utf_java_lang_reflect_Field;
 132 utf *utf_java_lang_reflect_Method;
 133
 134 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
 135 utf *utf_java_lang_reflect_VMConstructor;
 136 utf *utf_java_lang_reflect_VMField;
 137 utf *utf_java_lang_reflect_VMMethod;
 138 # endif
 139
 140 utf *utf_java_util_Vector;
 141 #endif
 142
 143 utf *utf_InnerClasses;                  /* InnerClasses                       */
 144 utf *utf_ConstantValue;                 /* ConstantValue                      */
 145 utf *utf_Code;                          /* Code                               */
 146 utf *utf_Exceptions;                    /* Exceptions                         */
 147 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 148 utf *utf_SourceFile;                    /* SourceFile                         */
 149
 150 #if defined(ENABLE_JAVASE)
 151 utf *utf_EnclosingMethod;
 152 utf *utf_Signature;
 153 utf *utf_StackMapTable;
 154
 155 #if defined(ENABLE_ANNOTATIONS)
 156 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
 157 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
 158 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
 159 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
 160 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
 161 #endif
 162 #endif
 163
 164 utf *utf_init;                          /* <init>                             */
 165 utf *utf_clinit;                        /* <clinit>                           */
 166 utf *utf_clone;                         /* clone                              */
 167 utf *utf_finalize;                      /* finalize                           */
 168 utf *utf_invoke;
 169 utf *utf_main;
 170 utf *utf_run;                           /* run                                */
 171
 172 utf *utf_add;
 173 utf *utf_dispatch;
 174 utf *utf_remove;
 175 utf *utf_addThread;
 176 utf *utf_removeThread;
 177 utf *utf_put;
 178 utf *utf_get;
 179 utf *utf_uncaughtException;
 180 utf *utf_value;
 181
 182 utf *utf_fillInStackTrace;
 183 utf *utf_findNative;
 184 utf *utf_getSystemClassLoader;
 185 utf *utf_initCause;
 186 utf *utf_loadClass;
 187 utf *utf_loadClassInternal;
 188 utf *utf_printStackTrace;
 189
 190 utf *utf_division_by_zero;
 191
 192 utf *utf_Z;                             /* Z                                  */
 193 utf *utf_B;                             /* B                                  */
 194 utf *utf_C;                             /* C                                  */
 195 utf *utf_S;                             /* S                                  */
 196 utf *utf_I;                             /* I                                  */
 197 utf *utf_J;                             /* J                                  */
 198 utf *utf_F;                             /* F                                  */
 199 utf *utf_D;                             /* D                                  */
 200
 201 utf *utf_void__void;                    /* ()V                                */
 202 utf *utf_boolean__void;                 /* (Z)V                               */
 203 utf *utf_byte__void;                    /* (B)V                               */
 204 utf *utf_char__void;                    /* (C)V                               */
 205 utf *utf_short__void;                   /* (S)V                               */
 206 utf *utf_int__void;                     /* (I)V                               */
 207 utf *utf_long__void;                    /* (J)V                               */
 208 utf *utf_float__void;                   /* (F)V                               */
 209 utf *utf_double__void;                  /* (D)V                               */
 210
 211 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 212 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 213 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 214 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 215 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 216 utf *utf_java_lang_Object__java_lang_Object;
 217 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 218 utf *utf_java_lang_String__java_lang_Class;
 219 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 220 utf *utf_java_lang_Thread_java_lang_Throwable__V;
 221 utf *utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V;
 222 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 223 utf *utf_java_lang_Throwable__java_lang_Throwable;
 224
 225 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 226 utf *utf_null;
 227 utf *array_packagename;
 228
 229
 230 /* utf_init ********************************************************************
 231
 232    Initializes the utf8 subsystem.
 233
 234 *******************************************************************************/
 235
 236 void utf8_init(void)
 237 {
 238         TRACESUBSYSTEMINITIALIZATION("utf8_init");
 239
 240         /* create utf8 hashtable */
 241
 242         hashtable_utf = NEW(hashtable);
 243
 244         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 245
 246 #if defined(ENABLE_STATISTICS)
 247         if (opt_stat)
 248                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 249 #endif
 250
 251         /* create utf-symbols for pointer comparison of frequently used strings */
 252
 253         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 254
 255         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 256         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 257         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 258         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 259         utf_java_lang_String           = utf_new_char("java/lang/String");
 260         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 261
 262         utf_java_lang_ref_SoftReference =
 263                 utf_new_char("java/lang/ref/SoftReference");
 264
 265         utf_java_lang_ref_WeakReference =
 266                 utf_new_char("java/lang/ref/WeakReference");
 267
 268         utf_java_lang_ref_PhantomReference =
 269                 utf_new_char("java/lang/ref/PhantomReference");
 270
 271         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 272
 273         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 274         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 275
 276         utf_java_lang_ClassCircularityError =
 277                 utf_new_char("java/lang/ClassCircularityError");
 278
 279         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 280
 281         utf_java_lang_ExceptionInInitializerError =
 282                 utf_new_char("java/lang/ExceptionInInitializerError");
 283
 284         utf_java_lang_IncompatibleClassChangeError =
 285                 utf_new_char("java/lang/IncompatibleClassChangeError");
 286
 287         utf_java_lang_InstantiationError =
 288                 utf_new_char("java/lang/InstantiationError");
 289
 290         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 291         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 292
 293         utf_java_lang_NoClassDefFoundError =
 294                 utf_new_char("java/lang/NoClassDefFoundError");
 295
 296         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 297
 298         utf_java_lang_UnsatisfiedLinkError =
 299                 utf_new_char("java/lang/UnsatisfiedLinkError");
 300
 301         utf_java_lang_UnsupportedClassVersionError =
 302                 utf_new_char("java/lang/UnsupportedClassVersionError");
 303
 304         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 305
 306         utf_java_lang_VirtualMachineError =
 307                 utf_new_char("java/lang/VirtualMachineError");
 308
 309 #if defined(ENABLE_JAVASE)
 310         utf_java_lang_AbstractMethodError =
 311                 utf_new_char("java/lang/AbstractMethodError");
 312
 313         utf_java_lang_NoSuchFieldError =
 314                 utf_new_char("java/lang/NoSuchFieldError");
 315
 316         utf_java_lang_NoSuchMethodError =
 317                 utf_new_char("java/lang/NoSuchMethodError");
 318 #endif
 319
 320         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 321
 322         utf_java_lang_ArithmeticException =
 323                 utf_new_char("java/lang/ArithmeticException");
 324
 325         utf_java_lang_ArrayIndexOutOfBoundsException =
 326                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 327
 328         utf_java_lang_ArrayStoreException =
 329                 utf_new_char("java/lang/ArrayStoreException");
 330
 331         utf_java_lang_ClassCastException =
 332                 utf_new_char("java/lang/ClassCastException");
 333
 334         utf_java_lang_ClassNotFoundException =
 335                 utf_new_char("java/lang/ClassNotFoundException");
 336
 337         utf_java_lang_CloneNotSupportedException =
 338                 utf_new_char("java/lang/CloneNotSupportedException");
 339
 340         utf_java_lang_IllegalAccessException =
 341                 utf_new_char("java/lang/IllegalAccessException");
 342
 343         utf_java_lang_IllegalArgumentException =
 344                 utf_new_char("java/lang/IllegalArgumentException");
 345
 346         utf_java_lang_IllegalMonitorStateException =
 347                 utf_new_char("java/lang/IllegalMonitorStateException");
 348
 349         utf_java_lang_InstantiationException =
 350                 utf_new_char("java/lang/InstantiationException");
 351
 352         utf_java_lang_InterruptedException =
 353                 utf_new_char("java/lang/InterruptedException");
 354
 355         utf_java_lang_NegativeArraySizeException =
 356                 utf_new_char("java/lang/NegativeArraySizeException");
 357
 358         utf_java_lang_NullPointerException =
 359                 utf_new_char("java/lang/NullPointerException");
 360
 361         utf_java_lang_RuntimeException =
 362                 utf_new_char("java/lang/RuntimeException");
 363
 364         utf_java_lang_StringIndexOutOfBoundsException =
 365                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 366
 367         utf_java_lang_reflect_InvocationTargetException =
 368                 utf_new_char("java/lang/reflect/InvocationTargetException");
 369
 370         utf_java_security_PrivilegedActionException =
 371                 utf_new_char("java/security/PrivilegedActionException");
 372
 373 #if defined(ENABLE_JAVASE)
 374         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 375 #endif
 376
 377         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 378         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 379         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 380         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 381         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 382         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 383         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 384         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 385
 386 #if defined(ENABLE_JAVASE)
 387         utf_java_lang_StackTraceElement =
 388                 utf_new_char("java/lang/StackTraceElement");
 389
 390         utf_java_lang_reflect_Constructor =
 391                 utf_new_char("java/lang/reflect/Constructor");
 392
 393         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 394         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 395
 396 # if defined(WITH_JAVA_RUNTIME_LIBRARY_GNU_CLASSPATH)
 397         utf_java_lang_reflect_VMConstructor = utf_new_char("java/lang/reflect/VMConstructor");
 398         utf_java_lang_reflect_VMField       = utf_new_char("java/lang/reflect/VMField");
 399         utf_java_lang_reflect_VMMethod      = utf_new_char("java/lang/reflect/VMMethod");
 400 # endif
 401
 402         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 403 #endif
 404
 405         utf_InnerClasses               = utf_new_char("InnerClasses");
 406         utf_ConstantValue              = utf_new_char("ConstantValue");
 407         utf_Code                       = utf_new_char("Code");
 408         utf_Exceptions                 = utf_new_char("Exceptions");
 409         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 410         utf_SourceFile                 = utf_new_char("SourceFile");
 411
 412 #if defined(ENABLE_JAVASE)
 413         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 414         utf_Signature                  = utf_new_char("Signature");
 415         utf_StackMapTable              = utf_new_char("StackMapTable");
 416
 417 # if defined(ENABLE_ANNOTATIONS)
 418         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 419         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 420         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 421         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 422         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 423 # endif
 424 #endif
 425
 426         utf_init                           = utf_new_char("<init>");
 427         utf_clinit                         = utf_new_char("<clinit>");
 428         utf_clone                      = utf_new_char("clone");
 429         utf_finalize                   = utf_new_char("finalize");
 430         utf_invoke                     = utf_new_char("invoke");
 431         utf_main                       = utf_new_char("main");
 432         utf_run                        = utf_new_char("run");
 433
 434         utf_add                        = utf_new_char("add");
 435         utf_dispatch                   = utf_new_char("dispatch");
 436         utf_remove                     = utf_new_char("remove");
 437         utf_addThread                  = utf_new_char("addThread");
 438         utf_removeThread               = utf_new_char("removeThread");
 439         utf_put                        = utf_new_char("put");
 440         utf_get                        = utf_new_char("get");
 441         utf_uncaughtException          = utf_new_char("uncaughtException");
 442         utf_value                      = utf_new_char("value");
 443
 444         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 445         utf_findNative                 = utf_new_char("findNative");
 446         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 447         utf_initCause                  = utf_new_char("initCause");
 448         utf_loadClass                  = utf_new_char("loadClass");
 449         utf_loadClassInternal          = utf_new_char("loadClassInternal");
 450         utf_printStackTrace            = utf_new_char("printStackTrace");
 451
 452         utf_division_by_zero           = utf_new_char("/ by zero");
 453
 454         utf_Z                          = utf_new_char("Z");
 455         utf_B                          = utf_new_char("B");
 456         utf_C                          = utf_new_char("C");
 457         utf_S                          = utf_new_char("S");
 458         utf_I                          = utf_new_char("I");
 459         utf_J                          = utf_new_char("J");
 460         utf_F                          = utf_new_char("F");
 461         utf_D                          = utf_new_char("D");
 462
 463         utf_void__void                 = utf_new_char("()V");
 464         utf_boolean__void              = utf_new_char("(Z)V");
 465         utf_byte__void                 = utf_new_char("(B)V");
 466         utf_char__void                 = utf_new_char("(C)V");
 467         utf_short__void                = utf_new_char("(S)V");
 468         utf_int__void                  = utf_new_char("(I)V");
 469         utf_long__void                 = utf_new_char("(J)V");
 470         utf_float__void                = utf_new_char("(F)V");
 471         utf_double__void               = utf_new_char("(D)V");
 472         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 473         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 474
 475         utf_void__java_lang_ClassLoader =
 476                 utf_new_char("()Ljava/lang/ClassLoader;");
 477
 478         utf_java_lang_ClassLoader_java_lang_String__J =
 479                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 480
 481         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 482
 483         utf_java_lang_Object__java_lang_Object =
 484                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 485
 486         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 487
 488         utf_java_lang_String__java_lang_Class =
 489                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 490
 491         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 492
 493         utf_java_lang_Thread_java_lang_Throwable__V =
 494                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 495
 496         utf_Ljava_lang_ThreadGroup_Ljava_lang_String__V =
 497                 utf_new_char("(Ljava/lang/ThreadGroup;Ljava/lang/String;)V");
 498
 499         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 500
 501         utf_java_lang_Throwable__java_lang_Throwable =
 502                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 503
 504         utf_null                       = utf_new_char("null");
 505         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 506         array_packagename              = utf_new_char("\t<the array package>");
 507 }
 508
 509
 510 /* utf_hashkey *****************************************************************
 511
 512    The hashkey is computed from the utf-text by using up to 8
 513    characters.  For utf-symbols longer than 15 characters 3 characters
 514    are taken from the beginning and the end, 2 characters are taken
 515    from the middle.
 516
 517 *******************************************************************************/
 518
 519 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 520 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 521
 522 u4 utf_hashkey(const char *text, u4 length)
 523 {
 524         const char *start_pos = text;       /* pointer to utf text                */
 525         u4 a;
 526
 527         switch (length) {
 528         case 0: /* empty string */
 529                 return 0;
 530
 531         case 1: return fbs(0);
 532         case 2: return fbs(0) ^ nbs(3);
 533         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 534         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 535         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 536         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 537         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 538         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 539
 540         case 9:
 541                 a = fbs(0);
 542                 a ^= nbs(1);
 543                 a ^= nbs(2);
 544                 text++;
 545                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 546
 547         case 10:
 548                 a = fbs(0);
 549                 text++;
 550                 a ^= nbs(2);
 551                 a ^= nbs(3);
 552                 a ^= nbs(4);
 553                 text++;
 554                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 555
 556         case 11:
 557                 a = fbs(0);
 558                 text++;
 559                 a ^= nbs(2);
 560                 a ^= nbs(3);
 561                 a ^= nbs(4);
 562                 text++;
 563                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 564
 565         case 12:
 566                 a = fbs(0);
 567                 text += 2;
 568                 a ^= nbs(2);
 569                 a ^= nbs(3);
 570                 text++;
 571                 a ^= nbs(5);
 572                 a ^= nbs(6);
 573                 a ^= nbs(7);
 574                 text++;
 575                 return a ^ nbs(9) ^ nbs(10);
 576
 577         case 13:
 578                 a = fbs(0);
 579                 a ^= nbs(1);
 580                 text++;
 581                 a ^= nbs(3);
 582                 a ^= nbs(4);
 583                 text += 2;
 584                 a ^= nbs(7);
 585                 a ^= nbs(8);
 586                 text += 2;
 587                 return a ^ nbs(9) ^ nbs(10);
 588
 589         case 14:
 590                 a = fbs(0);
 591                 text += 2;
 592                 a ^= nbs(3);
 593                 a ^= nbs(4);
 594                 text += 2;
 595                 a ^= nbs(7);
 596                 a ^= nbs(8);
 597                 text += 2;
 598                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 599
 600         case 15:
 601                 a = fbs(0);
 602                 text += 2;
 603                 a ^= nbs(3);
 604                 a ^= nbs(4);
 605                 text += 2;
 606                 a ^= nbs(7);
 607                 a ^= nbs(8);
 608                 text += 2;
 609                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 610
 611         default:  /* 3 characters from beginning */
 612                 a = fbs(0);
 613                 text += 2;
 614                 a ^= nbs(3);
 615                 a ^= nbs(4);
 616
 617                 /* 2 characters from middle */
 618                 text = start_pos + (length / 2);
 619                 a ^= fbs(5);
 620                 text += 2;
 621                 a ^= nbs(6);
 622
 623                 /* 3 characters from end */
 624                 text = start_pos + length - 4;
 625
 626                 a ^= fbs(7);
 627                 text++;
 628
 629                 return a ^ nbs(10) ^ nbs(11);
 630     }
 631 }
 632
 633 /* utf_full_hashkey ************************************************************
 634
 635    This function computes a hash value using all bytes in the string.
 636
 637    The algorithm is the "One-at-a-time" algorithm as published
 638    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 639
 640 *******************************************************************************/
 641
 642 u4 utf_full_hashkey(const char *text, u4 length)
 643 {
 644         register const unsigned char *p = (const unsigned char *) text;
 645         register u4 hash;
 646         register u4 i;
 647
 648         hash = 0;
 649         for (i=length; i--;)
 650         {
 651             hash += *p++;
 652             hash += (hash << 10);
 653             hash ^= (hash >> 6);
 654         }
 655         hash += (hash << 3);
 656         hash ^= (hash >> 11);
 657         hash += (hash << 15);
 658
 659         return hash;
 660 }
 661
 662 /* unicode_hashkey *************************************************************
 663
 664    Compute the hashkey of a unicode string.
 665
 666 *******************************************************************************/
 667
 668 u4 unicode_hashkey(u2 *text, u2 len)
 669 {
 670         return utf_hashkey((char *) text, len);
 671 }
 672
 673
 674 /* utf_new *********************************************************************
 675
 676    Creates a new utf-symbol, the text of the symbol is passed as a
 677    u1-array. The function searches the utf-hashtable for a utf-symbol
 678    with this text. On success the element returned, otherwise a new
 679    hashtable element is created.
 680
 681    If the number of entries in the hashtable exceeds twice the size of
 682    the hashtable slots a reorganization of the hashtable is done and
 683    the utf symbols are copied to a new hashtable with doubled size.
 684
 685 *******************************************************************************/
 686
 687 utf *utf_new(const char *text, u2 length)
 688 {
 689         u4 key;                             /* hashkey computed from utf-text     */
 690         u4 slot;                            /* slot in hashtable                  */
 691         utf *u;                             /* hashtable element                  */
 692         u2 i;
 693
 694         Mutex_lock(hashtable_utf->mutex);
 695
 696 #if defined(ENABLE_STATISTICS)
 697         if (opt_stat)
 698                 count_utf_new++;
 699 #endif
 700
 701         key  = utf_hashkey(text, length);
 702         slot = key & (hashtable_utf->size - 1);
 703         u    = hashtable_utf->ptr[slot];
 704
 705         /* search external hash chain for utf-symbol */
 706
 707         while (u) {
 708                 if (u->blength == length) {
 709                         /* compare text of hashtable elements */
 710
 711                         for (i = 0; i < length; i++)
 712                                 if (text[i] != u->text[i])
 713                                         goto nomatch;
 714
 715 #if defined(ENABLE_STATISTICS)
 716                         if (opt_stat)
 717                                 count_utf_new_found++;
 718 #endif
 719
 720                         /* symbol found in hashtable */
 721
 722                         Mutex_unlock(hashtable_utf->mutex);
 723
 724                         return u;
 725                 }
 726
 727         nomatch:
 728                 u = u->hashlink; /* next element in external chain */
 729         }
 730
 731         /* location in hashtable found, create new utf element */
 732
 733         u = NEW(utf);
 734
 735         u->blength  = length;               /* length in bytes of utfstring       */
 736         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 737         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 738
 739         memcpy(u->text, text, length);      /* copy utf-text                      */
 740         u->text[length] = '\0';
 741
 742 #if defined(ENABLE_STATISTICS)
 743         if (opt_stat)
 744                 count_utf_len += sizeof(utf) + length + 1;
 745 #endif
 746
 747         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 748         hashtable_utf->entries++;           /* update number of entries           */
 749
 750         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 751
 752         /* reorganization of hashtable, average length of the external
 753            chains is approx. 2 */
 754
 755                 hashtable *newhash;                              /* the new hashtable */
 756                 u4         i;
 757                 utf       *u;
 758                 utf       *nextu;
 759                 u4         slot;
 760
 761                 /* create new hashtable, double the size */
 762
 763                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 764
 765 #if defined(ENABLE_STATISTICS)
 766                 if (opt_stat)
 767                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 768 #endif
 769
 770                 /* transfer elements to new hashtable */
 771
 772                 for (i = 0; i < hashtable_utf->size; i++) {
 773                         u = hashtable_utf->ptr[i];
 774
 775                         while (u) {
 776                                 nextu = u->hashlink;
 777                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 778
 779                                 u->hashlink = (utf *) newhash->ptr[slot];
 780                                 newhash->ptr[slot] = u;
 781
 782                                 /* follow link in external hash chain */
 783
 784                                 u = nextu;
 785                         }
 786                 }
 787
 788                 /* dispose old table */
 789
 790                 hashtable_free(hashtable_utf);
 791
 792                 hashtable_utf = newhash;
 793         }
 794
 795         Mutex_unlock(hashtable_utf->mutex);
 796
 797         return u;
 798 }
 799
 800
 801 /* utf_new_u2 ******************************************************************
 802
 803    Make utf symbol from u2 array, if isclassname is true '.' is
 804    replaced by '/'.
 805
 806 *******************************************************************************/
 807
 808 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 809 {
 810         char *buffer;                   /* memory buffer for  unicode characters  */
 811         char *pos;                      /* pointer to current position in buffer  */
 812         u4 left;                        /* unicode characters left                */
 813         u4 buflength;                   /* utf length in bytes of the u2 array    */
 814         utf *result;                    /* resulting utf-string                   */
 815         int i;
 816
 817         /* determine utf length in bytes and allocate memory */
 818
 819         buflength = u2_utflength(unicode_pos, unicode_length);
 820         buffer    = MNEW(char, buflength);
 821
 822         left = buflength;
 823         pos  = buffer;
 824
 825         for (i = 0; i++ < unicode_length; unicode_pos++) {
 826                 /* next unicode character */
 827                 u2 c = *unicode_pos;
 828
 829                 if ((c != 0) && (c < 0x80)) {
 830                         /* 1 character */
 831                         left--;
 832                 if ((int) left < 0) break;
 833                         /* convert classname */
 834                         if (isclassname && c == '.')
 835                                 *pos++ = '/';
 836                         else
 837                                 *pos++ = (char) c;
 838
 839                 } else if (c < 0x800) {
 840                         /* 2 characters */
 841                 unsigned char high = c >> 6;
 842                 unsigned char low  = c & 0x3F;
 843                         left = left - 2;
 844                 if ((int) left < 0) break;
 845                 *pos++ = high | 0xC0;
 846                 *pos++ = low  | 0x80;
 847
 848                 } else {
 849                 /* 3 characters */
 850                 char low  = c & 0x3f;
 851                 char mid  = (c >> 6) & 0x3F;
 852                 char high = c >> 12;
 853                         left = left - 3;
 854                 if ((int) left < 0) break;
 855                 *pos++ = high | 0xE0;
 856                 *pos++ = mid  | 0x80;
 857                 *pos++ = low  | 0x80;
 858                 }
 859         }
 860
 861         /* insert utf-string into symbol-table */
 862         result = utf_new(buffer,buflength);
 863
 864         MFREE(buffer, char, buflength);
 865
 866         return result;
 867 }
 868
 869
 870 /* utf_new_char ****************************************************************
 871
 872    Creates a new utf symbol, the text for this symbol is passed as a
 873    c-string ( = char* ).
 874
 875 *******************************************************************************/
 876
 877 utf *utf_new_char(const char *text)
 878 {
 879         return utf_new(text, strlen(text));
 880 }
 881
 882
 883 /* utf_new_char_classname ******************************************************
 884
 885    Creates a new utf symbol, the text for this symbol is passed as a
 886    c-string ( = char* ) "." characters are going to be replaced by
 887    "/". Since the above function is used often, this is a separte
 888    function, instead of an if.
 889
 890 *******************************************************************************/
 891
 892 utf *utf_new_char_classname(const char *text)
 893 {
 894         if (strchr(text, '.')) {
 895                 char *txt = strdup(text);
 896                 char *end = txt + strlen(txt);
 897                 char *c;
 898                 utf *tmpRes;
 899
 900                 for (c = txt; c < end; c++)
 901                         if (*c == '.') *c = '/';
 902
 903                 tmpRes = utf_new(txt, strlen(txt));
 904                 FREE(txt, 0);
 905
 906                 return tmpRes;
 907
 908         } else
 909                 return utf_new(text, strlen(text));
 910 }
 911
 912
 913 /* utf_nextu2 ******************************************************************
 914
 915    Read the next unicode character from the utf string and increment
 916    the utf-string pointer accordingly.
 917
 918    CAUTION: This function is unsafe for input that was not checked
 919             by is_valid_utf!
 920
 921 *******************************************************************************/
 922
 923 u2 utf_nextu2(char **utf_ptr)
 924 {
 925     /* uncompressed unicode character */
 926     u2 unicode_char = 0;
 927     /* current position in utf text */
 928     unsigned char *utf = (unsigned char *) (*utf_ptr);
 929     /* bytes representing the unicode character */
 930     unsigned char ch1, ch2, ch3;
 931     /* number of bytes used to represent the unicode character */
 932     int len = 0;
 933
 934     switch ((ch1 = utf[0]) >> 4) {
 935         default: /* 1 byte */
 936                 (*utf_ptr)++;
 937                 return (u2) ch1;
 938         case 0xC:
 939         case 0xD: /* 2 bytes */
 940                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 941                         unsigned char high = ch1 & 0x1F;
 942                         unsigned char low  = ch2 & 0x3F;
 943                         unicode_char = (high << 6) + low;
 944                         len = 2;
 945                 }
 946                 break;
 947
 948         case 0xE: /* 2 or 3 bytes */
 949                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 950                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 951                                 unsigned char low  = ch3 & 0x3f;
 952                                 unsigned char mid  = ch2 & 0x3f;
 953                                 unsigned char high = ch1 & 0x0f;
 954                                 unicode_char = (((high << 6) + mid) << 6) + low;
 955                                 len = 3;
 956                         } else
 957                                 len = 2;
 958                 }
 959                 break;
 960     }
 961
 962     /* update position in utf-text */
 963     *utf_ptr = (char *) (utf + len);
 964
 965     return unicode_char;
 966 }
 967
 968
 969 /* utf_bytes *******************************************************************
 970
 971    Determine number of bytes (aka. octets) in the utf string.
 972
 973    IN:
 974       u............utf string
 975
 976    OUT:
 977       The number of octets of this utf string.
 978           There is _no_ terminating zero included in this count.
 979
 980 *******************************************************************************/
 981
 982 u4 utf_bytes(utf *u)
 983 {
 984         return u->blength;
 985 }
 986
 987
 988 /* utf_get_number_of_u2s_for_buffer ********************************************
 989
 990    Determine number of UTF-16 u2s in the given UTF-8 buffer
 991
 992    CAUTION: This function is unsafe for input that was not checked
 993             by is_valid_utf!
 994
 995    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 996    to an array of u2s (UTF-16) and want to know how many of them you will get.
 997    All other uses of this function are probably wrong.
 998
 999    IN:
1000       buffer........points to first char in buffer
1001           blength.......number of _bytes_ in the buffer
1002
1003    OUT:
1004       the number of u2s needed to hold this string in UTF-16 encoding.
1005           There is _no_ terminating zero included in this count.
1006
1007    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1008    exception.
1009
1010 *******************************************************************************/
1011
1012 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1013 {
1014         const char *endpos;                 /* points behind utf string           */
1015         const char *utf_ptr;                /* current position in utf text       */
1016         u4 len = 0;                         /* number of unicode characters       */
1017
1018         utf_ptr = buffer;
1019         endpos = utf_ptr + blength;
1020
1021         while (utf_ptr < endpos) {
1022                 len++;
1023                 /* next unicode character */
1024                 utf_nextu2((char **)&utf_ptr);
1025         }
1026
1027         assert(utf_ptr == endpos);
1028
1029         return len;
1030 }
1031
1032
1033 /* utf_get_number_of_u2s *******************************************************
1034
1035    Determine number of UTF-16 u2s in the utf string.
1036
1037    CAUTION: This function is unsafe for input that was not checked
1038             by is_valid_utf!
1039
1040    CAUTION: Use this function *only* when you want to convert a utf string
1041    to an array of u2s and want to know how many of them you will get.
1042    All other uses of this function are probably wrong.
1043
1044    IN:
1045       u............utf string
1046
1047    OUT:
1048       the number of u2s needed to hold this string in UTF-16 encoding.
1049           There is _no_ terminating zero included in this count.
1050           XXX 0 if a NullPointerException has been thrown (see below)
1051
1052 *******************************************************************************/
1053
1054 u4 utf_get_number_of_u2s(utf *u)
1055 {
1056         char *endpos;                       /* points behind utf string           */
1057         char *utf_ptr;                      /* current position in utf text       */
1058         u4 len = 0;                         /* number of unicode characters       */
1059
1060         /* XXX this is probably not checked by most callers! Review this after */
1061         /* the invalid uses of this function have been eliminated */
1062         if (u == NULL) {
1063                 exceptions_throw_nullpointerexception();
1064                 return 0;
1065         }
1066
1067         endpos = UTF_END(u);
1068         utf_ptr = u->text;
1069
1070         while (utf_ptr < endpos) {
1071                 len++;
1072                 /* next unicode character */
1073                 utf_nextu2(&utf_ptr);
1074         }
1075
1076         if (utf_ptr != endpos) {
1077                 /* string ended abruptly */
1078                 exceptions_throw_internalerror("Illegal utf8 string");
1079                 return 0;
1080         }
1081
1082         return len;
1083 }
1084
1085
1086 /* utf8_safe_number_of_u2s *****************************************************
1087
1088    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1089    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1090
1091    This function is safe even for invalid UTF-8 strings.
1092
1093    IN:
1094       text..........zero-terminated(!) UTF-8 string (may be invalid)
1095                         must NOT be NULL
1096           nbytes........strlen(text). (This is needed to completely emulate
1097                         the RI).
1098
1099    OUT:
1100       the number of u2s needed to hold this string in UTF-16 encoding.
1101           There is _no_ terminating zero included in this count.
1102
1103 *******************************************************************************/
1104
1105 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1106         register const unsigned char *t;
1107         register s4 byte;
1108         register s4 len;
1109         register const unsigned char *tlimit;
1110         s4 byte1;
1111         s4 byte2;
1112         s4 byte3;
1113         s4 value;
1114         s4 skip;
1115
1116         assert(text);
1117         assert(nbytes >= 0);
1118
1119         len = 0;
1120         t = (const unsigned char *) text;
1121         tlimit = t + nbytes;
1122
1123         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1124
1125         while (1) {
1126                 byte = *t++;
1127
1128                 if (byte & 0x80) {
1129                         /* highest bit set, non-ASCII character */
1130
1131                         if ((byte & 0xe0) == 0xc0) {
1132                                 /* 2-byte: should be 110..... 10...... ? */
1133
1134                                 if ((*t++ & 0xc0) == 0x80)
1135                                         ; /* valid 2-byte */
1136                                 else
1137                                         t--; /* invalid */
1138                         }
1139                         else if ((byte & 0xf0) == 0xe0) {
1140                                 /* 3-byte: should be 1110.... 10...... 10...... */
1141                                 /*                            ^t                */
1142
1143                                 if (t + 2 > tlimit)
1144                                         return len + 1; /* invalid, stop here */
1145
1146                                 if ((*t++ & 0xc0) == 0x80) {
1147                                         if ((*t++ & 0xc0) == 0x80)
1148                                                 ; /* valid 3-byte */
1149                                         else
1150                                                 t--; /* invalid */
1151                                 }
1152                                 else
1153                                         t--; /* invalid */
1154                         }
1155                         else if ((byte & 0xf8) == 0xf0) {
1156                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1157                                 /*                            ^t                         */
1158
1159                                 if (t + 3 > tlimit)
1160                                         return len + 1; /* invalid, stop here */
1161
1162                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1163                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1164                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1165                                                         /* valid 4-byte UTF-8? */
1166                                                         value = ((byte  & 0x07) << 18)
1167                                                                   | ((byte1 & 0x3f) << 12)
1168                                                                   | ((byte2 & 0x3f) <<  6)
1169                                                                   | ((byte3 & 0x3f)      );
1170
1171                                                         if (value > 0x10FFFF)
1172                                                                 ; /* invalid */
1173                                                         else if (value > 0xFFFF)
1174                                                                 len += 1; /* we need surrogates */
1175                                                         else
1176                                                                 ; /* 16bit suffice */
1177                                                 }
1178                                                 else
1179                                                         t--; /* invalid */
1180                                         }
1181                                         else
1182                                                 t--; /* invalid */
1183                                 }
1184                                 else
1185                                         t--; /* invalid */
1186                         }
1187                         else if ((byte & 0xfc) == 0xf8) {
1188                                 /* invalid 5-byte */
1189                                 if (t + 4 > tlimit)
1190                                         return len + 1; /* invalid, stop here */
1191
1192                                 skip = 4;
1193                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1194                                         t++;
1195                         }
1196                         else if ((byte & 0xfe) == 0xfc) {
1197                                 /* invalid 6-byte */
1198                                 if (t + 5 > tlimit)
1199                                         return len + 1; /* invalid, stop here */
1200
1201                                 skip = 5;
1202                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1203                                         t++;
1204                         }
1205                         else
1206                                 ; /* invalid */
1207                 }
1208                 else {
1209                         /* NUL */
1210
1211                         if (byte == 0)
1212                                 break;
1213
1214                         /* ASCII character, common case */
1215                 }
1216
1217                 len++;
1218         }
1219
1220         return len;
1221 }
1222
1223
1224 /* utf8_safe_convert_to_u2s ****************************************************
1225
1226    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1227    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1228    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1229
1230    This function is safe even for invalid UTF-8 strings.
1231
1232    IN:
1233       text..........zero-terminated(!) UTF-8 string (may be invalid)
1234                         must NOT be NULL
1235           nbytes........strlen(text). (This is needed to completely emulate
1236                                         the RI).
1237           buffer........a preallocated array of u2s to receive the decoded
1238                         string. Use utf8_safe_number_of_u2s to get the
1239                                         required number of u2s for allocating this.
1240
1241 *******************************************************************************/
1242
1243 #define UNICODE_REPLACEMENT  0xfffd
1244
1245 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1246         register const unsigned char *t;
1247         register s4 byte;
1248         register const unsigned char *tlimit;
1249         s4 byte1;
1250         s4 byte2;
1251         s4 byte3;
1252         s4 value;
1253         s4 skip;
1254
1255         assert(text);
1256         assert(nbytes >= 0);
1257
1258         t = (const unsigned char *) text;
1259         tlimit = t + nbytes;
1260
1261         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1262
1263         while (1) {
1264                 byte = *t++;
1265
1266                 if (byte & 0x80) {
1267                         /* highest bit set, non-ASCII character */
1268
1269                         if ((byte & 0xe0) == 0xc0) {
1270                                 /* 2-byte: should be 110..... 10...... */
1271
1272                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1273                                         /* valid 2-byte UTF-8 */
1274                                         *buffer++ = ((byte  & 0x1f) << 6)
1275                                                           | ((byte1 & 0x3f)     );
1276                                 }
1277                                 else {
1278                                         *buffer++ = UNICODE_REPLACEMENT;
1279                                         t--;
1280                                 }
1281                         }
1282                         else if ((byte & 0xf0) == 0xe0) {
1283                                 /* 3-byte: should be 1110.... 10...... 10...... */
1284
1285                                 if (t + 2 > tlimit) {
1286                                         *buffer++ = UNICODE_REPLACEMENT;
1287                                         return;
1288                                 }
1289
1290                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1291                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1292                                                 /* valid 3-byte UTF-8 */
1293                                                 *buffer++ = ((byte  & 0x0f) << 12)
1294                                                                   | ((byte1 & 0x3f) <<  6)
1295                                                                   | ((byte2 & 0x3f)      );
1296                                         }
1297                                         else {
1298                                                 *buffer++ = UNICODE_REPLACEMENT;
1299                                                 t--;
1300                                         }
1301                                 }
1302                                 else {
1303                                         *buffer++ = UNICODE_REPLACEMENT;
1304                                         t--;
1305                                 }
1306                         }
1307                         else if ((byte & 0xf8) == 0xf0) {
1308                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1309
1310                                 if (t + 3 > tlimit) {
1311                                         *buffer++ = UNICODE_REPLACEMENT;
1312                                         return;
1313                                 }
1314
1315                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1316                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1317                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1318                                                         /* valid 4-byte UTF-8? */
1319                                                         value = ((byte  & 0x07) << 18)
1320                                                                   | ((byte1 & 0x3f) << 12)
1321                                                                   | ((byte2 & 0x3f) <<  6)
1322                                                                   | ((byte3 & 0x3f)      );
1323
1324                                                         if (value > 0x10FFFF) {
1325                                                                 *buffer++ = UNICODE_REPLACEMENT;
1326                                                         }
1327                                                         else if (value > 0xFFFF) {
1328                                                                 /* we need surrogates */
1329                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1330                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1331                                                         }
1332                                                         else
1333                                                                 *buffer++ = value; /* 16bit suffice */
1334                                                 }
1335                                                 else {
1336                                                         *buffer++ = UNICODE_REPLACEMENT;
1337                                                         t--;
1338                                                 }
1339                                         }
1340                                         else {
1341                                                 *buffer++ = UNICODE_REPLACEMENT;
1342                                                 t--;
1343                                         }
1344                                 }
1345                                 else {
1346                                         *buffer++ = UNICODE_REPLACEMENT;
1347                                         t--;
1348                                 }
1349                         }
1350                         else if ((byte & 0xfc) == 0xf8) {
1351                                 if (t + 4 > tlimit) {
1352                                         *buffer++ = UNICODE_REPLACEMENT;
1353                                         return;
1354                                 }
1355
1356                                 skip = 4;
1357                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1358                                         t++;
1359                                 *buffer++ = UNICODE_REPLACEMENT;
1360                         }
1361                         else if ((byte & 0xfe) == 0xfc) {
1362                                 if (t + 5 > tlimit) {
1363                                         *buffer++ = UNICODE_REPLACEMENT;
1364                                         return;
1365                                 }
1366
1367                                 skip = 5;
1368                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1369                                         t++;
1370                                 *buffer++ = UNICODE_REPLACEMENT;
1371                         }
1372                         else
1373                                 *buffer++ = UNICODE_REPLACEMENT;
1374                 }
1375                 else {
1376                         /* NUL */
1377
1378                         if (byte == 0)
1379                                 break;
1380
1381                         /* ASCII character, common case */
1382
1383                         *buffer++ = byte;
1384                 }
1385         }
1386 }
1387
1388
1389 /* u2_utflength ****************************************************************
1390
1391    Returns the utf length in bytes of a u2 array.
1392
1393 *******************************************************************************/
1394
1395 u4 u2_utflength(u2 *text, u4 u2_length)
1396 {
1397         u4 result_len = 0;                  /* utf length in bytes                */
1398         u2 ch;                              /* current unicode character          */
1399         u4 len;
1400
1401         for (len = 0; len < u2_length; len++) {
1402                 /* next unicode character */
1403                 ch = *text++;
1404
1405                 /* determine bytes required to store unicode character as utf */
1406                 if (ch && (ch < 0x80))
1407                         result_len++;
1408                 else if (ch < 0x800)
1409                         result_len += 2;
1410                 else
1411                         result_len += 3;
1412         }
1413
1414     return result_len;
1415 }
1416
1417
1418 /* utf_copy ********************************************************************
1419
1420    Copy the given utf string byte-for-byte to a buffer.
1421
1422    IN:
1423       buffer.......the buffer
1424           u............the utf string
1425
1426 *******************************************************************************/
1427
1428 void utf_copy(char *buffer, utf *u)
1429 {
1430         /* our utf strings are zero-terminated (done by utf_new) */
1431         MCOPY(buffer, u->text, char, u->blength + 1);
1432 }
1433
1434
1435 /* utf_cat *********************************************************************
1436
1437    Append the given utf string byte-for-byte to a buffer.
1438
1439    IN:
1440       buffer.......the buffer
1441           u............the utf string
1442
1443 *******************************************************************************/
1444
1445 void utf_cat(char *buffer, utf *u)
1446 {
1447         /* our utf strings are zero-terminated (done by utf_new) */
1448         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1449 }
1450
1451
1452 /* utf_copy_classname **********************************************************
1453
1454    Copy the given utf classname byte-for-byte to a buffer.
1455    '/' is replaced by '.'
1456
1457    IN:
1458       buffer.......the buffer
1459           u............the utf string
1460
1461 *******************************************************************************/
1462
1463 void utf_copy_classname(char *buffer, utf *u)
1464 {
1465         char *bufptr;
1466         char *srcptr;
1467         char *endptr;
1468         char ch;
1469
1470         bufptr = buffer;
1471         srcptr = u->text;
1472         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1473
1474         while (srcptr != endptr) {
1475                 ch = *srcptr++;
1476                 if (ch == '/')
1477                         ch = '.';
1478                 *bufptr++ = ch;
1479         }
1480 }
1481
1482
1483 /* utf_cat *********************************************************************
1484
1485    Append the given utf classname byte-for-byte to a buffer.
1486    '/' is replaced by '.'
1487
1488    IN:
1489       buffer.......the buffer
1490           u............the utf string
1491
1492 *******************************************************************************/
1493
1494 void utf_cat_classname(char *buffer, utf *u)
1495 {
1496         utf_copy_classname(buffer + strlen(buffer), u);
1497 }
1498
1499 /* utf_display_printable_ascii *************************************************
1500
1501    Write utf symbol to stdout (for debugging purposes).
1502    Non-printable and non-ASCII characters are printed as '?'.
1503
1504 *******************************************************************************/
1505
1506 void utf_display_printable_ascii(utf *u)
1507 {
1508         char *endpos;                       /* points behind utf string           */
1509         char *utf_ptr;                      /* current position in utf text       */
1510
1511         if (u == NULL) {
1512                 printf("NULL");
1513                 fflush(stdout);
1514                 return;
1515         }
1516
1517         endpos = UTF_END(u);
1518         utf_ptr = u->text;
1519
1520         while (utf_ptr < endpos) {
1521                 /* read next unicode character */
1522
1523                 u2 c = utf_nextu2(&utf_ptr);
1524
1525                 if ((c >= 32) && (c <= 127))
1526                         printf("%c", c);
1527                 else
1528                         printf("?");
1529         }
1530
1531         fflush(stdout);
1532 }
1533
1534
1535 /* utf_display_printable_ascii_classname ***************************************
1536
1537    Write utf symbol to stdout with `/' converted to `.' (for debugging
1538    purposes).
1539    Non-printable and non-ASCII characters are printed as '?'.
1540
1541 *******************************************************************************/
1542
1543 void utf_display_printable_ascii_classname(utf *u)
1544 {
1545         char *endpos;                       /* points behind utf string           */
1546         char *utf_ptr;                      /* current position in utf text       */
1547
1548         if (u == NULL) {
1549                 printf("NULL");
1550                 fflush(stdout);
1551                 return;
1552         }
1553
1554         endpos = UTF_END(u);
1555         utf_ptr = u->text;
1556
1557         while (utf_ptr < endpos) {
1558                 /* read next unicode character */
1559
1560                 u2 c = utf_nextu2(&utf_ptr);
1561
1562                 if (c == '/')
1563                         c = '.';
1564
1565                 if ((c >= 32) && (c <= 127))
1566                         printf("%c", c);
1567                 else
1568                         printf("?");
1569         }
1570
1571         fflush(stdout);
1572 }
1573
1574
1575 /* utf_sprint_convert_to_latin1 ************************************************
1576
1577    Write utf symbol into c-string (for debugging purposes).
1578    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1579    invalid results.
1580
1581 *******************************************************************************/
1582
1583 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1584 {
1585         char *endpos;                       /* points behind utf string           */
1586         char *utf_ptr;                      /* current position in utf text       */
1587         u2 pos = 0;                         /* position in c-string               */
1588
1589         if (!u) {
1590                 strcpy(buffer, "NULL");
1591                 return;
1592         }
1593
1594         endpos = UTF_END(u);
1595         utf_ptr = u->text;
1596
1597         while (utf_ptr < endpos)
1598                 /* copy next unicode character */
1599                 buffer[pos++] = utf_nextu2(&utf_ptr);
1600
1601         /* terminate string */
1602         buffer[pos] = '\0';
1603 }
1604
1605
1606 /* utf_sprint_convert_to_latin1_classname **************************************
1607
1608    Write utf symbol into c-string with `/' converted to `.' (for debugging
1609    purposes).
1610    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1611    invalid results.
1612
1613 *******************************************************************************/
1614
1615 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1616 {
1617         char *endpos;                       /* points behind utf string           */
1618         char *utf_ptr;                      /* current position in utf text       */
1619         u2 pos = 0;                         /* position in c-string               */
1620
1621         if (!u) {
1622                 strcpy(buffer, "NULL");
1623                 return;
1624         }
1625
1626         endpos = UTF_END(u);
1627         utf_ptr = u->text;
1628
1629         while (utf_ptr < endpos) {
1630                 /* copy next unicode character */
1631                 u2 c = utf_nextu2(&utf_ptr);
1632                 if (c == '/') c = '.';
1633                 buffer[pos++] = c;
1634         }
1635
1636         /* terminate string */
1637         buffer[pos] = '\0';
1638 }
1639
1640
1641 /* utf_strcat_convert_to_latin1 ************************************************
1642
1643    Like libc strcat, but uses an utf8 string.
1644    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1645    invalid results.
1646
1647 *******************************************************************************/
1648
1649 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1650 {
1651         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1652 }
1653
1654
1655 /* utf_strcat_convert_to_latin1_classname **************************************
1656
1657    Like libc strcat, but uses an utf8 string.
1658    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1659    invalid results.
1660
1661 *******************************************************************************/
1662
1663 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1664 {
1665         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1666 }
1667
1668
1669 /* utf_fprint_printable_ascii **************************************************
1670
1671    Write utf symbol into file.
1672    Non-printable and non-ASCII characters are printed as '?'.
1673
1674 *******************************************************************************/
1675
1676 void utf_fprint_printable_ascii(FILE *file, utf *u)
1677 {
1678         char *endpos;                       /* points behind utf string           */
1679         char *utf_ptr;                      /* current position in utf text       */
1680
1681         if (!u)
1682                 return;
1683
1684         endpos = UTF_END(u);
1685         utf_ptr = u->text;
1686
1687         while (utf_ptr < endpos) {
1688                 /* read next unicode character */
1689                 u2 c = utf_nextu2(&utf_ptr);
1690
1691                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1692                 else fprintf(file, "?");
1693         }
1694 }
1695
1696
1697 /* utf_fprint_printable_ascii_classname ****************************************
1698
1699    Write utf symbol into file with `/' converted to `.'.
1700    Non-printable and non-ASCII characters are printed as '?'.
1701
1702 *******************************************************************************/
1703
1704 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1705 {
1706         char *endpos;                       /* points behind utf string           */
1707         char *utf_ptr;                      /* current position in utf text       */
1708
1709     if (!u)
1710                 return;
1711
1712         endpos = UTF_END(u);
1713         utf_ptr = u->text;
1714
1715         while (utf_ptr < endpos) {
1716                 /* read next unicode character */
1717                 u2 c = utf_nextu2(&utf_ptr);
1718                 if (c == '/') c = '.';
1719
1720                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1721                 else fprintf(file, "?");
1722         }
1723 }
1724
1725
/* is_valid_utf ****************************************************************

   Return true if the bytes in [utf_ptr, end_pos) form a valid UTF-8
   string in the restricted form accepted here:

     - a raw 0x00 byte is never allowed,
     - a sequence consists of a lead byte plus at most two continuation
       bytes (longer forms are rejected as a Java limitation),
     - the value zero is only accepted in its two-byte form,
     - otherwise overlong encodings, surrogates and the code points
       0xfffe/0xffff are deliberately NOT rejected, since Java class
       files seem to allow them.

   An empty range is valid; a range with end_pos < utf_ptr is not.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur;           /* next byte to examine               */
	const unsigned char *stop;          /* one past the last byte             */
	unsigned long        value;         /* value decoded from one sequence    */
	unsigned int         lead;          /* first byte of current sequence     */
	int                  trailing;      /* continuation bytes expected        */
	int                  k;

	if (end_pos < utf_ptr)
		return false;

	cur  = (const unsigned char *) utf_ptr;
	stop = (const unsigned char *) end_pos;

	while (cur != stop) {
		lead = *cur++;

		if (lead == 0x00)                       /* 0x00 is not allowed    */
			return false;

		if (lead < 0x80)                        /* ASCII                  */
			continue;

		if ((lead & 0xe0) == 0xc0) {            /* 110x xxxx              */
			trailing = 1;
			value    = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0) {       /* 1110 xxxx              */
			trailing = 2;
			value    = lead & 0x0f;
		}
		else {
			/* either an invalid leading byte or one that announces
			   more than two continuation bytes (Java limitation) */
			return false;
		}

		if ((stop - cur) < trailing)            /* missing bytes          */
			return false;

		for (k = 0; k < trailing; k++) {
			unsigned int follow = *cur++;

			if ((follow & 0xc0) != 0x80)        /* 10xx xxxx              */
				return false;

			value = (value << 6) | (follow & 0x3f);
		}

		/* Java special: zero must use the two-byte form */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1791
1792
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are examined.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr) return false; /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = *utf_ptr++;

		if (c < 0x20) return false; /* disallow control characters */

		/* disallow zero; the second byte is only inspected if it still
		   lies inside the string, so a trailing 0xc0 cannot cause a
		   read behind end_pos */

		if (c == 0xc0 && utf_ptr < end_pos
			&& (unsigned char) *utf_ptr == 0x80)
			return false;
	}

	return true;
}
1820
1821 bool is_valid_name_utf(utf *u)
1822 {
1823         return is_valid_name(u->text, UTF_END(u));
1824 }
1825
1826
1827 /* utf_show ********************************************************************
1828
1829    Writes the utf symbols in the utfhash to stdout and displays the
1830    number of external hash chains grouped according to the chainlength
1831    (for debugging purposes).
1832
1833 *******************************************************************************/
1834
1835 #if !defined(NDEBUG)
1836 void utf_show(void)
1837 {
1838
1839 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1840
1841         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1842         u4 max_chainlength = 0;      /* maximum length of the chains */
1843         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1844         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1845         u4 i;
1846
1847         printf("UTF-HASH:\n");
1848
1849         /* show element of utf-hashtable */
1850
1851         for (i = 0; i < hashtable_utf->size; i++) {
1852                 utf *u = hashtable_utf->ptr[i];
1853
1854                 if (u) {
1855                         printf("SLOT %d: ", (int) i);
1856
1857                         while (u) {
1858                                 printf("'");
1859                                 utf_display_printable_ascii(u);
1860                                 printf("' ");
1861                                 u = u->hashlink;
1862                         }
1863                         printf("\n");
1864                 }
1865         }
1866
1867         printf("UTF-HASH: %d slots for %d entries\n",
1868                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1869
1870         if (hashtable_utf->entries == 0)
1871                 return;
1872
1873         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1874
1875         for (i=0;i<CHAIN_LIMIT;i++)
1876                 chain_count[i]=0;
1877
1878         /* count numbers of hashchains according to their length */
1879         for (i=0; i<hashtable_utf->size; i++) {
1880
1881                 utf *u = (utf*) hashtable_utf->ptr[i];
1882                 u4 chain_length = 0;
1883
1884                 /* determine chainlength */
1885                 while (u) {
1886                         u = u->hashlink;
1887                         chain_length++;
1888                 }
1889
1890                 /* update sum of all chainlengths */
1891                 sum_chainlength+=chain_length;
1892
1893                 /* determine the maximum length of the chains */
1894                 if (chain_length>max_chainlength)
1895                         max_chainlength = chain_length;
1896
1897                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1898                 if (chain_length>=CHAIN_LIMIT) {
1899                         beyond_limit+=chain_length;
1900                         chain_length=CHAIN_LIMIT-1;
1901                 }
1902
1903                 /* update number of hashchains of current length */
1904                 chain_count[chain_length]++;
1905         }
1906
1907         /* display results */
1908         for (i=1;i<CHAIN_LIMIT-1;i++)
1909                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1910
1911         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1912
1913
1914         printf("max. chainlength:%5d\n",max_chainlength);
1915
1916         /* avg. chainlength = sum of chainlengths / number of chains */
1917         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1918 }
1919 #endif /* !defined(NDEBUG) */
1920
1921
1922 /*
1923  * These are local overrides for various environment variables in Emacs.
1924  * Please do not remove this and leave it at the end of the file, where
1925  * Emacs will automagically detect them.
1926  * ---------------------------------------------------------------------
1927  * Local variables:
1928  * mode: c
1929  * indent-tabs-mode: t
1930  * c-basic-offset: 4
1931  * tab-width: 4
1932  * End:
1933  * vim:noexpandtab:sw=4:ts=4:
1934  */