src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25    $Id: utf8.c 8299 2007-08-13 08:41:18Z michi $
  26
  27 */
  28
  29
  30 #include "config.h"
  31
  32 #include <string.h>
  33 #include <assert.h>
  34
  35 #include "vm/types.h"
  36
  37 #include "mm/memory.h"
  38
  39 #include "threads/lock-common.h"
  40
  41 #include "toolbox/hashtable.h"
  42
  43 #include "vm/exceptions.h"
  44
  45 #include "vmcore/options.h"
  46
  47 #if defined(ENABLE_STATISTICS)
  48 # include "vmcore/statistics.h"
  49 #endif
  50
  51 #include "vmcore/utf8.h"
  52
  53
  54 /* global variables ***********************************************************/
  55
  56 /* hashsize must be power of 2 */
  57
  58 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  59
  60 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  61
  62
  63 /* utf-symbols for pointer comparison of frequently used strings **************/
  64
  65 utf *utf_java_lang_Object;
  66
  67 utf *utf_java_lang_Class;
  68 utf *utf_java_lang_ClassLoader;
  69 utf *utf_java_lang_Cloneable;
  70 utf *utf_java_lang_SecurityManager;
  71 utf *utf_java_lang_String;
  72 utf *utf_java_lang_System;
  73 utf *utf_java_lang_ThreadGroup;
  74 utf *utf_java_lang_ref_SoftReference;
  75 utf *utf_java_lang_ref_WeakReference;
  76 utf *utf_java_lang_ref_PhantomReference;
  77 utf *utf_java_io_Serializable;
  78
  79 utf *utf_java_lang_Throwable;
  80 utf *utf_java_lang_Error;
  81
  82 utf *utf_java_lang_AbstractMethodError;
  83 utf *utf_java_lang_ClassCircularityError;
  84 utf *utf_java_lang_ClassFormatError;
  85 utf *utf_java_lang_ExceptionInInitializerError;
  86 utf *utf_java_lang_IncompatibleClassChangeError;
  87 utf *utf_java_lang_InstantiationError;
  88 utf *utf_java_lang_InternalError;
  89 utf *utf_java_lang_LinkageError;
  90 utf *utf_java_lang_NoClassDefFoundError;
  91 utf *utf_java_lang_NoSuchFieldError;
  92 utf *utf_java_lang_NoSuchMethodError;
  93 utf *utf_java_lang_OutOfMemoryError;
  94 utf *utf_java_lang_UnsatisfiedLinkError;
  95 utf *utf_java_lang_UnsupportedClassVersionError;
  96 utf *utf_java_lang_VerifyError;
  97 utf *utf_java_lang_VirtualMachineError;
  98
  99 #if defined(WITH_CLASSPATH_GNU)
 100 utf *utf_java_lang_VMThrowable;
 101 #endif
 102
 103 utf *utf_java_lang_Exception;
 104
 105 utf *utf_java_lang_ArithmeticException;
 106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
 107 utf *utf_java_lang_ArrayStoreException;
 108 utf *utf_java_lang_ClassCastException;
 109 utf *utf_java_lang_ClassNotFoundException;
 110 utf *utf_java_lang_CloneNotSupportedException;
 111 utf *utf_java_lang_IllegalAccessException;
 112 utf *utf_java_lang_IllegalArgumentException;
 113 utf *utf_java_lang_IllegalMonitorStateException;
 114 utf *utf_java_lang_InstantiationException;
 115 utf *utf_java_lang_InterruptedException;
 116 utf *utf_java_lang_NegativeArraySizeException;
 117 utf *utf_java_lang_NullPointerException;
 118 utf *utf_java_lang_StringIndexOutOfBoundsException;
 119
 120 utf *utf_java_lang_reflect_InvocationTargetException;
 121
 122 utf *utf_java_security_PrivilegedActionException;
 123
 124 #if defined(ENABLE_JAVASE)
 125 utf* utf_java_lang_Void;
 126 #endif
 127
 128 utf* utf_java_lang_Boolean;
 129 utf* utf_java_lang_Byte;
 130 utf* utf_java_lang_Character;
 131 utf* utf_java_lang_Short;
 132 utf* utf_java_lang_Integer;
 133 utf* utf_java_lang_Long;
 134 utf* utf_java_lang_Float;
 135 utf* utf_java_lang_Double;
 136
 137 #if defined(ENABLE_JAVASE)
 138 utf *utf_java_lang_StackTraceElement;
 139 utf *utf_java_lang_reflect_Constructor;
 140 utf *utf_java_lang_reflect_Field;
 141 utf *utf_java_lang_reflect_Method;
 142 utf *utf_java_util_Vector;
 143 #endif
 144
 145 utf *utf_InnerClasses;                  /* InnerClasses                       */
 146 utf *utf_ConstantValue;                 /* ConstantValue                      */
 147 utf *utf_Code;                          /* Code                               */
 148 utf *utf_Exceptions;                    /* Exceptions                         */
 149 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 150 utf *utf_SourceFile;                    /* SourceFile                         */
 151
 152 #if defined(ENABLE_JAVASE)
 153 utf *utf_EnclosingMethod;
 154 utf *utf_Signature;
 155 utf *utf_StackMapTable;
 156
 157 #if defined(ENABLE_ANNOTATIONS)
 158 utf *utf_sun_reflect_ConstantPool;
 159 #if defined(WITH_CLASSPATH_GNU)
 160 utf *utf_sun_reflect_annotation_AnnotationParser;
 161 #endif
 162
 163 utf *utf_RuntimeVisibleAnnotations;
 164 utf *utf_RuntimeInvisibleAnnotations;
 165 utf *utf_RuntimeVisibleParameterAnnotations;
 166 utf *utf_RuntimeInvisibleParameterAnnotations;
 167 utf *utf_AnnotationDefault;
 168 #endif
 169 #endif
 170
 171 utf *utf_init;                          /* <init>                             */
 172 utf *utf_clinit;                        /* <clinit>                           */
 173 utf *utf_clone;                         /* clone                              */
 174 utf *utf_finalize;                      /* finalize                           */
 175 utf *utf_run;                           /* run                                */
 176
 177 utf *utf_add;
 178 utf *utf_remove;
 179 utf *utf_addThread;
 180 utf *utf_removeThread;
 181 utf *utf_put;
 182 utf *utf_get;
 183 utf *utf_value;
 184
 185 utf *utf_fillInStackTrace;
 186 utf *utf_findNative;
 187 utf *utf_getSystemClassLoader;
 188 utf *utf_initCause;
 189 utf *utf_loadClass;
 190 utf *utf_printStackTrace;
 191
 192 utf *utf_division_by_zero;
 193
 194 utf *utf_Z;                             /* Z                                  */
 195 utf *utf_B;                             /* B                                  */
 196 utf *utf_C;                             /* C                                  */
 197 utf *utf_S;                             /* S                                  */
 198 utf *utf_I;                             /* I                                  */
 199 utf *utf_J;                             /* J                                  */
 200 utf *utf_F;                             /* F                                  */
 201 utf *utf_D;                             /* D                                  */
 202
 203 utf *utf_void__void;                    /* ()V                                */
 204 utf *utf_boolean__void;                 /* (Z)V                               */
 205 utf *utf_byte__void;                    /* (B)V                               */
 206 utf *utf_char__void;                    /* (C)V                               */
 207 utf *utf_short__void;                   /* (S)V                               */
 208 utf *utf_int__void;                     /* (I)V                               */
 209 utf *utf_long__void;                    /* (J)V                               */
 210 utf *utf_float__void;                   /* (F)V                               */
 211 utf *utf_double__void;                  /* (D)V                               */
 212
 213 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 214 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 215 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 216 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 217 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 218 utf *utf_java_lang_Object__java_lang_Object;
 219 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 220 utf *utf_java_lang_String__java_lang_Class;
 221 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 222 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 223 utf *utf_java_lang_Throwable__java_lang_Throwable;
 224
 225 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 226 utf *utf_null;
 227 utf *array_packagename;
 228
 229
 230 /* utf_init ********************************************************************
 231
 232    Initializes the utf8 subsystem.
 233
 234 *******************************************************************************/
 235
 236 bool utf8_init(void)
 237 {
 238         /* create utf8 hashtable */
 239
 240         hashtable_utf = NEW(hashtable);
 241
 242         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 243
 244 #if defined(ENABLE_STATISTICS)
 245         if (opt_stat)
 246                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 247 #endif
 248
 249         /* create utf-symbols for pointer comparison of frequently used strings */
 250
 251         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 252
 253         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 254         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 255         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 256         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 257         utf_java_lang_String           = utf_new_char("java/lang/String");
 258         utf_java_lang_System           = utf_new_char("java/lang/System");
 259         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 260
 261         utf_java_lang_ref_SoftReference =
 262                 utf_new_char("java/lang/ref/SoftReference");
 263
 264         utf_java_lang_ref_WeakReference =
 265                 utf_new_char("java/lang/ref/WeakReference");
 266
 267         utf_java_lang_ref_PhantomReference =
 268                 utf_new_char("java/lang/ref/PhantomReference");
 269
 270         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 271
 272         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 273         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 274
 275         utf_java_lang_ClassCircularityError =
 276                 utf_new_char("java/lang/ClassCircularityError");
 277
 278         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 279
 280         utf_java_lang_ExceptionInInitializerError =
 281                 utf_new_char("java/lang/ExceptionInInitializerError");
 282
 283         utf_java_lang_IncompatibleClassChangeError =
 284                 utf_new_char("java/lang/IncompatibleClassChangeError");
 285
 286         utf_java_lang_InstantiationError =
 287                 utf_new_char("java/lang/InstantiationError");
 288
 289         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 290         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 291
 292         utf_java_lang_NoClassDefFoundError =
 293                 utf_new_char("java/lang/NoClassDefFoundError");
 294
 295         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 296
 297         utf_java_lang_UnsatisfiedLinkError =
 298                 utf_new_char("java/lang/UnsatisfiedLinkError");
 299
 300         utf_java_lang_UnsupportedClassVersionError =
 301                 utf_new_char("java/lang/UnsupportedClassVersionError");
 302
 303         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 304
 305         utf_java_lang_VirtualMachineError =
 306                 utf_new_char("java/lang/VirtualMachineError");
 307
 308 #if defined(ENABLE_JAVASE)
 309         utf_java_lang_AbstractMethodError =
 310                 utf_new_char("java/lang/AbstractMethodError");
 311
 312         utf_java_lang_NoSuchFieldError =
 313                 utf_new_char("java/lang/NoSuchFieldError");
 314
 315         utf_java_lang_NoSuchMethodError =
 316                 utf_new_char("java/lang/NoSuchMethodError");
 317 #endif
 318
 319 #if defined(WITH_CLASSPATH_GNU)
 320         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
 321 #endif
 322
 323         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 324
 325         utf_java_lang_ArithmeticException =
 326                 utf_new_char("java/lang/ArithmeticException");
 327
 328         utf_java_lang_ArrayIndexOutOfBoundsException =
 329                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 330
 331         utf_java_lang_ArrayStoreException =
 332                 utf_new_char("java/lang/ArrayStoreException");
 333
 334         utf_java_lang_ClassCastException =
 335                 utf_new_char("java/lang/ClassCastException");
 336
 337         utf_java_lang_ClassNotFoundException =
 338                 utf_new_char("java/lang/ClassNotFoundException");
 339
 340         utf_java_lang_CloneNotSupportedException =
 341                 utf_new_char("java/lang/CloneNotSupportedException");
 342
 343         utf_java_lang_IllegalAccessException =
 344                 utf_new_char("java/lang/IllegalAccessException");
 345
 346         utf_java_lang_IllegalArgumentException =
 347                 utf_new_char("java/lang/IllegalArgumentException");
 348
 349         utf_java_lang_IllegalMonitorStateException =
 350                 utf_new_char("java/lang/IllegalMonitorStateException");
 351
 352         utf_java_lang_InstantiationException =
 353                 utf_new_char("java/lang/InstantiationException");
 354
 355         utf_java_lang_InterruptedException =
 356                 utf_new_char("java/lang/InterruptedException");
 357
 358         utf_java_lang_NegativeArraySizeException =
 359                 utf_new_char("java/lang/NegativeArraySizeException");
 360
 361         utf_java_lang_NullPointerException =
 362                 utf_new_char("java/lang/NullPointerException");
 363
 364         utf_java_lang_StringIndexOutOfBoundsException =
 365                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 366
 367         utf_java_lang_reflect_InvocationTargetException =
 368                 utf_new_char("java/lang/reflect/InvocationTargetException");
 369
 370         utf_java_security_PrivilegedActionException =
 371                 utf_new_char("java/security/PrivilegedActionException");
 372
 373 #if defined(ENABLE_JAVASE)
 374         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 375 #endif
 376
 377         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 378         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 379         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 380         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 381         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 382         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 383         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 384         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 385
 386 #if defined(ENABLE_JAVASE)
 387         utf_java_lang_StackTraceElement =
 388                 utf_new_char("java/lang/StackTraceElement");
 389
 390         utf_java_lang_reflect_Constructor =
 391                 utf_new_char("java/lang/reflect/Constructor");
 392
 393         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 394         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 395         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 396 #endif
 397
 398         utf_InnerClasses               = utf_new_char("InnerClasses");
 399         utf_ConstantValue              = utf_new_char("ConstantValue");
 400         utf_Code                       = utf_new_char("Code");
 401         utf_Exceptions                 = utf_new_char("Exceptions");
 402         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 403         utf_SourceFile                 = utf_new_char("SourceFile");
 404
 405 #if defined(ENABLE_JAVASE)
 406         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 407         utf_Signature                  = utf_new_char("Signature");
 408         utf_StackMapTable              = utf_new_char("StackMapTable");
 409
 410 #if defined(ENABLE_ANNOTATIONS)
 411         utf_sun_reflect_ConstantPool                = utf_new_char("sun/reflect/ConstantPool");
 412 #if defined(WITH_CLASSPATH_GNU)
 413         utf_sun_reflect_annotation_AnnotationParser = utf_new_char("sun/reflect/annotation/AnnotationParser");
 414 #endif
 415
 416         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 417         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 418         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 419         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 420         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 421 #endif
 422 #endif
 423
 424         utf_init                           = utf_new_char("<init>");
 425         utf_clinit                         = utf_new_char("<clinit>");
 426         utf_clone                      = utf_new_char("clone");
 427         utf_finalize                   = utf_new_char("finalize");
 428         utf_run                        = utf_new_char("run");
 429
 430         utf_add                        = utf_new_char("add");
 431         utf_remove                     = utf_new_char("remove");
 432         utf_addThread                  = utf_new_char("addThread");
 433         utf_removeThread               = utf_new_char("removeThread");
 434         utf_put                        = utf_new_char("put");
 435         utf_get                        = utf_new_char("get");
 436         utf_value                      = utf_new_char("value");
 437
 438         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 439         utf_findNative                 = utf_new_char("findNative");
 440         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 441         utf_initCause                  = utf_new_char("initCause");
 442         utf_loadClass                  = utf_new_char("loadClass");
 443         utf_printStackTrace            = utf_new_char("printStackTrace");
 444
 445         utf_division_by_zero           = utf_new_char("/ by zero");
 446
 447         utf_Z                          = utf_new_char("Z");
 448         utf_B                          = utf_new_char("B");
 449         utf_C                          = utf_new_char("C");
 450         utf_S                          = utf_new_char("S");
 451         utf_I                          = utf_new_char("I");
 452         utf_J                          = utf_new_char("J");
 453         utf_F                          = utf_new_char("F");
 454         utf_D                          = utf_new_char("D");
 455
 456         utf_void__void                 = utf_new_char("()V");
 457         utf_boolean__void              = utf_new_char("(Z)V");
 458         utf_byte__void                 = utf_new_char("(B)V");
 459         utf_char__void                 = utf_new_char("(C)V");
 460         utf_short__void                = utf_new_char("(S)V");
 461         utf_int__void                  = utf_new_char("(I)V");
 462         utf_long__void                 = utf_new_char("(J)V");
 463         utf_float__void                = utf_new_char("(F)V");
 464         utf_double__void               = utf_new_char("(D)V");
 465         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 466         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 467
 468         utf_void__java_lang_ClassLoader =
 469                 utf_new_char("()Ljava/lang/ClassLoader;");
 470
 471         utf_java_lang_ClassLoader_java_lang_String__J =
 472                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 473
 474         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 475
 476         utf_java_lang_Object__java_lang_Object =
 477                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 478
 479         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 480
 481         utf_java_lang_String__java_lang_Class =
 482                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 483
 484         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 485         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 486
 487         utf_java_lang_Throwable__java_lang_Throwable =
 488                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 489
 490         utf_null                       = utf_new_char("null");
 491         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 492         array_packagename              = utf_new_char("\t<the array package>");
 493
 494         /* everything's ok */
 495
 496         return true;
 497 }
 498
 499
 500 /* utf_hashkey *****************************************************************
 501
 502    The hashkey is computed from the utf-text by using up to 8
 503    characters.  For utf-symbols longer than 15 characters 3 characters
 504    are taken from the beginning and the end, 2 characters are taken
 505    from the middle.
 506
 507 *******************************************************************************/
 508
 509 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 510 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 511
 512 u4 utf_hashkey(const char *text, u4 length)
 513 {
 514         const char *start_pos = text;       /* pointer to utf text                */
 515         u4 a;
 516
 517         switch (length) {
 518         case 0: /* empty string */
 519                 return 0;
 520
 521         case 1: return fbs(0);
 522         case 2: return fbs(0) ^ nbs(3);
 523         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 524         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 525         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 526         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 527         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 528         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 529
 530         case 9:
 531                 a = fbs(0);
 532                 a ^= nbs(1);
 533                 a ^= nbs(2);
 534                 text++;
 535                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 536
 537         case 10:
 538                 a = fbs(0);
 539                 text++;
 540                 a ^= nbs(2);
 541                 a ^= nbs(3);
 542                 a ^= nbs(4);
 543                 text++;
 544                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 545
 546         case 11:
 547                 a = fbs(0);
 548                 text++;
 549                 a ^= nbs(2);
 550                 a ^= nbs(3);
 551                 a ^= nbs(4);
 552                 text++;
 553                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 554
 555         case 12:
 556                 a = fbs(0);
 557                 text += 2;
 558                 a ^= nbs(2);
 559                 a ^= nbs(3);
 560                 text++;
 561                 a ^= nbs(5);
 562                 a ^= nbs(6);
 563                 a ^= nbs(7);
 564                 text++;
 565                 return a ^ nbs(9) ^ nbs(10);
 566
 567         case 13:
 568                 a = fbs(0);
 569                 a ^= nbs(1);
 570                 text++;
 571                 a ^= nbs(3);
 572                 a ^= nbs(4);
 573                 text += 2;
 574                 a ^= nbs(7);
 575                 a ^= nbs(8);
 576                 text += 2;
 577                 return a ^ nbs(9) ^ nbs(10);
 578
 579         case 14:
 580                 a = fbs(0);
 581                 text += 2;
 582                 a ^= nbs(3);
 583                 a ^= nbs(4);
 584                 text += 2;
 585                 a ^= nbs(7);
 586                 a ^= nbs(8);
 587                 text += 2;
 588                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 589
 590         case 15:
 591                 a = fbs(0);
 592                 text += 2;
 593                 a ^= nbs(3);
 594                 a ^= nbs(4);
 595                 text += 2;
 596                 a ^= nbs(7);
 597                 a ^= nbs(8);
 598                 text += 2;
 599                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 600
 601         default:  /* 3 characters from beginning */
 602                 a = fbs(0);
 603                 text += 2;
 604                 a ^= nbs(3);
 605                 a ^= nbs(4);
 606
 607                 /* 2 characters from middle */
 608                 text = start_pos + (length / 2);
 609                 a ^= fbs(5);
 610                 text += 2;
 611                 a ^= nbs(6);
 612
 613                 /* 3 characters from end */
 614                 text = start_pos + length - 4;
 615
 616                 a ^= fbs(7);
 617                 text++;
 618
 619                 return a ^ nbs(10) ^ nbs(11);
 620     }
 621 }
 622
 623 /* utf_full_hashkey ************************************************************
 624
 625    This function computes a hash value using all bytes in the string.
 626
 627    The algorithm is the "One-at-a-time" algorithm as published
 628    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 629
 630 *******************************************************************************/
 631
 632 u4 utf_full_hashkey(const char *text, u4 length)
 633 {
 634         register const unsigned char *p = (const unsigned char *) text;
 635         register u4 hash;
 636         register u4 i;
 637
 638         hash = 0;
 639         for (i=length; i--;)
 640         {
 641             hash += *p++;
 642             hash += (hash << 10);
 643             hash ^= (hash >> 6);
 644         }
 645         hash += (hash << 3);
 646         hash ^= (hash >> 11);
 647         hash += (hash << 15);
 648
 649         return hash;
 650 }
 651
 652 /* unicode_hashkey *************************************************************
 653
 654    Compute the hashkey of a unicode string.
 655
 656 *******************************************************************************/
 657
 658 u4 unicode_hashkey(u2 *text, u2 len)
 659 {
 660         return utf_hashkey((char *) text, len);
 661 }
 662
 663
 664 /* utf_new *********************************************************************
 665
 666    Creates a new utf-symbol, the text of the symbol is passed as a
 667    u1-array. The function searches the utf-hashtable for a utf-symbol
 668    with this text. On success the element returned, otherwise a new
 669    hashtable element is created.
 670
 671    If the number of entries in the hashtable exceeds twice the size of
 672    the hashtable slots a reorganization of the hashtable is done and
 673    the utf symbols are copied to a new hashtable with doubled size.
 674
 675 *******************************************************************************/
 676
 677 utf *utf_new(const char *text, u2 length)
 678 {
 679         u4 key;                             /* hashkey computed from utf-text     */
 680         u4 slot;                            /* slot in hashtable                  */
 681         utf *u;                             /* hashtable element                  */
 682         u2 i;
 683
 684         LOCK_MONITOR_ENTER(hashtable_utf->header);
 685
 686 #if defined(ENABLE_STATISTICS)
 687         if (opt_stat)
 688                 count_utf_new++;
 689 #endif
 690
 691         key  = utf_hashkey(text, length);
 692         slot = key & (hashtable_utf->size - 1);
 693         u    = hashtable_utf->ptr[slot];
 694
 695         /* search external hash chain for utf-symbol */
 696
 697         while (u) {
 698                 if (u->blength == length) {
 699                         /* compare text of hashtable elements */
 700
 701                         for (i = 0; i < length; i++)
 702                                 if (text[i] != u->text[i])
 703                                         goto nomatch;
 704
 705 #if defined(ENABLE_STATISTICS)
 706                         if (opt_stat)
 707                                 count_utf_new_found++;
 708 #endif
 709
 710                         /* symbol found in hashtable */
 711
 712                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 713
 714                         return u;
 715                 }
 716
 717         nomatch:
 718                 u = u->hashlink; /* next element in external chain */
 719         }
 720
 721         /* location in hashtable found, create new utf element */
 722
 723         u = NEW(utf);
 724
 725         u->blength  = length;               /* length in bytes of utfstring       */
 726         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 727         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 728
 729         memcpy(u->text, text, length);      /* copy utf-text                      */
 730         u->text[length] = '\0';
 731
 732 #if defined(ENABLE_STATISTICS)
 733         if (opt_stat)
 734                 count_utf_len += sizeof(utf) + length + 1;
 735 #endif
 736
 737         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 738         hashtable_utf->entries++;           /* update number of entries           */
 739
 740         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 741
 742         /* reorganization of hashtable, average length of the external
 743            chains is approx. 2 */
 744
 745                 hashtable *newhash;                              /* the new hashtable */
 746                 u4         i;
 747                 utf       *u;
 748                 utf       *nextu;
 749                 u4         slot;
 750
 751                 /* create new hashtable, double the size */
 752
 753                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 754
 755 #if defined(ENABLE_STATISTICS)
 756                 if (opt_stat)
 757                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 758 #endif
 759
 760                 /* transfer elements to new hashtable */
 761
 762                 for (i = 0; i < hashtable_utf->size; i++) {
 763                         u = hashtable_utf->ptr[i];
 764
 765                         while (u) {
 766                                 nextu = u->hashlink;
 767                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 768
 769                                 u->hashlink = (utf *) newhash->ptr[slot];
 770                                 newhash->ptr[slot] = u;
 771
 772                                 /* follow link in external hash chain */
 773
 774                                 u = nextu;
 775                         }
 776                 }
 777
 778                 /* dispose old table */
 779
 780                 hashtable_free(hashtable_utf);
 781
 782                 hashtable_utf = newhash;
 783         }
 784
 785         LOCK_MONITOR_EXIT(hashtable_utf->header);
 786
 787         return u;
 788 }
 789
 790
 791 /* utf_new_u2 ******************************************************************
 792
 793    Make utf symbol from u2 array, if isclassname is true '.' is
 794    replaced by '/'.
 795
 796 *******************************************************************************/
 797
 798 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 799 {
 800         char *buffer;                   /* memory buffer for  unicode characters  */
 801         char *pos;                      /* pointer to current position in buffer  */
 802         u4 left;                        /* unicode characters left                */
 803         u4 buflength;                   /* utf length in bytes of the u2 array    */
 804         utf *result;                    /* resulting utf-string                   */
 805         int i;
 806
 807         /* determine utf length in bytes and allocate memory */
 808
 809         buflength = u2_utflength(unicode_pos, unicode_length);
 810         buffer    = MNEW(char, buflength);
 811
 812         left = buflength;
 813         pos  = buffer;
 814
 815         for (i = 0; i++ < unicode_length; unicode_pos++) {
 816                 /* next unicode character */
 817                 u2 c = *unicode_pos;
 818
 819                 if ((c != 0) && (c < 0x80)) {
 820                         /* 1 character */
 821                         left--;
 822                 if ((int) left < 0) break;
 823                         /* convert classname */
 824                         if (isclassname && c == '.')
 825                                 *pos++ = '/';
 826                         else
 827                                 *pos++ = (char) c;
 828
 829                 } else if (c < 0x800) {
 830                         /* 2 characters */
 831                 unsigned char high = c >> 6;
 832                 unsigned char low  = c & 0x3F;
 833                         left = left - 2;
 834                 if ((int) left < 0) break;
 835                 *pos++ = high | 0xC0;
 836                 *pos++ = low  | 0x80;
 837
 838                 } else {
 839                 /* 3 characters */
 840                 char low  = c & 0x3f;
 841                 char mid  = (c >> 6) & 0x3F;
 842                 char high = c >> 12;
 843                         left = left - 3;
 844                 if ((int) left < 0) break;
 845                 *pos++ = high | 0xE0;
 846                 *pos++ = mid  | 0x80;
 847                 *pos++ = low  | 0x80;
 848                 }
 849         }
 850
 851         /* insert utf-string into symbol-table */
 852         result = utf_new(buffer,buflength);
 853
 854         MFREE(buffer, char, buflength);
 855
 856         return result;
 857 }
 858
 859
 860 /* utf_new_char ****************************************************************
 861
 862    Creates a new utf symbol, the text for this symbol is passed as a
 863    c-string ( = char* ).
 864
 865 *******************************************************************************/
 866
 867 utf *utf_new_char(const char *text)
 868 {
 869         return utf_new(text, strlen(text));
 870 }
 871
 872
 873 /* utf_new_char_classname ******************************************************
 874
 875    Creates a new utf symbol, the text for this symbol is passed as a
 876    c-string ( = char* ) "." characters are going to be replaced by
 877    "/". Since the above function is used often, this is a separte
 878    function, instead of an if.
 879
 880 *******************************************************************************/
 881
 882 utf *utf_new_char_classname(const char *text)
 883 {
 884         if (strchr(text, '.')) {
 885                 char *txt = strdup(text);
 886                 char *end = txt + strlen(txt);
 887                 char *c;
 888                 utf *tmpRes;
 889
 890                 for (c = txt; c < end; c++)
 891                         if (*c == '.') *c = '/';
 892
 893                 tmpRes = utf_new(txt, strlen(txt));
 894                 FREE(txt, 0);
 895
 896                 return tmpRes;
 897
 898         } else
 899                 return utf_new(text, strlen(text));
 900 }
 901
 902
 903 /* utf_nextu2 ******************************************************************
 904
 905    Read the next unicode character from the utf string and increment
 906    the utf-string pointer accordingly.
 907
 908    CAUTION: This function is unsafe for input that was not checked
 909             by is_valid_utf!
 910
 911 *******************************************************************************/
 912
 913 u2 utf_nextu2(char **utf_ptr)
 914 {
 915     /* uncompressed unicode character */
 916     u2 unicode_char = 0;
 917     /* current position in utf text */
 918     unsigned char *utf = (unsigned char *) (*utf_ptr);
 919     /* bytes representing the unicode character */
 920     unsigned char ch1, ch2, ch3;
 921     /* number of bytes used to represent the unicode character */
 922     int len = 0;
 923
 924     switch ((ch1 = utf[0]) >> 4) {
 925         default: /* 1 byte */
 926                 (*utf_ptr)++;
 927                 return (u2) ch1;
 928         case 0xC:
 929         case 0xD: /* 2 bytes */
 930                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 931                         unsigned char high = ch1 & 0x1F;
 932                         unsigned char low  = ch2 & 0x3F;
 933                         unicode_char = (high << 6) + low;
 934                         len = 2;
 935                 }
 936                 break;
 937
 938         case 0xE: /* 2 or 3 bytes */
 939                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 940                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 941                                 unsigned char low  = ch3 & 0x3f;
 942                                 unsigned char mid  = ch2 & 0x3f;
 943                                 unsigned char high = ch1 & 0x0f;
 944                                 unicode_char = (((high << 6) + mid) << 6) + low;
 945                                 len = 3;
 946                         } else
 947                                 len = 2;
 948                 }
 949                 break;
 950     }
 951
 952     /* update position in utf-text */
 953     *utf_ptr = (char *) (utf + len);
 954
 955     return unicode_char;
 956 }
 957
 958
 959 /* utf_bytes *******************************************************************
 960
 961    Determine number of bytes (aka. octets) in the utf string.
 962
 963    IN:
 964       u............utf string
 965
 966    OUT:
 967       The number of octets of this utf string.
 968           There is _no_ terminating zero included in this count.
 969
 970 *******************************************************************************/
 971
 972 u4 utf_bytes(utf *u)
 973 {
 974         return u->blength;
 975 }
 976
 977
 978 /* utf_get_number_of_u2s_for_buffer ********************************************
 979
 980    Determine number of UTF-16 u2s in the given UTF-8 buffer
 981
 982    CAUTION: This function is unsafe for input that was not checked
 983             by is_valid_utf!
 984
 985    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 986    to an array of u2s (UTF-16) and want to know how many of them you will get.
 987    All other uses of this function are probably wrong.
 988
 989    IN:
 990       buffer........points to first char in buffer
 991           blength.......number of _bytes_ in the buffer
 992
 993    OUT:
 994       the number of u2s needed to hold this string in UTF-16 encoding.
 995           There is _no_ terminating zero included in this count.
 996
 997    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 998    exception.
 999
1000 *******************************************************************************/
1001
1002 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1003 {
1004         const char *endpos;                 /* points behind utf string           */
1005         const char *utf_ptr;                /* current position in utf text       */
1006         u4 len = 0;                         /* number of unicode characters       */
1007
1008         utf_ptr = buffer;
1009         endpos = utf_ptr + blength;
1010
1011         while (utf_ptr < endpos) {
1012                 len++;
1013                 /* next unicode character */
1014                 utf_nextu2((char **)&utf_ptr);
1015         }
1016
1017         assert(utf_ptr == endpos);
1018
1019         return len;
1020 }
1021
1022
1023 /* utf_get_number_of_u2s *******************************************************
1024
1025    Determine number of UTF-16 u2s in the utf string.
1026
1027    CAUTION: This function is unsafe for input that was not checked
1028             by is_valid_utf!
1029
1030    CAUTION: Use this function *only* when you want to convert a utf string
1031    to an array of u2s and want to know how many of them you will get.
1032    All other uses of this function are probably wrong.
1033
1034    IN:
1035       u............utf string
1036
1037    OUT:
1038       the number of u2s needed to hold this string in UTF-16 encoding.
1039           There is _no_ terminating zero included in this count.
1040           XXX 0 if a NullPointerException has been thrown (see below)
1041
1042 *******************************************************************************/
1043
1044 u4 utf_get_number_of_u2s(utf *u)
1045 {
1046         char *endpos;                       /* points behind utf string           */
1047         char *utf_ptr;                      /* current position in utf text       */
1048         u4 len = 0;                         /* number of unicode characters       */
1049
1050         /* XXX this is probably not checked by most callers! Review this after */
1051         /* the invalid uses of this function have been eliminated */
1052         if (u == NULL) {
1053                 exceptions_throw_nullpointerexception();
1054                 return 0;
1055         }
1056
1057         endpos = UTF_END(u);
1058         utf_ptr = u->text;
1059
1060         while (utf_ptr < endpos) {
1061                 len++;
1062                 /* next unicode character */
1063                 utf_nextu2(&utf_ptr);
1064         }
1065
1066         if (utf_ptr != endpos) {
1067                 /* string ended abruptly */
1068                 exceptions_throw_internalerror("Illegal utf8 string");
1069                 return 0;
1070         }
1071
1072         return len;
1073 }
1074
1075
1076 /* utf8_safe_number_of_u2s *****************************************************
1077
1078    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1079    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1080
1081    This function is safe even for invalid UTF-8 strings.
1082
1083    IN:
1084       text..........zero-terminated(!) UTF-8 string (may be invalid)
1085                         must NOT be NULL
1086           nbytes........strlen(text). (This is needed to completely emulate
1087                         the RI).
1088
1089    OUT:
1090       the number of u2s needed to hold this string in UTF-16 encoding.
1091           There is _no_ terminating zero included in this count.
1092
1093 *******************************************************************************/
1094
1095 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1096         register const unsigned char *t;
1097         register s4 byte;
1098         register s4 len;
1099         register const unsigned char *tlimit;
1100         s4 byte1;
1101         s4 byte2;
1102         s4 byte3;
1103         s4 value;
1104         s4 skip;
1105
1106         assert(text);
1107         assert(nbytes >= 0);
1108
1109         len = 0;
1110         t = (const unsigned char *) text;
1111         tlimit = t + nbytes;
1112
1113         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1114
1115         while (1) {
1116                 byte = *t++;
1117
1118                 if (byte & 0x80) {
1119                         /* highest bit set, non-ASCII character */
1120
1121                         if ((byte & 0xe0) == 0xc0) {
1122                                 /* 2-byte: should be 110..... 10...... ? */
1123
1124                                 if ((*t++ & 0xc0) == 0x80)
1125                                         ; /* valid 2-byte */
1126                                 else
1127                                         t--; /* invalid */
1128                         }
1129                         else if ((byte & 0xf0) == 0xe0) {
1130                                 /* 3-byte: should be 1110.... 10...... 10...... */
1131                                 /*                            ^t                */
1132
1133                                 if (t + 2 > tlimit)
1134                                         return len + 1; /* invalid, stop here */
1135
1136                                 if ((*t++ & 0xc0) == 0x80) {
1137                                         if ((*t++ & 0xc0) == 0x80)
1138                                                 ; /* valid 3-byte */
1139                                         else
1140                                                 t--; /* invalid */
1141                                 }
1142                                 else
1143                                         t--; /* invalid */
1144                         }
1145                         else if ((byte & 0xf8) == 0xf0) {
1146                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1147                                 /*                            ^t                         */
1148
1149                                 if (t + 3 > tlimit)
1150                                         return len + 1; /* invalid, stop here */
1151
1152                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1153                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1154                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1155                                                         /* valid 4-byte UTF-8? */
1156                                                         value = ((byte  & 0x07) << 18)
1157                                                                   | ((byte1 & 0x3f) << 12)
1158                                                                   | ((byte2 & 0x3f) <<  6)
1159                                                                   | ((byte3 & 0x3f)      );
1160
1161                                                         if (value > 0x10FFFF)
1162                                                                 ; /* invalid */
1163                                                         else if (value > 0xFFFF)
1164                                                                 len += 1; /* we need surrogates */
1165                                                         else
1166                                                                 ; /* 16bit suffice */
1167                                                 }
1168                                                 else
1169                                                         t--; /* invalid */
1170                                         }
1171                                         else
1172                                                 t--; /* invalid */
1173                                 }
1174                                 else
1175                                         t--; /* invalid */
1176                         }
1177                         else if ((byte & 0xfc) == 0xf8) {
1178                                 /* invalid 5-byte */
1179                                 if (t + 4 > tlimit)
1180                                         return len + 1; /* invalid, stop here */
1181
1182                                 skip = 4;
1183                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1184                                         t++;
1185                         }
1186                         else if ((byte & 0xfe) == 0xfc) {
1187                                 /* invalid 6-byte */
1188                                 if (t + 5 > tlimit)
1189                                         return len + 1; /* invalid, stop here */
1190
1191                                 skip = 5;
1192                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1193                                         t++;
1194                         }
1195                         else
1196                                 ; /* invalid */
1197                 }
1198                 else {
1199                         /* NUL */
1200
1201                         if (byte == 0)
1202                                 break;
1203
1204                         /* ASCII character, common case */
1205                 }
1206
1207                 len++;
1208         }
1209
1210         return len;
1211 }
1212
1213
1214 /* utf8_safe_convert_to_u2s ****************************************************
1215
1216    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1217    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1218    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1219
1220    This function is safe even for invalid UTF-8 strings.
1221
1222    IN:
1223       text..........zero-terminated(!) UTF-8 string (may be invalid)
1224                         must NOT be NULL
1225           nbytes........strlen(text). (This is needed to completely emulate
1226                                         the RI).
1227           buffer........a preallocated array of u2s to receive the decoded
1228                         string. Use utf8_safe_number_of_u2s to get the
1229                                         required number of u2s for allocating this.
1230
1231 *******************************************************************************/
1232
1233 #define UNICODE_REPLACEMENT  0xfffd
1234
1235 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1236         register const unsigned char *t;
1237         register s4 byte;
1238         register const unsigned char *tlimit;
1239         s4 byte1;
1240         s4 byte2;
1241         s4 byte3;
1242         s4 value;
1243         s4 skip;
1244
1245         assert(text);
1246         assert(nbytes >= 0);
1247
1248         t = (const unsigned char *) text;
1249         tlimit = t + nbytes;
1250
1251         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1252
1253         while (1) {
1254                 byte = *t++;
1255
1256                 if (byte & 0x80) {
1257                         /* highest bit set, non-ASCII character */
1258
1259                         if ((byte & 0xe0) == 0xc0) {
1260                                 /* 2-byte: should be 110..... 10...... */
1261
1262                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1263                                         /* valid 2-byte UTF-8 */
1264                                         *buffer++ = ((byte  & 0x1f) << 6)
1265                                                           | ((byte1 & 0x3f)     );
1266                                 }
1267                                 else {
1268                                         *buffer++ = UNICODE_REPLACEMENT;
1269                                         t--;
1270                                 }
1271                         }
1272                         else if ((byte & 0xf0) == 0xe0) {
1273                                 /* 3-byte: should be 1110.... 10...... 10...... */
1274
1275                                 if (t + 2 > tlimit) {
1276                                         *buffer++ = UNICODE_REPLACEMENT;
1277                                         return;
1278                                 }
1279
1280                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1281                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1282                                                 /* valid 3-byte UTF-8 */
1283                                                 *buffer++ = ((byte  & 0x0f) << 12)
1284                                                                   | ((byte1 & 0x3f) <<  6)
1285                                                                   | ((byte2 & 0x3f)      );
1286                                         }
1287                                         else {
1288                                                 *buffer++ = UNICODE_REPLACEMENT;
1289                                                 t--;
1290                                         }
1291                                 }
1292                                 else {
1293                                         *buffer++ = UNICODE_REPLACEMENT;
1294                                         t--;
1295                                 }
1296                         }
1297                         else if ((byte & 0xf8) == 0xf0) {
1298                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1299
1300                                 if (t + 3 > tlimit) {
1301                                         *buffer++ = UNICODE_REPLACEMENT;
1302                                         return;
1303                                 }
1304
1305                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1306                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1307                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1308                                                         /* valid 4-byte UTF-8? */
1309                                                         value = ((byte  & 0x07) << 18)
1310                                                                   | ((byte1 & 0x3f) << 12)
1311                                                                   | ((byte2 & 0x3f) <<  6)
1312                                                                   | ((byte3 & 0x3f)      );
1313
1314                                                         if (value > 0x10FFFF) {
1315                                                                 *buffer++ = UNICODE_REPLACEMENT;
1316                                                         }
1317                                                         else if (value > 0xFFFF) {
1318                                                                 /* we need surrogates */
1319                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1320                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1321                                                         }
1322                                                         else
1323                                                                 *buffer++ = value; /* 16bit suffice */
1324                                                 }
1325                                                 else {
1326                                                         *buffer++ = UNICODE_REPLACEMENT;
1327                                                         t--;
1328                                                 }
1329                                         }
1330                                         else {
1331                                                 *buffer++ = UNICODE_REPLACEMENT;
1332                                                 t--;
1333                                         }
1334                                 }
1335                                 else {
1336                                         *buffer++ = UNICODE_REPLACEMENT;
1337                                         t--;
1338                                 }
1339                         }
1340                         else if ((byte & 0xfc) == 0xf8) {
1341                                 if (t + 4 > tlimit) {
1342                                         *buffer++ = UNICODE_REPLACEMENT;
1343                                         return;
1344                                 }
1345
1346                                 skip = 4;
1347                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1348                                         t++;
1349                                 *buffer++ = UNICODE_REPLACEMENT;
1350                         }
1351                         else if ((byte & 0xfe) == 0xfc) {
1352                                 if (t + 5 > tlimit) {
1353                                         *buffer++ = UNICODE_REPLACEMENT;
1354                                         return;
1355                                 }
1356
1357                                 skip = 5;
1358                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1359                                         t++;
1360                                 *buffer++ = UNICODE_REPLACEMENT;
1361                         }
1362                         else
1363                                 *buffer++ = UNICODE_REPLACEMENT;
1364                 }
1365                 else {
1366                         /* NUL */
1367
1368                         if (byte == 0)
1369                                 break;
1370
1371                         /* ASCII character, common case */
1372
1373                         *buffer++ = byte;
1374                 }
1375         }
1376 }
1377
1378
1379 /* u2_utflength ****************************************************************
1380
1381    Returns the utf length in bytes of a u2 array.
1382
1383 *******************************************************************************/
1384
1385 u4 u2_utflength(u2 *text, u4 u2_length)
1386 {
1387         u4 result_len = 0;                  /* utf length in bytes                */
1388         u2 ch;                              /* current unicode character          */
1389         u4 len;
1390
1391         for (len = 0; len < u2_length; len++) {
1392                 /* next unicode character */
1393                 ch = *text++;
1394
1395                 /* determine bytes required to store unicode character as utf */
1396                 if (ch && (ch < 0x80))
1397                         result_len++;
1398                 else if (ch < 0x800)
1399                         result_len += 2;
1400                 else
1401                         result_len += 3;
1402         }
1403
1404     return result_len;
1405 }
1406
1407
1408 /* utf_copy ********************************************************************
1409
1410    Copy the given utf string byte-for-byte to a buffer.
1411
1412    IN:
1413       buffer.......the buffer
1414           u............the utf string
1415
1416 *******************************************************************************/
1417
1418 void utf_copy(char *buffer, utf *u)
1419 {
1420         /* our utf strings are zero-terminated (done by utf_new) */
1421         MCOPY(buffer, u->text, char, u->blength + 1);
1422 }
1423
1424
1425 /* utf_cat *********************************************************************
1426
1427    Append the given utf string byte-for-byte to a buffer.
1428
1429    IN:
1430       buffer.......the buffer
1431           u............the utf string
1432
1433 *******************************************************************************/
1434
1435 void utf_cat(char *buffer, utf *u)
1436 {
1437         /* our utf strings are zero-terminated (done by utf_new) */
1438         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1439 }
1440
1441
1442 /* utf_copy_classname **********************************************************
1443
1444    Copy the given utf classname byte-for-byte to a buffer.
1445    '/' is replaced by '.'
1446
1447    IN:
1448       buffer.......the buffer
1449           u............the utf string
1450
1451 *******************************************************************************/
1452
1453 void utf_copy_classname(char *buffer, utf *u)
1454 {
1455         char *bufptr;
1456         char *srcptr;
1457         char *endptr;
1458         char ch;
1459
1460         bufptr = buffer;
1461         srcptr = u->text;
1462         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1463
1464         while (srcptr != endptr) {
1465                 ch = *srcptr++;
1466                 if (ch == '/')
1467                         ch = '.';
1468                 *bufptr++ = ch;
1469         }
1470 }
1471
1472
/* utf_cat_classname ***********************************************************
1474
1475    Append the given utf classname byte-for-byte to a buffer.
1476    '/' is replaced by '.'
1477
1478    IN:
1479       buffer.......the buffer
1480           u............the utf string
1481
1482 *******************************************************************************/
1483
1484 void utf_cat_classname(char *buffer, utf *u)
1485 {
1486         utf_copy_classname(buffer + strlen(buffer), u);
1487 }
1488
1489 /* utf_display_printable_ascii *************************************************
1490
1491    Write utf symbol to stdout (for debugging purposes).
1492    Non-printable and non-ASCII characters are printed as '?'.
1493
1494 *******************************************************************************/
1495
1496 void utf_display_printable_ascii(utf *u)
1497 {
1498         char *endpos;                       /* points behind utf string           */
1499         char *utf_ptr;                      /* current position in utf text       */
1500
1501         if (u == NULL) {
1502                 printf("NULL");
1503                 fflush(stdout);
1504                 return;
1505         }
1506
1507         endpos = UTF_END(u);
1508         utf_ptr = u->text;
1509
1510         while (utf_ptr < endpos) {
1511                 /* read next unicode character */
1512
1513                 u2 c = utf_nextu2(&utf_ptr);
1514
1515                 if ((c >= 32) && (c <= 127))
1516                         printf("%c", c);
1517                 else
1518                         printf("?");
1519         }
1520
1521         fflush(stdout);
1522 }
1523
1524
1525 /* utf_display_printable_ascii_classname ***************************************
1526
1527    Write utf symbol to stdout with `/' converted to `.' (for debugging
1528    purposes).
1529    Non-printable and non-ASCII characters are printed as '?'.
1530
1531 *******************************************************************************/
1532
1533 void utf_display_printable_ascii_classname(utf *u)
1534 {
1535         char *endpos;                       /* points behind utf string           */
1536         char *utf_ptr;                      /* current position in utf text       */
1537
1538         if (u == NULL) {
1539                 printf("NULL");
1540                 fflush(stdout);
1541                 return;
1542         }
1543
1544         endpos = UTF_END(u);
1545         utf_ptr = u->text;
1546
1547         while (utf_ptr < endpos) {
1548                 /* read next unicode character */
1549
1550                 u2 c = utf_nextu2(&utf_ptr);
1551
1552                 if (c == '/')
1553                         c = '.';
1554
1555                 if ((c >= 32) && (c <= 127))
1556                         printf("%c", c);
1557                 else
1558                         printf("?");
1559         }
1560
1561         fflush(stdout);
1562 }
1563
1564
1565 /* utf_sprint_convert_to_latin1 ************************************************
1566
1567    Write utf symbol into c-string (for debugging purposes).
1568    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1569    invalid results.
1570
1571 *******************************************************************************/
1572
1573 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1574 {
1575         char *endpos;                       /* points behind utf string           */
1576         char *utf_ptr;                      /* current position in utf text       */
1577         u2 pos = 0;                         /* position in c-string               */
1578
1579         if (!u) {
1580                 strcpy(buffer, "NULL");
1581                 return;
1582         }
1583
1584         endpos = UTF_END(u);
1585         utf_ptr = u->text;
1586
1587         while (utf_ptr < endpos)
1588                 /* copy next unicode character */
1589                 buffer[pos++] = utf_nextu2(&utf_ptr);
1590
1591         /* terminate string */
1592         buffer[pos] = '\0';
1593 }
1594
1595
1596 /* utf_sprint_convert_to_latin1_classname **************************************
1597
1598    Write utf symbol into c-string with `/' converted to `.' (for debugging
1599    purposes).
1600    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1601    invalid results.
1602
1603 *******************************************************************************/
1604
1605 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1606 {
1607         char *endpos;                       /* points behind utf string           */
1608         char *utf_ptr;                      /* current position in utf text       */
1609         u2 pos = 0;                         /* position in c-string               */
1610
1611         if (!u) {
1612                 strcpy(buffer, "NULL");
1613                 return;
1614         }
1615
1616         endpos = UTF_END(u);
1617         utf_ptr = u->text;
1618
1619         while (utf_ptr < endpos) {
1620                 /* copy next unicode character */
1621                 u2 c = utf_nextu2(&utf_ptr);
1622                 if (c == '/') c = '.';
1623                 buffer[pos++] = c;
1624         }
1625
1626         /* terminate string */
1627         buffer[pos] = '\0';
1628 }
1629
1630
1631 /* utf_strcat_convert_to_latin1 ************************************************
1632
1633    Like libc strcat, but uses an utf8 string.
1634    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1635    invalid results.
1636
1637 *******************************************************************************/
1638
1639 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1640 {
1641         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1642 }
1643
1644
1645 /* utf_strcat_convert_to_latin1_classname **************************************
1646
1647    Like libc strcat, but uses an utf8 string.
1648    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1649    invalid results.
1650
1651 *******************************************************************************/
1652
1653 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1654 {
1655         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1656 }
1657
1658
1659 /* utf_fprint_printable_ascii **************************************************
1660
1661    Write utf symbol into file.
1662    Non-printable and non-ASCII characters are printed as '?'.
1663
1664 *******************************************************************************/
1665
1666 void utf_fprint_printable_ascii(FILE *file, utf *u)
1667 {
1668         char *endpos;                       /* points behind utf string           */
1669         char *utf_ptr;                      /* current position in utf text       */
1670
1671         if (!u)
1672                 return;
1673
1674         endpos = UTF_END(u);
1675         utf_ptr = u->text;
1676
1677         while (utf_ptr < endpos) {
1678                 /* read next unicode character */
1679                 u2 c = utf_nextu2(&utf_ptr);
1680
1681                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1682                 else fprintf(file, "?");
1683         }
1684 }
1685
1686
1687 /* utf_fprint_printable_ascii_classname ****************************************
1688
1689    Write utf symbol into file with `/' converted to `.'.
1690    Non-printable and non-ASCII characters are printed as '?'.
1691
1692 *******************************************************************************/
1693
1694 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1695 {
1696         char *endpos;                       /* points behind utf string           */
1697         char *utf_ptr;                      /* current position in utf text       */
1698
1699     if (!u)
1700                 return;
1701
1702         endpos = UTF_END(u);
1703         utf_ptr = u->text;
1704
1705         while (utf_ptr < endpos) {
1706                 /* read next unicode character */
1707                 u2 c = utf_nextu2(&utf_ptr);
1708                 if (c == '/') c = '.';
1709
1710                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1711                 else fprintf(file, "?");
1712         }
1713 }
1714
1715
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string, as far as
   Java is concerned.

   Accepted are:
     - single bytes 0x01..0x7f (a raw 0x00 byte is rejected),
     - two-byte sequences   110x xxxx  10xx xxxx,
     - three-byte sequences 1110 xxxx  10xx xxxx  10xx xxxx.

   Longer sequences are rejected (Java limitation). The value zero must
   be encoded with two bytes (Java special); a three-byte encoding of
   zero is rejected.

   Deliberately NOT rejected, because Sun Java seems to allow them:
   other overlong encodings, surrogates (0xd800..0xdfff) and the
   codepoints 0xfffe and 0xffff.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
        const unsigned char *cur;           /* next byte to examine               */
        const unsigned char *limit;         /* points behind the last byte        */

        if (end_pos < utf_ptr)
                return false;

        cur   = (const unsigned char *) utf_ptr;
        limit = (const unsigned char *) end_pos;

        while (cur < limit) {
                unsigned char lead = *cur++;
                unsigned long value;            /* decoded character value        */
                int           trailing;         /* expected continuation bytes    */
                int           k;

                if (lead == 0x00)
                        return false;               /* 0x00 is not allowed            */

                if (lead < 0x80)
                        continue;                   /* ASCII                          */

                if ((lead & 0xe0) == 0xc0) {        /* 110x xxxx                      */
                        trailing = 1;
                        value    = lead & 0x1f;
                }
                else if ((lead & 0xf0) == 0xe0) {   /* 1110 xxxx                      */
                        trailing = 2;
                        value    = lead & 0x0f;
                }
                else {
                        /* stray continuation byte, a sequence longer than three
                           bytes (Java limitation) or an invalid leading byte */
                        return false;
                }

                if ((limit - cur) < trailing)
                        return false;               /* missing bytes                  */

                for (k = 0; k < trailing; k++) {
                        unsigned char follow = *cur++;

                        if ((follow & 0xc0) != 0x80)    /* 10xx xxxx                  */
                                return false;

                        value = (value << 6) | (follow & 0x3f);
                }

                /* Java special: zero is only valid in its two-byte form */

                if ((value == 0) && (trailing != 1))
                        return false;
        }

        return true;
}
1781
1782
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are examined.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        if (end_pos <= utf_ptr)
                return false;                   /* disallow empty names           */

        while (utf_ptr < end_pos) {
                unsigned char c = (unsigned char) *utf_ptr++;

                if (c < 0x20)
                        return false;               /* disallow control characters    */

                /* Disallow zero (0xc0 0x80). The second byte is only looked
                   at when it still belongs to the string, so a 0xc0 in the
                   last position never causes a read behind end_pos. */

                if ((c == 0xc0) && (utf_ptr < end_pos)
                        && ((unsigned char) *utf_ptr == 0x80))
                        return false;
        }

        return true;
}
1810
1811 bool is_valid_name_utf(utf *u)
1812 {
1813         return is_valid_name(u->text, UTF_END(u));
1814 }
1815
1816
1817 /* utf_show ********************************************************************
1818
1819    Writes the utf symbols in the utfhash to stdout and displays the
1820    number of external hash chains grouped according to the chainlength
1821    (for debugging purposes).
1822
1823 *******************************************************************************/
1824
1825 #if !defined(NDEBUG)
1826 void utf_show(void)
1827 {
1828
1829 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1830
1831         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1832         u4 max_chainlength = 0;      /* maximum length of the chains */
1833         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1834         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1835         u4 i;
1836
1837         printf("UTF-HASH:\n");
1838
1839         /* show element of utf-hashtable */
1840
1841         for (i = 0; i < hashtable_utf->size; i++) {
1842                 utf *u = hashtable_utf->ptr[i];
1843
1844                 if (u) {
1845                         printf("SLOT %d: ", (int) i);
1846
1847                         while (u) {
1848                                 printf("'");
1849                                 utf_display_printable_ascii(u);
1850                                 printf("' ");
1851                                 u = u->hashlink;
1852                         }
1853                         printf("\n");
1854                 }
1855         }
1856
1857         printf("UTF-HASH: %d slots for %d entries\n",
1858                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1859
1860         if (hashtable_utf->entries == 0)
1861                 return;
1862
1863         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1864
1865         for (i=0;i<CHAIN_LIMIT;i++)
1866                 chain_count[i]=0;
1867
1868         /* count numbers of hashchains according to their length */
1869         for (i=0; i<hashtable_utf->size; i++) {
1870
1871                 utf *u = (utf*) hashtable_utf->ptr[i];
1872                 u4 chain_length = 0;
1873
1874                 /* determine chainlength */
1875                 while (u) {
1876                         u = u->hashlink;
1877                         chain_length++;
1878                 }
1879
1880                 /* update sum of all chainlengths */
1881                 sum_chainlength+=chain_length;
1882
1883                 /* determine the maximum length of the chains */
1884                 if (chain_length>max_chainlength)
1885                         max_chainlength = chain_length;
1886
1887                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1888                 if (chain_length>=CHAIN_LIMIT) {
1889                         beyond_limit+=chain_length;
1890                         chain_length=CHAIN_LIMIT-1;
1891                 }
1892
1893                 /* update number of hashchains of current length */
1894                 chain_count[chain_length]++;
1895         }
1896
1897         /* display results */
1898         for (i=1;i<CHAIN_LIMIT-1;i++)
1899                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1900
1901         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1902
1903
1904         printf("max. chainlength:%5d\n",max_chainlength);
1905
1906         /* avg. chainlength = sum of chainlengths / number of chains */
1907         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1908 }
1909 #endif /* !defined(NDEBUG) */
1910
1911
1912 /*
1913  * These are local overrides for various environment variables in Emacs.
1914  * Please do not remove this and leave it at the end of the file, where
1915  * Emacs will automagically detect them.
1916  * ---------------------------------------------------------------------
1917  * Local variables:
1918  * mode: c
1919  * indent-tabs-mode: t
1920  * c-basic-offset: 4
1921  * tab-width: 4
1922  * End:
1923  * vim:noexpandtab:sw=4:ts=4:
1924  */