src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25    $Id: utf8.c 8367 2007-08-20 20:26:16Z twisti $
  26
  27 */
  28
  29
  30 #include "config.h"
  31
  32 #include <string.h>
  33 #include <assert.h>
  34
  35 #include "vm/types.h"
  36
  37 #include "mm/memory.h"
  38
  39 #include "threads/lock-common.h"
  40
  41 #include "toolbox/hashtable.h"
  42
  43 #include "vm/exceptions.h"
  44
  45 #include "vmcore/options.h"
  46
  47 #if defined(ENABLE_STATISTICS)
  48 # include "vmcore/statistics.h"
  49 #endif
  50
  51 #include "vmcore/utf8.h"
  52
  53
   54 /* global variables ***********************************************************/
   55
   56 /* hashsize must be power of 2 (utf_new masks the hashkey with size - 1) */
   57
   58 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
   59
   60 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
   61
   62
   63 /* utf-symbols for pointer comparison of frequently used strings **************/
   64 /* (the symbols below are assigned in utf8_init)                               */
   65 utf *utf_java_lang_Object;
   66
   67 utf *utf_java_lang_Class;
   68 utf *utf_java_lang_ClassLoader;
   69 utf *utf_java_lang_Cloneable;
   70 utf *utf_java_lang_SecurityManager;
   71 utf *utf_java_lang_String;
   72 utf *utf_java_lang_System;
   73 utf *utf_java_lang_ThreadGroup;
   74 utf *utf_java_lang_ref_SoftReference;
   75 utf *utf_java_lang_ref_WeakReference;
   76 utf *utf_java_lang_ref_PhantomReference;
   77 utf *utf_java_io_Serializable;
   78
   79 utf *utf_java_lang_Throwable;
   80 utf *utf_java_lang_Error;
   81
   82 utf *utf_java_lang_AbstractMethodError; /* assigned only if ENABLE_JAVASE     */
   83 utf *utf_java_lang_ClassCircularityError;
   84 utf *utf_java_lang_ClassFormatError;
   85 utf *utf_java_lang_ExceptionInInitializerError;
   86 utf *utf_java_lang_IncompatibleClassChangeError;
   87 utf *utf_java_lang_InstantiationError;
   88 utf *utf_java_lang_InternalError;
   89 utf *utf_java_lang_LinkageError;
   90 utf *utf_java_lang_NoClassDefFoundError;
   91 utf *utf_java_lang_NoSuchFieldError;    /* assigned only if ENABLE_JAVASE     */
   92 utf *utf_java_lang_NoSuchMethodError;   /* assigned only if ENABLE_JAVASE     */
   93 utf *utf_java_lang_OutOfMemoryError;
   94 utf *utf_java_lang_UnsatisfiedLinkError;
   95 utf *utf_java_lang_UnsupportedClassVersionError;
   96 utf *utf_java_lang_VerifyError;
   97 utf *utf_java_lang_VirtualMachineError;
   98
   99 #if defined(WITH_CLASSPATH_GNU)
  100 utf *utf_java_lang_VMThrowable;
  101 #endif
  102
  103 utf *utf_java_lang_Exception;
  104
  105 utf *utf_java_lang_ArithmeticException;
  106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
  107 utf *utf_java_lang_ArrayStoreException;
  108 utf *utf_java_lang_ClassCastException;
  109 utf *utf_java_lang_ClassNotFoundException;
  110 utf *utf_java_lang_CloneNotSupportedException;
  111 utf *utf_java_lang_IllegalAccessException;
  112 utf *utf_java_lang_IllegalArgumentException;
  113 utf *utf_java_lang_IllegalMonitorStateException;
  114 utf *utf_java_lang_InstantiationException;
  115 utf *utf_java_lang_InterruptedException;
  116 utf *utf_java_lang_NegativeArraySizeException;
  117 utf *utf_java_lang_NullPointerException;
  118 utf *utf_java_lang_StringIndexOutOfBoundsException;
  119
  120 utf *utf_java_lang_reflect_InvocationTargetException;
  121
  122 utf *utf_java_security_PrivilegedActionException;
  123
  124 #if defined(ENABLE_JAVASE)
  125 utf* utf_java_lang_Void;
  126 #endif
  127
  128 utf* utf_java_lang_Boolean;
  129 utf* utf_java_lang_Byte;
  130 utf* utf_java_lang_Character;
  131 utf* utf_java_lang_Short;
  132 utf* utf_java_lang_Integer;
  133 utf* utf_java_lang_Long;
  134 utf* utf_java_lang_Float;
  135 utf* utf_java_lang_Double;
  136
  137 #if defined(ENABLE_JAVASE)
  138 utf *utf_java_lang_StackTraceElement;
  139 utf *utf_java_lang_reflect_Constructor;
  140 utf *utf_java_lang_reflect_Field;
  141 utf *utf_java_lang_reflect_Method;
  142 utf *utf_java_util_Vector;
  143 #endif
  144
  145 utf *utf_InnerClasses;                  /* InnerClasses                       */
  146 utf *utf_ConstantValue;                 /* ConstantValue                      */
  147 utf *utf_Code;                          /* Code                               */
  148 utf *utf_Exceptions;                    /* Exceptions                         */
  149 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  150 utf *utf_SourceFile;                    /* SourceFile                         */
  151
  152 #if defined(ENABLE_JAVASE)
  153 utf *utf_EnclosingMethod;               /* EnclosingMethod                    */
  154 utf *utf_Signature;                     /* Signature                          */
  155 utf *utf_StackMapTable;                 /* StackMapTable                      */
  156
  157 #if defined(ENABLE_ANNOTATIONS)
  158 utf *utf_sun_reflect_ConstantPool;      /* sun/reflect/ConstantPool           */
  159 #if defined(WITH_CLASSPATH_GNU)
  160 utf *utf_sun_reflect_annotation_AnnotationParser;
  161 #endif
  162
  163 utf *utf_RuntimeVisibleAnnotations;
  164 utf *utf_RuntimeInvisibleAnnotations;
  165 utf *utf_RuntimeVisibleParameterAnnotations;
  166 utf *utf_RuntimeInvisibleParameterAnnotations;
  167 utf *utf_AnnotationDefault;
  168 #endif
  169 #endif
  170
  171 utf *utf_init;                          /* <init>                             */
  172 utf *utf_clinit;                        /* <clinit>                           */
  173 utf *utf_clone;                         /* clone                              */
  174 utf *utf_finalize;                      /* finalize                           */
  175 utf *utf_run;                           /* run                                */
  176
  177 utf *utf_add;                           /* add                                */
  178 utf *utf_remove;                        /* remove                             */
  179 utf *utf_addThread;                     /* addThread                          */
  180 utf *utf_removeThread;                  /* removeThread                       */
  181 utf *utf_put;                           /* put                                */
  182 utf *utf_get;                           /* get                                */
  183 utf *utf_uncaughtException;             /* uncaughtException                  */
  184 utf *utf_value;                         /* value                              */
  185
  186 utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
  187 utf *utf_findNative;                    /* findNative                         */
  188 utf *utf_getSystemClassLoader;          /* getSystemClassLoader               */
  189 utf *utf_initCause;                     /* initCause                          */
  190 utf *utf_loadClass;                     /* loadClass                          */
  191 utf *utf_printStackTrace;               /* printStackTrace                    */
  192
  193 utf *utf_division_by_zero;              /* "/ by zero"                        */
  194
  195 utf *utf_Z;                             /* Z                                  */
  196 utf *utf_B;                             /* B                                  */
  197 utf *utf_C;                             /* C                                  */
  198 utf *utf_S;                             /* S                                  */
  199 utf *utf_I;                             /* I                                  */
  200 utf *utf_J;                             /* J                                  */
  201 utf *utf_F;                             /* F                                  */
  202 utf *utf_D;                             /* D                                  */
  203
  204 utf *utf_void__void;                    /* ()V                                */
  205 utf *utf_boolean__void;                 /* (Z)V                               */
  206 utf *utf_byte__void;                    /* (B)V                               */
  207 utf *utf_char__void;                    /* (C)V                               */
  208 utf *utf_short__void;                   /* (S)V                               */
  209 utf *utf_int__void;                     /* (I)V                               */
  210 utf *utf_long__void;                    /* (J)V                               */
  211 utf *utf_float__void;                   /* (F)V                               */
  212 utf *utf_double__void;                  /* (D)V                               */
  213
  214 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
  215 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
  216 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
  217 utf *utf_java_lang_ClassLoader_java_lang_String__J; /* (Ljava/lang/ClassLoader;Ljava/lang/String;)J */
  218 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
  219 utf *utf_java_lang_Object__java_lang_Object; /* (Ljava/lang/Object;)Ljava/lang/Object; */
  220 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
  221 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
  222 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
  223 utf *utf_java_lang_Thread_java_lang_Throwable__V; /* (Ljava/lang/Thread;Ljava/lang/Throwable;)V */
  224 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
  225 utf *utf_java_lang_Throwable__java_lang_Throwable; /* (Ljava/lang/Throwable;)Ljava/lang/Throwable; */
  226
  227 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
  228 utf *utf_null;                          /* null                               */
  229 utf *array_packagename;                 /* "\t<the array package>"            */
 230
 231
  232 /* utf8_init *******************************************************************
  233
  234    Initializes the utf8 subsystem: creates the utf8 hashtable and the
  235    utf-symbols of the frequently used strings.  Always returns true.
  236 *******************************************************************************/
  237
  238 bool utf8_init(void)
  239 {
  240         /* create utf8 hashtable */
  241
  242         hashtable_utf = NEW(hashtable);
  243
  244         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
  245         /* count the slot pointers of the new table in the utf statistics */
  246 #if defined(ENABLE_STATISTICS)
  247         if (opt_stat)
  248                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
  249 #endif
  250
  251         /* create utf-symbols for pointer comparison of frequently used strings */
  252
  253         utf_java_lang_Object           = utf_new_char("java/lang/Object");
  254
  255         utf_java_lang_Class            = utf_new_char("java/lang/Class");
  256         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
  257         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
  258         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
  259         utf_java_lang_String           = utf_new_char("java/lang/String");
  260         utf_java_lang_System           = utf_new_char("java/lang/System");
  261         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
  262
  263         utf_java_lang_ref_SoftReference =
  264                 utf_new_char("java/lang/ref/SoftReference");
  265
  266         utf_java_lang_ref_WeakReference =
  267                 utf_new_char("java/lang/ref/WeakReference");
  268
  269         utf_java_lang_ref_PhantomReference =
  270                 utf_new_char("java/lang/ref/PhantomReference");
  271
  272         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
  273
  274         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
  275         utf_java_lang_Error            = utf_new_char("java/lang/Error");
  276
  277         utf_java_lang_ClassCircularityError =
  278                 utf_new_char("java/lang/ClassCircularityError");
  279
  280         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
  281
  282         utf_java_lang_ExceptionInInitializerError =
  283                 utf_new_char("java/lang/ExceptionInInitializerError");
  284
  285         utf_java_lang_IncompatibleClassChangeError =
  286                 utf_new_char("java/lang/IncompatibleClassChangeError");
  287
  288         utf_java_lang_InstantiationError =
  289                 utf_new_char("java/lang/InstantiationError");
  290
  291         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
  292         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
  293
  294         utf_java_lang_NoClassDefFoundError =
  295                 utf_new_char("java/lang/NoClassDefFoundError");
  296
  297         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
  298
  299         utf_java_lang_UnsatisfiedLinkError =
  300                 utf_new_char("java/lang/UnsatisfiedLinkError");
  301
  302         utf_java_lang_UnsupportedClassVersionError =
  303                 utf_new_char("java/lang/UnsupportedClassVersionError");
  304
  305         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
  306
  307         utf_java_lang_VirtualMachineError =
  308                 utf_new_char("java/lang/VirtualMachineError");
  309         /* NOTE(review): the next three symbols are assigned only with ENABLE_JAVASE */
  310 #if defined(ENABLE_JAVASE)
  311         utf_java_lang_AbstractMethodError =
  312                 utf_new_char("java/lang/AbstractMethodError");
  313
  314         utf_java_lang_NoSuchFieldError =
  315                 utf_new_char("java/lang/NoSuchFieldError");
  316
  317         utf_java_lang_NoSuchMethodError =
  318                 utf_new_char("java/lang/NoSuchMethodError");
  319 #endif
  320
  321 #if defined(WITH_CLASSPATH_GNU)
  322         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
  323 #endif
  324
  325         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
  326
  327         utf_java_lang_ArithmeticException =
  328                 utf_new_char("java/lang/ArithmeticException");
  329
  330         utf_java_lang_ArrayIndexOutOfBoundsException =
  331                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
  332
  333         utf_java_lang_ArrayStoreException =
  334                 utf_new_char("java/lang/ArrayStoreException");
  335
  336         utf_java_lang_ClassCastException =
  337                 utf_new_char("java/lang/ClassCastException");
  338
  339         utf_java_lang_ClassNotFoundException =
  340                 utf_new_char("java/lang/ClassNotFoundException");
  341
  342         utf_java_lang_CloneNotSupportedException =
  343                 utf_new_char("java/lang/CloneNotSupportedException");
  344
  345         utf_java_lang_IllegalAccessException =
  346                 utf_new_char("java/lang/IllegalAccessException");
  347
  348         utf_java_lang_IllegalArgumentException =
  349                 utf_new_char("java/lang/IllegalArgumentException");
  350
  351         utf_java_lang_IllegalMonitorStateException =
  352                 utf_new_char("java/lang/IllegalMonitorStateException");
  353
  354         utf_java_lang_InstantiationException =
  355                 utf_new_char("java/lang/InstantiationException");
  356
  357         utf_java_lang_InterruptedException =
  358                 utf_new_char("java/lang/InterruptedException");
  359
  360         utf_java_lang_NegativeArraySizeException =
  361                 utf_new_char("java/lang/NegativeArraySizeException");
  362
  363         utf_java_lang_NullPointerException =
  364                 utf_new_char("java/lang/NullPointerException");
  365
  366         utf_java_lang_StringIndexOutOfBoundsException =
  367                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
  368
  369         utf_java_lang_reflect_InvocationTargetException =
  370                 utf_new_char("java/lang/reflect/InvocationTargetException");
  371
  372         utf_java_security_PrivilegedActionException =
  373                 utf_new_char("java/security/PrivilegedActionException");
  374
  375 #if defined(ENABLE_JAVASE)
  376         utf_java_lang_Void             = utf_new_char("java/lang/Void");
  377 #endif
  378
  379         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
  380         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
  381         utf_java_lang_Character        = utf_new_char("java/lang/Character");
  382         utf_java_lang_Short            = utf_new_char("java/lang/Short");
  383         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
  384         utf_java_lang_Long             = utf_new_char("java/lang/Long");
  385         utf_java_lang_Float            = utf_new_char("java/lang/Float");
  386         utf_java_lang_Double           = utf_new_char("java/lang/Double");
  387
  388 #if defined(ENABLE_JAVASE)
  389         utf_java_lang_StackTraceElement =
  390                 utf_new_char("java/lang/StackTraceElement");
  391
  392         utf_java_lang_reflect_Constructor =
  393                 utf_new_char("java/lang/reflect/Constructor");
  394
  395         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
  396         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
  397         utf_java_util_Vector           = utf_new_char("java/util/Vector");
  398 #endif
  399
  400         utf_InnerClasses               = utf_new_char("InnerClasses");
  401         utf_ConstantValue              = utf_new_char("ConstantValue");
  402         utf_Code                       = utf_new_char("Code");
  403         utf_Exceptions                 = utf_new_char("Exceptions");
  404         utf_LineNumberTable            = utf_new_char("LineNumberTable");
  405         utf_SourceFile                 = utf_new_char("SourceFile");
  406
  407 #if defined(ENABLE_JAVASE)
  408         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
  409         utf_Signature                  = utf_new_char("Signature");
  410         utf_StackMapTable              = utf_new_char("StackMapTable");
  411
  412 #if defined(ENABLE_ANNOTATIONS)
  413         utf_sun_reflect_ConstantPool                = utf_new_char("sun/reflect/ConstantPool");
  414 #if defined(WITH_CLASSPATH_GNU)
  415         utf_sun_reflect_annotation_AnnotationParser = utf_new_char("sun/reflect/annotation/AnnotationParser");
  416 #endif
  417
  418         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
  419         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
  420         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
  421         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
  422         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
  423 #endif
  424 #endif
  425
  426         utf_init                           = utf_new_char("<init>");
  427         utf_clinit                         = utf_new_char("<clinit>");
  428         utf_clone                      = utf_new_char("clone");
  429         utf_finalize                   = utf_new_char("finalize");
  430         utf_run                        = utf_new_char("run");
  431
  432         utf_add                        = utf_new_char("add");
  433         utf_remove                     = utf_new_char("remove");
  434         utf_addThread                  = utf_new_char("addThread");
  435         utf_removeThread               = utf_new_char("removeThread");
  436         utf_put                        = utf_new_char("put");
  437         utf_get                        = utf_new_char("get");
  438         utf_uncaughtException          = utf_new_char("uncaughtException");
  439         utf_value                      = utf_new_char("value");
  440
  441         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
  442         utf_findNative                 = utf_new_char("findNative");
  443         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
  444         utf_initCause                  = utf_new_char("initCause");
  445         utf_loadClass                  = utf_new_char("loadClass");
  446         utf_printStackTrace            = utf_new_char("printStackTrace");
  447
  448         utf_division_by_zero           = utf_new_char("/ by zero");
  449
  450         utf_Z                          = utf_new_char("Z");
  451         utf_B                          = utf_new_char("B");
  452         utf_C                          = utf_new_char("C");
  453         utf_S                          = utf_new_char("S");
  454         utf_I                          = utf_new_char("I");
  455         utf_J                          = utf_new_char("J");
  456         utf_F                          = utf_new_char("F");
  457         utf_D                          = utf_new_char("D");
  458
  459         utf_void__void                 = utf_new_char("()V");
  460         utf_boolean__void              = utf_new_char("(Z)V");
  461         utf_byte__void                 = utf_new_char("(B)V");
  462         utf_char__void                 = utf_new_char("(C)V");
  463         utf_short__void                = utf_new_char("(S)V");
  464         utf_int__void                  = utf_new_char("(I)V");
  465         utf_long__void                 = utf_new_char("(J)V");
  466         utf_float__void                = utf_new_char("(F)V");
  467         utf_double__void               = utf_new_char("(D)V");
  468         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
  469         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
  470
  471         utf_void__java_lang_ClassLoader =
  472                 utf_new_char("()Ljava/lang/ClassLoader;");
  473
  474         utf_java_lang_ClassLoader_java_lang_String__J =
  475                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
  476
  477         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
  478
  479         utf_java_lang_Object__java_lang_Object =
  480                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
  481
  482         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
  483
  484         utf_java_lang_String__java_lang_Class =
  485                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
  486
  487         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
  488
  489         utf_java_lang_Thread_java_lang_Throwable__V =
  490                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
  491
  492         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
  493
  494         utf_java_lang_Throwable__java_lang_Throwable =
  495                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
  496         /* "null" and two special names that start with a tab character */
  497         utf_null                       = utf_new_char("null");
  498         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
  499         array_packagename              = utf_new_char("\t<the array package>");
  500
  501         /* everything's ok */
  502
  503         return true;
  504 }
 505
 506
 507 /* utf_hashkey *****************************************************************
 508
 509    The hashkey is computed from the utf-text by using up to 8
 510    characters.  For utf-symbols longer than 15 characters 3 characters
 511    are taken from the beginning and the end, 2 characters are taken
 512    from the middle.
 513
 514 *******************************************************************************/
 515
 516 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 517 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 518
 519 u4 utf_hashkey(const char *text, u4 length)
 520 {
 521         const char *start_pos = text;       /* pointer to utf text                */
 522         u4 a;
 523
 524         switch (length) {
 525         case 0: /* empty string */
 526                 return 0;
 527
 528         case 1: return fbs(0);
 529         case 2: return fbs(0) ^ nbs(3);
 530         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 531         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 532         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 533         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 534         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 535         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 536
 537         case 9:
 538                 a = fbs(0);
 539                 a ^= nbs(1);
 540                 a ^= nbs(2);
 541                 text++;
 542                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 543
 544         case 10:
 545                 a = fbs(0);
 546                 text++;
 547                 a ^= nbs(2);
 548                 a ^= nbs(3);
 549                 a ^= nbs(4);
 550                 text++;
 551                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 552
 553         case 11:
 554                 a = fbs(0);
 555                 text++;
 556                 a ^= nbs(2);
 557                 a ^= nbs(3);
 558                 a ^= nbs(4);
 559                 text++;
 560                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 561
 562         case 12:
 563                 a = fbs(0);
 564                 text += 2;
 565                 a ^= nbs(2);
 566                 a ^= nbs(3);
 567                 text++;
 568                 a ^= nbs(5);
 569                 a ^= nbs(6);
 570                 a ^= nbs(7);
 571                 text++;
 572                 return a ^ nbs(9) ^ nbs(10);
 573
 574         case 13:
 575                 a = fbs(0);
 576                 a ^= nbs(1);
 577                 text++;
 578                 a ^= nbs(3);
 579                 a ^= nbs(4);
 580                 text += 2;
 581                 a ^= nbs(7);
 582                 a ^= nbs(8);
 583                 text += 2;
 584                 return a ^ nbs(9) ^ nbs(10);
 585
 586         case 14:
 587                 a = fbs(0);
 588                 text += 2;
 589                 a ^= nbs(3);
 590                 a ^= nbs(4);
 591                 text += 2;
 592                 a ^= nbs(7);
 593                 a ^= nbs(8);
 594                 text += 2;
 595                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 596
 597         case 15:
 598                 a = fbs(0);
 599                 text += 2;
 600                 a ^= nbs(3);
 601                 a ^= nbs(4);
 602                 text += 2;
 603                 a ^= nbs(7);
 604                 a ^= nbs(8);
 605                 text += 2;
 606                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 607
 608         default:  /* 3 characters from beginning */
 609                 a = fbs(0);
 610                 text += 2;
 611                 a ^= nbs(3);
 612                 a ^= nbs(4);
 613
 614                 /* 2 characters from middle */
 615                 text = start_pos + (length / 2);
 616                 a ^= fbs(5);
 617                 text += 2;
 618                 a ^= nbs(6);
 619
 620                 /* 3 characters from end */
 621                 text = start_pos + length - 4;
 622
 623                 a ^= fbs(7);
 624                 text++;
 625
 626                 return a ^ nbs(10) ^ nbs(11);
 627     }
 628 }
 629
 630 /* utf_full_hashkey ************************************************************
 631
 632    This function computes a hash value using all bytes in the string.
 633
 634    The algorithm is the "One-at-a-time" algorithm as published
 635    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 636
 637 *******************************************************************************/
 638
 639 u4 utf_full_hashkey(const char *text, u4 length)
 640 {
 641         register const unsigned char *p = (const unsigned char *) text;
 642         register u4 hash;
 643         register u4 i;
 644
 645         hash = 0;
 646         for (i=length; i--;)
 647         {
 648             hash += *p++;
 649             hash += (hash << 10);
 650             hash ^= (hash >> 6);
 651         }
 652         hash += (hash << 3);
 653         hash ^= (hash >> 11);
 654         hash += (hash << 15);
 655
 656         return hash;
 657 }
 658
 659 /* unicode_hashkey *************************************************************
 660
 661    Compute the hashkey of a unicode string.
 662
 663 *******************************************************************************/
 664
 665 u4 unicode_hashkey(u2 *text, u2 len)
 666 {
 667         return utf_hashkey((char *) text, len);
 668 }
 669
 670
 671 /* utf_new *********************************************************************
 672
 673    Creates a new utf-symbol, the text of the symbol is passed as a
 674    u1-array. The function searches the utf-hashtable for a utf-symbol
 675    with this text. On success the element returned, otherwise a new
 676    hashtable element is created.
 677
 678    If the number of entries in the hashtable exceeds twice the size of
 679    the hashtable slots a reorganization of the hashtable is done and
 680    the utf symbols are copied to a new hashtable with doubled size.
 681
 682 *******************************************************************************/
 683
 684 utf *utf_new(const char *text, u2 length)
 685 {
 686         u4 key;                             /* hashkey computed from utf-text     */
 687         u4 slot;                            /* slot in hashtable                  */
 688         utf *u;                             /* hashtable element                  */
 689         u2 i;
 690
 691         LOCK_MONITOR_ENTER(hashtable_utf->header);
 692
 693 #if defined(ENABLE_STATISTICS)
 694         if (opt_stat)
 695                 count_utf_new++;
 696 #endif
 697
 698         key  = utf_hashkey(text, length);
 699         slot = key & (hashtable_utf->size - 1);
 700         u    = hashtable_utf->ptr[slot];
 701
 702         /* search external hash chain for utf-symbol */
 703
 704         while (u) {
 705                 if (u->blength == length) {
 706                         /* compare text of hashtable elements */
 707
 708                         for (i = 0; i < length; i++)
 709                                 if (text[i] != u->text[i])
 710                                         goto nomatch;
 711
 712 #if defined(ENABLE_STATISTICS)
 713                         if (opt_stat)
 714                                 count_utf_new_found++;
 715 #endif
 716
 717                         /* symbol found in hashtable */
 718
 719                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 720
 721                         return u;
 722                 }
 723
 724         nomatch:
 725                 u = u->hashlink; /* next element in external chain */
 726         }
 727
 728         /* location in hashtable found, create new utf element */
 729
 730         u = NEW(utf);
 731
 732         u->blength  = length;               /* length in bytes of utfstring       */
 733         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 734         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 735
 736         memcpy(u->text, text, length);      /* copy utf-text                      */
 737         u->text[length] = '\0';
 738
 739 #if defined(ENABLE_STATISTICS)
 740         if (opt_stat)
 741                 count_utf_len += sizeof(utf) + length + 1;
 742 #endif
 743
 744         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 745         hashtable_utf->entries++;           /* update number of entries           */
 746
 747         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 748
 749         /* reorganization of hashtable, average length of the external
 750            chains is approx. 2 */
 751
 752                 hashtable *newhash;                              /* the new hashtable */
 753                 u4         i;
 754                 utf       *u;
 755                 utf       *nextu;
 756                 u4         slot;
 757
 758                 /* create new hashtable, double the size */
 759
 760                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 761
 762 #if defined(ENABLE_STATISTICS)
 763                 if (opt_stat)
 764                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 765 #endif
 766
 767                 /* transfer elements to new hashtable */
 768
 769                 for (i = 0; i < hashtable_utf->size; i++) {
 770                         u = hashtable_utf->ptr[i];
 771
 772                         while (u) {
 773                                 nextu = u->hashlink;
 774                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 775
 776                                 u->hashlink = (utf *) newhash->ptr[slot];
 777                                 newhash->ptr[slot] = u;
 778
 779                                 /* follow link in external hash chain */
 780
 781                                 u = nextu;
 782                         }
 783                 }
 784
 785                 /* dispose old table */
 786
 787                 hashtable_free(hashtable_utf);
 788
 789                 hashtable_utf = newhash;
 790         }
 791
 792         LOCK_MONITOR_EXIT(hashtable_utf->header);
 793
 794         return u;
 795 }
 796
 797
 798 /* utf_new_u2 ******************************************************************
 799
 800    Make utf symbol from u2 array, if isclassname is true '.' is
 801    replaced by '/'.
 802
 803 *******************************************************************************/
 804
 805 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 806 {
 807         char *buffer;                   /* memory buffer for  unicode characters  */
 808         char *pos;                      /* pointer to current position in buffer  */
 809         u4 left;                        /* unicode characters left                */
 810         u4 buflength;                   /* utf length in bytes of the u2 array    */
 811         utf *result;                    /* resulting utf-string                   */
 812         int i;
 813
 814         /* determine utf length in bytes and allocate memory */
 815
 816         buflength = u2_utflength(unicode_pos, unicode_length);
 817         buffer    = MNEW(char, buflength);
 818
 819         left = buflength;
 820         pos  = buffer;
 821
 822         for (i = 0; i++ < unicode_length; unicode_pos++) {
 823                 /* next unicode character */
 824                 u2 c = *unicode_pos;
 825
 826                 if ((c != 0) && (c < 0x80)) {
 827                         /* 1 character */
 828                         left--;
 829                 if ((int) left < 0) break;
 830                         /* convert classname */
 831                         if (isclassname && c == '.')
 832                                 *pos++ = '/';
 833                         else
 834                                 *pos++ = (char) c;
 835
 836                 } else if (c < 0x800) {
 837                         /* 2 characters */
 838                 unsigned char high = c >> 6;
 839                 unsigned char low  = c & 0x3F;
 840                         left = left - 2;
 841                 if ((int) left < 0) break;
 842                 *pos++ = high | 0xC0;
 843                 *pos++ = low  | 0x80;
 844
 845                 } else {
 846                 /* 3 characters */
 847                 char low  = c & 0x3f;
 848                 char mid  = (c >> 6) & 0x3F;
 849                 char high = c >> 12;
 850                         left = left - 3;
 851                 if ((int) left < 0) break;
 852                 *pos++ = high | 0xE0;
 853                 *pos++ = mid  | 0x80;
 854                 *pos++ = low  | 0x80;
 855                 }
 856         }
 857
 858         /* insert utf-string into symbol-table */
 859         result = utf_new(buffer,buflength);
 860
 861         MFREE(buffer, char, buflength);
 862
 863         return result;
 864 }
 865
 866
 867 /* utf_new_char ****************************************************************
 868
 869    Creates a new utf symbol, the text for this symbol is passed as a
 870    c-string ( = char* ).
 871
 872 *******************************************************************************/
 873
 874 utf *utf_new_char(const char *text)
 875 {
 876         return utf_new(text, strlen(text));
 877 }
 878
 879
 880 /* utf_new_char_classname ******************************************************
 881
 882    Creates a new utf symbol, the text for this symbol is passed as a
 883    c-string ( = char* ) "." characters are going to be replaced by
 884    "/". Since the above function is used often, this is a separte
 885    function, instead of an if.
 886
 887 *******************************************************************************/
 888
 889 utf *utf_new_char_classname(const char *text)
 890 {
 891         if (strchr(text, '.')) {
 892                 char *txt = strdup(text);
 893                 char *end = txt + strlen(txt);
 894                 char *c;
 895                 utf *tmpRes;
 896
 897                 for (c = txt; c < end; c++)
 898                         if (*c == '.') *c = '/';
 899
 900                 tmpRes = utf_new(txt, strlen(txt));
 901                 FREE(txt, 0);
 902
 903                 return tmpRes;
 904
 905         } else
 906                 return utf_new(text, strlen(text));
 907 }
 908
 909
 910 /* utf_nextu2 ******************************************************************
 911
 912    Read the next unicode character from the utf string and increment
 913    the utf-string pointer accordingly.
 914
 915    CAUTION: This function is unsafe for input that was not checked
 916             by is_valid_utf!
 917
 918 *******************************************************************************/
 919
 920 u2 utf_nextu2(char **utf_ptr)
 921 {
 922     /* uncompressed unicode character */
 923     u2 unicode_char = 0;
 924     /* current position in utf text */
 925     unsigned char *utf = (unsigned char *) (*utf_ptr);
 926     /* bytes representing the unicode character */
 927     unsigned char ch1, ch2, ch3;
 928     /* number of bytes used to represent the unicode character */
 929     int len = 0;
 930
 931     switch ((ch1 = utf[0]) >> 4) {
 932         default: /* 1 byte */
 933                 (*utf_ptr)++;
 934                 return (u2) ch1;
 935         case 0xC:
 936         case 0xD: /* 2 bytes */
 937                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 938                         unsigned char high = ch1 & 0x1F;
 939                         unsigned char low  = ch2 & 0x3F;
 940                         unicode_char = (high << 6) + low;
 941                         len = 2;
 942                 }
 943                 break;
 944
 945         case 0xE: /* 2 or 3 bytes */
 946                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 947                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 948                                 unsigned char low  = ch3 & 0x3f;
 949                                 unsigned char mid  = ch2 & 0x3f;
 950                                 unsigned char high = ch1 & 0x0f;
 951                                 unicode_char = (((high << 6) + mid) << 6) + low;
 952                                 len = 3;
 953                         } else
 954                                 len = 2;
 955                 }
 956                 break;
 957     }
 958
 959     /* update position in utf-text */
 960     *utf_ptr = (char *) (utf + len);
 961
 962     return unicode_char;
 963 }
 964
 965
 966 /* utf_bytes *******************************************************************
 967
 968    Determine number of bytes (aka. octets) in the utf string.
 969
 970    IN:
 971       u............utf string
 972
 973    OUT:
 974       The number of octets of this utf string.
 975           There is _no_ terminating zero included in this count.
 976
 977 *******************************************************************************/
 978
 979 u4 utf_bytes(utf *u)
 980 {
 981         return u->blength;
 982 }
 983
 984
 985 /* utf_get_number_of_u2s_for_buffer ********************************************
 986
 987    Determine number of UTF-16 u2s in the given UTF-8 buffer
 988
 989    CAUTION: This function is unsafe for input that was not checked
 990             by is_valid_utf!
 991
 992    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 993    to an array of u2s (UTF-16) and want to know how many of them you will get.
 994    All other uses of this function are probably wrong.
 995
 996    IN:
 997       buffer........points to first char in buffer
 998           blength.......number of _bytes_ in the buffer
 999
1000    OUT:
1001       the number of u2s needed to hold this string in UTF-16 encoding.
1002           There is _no_ terminating zero included in this count.
1003
1004    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
1005    exception.
1006
1007 *******************************************************************************/
1008
1009 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
1010 {
1011         const char *endpos;                 /* points behind utf string           */
1012         const char *utf_ptr;                /* current position in utf text       */
1013         u4 len = 0;                         /* number of unicode characters       */
1014
1015         utf_ptr = buffer;
1016         endpos = utf_ptr + blength;
1017
1018         while (utf_ptr < endpos) {
1019                 len++;
1020                 /* next unicode character */
1021                 utf_nextu2((char **)&utf_ptr);
1022         }
1023
1024         assert(utf_ptr == endpos);
1025
1026         return len;
1027 }
1028
1029
1030 /* utf_get_number_of_u2s *******************************************************
1031
1032    Determine number of UTF-16 u2s in the utf string.
1033
1034    CAUTION: This function is unsafe for input that was not checked
1035             by is_valid_utf!
1036
1037    CAUTION: Use this function *only* when you want to convert a utf string
1038    to an array of u2s and want to know how many of them you will get.
1039    All other uses of this function are probably wrong.
1040
1041    IN:
1042       u............utf string
1043
1044    OUT:
1045       the number of u2s needed to hold this string in UTF-16 encoding.
1046           There is _no_ terminating zero included in this count.
1047           XXX 0 if a NullPointerException has been thrown (see below)
1048
1049 *******************************************************************************/
1050
1051 u4 utf_get_number_of_u2s(utf *u)
1052 {
1053         char *endpos;                       /* points behind utf string           */
1054         char *utf_ptr;                      /* current position in utf text       */
1055         u4 len = 0;                         /* number of unicode characters       */
1056
1057         /* XXX this is probably not checked by most callers! Review this after */
1058         /* the invalid uses of this function have been eliminated */
1059         if (u == NULL) {
1060                 exceptions_throw_nullpointerexception();
1061                 return 0;
1062         }
1063
1064         endpos = UTF_END(u);
1065         utf_ptr = u->text;
1066
1067         while (utf_ptr < endpos) {
1068                 len++;
1069                 /* next unicode character */
1070                 utf_nextu2(&utf_ptr);
1071         }
1072
1073         if (utf_ptr != endpos) {
1074                 /* string ended abruptly */
1075                 exceptions_throw_internalerror("Illegal utf8 string");
1076                 return 0;
1077         }
1078
1079         return len;
1080 }
1081
1082
1083 /* utf8_safe_number_of_u2s *****************************************************
1084
1085    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1086    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1087
1088    This function is safe even for invalid UTF-8 strings.
1089
1090    IN:
1091       text..........zero-terminated(!) UTF-8 string (may be invalid)
1092                         must NOT be NULL
1093           nbytes........strlen(text). (This is needed to completely emulate
1094                         the RI).
1095
1096    OUT:
1097       the number of u2s needed to hold this string in UTF-16 encoding.
1098           There is _no_ terminating zero included in this count.
1099
1100 *******************************************************************************/
1101
1102 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1103         register const unsigned char *t;
1104         register s4 byte;
1105         register s4 len;
1106         register const unsigned char *tlimit;
1107         s4 byte1;
1108         s4 byte2;
1109         s4 byte3;
1110         s4 value;
1111         s4 skip;
1112
1113         assert(text);
1114         assert(nbytes >= 0);
1115
1116         len = 0;
1117         t = (const unsigned char *) text;
1118         tlimit = t + nbytes;
1119
1120         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1121
1122         while (1) {
1123                 byte = *t++;
1124
1125                 if (byte & 0x80) {
1126                         /* highest bit set, non-ASCII character */
1127
1128                         if ((byte & 0xe0) == 0xc0) {
1129                                 /* 2-byte: should be 110..... 10...... ? */
1130
1131                                 if ((*t++ & 0xc0) == 0x80)
1132                                         ; /* valid 2-byte */
1133                                 else
1134                                         t--; /* invalid */
1135                         }
1136                         else if ((byte & 0xf0) == 0xe0) {
1137                                 /* 3-byte: should be 1110.... 10...... 10...... */
1138                                 /*                            ^t                */
1139
1140                                 if (t + 2 > tlimit)
1141                                         return len + 1; /* invalid, stop here */
1142
1143                                 if ((*t++ & 0xc0) == 0x80) {
1144                                         if ((*t++ & 0xc0) == 0x80)
1145                                                 ; /* valid 3-byte */
1146                                         else
1147                                                 t--; /* invalid */
1148                                 }
1149                                 else
1150                                         t--; /* invalid */
1151                         }
1152                         else if ((byte & 0xf8) == 0xf0) {
1153                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1154                                 /*                            ^t                         */
1155
1156                                 if (t + 3 > tlimit)
1157                                         return len + 1; /* invalid, stop here */
1158
1159                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1160                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1161                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1162                                                         /* valid 4-byte UTF-8? */
1163                                                         value = ((byte  & 0x07) << 18)
1164                                                                   | ((byte1 & 0x3f) << 12)
1165                                                                   | ((byte2 & 0x3f) <<  6)
1166                                                                   | ((byte3 & 0x3f)      );
1167
1168                                                         if (value > 0x10FFFF)
1169                                                                 ; /* invalid */
1170                                                         else if (value > 0xFFFF)
1171                                                                 len += 1; /* we need surrogates */
1172                                                         else
1173                                                                 ; /* 16bit suffice */
1174                                                 }
1175                                                 else
1176                                                         t--; /* invalid */
1177                                         }
1178                                         else
1179                                                 t--; /* invalid */
1180                                 }
1181                                 else
1182                                         t--; /* invalid */
1183                         }
1184                         else if ((byte & 0xfc) == 0xf8) {
1185                                 /* invalid 5-byte */
1186                                 if (t + 4 > tlimit)
1187                                         return len + 1; /* invalid, stop here */
1188
1189                                 skip = 4;
1190                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1191                                         t++;
1192                         }
1193                         else if ((byte & 0xfe) == 0xfc) {
1194                                 /* invalid 6-byte */
1195                                 if (t + 5 > tlimit)
1196                                         return len + 1; /* invalid, stop here */
1197
1198                                 skip = 5;
1199                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1200                                         t++;
1201                         }
1202                         else
1203                                 ; /* invalid */
1204                 }
1205                 else {
1206                         /* NUL */
1207
1208                         if (byte == 0)
1209                                 break;
1210
1211                         /* ASCII character, common case */
1212                 }
1213
1214                 len++;
1215         }
1216
1217         return len;
1218 }
1219
1220
1221 /* utf8_safe_convert_to_u2s ****************************************************
1222
1223    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1224    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1225    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1226
1227    This function is safe even for invalid UTF-8 strings.
1228
1229    IN:
1230       text..........zero-terminated(!) UTF-8 string (may be invalid)
1231                         must NOT be NULL
1232           nbytes........strlen(text). (This is needed to completely emulate
1233                                         the RI).
1234           buffer........a preallocated array of u2s to receive the decoded
1235                         string. Use utf8_safe_number_of_u2s to get the
1236                                         required number of u2s for allocating this.
1237
1238 *******************************************************************************/
1239
1240 #define UNICODE_REPLACEMENT  0xfffd
1241
1242 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1243         register const unsigned char *t;
1244         register s4 byte;
1245         register const unsigned char *tlimit;
1246         s4 byte1;
1247         s4 byte2;
1248         s4 byte3;
1249         s4 value;
1250         s4 skip;
1251
1252         assert(text);
1253         assert(nbytes >= 0);
1254
1255         t = (const unsigned char *) text;
1256         tlimit = t + nbytes;
1257
1258         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1259
1260         while (1) {
1261                 byte = *t++;
1262
1263                 if (byte & 0x80) {
1264                         /* highest bit set, non-ASCII character */
1265
1266                         if ((byte & 0xe0) == 0xc0) {
1267                                 /* 2-byte: should be 110..... 10...... */
1268
1269                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1270                                         /* valid 2-byte UTF-8 */
1271                                         *buffer++ = ((byte  & 0x1f) << 6)
1272                                                           | ((byte1 & 0x3f)     );
1273                                 }
1274                                 else {
1275                                         *buffer++ = UNICODE_REPLACEMENT;
1276                                         t--;
1277                                 }
1278                         }
1279                         else if ((byte & 0xf0) == 0xe0) {
1280                                 /* 3-byte: should be 1110.... 10...... 10...... */
1281
1282                                 if (t + 2 > tlimit) {
1283                                         *buffer++ = UNICODE_REPLACEMENT;
1284                                         return;
1285                                 }
1286
1287                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1288                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1289                                                 /* valid 3-byte UTF-8 */
1290                                                 *buffer++ = ((byte  & 0x0f) << 12)
1291                                                                   | ((byte1 & 0x3f) <<  6)
1292                                                                   | ((byte2 & 0x3f)      );
1293                                         }
1294                                         else {
1295                                                 *buffer++ = UNICODE_REPLACEMENT;
1296                                                 t--;
1297                                         }
1298                                 }
1299                                 else {
1300                                         *buffer++ = UNICODE_REPLACEMENT;
1301                                         t--;
1302                                 }
1303                         }
1304                         else if ((byte & 0xf8) == 0xf0) {
1305                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1306
1307                                 if (t + 3 > tlimit) {
1308                                         *buffer++ = UNICODE_REPLACEMENT;
1309                                         return;
1310                                 }
1311
1312                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1313                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1314                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1315                                                         /* valid 4-byte UTF-8? */
1316                                                         value = ((byte  & 0x07) << 18)
1317                                                                   | ((byte1 & 0x3f) << 12)
1318                                                                   | ((byte2 & 0x3f) <<  6)
1319                                                                   | ((byte3 & 0x3f)      );
1320
1321                                                         if (value > 0x10FFFF) {
1322                                                                 *buffer++ = UNICODE_REPLACEMENT;
1323                                                         }
1324                                                         else if (value > 0xFFFF) {
1325                                                                 /* we need surrogates */
1326                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1327                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1328                                                         }
1329                                                         else
1330                                                                 *buffer++ = value; /* 16bit suffice */
1331                                                 }
1332                                                 else {
1333                                                         *buffer++ = UNICODE_REPLACEMENT;
1334                                                         t--;
1335                                                 }
1336                                         }
1337                                         else {
1338                                                 *buffer++ = UNICODE_REPLACEMENT;
1339                                                 t--;
1340                                         }
1341                                 }
1342                                 else {
1343                                         *buffer++ = UNICODE_REPLACEMENT;
1344                                         t--;
1345                                 }
1346                         }
1347                         else if ((byte & 0xfc) == 0xf8) {
1348                                 if (t + 4 > tlimit) {
1349                                         *buffer++ = UNICODE_REPLACEMENT;
1350                                         return;
1351                                 }
1352
1353                                 skip = 4;
1354                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1355                                         t++;
1356                                 *buffer++ = UNICODE_REPLACEMENT;
1357                         }
1358                         else if ((byte & 0xfe) == 0xfc) {
1359                                 if (t + 5 > tlimit) {
1360                                         *buffer++ = UNICODE_REPLACEMENT;
1361                                         return;
1362                                 }
1363
1364                                 skip = 5;
1365                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1366                                         t++;
1367                                 *buffer++ = UNICODE_REPLACEMENT;
1368                         }
1369                         else
1370                                 *buffer++ = UNICODE_REPLACEMENT;
1371                 }
1372                 else {
1373                         /* NUL */
1374
1375                         if (byte == 0)
1376                                 break;
1377
1378                         /* ASCII character, common case */
1379
1380                         *buffer++ = byte;
1381                 }
1382         }
1383 }
1384
1385
1386 /* u2_utflength ****************************************************************
1387
1388    Returns the utf length in bytes of a u2 array.
1389
1390 *******************************************************************************/
1391
1392 u4 u2_utflength(u2 *text, u4 u2_length)
1393 {
1394         u4 result_len = 0;                  /* utf length in bytes                */
1395         u2 ch;                              /* current unicode character          */
1396         u4 len;
1397
1398         for (len = 0; len < u2_length; len++) {
1399                 /* next unicode character */
1400                 ch = *text++;
1401
1402                 /* determine bytes required to store unicode character as utf */
1403                 if (ch && (ch < 0x80))
1404                         result_len++;
1405                 else if (ch < 0x800)
1406                         result_len += 2;
1407                 else
1408                         result_len += 3;
1409         }
1410
1411     return result_len;
1412 }
1413
1414
1415 /* utf_copy ********************************************************************
1416
1417    Copy the given utf string byte-for-byte to a buffer.
1418
1419    IN:
1420       buffer.......the buffer
1421           u............the utf string
1422
1423 *******************************************************************************/
1424
1425 void utf_copy(char *buffer, utf *u)
1426 {
1427         /* our utf strings are zero-terminated (done by utf_new) */
1428         MCOPY(buffer, u->text, char, u->blength + 1);
1429 }
1430
1431
1432 /* utf_cat *********************************************************************
1433
1434    Append the given utf string byte-for-byte to a buffer.
1435
1436    IN:
1437       buffer.......the buffer
1438           u............the utf string
1439
1440 *******************************************************************************/
1441
1442 void utf_cat(char *buffer, utf *u)
1443 {
1444         /* our utf strings are zero-terminated (done by utf_new) */
1445         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1446 }
1447
1448
1449 /* utf_copy_classname **********************************************************
1450
1451    Copy the given utf classname byte-for-byte to a buffer.
1452    '/' is replaced by '.'
1453
1454    IN:
1455       buffer.......the buffer
1456           u............the utf string
1457
1458 *******************************************************************************/
1459
1460 void utf_copy_classname(char *buffer, utf *u)
1461 {
1462         char *bufptr;
1463         char *srcptr;
1464         char *endptr;
1465         char ch;
1466
1467         bufptr = buffer;
1468         srcptr = u->text;
1469         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1470
1471         while (srcptr != endptr) {
1472                 ch = *srcptr++;
1473                 if (ch == '/')
1474                         ch = '.';
1475                 *bufptr++ = ch;
1476         }
1477 }
1478
1479
1480 /* utf_cat *********************************************************************
1481
1482    Append the given utf classname byte-for-byte to a buffer.
1483    '/' is replaced by '.'
1484
1485    IN:
1486       buffer.......the buffer
1487           u............the utf string
1488
1489 *******************************************************************************/
1490
1491 void utf_cat_classname(char *buffer, utf *u)
1492 {
1493         utf_copy_classname(buffer + strlen(buffer), u);
1494 }
1495
1496 /* utf_display_printable_ascii *************************************************
1497
1498    Write utf symbol to stdout (for debugging purposes).
1499    Non-printable and non-ASCII characters are printed as '?'.
1500
1501 *******************************************************************************/
1502
1503 void utf_display_printable_ascii(utf *u)
1504 {
1505         char *endpos;                       /* points behind utf string           */
1506         char *utf_ptr;                      /* current position in utf text       */
1507
1508         if (u == NULL) {
1509                 printf("NULL");
1510                 fflush(stdout);
1511                 return;
1512         }
1513
1514         endpos = UTF_END(u);
1515         utf_ptr = u->text;
1516
1517         while (utf_ptr < endpos) {
1518                 /* read next unicode character */
1519
1520                 u2 c = utf_nextu2(&utf_ptr);
1521
1522                 if ((c >= 32) && (c <= 127))
1523                         printf("%c", c);
1524                 else
1525                         printf("?");
1526         }
1527
1528         fflush(stdout);
1529 }
1530
1531
1532 /* utf_display_printable_ascii_classname ***************************************
1533
1534    Write utf symbol to stdout with `/' converted to `.' (for debugging
1535    purposes).
1536    Non-printable and non-ASCII characters are printed as '?'.
1537
1538 *******************************************************************************/
1539
1540 void utf_display_printable_ascii_classname(utf *u)
1541 {
1542         char *endpos;                       /* points behind utf string           */
1543         char *utf_ptr;                      /* current position in utf text       */
1544
1545         if (u == NULL) {
1546                 printf("NULL");
1547                 fflush(stdout);
1548                 return;
1549         }
1550
1551         endpos = UTF_END(u);
1552         utf_ptr = u->text;
1553
1554         while (utf_ptr < endpos) {
1555                 /* read next unicode character */
1556
1557                 u2 c = utf_nextu2(&utf_ptr);
1558
1559                 if (c == '/')
1560                         c = '.';
1561
1562                 if ((c >= 32) && (c <= 127))
1563                         printf("%c", c);
1564                 else
1565                         printf("?");
1566         }
1567
1568         fflush(stdout);
1569 }
1570
1571
1572 /* utf_sprint_convert_to_latin1 ************************************************
1573
1574    Write utf symbol into c-string (for debugging purposes).
1575    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1576    invalid results.
1577
1578 *******************************************************************************/
1579
1580 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1581 {
1582         char *endpos;                       /* points behind utf string           */
1583         char *utf_ptr;                      /* current position in utf text       */
1584         u2 pos = 0;                         /* position in c-string               */
1585
1586         if (!u) {
1587                 strcpy(buffer, "NULL");
1588                 return;
1589         }
1590
1591         endpos = UTF_END(u);
1592         utf_ptr = u->text;
1593
1594         while (utf_ptr < endpos)
1595                 /* copy next unicode character */
1596                 buffer[pos++] = utf_nextu2(&utf_ptr);
1597
1598         /* terminate string */
1599         buffer[pos] = '\0';
1600 }
1601
1602
1603 /* utf_sprint_convert_to_latin1_classname **************************************
1604
1605    Write utf symbol into c-string with `/' converted to `.' (for debugging
1606    purposes).
1607    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1608    invalid results.
1609
1610 *******************************************************************************/
1611
1612 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1613 {
1614         char *endpos;                       /* points behind utf string           */
1615         char *utf_ptr;                      /* current position in utf text       */
1616         u2 pos = 0;                         /* position in c-string               */
1617
1618         if (!u) {
1619                 strcpy(buffer, "NULL");
1620                 return;
1621         }
1622
1623         endpos = UTF_END(u);
1624         utf_ptr = u->text;
1625
1626         while (utf_ptr < endpos) {
1627                 /* copy next unicode character */
1628                 u2 c = utf_nextu2(&utf_ptr);
1629                 if (c == '/') c = '.';
1630                 buffer[pos++] = c;
1631         }
1632
1633         /* terminate string */
1634         buffer[pos] = '\0';
1635 }
1636
1637
1638 /* utf_strcat_convert_to_latin1 ************************************************
1639
1640    Like libc strcat, but uses an utf8 string.
1641    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1642    invalid results.
1643
1644 *******************************************************************************/
1645
1646 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1647 {
1648         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1649 }
1650
1651
1652 /* utf_strcat_convert_to_latin1_classname **************************************
1653
1654    Like libc strcat, but uses an utf8 string.
1655    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1656    invalid results.
1657
1658 *******************************************************************************/
1659
1660 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1661 {
1662         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1663 }
1664
1665
1666 /* utf_fprint_printable_ascii **************************************************
1667
1668    Write utf symbol into file.
1669    Non-printable and non-ASCII characters are printed as '?'.
1670
1671 *******************************************************************************/
1672
1673 void utf_fprint_printable_ascii(FILE *file, utf *u)
1674 {
1675         char *endpos;                       /* points behind utf string           */
1676         char *utf_ptr;                      /* current position in utf text       */
1677
1678         if (!u)
1679                 return;
1680
1681         endpos = UTF_END(u);
1682         utf_ptr = u->text;
1683
1684         while (utf_ptr < endpos) {
1685                 /* read next unicode character */
1686                 u2 c = utf_nextu2(&utf_ptr);
1687
1688                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1689                 else fprintf(file, "?");
1690         }
1691 }
1692
1693
1694 /* utf_fprint_printable_ascii_classname ****************************************
1695
1696    Write utf symbol into file with `/' converted to `.'.
1697    Non-printable and non-ASCII characters are printed as '?'.
1698
1699 *******************************************************************************/
1700
1701 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1702 {
1703         char *endpos;                       /* points behind utf string           */
1704         char *utf_ptr;                      /* current position in utf text       */
1705
1706     if (!u)
1707                 return;
1708
1709         endpos = UTF_END(u);
1710         utf_ptr = u->text;
1711
1712         while (utf_ptr < endpos) {
1713                 /* read next unicode character */
1714                 u2 c = utf_nextu2(&utf_ptr);
1715                 if (c == '/') c = '.';
1716
1717                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1718                 else fprintf(file, "?");
1719         }
1720 }
1721
1722
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string.

   utf_ptr...points to first character
   end_pos...points after last character

   The following rules are enforced:
     - end_pos must not lie before utf_ptr (an empty range is valid)
     - a 0x00 byte is not allowed
     - only one, two and three byte sequences are accepted; longer
       sequences are rejected (Java limitation)
     - every continuation byte has to look like 10xx xxxx and must lie
       inside the given range
     - the value zero may only be encoded with two bytes (Java special)

   Deliberately NOT rejected, as Sun Java seems to allow them in
   classfiles: overlong encodings, surrogates (0xd800..0xdfff) and the
   codepoints 0xfffe and 0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
        const unsigned char *cur;           /* next byte to examine               */
        const unsigned char *end;           /* points after last byte             */

        if (end_pos < utf_ptr)
                return false;

        cur = (const unsigned char *) utf_ptr;
        end = (const unsigned char *) end_pos;

        while (cur < end) {
                unsigned int  lead = *cur++;    /* first byte of the sequence     */
                unsigned long value;            /* decoded character              */
                int           trail;            /* number of continuation bytes   */
                int           k;

                if (lead == 0x00)
                        return false;               /* 0x00 is not allowed        */

                if (lead < 0x80)
                        continue;                   /* ASCII                      */

                switch (lead & 0xf0) {
                case 0xc0:
                case 0xd0:
                        trail = 1;                  /* 110x xxxx                  */
                        break;

                case 0xe0:
                        trail = 2;                  /* 1110 xxxx                  */
                        break;

                default:
                        /* Either a stray continuation byte (10xx xxxx), a
                           leading byte of a sequence longer than three bytes
                           (Java limitation) or 0xfe/0xff. */

                        return false;
                }

                if ((end - cur) < trail)
                        return false;               /* missing bytes              */

                /* payload bits of the leading byte */

                value = lead & (0x3fu >> trail);

                for (k = 0; k < trail; k++) {
                        unsigned int next = *cur++;

                        if ((next & 0xc0) != 0x80)  /* 10xx xxxx                  */
                                return false;

                        value = (value << 6) | (next & 0x3f);
                }

                /* Java special: zero has to be encoded with two bytes */

                if ((value == 0) && (trail != 1))
                        return false;
        }

        return true;
}
1788
1789
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two byte encoding 0xc0 0x80 of the zero
   character.)

   NOTE: The string is assumed to have passed is_valid_utf! The check
   for an encoded zero looks at the byte following a 0xc0 without
   testing it against end_pos.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
        const unsigned char *cur = (const unsigned char *) utf_ptr;
        const unsigned char *end = (const unsigned char *) end_pos;

        /* disallow empty names */

        if (cur >= end)
                return false;

        for (; cur < end; cur++) {
                /* disallow control characters */

                if (cur[0] < 0x20)
                        return false;

                /* disallow zero */

                if ((cur[0] == 0xc0) && (cur[1] == 0x80))
                        return false;
        }

        return true;
}
1817
1818 bool is_valid_name_utf(utf *u)
1819 {
1820         return is_valid_name(u->text, UTF_END(u));
1821 }
1822
1823
1824 /* utf_show ********************************************************************
1825
1826    Writes the utf symbols in the utfhash to stdout and displays the
1827    number of external hash chains grouped according to the chainlength
1828    (for debugging purposes).
1829
1830 *******************************************************************************/
1831
1832 #if !defined(NDEBUG)
1833 void utf_show(void)
1834 {
1835
1836 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1837
1838         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1839         u4 max_chainlength = 0;      /* maximum length of the chains */
1840         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1841         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1842         u4 i;
1843
1844         printf("UTF-HASH:\n");
1845
1846         /* show element of utf-hashtable */
1847
1848         for (i = 0; i < hashtable_utf->size; i++) {
1849                 utf *u = hashtable_utf->ptr[i];
1850
1851                 if (u) {
1852                         printf("SLOT %d: ", (int) i);
1853
1854                         while (u) {
1855                                 printf("'");
1856                                 utf_display_printable_ascii(u);
1857                                 printf("' ");
1858                                 u = u->hashlink;
1859                         }
1860                         printf("\n");
1861                 }
1862         }
1863
1864         printf("UTF-HASH: %d slots for %d entries\n",
1865                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1866
1867         if (hashtable_utf->entries == 0)
1868                 return;
1869
1870         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1871
1872         for (i=0;i<CHAIN_LIMIT;i++)
1873                 chain_count[i]=0;
1874
1875         /* count numbers of hashchains according to their length */
1876         for (i=0; i<hashtable_utf->size; i++) {
1877
1878                 utf *u = (utf*) hashtable_utf->ptr[i];
1879                 u4 chain_length = 0;
1880
1881                 /* determine chainlength */
1882                 while (u) {
1883                         u = u->hashlink;
1884                         chain_length++;
1885                 }
1886
1887                 /* update sum of all chainlengths */
1888                 sum_chainlength+=chain_length;
1889
1890                 /* determine the maximum length of the chains */
1891                 if (chain_length>max_chainlength)
1892                         max_chainlength = chain_length;
1893
1894                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1895                 if (chain_length>=CHAIN_LIMIT) {
1896                         beyond_limit+=chain_length;
1897                         chain_length=CHAIN_LIMIT-1;
1898                 }
1899
1900                 /* update number of hashchains of current length */
1901                 chain_count[chain_length]++;
1902         }
1903
1904         /* display results */
1905         for (i=1;i<CHAIN_LIMIT-1;i++)
1906                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1907
1908         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1909
1910
1911         printf("max. chainlength:%5d\n",max_chainlength);
1912
1913         /* avg. chainlength = sum of chainlengths / number of chains */
1914         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1915 }
1916 #endif /* !defined(NDEBUG) */
1917
1918
1919 /*
1920  * These are local overrides for various environment variables in Emacs.
1921  * Please do not remove this and leave it at the end of the file, where
1922  * Emacs will automagically detect them.
1923  * ---------------------------------------------------------------------
1924  * Local variables:
1925  * mode: c
1926  * indent-tabs-mode: t
1927  * c-basic-offset: 4
1928  * tab-width: 4
1929  * End:
1930  * vim:noexpandtab:sw=4:ts=4:
1931  */