src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25    $Id: utf8.c 8132 2007-06-22 11:15:47Z twisti $
  26
  27 */
  28
  29
  30 #include "config.h"
  31
  32 #include <string.h>
  33 #include <assert.h>
  34
  35 #include "vm/types.h"
  36
  37 #include "mm/memory.h"
  38
  39 #include "threads/lock-common.h"
  40
  41 #include "toolbox/hashtable.h"
  42
  43 #include "vm/exceptions.h"
  44
  45 #include "vmcore/options.h"
  46
  47 #if defined(ENABLE_STATISTICS)
  48 # include "vmcore/statistics.h"
  49 #endif
  50
  51 #include "vmcore/utf8.h"
  52
  53
  54 /* global variables ***********************************************************/
  55
  56 /* hashsize must be power of 2 */
  57
  58 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  59
  60 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  61
  62
  63 /* utf-symbols for pointer comparison of frequently used strings **************/
  64
  65 utf *utf_java_lang_Object;
  66
  67 utf *utf_java_lang_Class;
  68 utf *utf_java_lang_ClassLoader;
  69 utf *utf_java_lang_Cloneable;
  70 utf *utf_java_lang_SecurityManager;
  71 utf *utf_java_lang_String;
  72 utf *utf_java_lang_System;
  73 utf *utf_java_lang_ThreadGroup;
  74 utf *utf_java_lang_ref_SoftReference;
  75 utf *utf_java_lang_ref_WeakReference;
  76 utf *utf_java_lang_ref_PhantomReference;
  77 utf *utf_java_io_Serializable;
  78
  79 utf *utf_java_lang_Throwable;
  80 utf *utf_java_lang_Error;
  81
  82 utf *utf_java_lang_AbstractMethodError;
  83 utf *utf_java_lang_ClassCircularityError;
  84 utf *utf_java_lang_ClassFormatError;
  85 utf *utf_java_lang_ExceptionInInitializerError;
  86 utf *utf_java_lang_IncompatibleClassChangeError;
  87 utf *utf_java_lang_InstantiationError;
  88 utf *utf_java_lang_InternalError;
  89 utf *utf_java_lang_LinkageError;
  90 utf *utf_java_lang_NoClassDefFoundError;
  91 utf *utf_java_lang_NoSuchFieldError;
  92 utf *utf_java_lang_NoSuchMethodError;
  93 utf *utf_java_lang_OutOfMemoryError;
  94 utf *utf_java_lang_UnsatisfiedLinkError;
  95 utf *utf_java_lang_UnsupportedClassVersionError;
  96 utf *utf_java_lang_VerifyError;
  97 utf *utf_java_lang_VirtualMachineError;
  98
  99 #if defined(WITH_CLASSPATH_GNU)
 100 utf *utf_java_lang_VMThrowable;
 101 #endif
 102
 103 utf *utf_java_lang_Exception;
 104
 105 utf *utf_java_lang_ArithmeticException;
 106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
 107 utf *utf_java_lang_ArrayStoreException;
 108 utf *utf_java_lang_ClassCastException;
 109 utf *utf_java_lang_ClassNotFoundException;
 110 utf *utf_java_lang_CloneNotSupportedException;
 111 utf *utf_java_lang_IllegalAccessException;
 112 utf *utf_java_lang_IllegalArgumentException;
 113 utf *utf_java_lang_IllegalMonitorStateException;
 114 utf *utf_java_lang_InstantiationException;
 115 utf *utf_java_lang_InterruptedException;
 116 utf *utf_java_lang_NegativeArraySizeException;
 117 utf *utf_java_lang_NullPointerException;
 118 utf *utf_java_lang_StringIndexOutOfBoundsException;
 119
 120 utf *utf_java_lang_reflect_InvocationTargetException;
 121
 122 utf *utf_java_security_PrivilegedActionException;
 123
 124 #if defined(ENABLE_JAVASE)
 125 utf* utf_java_lang_Void;
 126 #endif
 127
 128 utf* utf_java_lang_Boolean;
 129 utf* utf_java_lang_Byte;
 130 utf* utf_java_lang_Character;
 131 utf* utf_java_lang_Short;
 132 utf* utf_java_lang_Integer;
 133 utf* utf_java_lang_Long;
 134 utf* utf_java_lang_Float;
 135 utf* utf_java_lang_Double;
 136
 137 #if defined(ENABLE_JAVASE)
 138 utf *utf_java_lang_StackTraceElement;
 139 utf *utf_java_lang_reflect_Constructor;
 140 utf *utf_java_lang_reflect_Field;
 141 utf *utf_java_lang_reflect_Method;
 142 utf *utf_java_util_Vector;
 143 #endif
 144
 145 utf *utf_InnerClasses;                  /* InnerClasses                       */
 146 utf *utf_ConstantValue;                 /* ConstantValue                      */
 147 utf *utf_Code;                          /* Code                               */
 148 utf *utf_Exceptions;                    /* Exceptions                         */
 149 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 150 utf *utf_SourceFile;                    /* SourceFile                         */
 151
 152 #if defined(ENABLE_JAVASE)
 153 utf *utf_EnclosingMethod;
 154 utf *utf_Signature;
 155 utf *utf_RuntimeVisibleAnnotations;
 156 utf *utf_StackMapTable;
 157 #endif
 158
 159 utf *utf_init;                          /* <init>                             */
 160 utf *utf_clinit;                        /* <clinit>                           */
 161 utf *utf_clone;                         /* clone                              */
 162 utf *utf_finalize;                      /* finalize                           */
 163 utf *utf_run;                           /* run                                */
 164
 165 utf *utf_add;
 166 utf *utf_remove;
 167 utf *utf_addThread;
 168 utf *utf_removeThread;
 169 utf *utf_put;
 170 utf *utf_get;
 171 utf *utf_value;
 172
 173 utf *utf_fillInStackTrace;
 174 utf *utf_findNative;
 175 utf *utf_getSystemClassLoader;
 176 utf *utf_initCause;
 177 utf *utf_loadClass;
 178 utf *utf_printStackTrace;
 179
 180 utf *utf_division_by_zero;
 181
 182 utf *utf_Z;                             /* Z                                  */
 183 utf *utf_B;                             /* B                                  */
 184 utf *utf_C;                             /* C                                  */
 185 utf *utf_S;                             /* S                                  */
 186 utf *utf_I;                             /* I                                  */
 187 utf *utf_J;                             /* J                                  */
 188 utf *utf_F;                             /* F                                  */
 189 utf *utf_D;                             /* D                                  */
 190
 191 utf *utf_void__void;                    /* ()V                                */
 192 utf *utf_boolean__void;                 /* (Z)V                               */
 193 utf *utf_byte__void;                    /* (B)V                               */
 194 utf *utf_char__void;                    /* (C)V                               */
 195 utf *utf_short__void;                   /* (S)V                               */
 196 utf *utf_int__void;                     /* (I)V                               */
 197 utf *utf_long__void;                    /* (J)V                               */
 198 utf *utf_float__void;                   /* (F)V                               */
 199 utf *utf_double__void;                  /* (D)V                               */
 200
 201 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 202 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 203 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 204 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 205 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 206 utf *utf_java_lang_Object__java_lang_Object;
 207 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 208 utf *utf_java_lang_String__java_lang_Class;
 209 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 210 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 211 utf *utf_java_lang_Throwable__java_lang_Throwable;
 212
 213 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 214 utf *utf_null;
 215 utf *array_packagename;
 216
 217
 218 /* utf_init ********************************************************************
 219
 220    Initializes the utf8 subsystem.
 221
 222 *******************************************************************************/
 223
 224 bool utf8_init(void)
 225 {
 226         /* create utf8 hashtable */
 227
 228         hashtable_utf = NEW(hashtable);
 229
 230         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 231
 232 #if defined(ENABLE_STATISTICS)
 233         if (opt_stat)
 234                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 235 #endif
 236
 237         /* create utf-symbols for pointer comparison of frequently used strings */
 238
 239         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 240
 241         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 242         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 243         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 244         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 245         utf_java_lang_String           = utf_new_char("java/lang/String");
 246         utf_java_lang_System           = utf_new_char("java/lang/System");
 247         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 248
 249         utf_java_lang_ref_SoftReference =
 250                 utf_new_char("java/lang/ref/SoftReference");
 251
 252         utf_java_lang_ref_WeakReference =
 253                 utf_new_char("java/lang/ref/WeakReference");
 254
 255         utf_java_lang_ref_PhantomReference =
 256                 utf_new_char("java/lang/ref/PhantomReference");
 257
 258         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 259
 260         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 261         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 262
 263         utf_java_lang_ClassCircularityError =
 264                 utf_new_char("java/lang/ClassCircularityError");
 265
 266         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 267
 268         utf_java_lang_ExceptionInInitializerError =
 269                 utf_new_char("java/lang/ExceptionInInitializerError");
 270
 271         utf_java_lang_IncompatibleClassChangeError =
 272                 utf_new_char("java/lang/IncompatibleClassChangeError");
 273
 274         utf_java_lang_InstantiationError =
 275                 utf_new_char("java/lang/InstantiationError");
 276
 277         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 278         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 279
 280         utf_java_lang_NoClassDefFoundError =
 281                 utf_new_char("java/lang/NoClassDefFoundError");
 282
 283         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 284
 285         utf_java_lang_UnsatisfiedLinkError =
 286                 utf_new_char("java/lang/UnsatisfiedLinkError");
 287
 288         utf_java_lang_UnsupportedClassVersionError =
 289                 utf_new_char("java/lang/UnsupportedClassVersionError");
 290
 291         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 292
 293         utf_java_lang_VirtualMachineError =
 294                 utf_new_char("java/lang/VirtualMachineError");
 295
 296 #if defined(ENABLE_JAVASE)
 297         utf_java_lang_AbstractMethodError =
 298                 utf_new_char("java/lang/AbstractMethodError");
 299
 300         utf_java_lang_NoSuchFieldError =
 301                 utf_new_char("java/lang/NoSuchFieldError");
 302
 303         utf_java_lang_NoSuchMethodError =
 304                 utf_new_char("java/lang/NoSuchMethodError");
 305 #endif
 306
 307 #if defined(WITH_CLASSPATH_GNU)
 308         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
 309 #endif
 310
 311         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 312
 313         utf_java_lang_ArithmeticException =
 314                 utf_new_char("java/lang/ArithmeticException");
 315
 316         utf_java_lang_ArrayIndexOutOfBoundsException =
 317                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 318
 319         utf_java_lang_ArrayStoreException =
 320                 utf_new_char("java/lang/ArrayStoreException");
 321
 322         utf_java_lang_ClassCastException =
 323                 utf_new_char("java/lang/ClassCastException");
 324
 325         utf_java_lang_ClassNotFoundException =
 326                 utf_new_char("java/lang/ClassNotFoundException");
 327
 328         utf_java_lang_CloneNotSupportedException =
 329                 utf_new_char("java/lang/CloneNotSupportedException");
 330
 331         utf_java_lang_IllegalAccessException =
 332                 utf_new_char("java/lang/IllegalAccessException");
 333
 334         utf_java_lang_IllegalArgumentException =
 335                 utf_new_char("java/lang/IllegalArgumentException");
 336
 337         utf_java_lang_IllegalMonitorStateException =
 338                 utf_new_char("java/lang/IllegalMonitorStateException");
 339
 340         utf_java_lang_InstantiationException =
 341                 utf_new_char("java/lang/InstantiationException");
 342
 343         utf_java_lang_InterruptedException =
 344                 utf_new_char("java/lang/InterruptedException");
 345
 346         utf_java_lang_NegativeArraySizeException =
 347                 utf_new_char("java/lang/NegativeArraySizeException");
 348
 349         utf_java_lang_NullPointerException =
 350                 utf_new_char("java/lang/NullPointerException");
 351
 352         utf_java_lang_StringIndexOutOfBoundsException =
 353                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 354
 355         utf_java_lang_reflect_InvocationTargetException =
 356                 utf_new_char("java/lang/reflect/InvocationTargetException");
 357
 358         utf_java_security_PrivilegedActionException =
 359                 utf_new_char("java/security/PrivilegedActionException");
 360
 361 #if defined(ENABLE_JAVASE)
 362         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 363 #endif
 364
 365         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 366         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 367         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 368         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 369         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 370         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 371         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 372         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 373
 374 #if defined(ENABLE_JAVASE)
 375         utf_java_lang_StackTraceElement =
 376                 utf_new_char("java/lang/StackTraceElement");
 377
 378         utf_java_lang_reflect_Constructor =
 379                 utf_new_char("java/lang/reflect/Constructor");
 380
 381         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 382         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 383         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 384 #endif
 385
 386         utf_InnerClasses               = utf_new_char("InnerClasses");
 387         utf_ConstantValue              = utf_new_char("ConstantValue");
 388         utf_Code                       = utf_new_char("Code");
 389         utf_Exceptions                 = utf_new_char("Exceptions");
 390         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 391         utf_SourceFile                 = utf_new_char("SourceFile");
 392
 393 #if defined(ENABLE_JAVASE)
 394         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 395         utf_Signature                  = utf_new_char("Signature");
 396         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
 397         utf_StackMapTable              = utf_new_char("StackMapTable");
 398 #endif
 399
 400         utf_init                           = utf_new_char("<init>");
 401         utf_clinit                         = utf_new_char("<clinit>");
 402         utf_clone                      = utf_new_char("clone");
 403         utf_finalize                   = utf_new_char("finalize");
 404         utf_run                        = utf_new_char("run");
 405
 406         utf_add                        = utf_new_char("add");
 407         utf_remove                     = utf_new_char("remove");
 408         utf_addThread                  = utf_new_char("addThread");
 409         utf_removeThread               = utf_new_char("removeThread");
 410         utf_put                        = utf_new_char("put");
 411         utf_get                        = utf_new_char("get");
 412         utf_value                      = utf_new_char("value");
 413
 414         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 415         utf_findNative                 = utf_new_char("findNative");
 416         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 417         utf_initCause                  = utf_new_char("initCause");
 418         utf_loadClass                  = utf_new_char("loadClass");
 419         utf_printStackTrace            = utf_new_char("printStackTrace");
 420
 421         utf_division_by_zero           = utf_new_char("/ by zero");
 422
 423         utf_Z                          = utf_new_char("Z");
 424         utf_B                          = utf_new_char("B");
 425         utf_C                          = utf_new_char("C");
 426         utf_S                          = utf_new_char("S");
 427         utf_I                          = utf_new_char("I");
 428         utf_J                          = utf_new_char("J");
 429         utf_F                          = utf_new_char("F");
 430         utf_D                          = utf_new_char("D");
 431
 432         utf_void__void                 = utf_new_char("()V");
 433         utf_boolean__void              = utf_new_char("(Z)V");
 434         utf_byte__void                 = utf_new_char("(B)V");
 435         utf_char__void                 = utf_new_char("(C)V");
 436         utf_short__void                = utf_new_char("(S)V");
 437         utf_int__void                  = utf_new_char("(I)V");
 438         utf_long__void                 = utf_new_char("(J)V");
 439         utf_float__void                = utf_new_char("(F)V");
 440         utf_double__void               = utf_new_char("(D)V");
 441         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 442         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 443
 444         utf_void__java_lang_ClassLoader =
 445                 utf_new_char("()Ljava/lang/ClassLoader;");
 446
 447         utf_java_lang_ClassLoader_java_lang_String__J =
 448                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 449
 450         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 451
 452         utf_java_lang_Object__java_lang_Object =
 453                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 454
 455         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 456
 457         utf_java_lang_String__java_lang_Class =
 458                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 459
 460         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 461         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 462
 463         utf_java_lang_Throwable__java_lang_Throwable =
 464                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 465
 466         utf_null                       = utf_new_char("null");
 467         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 468         array_packagename              = utf_new_char("\t<the array package>");
 469
 470         /* everything's ok */
 471
 472         return true;
 473 }
 474
 475
 476 /* utf_hashkey *****************************************************************
 477
 478    The hashkey is computed from the utf-text by using up to 8
 479    characters.  For utf-symbols longer than 15 characters 3 characters
 480    are taken from the beginning and the end, 2 characters are taken
 481    from the middle.
 482
 483 *******************************************************************************/
 484
 485 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 486 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 487
 488 u4 utf_hashkey(const char *text, u4 length)
 489 {
 490         const char *start_pos = text;       /* pointer to utf text                */
 491         u4 a;
 492
 493         switch (length) {
 494         case 0: /* empty string */
 495                 return 0;
 496
 497         case 1: return fbs(0);
 498         case 2: return fbs(0) ^ nbs(3);
 499         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 500         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 501         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 502         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 503         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 504         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 505
 506         case 9:
 507                 a = fbs(0);
 508                 a ^= nbs(1);
 509                 a ^= nbs(2);
 510                 text++;
 511                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 512
 513         case 10:
 514                 a = fbs(0);
 515                 text++;
 516                 a ^= nbs(2);
 517                 a ^= nbs(3);
 518                 a ^= nbs(4);
 519                 text++;
 520                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 521
 522         case 11:
 523                 a = fbs(0);
 524                 text++;
 525                 a ^= nbs(2);
 526                 a ^= nbs(3);
 527                 a ^= nbs(4);
 528                 text++;
 529                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 530
 531         case 12:
 532                 a = fbs(0);
 533                 text += 2;
 534                 a ^= nbs(2);
 535                 a ^= nbs(3);
 536                 text++;
 537                 a ^= nbs(5);
 538                 a ^= nbs(6);
 539                 a ^= nbs(7);
 540                 text++;
 541                 return a ^ nbs(9) ^ nbs(10);
 542
 543         case 13:
 544                 a = fbs(0);
 545                 a ^= nbs(1);
 546                 text++;
 547                 a ^= nbs(3);
 548                 a ^= nbs(4);
 549                 text += 2;
 550                 a ^= nbs(7);
 551                 a ^= nbs(8);
 552                 text += 2;
 553                 return a ^ nbs(9) ^ nbs(10);
 554
 555         case 14:
 556                 a = fbs(0);
 557                 text += 2;
 558                 a ^= nbs(3);
 559                 a ^= nbs(4);
 560                 text += 2;
 561                 a ^= nbs(7);
 562                 a ^= nbs(8);
 563                 text += 2;
 564                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 565
 566         case 15:
 567                 a = fbs(0);
 568                 text += 2;
 569                 a ^= nbs(3);
 570                 a ^= nbs(4);
 571                 text += 2;
 572                 a ^= nbs(7);
 573                 a ^= nbs(8);
 574                 text += 2;
 575                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 576
 577         default:  /* 3 characters from beginning */
 578                 a = fbs(0);
 579                 text += 2;
 580                 a ^= nbs(3);
 581                 a ^= nbs(4);
 582
 583                 /* 2 characters from middle */
 584                 text = start_pos + (length / 2);
 585                 a ^= fbs(5);
 586                 text += 2;
 587                 a ^= nbs(6);
 588
 589                 /* 3 characters from end */
 590                 text = start_pos + length - 4;
 591
 592                 a ^= fbs(7);
 593                 text++;
 594
 595                 return a ^ nbs(10) ^ nbs(11);
 596     }
 597 }
 598
 599 /* utf_full_hashkey ************************************************************
 600
 601    This function computes a hash value using all bytes in the string.
 602
 603    The algorithm is the "One-at-a-time" algorithm as published
 604    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 605
 606 *******************************************************************************/
 607
 608 u4 utf_full_hashkey(const char *text, u4 length)
 609 {
 610         register const unsigned char *p = (const unsigned char *) text;
 611         register u4 hash;
 612         register u4 i;
 613
 614         hash = 0;
 615         for (i=length; i--;)
 616         {
 617             hash += *p++;
 618             hash += (hash << 10);
 619             hash ^= (hash >> 6);
 620         }
 621         hash += (hash << 3);
 622         hash ^= (hash >> 11);
 623         hash += (hash << 15);
 624
 625         return hash;
 626 }
 627
 628 /* unicode_hashkey *************************************************************
 629
 630    Compute the hashkey of a unicode string.
 631
 632 *******************************************************************************/
 633
 634 u4 unicode_hashkey(u2 *text, u2 len)
 635 {
 636         return utf_hashkey((char *) text, len);
 637 }
 638
 639
 640 /* utf_new *********************************************************************
 641
 642    Creates a new utf-symbol, the text of the symbol is passed as a
 643    u1-array. The function searches the utf-hashtable for a utf-symbol
 644    with this text. On success the element returned, otherwise a new
 645    hashtable element is created.
 646
 647    If the number of entries in the hashtable exceeds twice the size of
 648    the hashtable slots a reorganization of the hashtable is done and
 649    the utf symbols are copied to a new hashtable with doubled size.
 650
 651 *******************************************************************************/
 652
 653 utf *utf_new(const char *text, u2 length)
 654 {
 655         u4 key;                             /* hashkey computed from utf-text     */
 656         u4 slot;                            /* slot in hashtable                  */
 657         utf *u;                             /* hashtable element                  */
 658         u2 i;
 659
 660         LOCK_MONITOR_ENTER(hashtable_utf->header);
 661
 662 #if defined(ENABLE_STATISTICS)
 663         if (opt_stat)
 664                 count_utf_new++;
 665 #endif
 666
 667         key  = utf_hashkey(text, length);
 668         slot = key & (hashtable_utf->size - 1);
 669         u    = hashtable_utf->ptr[slot];
 670
 671         /* search external hash chain for utf-symbol */
 672
 673         while (u) {
 674                 if (u->blength == length) {
 675                         /* compare text of hashtable elements */
 676
 677                         for (i = 0; i < length; i++)
 678                                 if (text[i] != u->text[i])
 679                                         goto nomatch;
 680
 681 #if defined(ENABLE_STATISTICS)
 682                         if (opt_stat)
 683                                 count_utf_new_found++;
 684 #endif
 685
 686                         /* symbol found in hashtable */
 687
 688                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 689
 690                         return u;
 691                 }
 692
 693         nomatch:
 694                 u = u->hashlink; /* next element in external chain */
 695         }
 696
 697         /* location in hashtable found, create new utf element */
 698
 699         u = NEW(utf);
 700
 701         u->blength  = length;               /* length in bytes of utfstring       */
 702         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 703         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 704
 705         memcpy(u->text, text, length);      /* copy utf-text                      */
 706         u->text[length] = '\0';
 707
 708 #if defined(ENABLE_STATISTICS)
 709         if (opt_stat)
 710                 count_utf_len += sizeof(utf) + length + 1;
 711 #endif
 712
 713         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 714         hashtable_utf->entries++;           /* update number of entries           */
 715
 716         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 717
 718         /* reorganization of hashtable, average length of the external
 719            chains is approx. 2 */
 720
 721                 hashtable *newhash;                              /* the new hashtable */
 722                 u4         i;
 723                 utf       *u;
 724                 utf       *nextu;
 725                 u4         slot;
 726
 727                 /* create new hashtable, double the size */
 728
 729                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 730
 731 #if defined(ENABLE_STATISTICS)
 732                 if (opt_stat)
 733                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 734 #endif
 735
 736                 /* transfer elements to new hashtable */
 737
 738                 for (i = 0; i < hashtable_utf->size; i++) {
 739                         u = hashtable_utf->ptr[i];
 740
 741                         while (u) {
 742                                 nextu = u->hashlink;
 743                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 744
 745                                 u->hashlink = (utf *) newhash->ptr[slot];
 746                                 newhash->ptr[slot] = u;
 747
 748                                 /* follow link in external hash chain */
 749
 750                                 u = nextu;
 751                         }
 752                 }
 753
 754                 /* dispose old table */
 755
 756                 hashtable_free(hashtable_utf);
 757
 758                 hashtable_utf = newhash;
 759         }
 760
 761         LOCK_MONITOR_EXIT(hashtable_utf->header);
 762
 763         return u;
 764 }
 765
 766
 767 /* utf_new_u2 ******************************************************************
 768
 769    Make utf symbol from u2 array, if isclassname is true '.' is
 770    replaced by '/'.
 771
 772 *******************************************************************************/
 773
 774 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 775 {
 776         char *buffer;                   /* memory buffer for  unicode characters  */
 777         char *pos;                      /* pointer to current position in buffer  */
 778         u4 left;                        /* unicode characters left                */
 779         u4 buflength;                   /* utf length in bytes of the u2 array    */
 780         utf *result;                    /* resulting utf-string                   */
 781         int i;
 782
 783         /* determine utf length in bytes and allocate memory */
 784
 785         buflength = u2_utflength(unicode_pos, unicode_length);
 786         buffer    = MNEW(char, buflength);
 787
 788         left = buflength;
 789         pos  = buffer;
 790
 791         for (i = 0; i++ < unicode_length; unicode_pos++) {
 792                 /* next unicode character */
 793                 u2 c = *unicode_pos;
 794
 795                 if ((c != 0) && (c < 0x80)) {
 796                         /* 1 character */
 797                         left--;
 798                 if ((int) left < 0) break;
 799                         /* convert classname */
 800                         if (isclassname && c == '.')
 801                                 *pos++ = '/';
 802                         else
 803                                 *pos++ = (char) c;
 804
 805                 } else if (c < 0x800) {
 806                         /* 2 characters */
 807                 unsigned char high = c >> 6;
 808                 unsigned char low  = c & 0x3F;
 809                         left = left - 2;
 810                 if ((int) left < 0) break;
 811                 *pos++ = high | 0xC0;
 812                 *pos++ = low  | 0x80;
 813
 814                 } else {
 815                 /* 3 characters */
 816                 char low  = c & 0x3f;
 817                 char mid  = (c >> 6) & 0x3F;
 818                 char high = c >> 12;
 819                         left = left - 3;
 820                 if ((int) left < 0) break;
 821                 *pos++ = high | 0xE0;
 822                 *pos++ = mid  | 0x80;
 823                 *pos++ = low  | 0x80;
 824                 }
 825         }
 826
 827         /* insert utf-string into symbol-table */
 828         result = utf_new(buffer,buflength);
 829
 830         MFREE(buffer, char, buflength);
 831
 832         return result;
 833 }
 834
 835
 836 /* utf_new_char ****************************************************************
 837
 838    Creates a new utf symbol, the text for this symbol is passed as a
 839    c-string ( = char* ).
 840
 841 *******************************************************************************/
 842
 843 utf *utf_new_char(const char *text)
 844 {
 845         return utf_new(text, strlen(text));
 846 }
 847
 848
 849 /* utf_new_char_classname ******************************************************
 850
 851    Creates a new utf symbol, the text for this symbol is passed as a
 852    c-string ( = char* ) "." characters are going to be replaced by
 853    "/". Since the above function is used often, this is a separte
 854    function, instead of an if.
 855
 856 *******************************************************************************/
 857
 858 utf *utf_new_char_classname(const char *text)
 859 {
 860         if (strchr(text, '.')) {
 861                 char *txt = strdup(text);
 862                 char *end = txt + strlen(txt);
 863                 char *c;
 864                 utf *tmpRes;
 865
 866                 for (c = txt; c < end; c++)
 867                         if (*c == '.') *c = '/';
 868
 869                 tmpRes = utf_new(txt, strlen(txt));
 870                 FREE(txt, 0);
 871
 872                 return tmpRes;
 873
 874         } else
 875                 return utf_new(text, strlen(text));
 876 }
 877
 878
 879 /* utf_nextu2 ******************************************************************
 880
 881    Read the next unicode character from the utf string and increment
 882    the utf-string pointer accordingly.
 883
 884    CAUTION: This function is unsafe for input that was not checked
 885             by is_valid_utf!
 886
 887 *******************************************************************************/
 888
 889 u2 utf_nextu2(char **utf_ptr)
 890 {
 891     /* uncompressed unicode character */
 892     u2 unicode_char = 0;
 893     /* current position in utf text */
 894     unsigned char *utf = (unsigned char *) (*utf_ptr);
 895     /* bytes representing the unicode character */
 896     unsigned char ch1, ch2, ch3;
 897     /* number of bytes used to represent the unicode character */
 898     int len = 0;
 899
 900     switch ((ch1 = utf[0]) >> 4) {
 901         default: /* 1 byte */
 902                 (*utf_ptr)++;
 903                 return (u2) ch1;
 904         case 0xC:
 905         case 0xD: /* 2 bytes */
 906                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 907                         unsigned char high = ch1 & 0x1F;
 908                         unsigned char low  = ch2 & 0x3F;
 909                         unicode_char = (high << 6) + low;
 910                         len = 2;
 911                 }
 912                 break;
 913
 914         case 0xE: /* 2 or 3 bytes */
 915                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 916                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 917                                 unsigned char low  = ch3 & 0x3f;
 918                                 unsigned char mid  = ch2 & 0x3f;
 919                                 unsigned char high = ch1 & 0x0f;
 920                                 unicode_char = (((high << 6) + mid) << 6) + low;
 921                                 len = 3;
 922                         } else
 923                                 len = 2;
 924                 }
 925                 break;
 926     }
 927
 928     /* update position in utf-text */
 929     *utf_ptr = (char *) (utf + len);
 930
 931     return unicode_char;
 932 }
 933
 934
 935 /* utf_bytes *******************************************************************
 936
 937    Determine number of bytes (aka. octets) in the utf string.
 938
 939    IN:
 940       u............utf string
 941
 942    OUT:
 943       The number of octets of this utf string.
 944           There is _no_ terminating zero included in this count.
 945
 946 *******************************************************************************/
 947
 948 u4 utf_bytes(utf *u)
 949 {
 950         return u->blength;
 951 }
 952
 953
 954 /* utf_get_number_of_u2s_for_buffer ********************************************
 955
 956    Determine number of UTF-16 u2s in the given UTF-8 buffer
 957
 958    CAUTION: This function is unsafe for input that was not checked
 959             by is_valid_utf!
 960
 961    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 962    to an array of u2s (UTF-16) and want to know how many of them you will get.
 963    All other uses of this function are probably wrong.
 964
 965    IN:
 966       buffer........points to first char in buffer
 967           blength.......number of _bytes_ in the buffer
 968
 969    OUT:
 970       the number of u2s needed to hold this string in UTF-16 encoding.
 971           There is _no_ terminating zero included in this count.
 972
 973    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 974    exception.
 975
 976 *******************************************************************************/
 977
 978 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 979 {
 980         const char *endpos;                 /* points behind utf string           */
 981         const char *utf_ptr;                /* current position in utf text       */
 982         u4 len = 0;                         /* number of unicode characters       */
 983
 984         utf_ptr = buffer;
 985         endpos = utf_ptr + blength;
 986
 987         while (utf_ptr < endpos) {
 988                 len++;
 989                 /* next unicode character */
 990                 utf_nextu2((char **)&utf_ptr);
 991         }
 992
 993         assert(utf_ptr == endpos);
 994
 995         return len;
 996 }
 997
 998
 999 /* utf_get_number_of_u2s *******************************************************
1000
1001    Determine number of UTF-16 u2s in the utf string.
1002
1003    CAUTION: This function is unsafe for input that was not checked
1004             by is_valid_utf!
1005
1006    CAUTION: Use this function *only* when you want to convert a utf string
1007    to an array of u2s and want to know how many of them you will get.
1008    All other uses of this function are probably wrong.
1009
1010    IN:
1011       u............utf string
1012
1013    OUT:
1014       the number of u2s needed to hold this string in UTF-16 encoding.
1015           There is _no_ terminating zero included in this count.
1016           XXX 0 if a NullPointerException has been thrown (see below)
1017
1018 *******************************************************************************/
1019
1020 u4 utf_get_number_of_u2s(utf *u)
1021 {
1022         char *endpos;                       /* points behind utf string           */
1023         char *utf_ptr;                      /* current position in utf text       */
1024         u4 len = 0;                         /* number of unicode characters       */
1025
1026         /* XXX this is probably not checked by most callers! Review this after */
1027         /* the invalid uses of this function have been eliminated */
1028         if (u == NULL) {
1029                 exceptions_throw_nullpointerexception();
1030                 return 0;
1031         }
1032
1033         endpos = UTF_END(u);
1034         utf_ptr = u->text;
1035
1036         while (utf_ptr < endpos) {
1037                 len++;
1038                 /* next unicode character */
1039                 utf_nextu2(&utf_ptr);
1040         }
1041
1042         if (utf_ptr != endpos) {
1043                 /* string ended abruptly */
1044                 exceptions_throw_internalerror("Illegal utf8 string");
1045                 return 0;
1046         }
1047
1048         return len;
1049 }
1050
1051
1052 /* utf8_safe_number_of_u2s *****************************************************
1053
1054    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1055    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1056
1057    This function is safe even for invalid UTF-8 strings.
1058
1059    IN:
1060       text..........zero-terminated(!) UTF-8 string (may be invalid)
1061                         must NOT be NULL
1062           nbytes........strlen(text). (This is needed to completely emulate
1063                         the RI).
1064
1065    OUT:
1066       the number of u2s needed to hold this string in UTF-16 encoding.
1067           There is _no_ terminating zero included in this count.
1068
1069 *******************************************************************************/
1070
1071 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1072         register const unsigned char *t;
1073         register s4 byte;
1074         register s4 len;
1075         register const unsigned char *tlimit;
1076         s4 byte1;
1077         s4 byte2;
1078         s4 byte3;
1079         s4 value;
1080         s4 skip;
1081
1082         assert(text);
1083         assert(nbytes >= 0);
1084
1085         len = 0;
1086         t = (const unsigned char *) text;
1087         tlimit = t + nbytes;
1088
1089         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1090
1091         while (1) {
1092                 byte = *t++;
1093
1094                 if (byte & 0x80) {
1095                         /* highest bit set, non-ASCII character */
1096
1097                         if ((byte & 0xe0) == 0xc0) {
1098                                 /* 2-byte: should be 110..... 10...... ? */
1099
1100                                 if ((*t++ & 0xc0) == 0x80)
1101                                         ; /* valid 2-byte */
1102                                 else
1103                                         t--; /* invalid */
1104                         }
1105                         else if ((byte & 0xf0) == 0xe0) {
1106                                 /* 3-byte: should be 1110.... 10...... 10...... */
1107                                 /*                            ^t                */
1108
1109                                 if (t + 2 > tlimit)
1110                                         return len + 1; /* invalid, stop here */
1111
1112                                 if ((*t++ & 0xc0) == 0x80) {
1113                                         if ((*t++ & 0xc0) == 0x80)
1114                                                 ; /* valid 3-byte */
1115                                         else
1116                                                 t--; /* invalid */
1117                                 }
1118                                 else
1119                                         t--; /* invalid */
1120                         }
1121                         else if ((byte & 0xf8) == 0xf0) {
1122                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1123                                 /*                            ^t                         */
1124
1125                                 if (t + 3 > tlimit)
1126                                         return len + 1; /* invalid, stop here */
1127
1128                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1129                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1130                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1131                                                         /* valid 4-byte UTF-8? */
1132                                                         value = ((byte  & 0x07) << 18)
1133                                                                   | ((byte1 & 0x3f) << 12)
1134                                                                   | ((byte2 & 0x3f) <<  6)
1135                                                                   | ((byte3 & 0x3f)      );
1136
1137                                                         if (value > 0x10FFFF)
1138                                                                 ; /* invalid */
1139                                                         else if (value > 0xFFFF)
1140                                                                 len += 1; /* we need surrogates */
1141                                                         else
1142                                                                 ; /* 16bit suffice */
1143                                                 }
1144                                                 else
1145                                                         t--; /* invalid */
1146                                         }
1147                                         else
1148                                                 t--; /* invalid */
1149                                 }
1150                                 else
1151                                         t--; /* invalid */
1152                         }
1153                         else if ((byte & 0xfc) == 0xf8) {
1154                                 /* invalid 5-byte */
1155                                 if (t + 4 > tlimit)
1156                                         return len + 1; /* invalid, stop here */
1157
1158                                 skip = 4;
1159                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1160                                         t++;
1161                         }
1162                         else if ((byte & 0xfe) == 0xfc) {
1163                                 /* invalid 6-byte */
1164                                 if (t + 5 > tlimit)
1165                                         return len + 1; /* invalid, stop here */
1166
1167                                 skip = 5;
1168                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1169                                         t++;
1170                         }
1171                         else
1172                                 ; /* invalid */
1173                 }
1174                 else {
1175                         /* NUL */
1176
1177                         if (byte == 0)
1178                                 break;
1179
1180                         /* ASCII character, common case */
1181                 }
1182
1183                 len++;
1184         }
1185
1186         return len;
1187 }
1188
1189
1190 /* utf8_safe_convert_to_u2s ****************************************************
1191
1192    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1193    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1194    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1195
1196    This function is safe even for invalid UTF-8 strings.
1197
1198    IN:
1199       text..........zero-terminated(!) UTF-8 string (may be invalid)
1200                         must NOT be NULL
1201           nbytes........strlen(text). (This is needed to completely emulate
1202                                         the RI).
1203           buffer........a preallocated array of u2s to receive the decoded
1204                         string. Use utf8_safe_number_of_u2s to get the
1205                                         required number of u2s for allocating this.
1206
1207 *******************************************************************************/
1208
1209 #define UNICODE_REPLACEMENT  0xfffd
1210
1211 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1212         register const unsigned char *t;
1213         register s4 byte;
1214         register const unsigned char *tlimit;
1215         s4 byte1;
1216         s4 byte2;
1217         s4 byte3;
1218         s4 value;
1219         s4 skip;
1220
1221         assert(text);
1222         assert(nbytes >= 0);
1223
1224         t = (const unsigned char *) text;
1225         tlimit = t + nbytes;
1226
1227         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1228
1229         while (1) {
1230                 byte = *t++;
1231
1232                 if (byte & 0x80) {
1233                         /* highest bit set, non-ASCII character */
1234
1235                         if ((byte & 0xe0) == 0xc0) {
1236                                 /* 2-byte: should be 110..... 10...... */
1237
1238                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1239                                         /* valid 2-byte UTF-8 */
1240                                         *buffer++ = ((byte  & 0x1f) << 6)
1241                                                           | ((byte1 & 0x3f)     );
1242                                 }
1243                                 else {
1244                                         *buffer++ = UNICODE_REPLACEMENT;
1245                                         t--;
1246                                 }
1247                         }
1248                         else if ((byte & 0xf0) == 0xe0) {
1249                                 /* 3-byte: should be 1110.... 10...... 10...... */
1250
1251                                 if (t + 2 > tlimit) {
1252                                         *buffer++ = UNICODE_REPLACEMENT;
1253                                         return;
1254                                 }
1255
1256                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1257                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1258                                                 /* valid 3-byte UTF-8 */
1259                                                 *buffer++ = ((byte  & 0x0f) << 12)
1260                                                                   | ((byte1 & 0x3f) <<  6)
1261                                                                   | ((byte2 & 0x3f)      );
1262                                         }
1263                                         else {
1264                                                 *buffer++ = UNICODE_REPLACEMENT;
1265                                                 t--;
1266                                         }
1267                                 }
1268                                 else {
1269                                         *buffer++ = UNICODE_REPLACEMENT;
1270                                         t--;
1271                                 }
1272                         }
1273                         else if ((byte & 0xf8) == 0xf0) {
1274                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1275
1276                                 if (t + 3 > tlimit) {
1277                                         *buffer++ = UNICODE_REPLACEMENT;
1278                                         return;
1279                                 }
1280
1281                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1282                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1283                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1284                                                         /* valid 4-byte UTF-8? */
1285                                                         value = ((byte  & 0x07) << 18)
1286                                                                   | ((byte1 & 0x3f) << 12)
1287                                                                   | ((byte2 & 0x3f) <<  6)
1288                                                                   | ((byte3 & 0x3f)      );
1289
1290                                                         if (value > 0x10FFFF) {
1291                                                                 *buffer++ = UNICODE_REPLACEMENT;
1292                                                         }
1293                                                         else if (value > 0xFFFF) {
1294                                                                 /* we need surrogates */
1295                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1296                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1297                                                         }
1298                                                         else
1299                                                                 *buffer++ = value; /* 16bit suffice */
1300                                                 }
1301                                                 else {
1302                                                         *buffer++ = UNICODE_REPLACEMENT;
1303                                                         t--;
1304                                                 }
1305                                         }
1306                                         else {
1307                                                 *buffer++ = UNICODE_REPLACEMENT;
1308                                                 t--;
1309                                         }
1310                                 }
1311                                 else {
1312                                         *buffer++ = UNICODE_REPLACEMENT;
1313                                         t--;
1314                                 }
1315                         }
1316                         else if ((byte & 0xfc) == 0xf8) {
1317                                 if (t + 4 > tlimit) {
1318                                         *buffer++ = UNICODE_REPLACEMENT;
1319                                         return;
1320                                 }
1321
1322                                 skip = 4;
1323                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1324                                         t++;
1325                                 *buffer++ = UNICODE_REPLACEMENT;
1326                         }
1327                         else if ((byte & 0xfe) == 0xfc) {
1328                                 if (t + 5 > tlimit) {
1329                                         *buffer++ = UNICODE_REPLACEMENT;
1330                                         return;
1331                                 }
1332
1333                                 skip = 5;
1334                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1335                                         t++;
1336                                 *buffer++ = UNICODE_REPLACEMENT;
1337                         }
1338                         else
1339                                 *buffer++ = UNICODE_REPLACEMENT;
1340                 }
1341                 else {
1342                         /* NUL */
1343
1344                         if (byte == 0)
1345                                 break;
1346
1347                         /* ASCII character, common case */
1348
1349                         *buffer++ = byte;
1350                 }
1351         }
1352 }
1353
1354
1355 /* u2_utflength ****************************************************************
1356
1357    Returns the utf length in bytes of a u2 array.
1358
1359 *******************************************************************************/
1360
1361 u4 u2_utflength(u2 *text, u4 u2_length)
1362 {
1363         u4 result_len = 0;                  /* utf length in bytes                */
1364         u2 ch;                              /* current unicode character          */
1365         u4 len;
1366
1367         for (len = 0; len < u2_length; len++) {
1368                 /* next unicode character */
1369                 ch = *text++;
1370
1371                 /* determine bytes required to store unicode character as utf */
1372                 if (ch && (ch < 0x80))
1373                         result_len++;
1374                 else if (ch < 0x800)
1375                         result_len += 2;
1376                 else
1377                         result_len += 3;
1378         }
1379
1380     return result_len;
1381 }
1382
1383
1384 /* utf_copy ********************************************************************
1385
1386    Copy the given utf string byte-for-byte to a buffer.
1387
1388    IN:
1389       buffer.......the buffer
1390           u............the utf string
1391
1392 *******************************************************************************/
1393
1394 void utf_copy(char *buffer, utf *u)
1395 {
1396         /* our utf strings are zero-terminated (done by utf_new) */
1397         MCOPY(buffer, u->text, char, u->blength + 1);
1398 }
1399
1400
1401 /* utf_cat *********************************************************************
1402
1403    Append the given utf string byte-for-byte to a buffer.
1404
1405    IN:
1406       buffer.......the buffer
1407           u............the utf string
1408
1409 *******************************************************************************/
1410
1411 void utf_cat(char *buffer, utf *u)
1412 {
1413         /* our utf strings are zero-terminated (done by utf_new) */
1414         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1415 }
1416
1417
1418 /* utf_copy_classname **********************************************************
1419
1420    Copy the given utf classname byte-for-byte to a buffer.
1421    '/' is replaced by '.'
1422
1423    IN:
1424       buffer.......the buffer
1425           u............the utf string
1426
1427 *******************************************************************************/
1428
1429 void utf_copy_classname(char *buffer, utf *u)
1430 {
1431         char *bufptr;
1432         char *srcptr;
1433         char *endptr;
1434         char ch;
1435
1436         bufptr = buffer;
1437         srcptr = u->text;
1438         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1439
1440         while (srcptr != endptr) {
1441                 ch = *srcptr++;
1442                 if (ch == '/')
1443                         ch = '.';
1444                 *bufptr++ = ch;
1445         }
1446 }
1447
1448
1449 /* utf_cat *********************************************************************
1450
1451    Append the given utf classname byte-for-byte to a buffer.
1452    '/' is replaced by '.'
1453
1454    IN:
1455       buffer.......the buffer
1456           u............the utf string
1457
1458 *******************************************************************************/
1459
1460 void utf_cat_classname(char *buffer, utf *u)
1461 {
1462         utf_copy_classname(buffer + strlen(buffer), u);
1463 }
1464
1465 /* utf_display_printable_ascii *************************************************
1466
1467    Write utf symbol to stdout (for debugging purposes).
1468    Non-printable and non-ASCII characters are printed as '?'.
1469
1470 *******************************************************************************/
1471
1472 void utf_display_printable_ascii(utf *u)
1473 {
1474         char *endpos;                       /* points behind utf string           */
1475         char *utf_ptr;                      /* current position in utf text       */
1476
1477         if (u == NULL) {
1478                 printf("NULL");
1479                 fflush(stdout);
1480                 return;
1481         }
1482
1483         endpos = UTF_END(u);
1484         utf_ptr = u->text;
1485
1486         while (utf_ptr < endpos) {
1487                 /* read next unicode character */
1488
1489                 u2 c = utf_nextu2(&utf_ptr);
1490
1491                 if ((c >= 32) && (c <= 127))
1492                         printf("%c", c);
1493                 else
1494                         printf("?");
1495         }
1496
1497         fflush(stdout);
1498 }
1499
1500
1501 /* utf_display_printable_ascii_classname ***************************************
1502
1503    Write utf symbol to stdout with `/' converted to `.' (for debugging
1504    purposes).
1505    Non-printable and non-ASCII characters are printed as '?'.
1506
1507 *******************************************************************************/
1508
1509 void utf_display_printable_ascii_classname(utf *u)
1510 {
1511         char *endpos;                       /* points behind utf string           */
1512         char *utf_ptr;                      /* current position in utf text       */
1513
1514         if (u == NULL) {
1515                 printf("NULL");
1516                 fflush(stdout);
1517                 return;
1518         }
1519
1520         endpos = UTF_END(u);
1521         utf_ptr = u->text;
1522
1523         while (utf_ptr < endpos) {
1524                 /* read next unicode character */
1525
1526                 u2 c = utf_nextu2(&utf_ptr);
1527
1528                 if (c == '/')
1529                         c = '.';
1530
1531                 if ((c >= 32) && (c <= 127))
1532                         printf("%c", c);
1533                 else
1534                         printf("?");
1535         }
1536
1537         fflush(stdout);
1538 }
1539
1540
1541 /* utf_sprint_convert_to_latin1 ************************************************
1542
1543    Write utf symbol into c-string (for debugging purposes).
1544    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1545    invalid results.
1546
1547 *******************************************************************************/
1548
1549 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1550 {
1551         char *endpos;                       /* points behind utf string           */
1552         char *utf_ptr;                      /* current position in utf text       */
1553         u2 pos = 0;                         /* position in c-string               */
1554
1555         if (!u) {
1556                 strcpy(buffer, "NULL");
1557                 return;
1558         }
1559
1560         endpos = UTF_END(u);
1561         utf_ptr = u->text;
1562
1563         while (utf_ptr < endpos)
1564                 /* copy next unicode character */
1565                 buffer[pos++] = utf_nextu2(&utf_ptr);
1566
1567         /* terminate string */
1568         buffer[pos] = '\0';
1569 }
1570
1571
1572 /* utf_sprint_convert_to_latin1_classname **************************************
1573
1574    Write utf symbol into c-string with `/' converted to `.' (for debugging
1575    purposes).
1576    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1577    invalid results.
1578
1579 *******************************************************************************/
1580
1581 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1582 {
1583         char *endpos;                       /* points behind utf string           */
1584         char *utf_ptr;                      /* current position in utf text       */
1585         u2 pos = 0;                         /* position in c-string               */
1586
1587         if (!u) {
1588                 strcpy(buffer, "NULL");
1589                 return;
1590         }
1591
1592         endpos = UTF_END(u);
1593         utf_ptr = u->text;
1594
1595         while (utf_ptr < endpos) {
1596                 /* copy next unicode character */
1597                 u2 c = utf_nextu2(&utf_ptr);
1598                 if (c == '/') c = '.';
1599                 buffer[pos++] = c;
1600         }
1601
1602         /* terminate string */
1603         buffer[pos] = '\0';
1604 }
1605
1606
1607 /* utf_strcat_convert_to_latin1 ************************************************
1608
1609    Like libc strcat, but uses an utf8 string.
1610    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1611    invalid results.
1612
1613 *******************************************************************************/
1614
1615 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1616 {
1617         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1618 }
1619
1620
1621 /* utf_strcat_convert_to_latin1_classname **************************************
1622
1623    Like libc strcat, but uses an utf8 string.
1624    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1625    invalid results.
1626
1627 *******************************************************************************/
1628
1629 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1630 {
1631         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1632 }
1633
1634
1635 /* utf_fprint_printable_ascii **************************************************
1636
1637    Write utf symbol into file.
1638    Non-printable and non-ASCII characters are printed as '?'.
1639
1640 *******************************************************************************/
1641
1642 void utf_fprint_printable_ascii(FILE *file, utf *u)
1643 {
1644         char *endpos;                       /* points behind utf string           */
1645         char *utf_ptr;                      /* current position in utf text       */
1646
1647         if (!u)
1648                 return;
1649
1650         endpos = UTF_END(u);
1651         utf_ptr = u->text;
1652
1653         while (utf_ptr < endpos) {
1654                 /* read next unicode character */
1655                 u2 c = utf_nextu2(&utf_ptr);
1656
1657                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1658                 else fprintf(file, "?");
1659         }
1660 }
1661
1662
1663 /* utf_fprint_printable_ascii_classname ****************************************
1664
1665    Write utf symbol into file with `/' converted to `.'.
1666    Non-printable and non-ASCII characters are printed as '?'.
1667
1668 *******************************************************************************/
1669
1670 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1671 {
1672         char *endpos;                       /* points behind utf string           */
1673         char *utf_ptr;                      /* current position in utf text       */
1674
1675     if (!u)
1676                 return;
1677
1678         endpos = UTF_END(u);
1679         utf_ptr = u->text;
1680
1681         while (utf_ptr < endpos) {
1682                 /* read next unicode character */
1683                 u2 c = utf_nextu2(&utf_ptr);
1684                 if (c == '/') c = '.';
1685
1686                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1687                 else fprintf(file, "?");
1688         }
1689 }
1690
1691
/* is_valid_utf ****************************************************************

   Return true if the byte range [utf_ptr, end_pos) is an UTF-8 string as
   accepted for Java classfiles:

   - a plain 0x00 byte is rejected; the character zero has to be encoded
     as the two-byte sequence 0xc0 0x80
   - only one-, two- and three-byte sequences are accepted; longer
     sequences are rejected (Java limitation)
   - every lead byte must be followed by the required number of
     continuation bytes (10xx xxxx) inside the range
   - a three-byte sequence encoding the value zero is rejected

   Deliberately NOT rejected, because Sun Java seems to allow them:
   overlong encodings, surrogates (0xd800..0xdfff) and the codepoints
   0xfffe and 0xffff.

   An empty range is valid; a range with end_pos < utf_ptr is not.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	if (end_pos < utf_ptr)
		return false;

	while (utf_ptr < end_pos) {
		unsigned char lead = (unsigned char) *utf_ptr++;
		unsigned long value;            /* decoded value of the sequence      */
		int           trailing;         /* number of continuation bytes       */
		int           k;

		if (lead == 0x00)
			return false;               /* 0x00 is not allowed                */

		if (lead < 0x80)
			continue;                   /* ASCII                              */

		if ((lead & 0xe0) == 0xc0) {    /* 110x xxxx */
			trailing = 1;
			value    = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0) {   /* 1110 xxxx */
			trailing = 2;
			value    = lead & 0x0f;
		}
		else {
			/* Either a lead byte of a sequence with four or more bytes
			   (not supported by Java) or no valid lead byte at all. */
			return false;
		}

		if ((end_pos - utf_ptr) < trailing)
			return false;               /* missing bytes                      */

		for (k = 0; k < trailing; k++) {
			unsigned char cont = (unsigned char) *utf_ptr++;

			if ((cont & 0xc0) != 0x80)  /* 10xx xxxx */
				return false;

			value = (value << 6) | (cont & 0x3f);
		}

		/* Java special: zero may only appear as the two-byte sequence */

		if ((value == 0) && (trailing != 1))
			return false;
	}

	return true;
}
1757
1758
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control characters
   below 0x20 and the encoded zero character 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are examined, so the result never
   depends on memory behind the string, even if the string ends with a
   dangling 0xc0 lead byte.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                   /* disallow empty names               */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;               /* disallow control characters        */

		/* disallow zero (encoded as 0xc0 0x80); the second byte is only
		   inspected when it still belongs to the string */

		if ((c == 0xc0) && (utf_ptr < end_pos)
			&& ((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1786
1787 bool is_valid_name_utf(utf *u)
1788 {
1789         return is_valid_name(u->text, UTF_END(u));
1790 }
1791
1792
1793 /* utf_show ********************************************************************
1794
1795    Writes the utf symbols in the utfhash to stdout and displays the
1796    number of external hash chains grouped according to the chainlength
1797    (for debugging purposes).
1798
1799 *******************************************************************************/
1800
1801 #if !defined(NDEBUG)
1802 void utf_show(void)
1803 {
1804
1805 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1806
1807         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1808         u4 max_chainlength = 0;      /* maximum length of the chains */
1809         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1810         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1811         u4 i;
1812
1813         printf("UTF-HASH:\n");
1814
1815         /* show element of utf-hashtable */
1816
1817         for (i = 0; i < hashtable_utf->size; i++) {
1818                 utf *u = hashtable_utf->ptr[i];
1819
1820                 if (u) {
1821                         printf("SLOT %d: ", (int) i);
1822
1823                         while (u) {
1824                                 printf("'");
1825                                 utf_display_printable_ascii(u);
1826                                 printf("' ");
1827                                 u = u->hashlink;
1828                         }
1829                         printf("\n");
1830                 }
1831         }
1832
1833         printf("UTF-HASH: %d slots for %d entries\n",
1834                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1835
1836         if (hashtable_utf->entries == 0)
1837                 return;
1838
1839         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1840
1841         for (i=0;i<CHAIN_LIMIT;i++)
1842                 chain_count[i]=0;
1843
1844         /* count numbers of hashchains according to their length */
1845         for (i=0; i<hashtable_utf->size; i++) {
1846
1847                 utf *u = (utf*) hashtable_utf->ptr[i];
1848                 u4 chain_length = 0;
1849
1850                 /* determine chainlength */
1851                 while (u) {
1852                         u = u->hashlink;
1853                         chain_length++;
1854                 }
1855
1856                 /* update sum of all chainlengths */
1857                 sum_chainlength+=chain_length;
1858
1859                 /* determine the maximum length of the chains */
1860                 if (chain_length>max_chainlength)
1861                         max_chainlength = chain_length;
1862
1863                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1864                 if (chain_length>=CHAIN_LIMIT) {
1865                         beyond_limit+=chain_length;
1866                         chain_length=CHAIN_LIMIT-1;
1867                 }
1868
1869                 /* update number of hashchains of current length */
1870                 chain_count[chain_length]++;
1871         }
1872
1873         /* display results */
1874         for (i=1;i<CHAIN_LIMIT-1;i++)
1875                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1876
1877         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1878
1879
1880         printf("max. chainlength:%5d\n",max_chainlength);
1881
1882         /* avg. chainlength = sum of chainlengths / number of chains */
1883         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1884 }
1885 #endif /* !defined(NDEBUG) */
1886
1887
1888 /*
1889  * These are local overrides for various environment variables in Emacs.
1890  * Please do not remove this and leave it at the end of the file, where
1891  * Emacs will automagically detect them.
1892  * ---------------------------------------------------------------------
1893  * Local variables:
1894  * mode: c
1895  * indent-tabs-mode: t
1896  * c-basic-offset: 4
1897  * tab-width: 4
1898  * End:
1899  * vim:noexpandtab:sw=4:ts=4:
1900  */