src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007, 2008
   4    CACAOVM - Verein zur Foerderung der freien virtuellen Maschine CACAO
   5
   6    This file is part of CACAO.
   7
   8    This program is free software; you can redistribute it and/or
   9    modify it under the terms of the GNU General Public License as
  10    published by the Free Software Foundation; either version 2, or (at
  11    your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful, but
  14    WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16    General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; if not, write to the Free Software
  20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  21    02110-1301, USA.
  22
  23 */
  24
  25
  26 #include "config.h"
  27
  28 #include <string.h>
  29 #include <assert.h>
  30
  31 #include "vm/types.h"
  32
  33 #include "mm/memory.h"
  34
  35 #include "threads/lock-common.h"
  36
  37 #include "toolbox/hashtable.h"
  38
  39 #include "vm/exceptions.h"
  40
  41 #include "vmcore/options.h"
  42
  43 #if defined(ENABLE_STATISTICS)
  44 # include "vmcore/statistics.h"
  45 #endif
  46
  47 #include "vmcore/utf8.h"
  48
  49
  50 /* global variables ***********************************************************/
  51
  52 /* hashsize must be power of 2 */
  53
  54 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  55
  56 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  57
  58
  59 /* utf-symbols for pointer comparison of frequently used strings **************/
  60
  61 utf *utf_java_lang_Object;
  62
  63 utf *utf_java_lang_Class;
  64 utf *utf_java_lang_ClassLoader;
  65 utf *utf_java_lang_Cloneable;
  66 utf *utf_java_lang_SecurityManager;
  67 utf *utf_java_lang_String;
  68 utf *utf_java_lang_ThreadGroup;
  69 utf *utf_java_lang_ref_SoftReference;
  70 utf *utf_java_lang_ref_WeakReference;
  71 utf *utf_java_lang_ref_PhantomReference;
  72 utf *utf_java_io_Serializable;
  73
  74 utf *utf_java_lang_Throwable;
  75 utf *utf_java_lang_Error;
  76
  77 utf *utf_java_lang_AbstractMethodError;
  78 utf *utf_java_lang_ClassCircularityError;
  79 utf *utf_java_lang_ClassFormatError;
  80 utf *utf_java_lang_ExceptionInInitializerError;
  81 utf *utf_java_lang_IncompatibleClassChangeError;
  82 utf *utf_java_lang_InstantiationError;
  83 utf *utf_java_lang_InternalError;
  84 utf *utf_java_lang_LinkageError;
  85 utf *utf_java_lang_NoClassDefFoundError;
  86 utf *utf_java_lang_NoSuchFieldError;
  87 utf *utf_java_lang_NoSuchMethodError;
  88 utf *utf_java_lang_OutOfMemoryError;
  89 utf *utf_java_lang_UnsatisfiedLinkError;
  90 utf *utf_java_lang_UnsupportedClassVersionError;
  91 utf *utf_java_lang_VerifyError;
  92 utf *utf_java_lang_VirtualMachineError;
  93
  94 utf *utf_java_lang_Exception;
  95
  96 utf *utf_java_lang_ArithmeticException;
  97 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
  98 utf *utf_java_lang_ArrayStoreException;
  99 utf *utf_java_lang_ClassCastException;
 100 utf *utf_java_lang_ClassNotFoundException;
 101 utf *utf_java_lang_CloneNotSupportedException;
 102 utf *utf_java_lang_IllegalAccessException;
 103 utf *utf_java_lang_IllegalArgumentException;
 104 utf *utf_java_lang_IllegalMonitorStateException;
 105 utf *utf_java_lang_InstantiationException;
 106 utf *utf_java_lang_InterruptedException;
 107 utf *utf_java_lang_NegativeArraySizeException;
 108 utf *utf_java_lang_NullPointerException;
 109 utf *utf_java_lang_StringIndexOutOfBoundsException;
 110
 111 utf *utf_java_lang_reflect_InvocationTargetException;
 112
 113 utf *utf_java_security_PrivilegedActionException;
 114
 115 #if defined(ENABLE_JAVASE)
 116 utf* utf_java_lang_Void;
 117 #endif
 118
 119 utf* utf_java_lang_Boolean;
 120 utf* utf_java_lang_Byte;
 121 utf* utf_java_lang_Character;
 122 utf* utf_java_lang_Short;
 123 utf* utf_java_lang_Integer;
 124 utf* utf_java_lang_Long;
 125 utf* utf_java_lang_Float;
 126 utf* utf_java_lang_Double;
 127
 128 #if defined(ENABLE_JAVASE)
 129 utf *utf_java_lang_StackTraceElement;
 130 utf *utf_java_lang_reflect_Constructor;
 131 utf *utf_java_lang_reflect_Field;
 132 utf *utf_java_lang_reflect_Method;
 133 utf *utf_java_util_Vector;
 134 #endif
 135
 136 utf *utf_InnerClasses;                  /* InnerClasses                       */
 137 utf *utf_ConstantValue;                 /* ConstantValue                      */
 138 utf *utf_Code;                          /* Code                               */
 139 utf *utf_Exceptions;                    /* Exceptions                         */
 140 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 141 utf *utf_SourceFile;                    /* SourceFile                         */
 142
 143 #if defined(ENABLE_JAVASE)
 144 utf *utf_EnclosingMethod;
 145 utf *utf_Signature;
 146 utf *utf_StackMapTable;
 147
 148 #if defined(ENABLE_ANNOTATIONS)
 149 utf *utf_RuntimeVisibleAnnotations;            /* RuntimeVisibleAnnotations            */
 150 utf *utf_RuntimeInvisibleAnnotations;          /* RuntimeInvisibleAnnotations          */
 151 utf *utf_RuntimeVisibleParameterAnnotations;   /* RuntimeVisibleParameterAnnotations   */
 152 utf *utf_RuntimeInvisibleParameterAnnotations; /* RuntimeInvisibleParameterAnnotations */
 153 utf *utf_AnnotationDefault;                    /* AnnotationDefault                    */
 154 #endif
 155 #endif
 156
  157 utf *utf_init;                          /* <init>                             */
  158 utf *utf_clinit;                        /* <clinit>                           */
  159 utf *utf_clone;                         /* clone                              */
  160 utf *utf_finalize;                      /* finalize                           */
  161 utf *utf_main;                          /* main                               */
  162 utf *utf_run;                           /* run                                */
 163
 164 utf *utf_add;
 165 utf *utf_remove;
 166 utf *utf_addThread;
 167 utf *utf_removeThread;
 168 utf *utf_put;
 169 utf *utf_get;
 170 utf *utf_uncaughtException;
 171 utf *utf_value;
 172
 173 utf *utf_fillInStackTrace;
 174 utf *utf_findNative;
 175 utf *utf_getSystemClassLoader;
 176 utf *utf_initCause;
 177 utf *utf_loadClass;
 178 utf *utf_loadClassInternal;
 179 utf *utf_printStackTrace;
 180
 181 utf *utf_division_by_zero;
 182
 183 utf *utf_Z;                             /* Z                                  */
 184 utf *utf_B;                             /* B                                  */
 185 utf *utf_C;                             /* C                                  */
 186 utf *utf_S;                             /* S                                  */
 187 utf *utf_I;                             /* I                                  */
 188 utf *utf_J;                             /* J                                  */
 189 utf *utf_F;                             /* F                                  */
 190 utf *utf_D;                             /* D                                  */
 191
 192 utf *utf_void__void;                    /* ()V                                */
 193 utf *utf_boolean__void;                 /* (Z)V                               */
 194 utf *utf_byte__void;                    /* (B)V                               */
 195 utf *utf_char__void;                    /* (C)V                               */
 196 utf *utf_short__void;                   /* (S)V                               */
 197 utf *utf_int__void;                     /* (I)V                               */
 198 utf *utf_long__void;                    /* (J)V                               */
 199 utf *utf_float__void;                   /* (F)V                               */
 200 utf *utf_double__void;                  /* (D)V                               */
 201
 202 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 203 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 204 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 205 utf *utf_java_lang_ClassLoader_java_lang_String__J;
 206 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 207 utf *utf_java_lang_Object__java_lang_Object;
 208 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 209 utf *utf_java_lang_String__java_lang_Class;
 210 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 211 utf *utf_java_lang_Thread_java_lang_Throwable__V;
 212 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 213 utf *utf_java_lang_Throwable__java_lang_Throwable;
 214
  215 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
  216 utf *utf_null;                          /* null                               */
  217 utf *array_packagename;                 /* "\t<the array package>"            */
 218
 219
  220 /* utf8_init *******************************************************************
 221
 222    Initializes the utf8 subsystem.
 223
 224 *******************************************************************************/
 225
 226 void utf8_init(void)
 227 {
 228         TRACESUBSYSTEMINITIALIZATION("utf8_init");
 229
 230         /* create utf8 hashtable */
 231
 232         hashtable_utf = NEW(hashtable);
 233
 234         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 235
 236 #if defined(ENABLE_STATISTICS)
 237         if (opt_stat)
 238                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 239 #endif
 240
 241         /* create utf-symbols for pointer comparison of frequently used strings */
 242
 243         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 244
 245         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 246         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 247         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 248         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 249         utf_java_lang_String           = utf_new_char("java/lang/String");
 250         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 251
 252         utf_java_lang_ref_SoftReference =
 253                 utf_new_char("java/lang/ref/SoftReference");
 254
 255         utf_java_lang_ref_WeakReference =
 256                 utf_new_char("java/lang/ref/WeakReference");
 257
 258         utf_java_lang_ref_PhantomReference =
 259                 utf_new_char("java/lang/ref/PhantomReference");
 260
 261         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 262
 263         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 264         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 265
 266         utf_java_lang_ClassCircularityError =
 267                 utf_new_char("java/lang/ClassCircularityError");
 268
 269         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 270
 271         utf_java_lang_ExceptionInInitializerError =
 272                 utf_new_char("java/lang/ExceptionInInitializerError");
 273
 274         utf_java_lang_IncompatibleClassChangeError =
 275                 utf_new_char("java/lang/IncompatibleClassChangeError");
 276
 277         utf_java_lang_InstantiationError =
 278                 utf_new_char("java/lang/InstantiationError");
 279
 280         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 281         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 282
 283         utf_java_lang_NoClassDefFoundError =
 284                 utf_new_char("java/lang/NoClassDefFoundError");
 285
 286         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 287
 288         utf_java_lang_UnsatisfiedLinkError =
 289                 utf_new_char("java/lang/UnsatisfiedLinkError");
 290
 291         utf_java_lang_UnsupportedClassVersionError =
 292                 utf_new_char("java/lang/UnsupportedClassVersionError");
 293
 294         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 295
 296         utf_java_lang_VirtualMachineError =
 297                 utf_new_char("java/lang/VirtualMachineError");
 298
 299 #if defined(ENABLE_JAVASE)
 300         utf_java_lang_AbstractMethodError =
 301                 utf_new_char("java/lang/AbstractMethodError");
 302
 303         utf_java_lang_NoSuchFieldError =
 304                 utf_new_char("java/lang/NoSuchFieldError");
 305
 306         utf_java_lang_NoSuchMethodError =
 307                 utf_new_char("java/lang/NoSuchMethodError");
 308 #endif
 309
 310         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 311
 312         utf_java_lang_ArithmeticException =
 313                 utf_new_char("java/lang/ArithmeticException");
 314
 315         utf_java_lang_ArrayIndexOutOfBoundsException =
 316                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 317
 318         utf_java_lang_ArrayStoreException =
 319                 utf_new_char("java/lang/ArrayStoreException");
 320
 321         utf_java_lang_ClassCastException =
 322                 utf_new_char("java/lang/ClassCastException");
 323
 324         utf_java_lang_ClassNotFoundException =
 325                 utf_new_char("java/lang/ClassNotFoundException");
 326
 327         utf_java_lang_CloneNotSupportedException =
 328                 utf_new_char("java/lang/CloneNotSupportedException");
 329
 330         utf_java_lang_IllegalAccessException =
 331                 utf_new_char("java/lang/IllegalAccessException");
 332
 333         utf_java_lang_IllegalArgumentException =
 334                 utf_new_char("java/lang/IllegalArgumentException");
 335
 336         utf_java_lang_IllegalMonitorStateException =
 337                 utf_new_char("java/lang/IllegalMonitorStateException");
 338
 339         utf_java_lang_InstantiationException =
 340                 utf_new_char("java/lang/InstantiationException");
 341
 342         utf_java_lang_InterruptedException =
 343                 utf_new_char("java/lang/InterruptedException");
 344
 345         utf_java_lang_NegativeArraySizeException =
 346                 utf_new_char("java/lang/NegativeArraySizeException");
 347
 348         utf_java_lang_NullPointerException =
 349                 utf_new_char("java/lang/NullPointerException");
 350
 351         utf_java_lang_StringIndexOutOfBoundsException =
 352                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 353
 354         utf_java_lang_reflect_InvocationTargetException =
 355                 utf_new_char("java/lang/reflect/InvocationTargetException");
 356
 357         utf_java_security_PrivilegedActionException =
 358                 utf_new_char("java/security/PrivilegedActionException");
 359
 360 #if defined(ENABLE_JAVASE)
 361         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 362 #endif
 363
 364         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 365         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 366         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 367         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 368         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 369         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 370         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 371         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 372
 373 #if defined(ENABLE_JAVASE)
 374         utf_java_lang_StackTraceElement =
 375                 utf_new_char("java/lang/StackTraceElement");
 376
 377         utf_java_lang_reflect_Constructor =
 378                 utf_new_char("java/lang/reflect/Constructor");
 379
 380         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 381         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 382         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 383 #endif
 384
 385         utf_InnerClasses               = utf_new_char("InnerClasses");
 386         utf_ConstantValue              = utf_new_char("ConstantValue");
 387         utf_Code                       = utf_new_char("Code");
 388         utf_Exceptions                 = utf_new_char("Exceptions");
 389         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 390         utf_SourceFile                 = utf_new_char("SourceFile");
 391
 392 #if defined(ENABLE_JAVASE)
 393         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 394         utf_Signature                  = utf_new_char("Signature");
 395         utf_StackMapTable              = utf_new_char("StackMapTable");
 396
 397 #if defined(ENABLE_ANNOTATIONS)
 398         utf_RuntimeVisibleAnnotations            = utf_new_char("RuntimeVisibleAnnotations");
 399         utf_RuntimeInvisibleAnnotations          = utf_new_char("RuntimeInvisibleAnnotations");
 400         utf_RuntimeVisibleParameterAnnotations   = utf_new_char("RuntimeVisibleParameterAnnotations");
 401         utf_RuntimeInvisibleParameterAnnotations = utf_new_char("RuntimeInvisibleParameterAnnotations");
 402         utf_AnnotationDefault                    = utf_new_char("AnnotationDefault");
 403 #endif
 404 #endif
 405
 406         utf_init                           = utf_new_char("<init>");
 407         utf_clinit                         = utf_new_char("<clinit>");
 408         utf_clone                      = utf_new_char("clone");
 409         utf_finalize                   = utf_new_char("finalize");
 410         utf_main                       = utf_new_char("main");
 411         utf_run                        = utf_new_char("run");
 412
 413         utf_add                        = utf_new_char("add");
 414         utf_remove                     = utf_new_char("remove");
 415         utf_addThread                  = utf_new_char("addThread");
 416         utf_removeThread               = utf_new_char("removeThread");
 417         utf_put                        = utf_new_char("put");
 418         utf_get                        = utf_new_char("get");
 419         utf_uncaughtException          = utf_new_char("uncaughtException");
 420         utf_value                      = utf_new_char("value");
 421
 422         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 423         utf_findNative                 = utf_new_char("findNative");
 424         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 425         utf_initCause                  = utf_new_char("initCause");
 426         utf_loadClass                  = utf_new_char("loadClass");
 427         utf_loadClassInternal          = utf_new_char("loadClassInternal");
 428         utf_printStackTrace            = utf_new_char("printStackTrace");
 429
 430         utf_division_by_zero           = utf_new_char("/ by zero");
 431
 432         utf_Z                          = utf_new_char("Z");
 433         utf_B                          = utf_new_char("B");
 434         utf_C                          = utf_new_char("C");
 435         utf_S                          = utf_new_char("S");
 436         utf_I                          = utf_new_char("I");
 437         utf_J                          = utf_new_char("J");
 438         utf_F                          = utf_new_char("F");
 439         utf_D                          = utf_new_char("D");
 440
 441         utf_void__void                 = utf_new_char("()V");
 442         utf_boolean__void              = utf_new_char("(Z)V");
 443         utf_byte__void                 = utf_new_char("(B)V");
 444         utf_char__void                 = utf_new_char("(C)V");
 445         utf_short__void                = utf_new_char("(S)V");
 446         utf_int__void                  = utf_new_char("(I)V");
 447         utf_long__void                 = utf_new_char("(J)V");
 448         utf_float__void                = utf_new_char("(F)V");
 449         utf_double__void               = utf_new_char("(D)V");
 450         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 451         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 452
 453         utf_void__java_lang_ClassLoader =
 454                 utf_new_char("()Ljava/lang/ClassLoader;");
 455
 456         utf_java_lang_ClassLoader_java_lang_String__J =
 457                 utf_new_char("(Ljava/lang/ClassLoader;Ljava/lang/String;)J");
 458
 459         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 460
 461         utf_java_lang_Object__java_lang_Object =
 462                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 463
 464         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 465
 466         utf_java_lang_String__java_lang_Class =
 467                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 468
 469         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 470
 471         utf_java_lang_Thread_java_lang_Throwable__V =
 472                 utf_new_char("(Ljava/lang/Thread;Ljava/lang/Throwable;)V");
 473
 474         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 475
 476         utf_java_lang_Throwable__java_lang_Throwable =
 477                 utf_new_char("(Ljava/lang/Throwable;)Ljava/lang/Throwable;");
 478
 479         utf_null                       = utf_new_char("null");
 480         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 481         array_packagename              = utf_new_char("\t<the array package>");
 482 }
 483
 484
 485 /* utf_hashkey *****************************************************************
 486
  487    The hashkey is computed from the utf-text by using at most 9
  488    characters.  For utf-symbols longer than 15 characters, 3 characters
  489    are taken from the beginning and 3 from the end, and 2 characters
  490    are taken from the middle.
 491
 492 *******************************************************************************/
 493
 494 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 495 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 496
 497 u4 utf_hashkey(const char *text, u4 length)
 498 {
 499         const char *start_pos = text;       /* pointer to utf text                */
 500         u4 a;
 501
 502         switch (length) {
 503         case 0: /* empty string */
 504                 return 0;
 505
 506         case 1: return fbs(0);
 507         case 2: return fbs(0) ^ nbs(3);
 508         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 509         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 510         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 511         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 512         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 513         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 514
 515         case 9:
 516                 a = fbs(0);
 517                 a ^= nbs(1);
 518                 a ^= nbs(2);
 519                 text++;
 520                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 521
 522         case 10:
 523                 a = fbs(0);
 524                 text++;
 525                 a ^= nbs(2);
 526                 a ^= nbs(3);
 527                 a ^= nbs(4);
 528                 text++;
 529                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 530
 531         case 11:
 532                 a = fbs(0);
 533                 text++;
 534                 a ^= nbs(2);
 535                 a ^= nbs(3);
 536                 a ^= nbs(4);
 537                 text++;
 538                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 539
 540         case 12:
 541                 a = fbs(0);
 542                 text += 2;
 543                 a ^= nbs(2);
 544                 a ^= nbs(3);
 545                 text++;
 546                 a ^= nbs(5);
 547                 a ^= nbs(6);
 548                 a ^= nbs(7);
 549                 text++;
 550                 return a ^ nbs(9) ^ nbs(10);
 551
 552         case 13:
 553                 a = fbs(0);
 554                 a ^= nbs(1);
 555                 text++;
 556                 a ^= nbs(3);
 557                 a ^= nbs(4);
 558                 text += 2;
 559                 a ^= nbs(7);
 560                 a ^= nbs(8);
 561                 text += 2;
 562                 return a ^ nbs(9) ^ nbs(10);
 563
 564         case 14:
 565                 a = fbs(0);
 566                 text += 2;
 567                 a ^= nbs(3);
 568                 a ^= nbs(4);
 569                 text += 2;
 570                 a ^= nbs(7);
 571                 a ^= nbs(8);
 572                 text += 2;
 573                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 574
 575         case 15:
 576                 a = fbs(0);
 577                 text += 2;
 578                 a ^= nbs(3);
 579                 a ^= nbs(4);
 580                 text += 2;
 581                 a ^= nbs(7);
 582                 a ^= nbs(8);
 583                 text += 2;
 584                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 585
 586         default:  /* 3 characters from beginning */
 587                 a = fbs(0);
 588                 text += 2;
 589                 a ^= nbs(3);
 590                 a ^= nbs(4);
 591
 592                 /* 2 characters from middle */
 593                 text = start_pos + (length / 2);
 594                 a ^= fbs(5);
 595                 text += 2;
 596                 a ^= nbs(6);
 597
 598                 /* 3 characters from end */
 599                 text = start_pos + length - 4;
 600
 601                 a ^= fbs(7);
 602                 text++;
 603
 604                 return a ^ nbs(10) ^ nbs(11);
 605     }
 606 }
 607
 608 /* utf_full_hashkey ************************************************************
 609
 610    This function computes a hash value using all bytes in the string.
 611
 612    The algorithm is the "One-at-a-time" algorithm as published
 613    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 614
 615 *******************************************************************************/
 616
 617 u4 utf_full_hashkey(const char *text, u4 length)
 618 {
 619         register const unsigned char *p = (const unsigned char *) text;
 620         register u4 hash;
 621         register u4 i;
 622
 623         hash = 0;
 624         for (i=length; i--;)
 625         {
 626             hash += *p++;
 627             hash += (hash << 10);
 628             hash ^= (hash >> 6);
 629         }
 630         hash += (hash << 3);
 631         hash ^= (hash >> 11);
 632         hash += (hash << 15);
 633
 634         return hash;
 635 }
 636
 637 /* unicode_hashkey *************************************************************
 638
 639    Compute the hashkey of a unicode string.
 640
 641 *******************************************************************************/
 642
 643 u4 unicode_hashkey(u2 *text, u2 len)
 644 {
 645         return utf_hashkey((char *) text, len);
 646 }
 647
 648
 649 /* utf_new *********************************************************************
 650
 651    Creates a new utf-symbol, the text of the symbol is passed as a
 652    u1-array. The function searches the utf-hashtable for a utf-symbol
  653    with this text. On success the element is returned, otherwise a new
 654    hashtable element is created.
 655
 656    If the number of entries in the hashtable exceeds twice the size of
 657    the hashtable slots a reorganization of the hashtable is done and
 658    the utf symbols are copied to a new hashtable with doubled size.
 659
 660 *******************************************************************************/
 661
 662 utf *utf_new(const char *text, u2 length)
 663 {
 664         u4 key;                             /* hashkey computed from utf-text     */
 665         u4 slot;                            /* slot in hashtable                  */
 666         utf *u;                             /* hashtable element                  */
 667         u2 i;
 668
 669         LOCK_MONITOR_ENTER(hashtable_utf->header);
 670
 671 #if defined(ENABLE_STATISTICS)
 672         if (opt_stat)
 673                 count_utf_new++;
 674 #endif
 675
 676         key  = utf_hashkey(text, length);
 677         slot = key & (hashtable_utf->size - 1);
 678         u    = hashtable_utf->ptr[slot];
 679
 680         /* search external hash chain for utf-symbol */
 681
 682         while (u) {
 683                 if (u->blength == length) {
 684                         /* compare text of hashtable elements */
 685
 686                         for (i = 0; i < length; i++)
 687                                 if (text[i] != u->text[i])
 688                                         goto nomatch;
 689
 690 #if defined(ENABLE_STATISTICS)
 691                         if (opt_stat)
 692                                 count_utf_new_found++;
 693 #endif
 694
 695                         /* symbol found in hashtable */
 696
 697                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 698
 699                         return u;
 700                 }
 701
 702         nomatch:
 703                 u = u->hashlink; /* next element in external chain */
 704         }
 705
 706         /* location in hashtable found, create new utf element */
 707
 708         u = NEW(utf);
 709
 710         u->blength  = length;               /* length in bytes of utfstring       */
 711         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 712         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 713
 714         memcpy(u->text, text, length);      /* copy utf-text                      */
 715         u->text[length] = '\0';
 716
 717 #if defined(ENABLE_STATISTICS)
 718         if (opt_stat)
 719                 count_utf_len += sizeof(utf) + length + 1;
 720 #endif
 721
 722         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 723         hashtable_utf->entries++;           /* update number of entries           */
 724
 725         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 726
 727         /* reorganization of hashtable, average length of the external
 728            chains is approx. 2 */
 729
 730                 hashtable *newhash;                              /* the new hashtable */
 731                 u4         i;
 732                 utf       *u;
 733                 utf       *nextu;
 734                 u4         slot;
 735
 736                 /* create new hashtable, double the size */
 737
 738                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 739
 740 #if defined(ENABLE_STATISTICS)
 741                 if (opt_stat)
 742                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 743 #endif
 744
 745                 /* transfer elements to new hashtable */
 746
 747                 for (i = 0; i < hashtable_utf->size; i++) {
 748                         u = hashtable_utf->ptr[i];
 749
 750                         while (u) {
 751                                 nextu = u->hashlink;
 752                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 753
 754                                 u->hashlink = (utf *) newhash->ptr[slot];
 755                                 newhash->ptr[slot] = u;
 756
 757                                 /* follow link in external hash chain */
 758
 759                                 u = nextu;
 760                         }
 761                 }
 762
 763                 /* dispose old table */
 764
 765                 hashtable_free(hashtable_utf);
 766
 767                 hashtable_utf = newhash;
 768         }
 769
 770         LOCK_MONITOR_EXIT(hashtable_utf->header);
 771
 772         return u;
 773 }
 774
 775
 776 /* utf_new_u2 ******************************************************************
 777
 778    Make utf symbol from u2 array, if isclassname is true '.' is
 779    replaced by '/'.
 780
 781 *******************************************************************************/
 782
 783 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 784 {
 785         char *buffer;                   /* memory buffer for  unicode characters  */
 786         char *pos;                      /* pointer to current position in buffer  */
 787         u4 left;                        /* unicode characters left                */
 788         u4 buflength;                   /* utf length in bytes of the u2 array    */
 789         utf *result;                    /* resulting utf-string                   */
 790         int i;
 791
 792         /* determine utf length in bytes and allocate memory */
 793
 794         buflength = u2_utflength(unicode_pos, unicode_length);
 795         buffer    = MNEW(char, buflength);
 796
 797         left = buflength;
 798         pos  = buffer;
 799
 800         for (i = 0; i++ < unicode_length; unicode_pos++) {
 801                 /* next unicode character */
 802                 u2 c = *unicode_pos;
 803
 804                 if ((c != 0) && (c < 0x80)) {
 805                         /* 1 character */
 806                         left--;
 807                 if ((int) left < 0) break;
 808                         /* convert classname */
 809                         if (isclassname && c == '.')
 810                                 *pos++ = '/';
 811                         else
 812                                 *pos++ = (char) c;
 813
 814                 } else if (c < 0x800) {
 815                         /* 2 characters */
 816                 unsigned char high = c >> 6;
 817                 unsigned char low  = c & 0x3F;
 818                         left = left - 2;
 819                 if ((int) left < 0) break;
 820                 *pos++ = high | 0xC0;
 821                 *pos++ = low  | 0x80;
 822
 823                 } else {
 824                 /* 3 characters */
 825                 char low  = c & 0x3f;
 826                 char mid  = (c >> 6) & 0x3F;
 827                 char high = c >> 12;
 828                         left = left - 3;
 829                 if ((int) left < 0) break;
 830                 *pos++ = high | 0xE0;
 831                 *pos++ = mid  | 0x80;
 832                 *pos++ = low  | 0x80;
 833                 }
 834         }
 835
 836         /* insert utf-string into symbol-table */
 837         result = utf_new(buffer,buflength);
 838
 839         MFREE(buffer, char, buflength);
 840
 841         return result;
 842 }
 843
 844
 845 /* utf_new_char ****************************************************************
 846
 847    Creates a new utf symbol, the text for this symbol is passed as a
 848    c-string ( = char* ).
 849
 850 *******************************************************************************/
 851
 852 utf *utf_new_char(const char *text)
 853 {
 854         return utf_new(text, strlen(text));
 855 }
 856
 857
 858 /* utf_new_char_classname ******************************************************
 859
 860    Creates a new utf symbol, the text for this symbol is passed as a
 861    c-string ( = char* ) "." characters are going to be replaced by
 862    "/". Since the above function is used often, this is a separte
 863    function, instead of an if.
 864
 865 *******************************************************************************/
 866
 867 utf *utf_new_char_classname(const char *text)
 868 {
 869         if (strchr(text, '.')) {
 870                 char *txt = strdup(text);
 871                 char *end = txt + strlen(txt);
 872                 char *c;
 873                 utf *tmpRes;
 874
 875                 for (c = txt; c < end; c++)
 876                         if (*c == '.') *c = '/';
 877
 878                 tmpRes = utf_new(txt, strlen(txt));
 879                 FREE(txt, 0);
 880
 881                 return tmpRes;
 882
 883         } else
 884                 return utf_new(text, strlen(text));
 885 }
 886
 887
 888 /* utf_nextu2 ******************************************************************
 889
 890    Read the next unicode character from the utf string and increment
 891    the utf-string pointer accordingly.
 892
 893    CAUTION: This function is unsafe for input that was not checked
 894             by is_valid_utf!
 895
 896 *******************************************************************************/
 897
 898 u2 utf_nextu2(char **utf_ptr)
 899 {
 900     /* uncompressed unicode character */
 901     u2 unicode_char = 0;
 902     /* current position in utf text */
 903     unsigned char *utf = (unsigned char *) (*utf_ptr);
 904     /* bytes representing the unicode character */
 905     unsigned char ch1, ch2, ch3;
 906     /* number of bytes used to represent the unicode character */
 907     int len = 0;
 908
 909     switch ((ch1 = utf[0]) >> 4) {
 910         default: /* 1 byte */
 911                 (*utf_ptr)++;
 912                 return (u2) ch1;
 913         case 0xC:
 914         case 0xD: /* 2 bytes */
 915                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 916                         unsigned char high = ch1 & 0x1F;
 917                         unsigned char low  = ch2 & 0x3F;
 918                         unicode_char = (high << 6) + low;
 919                         len = 2;
 920                 }
 921                 break;
 922
 923         case 0xE: /* 2 or 3 bytes */
 924                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 925                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 926                                 unsigned char low  = ch3 & 0x3f;
 927                                 unsigned char mid  = ch2 & 0x3f;
 928                                 unsigned char high = ch1 & 0x0f;
 929                                 unicode_char = (((high << 6) + mid) << 6) + low;
 930                                 len = 3;
 931                         } else
 932                                 len = 2;
 933                 }
 934                 break;
 935     }
 936
 937     /* update position in utf-text */
 938     *utf_ptr = (char *) (utf + len);
 939
 940     return unicode_char;
 941 }
 942
 943
 944 /* utf_bytes *******************************************************************
 945
 946    Determine number of bytes (aka. octets) in the utf string.
 947
 948    IN:
 949       u............utf string
 950
 951    OUT:
 952       The number of octets of this utf string.
 953           There is _no_ terminating zero included in this count.
 954
 955 *******************************************************************************/
 956
 957 u4 utf_bytes(utf *u)
 958 {
 959         return u->blength;
 960 }
 961
 962
 963 /* utf_get_number_of_u2s_for_buffer ********************************************
 964
 965    Determine number of UTF-16 u2s in the given UTF-8 buffer
 966
 967    CAUTION: This function is unsafe for input that was not checked
 968             by is_valid_utf!
 969
 970    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 971    to an array of u2s (UTF-16) and want to know how many of them you will get.
 972    All other uses of this function are probably wrong.
 973
 974    IN:
 975       buffer........points to first char in buffer
 976           blength.......number of _bytes_ in the buffer
 977
 978    OUT:
 979       the number of u2s needed to hold this string in UTF-16 encoding.
 980           There is _no_ terminating zero included in this count.
 981
 982    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 983    exception.
 984
 985 *******************************************************************************/
 986
 987 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 988 {
 989         const char *endpos;                 /* points behind utf string           */
 990         const char *utf_ptr;                /* current position in utf text       */
 991         u4 len = 0;                         /* number of unicode characters       */
 992
 993         utf_ptr = buffer;
 994         endpos = utf_ptr + blength;
 995
 996         while (utf_ptr < endpos) {
 997                 len++;
 998                 /* next unicode character */
 999                 utf_nextu2((char **)&utf_ptr);
1000         }
1001
1002         assert(utf_ptr == endpos);
1003
1004         return len;
1005 }
1006
1007
1008 /* utf_get_number_of_u2s *******************************************************
1009
1010    Determine number of UTF-16 u2s in the utf string.
1011
1012    CAUTION: This function is unsafe for input that was not checked
1013             by is_valid_utf!
1014
1015    CAUTION: Use this function *only* when you want to convert a utf string
1016    to an array of u2s and want to know how many of them you will get.
1017    All other uses of this function are probably wrong.
1018
1019    IN:
1020       u............utf string
1021
1022    OUT:
1023       the number of u2s needed to hold this string in UTF-16 encoding.
1024           There is _no_ terminating zero included in this count.
1025           XXX 0 if a NullPointerException has been thrown (see below)
1026
1027 *******************************************************************************/
1028
1029 u4 utf_get_number_of_u2s(utf *u)
1030 {
1031         char *endpos;                       /* points behind utf string           */
1032         char *utf_ptr;                      /* current position in utf text       */
1033         u4 len = 0;                         /* number of unicode characters       */
1034
1035         /* XXX this is probably not checked by most callers! Review this after */
1036         /* the invalid uses of this function have been eliminated */
1037         if (u == NULL) {
1038                 exceptions_throw_nullpointerexception();
1039                 return 0;
1040         }
1041
1042         endpos = UTF_END(u);
1043         utf_ptr = u->text;
1044
1045         while (utf_ptr < endpos) {
1046                 len++;
1047                 /* next unicode character */
1048                 utf_nextu2(&utf_ptr);
1049         }
1050
1051         if (utf_ptr != endpos) {
1052                 /* string ended abruptly */
1053                 exceptions_throw_internalerror("Illegal utf8 string");
1054                 return 0;
1055         }
1056
1057         return len;
1058 }
1059
1060
1061 /* utf8_safe_number_of_u2s *****************************************************
1062
1063    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1064    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1065
1066    This function is safe even for invalid UTF-8 strings.
1067
1068    IN:
1069       text..........zero-terminated(!) UTF-8 string (may be invalid)
1070                         must NOT be NULL
1071           nbytes........strlen(text). (This is needed to completely emulate
1072                         the RI).
1073
1074    OUT:
1075       the number of u2s needed to hold this string in UTF-16 encoding.
1076           There is _no_ terminating zero included in this count.
1077
1078 *******************************************************************************/
1079
1080 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1081         register const unsigned char *t;
1082         register s4 byte;
1083         register s4 len;
1084         register const unsigned char *tlimit;
1085         s4 byte1;
1086         s4 byte2;
1087         s4 byte3;
1088         s4 value;
1089         s4 skip;
1090
1091         assert(text);
1092         assert(nbytes >= 0);
1093
1094         len = 0;
1095         t = (const unsigned char *) text;
1096         tlimit = t + nbytes;
1097
1098         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1099
1100         while (1) {
1101                 byte = *t++;
1102
1103                 if (byte & 0x80) {
1104                         /* highest bit set, non-ASCII character */
1105
1106                         if ((byte & 0xe0) == 0xc0) {
1107                                 /* 2-byte: should be 110..... 10...... ? */
1108
1109                                 if ((*t++ & 0xc0) == 0x80)
1110                                         ; /* valid 2-byte */
1111                                 else
1112                                         t--; /* invalid */
1113                         }
1114                         else if ((byte & 0xf0) == 0xe0) {
1115                                 /* 3-byte: should be 1110.... 10...... 10...... */
1116                                 /*                            ^t                */
1117
1118                                 if (t + 2 > tlimit)
1119                                         return len + 1; /* invalid, stop here */
1120
1121                                 if ((*t++ & 0xc0) == 0x80) {
1122                                         if ((*t++ & 0xc0) == 0x80)
1123                                                 ; /* valid 3-byte */
1124                                         else
1125                                                 t--; /* invalid */
1126                                 }
1127                                 else
1128                                         t--; /* invalid */
1129                         }
1130                         else if ((byte & 0xf8) == 0xf0) {
1131                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1132                                 /*                            ^t                         */
1133
1134                                 if (t + 3 > tlimit)
1135                                         return len + 1; /* invalid, stop here */
1136
1137                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1138                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1139                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1140                                                         /* valid 4-byte UTF-8? */
1141                                                         value = ((byte  & 0x07) << 18)
1142                                                                   | ((byte1 & 0x3f) << 12)
1143                                                                   | ((byte2 & 0x3f) <<  6)
1144                                                                   | ((byte3 & 0x3f)      );
1145
1146                                                         if (value > 0x10FFFF)
1147                                                                 ; /* invalid */
1148                                                         else if (value > 0xFFFF)
1149                                                                 len += 1; /* we need surrogates */
1150                                                         else
1151                                                                 ; /* 16bit suffice */
1152                                                 }
1153                                                 else
1154                                                         t--; /* invalid */
1155                                         }
1156                                         else
1157                                                 t--; /* invalid */
1158                                 }
1159                                 else
1160                                         t--; /* invalid */
1161                         }
1162                         else if ((byte & 0xfc) == 0xf8) {
1163                                 /* invalid 5-byte */
1164                                 if (t + 4 > tlimit)
1165                                         return len + 1; /* invalid, stop here */
1166
1167                                 skip = 4;
1168                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1169                                         t++;
1170                         }
1171                         else if ((byte & 0xfe) == 0xfc) {
1172                                 /* invalid 6-byte */
1173                                 if (t + 5 > tlimit)
1174                                         return len + 1; /* invalid, stop here */
1175
1176                                 skip = 5;
1177                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1178                                         t++;
1179                         }
1180                         else
1181                                 ; /* invalid */
1182                 }
1183                 else {
1184                         /* NUL */
1185
1186                         if (byte == 0)
1187                                 break;
1188
1189                         /* ASCII character, common case */
1190                 }
1191
1192                 len++;
1193         }
1194
1195         return len;
1196 }
1197
1198
1199 /* utf8_safe_convert_to_u2s ****************************************************
1200
1201    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1202    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1203    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1204
1205    This function is safe even for invalid UTF-8 strings.
1206
1207    IN:
1208       text..........zero-terminated(!) UTF-8 string (may be invalid)
1209                         must NOT be NULL
1210           nbytes........strlen(text). (This is needed to completely emulate
1211                                         the RI).
1212           buffer........a preallocated array of u2s to receive the decoded
1213                         string. Use utf8_safe_number_of_u2s to get the
1214                                         required number of u2s for allocating this.
1215
1216 *******************************************************************************/
1217
1218 #define UNICODE_REPLACEMENT  0xfffd
1219
1220 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1221         register const unsigned char *t;
1222         register s4 byte;
1223         register const unsigned char *tlimit;
1224         s4 byte1;
1225         s4 byte2;
1226         s4 byte3;
1227         s4 value;
1228         s4 skip;
1229
1230         assert(text);
1231         assert(nbytes >= 0);
1232
1233         t = (const unsigned char *) text;
1234         tlimit = t + nbytes;
1235
1236         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1237
1238         while (1) {
1239                 byte = *t++;
1240
1241                 if (byte & 0x80) {
1242                         /* highest bit set, non-ASCII character */
1243
1244                         if ((byte & 0xe0) == 0xc0) {
1245                                 /* 2-byte: should be 110..... 10...... */
1246
1247                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1248                                         /* valid 2-byte UTF-8 */
1249                                         *buffer++ = ((byte  & 0x1f) << 6)
1250                                                           | ((byte1 & 0x3f)     );
1251                                 }
1252                                 else {
1253                                         *buffer++ = UNICODE_REPLACEMENT;
1254                                         t--;
1255                                 }
1256                         }
1257                         else if ((byte & 0xf0) == 0xe0) {
1258                                 /* 3-byte: should be 1110.... 10...... 10...... */
1259
1260                                 if (t + 2 > tlimit) {
1261                                         *buffer++ = UNICODE_REPLACEMENT;
1262                                         return;
1263                                 }
1264
1265                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1266                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1267                                                 /* valid 3-byte UTF-8 */
1268                                                 *buffer++ = ((byte  & 0x0f) << 12)
1269                                                                   | ((byte1 & 0x3f) <<  6)
1270                                                                   | ((byte2 & 0x3f)      );
1271                                         }
1272                                         else {
1273                                                 *buffer++ = UNICODE_REPLACEMENT;
1274                                                 t--;
1275                                         }
1276                                 }
1277                                 else {
1278                                         *buffer++ = UNICODE_REPLACEMENT;
1279                                         t--;
1280                                 }
1281                         }
1282                         else if ((byte & 0xf8) == 0xf0) {
1283                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1284
1285                                 if (t + 3 > tlimit) {
1286                                         *buffer++ = UNICODE_REPLACEMENT;
1287                                         return;
1288                                 }
1289
1290                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1291                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1292                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1293                                                         /* valid 4-byte UTF-8? */
1294                                                         value = ((byte  & 0x07) << 18)
1295                                                                   | ((byte1 & 0x3f) << 12)
1296                                                                   | ((byte2 & 0x3f) <<  6)
1297                                                                   | ((byte3 & 0x3f)      );
1298
1299                                                         if (value > 0x10FFFF) {
1300                                                                 *buffer++ = UNICODE_REPLACEMENT;
1301                                                         }
1302                                                         else if (value > 0xFFFF) {
1303                                                                 /* we need surrogates */
1304                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1305                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1306                                                         }
1307                                                         else
1308                                                                 *buffer++ = value; /* 16bit suffice */
1309                                                 }
1310                                                 else {
1311                                                         *buffer++ = UNICODE_REPLACEMENT;
1312                                                         t--;
1313                                                 }
1314                                         }
1315                                         else {
1316                                                 *buffer++ = UNICODE_REPLACEMENT;
1317                                                 t--;
1318                                         }
1319                                 }
1320                                 else {
1321                                         *buffer++ = UNICODE_REPLACEMENT;
1322                                         t--;
1323                                 }
1324                         }
1325                         else if ((byte & 0xfc) == 0xf8) {
1326                                 if (t + 4 > tlimit) {
1327                                         *buffer++ = UNICODE_REPLACEMENT;
1328                                         return;
1329                                 }
1330
1331                                 skip = 4;
1332                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1333                                         t++;
1334                                 *buffer++ = UNICODE_REPLACEMENT;
1335                         }
1336                         else if ((byte & 0xfe) == 0xfc) {
1337                                 if (t + 5 > tlimit) {
1338                                         *buffer++ = UNICODE_REPLACEMENT;
1339                                         return;
1340                                 }
1341
1342                                 skip = 5;
1343                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1344                                         t++;
1345                                 *buffer++ = UNICODE_REPLACEMENT;
1346                         }
1347                         else
1348                                 *buffer++ = UNICODE_REPLACEMENT;
1349                 }
1350                 else {
1351                         /* NUL */
1352
1353                         if (byte == 0)
1354                                 break;
1355
1356                         /* ASCII character, common case */
1357
1358                         *buffer++ = byte;
1359                 }
1360         }
1361 }
1362
1363
1364 /* u2_utflength ****************************************************************
1365
1366    Returns the utf length in bytes of a u2 array.
1367
1368 *******************************************************************************/
1369
1370 u4 u2_utflength(u2 *text, u4 u2_length)
1371 {
1372         u4 result_len = 0;                  /* utf length in bytes                */
1373         u2 ch;                              /* current unicode character          */
1374         u4 len;
1375
1376         for (len = 0; len < u2_length; len++) {
1377                 /* next unicode character */
1378                 ch = *text++;
1379
1380                 /* determine bytes required to store unicode character as utf */
1381                 if (ch && (ch < 0x80))
1382                         result_len++;
1383                 else if (ch < 0x800)
1384                         result_len += 2;
1385                 else
1386                         result_len += 3;
1387         }
1388
1389     return result_len;
1390 }
1391
1392
1393 /* utf_copy ********************************************************************
1394
1395    Copy the given utf string byte-for-byte to a buffer.
1396
1397    IN:
1398       buffer.......the buffer
1399           u............the utf string
1400
1401 *******************************************************************************/
1402
1403 void utf_copy(char *buffer, utf *u)
1404 {
1405         /* our utf strings are zero-terminated (done by utf_new) */
1406         MCOPY(buffer, u->text, char, u->blength + 1);
1407 }
1408
1409
1410 /* utf_cat *********************************************************************
1411
1412    Append the given utf string byte-for-byte to a buffer.
1413
1414    IN:
1415       buffer.......the buffer
1416           u............the utf string
1417
1418 *******************************************************************************/
1419
1420 void utf_cat(char *buffer, utf *u)
1421 {
1422         /* our utf strings are zero-terminated (done by utf_new) */
1423         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1424 }
1425
1426
1427 /* utf_copy_classname **********************************************************
1428
1429    Copy the given utf classname byte-for-byte to a buffer.
1430    '/' is replaced by '.'
1431
1432    IN:
1433       buffer.......the buffer
1434           u............the utf string
1435
1436 *******************************************************************************/
1437
1438 void utf_copy_classname(char *buffer, utf *u)
1439 {
1440         char *bufptr;
1441         char *srcptr;
1442         char *endptr;
1443         char ch;
1444
1445         bufptr = buffer;
1446         srcptr = u->text;
1447         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1448
1449         while (srcptr != endptr) {
1450                 ch = *srcptr++;
1451                 if (ch == '/')
1452                         ch = '.';
1453                 *bufptr++ = ch;
1454         }
1455 }
1456
1457
1458 /* utf_cat *********************************************************************
1459
1460    Append the given utf classname byte-for-byte to a buffer.
1461    '/' is replaced by '.'
1462
1463    IN:
1464       buffer.......the buffer
1465           u............the utf string
1466
1467 *******************************************************************************/
1468
1469 void utf_cat_classname(char *buffer, utf *u)
1470 {
1471         utf_copy_classname(buffer + strlen(buffer), u);
1472 }
1473
1474 /* utf_display_printable_ascii *************************************************
1475
1476    Write utf symbol to stdout (for debugging purposes).
1477    Non-printable and non-ASCII characters are printed as '?'.
1478
1479 *******************************************************************************/
1480
1481 void utf_display_printable_ascii(utf *u)
1482 {
1483         char *endpos;                       /* points behind utf string           */
1484         char *utf_ptr;                      /* current position in utf text       */
1485
1486         if (u == NULL) {
1487                 printf("NULL");
1488                 fflush(stdout);
1489                 return;
1490         }
1491
1492         endpos = UTF_END(u);
1493         utf_ptr = u->text;
1494
1495         while (utf_ptr < endpos) {
1496                 /* read next unicode character */
1497
1498                 u2 c = utf_nextu2(&utf_ptr);
1499
1500                 if ((c >= 32) && (c <= 127))
1501                         printf("%c", c);
1502                 else
1503                         printf("?");
1504         }
1505
1506         fflush(stdout);
1507 }
1508
1509
1510 /* utf_display_printable_ascii_classname ***************************************
1511
1512    Write utf symbol to stdout with `/' converted to `.' (for debugging
1513    purposes).
1514    Non-printable and non-ASCII characters are printed as '?'.
1515
1516 *******************************************************************************/
1517
1518 void utf_display_printable_ascii_classname(utf *u)
1519 {
1520         char *endpos;                       /* points behind utf string           */
1521         char *utf_ptr;                      /* current position in utf text       */
1522
1523         if (u == NULL) {
1524                 printf("NULL");
1525                 fflush(stdout);
1526                 return;
1527         }
1528
1529         endpos = UTF_END(u);
1530         utf_ptr = u->text;
1531
1532         while (utf_ptr < endpos) {
1533                 /* read next unicode character */
1534
1535                 u2 c = utf_nextu2(&utf_ptr);
1536
1537                 if (c == '/')
1538                         c = '.';
1539
1540                 if ((c >= 32) && (c <= 127))
1541                         printf("%c", c);
1542                 else
1543                         printf("?");
1544         }
1545
1546         fflush(stdout);
1547 }
1548
1549
1550 /* utf_sprint_convert_to_latin1 ************************************************
1551
1552    Write utf symbol into c-string (for debugging purposes).
1553    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1554    invalid results.
1555
1556 *******************************************************************************/
1557
1558 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1559 {
1560         char *endpos;                       /* points behind utf string           */
1561         char *utf_ptr;                      /* current position in utf text       */
1562         u2 pos = 0;                         /* position in c-string               */
1563
1564         if (!u) {
1565                 strcpy(buffer, "NULL");
1566                 return;
1567         }
1568
1569         endpos = UTF_END(u);
1570         utf_ptr = u->text;
1571
1572         while (utf_ptr < endpos)
1573                 /* copy next unicode character */
1574                 buffer[pos++] = utf_nextu2(&utf_ptr);
1575
1576         /* terminate string */
1577         buffer[pos] = '\0';
1578 }
1579
1580
1581 /* utf_sprint_convert_to_latin1_classname **************************************
1582
1583    Write utf symbol into c-string with `/' converted to `.' (for debugging
1584    purposes).
1585    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1586    invalid results.
1587
1588 *******************************************************************************/
1589
1590 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1591 {
1592         char *endpos;                       /* points behind utf string           */
1593         char *utf_ptr;                      /* current position in utf text       */
1594         u2 pos = 0;                         /* position in c-string               */
1595
1596         if (!u) {
1597                 strcpy(buffer, "NULL");
1598                 return;
1599         }
1600
1601         endpos = UTF_END(u);
1602         utf_ptr = u->text;
1603
1604         while (utf_ptr < endpos) {
1605                 /* copy next unicode character */
1606                 u2 c = utf_nextu2(&utf_ptr);
1607                 if (c == '/') c = '.';
1608                 buffer[pos++] = c;
1609         }
1610
1611         /* terminate string */
1612         buffer[pos] = '\0';
1613 }
1614
1615
1616 /* utf_strcat_convert_to_latin1 ************************************************
1617
1618    Like libc strcat, but uses an utf8 string.
1619    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1620    invalid results.
1621
1622 *******************************************************************************/
1623
1624 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1625 {
1626         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1627 }
1628
1629
1630 /* utf_strcat_convert_to_latin1_classname **************************************
1631
1632    Like libc strcat, but uses an utf8 string.
1633    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1634    invalid results.
1635
1636 *******************************************************************************/
1637
1638 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1639 {
1640         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1641 }
1642
1643
1644 /* utf_fprint_printable_ascii **************************************************
1645
1646    Write utf symbol into file.
1647    Non-printable and non-ASCII characters are printed as '?'.
1648
1649 *******************************************************************************/
1650
1651 void utf_fprint_printable_ascii(FILE *file, utf *u)
1652 {
1653         char *endpos;                       /* points behind utf string           */
1654         char *utf_ptr;                      /* current position in utf text       */
1655
1656         if (!u)
1657                 return;
1658
1659         endpos = UTF_END(u);
1660         utf_ptr = u->text;
1661
1662         while (utf_ptr < endpos) {
1663                 /* read next unicode character */
1664                 u2 c = utf_nextu2(&utf_ptr);
1665
1666                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1667                 else fprintf(file, "?");
1668         }
1669 }
1670
1671
1672 /* utf_fprint_printable_ascii_classname ****************************************
1673
1674    Write utf symbol into file with `/' converted to `.'.
1675    Non-printable and non-ASCII characters are printed as '?'.
1676
1677 *******************************************************************************/
1678
1679 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1680 {
1681         char *endpos;                       /* points behind utf string           */
1682         char *utf_ptr;                      /* current position in utf text       */
1683
1684     if (!u)
1685                 return;
1686
1687         endpos = UTF_END(u);
1688         utf_ptr = u->text;
1689
1690         while (utf_ptr < endpos) {
1691                 /* read next unicode character */
1692                 u2 c = utf_nextu2(&utf_ptr);
1693                 if (c == '/') c = '.';
1694
1695                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1696                 else fprintf(file, "?");
1697         }
1698 }
1699
1700
/* is_valid_utf ****************************************************************

   Check whether the byte range [utf_ptr, end_pos) is a valid UTF-8
   string in the sense accepted for Java classfiles.

   Rejected are: a reversed range, the byte 0x00, invalid leading
   bytes, sequences longer than three bytes, truncated sequences,
   malformed continuation bytes and a three-byte encoding of the
   zero character. An empty range is valid.

   Deliberately NOT rejected (Sun Java seems to allow them): overlong
   encodings, surrogates (0xd800..0xdfff) and the codepoints 0xfffe
   and 0xffff.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet consumed             */
	int           trail;                /* continuation bytes of a sequence   */
	int           k;
	unsigned long codepoint;            /* decoded value of a sequence        */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		char lead = *utf_ptr++;

		remaining--;

		/* 0x00 is not allowed */

		if (lead == 0)
			return false;

		/* plain ASCII needs no further checking */

		if (!(lead & 0x80))
			continue;

		/* Only two-byte (110x xxxx) and three-byte (1110 xxxx)
		   sequences are accepted. Longer forms are a Java limitation,
		   everything else is an invalid leading byte. */

		if ((lead & 0xe0) == 0xc0)
			trail = 1;
		else if ((lead & 0xf0) == 0xe0)
			trail = 2;
		else
			return false;

		/* the whole sequence has to lie inside the range */

		if (remaining < trail)
			return false;

		remaining -= trail;

		/* payload bits of the leading byte */

		codepoint = (unsigned long) lead & (0x3f >> trail);

		for (k = 0; k < trail; k++) {
			char cont = *utf_ptr++;

			/* continuation bytes look like 10xx xxxx */

			if ((cont & 0xc0) != 0x80)
				return false;

			codepoint = (codepoint << 6) | (cont & 0x3f);
		}

		/* Java special: zero has to be encoded in two bytes */

		if ((codepoint == 0) && (trail != 1))
			return false;
	}

	return true;
}
1766
1767
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the two-byte encoding 0xc0 0x80 of the
   zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Only bytes inside [utf_ptr, end_pos) are inspected.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	/* disallow empty names */

	if (end_pos <= utf_ptr)
		return false;

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		/* disallow control characters */

		if (c < 0x20)
			return false;

		/* Disallow zero (encoded as 0xc0 0x80). The second byte is
		   only looked at if it still belongs to the string, so a
		   trailing 0xc0 does not cause a read behind end_pos. */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1795
1796 bool is_valid_name_utf(utf *u)
1797 {
1798         return is_valid_name(u->text, UTF_END(u));
1799 }
1800
1801
1802 /* utf_show ********************************************************************
1803
1804    Writes the utf symbols in the utfhash to stdout and displays the
1805    number of external hash chains grouped according to the chainlength
1806    (for debugging purposes).
1807
1808 *******************************************************************************/
1809
1810 #if !defined(NDEBUG)
1811 void utf_show(void)
1812 {
1813
1814 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1815
1816         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1817         u4 max_chainlength = 0;      /* maximum length of the chains */
1818         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1819         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1820         u4 i;
1821
1822         printf("UTF-HASH:\n");
1823
1824         /* show element of utf-hashtable */
1825
1826         for (i = 0; i < hashtable_utf->size; i++) {
1827                 utf *u = hashtable_utf->ptr[i];
1828
1829                 if (u) {
1830                         printf("SLOT %d: ", (int) i);
1831
1832                         while (u) {
1833                                 printf("'");
1834                                 utf_display_printable_ascii(u);
1835                                 printf("' ");
1836                                 u = u->hashlink;
1837                         }
1838                         printf("\n");
1839                 }
1840         }
1841
1842         printf("UTF-HASH: %d slots for %d entries\n",
1843                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1844
1845         if (hashtable_utf->entries == 0)
1846                 return;
1847
1848         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1849
1850         for (i=0;i<CHAIN_LIMIT;i++)
1851                 chain_count[i]=0;
1852
1853         /* count numbers of hashchains according to their length */
1854         for (i=0; i<hashtable_utf->size; i++) {
1855
1856                 utf *u = (utf*) hashtable_utf->ptr[i];
1857                 u4 chain_length = 0;
1858
1859                 /* determine chainlength */
1860                 while (u) {
1861                         u = u->hashlink;
1862                         chain_length++;
1863                 }
1864
1865                 /* update sum of all chainlengths */
1866                 sum_chainlength+=chain_length;
1867
1868                 /* determine the maximum length of the chains */
1869                 if (chain_length>max_chainlength)
1870                         max_chainlength = chain_length;
1871
1872                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1873                 if (chain_length>=CHAIN_LIMIT) {
1874                         beyond_limit+=chain_length;
1875                         chain_length=CHAIN_LIMIT-1;
1876                 }
1877
1878                 /* update number of hashchains of current length */
1879                 chain_count[chain_length]++;
1880         }
1881
1882         /* display results */
1883         for (i=1;i<CHAIN_LIMIT-1;i++)
1884                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1885
1886         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1887
1888
1889         printf("max. chainlength:%5d\n",max_chainlength);
1890
1891         /* avg. chainlength = sum of chainlengths / number of chains */
1892         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1893 }
1894 #endif /* !defined(NDEBUG) */
1895
1896
1897 /*
1898  * These are local overrides for various environment variables in Emacs.
1899  * Please do not remove this and leave it at the end of the file, where
1900  * Emacs will automagically detect them.
1901  * ---------------------------------------------------------------------
1902  * Local variables:
1903  * mode: c
1904  * indent-tabs-mode: t
1905  * c-basic-offset: 4
1906  * tab-width: 4
1907  * End:
1908  * vim:noexpandtab:sw=4:ts=4:
1909  */