src/vmcore/utf8.c

   1 /* src/vmcore/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006, 2007 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25    $Id: utf8.c 8048 2007-06-07 22:41:54Z twisti $
  26
  27 */
  28
  29
  30 #include "config.h"
  31
  32 #include <string.h>
  33 #include <assert.h>
  34
  35 #include "vm/types.h"
  36
  37 #include "mm/memory.h"
  38
  39 #include "threads/lock-common.h"
  40
  41 #include "toolbox/hashtable.h"
  42
  43 #include "vm/exceptions.h"
  44
  45 #include "vmcore/options.h"
  46
  47 #if defined(ENABLE_STATISTICS)
  48 # include "vmcore/statistics.h"
  49 #endif
  50
  51 #include "vmcore/utf8.h"
  52
  53
  54 /* global variables ***********************************************************/
  55
  56 /* hashsize must be power of 2 */
  57
  58 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  59
  60 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  61
  62
  63 /* utf-symbols for pointer comparison of frequently used strings **************/
  64
  65 utf *utf_java_lang_Object;
  66
  67 utf *utf_java_lang_Class;
  68 utf *utf_java_lang_ClassLoader;
  69 utf *utf_java_lang_Cloneable;
  70 utf *utf_java_lang_SecurityManager;
  71 utf *utf_java_lang_String;
  72 utf *utf_java_lang_System;
  73 utf *utf_java_lang_ThreadGroup;
  74 utf *utf_java_lang_ref_SoftReference;
  75 utf *utf_java_lang_ref_WeakReference;
  76 utf *utf_java_lang_ref_PhantomReference;
  77 utf *utf_java_io_Serializable;
  78
  79 utf *utf_java_lang_Throwable;
  80 utf *utf_java_lang_Error;
  81
  82 utf *utf_java_lang_AbstractMethodError;
  83 utf *utf_java_lang_ClassCircularityError;
  84 utf *utf_java_lang_ClassFormatError;
  85 utf *utf_java_lang_ExceptionInInitializerError;
  86 utf *utf_java_lang_IncompatibleClassChangeError;
  87 utf *utf_java_lang_InstantiationError;
  88 utf *utf_java_lang_InternalError;
  89 utf *utf_java_lang_LinkageError;
  90 utf *utf_java_lang_NoClassDefFoundError;
  91 utf *utf_java_lang_NoSuchFieldError;
  92 utf *utf_java_lang_NoSuchMethodError;
  93 utf *utf_java_lang_OutOfMemoryError;
  94 utf *utf_java_lang_UnsatisfiedLinkError;
  95 utf *utf_java_lang_UnsupportedClassVersionError;
  96 utf *utf_java_lang_VerifyError;
  97 utf *utf_java_lang_VirtualMachineError;
  98
  99 #if defined(WITH_CLASSPATH_GNU)
 100 utf *utf_java_lang_VMThrowable;
 101 #endif
 102
 103 utf *utf_java_lang_Exception;
 104
 105 utf *utf_java_lang_ArithmeticException;
 106 utf *utf_java_lang_ArrayIndexOutOfBoundsException;
 107 utf *utf_java_lang_ArrayStoreException;
 108 utf *utf_java_lang_ClassCastException;
 109 utf *utf_java_lang_ClassNotFoundException;
 110 utf *utf_java_lang_CloneNotSupportedException;
 111 utf *utf_java_lang_IllegalAccessException;
 112 utf *utf_java_lang_IllegalArgumentException;
 113 utf *utf_java_lang_IllegalMonitorStateException;
 114 utf *utf_java_lang_InstantiationException;
 115 utf *utf_java_lang_InterruptedException;
 116 utf *utf_java_lang_NegativeArraySizeException;
 117 utf *utf_java_lang_NullPointerException;
 118 utf *utf_java_lang_StringIndexOutOfBoundsException;
 119
 120 utf *utf_java_lang_reflect_InvocationTargetException;
 121
 122 utf *utf_java_security_PrivilegedActionException;
 123
 124 #if defined(ENABLE_JAVASE)
 125 utf* utf_java_lang_Void;
 126 #endif
 127
 128 utf* utf_java_lang_Boolean;
 129 utf* utf_java_lang_Byte;
 130 utf* utf_java_lang_Character;
 131 utf* utf_java_lang_Short;
 132 utf* utf_java_lang_Integer;
 133 utf* utf_java_lang_Long;
 134 utf* utf_java_lang_Float;
 135 utf* utf_java_lang_Double;
 136
 137 #if defined(ENABLE_JAVASE)
 138 utf *utf_java_lang_StackTraceElement;
 139 utf *utf_java_lang_reflect_Constructor;
 140 utf *utf_java_lang_reflect_Field;
 141 utf *utf_java_lang_reflect_Method;
 142 utf *utf_java_util_Vector;
 143 #endif
 144
 145 utf *utf_InnerClasses;                  /* InnerClasses                       */
 146 utf *utf_ConstantValue;                 /* ConstantValue                      */
 147 utf *utf_Code;                          /* Code                               */
 148 utf *utf_Exceptions;                    /* Exceptions                         */
 149 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 150 utf *utf_SourceFile;                    /* SourceFile                         */
 151
 152 #if defined(ENABLE_JAVASE)
 153 utf *utf_EnclosingMethod;
 154 utf *utf_Signature;
 155 utf *utf_RuntimeVisibleAnnotations;
 156 utf *utf_StackMapTable;
 157 #endif
 158
 159 utf *utf_init;                          /* <init>                             */
 160 utf *utf_clinit;                        /* <clinit>                           */
 161 utf *utf_clone;                         /* clone                              */
 162 utf *utf_finalize;                      /* finalize                           */
 163 utf *utf_run;                           /* run                                */
 164
 165 utf *utf_add;
 166 utf *utf_remove;
 167 utf *utf_addThread;
 168 utf *utf_removeThread;
 169 utf *utf_put;
 170 utf *utf_get;
 171 utf *utf_value;
 172
 173 utf *utf_fillInStackTrace;
 174 utf *utf_getSystemClassLoader;
 175 utf *utf_loadClass;
 176 utf *utf_printStackTrace;
 177
 178 utf *utf_division_by_zero;
 179
 180 utf *utf_Z;                             /* Z                                  */
 181 utf *utf_B;                             /* B                                  */
 182 utf *utf_C;                             /* C                                  */
 183 utf *utf_S;                             /* S                                  */
 184 utf *utf_I;                             /* I                                  */
 185 utf *utf_J;                             /* J                                  */
 186 utf *utf_F;                             /* F                                  */
 187 utf *utf_D;                             /* D                                  */
 188
 189 utf *utf_void__void;                    /* ()V                                */
 190 utf *utf_boolean__void;                 /* (Z)V                               */
 191 utf *utf_byte__void;                    /* (B)V                               */
 192 utf *utf_char__void;                    /* (C)V                               */
 193 utf *utf_short__void;                   /* (S)V                               */
 194 utf *utf_int__void;                     /* (I)V                               */
 195 utf *utf_long__void;                    /* (J)V                               */
 196 utf *utf_float__void;                   /* (F)V                               */
 197 utf *utf_double__void;                  /* (D)V                               */
 198
 199 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 200 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 201 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 202 utf *utf_java_lang_Exception__V;        /* (Ljava/lang/Exception;)V           */
 203 utf *utf_java_lang_Object__java_lang_Object;
 204 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 205 utf *utf_java_lang_String__java_lang_Class;
 206 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 207 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 208
 209 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 210 utf *utf_null;
 211 utf *array_packagename;
 212
 213
 214 /* utf_init ********************************************************************
 215
 216    Initializes the utf8 subsystem.
 217
 218 *******************************************************************************/
 219
 220 bool utf8_init(void)
 221 {
 222         /* create utf8 hashtable */
 223
 224         hashtable_utf = NEW(hashtable);
 225
 226         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 227
 228 #if defined(ENABLE_STATISTICS)
 229         if (opt_stat)
 230                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 231 #endif
 232
 233         /* create utf-symbols for pointer comparison of frequently used strings */
 234
 235         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 236
 237         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 238         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 239         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 240         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 241         utf_java_lang_String           = utf_new_char("java/lang/String");
 242         utf_java_lang_System           = utf_new_char("java/lang/System");
 243         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 244
 245         utf_java_lang_ref_SoftReference =
 246                 utf_new_char("java/lang/ref/SoftReference");
 247
 248         utf_java_lang_ref_WeakReference =
 249                 utf_new_char("java/lang/ref/WeakReference");
 250
 251         utf_java_lang_ref_PhantomReference =
 252                 utf_new_char("java/lang/ref/PhantomReference");
 253
 254         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 255
 256         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 257         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 258
 259         utf_java_lang_ClassCircularityError =
 260                 utf_new_char("java/lang/ClassCircularityError");
 261
 262         utf_java_lang_ClassFormatError = utf_new_char("java/lang/ClassFormatError");
 263
 264         utf_java_lang_ExceptionInInitializerError =
 265                 utf_new_char("java/lang/ExceptionInInitializerError");
 266
 267         utf_java_lang_IncompatibleClassChangeError =
 268                 utf_new_char("java/lang/IncompatibleClassChangeError");
 269
 270         utf_java_lang_InstantiationError =
 271                 utf_new_char("java/lang/InstantiationError");
 272
 273         utf_java_lang_InternalError    = utf_new_char("java/lang/InternalError");
 274         utf_java_lang_LinkageError     = utf_new_char("java/lang/LinkageError");
 275
 276         utf_java_lang_NoClassDefFoundError =
 277                 utf_new_char("java/lang/NoClassDefFoundError");
 278
 279         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 280
 281         utf_java_lang_UnsatisfiedLinkError =
 282                 utf_new_char("java/lang/UnsatisfiedLinkError");
 283
 284         utf_java_lang_UnsupportedClassVersionError =
 285                 utf_new_char("java/lang/UnsupportedClassVersionError");
 286
 287         utf_java_lang_VerifyError      = utf_new_char("java/lang/VerifyError");
 288
 289         utf_java_lang_VirtualMachineError =
 290                 utf_new_char("java/lang/VirtualMachineError");
 291
 292 #if defined(ENABLE_JAVASE)
 293         utf_java_lang_AbstractMethodError =
 294                 utf_new_char("java/lang/AbstractMethodError");
 295
 296         utf_java_lang_NoSuchFieldError =
 297                 utf_new_char("java/lang/NoSuchFieldError");
 298
 299         utf_java_lang_NoSuchMethodError =
 300                 utf_new_char("java/lang/NoSuchMethodError");
 301 #endif
 302
 303 #if defined(WITH_CLASSPATH_GNU)
 304         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
 305 #endif
 306
 307         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 308
 309         utf_java_lang_ArithmeticException =
 310                 utf_new_char("java/lang/ArithmeticException");
 311
 312         utf_java_lang_ArrayIndexOutOfBoundsException =
 313                 utf_new_char("java/lang/ArrayIndexOutOfBoundsException");
 314
 315         utf_java_lang_ArrayStoreException =
 316                 utf_new_char("java/lang/ArrayStoreException");
 317
 318         utf_java_lang_ClassCastException =
 319                 utf_new_char("java/lang/ClassCastException");
 320
 321         utf_java_lang_ClassNotFoundException =
 322                 utf_new_char("java/lang/ClassNotFoundException");
 323
 324         utf_java_lang_CloneNotSupportedException =
 325                 utf_new_char("java/lang/CloneNotSupportedException");
 326
 327         utf_java_lang_IllegalAccessException =
 328                 utf_new_char("java/lang/IllegalAccessException");
 329
 330         utf_java_lang_IllegalArgumentException =
 331                 utf_new_char("java/lang/IllegalArgumentException");
 332
 333         utf_java_lang_IllegalMonitorStateException =
 334                 utf_new_char("java/lang/IllegalMonitorStateException");
 335
 336         utf_java_lang_InstantiationException =
 337                 utf_new_char("java/lang/InstantiationException");
 338
 339         utf_java_lang_InterruptedException =
 340                 utf_new_char("java/lang/InterruptedException");
 341
 342         utf_java_lang_NegativeArraySizeException =
 343                 utf_new_char("java/lang/NegativeArraySizeException");
 344
 345         utf_java_lang_NullPointerException =
 346                 utf_new_char("java/lang/NullPointerException");
 347
 348         utf_java_lang_StringIndexOutOfBoundsException =
 349                 utf_new_char("java/lang/StringIndexOutOfBoundsException");
 350
 351         utf_java_lang_reflect_InvocationTargetException =
 352                 utf_new_char("java/lang/reflect/InvocationTargetException");
 353
 354         utf_java_security_PrivilegedActionException =
 355                 utf_new_char("java/security/PrivilegedActionException");
 356
 357 #if defined(ENABLE_JAVASE)
 358         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 359 #endif
 360
 361         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 362         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 363         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 364         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 365         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 366         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 367         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 368         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 369
 370 #if defined(ENABLE_JAVASE)
 371         utf_java_lang_StackTraceElement =
 372                 utf_new_char("java/lang/StackTraceElement");
 373
 374         utf_java_lang_reflect_Constructor =
 375                 utf_new_char("java/lang/reflect/Constructor");
 376
 377         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 378         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 379         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 380 #endif
 381
 382         utf_InnerClasses               = utf_new_char("InnerClasses");
 383         utf_ConstantValue              = utf_new_char("ConstantValue");
 384         utf_Code                       = utf_new_char("Code");
 385         utf_Exceptions                 = utf_new_char("Exceptions");
 386         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 387         utf_SourceFile                 = utf_new_char("SourceFile");
 388
 389 #if defined(ENABLE_JAVASE)
 390         utf_EnclosingMethod            = utf_new_char("EnclosingMethod");
 391         utf_Signature                  = utf_new_char("Signature");
 392         utf_RuntimeVisibleAnnotations  = utf_new_char("RuntimeVisibleAnnotations");
 393         utf_StackMapTable              = utf_new_char("StackMapTable");
 394 #endif
 395
 396         utf_init                           = utf_new_char("<init>");
 397         utf_clinit                         = utf_new_char("<clinit>");
 398         utf_clone                      = utf_new_char("clone");
 399         utf_finalize                   = utf_new_char("finalize");
 400         utf_run                        = utf_new_char("run");
 401
 402         utf_add                        = utf_new_char("add");
 403         utf_remove                     = utf_new_char("remove");
 404         utf_addThread                  = utf_new_char("addThread");
 405         utf_removeThread               = utf_new_char("removeThread");
 406         utf_put                        = utf_new_char("put");
 407         utf_get                        = utf_new_char("get");
 408         utf_value                      = utf_new_char("value");
 409
 410         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 411         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 412         utf_loadClass                  = utf_new_char("loadClass");
 413         utf_printStackTrace            = utf_new_char("printStackTrace");
 414
 415         utf_division_by_zero           = utf_new_char("/ by zero");
 416
 417         utf_Z                          = utf_new_char("Z");
 418         utf_B                          = utf_new_char("B");
 419         utf_C                          = utf_new_char("C");
 420         utf_S                          = utf_new_char("S");
 421         utf_I                          = utf_new_char("I");
 422         utf_J                          = utf_new_char("J");
 423         utf_F                          = utf_new_char("F");
 424         utf_D                          = utf_new_char("D");
 425
 426         utf_void__void                 = utf_new_char("()V");
 427         utf_boolean__void              = utf_new_char("(Z)V");
 428         utf_byte__void                 = utf_new_char("(B)V");
 429         utf_char__void                 = utf_new_char("(C)V");
 430         utf_short__void                = utf_new_char("(S)V");
 431         utf_int__void                  = utf_new_char("(I)V");
 432         utf_long__void                 = utf_new_char("(J)V");
 433         utf_float__void                = utf_new_char("(F)V");
 434         utf_double__void               = utf_new_char("(D)V");
 435         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 436         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 437
 438         utf_void__java_lang_ClassLoader =
 439                 utf_new_char("()Ljava/lang/ClassLoader;");
 440
 441         utf_java_lang_Exception__V     = utf_new_char("(Ljava/lang/Exception;)V");
 442
 443         utf_java_lang_Object__java_lang_Object =
 444                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 445
 446         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 447
 448         utf_java_lang_String__java_lang_Class =
 449                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 450
 451         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 452         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 453
 454         utf_null                       = utf_new_char("null");
 455         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 456         array_packagename              = utf_new_char("\t<the array package>");
 457
 458         /* everything's ok */
 459
 460         return true;
 461 }
 462
 463
 464 /* utf_hashkey *****************************************************************
 465
 466    The hashkey is computed from the utf-text by using up to 8
 467    characters.  For utf-symbols longer than 15 characters 3 characters
 468    are taken from the beginning and the end, 2 characters are taken
 469    from the middle.
 470
 471 *******************************************************************************/
 472
 473 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 474 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 475
 476 u4 utf_hashkey(const char *text, u4 length)
 477 {
 478         const char *start_pos = text;       /* pointer to utf text                */
 479         u4 a;
 480
 481         switch (length) {
 482         case 0: /* empty string */
 483                 return 0;
 484
 485         case 1: return fbs(0);
 486         case 2: return fbs(0) ^ nbs(3);
 487         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 488         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 489         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 490         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 491         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 492         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 493
 494         case 9:
 495                 a = fbs(0);
 496                 a ^= nbs(1);
 497                 a ^= nbs(2);
 498                 text++;
 499                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 500
 501         case 10:
 502                 a = fbs(0);
 503                 text++;
 504                 a ^= nbs(2);
 505                 a ^= nbs(3);
 506                 a ^= nbs(4);
 507                 text++;
 508                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 509
 510         case 11:
 511                 a = fbs(0);
 512                 text++;
 513                 a ^= nbs(2);
 514                 a ^= nbs(3);
 515                 a ^= nbs(4);
 516                 text++;
 517                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 518
 519         case 12:
 520                 a = fbs(0);
 521                 text += 2;
 522                 a ^= nbs(2);
 523                 a ^= nbs(3);
 524                 text++;
 525                 a ^= nbs(5);
 526                 a ^= nbs(6);
 527                 a ^= nbs(7);
 528                 text++;
 529                 return a ^ nbs(9) ^ nbs(10);
 530
 531         case 13:
 532                 a = fbs(0);
 533                 a ^= nbs(1);
 534                 text++;
 535                 a ^= nbs(3);
 536                 a ^= nbs(4);
 537                 text += 2;
 538                 a ^= nbs(7);
 539                 a ^= nbs(8);
 540                 text += 2;
 541                 return a ^ nbs(9) ^ nbs(10);
 542
 543         case 14:
 544                 a = fbs(0);
 545                 text += 2;
 546                 a ^= nbs(3);
 547                 a ^= nbs(4);
 548                 text += 2;
 549                 a ^= nbs(7);
 550                 a ^= nbs(8);
 551                 text += 2;
 552                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 553
 554         case 15:
 555                 a = fbs(0);
 556                 text += 2;
 557                 a ^= nbs(3);
 558                 a ^= nbs(4);
 559                 text += 2;
 560                 a ^= nbs(7);
 561                 a ^= nbs(8);
 562                 text += 2;
 563                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 564
 565         default:  /* 3 characters from beginning */
 566                 a = fbs(0);
 567                 text += 2;
 568                 a ^= nbs(3);
 569                 a ^= nbs(4);
 570
 571                 /* 2 characters from middle */
 572                 text = start_pos + (length / 2);
 573                 a ^= fbs(5);
 574                 text += 2;
 575                 a ^= nbs(6);
 576
 577                 /* 3 characters from end */
 578                 text = start_pos + length - 4;
 579
 580                 a ^= fbs(7);
 581                 text++;
 582
 583                 return a ^ nbs(10) ^ nbs(11);
 584     }
 585 }
 586
 587 /* utf_full_hashkey ************************************************************
 588
 589    This function computes a hash value using all bytes in the string.
 590
 591    The algorithm is the "One-at-a-time" algorithm as published
 592    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 593
 594 *******************************************************************************/
 595
 596 u4 utf_full_hashkey(const char *text, u4 length)
 597 {
 598         register const unsigned char *p = (const unsigned char *) text;
 599         register u4 hash;
 600         register u4 i;
 601
 602         hash = 0;
 603         for (i=length; i--;)
 604         {
 605             hash += *p++;
 606             hash += (hash << 10);
 607             hash ^= (hash >> 6);
 608         }
 609         hash += (hash << 3);
 610         hash ^= (hash >> 11);
 611         hash += (hash << 15);
 612
 613         return hash;
 614 }
 615
 616 /* unicode_hashkey *************************************************************
 617
 618    Compute the hashkey of a unicode string.
 619
 620 *******************************************************************************/
 621
 622 u4 unicode_hashkey(u2 *text, u2 len)
 623 {
 624         return utf_hashkey((char *) text, len);
 625 }
 626
 627
 628 /* utf_new *********************************************************************
 629
 630    Creates a new utf-symbol, the text of the symbol is passed as a
 631    u1-array. The function searches the utf-hashtable for a utf-symbol
 632    with this text. On success the element returned, otherwise a new
 633    hashtable element is created.
 634
 635    If the number of entries in the hashtable exceeds twice the size of
 636    the hashtable slots a reorganization of the hashtable is done and
 637    the utf symbols are copied to a new hashtable with doubled size.
 638
 639 *******************************************************************************/
 640
 641 utf *utf_new(const char *text, u2 length)
 642 {
 643         u4 key;                             /* hashkey computed from utf-text     */
 644         u4 slot;                            /* slot in hashtable                  */
 645         utf *u;                             /* hashtable element                  */
 646         u2 i;
 647
 648         LOCK_MONITOR_ENTER(hashtable_utf->header);
 649
 650 #if defined(ENABLE_STATISTICS)
 651         if (opt_stat)
 652                 count_utf_new++;
 653 #endif
 654
 655         key  = utf_hashkey(text, length);
 656         slot = key & (hashtable_utf->size - 1);
 657         u    = hashtable_utf->ptr[slot];
 658
 659         /* search external hash chain for utf-symbol */
 660
 661         while (u) {
 662                 if (u->blength == length) {
 663                         /* compare text of hashtable elements */
 664
 665                         for (i = 0; i < length; i++)
 666                                 if (text[i] != u->text[i])
 667                                         goto nomatch;
 668
 669 #if defined(ENABLE_STATISTICS)
 670                         if (opt_stat)
 671                                 count_utf_new_found++;
 672 #endif
 673
 674                         /* symbol found in hashtable */
 675
 676                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 677
 678                         return u;
 679                 }
 680
 681         nomatch:
 682                 u = u->hashlink; /* next element in external chain */
 683         }
 684
 685         /* location in hashtable found, create new utf element */
 686
 687         u = NEW(utf);
 688
 689         u->blength  = length;               /* length in bytes of utfstring       */
 690         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 691         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 692
 693         memcpy(u->text, text, length);      /* copy utf-text                      */
 694         u->text[length] = '\0';
 695
 696 #if defined(ENABLE_STATISTICS)
 697         if (opt_stat)
 698                 count_utf_len += sizeof(utf) + length + 1;
 699 #endif
 700
 701         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 702         hashtable_utf->entries++;           /* update number of entries           */
 703
 704         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 705
 706         /* reorganization of hashtable, average length of the external
 707            chains is approx. 2 */
 708
 709                 hashtable *newhash;                              /* the new hashtable */
 710                 u4         i;
 711                 utf       *u;
 712                 utf       *nextu;
 713                 u4         slot;
 714
 715                 /* create new hashtable, double the size */
 716
 717                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 718
 719 #if defined(ENABLE_STATISTICS)
 720                 if (opt_stat)
 721                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 722 #endif
 723
 724                 /* transfer elements to new hashtable */
 725
 726                 for (i = 0; i < hashtable_utf->size; i++) {
 727                         u = hashtable_utf->ptr[i];
 728
 729                         while (u) {
 730                                 nextu = u->hashlink;
 731                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 732
 733                                 u->hashlink = (utf *) newhash->ptr[slot];
 734                                 newhash->ptr[slot] = u;
 735
 736                                 /* follow link in external hash chain */
 737
 738                                 u = nextu;
 739                         }
 740                 }
 741
 742                 /* dispose old table */
 743
 744                 hashtable_free(hashtable_utf);
 745
 746                 hashtable_utf = newhash;
 747         }
 748
 749         LOCK_MONITOR_EXIT(hashtable_utf->header);
 750
 751         return u;
 752 }
 753
 754
 755 /* utf_new_u2 ******************************************************************
 756
 757    Make utf symbol from u2 array, if isclassname is true '.' is
 758    replaced by '/'.
 759
 760 *******************************************************************************/
 761
 762 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 763 {
 764         char *buffer;                   /* memory buffer for  unicode characters  */
 765         char *pos;                      /* pointer to current position in buffer  */
 766         u4 left;                        /* unicode characters left                */
 767         u4 buflength;                   /* utf length in bytes of the u2 array    */
 768         utf *result;                    /* resulting utf-string                   */
 769         int i;
 770
 771         /* determine utf length in bytes and allocate memory */
 772
 773         buflength = u2_utflength(unicode_pos, unicode_length);
 774         buffer    = MNEW(char, buflength);
 775
 776         left = buflength;
 777         pos  = buffer;
 778
 779         for (i = 0; i++ < unicode_length; unicode_pos++) {
 780                 /* next unicode character */
 781                 u2 c = *unicode_pos;
 782
 783                 if ((c != 0) && (c < 0x80)) {
 784                         /* 1 character */
 785                         left--;
 786                 if ((int) left < 0) break;
 787                         /* convert classname */
 788                         if (isclassname && c == '.')
 789                                 *pos++ = '/';
 790                         else
 791                                 *pos++ = (char) c;
 792
 793                 } else if (c < 0x800) {
 794                         /* 2 characters */
 795                 unsigned char high = c >> 6;
 796                 unsigned char low  = c & 0x3F;
 797                         left = left - 2;
 798                 if ((int) left < 0) break;
 799                 *pos++ = high | 0xC0;
 800                 *pos++ = low  | 0x80;
 801
 802                 } else {
 803                 /* 3 characters */
 804                 char low  = c & 0x3f;
 805                 char mid  = (c >> 6) & 0x3F;
 806                 char high = c >> 12;
 807                         left = left - 3;
 808                 if ((int) left < 0) break;
 809                 *pos++ = high | 0xE0;
 810                 *pos++ = mid  | 0x80;
 811                 *pos++ = low  | 0x80;
 812                 }
 813         }
 814
 815         /* insert utf-string into symbol-table */
 816         result = utf_new(buffer,buflength);
 817
 818         MFREE(buffer, char, buflength);
 819
 820         return result;
 821 }
 822
 823
 824 /* utf_new_char ****************************************************************
 825
 826    Creates a new utf symbol, the text for this symbol is passed as a
 827    c-string ( = char* ).
 828
 829 *******************************************************************************/
 830
 831 utf *utf_new_char(const char *text)
 832 {
 833         return utf_new(text, strlen(text));
 834 }
 835
 836
 837 /* utf_new_char_classname ******************************************************
 838
 839    Creates a new utf symbol, the text for this symbol is passed as a
 840    c-string ( = char* ) "." characters are going to be replaced by
 841    "/". Since the above function is used often, this is a separte
 842    function, instead of an if.
 843
 844 *******************************************************************************/
 845
 846 utf *utf_new_char_classname(const char *text)
 847 {
 848         if (strchr(text, '.')) {
 849                 char *txt = strdup(text);
 850                 char *end = txt + strlen(txt);
 851                 char *c;
 852                 utf *tmpRes;
 853
 854                 for (c = txt; c < end; c++)
 855                         if (*c == '.') *c = '/';
 856
 857                 tmpRes = utf_new(txt, strlen(txt));
 858                 FREE(txt, 0);
 859
 860                 return tmpRes;
 861
 862         } else
 863                 return utf_new(text, strlen(text));
 864 }
 865
 866
 867 /* utf_nextu2 ******************************************************************
 868
 869    Read the next unicode character from the utf string and increment
 870    the utf-string pointer accordingly.
 871
 872    CAUTION: This function is unsafe for input that was not checked
 873             by is_valid_utf!
 874
 875 *******************************************************************************/
 876
 877 u2 utf_nextu2(char **utf_ptr)
 878 {
 879     /* uncompressed unicode character */
 880     u2 unicode_char = 0;
 881     /* current position in utf text */
 882     unsigned char *utf = (unsigned char *) (*utf_ptr);
 883     /* bytes representing the unicode character */
 884     unsigned char ch1, ch2, ch3;
 885     /* number of bytes used to represent the unicode character */
 886     int len = 0;
 887
 888     switch ((ch1 = utf[0]) >> 4) {
 889         default: /* 1 byte */
 890                 (*utf_ptr)++;
 891                 return (u2) ch1;
 892         case 0xC:
 893         case 0xD: /* 2 bytes */
 894                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 895                         unsigned char high = ch1 & 0x1F;
 896                         unsigned char low  = ch2 & 0x3F;
 897                         unicode_char = (high << 6) + low;
 898                         len = 2;
 899                 }
 900                 break;
 901
 902         case 0xE: /* 2 or 3 bytes */
 903                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 904                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 905                                 unsigned char low  = ch3 & 0x3f;
 906                                 unsigned char mid  = ch2 & 0x3f;
 907                                 unsigned char high = ch1 & 0x0f;
 908                                 unicode_char = (((high << 6) + mid) << 6) + low;
 909                                 len = 3;
 910                         } else
 911                                 len = 2;
 912                 }
 913                 break;
 914     }
 915
 916     /* update position in utf-text */
 917     *utf_ptr = (char *) (utf + len);
 918
 919     return unicode_char;
 920 }
 921
 922
 923 /* utf_bytes *******************************************************************
 924
 925    Determine number of bytes (aka. octets) in the utf string.
 926
 927    IN:
 928       u............utf string
 929
 930    OUT:
 931       The number of octets of this utf string.
 932           There is _no_ terminating zero included in this count.
 933
 934 *******************************************************************************/
 935
 936 u4 utf_bytes(utf *u)
 937 {
 938         return u->blength;
 939 }
 940
 941
 942 /* utf_get_number_of_u2s_for_buffer ********************************************
 943
 944    Determine number of UTF-16 u2s in the given UTF-8 buffer
 945
 946    CAUTION: This function is unsafe for input that was not checked
 947             by is_valid_utf!
 948
 949    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 950    to an array of u2s (UTF-16) and want to know how many of them you will get.
 951    All other uses of this function are probably wrong.
 952
 953    IN:
 954       buffer........points to first char in buffer
 955           blength.......number of _bytes_ in the buffer
 956
 957    OUT:
 958       the number of u2s needed to hold this string in UTF-16 encoding.
 959           There is _no_ terminating zero included in this count.
 960
 961    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 962    exception.
 963
 964 *******************************************************************************/
 965
 966 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 967 {
 968         const char *endpos;                 /* points behind utf string           */
 969         const char *utf_ptr;                /* current position in utf text       */
 970         u4 len = 0;                         /* number of unicode characters       */
 971
 972         utf_ptr = buffer;
 973         endpos = utf_ptr + blength;
 974
 975         while (utf_ptr < endpos) {
 976                 len++;
 977                 /* next unicode character */
 978                 utf_nextu2((char **)&utf_ptr);
 979         }
 980
 981         assert(utf_ptr == endpos);
 982
 983         return len;
 984 }
 985
 986
 987 /* utf_get_number_of_u2s *******************************************************
 988
 989    Determine number of UTF-16 u2s in the utf string.
 990
 991    CAUTION: This function is unsafe for input that was not checked
 992             by is_valid_utf!
 993
 994    CAUTION: Use this function *only* when you want to convert a utf string
 995    to an array of u2s and want to know how many of them you will get.
 996    All other uses of this function are probably wrong.
 997
 998    IN:
 999       u............utf string
1000
1001    OUT:
1002       the number of u2s needed to hold this string in UTF-16 encoding.
1003           There is _no_ terminating zero included in this count.
1004           XXX 0 if a NullPointerException has been thrown (see below)
1005
1006 *******************************************************************************/
1007
1008 u4 utf_get_number_of_u2s(utf *u)
1009 {
1010         char *endpos;                       /* points behind utf string           */
1011         char *utf_ptr;                      /* current position in utf text       */
1012         u4 len = 0;                         /* number of unicode characters       */
1013
1014         /* XXX this is probably not checked by most callers! Review this after */
1015         /* the invalid uses of this function have been eliminated */
1016         if (u == NULL) {
1017                 exceptions_throw_nullpointerexception();
1018                 return 0;
1019         }
1020
1021         endpos = UTF_END(u);
1022         utf_ptr = u->text;
1023
1024         while (utf_ptr < endpos) {
1025                 len++;
1026                 /* next unicode character */
1027                 utf_nextu2(&utf_ptr);
1028         }
1029
1030         if (utf_ptr != endpos) {
1031                 /* string ended abruptly */
1032                 exceptions_throw_internalerror("Illegal utf8 string");
1033                 return 0;
1034         }
1035
1036         return len;
1037 }
1038
1039
1040 /* utf8_safe_number_of_u2s *****************************************************
1041
1042    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
1043    (For invalid UTF-8 the U+fffd replacement character will be counted.)
1044
1045    This function is safe even for invalid UTF-8 strings.
1046
1047    IN:
1048       text..........zero-terminated(!) UTF-8 string (may be invalid)
1049                         must NOT be NULL
1050           nbytes........strlen(text). (This is needed to completely emulate
1051                         the RI).
1052
1053    OUT:
1054       the number of u2s needed to hold this string in UTF-16 encoding.
1055           There is _no_ terminating zero included in this count.
1056
1057 *******************************************************************************/
1058
1059 s4 utf8_safe_number_of_u2s(const char *text, s4 nbytes) {
1060         register const unsigned char *t;
1061         register s4 byte;
1062         register s4 len;
1063         register const unsigned char *tlimit;
1064         s4 byte1;
1065         s4 byte2;
1066         s4 byte3;
1067         s4 value;
1068         s4 skip;
1069
1070         assert(text);
1071         assert(nbytes >= 0);
1072
1073         len = 0;
1074         t = (const unsigned char *) text;
1075         tlimit = t + nbytes;
1076
1077         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
1078
1079         while (1) {
1080                 byte = *t++;
1081
1082                 if (byte & 0x80) {
1083                         /* highest bit set, non-ASCII character */
1084
1085                         if ((byte & 0xe0) == 0xc0) {
1086                                 /* 2-byte: should be 110..... 10...... ? */
1087
1088                                 if ((*t++ & 0xc0) == 0x80)
1089                                         ; /* valid 2-byte */
1090                                 else
1091                                         t--; /* invalid */
1092                         }
1093                         else if ((byte & 0xf0) == 0xe0) {
1094                                 /* 3-byte: should be 1110.... 10...... 10...... */
1095                                 /*                            ^t                */
1096
1097                                 if (t + 2 > tlimit)
1098                                         return len + 1; /* invalid, stop here */
1099
1100                                 if ((*t++ & 0xc0) == 0x80) {
1101                                         if ((*t++ & 0xc0) == 0x80)
1102                                                 ; /* valid 3-byte */
1103                                         else
1104                                                 t--; /* invalid */
1105                                 }
1106                                 else
1107                                         t--; /* invalid */
1108                         }
1109                         else if ((byte & 0xf8) == 0xf0) {
1110                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1111                                 /*                            ^t                         */
1112
1113                                 if (t + 3 > tlimit)
1114                                         return len + 1; /* invalid, stop here */
1115
1116                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1117                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1118                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1119                                                         /* valid 4-byte UTF-8? */
1120                                                         value = ((byte  & 0x07) << 18)
1121                                                                   | ((byte1 & 0x3f) << 12)
1122                                                                   | ((byte2 & 0x3f) <<  6)
1123                                                                   | ((byte3 & 0x3f)      );
1124
1125                                                         if (value > 0x10FFFF)
1126                                                                 ; /* invalid */
1127                                                         else if (value > 0xFFFF)
1128                                                                 len += 1; /* we need surrogates */
1129                                                         else
1130                                                                 ; /* 16bit suffice */
1131                                                 }
1132                                                 else
1133                                                         t--; /* invalid */
1134                                         }
1135                                         else
1136                                                 t--; /* invalid */
1137                                 }
1138                                 else
1139                                         t--; /* invalid */
1140                         }
1141                         else if ((byte & 0xfc) == 0xf8) {
1142                                 /* invalid 5-byte */
1143                                 if (t + 4 > tlimit)
1144                                         return len + 1; /* invalid, stop here */
1145
1146                                 skip = 4;
1147                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1148                                         t++;
1149                         }
1150                         else if ((byte & 0xfe) == 0xfc) {
1151                                 /* invalid 6-byte */
1152                                 if (t + 5 > tlimit)
1153                                         return len + 1; /* invalid, stop here */
1154
1155                                 skip = 5;
1156                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1157                                         t++;
1158                         }
1159                         else
1160                                 ; /* invalid */
1161                 }
1162                 else {
1163                         /* NUL */
1164
1165                         if (byte == 0)
1166                                 break;
1167
1168                         /* ASCII character, common case */
1169                 }
1170
1171                 len++;
1172         }
1173
1174         return len;
1175 }
1176
1177
1178 /* utf8_safe_convert_to_u2s ****************************************************
1179
1180    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1181    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1182    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1183
1184    This function is safe even for invalid UTF-8 strings.
1185
1186    IN:
1187       text..........zero-terminated(!) UTF-8 string (may be invalid)
1188                         must NOT be NULL
1189           nbytes........strlen(text). (This is needed to completely emulate
1190                                         the RI).
1191           buffer........a preallocated array of u2s to receive the decoded
1192                         string. Use utf8_safe_number_of_u2s to get the
1193                                         required number of u2s for allocating this.
1194
1195 *******************************************************************************/
1196
1197 #define UNICODE_REPLACEMENT  0xfffd
1198
1199 void utf8_safe_convert_to_u2s(const char *text, s4 nbytes, u2 *buffer) {
1200         register const unsigned char *t;
1201         register s4 byte;
1202         register const unsigned char *tlimit;
1203         s4 byte1;
1204         s4 byte2;
1205         s4 byte3;
1206         s4 value;
1207         s4 skip;
1208
1209         assert(text);
1210         assert(nbytes >= 0);
1211
1212         t = (const unsigned char *) text;
1213         tlimit = t + nbytes;
1214
1215         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1216
1217         while (1) {
1218                 byte = *t++;
1219
1220                 if (byte & 0x80) {
1221                         /* highest bit set, non-ASCII character */
1222
1223                         if ((byte & 0xe0) == 0xc0) {
1224                                 /* 2-byte: should be 110..... 10...... */
1225
1226                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1227                                         /* valid 2-byte UTF-8 */
1228                                         *buffer++ = ((byte  & 0x1f) << 6)
1229                                                           | ((byte1 & 0x3f)     );
1230                                 }
1231                                 else {
1232                                         *buffer++ = UNICODE_REPLACEMENT;
1233                                         t--;
1234                                 }
1235                         }
1236                         else if ((byte & 0xf0) == 0xe0) {
1237                                 /* 3-byte: should be 1110.... 10...... 10...... */
1238
1239                                 if (t + 2 > tlimit) {
1240                                         *buffer++ = UNICODE_REPLACEMENT;
1241                                         return;
1242                                 }
1243
1244                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1245                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1246                                                 /* valid 3-byte UTF-8 */
1247                                                 *buffer++ = ((byte  & 0x0f) << 12)
1248                                                                   | ((byte1 & 0x3f) <<  6)
1249                                                                   | ((byte2 & 0x3f)      );
1250                                         }
1251                                         else {
1252                                                 *buffer++ = UNICODE_REPLACEMENT;
1253                                                 t--;
1254                                         }
1255                                 }
1256                                 else {
1257                                         *buffer++ = UNICODE_REPLACEMENT;
1258                                         t--;
1259                                 }
1260                         }
1261                         else if ((byte & 0xf8) == 0xf0) {
1262                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1263
1264                                 if (t + 3 > tlimit) {
1265                                         *buffer++ = UNICODE_REPLACEMENT;
1266                                         return;
1267                                 }
1268
1269                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1270                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1271                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1272                                                         /* valid 4-byte UTF-8? */
1273                                                         value = ((byte  & 0x07) << 18)
1274                                                                   | ((byte1 & 0x3f) << 12)
1275                                                                   | ((byte2 & 0x3f) <<  6)
1276                                                                   | ((byte3 & 0x3f)      );
1277
1278                                                         if (value > 0x10FFFF) {
1279                                                                 *buffer++ = UNICODE_REPLACEMENT;
1280                                                         }
1281                                                         else if (value > 0xFFFF) {
1282                                                                 /* we need surrogates */
1283                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1284                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1285                                                         }
1286                                                         else
1287                                                                 *buffer++ = value; /* 16bit suffice */
1288                                                 }
1289                                                 else {
1290                                                         *buffer++ = UNICODE_REPLACEMENT;
1291                                                         t--;
1292                                                 }
1293                                         }
1294                                         else {
1295                                                 *buffer++ = UNICODE_REPLACEMENT;
1296                                                 t--;
1297                                         }
1298                                 }
1299                                 else {
1300                                         *buffer++ = UNICODE_REPLACEMENT;
1301                                         t--;
1302                                 }
1303                         }
1304                         else if ((byte & 0xfc) == 0xf8) {
1305                                 if (t + 4 > tlimit) {
1306                                         *buffer++ = UNICODE_REPLACEMENT;
1307                                         return;
1308                                 }
1309
1310                                 skip = 4;
1311                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1312                                         t++;
1313                                 *buffer++ = UNICODE_REPLACEMENT;
1314                         }
1315                         else if ((byte & 0xfe) == 0xfc) {
1316                                 if (t + 5 > tlimit) {
1317                                         *buffer++ = UNICODE_REPLACEMENT;
1318                                         return;
1319                                 }
1320
1321                                 skip = 5;
1322                                 for (; skip && ((*t & 0xc0) == 0x80); --skip)
1323                                         t++;
1324                                 *buffer++ = UNICODE_REPLACEMENT;
1325                         }
1326                         else
1327                                 *buffer++ = UNICODE_REPLACEMENT;
1328                 }
1329                 else {
1330                         /* NUL */
1331
1332                         if (byte == 0)
1333                                 break;
1334
1335                         /* ASCII character, common case */
1336
1337                         *buffer++ = byte;
1338                 }
1339         }
1340 }
1341
1342
1343 /* u2_utflength ****************************************************************
1344
1345    Returns the utf length in bytes of a u2 array.
1346
1347 *******************************************************************************/
1348
1349 u4 u2_utflength(u2 *text, u4 u2_length)
1350 {
1351         u4 result_len = 0;                  /* utf length in bytes                */
1352         u2 ch;                              /* current unicode character          */
1353         u4 len;
1354
1355         for (len = 0; len < u2_length; len++) {
1356                 /* next unicode character */
1357                 ch = *text++;
1358
1359                 /* determine bytes required to store unicode character as utf */
1360                 if (ch && (ch < 0x80))
1361                         result_len++;
1362                 else if (ch < 0x800)
1363                         result_len += 2;
1364                 else
1365                         result_len += 3;
1366         }
1367
1368     return result_len;
1369 }
1370
1371
1372 /* utf_copy ********************************************************************
1373
1374    Copy the given utf string byte-for-byte to a buffer.
1375
1376    IN:
1377       buffer.......the buffer
1378           u............the utf string
1379
1380 *******************************************************************************/
1381
1382 void utf_copy(char *buffer, utf *u)
1383 {
1384         /* our utf strings are zero-terminated (done by utf_new) */
1385         MCOPY(buffer, u->text, char, u->blength + 1);
1386 }
1387
1388
1389 /* utf_cat *********************************************************************
1390
1391    Append the given utf string byte-for-byte to a buffer.
1392
1393    IN:
1394       buffer.......the buffer
1395           u............the utf string
1396
1397 *******************************************************************************/
1398
1399 void utf_cat(char *buffer, utf *u)
1400 {
1401         /* our utf strings are zero-terminated (done by utf_new) */
1402         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1403 }
1404
1405
1406 /* utf_copy_classname **********************************************************
1407
1408    Copy the given utf classname byte-for-byte to a buffer.
1409    '/' is replaced by '.'
1410
1411    IN:
1412       buffer.......the buffer
1413           u............the utf string
1414
1415 *******************************************************************************/
1416
1417 void utf_copy_classname(char *buffer, utf *u)
1418 {
1419         char *bufptr;
1420         char *srcptr;
1421         char *endptr;
1422         char ch;
1423
1424         bufptr = buffer;
1425         srcptr = u->text;
1426         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1427
1428         while (srcptr != endptr) {
1429                 ch = *srcptr++;
1430                 if (ch == '/')
1431                         ch = '.';
1432                 *bufptr++ = ch;
1433         }
1434 }
1435
1436
1437 /* utf_cat *********************************************************************
1438
1439    Append the given utf classname byte-for-byte to a buffer.
1440    '/' is replaced by '.'
1441
1442    IN:
1443       buffer.......the buffer
1444           u............the utf string
1445
1446 *******************************************************************************/
1447
1448 void utf_cat_classname(char *buffer, utf *u)
1449 {
1450         utf_copy_classname(buffer + strlen(buffer), u);
1451 }
1452
1453 /* utf_display_printable_ascii *************************************************
1454
1455    Write utf symbol to stdout (for debugging purposes).
1456    Non-printable and non-ASCII characters are printed as '?'.
1457
1458 *******************************************************************************/
1459
1460 void utf_display_printable_ascii(utf *u)
1461 {
1462         char *endpos;                       /* points behind utf string           */
1463         char *utf_ptr;                      /* current position in utf text       */
1464
1465         if (u == NULL) {
1466                 printf("NULL");
1467                 fflush(stdout);
1468                 return;
1469         }
1470
1471         endpos = UTF_END(u);
1472         utf_ptr = u->text;
1473
1474         while (utf_ptr < endpos) {
1475                 /* read next unicode character */
1476
1477                 u2 c = utf_nextu2(&utf_ptr);
1478
1479                 if ((c >= 32) && (c <= 127))
1480                         printf("%c", c);
1481                 else
1482                         printf("?");
1483         }
1484
1485         fflush(stdout);
1486 }
1487
1488
1489 /* utf_display_printable_ascii_classname ***************************************
1490
1491    Write utf symbol to stdout with `/' converted to `.' (for debugging
1492    purposes).
1493    Non-printable and non-ASCII characters are printed as '?'.
1494
1495 *******************************************************************************/
1496
1497 void utf_display_printable_ascii_classname(utf *u)
1498 {
1499         char *endpos;                       /* points behind utf string           */
1500         char *utf_ptr;                      /* current position in utf text       */
1501
1502         if (u == NULL) {
1503                 printf("NULL");
1504                 fflush(stdout);
1505                 return;
1506         }
1507
1508         endpos = UTF_END(u);
1509         utf_ptr = u->text;
1510
1511         while (utf_ptr < endpos) {
1512                 /* read next unicode character */
1513
1514                 u2 c = utf_nextu2(&utf_ptr);
1515
1516                 if (c == '/')
1517                         c = '.';
1518
1519                 if ((c >= 32) && (c <= 127))
1520                         printf("%c", c);
1521                 else
1522                         printf("?");
1523         }
1524
1525         fflush(stdout);
1526 }
1527
1528
1529 /* utf_sprint_convert_to_latin1 ************************************************
1530
1531    Write utf symbol into c-string (for debugging purposes).
1532    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1533    invalid results.
1534
1535 *******************************************************************************/
1536
1537 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1538 {
1539         char *endpos;                       /* points behind utf string           */
1540         char *utf_ptr;                      /* current position in utf text       */
1541         u2 pos = 0;                         /* position in c-string               */
1542
1543         if (!u) {
1544                 strcpy(buffer, "NULL");
1545                 return;
1546         }
1547
1548         endpos = UTF_END(u);
1549         utf_ptr = u->text;
1550
1551         while (utf_ptr < endpos)
1552                 /* copy next unicode character */
1553                 buffer[pos++] = utf_nextu2(&utf_ptr);
1554
1555         /* terminate string */
1556         buffer[pos] = '\0';
1557 }
1558
1559
1560 /* utf_sprint_convert_to_latin1_classname **************************************
1561
1562    Write utf symbol into c-string with `/' converted to `.' (for debugging
1563    purposes).
1564    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1565    invalid results.
1566
1567 *******************************************************************************/
1568
1569 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1570 {
1571         char *endpos;                       /* points behind utf string           */
1572         char *utf_ptr;                      /* current position in utf text       */
1573         u2 pos = 0;                         /* position in c-string               */
1574
1575         if (!u) {
1576                 strcpy(buffer, "NULL");
1577                 return;
1578         }
1579
1580         endpos = UTF_END(u);
1581         utf_ptr = u->text;
1582
1583         while (utf_ptr < endpos) {
1584                 /* copy next unicode character */
1585                 u2 c = utf_nextu2(&utf_ptr);
1586                 if (c == '/') c = '.';
1587                 buffer[pos++] = c;
1588         }
1589
1590         /* terminate string */
1591         buffer[pos] = '\0';
1592 }
1593
1594
1595 /* utf_strcat_convert_to_latin1 ************************************************
1596
1597    Like libc strcat, but uses an utf8 string.
1598    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1599    invalid results.
1600
1601 *******************************************************************************/
1602
1603 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1604 {
1605         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1606 }
1607
1608
1609 /* utf_strcat_convert_to_latin1_classname **************************************
1610
1611    Like libc strcat, but uses an utf8 string.
1612    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1613    invalid results.
1614
1615 *******************************************************************************/
1616
1617 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1618 {
1619         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1620 }
1621
1622
1623 /* utf_fprint_printable_ascii **************************************************
1624
1625    Write utf symbol into file.
1626    Non-printable and non-ASCII characters are printed as '?'.
1627
1628 *******************************************************************************/
1629
1630 void utf_fprint_printable_ascii(FILE *file, utf *u)
1631 {
1632         char *endpos;                       /* points behind utf string           */
1633         char *utf_ptr;                      /* current position in utf text       */
1634
1635         if (!u)
1636                 return;
1637
1638         endpos = UTF_END(u);
1639         utf_ptr = u->text;
1640
1641         while (utf_ptr < endpos) {
1642                 /* read next unicode character */
1643                 u2 c = utf_nextu2(&utf_ptr);
1644
1645                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1646                 else fprintf(file, "?");
1647         }
1648 }
1649
1650
1651 /* utf_fprint_printable_ascii_classname ****************************************
1652
1653    Write utf symbol into file with `/' converted to `.'.
1654    Non-printable and non-ASCII characters are printed as '?'.
1655
1656 *******************************************************************************/
1657
1658 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1659 {
1660         char *endpos;                       /* points behind utf string           */
1661         char *utf_ptr;                      /* current position in utf text       */
1662
1663     if (!u)
1664                 return;
1665
1666         endpos = UTF_END(u);
1667         utf_ptr = u->text;
1668
1669         while (utf_ptr < endpos) {
1670                 /* read next unicode character */
1671                 u2 c = utf_nextu2(&utf_ptr);
1672                 if (c == '/') c = '.';
1673
1674                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1675                 else fprintf(file, "?");
1676         }
1677 }
1678
1679
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the
   sense accepted for Java class files.

   The following rules are enforced:
     - a plain 0x00 byte is rejected,
     - only one-, two- and three-byte sequences are accepted,
     - every continuation byte must have the form 10xx xxxx,
     - a sequence must not be cut off by the end of the string,
     - the value zero must be encoded as the two-byte sequence.

   Deliberately NOT rejected: overlong encodings of non-zero values,
   surrogates (0xd800..0xdfff) and the code points 0xfffe/0xffff.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not consumed yet             */
	int           trail;                /* continuation bytes of a sequence   */
	int           k;
	unsigned char lead;                 /* first byte of a sequence           */
	unsigned char next;                 /* continuation byte                  */
	unsigned long value;                /* decoded value of a sequence        */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		lead = (unsigned char) *utf_ptr++;
		remaining--;

		if (lead == 0x00)
			return false;                     /* 0x00 is not allowed */

		if (lead < 0x80)
			continue;                         /* ASCII */

		/* Classify the leading byte.  Stray continuation bytes
		   (10xx xxxx), the invalid bytes 0xfe/0xff and all leading
		   bytes of sequences longer than three bytes (1111 xxxx, a
		   Java limitation) end up in the final else. */

		if ((lead & 0xe0) == 0xc0)
			trail = 1;                        /* 110x xxxx */
		else if ((lead & 0xf0) == 0xe0)
			trail = 2;                        /* 1110 xxxx */
		else
			return false;

		if (remaining < trail)
			return false;                     /* missing bytes */

		remaining -= trail;

		/* payload bits of the leading byte */

		value = lead & (0x3f >> trail);

		for (k = 0; k < trail; k++) {
			next = (unsigned char) *utf_ptr++;

			if ((next & 0xc0) != 0x80)        /* 10xx xxxx */
				return false;

			value = (value << 6) | (next & 0x3f);
		}

		/* Java special: zero is only valid as 0xc0 0x80 */

		if ((value == 0) && (trail != 1))
			return false;
	}

	return true;
}
1745
1746
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters below 0x20 and the two-byte encoding 0xc0 0x80 of the
   zero character.)

   NOTE: The string is assumed to have passed is_valid_utf!

   Bytes at or behind end_pos are never inspected, even if the string
   ends with a dangling 0xc0.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                     /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                 /* disallow control characters */

		/* disallow zero (0xc0 0x80); the second byte is only looked
		   at if it still belongs to the string */

		if ((c == 0xc0) && (utf_ptr < end_pos) &&
			((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1774
1775 bool is_valid_name_utf(utf *u)
1776 {
1777         return is_valid_name(u->text, UTF_END(u));
1778 }
1779
1780
1781 /* utf_show ********************************************************************
1782
1783    Writes the utf symbols in the utfhash to stdout and displays the
1784    number of external hash chains grouped according to the chainlength
1785    (for debugging purposes).
1786
1787 *******************************************************************************/
1788
1789 #if !defined(NDEBUG)
1790 void utf_show(void)
1791 {
1792
1793 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1794
1795         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1796         u4 max_chainlength = 0;      /* maximum length of the chains */
1797         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1798         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1799         u4 i;
1800
1801         printf("UTF-HASH:\n");
1802
1803         /* show element of utf-hashtable */
1804
1805         for (i = 0; i < hashtable_utf->size; i++) {
1806                 utf *u = hashtable_utf->ptr[i];
1807
1808                 if (u) {
1809                         printf("SLOT %d: ", (int) i);
1810
1811                         while (u) {
1812                                 printf("'");
1813                                 utf_display_printable_ascii(u);
1814                                 printf("' ");
1815                                 u = u->hashlink;
1816                         }
1817                         printf("\n");
1818                 }
1819         }
1820
1821         printf("UTF-HASH: %d slots for %d entries\n",
1822                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1823
1824         if (hashtable_utf->entries == 0)
1825                 return;
1826
1827         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1828
1829         for (i=0;i<CHAIN_LIMIT;i++)
1830                 chain_count[i]=0;
1831
1832         /* count numbers of hashchains according to their length */
1833         for (i=0; i<hashtable_utf->size; i++) {
1834
1835                 utf *u = (utf*) hashtable_utf->ptr[i];
1836                 u4 chain_length = 0;
1837
1838                 /* determine chainlength */
1839                 while (u) {
1840                         u = u->hashlink;
1841                         chain_length++;
1842                 }
1843
1844                 /* update sum of all chainlengths */
1845                 sum_chainlength+=chain_length;
1846
1847                 /* determine the maximum length of the chains */
1848                 if (chain_length>max_chainlength)
1849                         max_chainlength = chain_length;
1850
1851                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1852                 if (chain_length>=CHAIN_LIMIT) {
1853                         beyond_limit+=chain_length;
1854                         chain_length=CHAIN_LIMIT-1;
1855                 }
1856
1857                 /* update number of hashchains of current length */
1858                 chain_count[chain_length]++;
1859         }
1860
1861         /* display results */
1862         for (i=1;i<CHAIN_LIMIT-1;i++)
1863                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1864
1865         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1866
1867
1868         printf("max. chainlength:%5d\n",max_chainlength);
1869
1870         /* avg. chainlength = sum of chainlengths / number of chains */
1871         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1872 }
1873 #endif /* !defined(NDEBUG) */
1874
1875
1876 /*
1877  * These are local overrides for various environment variables in Emacs.
1878  * Please do not remove this and leave it at the end of the file, where
1879  * Emacs will automagically detect them.
1880  * ---------------------------------------------------------------------
1881  * Local variables:
1882  * mode: c
1883  * indent-tabs-mode: t
1884  * c-basic-offset: 4
1885  * tab-width: 4
1886  * End:
1887  * vim:noexpandtab:sw=4:ts=4:
1888  */